| | |
| |
|
| | import torch |
| | from datasets import Dataset |
| | from transformers import Trainer, TrainingArguments, DataCollatorWithPadding |
| | from transformers import AutoTokenizer, AutoModelForSequenceClassification |
| | from sklearn.metrics import accuracy_score, precision_recall_fscore_support |
| | from sklearn.model_selection import train_test_split |
| | import pandas as pd |
| | import numpy as np |
| | import matplotlib.pyplot as plt |
| | import re |
| | import nltk |
| | from nltk.corpus import stopwords |
# Fetch the NLTK stop-word corpus; this call runs unconditionally at start-up.
nltk.download('stopwords')
# Rebind `stopwords` from the imported corpus reader to a plain set of English
# stop words. After this line the reader is no longer reachable by this name.
stopwords = {*stopwords.words('english')}
| |
|
| | |
| |
|
| | |
| |
|
# Essays to classify, one string per essay. Replace the placeholders below.
input_list = [
    """ WRITE HERE YOUR FIRST ESSAY """,
    """ WRITE HERE YOUR SECOND ESSAY """,
]
| |
|
| | |
| |
|
| | |
| |
|
| |
|
def clean_text(text, stop_words=None):
    """
    Replace non-alphabetical characters with spaces, lower-case the text and
    drop stop words.

    Args:
        text (str): The text to be cleaned.
        stop_words (collection of str, optional): Words to discard. When
            omitted, the module-level ``stopwords`` set is used.

    Returns:
        str: The remaining words joined by single spaces.

    Example:
        df['text'] = df['text'].apply(clean_text)
    """
    if stop_words is None:
        # Fall back to the set built at import time from the NLTK corpus.
        stop_words = stopwords

    # Anything that is not an ASCII letter becomes a separator.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()

    # The previous version built the filtered list but then joined the
    # unfiltered one, so stop words were never actually removed.
    # NOTE(review): if the checkpoint was trained on text produced by that
    # earlier behaviour, confirm that removing stop words at inference time
    # still matches the training preprocessing.
    return ' '.join(word for word in words if word not in stop_words)
| |
|
| |
|
def tokenize_function(dataframe):
    """
    Tokenize the "text" entry of a batch with the module-level tokenizer.

    Despite the parameter name, this is used as the callback of
    ``Dataset.map(..., batched=True)``, so the argument is whatever batch
    object ``map`` hands over; only its "text" entry is read.

    Args:
        dataframe: The batch to tokenize; must support ``dataframe["text"]``.

    Returns:
        The tokenizer's output for the batch, with truncation enabled.

    Example:
        train_dataset_token = train_dataset.map(tokenize_function, batched=True)
    """
    texts = dataframe["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded
| |
|
| |
|
def compute_metrics(eval_pred):
    """
    Compute accuracy, precision, recall and F1 for a set of model outputs.

    It is handed to the Trainer as its ``compute_metrics`` callback.

    Args:
        eval_pred (tuple): A ``(predictions, labels)`` pair. The predicted
            class is the argmax of ``predictions`` over its last axis.

    Returns:
        dict: The scores under the keys 'accuracy', 'precision', 'recall'
        and 'f1'.

    Example:
        >>> trainer.evaluate()
        {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }
    """
    scores, references = eval_pred
    predicted = scores.argmax(axis=-1)

    metrics = {'accuracy': accuracy_score(references, predicted)}

    # The helper returns (precision, recall, f-score, support); zip stops
    # after the three named entries, so support is discarded.
    prf = precision_recall_fscore_support(
        references, predicted, average='binary')
    metrics.update(zip(('precision', 'recall', 'f1'), prf))
    return metrics
| |
|
| | |
| |
|
| | |
| |
|
| |
|
| | |
# Identifier of the pretrained checkpoint; the model and the tokenizer are
# both loaded from it so that they stay in sync.
checkpoint = "diegovelilla/EssAI"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
| |
|
| | |
| |
|
| | |
| |
|
# Number of essays; the reporting loop at the end iterates this many times.
n_input = len(input_list)

# One row per essay, with the text column normalised by clean_text.
df = pd.DataFrame({'text': input_list}).assign(
    text=lambda frame: frame['text'].apply(clean_text)
)

# Convert to a datasets.Dataset and tokenize it in batches.
ds = Dataset.from_pandas(df)
ds_token = ds.map(tokenize_function, batched=True).remove_columns(
    ["text", "token_type_ids"]
)

# Expose only the two columns fed to the model, as torch tensors.
ds_token.set_format(type='torch', columns=['input_ids', 'attention_mask'])
| |
|
| | |
| |
|
| | |
| |
|
| | |
# Default arguments; "." is passed as the output directory.
training_args = TrainingArguments(output_dir=".")

# Collator built from the same tokenizer that produced the dataset.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# No training dataset is supplied: in the visible script the Trainer is only
# used to run predictions over ds_token.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    eval_dataset=ds_token,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
| |
|
| | |
| |
|
| | |
| |
|
| | |
# Run the model over every essay and turn the raw outputs into per-class
# probabilities (softmax over the last axis).
predictions = torch.nn.functional.softmax(
    torch.from_numpy(trainer.predict(ds_token).predictions),
    dim=-1,
)

print('\n\n')
for i in range(n_input):
    # Column 0 is reported as HUMAN, column 1 as AI.
    index = torch.argmax(predictions[i])
    if index == 0:
        print(f'{i+1}: HUMAN {round(predictions[i][0].item() * 100, 2)}% of confidence.')
    else:
        print(f'{i+1}: AI {round(predictions[i][1].item() * 100, 2)}% of confidence.')
| |
|
| | |
| |
|