# Hugging Face Spaces page header (scrape artifact) — Space status: Sleeping.
# --- IMPORTS --- | |
import gradio as gr | |
import torch | |
from datasets import Dataset | |
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding | |
from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
from sklearn.metrics import accuracy_score, precision_recall_fscore_support | |
from sklearn.model_selection import train_test_split | |
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import re | |
import nltk | |
from nltk.corpus import stopwords | |
nltk.download('stopwords') | |
stopwords = set(stopwords.words('english')) | |
# ------------------------- | |
# --- USEFUL FUNCTIONS ---- | |
def clean_text(text):
    """
    Lowercase the text and strip non-alphabetical characters and stopwords.

    Args:
        text (str): The text to be cleaned.

    Returns:
        str: The cleaned text.

    Example:
        df['text'] = df['text'].apply(clean_text)
    """
    # Replace every non-letter with a space, then lowercase everything.
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    # BUG FIX: join the *filtered* word list. The original code built the
    # filtered list but then joined the unfiltered `words`, so stopword
    # removal silently had no effect.
    words = [word for word in text.split() if word not in stopwords]
    return ' '.join(words)
def tokenize_function(dataframe):
    """
    Tokenize the 'text' field of a batch for the model.

    Args:
        dataframe: A batch (mapping) exposing a 'text' field, as handed
            over by `datasets.Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the batch (input_ids, attention_mask, ...),
        truncated to the model's maximum length.

    Example:
        train_dataset_token = train_dataset.map(tokenize_function, batched=True)
    """
    texts = dataframe["text"]
    return tokenizer(texts, truncation=True)
def compute_metrics(eval_pred):
    """
    Compute accuracy, precision, recall and F1 for the binary classifier.

    Handed to the Trainer, which calls it automatically during evaluation.

    Args:
        eval_pred (tuple): (logits, labels) pair produced by the Trainer.

    Returns:
        dict: Keys 'accuracy', 'precision', 'recall' and 'f1'.

    Example:
        >>> trainer.evaluate()
        {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }
    """
    logits, labels = eval_pred
    # Class id = index of the highest logit per sample.
    preds = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary')
    return {
        'accuracy': accuracy_score(labels, preds),
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
def predict(essay):
    """
    Classify one or more essays as human- or AI-written.

    Args:
        essay (str or list[str]): A single essay, or a list of essays.

    Returns:
        str: One line per essay, e.g. "AI with 97.3% confidence."
    """
    # --- DATA PREPROCESSING ---
    # GENERALIZED: accept both a single string and a list of essays. The
    # docstring always promised a list, but the original code only ever
    # reported the first row of the predictions.
    texts = essay if isinstance(essay, list) else [essay]
    df = pd.DataFrame({'text': texts})
    # Strip non-alphabetical characters and stopwords; lowercase.
    df['text'] = df['text'].apply(clean_text)
    # Convert the pandas dataframe into a Hugging Face dataset and tokenize it.
    ds = Dataset.from_pandas(df)
    ds_token = ds.map(tokenize_function, batched=True)
    # Drop columns the model does not consume and emit pytorch tensors.
    # NOTE(review): assumes the tokenizer emits `token_type_ids` for this
    # checkpoint — confirm if the base tokenizer is ever swapped.
    ds_token = ds_token.remove_columns(["text", "token_type_ids"])
    ds_token.set_format(type='torch', columns=['input_ids', 'attention_mask'])
    # --- INSTANTIATING TRAINER ---
    # DataCollatorWithPadding pads each batch to its longest sequence.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    # Default training arguments; "." is only the (unused) output directory.
    training_args = TrainingArguments(".")
    trainer = Trainer(
        model,
        training_args,
        eval_dataset=ds_token,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    # --- PREDICT ---
    # Run inference, then turn the raw logits into per-class probabilities.
    predictions = trainer.predict(ds_token)
    logits = torch.from_numpy(predictions.predictions)
    probs = torch.nn.functional.softmax(logits, dim=-1)
    results = []
    # Report every essay, not just the first one (index 0 == HUMAN class).
    for row in probs:
        index = torch.argmax(row)
        confidence = round(row[index].item() * 100, 2)
        label = "HUMAN" if index == 0 else "AI"
        results.append(f'{label} with {confidence}% confidence.')
    return "\n".join(results)
# ------------------------- | |
# ------------------------- | |
# --- LOADING THE MODEL --- | |
# Load the initial tokenizer and model to set the number of labels its going to classify as 2 | |
checkpoint = "diegovelilla/EssAI" | |
tokenizer = AutoTokenizer.from_pretrained(checkpoint) | |
model = AutoModelForSequenceClassification.from_pretrained(checkpoint) | |
# ------------------------- | |
iface = gr.Interface( | |
fn=predict, | |
inputs=gr.Textbox( | |
lines=2, placeholder="Enter your essay here...", label="Your essay"), | |
outputs=gr.Textbox(label="Prediction Result"), | |
title="EssAI", | |
description="Detect AI-generated essays in a few seconds." | |
) | |
# Launch the app | |
if __name__ == "__main__": | |
iface.launch() | |