# EssAI-app / app.py
# --- IMPORTS ---
import gradio as gr
import torch
from datasets import Dataset
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
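# Note (a sketch, not part of the original app): the download above runs unconditionally on
# every startup; it could be guarded so the corpus is only fetched when it is missing:
#   try:
#       nltk.data.find('corpora/stopwords')
#   except LookupError:
#       nltk.download('stopwords')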
# -------------------------
# --- USEFUL FUNCTIONS ----
def clean_text(text):
    """
    Gets rid of non-alphabetical characters and stopwords, and lower-cases the text.
    Args:
        text (str): The text to be cleaned
    Returns:
        text (str): The cleaned text
    Example:
        df['text'] = df['text'].apply(clean_text)
    """
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    words = text.split()
    # Keep only the words that are not stopwords, then rebuild the string from them
    words = [word for word in words if word not in stop_words]
    text = ' '.join(words)
    return text
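# A quick illustration of clean_text on a hypothetical input (not part of the original app):
#   clean_text("This essay was written by an AI!")  ->  "essay written ai"
# Punctuation and digits become spaces, the text is lower-cased, and English stopwords
# such as "this", "was", "by" and "an" are dropped.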
def tokenize_function(dataframe):
    """
    Tokenizes the 'text' field of a batch of examples.
    Args:
        dataframe (dict): A batch of examples produced by Dataset.map(..., batched=True)
    Returns:
        dict: The tokenizer output for the batch (input_ids, attention_mask, ...)
    Example:
        train_dataset_token = train_dataset.map(tokenize_function, batched=True)
    """
    return tokenizer(dataframe["text"], truncation=True)
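# Note: with a BERT-style checkpoint the tokenizer returns 'input_ids', 'attention_mask'
# and 'token_type_ids'; predict() below drops 'token_type_ids' so that only the first two
# are passed to the model.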
def compute_metrics(eval_pred):
    """
    Computes the accuracy, precision, recall and f1 score of the model.
    It is passed to the trainer and its output is reported when evaluating the model.
    Args:
        eval_pred (tuple): The predictions and labels of the model
    Returns:
        dict: The accuracy, precision, recall and f1 score of the model
    Example:
        >>> trainer.evaluate()
        {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }
    """
    predictions, labels = eval_pred
    # The raw predictions are logits, so take the index of the highest score per sample
    predictions = predictions.argmax(axis=-1)
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
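# A minimal sanity check with hypothetical values (not executed by the app; the Trainer
# passes NumPy arrays, shown here as plain lists for readability):
#   logits = [[0.1, 0.9], [0.8, 0.2]]
#   labels = [1, 0]
#   compute_metrics((logits, labels)) -> {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}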
def predict(essay):
    """
    Makes a prediction based on the text input.
    Args:
        essay (str): The essay to check.
    Returns:
        str: The predicted label ("HUMAN" or "AI") with its confidence percentage.
    """
    # --- DATA PREPROCESSING ---
    # Convert the input into a dataframe
    df = pd.DataFrame({'text': [essay]})
    # Get rid of non-alphabetical characters and stopwords, and lower-case the text
    df['text'] = df['text'].apply(clean_text)
    # Convert the pandas dataframe into a Hugging Face dataset and tokenize it
    ds = Dataset.from_pandas(df)
    ds_token = ds.map(tokenize_function, batched=True)
    # Drop the columns that are not needed and set the dataset format to PyTorch tensors
    ds_token = ds_token.remove_columns(["text", "token_type_ids"])
    ds_token.set_format(type='torch', columns=['input_ids', 'attention_mask'])
    # -------------------------
    # --- INSTANTIATING TRAINER ----
    # Instantiate a DataCollatorWithPadding so the inputs get padded per batch
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    # Create the training arguments (only the output directory is needed, since the
    # trainer is only used for prediction here)
    training_args = TrainingArguments(".")
    # Create the trainer
    trainer = Trainer(
        model,
        training_args,
        eval_dataset=ds_token,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    # -------------------------
    # --- PREDICT ---
    # We predict and then format the output
    predictions = trainer.predict(ds_token)
    predictions = torch.from_numpy(predictions.predictions)
    predictions = torch.nn.functional.softmax(predictions, dim=-1)
    results = []
    index = torch.argmax(predictions[0])
    confidence = round(predictions[0][index].item() * 100, 2)
    label = "HUMAN" if index == 0 else "AI"
    results.append(f'{label} with {confidence}% confidence.')
    return "\n".join(results)
# -------------------------
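# Example call (hypothetical essay text; in the app this function is fed from the Gradio
# textbox defined below). The returned string has the form "HUMAN with <confidence>% confidence."
# or "AI with <confidence>% confidence.":
#   predict("The industrial revolution reshaped labour markets across Europe ...")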
# -------------------------
# --- LOADING THE MODEL ---
# Load the tokenizer and the fine-tuned model (the checkpoint is already configured to classify into 2 labels)
checkpoint = "diegovelilla/EssAI"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# -------------------------
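# Note: predict() builds a full Trainer just to run a single forward pass. A lighter-weight
# alternative (a sketch, not what the app does above) would be to call the model directly:
#   inputs = tokenizer(clean_text(essay), truncation=True, return_tensors="pt")
#   with torch.no_grad():
#       logits = model(**inputs).logits
#   probs = torch.nn.functional.softmax(logits, dim=-1)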
iface = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(
        lines=2, placeholder="Enter your essay here...", label="Your essay"),
    outputs=gr.Textbox(label="Prediction Result"),
    title="EssAI",
    description="Detect AI-generated essays in a few seconds."
)
# Launch the app
if __name__ == "__main__":
    iface.launch()
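# Passing share=True to iface.launch() (optional, not enabled here) would also create a
# temporary public link when the app is run locally.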