Spaces:
Sleeping
Sleeping
File size: 5,238 Bytes
b9c3ba7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
# --- IMPORTS ---
import gradio as gr
import torch
from datasets import Dataset
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' corpus; as a top-level statement this runs every
# time the module is imported.
nltk.download('stopwords')
# Rebind `stopwords` from the imported corpus reader to a plain set of English
# stopwords. From this line on the name refers to the set, which clean_text
# uses for membership tests.
stopwords = set(stopwords.words('english'))
# -------------------------
# --- USEFUL FUNCTIONS ----
def clean_text(text, stop_words=None):
    """
    Strip non-alphabetical characters, lower-case the text and drop stopwords.

    Args:
        text (str): The text to be cleaned.
        stop_words (collection of str, optional): Lower-case words to remove.
            When omitted, the module-level ``stopwords`` set is used.

    Returns:
        str: The remaining words joined by single spaces.

    Example:
        df['text'] = df['text'].apply(clean_text)
    """
    if stop_words is None:
        # Looked up at call time so the module-level set is the default.
        stop_words = stopwords

    # Every character that is not an ASCII letter becomes a word separator.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()

    # Join the *filtered* words. The previous version built the filtered list
    # but joined the unfiltered one, so stopwords were never removed.
    # NOTE(review): this changes the text fed to the model; confirm the
    # checkpoint was trained on stopword-free text.
    kept = [word for word in words if word not in stop_words]
    return ' '.join(kept)
def tokenize_function(dataframe):
    """
    Run the module-level tokenizer over the "text" field of a batch.

    Meant to be used as the callback of a batched ``map`` call (see example).

    Args:
        dataframe: Object indexable by "text" that yields the texts to
            tokenize (presumably the batch handed over by ``Dataset.map``;
            verify against the caller).
    Returns:
        The tokenizer output for those texts, with truncation enabled.
    Example:
        train_dataset_token = train_dataset.map(tokenize_function, batched=True)
    """
    texts = dataframe["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded
def compute_metrics(eval_pred):
    """
    Compute accuracy, precision, recall and F1 from model outputs and labels.

    It is handed to the trainer, which reports the returned values when
    evaluating the model.

    Args:
        eval_pred (tuple): Pair of (predictions, labels). The predicted class
            is taken as the argmax over the last axis of the predictions.
    Returns:
        dict: Keys 'accuracy', 'precision', 'recall' and 'f1'. Precision,
            recall and F1 use binary averaging.
    Example:
        >>> trainer.evaluate()
        {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }
    """
    scores, labels = eval_pred
    # Turn per-class scores into a predicted class index per sample.
    predicted = scores.argmax(axis=-1)

    metrics = {'accuracy': accuracy_score(labels, predicted)}
    # The fourth element (support) is not reported.
    prf_support = precision_recall_fscore_support(
        labels, predicted, average='binary')
    metrics['precision'] = prf_support[0]
    metrics['recall'] = prf_support[1]
    metrics['f1'] = prf_support[2]
    return metrics
def predict(essay):
    """
    Classify a single essay as human- or AI-written.

    Args:
        essay (str): The raw essay text.
    Returns:
        str: "<LABEL> with <confidence>% confidence.", where LABEL is "HUMAN"
            when the most likely class index is 0 and "AI" otherwise. When
            there is no text left to analyze, a short prompt asking for an
            essay is returned instead.
    """
    no_text_message = "Please enter an essay to analyze."
    # Nothing to classify: avoid running the model on missing/blank input.
    if essay is None or not essay.strip():
        return no_text_message

    # --- DATA PREPROCESSING ---
    # Wrap the essay in a one-row dataframe and clean it (non-alphabetical
    # characters, stopwords, lower-casing are handled by clean_text).
    df = pd.DataFrame({'text': [essay]})
    df['text'] = df['text'].apply(clean_text)
    # Cleaning may strip everything (e.g. digits/punctuation only).
    if not df['text'].iloc[0]:
        return no_text_message

    # Convert to a Hugging Face dataset and tokenize it.
    ds = Dataset.from_pandas(df)
    ds_token = ds.map(tokenize_function, batched=True)

    # Keep only the model inputs. Dropping whatever else is present (rather
    # than a hard-coded "token_type_ids") also works with tokenizers that do
    # not emit that column, where remove_columns would otherwise fail.
    model_inputs = ['input_ids', 'attention_mask']
    extra_columns = [
        name for name in ds_token.column_names if name not in model_inputs
    ]
    ds_token = ds_token.remove_columns(extra_columns)
    ds_token.set_format(type='torch', columns=model_inputs)
    # -------------------------

    # --- INSTANTIATING TRAINER ----
    # The collator pads the inputs of each batch to a common length.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    training_args = TrainingArguments(".")
    # The collator already carries the tokenizer, so the Trainer's deprecated
    # ``tokenizer=`` argument is not passed.
    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=ds_token,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    # -------------------------

    # --- PREDICT ---
    # Convert the raw logits of the single row into class probabilities.
    output = trainer.predict(ds_token)
    logits = torch.from_numpy(output.predictions)
    probabilities = torch.nn.functional.softmax(logits, dim=-1)

    index = torch.argmax(probabilities[0]).item()
    confidence = round(probabilities[0][index].item() * 100, 2)
    label = "HUMAN" if index == 0 else "AI"
    return f'{label} with {confidence}% confidence.'
# -------------------------
# -------------------------
# --- LOADING THE MODEL ---
# The tokenizer and the sequence-classification model are both loaded from
# the same checkpoint identifier.
checkpoint = "diegovelilla/EssAI"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# -------------------------

# --- USER INTERFACE ---
# One textbox for the essay, one textbox for the result of predict().
essay_input = gr.Textbox(
    lines=2,
    placeholder="Enter your essay here...",
    label="Your essay",
)
prediction_output = gr.Textbox(label="Prediction Result")
iface = gr.Interface(
    fn=predict,
    inputs=essay_input,
    outputs=prediction_output,
    title="EssAI",
    description="Detect AI-generated essays in a few seconds.",
)

# Only start the app when this file is run as a script.
if __name__ == "__main__":
    iface.launch()
|