Spaces:

diegovelilla
/

EssAI-app

Sleeping

File size: 5,238 Bytes

b9c3ba7

# --- IMPORTS ---

import gradio as gr
import torch
from datasets import Dataset
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
# Request the NLTK 'stopwords' corpus at import time so it can be read on the next line.
nltk.download('stopwords')
# NOTE: this rebinds the name `stopwords` from the imported NLTK corpus module
# to a plain set of English stopwords; clean_text() reads this set.
stopwords = set(stopwords.words('english'))

# -------------------------

# --- USEFUL FUNCTIONS ----


def clean_text(text, stop_words=None):
    """
    Strip non-alphabetical characters, lower-case the text and drop stopwords.

    Args:
    text (str): The text to be cleaned.
    stop_words (collection of str, optional): Lower-case words to drop.
        When omitted, the module-level ``stopwords`` set is used.

    Returns:
    str: The remaining words joined by single spaces ('' if nothing is left).

    Example:
    df['text'] = df['text'].apply(clean_text)
    """
    if stop_words is None:
        stop_words = stopwords

    # Every character that is not an ASCII letter becomes a word separator.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()

    # Join the *filtered* words. The earlier version built the filtered list
    # but then joined the unfiltered one, so stopwords were never removed.
    # NOTE(review): if the checkpoint was fine-tuned on text cleaned by that
    # earlier version, confirm that removing stopwords here matches training.
    return ' '.join(word for word in words if word not in stop_words)


def tokenize_function(dataframe):
    """
    Run the module-level tokenizer over the 'text' field of the given data.

    Args:
    dataframe: Object indexable by "text" (for example the batch that
        ``Dataset.map(..., batched=True)`` hands to this function).

    Returns:
    The tokenizer's output for those texts, produced with truncation enabled.

    Example:
    train_dataset_token = train_dataset.map(tokenize_function, batched=True)
    """
    texts = dataframe["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded


def compute_metrics(eval_pred):
    """
    Compute accuracy, precision, recall and F1 for a set of predictions.

    It is passed to the trainer, which reports these values when evaluating
    the model.

    Args:
    eval_pred (tuple): ``(predictions, labels)``; the predicted class is the
        argmax of ``predictions`` along the last axis.

    Returns:
    dict: Keys 'accuracy', 'precision', 'recall' and 'f1'.

    Example:
    >>> trainer.evaluate()
    {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    """
    raw_scores, true_labels = eval_pred
    predicted_labels = raw_scores.argmax(axis=-1)

    metrics = {'accuracy': accuracy_score(true_labels, predicted_labels)}

    # The helper returns (precision, recall, f1, support); zip() stops after
    # the three names, so the trailing support value is ignored.
    binary_scores = precision_recall_fscore_support(
        true_labels, predicted_labels, average='binary')
    metrics.update(zip(('precision', 'recall', 'f1'), binary_scores))
    return metrics


def predict(essay):
    """
    Classify a single essay as human-written or AI-generated.

    Args:
    essay (str): The essay text to check.

    Returns:
    str: '<LABEL> with <confidence>% confidence.' where LABEL is 'HUMAN'
        (class index 0) or 'AI' (any other index), or a short prompt asking
        for input when there is no text left to classify.
    """
    empty_message = "Please enter an essay to analyze."

    # Nothing to classify: answer right away instead of scoring an empty input.
    if not essay or not essay.strip():
        return empty_message

    # --- DATA PREPROCESSING ---

    # Keep only letters, lower-case the text and apply the stopword handling.
    cleaned = clean_text(essay)
    if not cleaned:
        # The essay contained no alphabetical words at all.
        return empty_message

    # Wrap the single essay in a Hugging Face dataset and tokenize it.
    df = pd.DataFrame({'text': [cleaned]})
    ds = Dataset.from_pandas(df)
    ds_token = ds.map(tokenize_function, batched=True)

    # Drop the columns the model does not need. Not every tokenizer emits
    # 'token_type_ids', so only remove the columns that are actually present.
    unused_columns = [
        column for column in ("text", "token_type_ids")
        if column in ds_token.column_names
    ]
    ds_token = ds_token.remove_columns(unused_columns)
    ds_token.set_format(type='torch', columns=['input_ids', 'attention_mask'])

    # -------------------------

    # --- INSTANTIATING TRAINER ----

    # The collator pads the tokenized inputs when they are batched.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Default training arguments, with "." as the output directory.
    training_args = TrainingArguments(".")

    # The trainer is only used here to run prediction on the single essay.
    trainer = Trainer(
        model,
        training_args,
        eval_dataset=ds_token,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # -------------------------

    # --- PREDICT ---

    # Turn the raw model outputs for the (only) essay into probabilities.
    output = trainer.predict(ds_token)
    scores = torch.from_numpy(output.predictions)
    probabilities = torch.nn.functional.softmax(scores, dim=-1)[0]

    index = int(torch.argmax(probabilities))
    confidence = round(probabilities[index].item() * 100, 2)
    label = "HUMAN" if index == 0 else "AI"

    return f'{label} with {confidence}% confidence.'
    # -------------------------

# -------------------------

# --- LOADING THE MODEL ---


# Load the tokenizer and the sequence-classification model from the same checkpoint
checkpoint = "diegovelilla/EssAI"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# -------------------------

# --- BUILDING THE INTERFACE ---

# One text box in, one text box out; `predict` turns the essay into the result string
iface = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(
        label="Your essay",
        placeholder="Enter your essay here...",
        lines=2,
    ),
    outputs=gr.Textbox(label="Prediction Result"),
    title="EssAI",
    description="Detect AI-generated essays in a few seconds.",
)

# Launch the app only when this file is run as a script, not when it is imported
if __name__ == "__main__":
    iface.launch()