# Leaderboard / app.py
import streamlit as st
import pandas as pd
import numpy as np
import torch
import evaluate
from datasets import load_dataset
from evaluate import load as load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import accuracy_score, f1_score
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
select = st.selectbox('Which model would you like to evaluate?',
                      ('Bart', 'mBart'))
def get_datasets():
    if select == 'Bart':
        all_datasets = ["Communication Networks: unseen questions", "Communication Networks: unseen answers"]
    if select == 'mBart':
        all_datasets = ["Micro Job: unseen questions", "Micro Job: unseen answers", "Legal Domain: unseen questions", "Legal Domain: unseen answers"]
    return all_datasets
all_datasets = get_datasets()
def get_split(dataset_name):
    if dataset_name == "Communication Networks: unseen questions":
        split = load_dataset("Short-Answer-Feedback/saf_communication_networks_english", split="test_unseen_questions")
    if dataset_name == "Communication Networks: unseen answers":
        split = load_dataset("Short-Answer-Feedback/saf_communication_networks_english", split="test_unseen_answers")
    if dataset_name == "Micro Job: unseen questions":
        split = load_dataset("Short-Answer-Feedback/saf_micro_job_german", split="test_unseen_questions")
    if dataset_name == "Micro Job: unseen answers":
        split = load_dataset("Short-Answer-Feedback/saf_micro_job_german", split="test_unseen_answers")
    if dataset_name == "Legal Domain: unseen questions":
        split = load_dataset("Short-Answer-Feedback/saf_legal_domain_german", split="test_unseen_questions")
    if dataset_name == "Legal Domain: unseen answers":
        split = load_dataset("Short-Answer-Feedback/saf_legal_domain_german", split="test_unseen_answers")
    return split
def get_model(datasetname):
    if datasetname == "Communication Networks: unseen questions" or datasetname == "Communication Networks: unseen answers":
        model = "Short-Answer-Feedback/bart-finetuned-saf-communication-networks"
    if datasetname == "Micro Job: unseen questions" or datasetname == "Micro Job: unseen answers":
        model = "Short-Answer-Feedback/mbart-finetuned-saf-micro-job"
    if datasetname == "Legal Domain: unseen questions" or datasetname == "Legal Domain: unseen answers":
        model = "Short-Answer-Feedback/mbart-finetuned-saf-legal-domain"
    return model
def get_tokenizer(datasetname):
    if datasetname == "Communication Networks: unseen questions" or datasetname == "Communication Networks: unseen answers":
        tokenizer = "Short-Answer-Feedback/bart-finetuned-saf-communication-networks"
    if datasetname == "Micro Job: unseen questions" or datasetname == "Micro Job: unseen answers":
        tokenizer = "Short-Answer-Feedback/mbart-finetuned-saf-micro-job"
    if datasetname == "Legal Domain: unseen questions" or datasetname == "Legal Domain: unseen answers":
        tokenizer = "Short-Answer-Feedback/mbart-finetuned-saf-legal-domain"
    return tokenizer
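# Summary of the dataset-to-checkpoint mapping implemented above (model and tokenizer
# are loaded from the same Hub repository):
#   Communication Networks (English) -> Short-Answer-Feedback/bart-finetuned-saf-communication-networks
#   Micro Job (German)               -> Short-Answer-Feedback/mbart-finetuned-saf-micro-job
#   Legal Domain (German)            -> Short-Answer-Feedback/mbart-finetuned-saf-legal-domain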
sacrebleu = load_metric('sacrebleu')
rouge = load_metric('rouge')
meteor = load_metric('meteor')
bertscore = load_metric('bertscore')
MAX_INPUT_LENGTH = 256
MAX_TARGET_LENGTH = 128
# run generation on GPU if one is available, otherwise fall back to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def preprocess_function(examples):
    """
    Preprocess entries of the given dataset

    Params:
        examples (Dataset): dataset entries to be preprocessed
    Returns:
        model_inputs (BatchEncoding): tokenized dataset entries
    """
    inputs, targets = [], []
    for i in range(len(examples['question'])):
        inputs.append(f"Antwort: {examples['provided_answer'][i]} Lösung: {examples['reference_answer'][i]} Frage: {examples['question'][i]}")
        targets.append(f"{examples['verification_feedback'][i]} Feedback: {examples['answer_feedback'][i]}")
    # apply tokenization to inputs and labels
    # (relies on the module-level `tokenizer`, which load_data() sets before calling split.map)
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, padding='max_length', truncation=True)
    labels = tokenizer(text_target=targets, max_length=MAX_TARGET_LENGTH, padding='max_length', truncation=True)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs
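# Illustrative example of the strings built by preprocess_function for a hypothetical
# dataset entry (the field values below are made up, not taken from the real datasets):
#   input:  "Antwort: <student answer> Lösung: <reference answer> Frage: <question>"
#   target: "Partially correct Feedback: <answer feedback text>"
# Both strings are then tokenized and padded/truncated to MAX_INPUT_LENGTH and
# MAX_TARGET_LENGTH tokens respectively.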
def flatten_list(l):
    """
    Utility function to convert a list of lists into a flattened list

    Params:
        l (list of lists): list to be flattened
    Returns:
        A flattened list with the elements of the original list
    """
    return [item for sublist in l for item in sublist]
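# Example: flatten_list([['a', 'b'], ['c']]) returns ['a', 'b', 'c'].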
def extract_feedback(predictions):
    """
    Utility function to extract the feedback from the predictions of the model

    Params:
        predictions (list): complete model predictions
    Returns:
        feedback (list): extracted feedback from the model's predictions
    """
    feedback = []
    # iterate through predictions and try to extract predicted feedback
    for pred in predictions:
        try:
            # everything after the first ':' is treated as feedback
            fb = pred.split(':', 1)[1]
        except IndexError:
            try:
                if pred.lower().startswith('partially correct'):
                    # skip the two-word label 'Partially correct'
                    fb = pred.split(' ', 2)[2]
                else:
                    # skip the one-word label ('Correct' / 'Incorrect')
                    fb = pred.split(' ', 1)[1]
            except IndexError:
                fb = pred
        feedback.append(fb.strip())
    return feedback
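# Illustrative example (hypothetical prediction string):
#   extract_feedback(["Correct Feedback: The answer covers all required points."])
# would return ["The answer covers all required points."].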
def extract_labels(predictions):
    """
    Utility function to extract the labels from the predictions of the model

    Params:
        predictions (list): complete model predictions
    Returns:
        labels (list): extracted labels from the model's predictions
    """
    labels = []
    for pred in predictions:
        if pred.lower().startswith('correct'):
            label = 'Correct'
        elif pred.lower().startswith('partially correct'):
            label = 'Partially correct'
        elif pred.lower().startswith('incorrect'):
            label = 'Incorrect'
        else:
            label = 'Unknown label'
        labels.append(label)
    return labels
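# Illustrative example: extract_labels(["Partially correct Feedback: Some details are missing."])
# would return ["Partially correct"]; a prediction that does not start with one of the
# three verification labels is mapped to "Unknown label".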
def get_predictions_labels(model, dataloader):
    """
    Evaluate model on the given dataset

    Params:
        model (PreTrainedModel): seq2seq model
        dataloader (torch DataLoader): dataloader of the dataset to be used for evaluation
    Returns:
        predictions (list): decoded predictions of the model
        labels (list): decoded golden labels
    """
    decoded_preds, decoded_labels = [], []
    model.eval()
    # iterate through batches in the dataloader
    for batch in tqdm(dataloader):
        with torch.no_grad():
            batch = {k: v.to(device) for k, v in batch.items()}
            # generate tokens from batch
            generated_tokens = model.generate(
                batch['input_ids'],
                attention_mask=batch['attention_mask'],
                max_length=MAX_TARGET_LENGTH
            )
            # get golden labels from batch
            labels_batch = batch['labels']
            # decode model predictions and golden labels
            decoded_preds_batch = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            decoded_labels_batch = tokenizer.batch_decode(labels_batch, skip_special_tokens=True)
            decoded_preds.append(decoded_preds_batch)
            decoded_labels.append(decoded_labels_batch)
    # convert predictions and golden labels into flattened lists
    predictions = flatten_list(decoded_preds)
    labels = flatten_list(decoded_labels)
    return predictions, labels
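# Note: predictions and labels are flat lists of decoded strings, one per dataset entry,
# each expected to follow the target format "<verification label> Feedback: <feedback text>".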
def load_data():
    # tokenizer is declared global so that preprocess_function and get_predictions_labels,
    # which reference it at module level, see the tokenizer of the model being evaluated
    global tokenizer
    df = pd.DataFrame(columns=['Model', 'Dataset', 'SacreBLEU', 'ROUGE-2', 'METEOR', 'BERTScore', 'Accuracy', 'Weighted F1', 'Macro F1'])
    for ds in all_datasets:
        split = get_split(ds)
        model = AutoModelForSeq2SeqLM.from_pretrained(get_model(ds)).to(device)
        tokenizer = AutoTokenizer.from_pretrained(get_tokenizer(ds))
        processed_dataset = split.map(
            preprocess_function,
            batched=True,
            remove_columns=split.column_names
        )
        processed_dataset.set_format('torch')
        dataloader = DataLoader(processed_dataset, batch_size=4)
        predictions, labels = get_predictions_labels(model, dataloader)
        predicted_feedback = extract_feedback(predictions)
        predicted_labels = extract_labels(predictions)
        reference_feedback = [x.split('Feedback:', 1)[1].strip() for x in labels]
        reference_labels = [x.split('Feedback:', 1)[0].strip() for x in labels]
        rouge_score = rouge.compute(predictions=predicted_feedback, references=reference_feedback)['rouge2']
        bleu_score = sacrebleu.compute(predictions=predicted_feedback, references=[[x] for x in reference_feedback])['score']
        meteor_score = meteor.compute(predictions=predicted_feedback, references=reference_feedback)['meteor']
        # BERTScore returns per-example precision/recall/F1 lists; report the mean F1
        bert_score = np.mean(bertscore.compute(predictions=predicted_feedback, references=reference_feedback, lang='de', model_type='bert-base-multilingual-cased', rescale_with_baseline=True)['f1'])
        reference_labels_np = np.array(reference_labels)
        accuracy_value = accuracy_score(reference_labels_np, predicted_labels)
        f1_weighted_value = f1_score(reference_labels_np, predicted_labels, average='weighted')
        f1_macro_value = f1_score(reference_labels_np, predicted_labels, average='macro', labels=['Incorrect', 'Partially correct', 'Correct'])
        new_row_data = {"Model": get_model(ds), "Dataset": ds, "SacreBLEU": bleu_score, "ROUGE-2": rouge_score, "METEOR": meteor_score, "BERTScore": bert_score, "Accuracy": accuracy_value, "Weighted F1": f1_weighted_value, "Macro F1": f1_macro_value}
        # wrap the row dict in a list so pandas builds a single-row DataFrame
        new_row = pd.DataFrame([new_row_data])
        df = pd.concat([df, new_row], ignore_index=True)
    return df
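# Note: Streamlit re-executes this script on every rerun, so load_data() re-downloads the
# models and re-runs generation each time. If the installed Streamlit version supports it,
# decorating load_data with @st.cache_data (an assumption, not part of the original app)
# would cache the finished leaderboard DataFrame between reruns.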
dataframe = load_data()
st.dataframe(dataframe)