Spaces:
Running
Running
File size: 7,206 Bytes
4af8ee7 06f5633 4af8ee7 06f5633 4af8ee7 443afb0 9e62081 4af8ee7 9e62081 4af8ee7 80961f0 4af8ee7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
import pandas as pd
import numpy as np
# Workbook that holds the evaluation results; it is passed to load_data().
RESULT_FILE = "evaluation_results.xlsx"
# Metric display name -> +1 or -1.
# NOTE(review): presumably +1 marks metrics where a higher value is better and
# -1 marks metrics where a lower value is better ("ud" ~ up/down) -- confirm
# against the code that consumes this table.
metric_ud = {
    # Classification / matching
    "Accuracy": 1,
    "Average Exact Match": 1,
    "Exact Match": 1,
    "F1 Score": 1,
    "AUC ROC": 1,
    "AUC PR": 1,
    "Precision": 1,
    "Recall": 1,
    "Equivalent": 1,
    # Bias and toxicity
    "Bias": -1,
    "Demographic representation (race)": -1,
    "Demographic representation (gender)": -1,
    "Stereotypical associations (race, profession)": -1,
    "Stereotypical associations (gender, profession)": -1,
    "Toxicity": -1,
    # Text generation
    "ROUGE-1": 1,
    "ROUGE-2": 1,
    "ROUGE-L": 1,
    "BLEU": 1,
    "SummaC": 1,
    "BERTScore": 1,
    "Coverage": 1,
    "Density": 1,
    "Compression": 1,
    "hLEPOR": 1,
    # Error rates, edit distances, perplexity and calibration
    "Character Error Rate": -1,
    "Word Error Rate": -1,
    "Character Edit Distance": -1,
    "Word Edit Distance": -1,
    "Perplexity": -1,
    "Expected Calibration Error": -1,
    # Ranking
    "acc@10": 1,
    "MRR@10 (Top 30)": 1,
    "NDCG@10 (Top 30)": 1,
    "MRR@10": 1,
    "NDCG@10": 1,
}
# Task display name -> task id.
# load_data() uses the task id as the sheet-name prefix and as the key into
# `datasets`.
tasks = {
    "Information Retrieval": "informationretrieval",
    "Knowledge": "knowledge",
    "Language Modelling": "language-modelling",
    "Question Answering": "question-answering",
    "Reasoning": "reasoning",
    "Summarization": "summarization",
    "Text Classification": "text-classification",
    "Toxicity Detection": "toxicity-detection",
    "Translation": "translation",
    "Sentiment Analysis": "sentiment-analysis",
}
# Setting display name -> setting id.
# load_data() appends the id to the task id ("<task_id>-<setting_id>") to find
# the sheet; an empty id means the sheet is named by the task id alone.
# NOTE: the spelling "Few-shot Leanring" is part of the key. `task_w_settings`
# looks settings up by this exact string, so it must not be "corrected" here
# in isolation.
settings = {
    "Normal": "",
    "Few-shot Leanring": "fs",
    "Prompt Strategy 0": "pt0",
    "Prompt Strategy 1": "pt1",
    "Prompt Strategy 2": "pt2",
    "Chain-of-Thought": "cot",
    "Fairness": "fairness",
    "Robustness": "robustness",
}
# Task display name -> list of setting display names loaded for that task.
# Keys match the keys of `tasks`; list entries match the keys of `settings`.
task_w_settings = {
    "Information Retrieval": [
        "Normal",
        "Few-shot Leanring",
        "Robustness",
        "Fairness",
    ],
    "Knowledge": [
        "Normal",
        "Few-shot Leanring",
        "Robustness",
    ],
    "Language Modelling": [
        "Normal",
        "Few-shot Leanring",
        "Fairness",
    ],
    "Question Answering": [
        "Prompt Strategy 0",
        "Prompt Strategy 1",
        "Prompt Strategy 2",
        "Robustness",
        "Fairness",
    ],
    "Reasoning": [
        "Few-shot Leanring",
        "Chain-of-Thought",
    ],
    "Summarization": [
        "Prompt Strategy 0",
        "Prompt Strategy 1",
        "Prompt Strategy 2",
        "Robustness",
    ],
    "Text Classification": [
        "Normal",
        "Few-shot Leanring",
        "Robustness",
        "Fairness",
    ],
    "Toxicity Detection": [
        "Normal",
        "Few-shot Leanring",
        "Robustness",
        "Fairness",
    ],
    "Translation": [
        "Few-shot Leanring",
        "Robustness",
    ],
    "Sentiment Analysis": [
        "Normal",
        "Few-shot Leanring",
        "Robustness",
        "Fairness",
    ],
}
# Task id -> {dataset id: dataset display name}.
# load_data() takes the dataset id from the text after the last "/" in a
# "Models/<dataset id>" cell and translates it through this table.
# NOTE(review): the "Synthetic Introduction (...)" labels may have been meant
# as "Synthetic Reasoning (...)" -- they are display strings and are kept
# exactly as they were; confirm before changing.
datasets = {
    "question-answering": {
        "xquad_xtreme": "xQUAD EXTREME",
        "mlqa": "MLQA",
    },
    "summarization": {
        "vietnews": "VietNews",
        "wikilingua": "WikiLingua",
    },
    "text-classification": {
        "vsmec": "VSMEC",
        "phoatis": "PhoATIS",
    },
    "toxicity-detection": {
        "victsd": "UIT-ViCTSD",
        "vihsd": "UIT-ViHSD",
    },
    "translation": {
        "phomt-envi": "PhoMT English-Vietnamese",
        "phomt-vien": "PhoMT Vietnamese-English",
        "opus100-envi": "OPUS-100 English-Vietnamese",
        "opus100-vien": "OPUS-100 Vietnamese-English",
    },
    "sentiment-analysis": {
        "vlsp": "VLSP 2016",
        "vsfc": "UIT-VSFC",
    },
    "informationretrieval": {
        "mmarco": "mMARCO",
        "mrobust": "mRobust",
    },
    "knowledge": {
        "zaloe2e": "ZaloE2E",
        "vimmrc": "ViMMRC",
    },
    "language-modelling": {
        "mlqa-mlm": "MLQA",
        "vsec": "VSEC",
    },
    "reasoning": {
        # The synthetic-reasoning labels are irregular (spacing, wording), so
        # they are spelled out one by one.
        "srnatural-azr": "Synthetic Reasoning (Natural) - Azure",
        "srnatural-gcp": "Synthetic Reasoning (Natural) - Google Cloud",
        "srabstract-azr": "Synthetic Reasoning (Abstract Symbol)- Azure",
        "srabstract-gcp": "Synthetic Reasoning (Abstract Symbol)- Google Cloud",
        "srinduction-azr": "Synthetic Reasoning (Induction) - Azure",
        "srinduction-gcp": "Synthetic Reasoning (Induction) - Google Cloud",
        "srpattern-azr": "Synthetic Introduction (Pattern Match) - Azure",
        "srpattern-gcp": "Synthetic Introduction (Pattern Match) - Google Cloud",
        "srsubstitution-azr": "Synthetic Introduction (Variable Substitution) - Azure",
        "srsubstitution-gcp": "Synthetic Introduction (Variable Substitution) - Google Cloud",
        # The MATH entries follow one pattern: every subject for "azr" first,
        # then every subject for "gcp", in the subject order listed below.
        **{
            f"math-{provider_id}-{subject}": f"MATH Level 1 ({subject}) - {provider_name}"
            for provider_id, provider_name in (("azr", "Azure"), ("gcp", "Google Cloud"))
            for subject in (
                "Algebra",
                "Counting&Probability",
                "Geometry",
                "IntermediateAlgebra",
                "NumberTheory",
                "Prealgebra",
                "Precalculus",
            )
        },
    },
}
def _split_sheet_by_dataset(sheet_data, task_id):
    """
    Split one header-less result sheet into per-dataset tables.

    A table starts at a row whose first cell is a string containing
    "Models/"; the text after the last "/" in that cell is the dataset id.
    The table's rows run up to the next such row (or the end of the sheet).

    Args:
        sheet_data: DataFrame of a single sheet, read with ``header=None``.
        task_id: Key into ``datasets`` used to translate dataset ids into
            display names.

    Returns:
        dict mapping dataset display name to a DataFrame whose first column
        is named "Models" and whose missing cells are replaced by "-".

    Raises:
        KeyError: If a dataset id found in the sheet is not listed in
            ``datasets[task_id]``.
    """
    if sheet_data.empty:
        return {}

    first_column = sheet_data.iloc[:, 0]
    # Only string cells can carry the marker. Blank cells are read as NaN
    # (a float), and a bare `"Models/" in cell` would raise TypeError on them.
    marker_rows = [
        position
        for position, cell in enumerate(first_column)
        if isinstance(cell, str) and "Models/" in cell
    ]
    if not marker_rows:
        return {}

    # The first row of the sheet supplies the column names for every table in
    # it. Work on a copy so that renaming the first cell can never write back
    # into `sheet_data` through a view.
    header = sheet_data.iloc[0].copy()
    header.iloc[0] = "Models"

    # Each table ends where the next one starts; the last ends with the sheet.
    block_ends = marker_rows[1:] + [len(sheet_data)]

    tables = {}
    for start, end in zip(marker_rows, block_ends):
        dataset_id = first_column.iloc[start].split("/")[-1]
        dataset_name = datasets[task_id][dataset_id]
        rows = sheet_data.iloc[start + 1:end].fillna("-")
        # Rebuild from raw values so the table gets a fresh 0..n-1 index and
        # the shared column names.
        tables[dataset_name] = pd.DataFrame(rows.values, columns=header)
    return tables


def load_data(file_name):
    """
    Load the evaluation results from an Excel workbook.

    All sheets are read without a header row. For every task in ``tasks`` and
    every setting listed for it in ``task_w_settings``, the sheet named
    "<task_id>-<setting_id>" (or just "<task_id>" when the setting id is
    empty) is split into one table per dataset.

    Args:
        file_name: Workbook to read; passed straight to ``pandas.read_excel``.

    Returns:
        dict keyed by "<task_id>-<setting_id>" whose values are dicts mapping
        dataset display name to DataFrame. The key always contains the dash,
        so a setting with an empty id is stored under "<task_id>-".

    Raises:
        KeyError: If an expected sheet is missing from the workbook, or a
            dataset id found in a sheet is not listed in ``datasets``.
    """
    workbook = pd.read_excel(file_name, sheet_name=None, header=None)

    results = {}
    for task_name, task_id in tasks.items():
        for setting_name in task_w_settings[task_name]:
            setting_id = settings[setting_name]
            sheet_name = f"{task_id}-{setting_id}" if setting_id else task_id
            # The result key keeps the trailing dash for an empty setting id;
            # existing callers look results up under that form.
            results[f"{task_id}-{setting_id}"] = _split_sheet_by_dataset(
                workbook[sheet_name], task_id
            )
    return results
# Load every result table once, when this module is executed.
# NOTE: "resutls" is a misspelling of "results"; the name is left as is
# because other modules may refer to it.
resutls = load_data(file_name=RESULT_FILE)
|