|
import os |
|
import json |
|
import glob |
|
from collections import defaultdict |
|
import pandas as pd |
|
import gradio as gr |
|
from content import * |
|
from css import * |
|
import glob |
|
|
|
# Benchmark identifiers. collect_results() matches these against the part of
# each result key that precedes the final "_<lang>" suffix.
AFRIMMLU_DIRECT, AFRIMMLU_TRANSLATE = "afrimmlu_direct", "afrimmlu_translate"
AFRIXNLI_DIRECT, AFRIXNLI_TRANSLATE = "afrixnli_direct", "afrixnli_translate"

# Order matters: collect_results() uses a benchmark's position in this list
# to pick an entry of METRICS.
BENCHMARKS = [
    AFRIMMLU_DIRECT,
    AFRIMMLU_TRANSLATE,
    AFRIXNLI_DIRECT,
    AFRIXNLI_TRANSLATE,
]

# Metric keys looked up in each task's result entry.
METRICS = ["acc", "acc_stderr", "f1"]

# Three-letter language code -> display name shown in the leaderboard.
LANG_NAME = {
    "amh": "Amharic",
    "eng": "English",
    "ewe": "Ewe",
    "fra": "French",
    "hau": "Hausa",
    "ibo": "Igbo",
    "kin": "Kinyarwanda",
    "lin": "Lingala",
    "lug": "Luganda",
    "orm": "Oromo",
    "sna": "Shona",
    "sot": "Sotho",
    "swa": "Swahili",
    "twi": "Twi",
    "wol": "Wolof",
    "xho": "Xhosa",
    "yor": "Yoruba",
    "zul": "Zulu",
}

# All supported language codes, in the same order as LANG_NAME.
LANGS = list(LANG_NAME)
|
|
|
|
|
def collect_results():
    """Load evaluation JSON files and gather per-model, per-language scores.

    Scans ``evals/*/*.json``. A file is skipped when it has no ``results`` or
    ``config`` section, when ``config`` has no ``model_args``, or when
    ``model_args`` does not contain exactly one ``pretrained=...`` entry.

    Every key of ``results`` is split at its last ``_``: the trailing piece is
    taken as the language code and everything before it as the benchmark name.

    Returns:
        A ``(performance_dict, pretrained_models)`` tuple.
        ``performance_dict`` maps ``(model_name, lang)`` to a dict of
        ``{benchmark: score}``, where the score is the metric value multiplied
        by 100 and rounded to one decimal. ``pretrained_models`` is the set of
        model names encountered.

    Raises:
        ValueError: If a result key does not name one of ``BENCHMARKS``.
        KeyError: If a result entry lacks the metric selected for its benchmark.
    """
    performance_dict = defaultdict(dict)
    pretrained_models = set()

    for path in glob.glob('evals/*/*.json'):
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if 'results' not in data or 'config' not in data:
            continue
        results = data['results']
        config = data['config']
        if 'model_args' not in config:
            continue

        pretrained = [
            arg for arg in config['model_args'].split(',')
            if arg.startswith('pretrained=')
        ]
        if len(pretrained) != 1:
            continue
        # Split on the first '=' only, so a value that itself contains '=' is
        # kept whole; then keep just the final path component as model name.
        model_name = pretrained[0].split('=', 1)[1].split('/')[-1]
        pretrained_models.add(model_name)

        for lang_task, perfs in results.items():
            task, _, lang = lang_task.rpartition('_')

            # Explicit check instead of `assert`, which is stripped under -O.
            if task not in BENCHMARKS:
                raise ValueError(
                    f"Unknown benchmark {task!r} in result key {lang_task!r} ({path})"
                )
            if not lang:
                continue

            # NOTE(review): the "- 1" offset makes the first benchmark wrap
            # around to METRICS[-1] ("f1") and maps afrixnli_direct to
            # "acc_stderr". This looks unintended, but the intended
            # benchmark -> metric mapping cannot be determined from this
            # file, so the existing behaviour is preserved. Please confirm.
            metric = METRICS[BENCHMARKS.index(task) - 1]
            performance_dict[(model_name, lang)][task] = round(perfs[metric] * 100, 1)

    return performance_dict, pretrained_models
|
|
|
|
|
def get_leaderboard_df(performance_dict, pretrained_models):
    """Build the leaderboard table from collected scores.

    Args:
        performance_dict: Mapping of ``(model_name, lang_code)`` to a dict of
            ``{benchmark: score}``.
        pretrained_models: Accepted for interface compatibility; not used.

    Returns:
        A ``pandas.DataFrame`` with the columns in ``COLS``, one row per
        ``(model, language)`` pair, sorted descending by language name and
        then by average. Benchmarks with no score are shown as ``0.0``.

    Raises:
        KeyError: If a language code is not present in ``LANG_NAME``.
    """
    benchmarks = (
        AFRIMMLU_DIRECT,
        AFRIMMLU_TRANSLATE,
        AFRIXNLI_DIRECT,
        AFRIXNLI_TRANSLATE,
    )

    rows = []
    for (pretrained, lang), perfs in performance_dict.items():
        lang_name = LANG_NAME[lang]
        scores = [perfs.get(benchmark, 0.0) for benchmark in benchmarks]

        # Average only over benchmarks that were actually evaluated. Counting
        # "non-zero" scores instead would drop a genuine 0.0 result from the
        # average and divide by zero when every score of a row is 0.0.
        evaluated = [perfs[benchmark] for benchmark in benchmarks if benchmark in perfs]
        avg = round(sum(evaluated) / len(evaluated), 1) if evaluated else 0.0

        # Free-text column that the search box filters on.
        notes = ' '.join([pretrained, lang_name])
        rows.append([pretrained, lang_name, lang, avg, *scores, notes])

    df = pd.DataFrame.from_records(rows, columns=COLS)
    df = df.sort_values(by=[LANG_COL, AVERAGE_COL], ascending=False)
    return df[COLS]
|
|
|
|
|
def search_table(df, query):
    """Return the rows of *df* whose notes column contains *query*.

    Matching is case-insensitive and literal: the query is user-typed text,
    so it is not interpreted as a regular expression. Characters such as
    ``(``, ``+`` or ``[`` would otherwise raise, and ``.`` would match any
    character.

    Args:
        df: Leaderboard DataFrame containing the ``NOTES_COL`` column.
        query: Search text; an empty or missing query returns *df* unfiltered.

    Returns:
        The filtered DataFrame.
    """
    if not query:
        return df
    # na=False keeps the mask strictly boolean even if a notes cell is missing.
    matches = df[NOTES_COL].str.contains(query, case=False, regex=False, na=False)
    return df[matches]
|
|
|
|
|
|
|
# Column headers of the leaderboard table.
MODEL_COL = "Model"
LANG_COL = "Language"
CODE_COL = "Code"
AVERAGE_COL = "Average"
AFRIMMLU_DIRECT_COL = "AfriMMLU Direct (0-Shot)"
AFRIMMLU_TRANSLATE_COL = "AfriMMLU Translate (0-Shot)"
AFRIXNLI_DIRECT_COL = "AfriXNLI Direct (0-Shot)"
AFRIXNLI_TRANSLATE_COL = "AfriXNLI Translate (0-Shot)"
NOTES_COL = "Notes"

# One (header, datatype) pair per column, in display order. Keeping both in a
# single table guarantees COLS and TYPES stay the same length and aligned.
_COLUMN_SPECS = (
    (MODEL_COL, "str"),
    (LANG_COL, "str"),
    (CODE_COL, "str"),
    (AVERAGE_COL, "number"),
    (AFRIMMLU_DIRECT_COL, "number"),
    (AFRIMMLU_TRANSLATE_COL, "number"),
    (AFRIXNLI_DIRECT_COL, "number"),
    (AFRIXNLI_TRANSLATE_COL, "number"),
    (NOTES_COL, "str"),
)

COLS = [header for header, _ in _COLUMN_SPECS]
TYPES = [datatype for _, datatype in _COLUMN_SPECS]
|
|
|
# Load all evaluation results once at start-up and build the full table.
args = collect_results()
original_df = get_leaderboard_df(*args)

# Page layout: title and intro text, a search box, the visible leaderboard,
# a hidden unfiltered copy of the table, then credits and citation.
with gr.Blocks(css=CUSTOM_CSS) as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
    gr.Markdown(HOW_TO, elem_classes="markdown-text")

    # NOTE(review): the original indentation was lost; the group is assumed
    # to wrap only the search box - confirm against the intended layout.
    with gr.Group():
        search_bar = gr.Textbox(
            placeholder="Search models and languages...",
            show_label=False,
            elem_id="search-bar",
        )

    leaderboard_table = gr.Dataframe(
        value=original_df,
        headers=COLS,
        datatype=TYPES,
        elem_id="leaderboard-table",
    )

    # Invisible copy holding the complete table. Searches always filter this
    # copy, so narrowing and then widening a query never loses rows.
    hidden_leaderboard_table_for_search = gr.Dataframe(
        value=original_df,
        headers=COLS,
        datatype=TYPES,
        visible=False,
    )

    # Re-filter the visible table every time the search text changes.
    search_bar.change(
        fn=search_table,
        inputs=[hidden_leaderboard_table_for_search, search_bar],
        outputs=leaderboard_table,
    )

    gr.Markdown(CREDIT, elem_classes="markdown-text")
    gr.Markdown(CITATION, elem_classes="markdown-text")

demo.launch()