{
  "config_general": {
    "lighteval_sha": "?",
    "num_fewshot_seeds": 1,
    "override_batch_size": -1,
    "max_samples": null,
    "job_id": "",
    "start_time": 6506311.714952203,
    "end_time": 6527604.5975109,
    "total_evaluation_time_secondes": "21292.882558696903",
    "model_name": "taozi555/llama3-Mirage-Walker-8b",
    "model_sha": "f14b1a5faecce896e7f12c601756ed2aa3680cac",
    "model_dtype": "torch.bfloat16",
    "model_size": "15.08 GB",
    "config": null
  },
  "results": {
    "leaderboard|arc:challenge|25": {
      "acc": 0.5767918088737202,
      "acc_stderr": 0.01443803622084802,
      "acc_norm": 0.5810580204778157,
      "acc_norm_stderr": 0.014418106953639013
    },
    "leaderboard|hellaswag|10": {
      "acc": 0.6086436964748058,
      "acc_stderr": 0.004870563921220625,
      "acc_norm": 0.783608842859988,
      "acc_norm_stderr": 0.004109423832097878
    },
    "leaderboard|mmlu:abstract_algebra|5": {
      "acc": 0.43,
      "acc_stderr": 0.049756985195624284
    },
    "leaderboard|mmlu:anatomy|5": {
      "acc": 0.674074074074074,
      "acc_stderr": 0.040491220417025055
    },
    "leaderboard|mmlu:astronomy|5": {
      "acc": 0.7631578947368421,
      "acc_stderr": 0.03459777606810536
    },
    "leaderboard|mmlu:business_ethics|5": {
      "acc": 0.69,
      "acc_stderr": 0.04648231987117316
    },
    "leaderboard|mmlu:clinical_knowledge|5": {
      "acc": 0.7509433962264151,
      "acc_stderr": 0.026616482980501704
    },
    "leaderboard|mmlu:college_biology|5": {
      "acc": 0.8333333333333334,
      "acc_stderr": 0.031164899666948617
    },
    "leaderboard|mmlu:college_chemistry|5": {
      "acc": 0.48,
      "acc_stderr": 0.050211673156867795
    },
    "leaderboard|mmlu:college_computer_science|5": {
      "acc": 0.62,
      "acc_stderr": 0.048783173121456316
    },
    "leaderboard|mmlu:college_mathematics|5": {
      "acc": 0.4,
      "acc_stderr": 0.04923659639173309
    },
    "leaderboard|mmlu:college_medicine|5": {
      "acc": 0.6705202312138728,
      "acc_stderr": 0.03583901754736412
    },
    "leaderboard|mmlu:college_physics|5": {
      "acc": 0.45098039215686275,
      "acc_stderr": 0.049512182523962625
    },
    "leaderboard|mmlu:computer_security|5": {
      "acc": 0.79,
      "acc_stderr": 0.04093601807403326
    },
    "leaderboard|mmlu:conceptual_physics|5": {
      "acc": 0.6085106382978723,
      "acc_stderr": 0.03190701242326812
    },
    "leaderboard|mmlu:econometrics|5": {
      "acc": 0.5,
      "acc_stderr": 0.047036043419179864
    },
    "leaderboard|mmlu:electrical_engineering|5": {
      "acc": 0.6275862068965518,
      "acc_stderr": 0.0402873153294756
    },
    "leaderboard|mmlu:elementary_mathematics|5": {
      "acc": 0.47619047619047616,
      "acc_stderr": 0.02572209706438853
    },
    "leaderboard|mmlu:formal_logic|5": {
      "acc": 0.5476190476190477,
      "acc_stderr": 0.044518079590553275
    },
    "leaderboard|mmlu:global_facts|5": {
      "acc": 0.46,
      "acc_stderr": 0.05009082659620332
    },
    "leaderboard|mmlu:high_school_biology|5": {
      "acc": 0.8129032258064516,
      "acc_stderr": 0.022185710092252252
    },
    "leaderboard|mmlu:high_school_chemistry|5": {
      "acc": 0.5862068965517241,
      "acc_stderr": 0.03465304488406795
    },
    "leaderboard|mmlu:high_school_computer_science|5": {
      "acc": 0.72,
      "acc_stderr": 0.04512608598542127
    },
    "leaderboard|mmlu:high_school_european_history|5": {
      "acc": 0.7454545454545455,
      "acc_stderr": 0.03401506715249039
    },
    "leaderboard|mmlu:high_school_geography|5": {
      "acc": 0.8383838383838383,
      "acc_stderr": 0.02622591986362928
    },
    "leaderboard|mmlu:high_school_government_and_politics|5": {
      "acc": 0.917098445595855,
      "acc_stderr": 0.01989934131572178
    },
    "leaderboard|mmlu:high_school_macroeconomics|5": {
      "acc": 0.6717948717948717,
      "acc_stderr": 0.02380763319865726
    },
    "leaderboard|mmlu:high_school_mathematics|5": {
      "acc": 0.3888888888888889,
      "acc_stderr": 0.029723278961476664
    },
    "leaderboard|mmlu:high_school_microeconomics|5": {
      "acc": 0.7605042016806722,
      "acc_stderr": 0.027722065493361252
    },
    "leaderboard|mmlu:high_school_physics|5": {
      "acc": 0.4304635761589404,
      "acc_stderr": 0.04042809961395634
    },
    "leaderboard|mmlu:high_school_psychology|5": {
      "acc": 0.8642201834862385,
      "acc_stderr": 0.014686907556340022
    },
    "leaderboard|mmlu:high_school_statistics|5": {
      "acc": 0.5416666666666666,
      "acc_stderr": 0.033981108902946366
    },
    "leaderboard|mmlu:high_school_us_history|5": {
      "acc": 0.8578431372549019,
      "acc_stderr": 0.02450980392156861
    },
    "leaderboard|mmlu:high_school_world_history|5": {
      "acc": 0.8396624472573839,
      "acc_stderr": 0.02388438092596567
    },
    "leaderboard|mmlu:human_aging|5": {
      "acc": 0.695067264573991,
      "acc_stderr": 0.030898610882477518
    },
    "leaderboard|mmlu:human_sexuality|5": {
      "acc": 0.7786259541984732,
      "acc_stderr": 0.03641297081313729
    },
    "leaderboard|mmlu:international_law|5": {
      "acc": 0.8264462809917356,
      "acc_stderr": 0.03457272836917669
    },
    "leaderboard|mmlu:jurisprudence|5": {
      "acc": 0.8148148148148148,
      "acc_stderr": 0.03755265865037181
    },
    "leaderboard|mmlu:logical_fallacies|5": {
      "acc": 0.7484662576687117,
      "acc_stderr": 0.034089978868575295
    },
    "leaderboard|mmlu:machine_learning|5": {
      "acc": 0.5089285714285714,
      "acc_stderr": 0.04745033255489123
    },
    "leaderboard|mmlu:management|5": {
      "acc": 0.8349514563106796,
      "acc_stderr": 0.036756688322331886
    },
    "leaderboard|mmlu:marketing|5": {
      "acc": 0.8846153846153846,
      "acc_stderr": 0.020930193185179326
    },
    "leaderboard|mmlu:medical_genetics|5": {
      "acc": 0.81,
      "acc_stderr": 0.03942772444036623
    },
    "leaderboard|mmlu:miscellaneous|5": {
      "acc": 0.8454661558109834,
      "acc_stderr": 0.012925773495095974
    },
    "leaderboard|mmlu:moral_disputes|5": {
      "acc": 0.7254335260115607,
      "acc_stderr": 0.024027745155265012
    },
    "leaderboard|mmlu:moral_scenarios|5": {
      "acc": 0.41787709497206704,
      "acc_stderr": 0.01649540063582008
    },
    "leaderboard|mmlu:nutrition|5": {
      "acc": 0.7712418300653595,
      "acc_stderr": 0.024051029739912248
    },
    "leaderboard|mmlu:philosophy|5": {
      "acc": 0.7427652733118971,
      "acc_stderr": 0.024826171289250888
    },
    "leaderboard|mmlu:prehistory|5": {
      "acc": 0.7438271604938271,
      "acc_stderr": 0.024288533637726095
    },
    "leaderboard|mmlu:professional_accounting|5": {
      "acc": 0.5070921985815603,
      "acc_stderr": 0.02982449855912901
    },
    "leaderboard|mmlu:professional_law|5": {
      "acc": 0.4810951760104302,
      "acc_stderr": 0.012761104871472658
    },
    "leaderboard|mmlu:professional_medicine|5": {
      "acc": 0.75,
      "acc_stderr": 0.026303648393696036
    },
    "leaderboard|mmlu:professional_psychology|5": {
      "acc": 0.6993464052287581,
      "acc_stderr": 0.01855063450295296
    },
    "leaderboard|mmlu:public_relations|5": {
      "acc": 0.6545454545454545,
      "acc_stderr": 0.04554619617541054
    },
    "leaderboard|mmlu:security_studies|5": {
      "acc": 0.7510204081632653,
      "acc_stderr": 0.027682979522960227
    },
    "leaderboard|mmlu:sociology|5": {
      "acc": 0.8308457711442786,
      "acc_stderr": 0.026508590656233268
    },
    "leaderboard|mmlu:us_foreign_policy|5": {
      "acc": 0.86,
      "acc_stderr": 0.03487350880197769
    },
    "leaderboard|mmlu:virology|5": {
      "acc": 0.5120481927710844,
      "acc_stderr": 0.03891364495835817
    },
    "leaderboard|mmlu:world_religions|5": {
      "acc": 0.8245614035087719,
      "acc_stderr": 0.029170885500727665
    },
    "leaderboard|truthfulqa:mc|0": {
      "truthfulqa_mc1": 0.34761321909424725,
      "truthfulqa_mc1_stderr": 0.016670769188897306,
      "truthfulqa_mc2": 0.5156373144783575,
      "truthfulqa_mc2_stderr": 0.015703082442877
    },
    "leaderboard|winogrande|5": {
      "acc": 0.7529597474348856,
      "acc_stderr": 0.012121402942855576
    },
    "leaderboard|gsm8k|5": {
      "qem": 0.6724791508718726,
      "qem_stderr": 0.012927102210426719
    },
    "leaderboard|mmlu:_average|5": {
      "acc": 0.6801243622973331,
      "acc_stderr": 0.03296281402260026
    },
    "all": {
      "acc": 0.6784247317288568,
      "acc_stderr": 0.03183850670621899,
      "acc_norm": 0.6823334316689018,
      "acc_norm_stderr": 0.009263765392868446,
      "truthfulqa_mc1": 0.34761321909424725,
      "truthfulqa_mc1_stderr": 0.016670769188897306,
      "truthfulqa_mc2": 0.5156373144783575,
      "truthfulqa_mc2_stderr": 0.015703082442877,
      "qem": 0.6724791508718726,
      "qem_stderr": 0.012927102210426719
    }
  },
  "versions": {
    "leaderboard|arc:challenge|25": 0,
    "leaderboard|gsm8k|5": 0,
    "leaderboard|hellaswag|10": 0,
    "leaderboard|mmlu:abstract_algebra|5": 0,
    "leaderboard|mmlu:anatomy|5": 0,
    "leaderboard|mmlu:astronomy|5": 0,
    "leaderboard|mmlu:business_ethics|5": 0,
    "leaderboard|mmlu:clinical_knowledge|5": 0,
    "leaderboard|mmlu:college_biology|5": 0,
    "leaderboard|mmlu:college_chemistry|5": 0,
    "leaderboard|mmlu:college_computer_science|5": 0,
    "leaderboard|mmlu:college_mathematics|5": 0,
    "leaderboard|mmlu:college_medicine|5": 0,
    "leaderboard|mmlu:college_physics|5": 0,
    "leaderboard|mmlu:computer_security|5": 0,
    "leaderboard|mmlu:conceptual_physics|5": 0,
    "leaderboard|mmlu:econometrics|5": 0,
    "leaderboard|mmlu:electrical_engineering|5": 0,
    "leaderboard|mmlu:elementary_mathematics|5": 0,
    "leaderboard|mmlu:formal_logic|5": 0,
    "leaderboard|mmlu:global_facts|5": 0,
    "leaderboard|mmlu:high_school_biology|5": 0,
    "leaderboard|mmlu:high_school_chemistry|5": 0,
    "leaderboard|mmlu:high_school_computer_science|5": 0,
    "leaderboard|mmlu:high_school_european_history|5": 0,
    "leaderboard|mmlu:high_school_geography|5": 0,
    "leaderboard|mmlu:high_school_government_and_politics|5": 0,
    "leaderboard|mmlu:high_school_macroeconomics|5": 0,
    "leaderboard|mmlu:high_school_mathematics|5": 0,
    "leaderboard|mmlu:high_school_microeconomics|5": 0,
    "leaderboard|mmlu:high_school_physics|5": 0,
    "leaderboard|mmlu:high_school_psychology|5": 0,
    "leaderboard|mmlu:high_school_statistics|5": 0,
    "leaderboard|mmlu:high_school_us_history|5": 0,
    "leaderboard|mmlu:high_school_world_history|5": 0,
    "leaderboard|mmlu:human_aging|5": 0,
    "leaderboard|mmlu:human_sexuality|5": 0,
    "leaderboard|mmlu:international_law|5": 0,
    "leaderboard|mmlu:jurisprudence|5": 0,
    "leaderboard|mmlu:logical_fallacies|5": 0,
    "leaderboard|mmlu:machine_learning|5": 0,
    "leaderboard|mmlu:management|5": 0,
    "leaderboard|mmlu:marketing|5": 0,
    "leaderboard|mmlu:medical_genetics|5": 0,
    "leaderboard|mmlu:miscellaneous|5": 0,
    "leaderboard|mmlu:moral_disputes|5": 0,
    "leaderboard|mmlu:moral_scenarios|5": 0,
    "leaderboard|mmlu:nutrition|5": 0,
    "leaderboard|mmlu:philosophy|5": 0,
    "leaderboard|mmlu:prehistory|5": 0,
    "leaderboard|mmlu:professional_accounting|5": 0,
    "leaderboard|mmlu:professional_law|5": 0,
    "leaderboard|mmlu:professional_medicine|5": 0,
    "leaderboard|mmlu:professional_psychology|5": 0,
    "leaderboard|mmlu:public_relations|5": 0,
    "leaderboard|mmlu:security_studies|5": 0,
    "leaderboard|mmlu:sociology|5": 0,
    "leaderboard|mmlu:us_foreign_policy|5": 0,
    "leaderboard|mmlu:virology|5": 0,
    "leaderboard|mmlu:world_religions|5": 0,
    "leaderboard|truthfulqa:mc|0": 0,
    "leaderboard|winogrande|5": 0
  },
  "config_tasks": {
    "leaderboard|arc:challenge": {
      "name": "arc:challenge",
      "prompt_function": "arc",
      "hf_repo": "ai2_arc",
      "hf_subset": "ARC-Challenge",
      "metric": [
        "loglikelihood_acc",
        "loglikelihood_acc_norm_nospace"
      ],
      "hf_avail_splits": [
        "train",
        "test"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": null,
      "few_shots_select": "random_sampling_from_train",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "arc"
      ],
      "original_num_docs": 1172,
      "effective_num_docs": 1172,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|gsm8k": {
      "name": "gsm8k",
      "prompt_function": "gsm8k",
      "hf_repo": "gsm8k",
      "hf_subset": "main",
      "metric": [
        "quasi_exact_match_gsm8k"
      ],
      "hf_avail_splits": [
        "train",
        "test"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": null,
      "few_shots_select": "random_sampling_from_train",
      "generation_size": 256,
      "stop_sequence": [
        "Question:",
        "Question",
        ":"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard"
      ],
      "original_num_docs": 1319,
      "effective_num_docs": 1319,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|hellaswag": {
      "name": "hellaswag",
      "prompt_function": "hellaswag_harness",
      "hf_repo": "hellaswag",
      "hf_subset": "default",
      "metric": [
        "loglikelihood_acc",
        "loglikelihood_acc_norm"
      ],
      "hf_avail_splits": [
        "train",
        "test",
        "validation"
      ],
      "evaluation_splits": [
        "validation"
      ],
      "few_shots_split": null,
      "few_shots_select": "random_sampling_from_train",
      "generation_size": -1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard"
      ],
      "original_num_docs": 10042,
      "effective_num_docs": 10042,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:abstract_algebra": {
      "name": "mmlu:abstract_algebra",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "abstract_algebra",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 100,
      "effective_num_docs": 100,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:anatomy": {
      "name": "mmlu:anatomy",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "anatomy",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 135,
      "effective_num_docs": 135,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:astronomy": {
      "name": "mmlu:astronomy",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "astronomy",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 152,
      "effective_num_docs": 152,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:business_ethics": {
      "name": "mmlu:business_ethics",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "business_ethics",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 100,
      "effective_num_docs": 100,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:clinical_knowledge": {
      "name": "mmlu:clinical_knowledge",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "clinical_knowledge",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 265,
      "effective_num_docs": 265,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:college_biology": {
      "name": "mmlu:college_biology",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "college_biology",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 144,
      "effective_num_docs": 144,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:college_chemistry": {
      "name": "mmlu:college_chemistry",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "college_chemistry",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 100,
      "effective_num_docs": 100,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:college_computer_science": {
      "name": "mmlu:college_computer_science",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "college_computer_science",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 100,
      "effective_num_docs": 100,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:college_mathematics": {
      "name": "mmlu:college_mathematics",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "college_mathematics",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 100,
      "effective_num_docs": 100,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:college_medicine": {
      "name": "mmlu:college_medicine",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "college_medicine",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 173,
      "effective_num_docs": 173,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:college_physics": {
      "name": "mmlu:college_physics",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "college_physics",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 102,
      "effective_num_docs": 102,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:computer_security": {
      "name": "mmlu:computer_security",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "computer_security",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 100,
      "effective_num_docs": 100,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:conceptual_physics": {
      "name": "mmlu:conceptual_physics",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "conceptual_physics",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 235,
      "effective_num_docs": 235,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:econometrics": {
      "name": "mmlu:econometrics",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "econometrics",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 114,
      "effective_num_docs": 114,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:electrical_engineering": {
      "name": "mmlu:electrical_engineering",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "electrical_engineering",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 145,
      "effective_num_docs": 145,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:elementary_mathematics": {
      "name": "mmlu:elementary_mathematics",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "elementary_mathematics",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 378,
      "effective_num_docs": 378,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:formal_logic": {
      "name": "mmlu:formal_logic",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "formal_logic",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 126,
      "effective_num_docs": 126,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:global_facts": {
      "name": "mmlu:global_facts",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "global_facts",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 100,
      "effective_num_docs": 100,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:high_school_biology": {
      "name": "mmlu:high_school_biology",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "high_school_biology",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 310,
      "effective_num_docs": 310,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:high_school_chemistry": {
      "name": "mmlu:high_school_chemistry",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "high_school_chemistry",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 203,
      "effective_num_docs": 203,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:high_school_computer_science": {
      "name": "mmlu:high_school_computer_science",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "high_school_computer_science",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 100,
      "effective_num_docs": 100,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:high_school_european_history": {
      "name": "mmlu:high_school_european_history",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "high_school_european_history",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 165,
      "effective_num_docs": 165,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:high_school_geography": {
      "name": "mmlu:high_school_geography",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "high_school_geography",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 198,
      "effective_num_docs": 198,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:high_school_government_and_politics": {
      "name": "mmlu:high_school_government_and_politics",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "high_school_government_and_politics",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 193,
      "effective_num_docs": 193,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:high_school_macroeconomics": {
      "name": "mmlu:high_school_macroeconomics",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "high_school_macroeconomics",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 390,
      "effective_num_docs": 390,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:high_school_mathematics": {
      "name": "mmlu:high_school_mathematics",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "high_school_mathematics",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 270,
      "effective_num_docs": 270,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:high_school_microeconomics": {
      "name": "mmlu:high_school_microeconomics",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "high_school_microeconomics",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 238,
      "effective_num_docs": 238,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:high_school_physics": {
      "name": "mmlu:high_school_physics",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "high_school_physics",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 151,
      "effective_num_docs": 151,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:high_school_psychology": {
      "name": "mmlu:high_school_psychology",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "high_school_psychology",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 545,
      "effective_num_docs": 545,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:high_school_statistics": {
      "name": "mmlu:high_school_statistics",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "high_school_statistics",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 216,
      "effective_num_docs": 216,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:high_school_us_history": {
      "name": "mmlu:high_school_us_history",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "high_school_us_history",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 204,
      "effective_num_docs": 204,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:high_school_world_history": {
      "name": "mmlu:high_school_world_history",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "high_school_world_history",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 237,
      "effective_num_docs": 237,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:human_aging": {
      "name": "mmlu:human_aging",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "human_aging",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 223,
      "effective_num_docs": 223,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:human_sexuality": {
      "name": "mmlu:human_sexuality",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "human_sexuality",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 131,
      "effective_num_docs": 131,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:international_law": {
      "name": "mmlu:international_law",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "international_law",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 121,
      "effective_num_docs": 121,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:jurisprudence": {
      "name": "mmlu:jurisprudence",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "jurisprudence",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 108,
      "effective_num_docs": 108,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:logical_fallacies": {
      "name": "mmlu:logical_fallacies",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "logical_fallacies",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 163,
      "effective_num_docs": 163,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:machine_learning": {
      "name": "mmlu:machine_learning",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "machine_learning",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 112,
      "effective_num_docs": 112,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:management": {
      "name": "mmlu:management",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "management",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 103,
      "effective_num_docs": 103,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:marketing": {
      "name": "mmlu:marketing",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "marketing",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 234,
      "effective_num_docs": 234,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:medical_genetics": {
      "name": "mmlu:medical_genetics",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "medical_genetics",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 100,
      "effective_num_docs": 100,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:miscellaneous": {
      "name": "mmlu:miscellaneous",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "miscellaneous",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 783,
      "effective_num_docs": 783,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:moral_disputes": {
      "name": "mmlu:moral_disputes",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "moral_disputes",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 346,
      "effective_num_docs": 346,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:moral_scenarios": {
      "name": "mmlu:moral_scenarios",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "moral_scenarios",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 895,
      "effective_num_docs": 895,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:nutrition": {
      "name": "mmlu:nutrition",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "nutrition",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 306,
      "effective_num_docs": 306,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:philosophy": {
      "name": "mmlu:philosophy",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "philosophy",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 311,
      "effective_num_docs": 311,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:prehistory": {
      "name": "mmlu:prehistory",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "prehistory",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 324,
      "effective_num_docs": 324,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:professional_accounting": {
      "name": "mmlu:professional_accounting",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "professional_accounting",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 282,
      "effective_num_docs": 282,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:professional_law": {
      "name": "mmlu:professional_law",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "professional_law",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 1534,
      "effective_num_docs": 1534,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:professional_medicine": {
      "name": "mmlu:professional_medicine",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "professional_medicine",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 272,
      "effective_num_docs": 272,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:professional_psychology": {
      "name": "mmlu:professional_psychology",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "professional_psychology",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 612,
      "effective_num_docs": 612,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:public_relations": {
      "name": "mmlu:public_relations",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "public_relations",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 110,
      "effective_num_docs": 110,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:security_studies": {
      "name": "mmlu:security_studies",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "security_studies",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 245,
      "effective_num_docs": 245,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:sociology": {
      "name": "mmlu:sociology",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "sociology",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 201,
      "effective_num_docs": 201,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:us_foreign_policy": {
      "name": "mmlu:us_foreign_policy",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "us_foreign_policy",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 100,
      "effective_num_docs": 100,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:virology": {
      "name": "mmlu:virology",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "virology",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 166,
      "effective_num_docs": 166,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|mmlu:world_religions": {
      "name": "mmlu:world_religions",
      "prompt_function": "mmlu_harness",
      "hf_repo": "lighteval/mmlu",
      "hf_subset": "world_religions",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "auxiliary_train",
        "test",
        "validation",
        "dev"
      ],
      "evaluation_splits": [
        "test"
      ],
      "few_shots_split": "dev",
      "few_shots_select": "sequential",
      "generation_size": 1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard",
        "mmlu"
      ],
      "original_num_docs": 171,
      "effective_num_docs": 171,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|truthfulqa:mc": {
      "name": "truthfulqa:mc",
      "prompt_function": "truthful_qa_multiple_choice",
      "hf_repo": "truthful_qa",
      "hf_subset": "multiple_choice",
      "metric": [
        "truthfulqa_mc_metrics"
      ],
      "hf_avail_splits": [
        "validation"
      ],
      "evaluation_splits": [
        "validation"
      ],
      "few_shots_split": null,
      "few_shots_select": null,
      "generation_size": -1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard"
      ],
      "original_num_docs": 817,
      "effective_num_docs": 817,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    },
    "leaderboard|winogrande": {
      "name": "winogrande",
      "prompt_function": "winogrande",
      "hf_repo": "winogrande",
      "hf_subset": "winogrande_xl",
      "metric": [
        "loglikelihood_acc"
      ],
      "hf_avail_splits": [
        "train",
        "test",
        "validation"
      ],
      "evaluation_splits": [
        "validation"
      ],
      "few_shots_split": null,
      "few_shots_select": "random_sampling",
      "generation_size": -1,
      "stop_sequence": [
        "\n"
      ],
      "output_regex": null,
      "frozen": false,
      "suite": [
        "leaderboard"
      ],
      "original_num_docs": 1267,
      "effective_num_docs": 1267,
      "trust_dataset": true,
      "must_remove_duplicate_docs": null
    }
  },
  "summary_tasks": {
    "leaderboard|arc:challenge|25": {
      "hashes": {
        "hash_examples": "17b0cae357c0259e",
        "hash_full_prompts": "4aeb23a740784b86",
        "hash_input_tokens": "6327b032f3de83c4",
        "hash_cont_tokens": "c77636140035b318"
      },
      "truncated": 0,
      "non_truncated": 1172,
      "padded": 4687,
      "non_padded": 0,
      "effective_few_shots": 25.0,
      "num_truncated_few_shots": 0
    },
    "leaderboard|hellaswag|10": {
      "hashes": {
        "hash_examples": "31985c805c3a737e",
        "hash_full_prompts": "3c2d3440e190b07b",
        "hash_input_tokens": "bb027c2cf1da51d3",
        "hash_cont_tokens": "2d70b9577ac439d0"
      },
      "truncated": 0,
      "non_truncated": 10042,
      "padded": 40105,
      "non_padded": 63,
      "effective_few_shots": 10.0,
      "num_truncated_few_shots": 0
    },
    "leaderboard|mmlu:abstract_algebra|5": {
      "hashes": {
        "hash_examples": "4c76229e00c9c0e9",
        "hash_full_prompts": "faefa0cccb952fe0",
        "hash_input_tokens": "c7100cded1fd23c7",
        "hash_cont_tokens": "a886b3552371a98b"
      },
      "truncated": 0,
      "non_truncated": 100,
      "padded": 400,
      "non_padded": 0,
      "effective_few_shots": 5.0,
      "num_truncated_few_shots": 0
    },
    "leaderboard|mmlu:anatomy|5": {
      "hashes": {
        "hash_examples": "6a1f8104dccbd33b",
        "hash_full_prompts": "eacd03e46972fa59",
        "hash_input_tokens": "66c3858c5e24e62f",
        "hash_cont_tokens": "9be31d13c42ead00"
      },
      "truncated": 0,
      "non_truncated": 135,
      "padded": 540,
      "non_padded": 0,
      "effective_few_shots": 5.0,
      "num_truncated_few_shots": 0
    },
    "leaderboard|mmlu:astronomy|5": {
      "hashes": {
        "hash_examples": "1302effa3a76ce4c",
        "hash_full_prompts": "826cacbdf1f6bfd0",
        "hash_input_tokens": "5c83cc7051903092",
        "hash_cont_tokens": "5da09bc77752f437"
      },
      "truncated": 0,
      "non_truncated": 152,
      "padded": 608,
      "non_padded": 0,
      "effective_few_shots": 5.0,
      "num_truncated_few_shots": 0
    },
    "leaderboard|mmlu:business_ethics|5": {
      "hashes": {
        "hash_examples": "03cb8bce5336419a",
        "hash_full_prompts": "518511169382ac39",
        "hash_input_tokens": "7aeea403244c4473",
        "hash_cont_tokens": "03b2ebbdc5224bb0"
      },
      "truncated": 0,
      "non_truncated": 100,
      "padded": 400,
      "non_padded": 0,
      "effective_few_shots": 5.0,
      "num_truncated_few_shots": 0
    },
    "leaderboard|mmlu:clinical_knowledge|5": {
      "hashes": {
        "hash_examples": "ffbb9c7b2be257f9",
        "hash_full_prompts": "0b07b0bc774fdfd9",
        "hash_input_tokens": "ec0c6a5f110eb99d",
        "hash_cont_tokens": "40dd7263ce5af5de"
      },
      "truncated": 0,
      "non_truncated": 265,
      "padded": 1060,
      "non_padded": 0,
      "effective_few_shots": 5.0,
      "num_truncated_few_shots": 0
    },
    "leaderboard|mmlu:college_biology|5": {
|
"hashes": { |
|
"hash_examples": "3ee77f176f38eb8e", |
|
"hash_full_prompts": "22cbe0e8dabf98b1", |
|
"hash_input_tokens": "98495e6d43b43601", |
|
"hash_cont_tokens": "78048b26c5552ac3" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 144, |
|
"padded": 576, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:college_chemistry|5": { |
|
"hashes": { |
|
"hash_examples": "ce61a69c46d47aeb", |
|
"hash_full_prompts": "9c1288940a4afb59", |
|
"hash_input_tokens": "6d15ae51e4fb0734", |
|
"hash_cont_tokens": "e27ea803720e4f81" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 100, |
|
"padded": 400, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:college_computer_science|5": { |
|
"hashes": { |
|
"hash_examples": "32805b52d7d5daab", |
|
"hash_full_prompts": "9522781d0cdf1a43", |
|
"hash_input_tokens": "d067a9964676ea01", |
|
"hash_cont_tokens": "00f531b5784e741a" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 100, |
|
"padded": 400, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:college_mathematics|5": { |
|
"hashes": { |
|
"hash_examples": "55da1a0a0bd33722", |
|
"hash_full_prompts": "72fe6f46a57e6ca4", |
|
"hash_input_tokens": "cd2d6c5695665f54", |
|
"hash_cont_tokens": "7a6c30f41cc94aa7" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 100, |
|
"padded": 400, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:college_medicine|5": { |
|
"hashes": { |
|
"hash_examples": "c33e143163049176", |
|
"hash_full_prompts": "dee0989b2c8993f4", |
|
"hash_input_tokens": "976ce2b55b7907d5", |
|
"hash_cont_tokens": "5f84bdb85e243e5d" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 173, |
|
"padded": 692, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:college_physics|5": { |
|
"hashes": { |
|
"hash_examples": "ebdab1cdb7e555df", |
|
"hash_full_prompts": "a1be6b64ea1948c3", |
|
"hash_input_tokens": "2bf98ac7bc989c60", |
|
"hash_cont_tokens": "f32a0cc41acb4bf8" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 102, |
|
"padded": 408, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:computer_security|5": { |
|
"hashes": { |
|
"hash_examples": "a24fd7d08a560921", |
|
"hash_full_prompts": "01bc3fdfdefe67a4", |
|
"hash_input_tokens": "239fad08f7e25672", |
|
"hash_cont_tokens": "a886b3552371a98b" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 100, |
|
"padded": 400, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:conceptual_physics|5": { |
|
"hashes": { |
|
"hash_examples": "8300977a79386993", |
|
"hash_full_prompts": "b39315a8ada3ca79", |
|
"hash_input_tokens": "8fd1fa091cf77da8", |
|
"hash_cont_tokens": "6408f70f3d9ada31" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 235, |
|
"padded": 940, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:econometrics|5": { |
|
"hashes": { |
|
"hash_examples": "ddde36788a04a46f", |
|
"hash_full_prompts": "70bab37ca5fcc48f", |
|
"hash_input_tokens": "75797ac68b074a88", |
|
"hash_cont_tokens": "2fab100ce81d11e3" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 114, |
|
"padded": 456, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:electrical_engineering|5": { |
|
"hashes": { |
|
"hash_examples": "acbc5def98c19b3f", |
|
"hash_full_prompts": "86a4747481c11c61", |
|
"hash_input_tokens": "d30b3949f1a869bc", |
|
"hash_cont_tokens": "e75df8f470aa4973" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 145, |
|
"padded": 580, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:elementary_mathematics|5": { |
|
"hashes": { |
|
"hash_examples": "146e61d07497a9bd", |
|
"hash_full_prompts": "1fe56333735325fa", |
|
"hash_input_tokens": "b14ababf1fdaf847", |
|
"hash_cont_tokens": "4ea4b4978c1fb85a" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 378, |
|
"padded": 1512, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:formal_logic|5": { |
|
"hashes": { |
|
"hash_examples": "8635216e1909a03f", |
|
"hash_full_prompts": "cc83c1ede45f974c", |
|
"hash_input_tokens": "0dee944c92ba09fd", |
|
"hash_cont_tokens": "bd7b90f7fcc6628b" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 126, |
|
"padded": 504, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:global_facts|5": { |
|
"hashes": { |
|
"hash_examples": "30b315aa6353ee47", |
|
"hash_full_prompts": "3a2ec1e2785c69a5", |
|
"hash_input_tokens": "5ba3e5396bf746e6", |
|
"hash_cont_tokens": "a886b3552371a98b" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 100, |
|
"padded": 400, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:high_school_biology|5": { |
|
"hashes": { |
|
"hash_examples": "c9136373af2180de", |
|
"hash_full_prompts": "27646a569cf2a6f8", |
|
"hash_input_tokens": "4f3e8567ca1086f0", |
|
"hash_cont_tokens": "d294ad795a4ba989" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 310, |
|
"padded": 1240, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:high_school_chemistry|5": { |
|
"hashes": { |
|
"hash_examples": "b0661bfa1add6404", |
|
"hash_full_prompts": "6905c6ca76f7b2b7", |
|
"hash_input_tokens": "d06720f4af19fcde", |
|
"hash_cont_tokens": "208aff39cfca671a" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 203, |
|
"padded": 812, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:high_school_computer_science|5": { |
|
"hashes": { |
|
"hash_examples": "80fc1d623a3d665f", |
|
"hash_full_prompts": "b80092241e8b6c06", |
|
"hash_input_tokens": "4b42a8ce6184222f", |
|
"hash_cont_tokens": "3b482b98e18c249b" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 100, |
|
"padded": 400, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:high_school_european_history|5": { |
|
"hashes": { |
|
"hash_examples": "854da6e5af0fe1a1", |
|
"hash_full_prompts": "a3bc32a5dc022ce7", |
|
"hash_input_tokens": "9829b92f11e38c39", |
|
"hash_cont_tokens": "7b6f4c22b304c3cc" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 165, |
|
"padded": 656, |
|
"non_padded": 4, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:high_school_geography|5": { |
|
"hashes": { |
|
"hash_examples": "7dc963c7acd19ad8", |
|
"hash_full_prompts": "53f91beae305905d", |
|
"hash_input_tokens": "a6e83c8e9a37451f", |
|
"hash_cont_tokens": "1a85c9e696d91a66" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 198, |
|
"padded": 792, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:high_school_government_and_politics|5": { |
|
"hashes": { |
|
"hash_examples": "1f675dcdebc9758f", |
|
"hash_full_prompts": "623fd7e3495f243f", |
|
"hash_input_tokens": "70d3312474815a5e", |
|
"hash_cont_tokens": "a47a4530b8790081" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 193, |
|
"padded": 772, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:high_school_macroeconomics|5": { |
|
"hashes": { |
|
"hash_examples": "2fb32cf2d80f0b35", |
|
"hash_full_prompts": "378ac13c8abb6c5f", |
|
"hash_input_tokens": "f580d17a3214af15", |
|
"hash_cont_tokens": "e71e7c6acf44c3e5" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 390, |
|
"padded": 1560, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:high_school_mathematics|5": { |
|
"hashes": { |
|
"hash_examples": "fd6646fdb5d58a1f", |
|
"hash_full_prompts": "14d34e0b34750627", |
|
"hash_input_tokens": "361a779f3e9723b0", |
|
"hash_cont_tokens": "0a886cdd21b224a6" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 270, |
|
"padded": 1080, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:high_school_microeconomics|5": { |
|
"hashes": { |
|
"hash_examples": "2118f21f71d87d84", |
|
"hash_full_prompts": "9ac09e5d4da991c9", |
|
"hash_input_tokens": "b5bcfd3df743cee0", |
|
"hash_cont_tokens": "a5f61d5beba13cc2" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 238, |
|
"padded": 952, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:high_school_physics|5": { |
|
"hashes": { |
|
"hash_examples": "dc3ce06378548565", |
|
"hash_full_prompts": "b4832a554d47d224", |
|
"hash_input_tokens": "4caf36cb75ba8552", |
|
"hash_cont_tokens": "c4135c191e57e8e6" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 151, |
|
"padded": 604, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:high_school_psychology|5": { |
|
"hashes": { |
|
"hash_examples": "c8d1d98a40e11f2f", |
|
"hash_full_prompts": "1e8cd27064546274", |
|
"hash_input_tokens": "9f7a7525450c0b5b", |
|
"hash_cont_tokens": "287bec936450f9c6" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 545, |
|
"padded": 2180, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:high_school_statistics|5": { |
|
"hashes": { |
|
"hash_examples": "666c8759b98ee4ff", |
|
"hash_full_prompts": "e05ab41077ec0afa", |
|
"hash_input_tokens": "dbb29057733d0628", |
|
"hash_cont_tokens": "7e446857c7d6d869" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 216, |
|
"padded": 864, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:high_school_us_history|5": { |
|
"hashes": { |
|
"hash_examples": "95fef1c4b7d3f81e", |
|
"hash_full_prompts": "a4b275996a416b4a", |
|
"hash_input_tokens": "d2c8de257e0f76fa", |
|
"hash_cont_tokens": "8b827fc7dfd3c1c5" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 204, |
|
"padded": 816, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:high_school_world_history|5": { |
|
"hashes": { |
|
"hash_examples": "7e5085b6184b0322", |
|
"hash_full_prompts": "8adf16361f0f320a", |
|
"hash_input_tokens": "c5e010d66997c529", |
|
"hash_cont_tokens": "74875ba92d6648af" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 237, |
|
"padded": 948, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:human_aging|5": { |
|
"hashes": { |
|
"hash_examples": "c17333e7c7c10797", |
|
"hash_full_prompts": "918d91a3141aac4d", |
|
"hash_input_tokens": "05e6f5df9e81a997", |
|
"hash_cont_tokens": "ca87074f1dc39668" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 223, |
|
"padded": 892, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:human_sexuality|5": { |
|
"hashes": { |
|
"hash_examples": "4edd1e9045df5e3d", |
|
"hash_full_prompts": "bcee39ecea32fcc8", |
|
"hash_input_tokens": "9604ec0f5616cd26", |
|
"hash_cont_tokens": "491a0ab53f54aeb9" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 131, |
|
"padded": 524, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:international_law|5": { |
|
"hashes": { |
|
"hash_examples": "db2fa00d771a062a", |
|
"hash_full_prompts": "ffe12a3b5bf350c2", |
|
"hash_input_tokens": "727bb86160a250d9", |
|
"hash_cont_tokens": "8c75cab59d57904d" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 121, |
|
"padded": 484, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:jurisprudence|5": { |
|
"hashes": { |
|
"hash_examples": "e956f86b124076fe", |
|
"hash_full_prompts": "b4293c3c08bebaf7", |
|
"hash_input_tokens": "013c7941768fda49", |
|
"hash_cont_tokens": "4c69d7671fa1ab1c" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 108, |
|
"padded": 432, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:logical_fallacies|5": { |
|
"hashes": { |
|
"hash_examples": "956e0e6365ab79f1", |
|
"hash_full_prompts": "8c1b7733e98cbe81", |
|
"hash_input_tokens": "8e4f39d6d98efdc5", |
|
"hash_cont_tokens": "57e78d3d09b7db81" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 163, |
|
"padded": 652, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:machine_learning|5": { |
|
"hashes": { |
|
"hash_examples": "397997cc6f4d581e", |
|
"hash_full_prompts": "24a206a1c639ab8d", |
|
"hash_input_tokens": "202eb581c240b8f3", |
|
"hash_cont_tokens": "8669a529b8d281b3" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 112, |
|
"padded": 448, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:management|5": { |
|
"hashes": { |
|
"hash_examples": "2bcbe6f6ca63d740", |
|
"hash_full_prompts": "77e1c79d988beecc", |
|
"hash_input_tokens": "5349fe24ec6c3315", |
|
"hash_cont_tokens": "79499fecb18f1cb1" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 103, |
|
"padded": 412, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:marketing|5": { |
|
"hashes": { |
|
"hash_examples": "8ddb20d964a1b065", |
|
"hash_full_prompts": "83cec2fa6b681d9d", |
|
"hash_input_tokens": "2d35adb4e63840cc", |
|
"hash_cont_tokens": "c5e9cd86b1a58fac" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 234, |
|
"padded": 936, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:medical_genetics|5": { |
|
"hashes": { |
|
"hash_examples": "182a71f4763d2cea", |
|
"hash_full_prompts": "195eb7ff99749730", |
|
"hash_input_tokens": "012f4687f48a688b", |
|
"hash_cont_tokens": "a886b3552371a98b" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 100, |
|
"padded": 400, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:miscellaneous|5": { |
|
"hashes": { |
|
"hash_examples": "4c404fdbb4ca57fc", |
|
"hash_full_prompts": "33539955c9a96851", |
|
"hash_input_tokens": "4089d35aa35d7c39", |
|
"hash_cont_tokens": "8578b82c42cc7026" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 783, |
|
"padded": 3132, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:moral_disputes|5": { |
|
"hashes": { |
|
"hash_examples": "60cbd2baa3fea5c9", |
|
"hash_full_prompts": "009b7d0e7f819eff", |
|
"hash_input_tokens": "92852a9aaaa68ac1", |
|
"hash_cont_tokens": "26b0f808ec46464d" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 346, |
|
"padded": 1384, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:moral_scenarios|5": { |
|
"hashes": { |
|
"hash_examples": "fd8b0431fbdd75ef", |
|
"hash_full_prompts": "f6e63c9fb9d3bff0", |
|
"hash_input_tokens": "05add168b9a55fbc", |
|
"hash_cont_tokens": "24ce197370bb5b07" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 895, |
|
"padded": 3580, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:nutrition|5": { |
|
"hashes": { |
|
"hash_examples": "71e55e2b829b6528", |
|
"hash_full_prompts": "8294d5e3ad435377", |
|
"hash_input_tokens": "742231f73012b1e2", |
|
"hash_cont_tokens": "4745352f3c85c108" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 306, |
|
"padded": 1224, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:philosophy|5": { |
|
"hashes": { |
|
"hash_examples": "a6d489a8d208fa4b", |
|
"hash_full_prompts": "db68c0f4503e4793", |
|
"hash_input_tokens": "cad5ce61a647bc46", |
|
"hash_cont_tokens": "8c34ab2fa65c3b6e" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 311, |
|
"padded": 1244, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:prehistory|5": { |
|
"hashes": { |
|
"hash_examples": "6cc50f032a19acaa", |
|
"hash_full_prompts": "3972bcfa8c80e964", |
|
"hash_input_tokens": "32a29cc657790558", |
|
"hash_cont_tokens": "ab44396c679556f3" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 324, |
|
"padded": 1296, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:professional_accounting|5": { |
|
"hashes": { |
|
"hash_examples": "50f57ab32f5f6cea", |
|
"hash_full_prompts": "25f0becc2483bd32", |
|
"hash_input_tokens": "cacacb04b2a59c5a", |
|
"hash_cont_tokens": "e3eb8866fd5dce77" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 282, |
|
"padded": 1120, |
|
"non_padded": 8, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:professional_law|5": { |
|
"hashes": { |
|
"hash_examples": "a8fdc85c64f4b215", |
|
"hash_full_prompts": "7a6f6c5706f00c7d", |
|
"hash_input_tokens": "4b463ba71a1b650f", |
|
"hash_cont_tokens": "2ae4ea5b043b942a" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 1534, |
|
"padded": 6136, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:professional_medicine|5": { |
|
"hashes": { |
|
"hash_examples": "c373a28a3050a73a", |
|
"hash_full_prompts": "a74b6ac7c5c545d2", |
|
"hash_input_tokens": "b2744b569a6a32fc", |
|
"hash_cont_tokens": "fc82ad9eca8a7b98" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 272, |
|
"padded": 1088, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:professional_psychology|5": { |
|
"hashes": { |
|
"hash_examples": "bf5254fe818356af", |
|
"hash_full_prompts": "c53fa139ec25f502", |
|
"hash_input_tokens": "3775c049ee940ea3", |
|
"hash_cont_tokens": "0cc4c9bd9df094ef" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 612, |
|
"padded": 2448, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:public_relations|5": { |
|
"hashes": { |
|
"hash_examples": "b66d52e28e7d14e0", |
|
"hash_full_prompts": "55b5eff05aa6bf13", |
|
"hash_input_tokens": "be078a9672a35a48", |
|
"hash_cont_tokens": "680235f5ede0b353" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 110, |
|
"padded": 440, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:security_studies|5": { |
|
"hashes": { |
|
"hash_examples": "514c14feaf000ad9", |
|
"hash_full_prompts": "6690ecdc054f7b0c", |
|
"hash_input_tokens": "3022dd1ffded02a9", |
|
"hash_cont_tokens": "2119792a6103cc24" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 245, |
|
"padded": 980, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:sociology|5": { |
|
"hashes": { |
|
"hash_examples": "f6c9bc9d18c80870", |
|
"hash_full_prompts": "945fbdd091c72d64", |
|
"hash_input_tokens": "4762d7cdcc303fe1", |
|
"hash_cont_tokens": "2178ff937c0c1a29" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 201, |
|
"padded": 804, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:us_foreign_policy|5": { |
|
"hashes": { |
|
"hash_examples": "ed7b78629db6678f", |
|
"hash_full_prompts": "ebba6ea6eca4ae53", |
|
"hash_input_tokens": "880355a94d9fe5b1", |
|
"hash_cont_tokens": "a886b3552371a98b" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 100, |
|
"padded": 392, |
|
"non_padded": 8, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:virology|5": { |
|
"hashes": { |
|
"hash_examples": "bc52ffdc3f9b994a", |
|
"hash_full_prompts": "a2ee4984d6877fe3", |
|
"hash_input_tokens": "65c8ea545351aa14", |
|
"hash_cont_tokens": "ec5c187546c7c842" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 166, |
|
"padded": 660, |
|
"non_padded": 4, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|mmlu:world_religions|5": { |
|
"hashes": { |
|
"hash_examples": "ecdb4a4f94f62930", |
|
"hash_full_prompts": "a89c8dddd1d8ced0", |
|
"hash_input_tokens": "0d36fd4bf3b571e1", |
|
"hash_cont_tokens": "65bc44ac97c3227a" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 171, |
|
"padded": 684, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|truthfulqa:mc|0": { |
|
"hashes": { |
|
"hash_examples": "36a6d90e75d92d4a", |
|
"hash_full_prompts": "8d9ca0a8bd458a1c", |
|
"hash_input_tokens": "89f619d8a8d594e0", |
|
"hash_cont_tokens": "8eaf3b80e9854172" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 817, |
|
"padded": 9996, |
|
"non_padded": 0, |
|
"effective_few_shots": 0.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|winogrande|5": { |
|
"hashes": { |
|
"hash_examples": "087d5d1a1afd4c7b", |
|
"hash_full_prompts": "35da55e47222e0e1", |
|
"hash_input_tokens": "25973bc571721c55", |
|
"hash_cont_tokens": "39be0da00f68561c" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 1267, |
|
"padded": 2534, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"leaderboard|gsm8k|5": { |
|
"hashes": { |
|
"hash_examples": "0ed016e24e7512fd", |
|
"hash_full_prompts": "f7ab209f6467841e", |
|
"hash_input_tokens": "650eb62258948f16", |
|
"hash_cont_tokens": "bd3608724a4cf68d" |
|
}, |
|
"truncated": 1319, |
|
"non_truncated": 0, |
|
"padded": 487, |
|
"non_padded": 832, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
} |
|
}, |
|
"summary_general": { |
|
"hashes": { |
|
"hash_examples": "670666fa3a90ce5d", |
|
"hash_full_prompts": "56c005e427046302", |
|
"hash_input_tokens": "3d48c4bd6b9d4a57", |
|
"hash_cont_tokens": "9c01009736bb767d" |
|
}, |
|
"truncated": 1319, |
|
"non_truncated": 27340, |
|
"padded": 113953, |
|
"non_padded": 919, |
|
"num_truncated_few_shots": 0 |
|
} |
|
} |