{ "config_general": { "lighteval_sha": "?", "num_fewshot_seeds": 1, "override_batch_size": -1, "max_samples": null, "job_id": "", "start_time": 6506311.714952203, "end_time": 6527604.5975109, "total_evaluation_time_secondes": "21292.882558696903", "model_name": "taozi555/llama3-Mirage-Walker-8b", "model_sha": "f14b1a5faecce896e7f12c601756ed2aa3680cac", "model_dtype": "torch.bfloat16", "model_size": "15.08 GB", "config": null }, "results": { "leaderboard|arc:challenge|25": { "acc": 0.5767918088737202, "acc_stderr": 0.01443803622084802, "acc_norm": 0.5810580204778157, "acc_norm_stderr": 0.014418106953639013 }, "leaderboard|hellaswag|10": { "acc": 0.6086436964748058, "acc_stderr": 0.004870563921220625, "acc_norm": 0.783608842859988, "acc_norm_stderr": 0.004109423832097878 }, "leaderboard|mmlu:abstract_algebra|5": { "acc": 0.43, "acc_stderr": 0.049756985195624284 }, "leaderboard|mmlu:anatomy|5": { "acc": 0.674074074074074, "acc_stderr": 0.040491220417025055 }, "leaderboard|mmlu:astronomy|5": { "acc": 0.7631578947368421, "acc_stderr": 0.03459777606810536 }, "leaderboard|mmlu:business_ethics|5": { "acc": 0.69, "acc_stderr": 0.04648231987117316 }, "leaderboard|mmlu:clinical_knowledge|5": { "acc": 0.7509433962264151, "acc_stderr": 0.026616482980501704 }, "leaderboard|mmlu:college_biology|5": { "acc": 0.8333333333333334, "acc_stderr": 0.031164899666948617 }, "leaderboard|mmlu:college_chemistry|5": { "acc": 0.48, "acc_stderr": 0.050211673156867795 }, "leaderboard|mmlu:college_computer_science|5": { "acc": 0.62, "acc_stderr": 0.048783173121456316 }, "leaderboard|mmlu:college_mathematics|5": { "acc": 0.4, "acc_stderr": 0.04923659639173309 }, "leaderboard|mmlu:college_medicine|5": { "acc": 0.6705202312138728, "acc_stderr": 0.03583901754736412 }, "leaderboard|mmlu:college_physics|5": { "acc": 0.45098039215686275, "acc_stderr": 0.049512182523962625 }, "leaderboard|mmlu:computer_security|5": { "acc": 0.79, "acc_stderr": 0.04093601807403326 }, "leaderboard|mmlu:conceptual_physics|5": { "acc": 0.6085106382978723, "acc_stderr": 0.03190701242326812 }, "leaderboard|mmlu:econometrics|5": { "acc": 0.5, "acc_stderr": 0.047036043419179864 }, "leaderboard|mmlu:electrical_engineering|5": { "acc": 0.6275862068965518, "acc_stderr": 0.0402873153294756 }, "leaderboard|mmlu:elementary_mathematics|5": { "acc": 0.47619047619047616, "acc_stderr": 0.02572209706438853 }, "leaderboard|mmlu:formal_logic|5": { "acc": 0.5476190476190477, "acc_stderr": 0.044518079590553275 }, "leaderboard|mmlu:global_facts|5": { "acc": 0.46, "acc_stderr": 0.05009082659620332 }, "leaderboard|mmlu:high_school_biology|5": { "acc": 0.8129032258064516, "acc_stderr": 0.022185710092252252 }, "leaderboard|mmlu:high_school_chemistry|5": { "acc": 0.5862068965517241, "acc_stderr": 0.03465304488406795 }, "leaderboard|mmlu:high_school_computer_science|5": { "acc": 0.72, "acc_stderr": 0.04512608598542127 }, "leaderboard|mmlu:high_school_european_history|5": { "acc": 0.7454545454545455, "acc_stderr": 0.03401506715249039 }, "leaderboard|mmlu:high_school_geography|5": { "acc": 0.8383838383838383, "acc_stderr": 0.02622591986362928 }, "leaderboard|mmlu:high_school_government_and_politics|5": { "acc": 0.917098445595855, "acc_stderr": 0.01989934131572178 }, "leaderboard|mmlu:high_school_macroeconomics|5": { "acc": 0.6717948717948717, "acc_stderr": 0.02380763319865726 }, "leaderboard|mmlu:high_school_mathematics|5": { "acc": 0.3888888888888889, "acc_stderr": 0.029723278961476664 }, "leaderboard|mmlu:high_school_microeconomics|5": { "acc": 0.7605042016806722, "acc_stderr": 0.027722065493361252 }, "leaderboard|mmlu:high_school_physics|5": { "acc": 0.4304635761589404, "acc_stderr": 0.04042809961395634 }, "leaderboard|mmlu:high_school_psychology|5": { "acc": 0.8642201834862385, "acc_stderr": 0.014686907556340022 }, "leaderboard|mmlu:high_school_statistics|5": { "acc": 0.5416666666666666, "acc_stderr": 0.033981108902946366 }, "leaderboard|mmlu:high_school_us_history|5": { "acc": 0.8578431372549019, "acc_stderr": 0.02450980392156861 }, "leaderboard|mmlu:high_school_world_history|5": { "acc": 0.8396624472573839, "acc_stderr": 0.02388438092596567 }, "leaderboard|mmlu:human_aging|5": { "acc": 0.695067264573991, "acc_stderr": 0.030898610882477518 }, "leaderboard|mmlu:human_sexuality|5": { "acc": 0.7786259541984732, "acc_stderr": 0.03641297081313729 }, "leaderboard|mmlu:international_law|5": { "acc": 0.8264462809917356, "acc_stderr": 0.03457272836917669 }, "leaderboard|mmlu:jurisprudence|5": { "acc": 0.8148148148148148, "acc_stderr": 0.03755265865037181 }, "leaderboard|mmlu:logical_fallacies|5": { "acc": 0.7484662576687117, "acc_stderr": 0.034089978868575295 }, "leaderboard|mmlu:machine_learning|5": { "acc": 0.5089285714285714, "acc_stderr": 0.04745033255489123 }, "leaderboard|mmlu:management|5": { "acc": 0.8349514563106796, "acc_stderr": 0.036756688322331886 }, "leaderboard|mmlu:marketing|5": { "acc": 0.8846153846153846, "acc_stderr": 0.020930193185179326 }, "leaderboard|mmlu:medical_genetics|5": { "acc": 0.81, "acc_stderr": 0.03942772444036623 }, "leaderboard|mmlu:miscellaneous|5": { "acc": 0.8454661558109834, "acc_stderr": 0.012925773495095974 }, "leaderboard|mmlu:moral_disputes|5": { "acc": 0.7254335260115607, "acc_stderr": 0.024027745155265012 }, "leaderboard|mmlu:moral_scenarios|5": { "acc": 0.41787709497206704, "acc_stderr": 0.01649540063582008 }, "leaderboard|mmlu:nutrition|5": { "acc": 0.7712418300653595, "acc_stderr": 0.024051029739912248 }, "leaderboard|mmlu:philosophy|5": { "acc": 0.7427652733118971, "acc_stderr": 0.024826171289250888 }, "leaderboard|mmlu:prehistory|5": { "acc": 0.7438271604938271, "acc_stderr": 0.024288533637726095 }, "leaderboard|mmlu:professional_accounting|5": { "acc": 0.5070921985815603, "acc_stderr": 0.02982449855912901 }, "leaderboard|mmlu:professional_law|5": { "acc": 0.4810951760104302, "acc_stderr": 0.012761104871472658 }, "leaderboard|mmlu:professional_medicine|5": { "acc": 0.75, "acc_stderr": 0.026303648393696036 }, "leaderboard|mmlu:professional_psychology|5": { "acc": 0.6993464052287581, "acc_stderr": 0.01855063450295296 }, "leaderboard|mmlu:public_relations|5": { "acc": 0.6545454545454545, "acc_stderr": 0.04554619617541054 }, "leaderboard|mmlu:security_studies|5": { "acc": 0.7510204081632653, "acc_stderr": 0.027682979522960227 }, "leaderboard|mmlu:sociology|5": { "acc": 0.8308457711442786, "acc_stderr": 0.026508590656233268 }, "leaderboard|mmlu:us_foreign_policy|5": { "acc": 0.86, "acc_stderr": 0.03487350880197769 }, "leaderboard|mmlu:virology|5": { "acc": 0.5120481927710844, "acc_stderr": 0.03891364495835817 }, "leaderboard|mmlu:world_religions|5": { "acc": 0.8245614035087719, "acc_stderr": 0.029170885500727665 }, "leaderboard|truthfulqa:mc|0": { "truthfulqa_mc1": 0.34761321909424725, "truthfulqa_mc1_stderr": 0.016670769188897306, "truthfulqa_mc2": 0.5156373144783575, "truthfulqa_mc2_stderr": 0.015703082442877 }, "leaderboard|winogrande|5": { "acc": 0.7529597474348856, "acc_stderr": 0.012121402942855576 }, "leaderboard|gsm8k|5": { "qem": 0.6724791508718726, "qem_stderr": 0.012927102210426719 }, "leaderboard|mmlu:_average|5": { "acc": 0.6801243622973331, "acc_stderr": 0.03296281402260026 }, "all": { "acc": 0.6784247317288568, "acc_stderr": 0.03183850670621899, "acc_norm": 0.6823334316689018, "acc_norm_stderr": 0.009263765392868446, "truthfulqa_mc1": 0.34761321909424725, "truthfulqa_mc1_stderr": 0.016670769188897306, "truthfulqa_mc2": 0.5156373144783575, "truthfulqa_mc2_stderr": 0.015703082442877, "qem": 0.6724791508718726, "qem_stderr": 0.012927102210426719 } }, "versions": { "leaderboard|arc:challenge|25": 0, "leaderboard|gsm8k|5": 0, "leaderboard|hellaswag|10": 0, "leaderboard|mmlu:abstract_algebra|5": 0, "leaderboard|mmlu:anatomy|5": 0, "leaderboard|mmlu:astronomy|5": 0, "leaderboard|mmlu:business_ethics|5": 0, "leaderboard|mmlu:clinical_knowledge|5": 0, "leaderboard|mmlu:college_biology|5": 0, "leaderboard|mmlu:college_chemistry|5": 0, "leaderboard|mmlu:college_computer_science|5": 0, "leaderboard|mmlu:college_mathematics|5": 0, "leaderboard|mmlu:college_medicine|5": 0, "leaderboard|mmlu:college_physics|5": 0, "leaderboard|mmlu:computer_security|5": 0, "leaderboard|mmlu:conceptual_physics|5": 0, "leaderboard|mmlu:econometrics|5": 0, "leaderboard|mmlu:electrical_engineering|5": 0, "leaderboard|mmlu:elementary_mathematics|5": 0, "leaderboard|mmlu:formal_logic|5": 0, "leaderboard|mmlu:global_facts|5": 0, "leaderboard|mmlu:high_school_biology|5": 0, "leaderboard|mmlu:high_school_chemistry|5": 0, "leaderboard|mmlu:high_school_computer_science|5": 0, "leaderboard|mmlu:high_school_european_history|5": 0, "leaderboard|mmlu:high_school_geography|5": 0, "leaderboard|mmlu:high_school_government_and_politics|5": 0, "leaderboard|mmlu:high_school_macroeconomics|5": 0, "leaderboard|mmlu:high_school_mathematics|5": 0, "leaderboard|mmlu:high_school_microeconomics|5": 0, "leaderboard|mmlu:high_school_physics|5": 0, "leaderboard|mmlu:high_school_psychology|5": 0, "leaderboard|mmlu:high_school_statistics|5": 0, "leaderboard|mmlu:high_school_us_history|5": 0, "leaderboard|mmlu:high_school_world_history|5": 0, "leaderboard|mmlu:human_aging|5": 0, "leaderboard|mmlu:human_sexuality|5": 0, "leaderboard|mmlu:international_law|5": 0, "leaderboard|mmlu:jurisprudence|5": 0, "leaderboard|mmlu:logical_fallacies|5": 0, "leaderboard|mmlu:machine_learning|5": 0, "leaderboard|mmlu:management|5": 0, "leaderboard|mmlu:marketing|5": 0, "leaderboard|mmlu:medical_genetics|5": 0, "leaderboard|mmlu:miscellaneous|5": 0, "leaderboard|mmlu:moral_disputes|5": 0, "leaderboard|mmlu:moral_scenarios|5": 0, "leaderboard|mmlu:nutrition|5": 0, "leaderboard|mmlu:philosophy|5": 0, "leaderboard|mmlu:prehistory|5": 0, "leaderboard|mmlu:professional_accounting|5": 0, "leaderboard|mmlu:professional_law|5": 0, "leaderboard|mmlu:professional_medicine|5": 0, "leaderboard|mmlu:professional_psychology|5": 0, "leaderboard|mmlu:public_relations|5": 0, "leaderboard|mmlu:security_studies|5": 0, "leaderboard|mmlu:sociology|5": 0, "leaderboard|mmlu:us_foreign_policy|5": 0, "leaderboard|mmlu:virology|5": 0, "leaderboard|mmlu:world_religions|5": 0, "leaderboard|truthfulqa:mc|0": 0, "leaderboard|winogrande|5": 0 }, "config_tasks": { "leaderboard|arc:challenge": { "name": "arc:challenge", "prompt_function": "arc", "hf_repo": "ai2_arc", "hf_subset": "ARC-Challenge", "metric": [ "loglikelihood_acc", "loglikelihood_acc_norm_nospace" ], "hf_avail_splits": [ "train", "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": "random_sampling_from_train", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "arc" ], "original_num_docs": 1172, "effective_num_docs": 1172, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|gsm8k": { "name": "gsm8k", "prompt_function": "gsm8k", "hf_repo": "gsm8k", "hf_subset": "main", "metric": [ "quasi_exact_match_gsm8k" ], "hf_avail_splits": [ "train", "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": "random_sampling_from_train", "generation_size": 256, "stop_sequence": [ "Question:", "Question", ":" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard" ], "original_num_docs": 1319, "effective_num_docs": 1319, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|hellaswag": { "name": "hellaswag", "prompt_function": "hellaswag_harness", "hf_repo": "hellaswag", "hf_subset": "default", "metric": [ "loglikelihood_acc", "loglikelihood_acc_norm" ], "hf_avail_splits": [ "train", "test", "validation" ], "evaluation_splits": [ "validation" ], "few_shots_split": null, "few_shots_select": "random_sampling_from_train", "generation_size": -1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard" ], "original_num_docs": 10042, "effective_num_docs": 10042, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:abstract_algebra": { "name": "mmlu:abstract_algebra", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "abstract_algebra", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:anatomy": { "name": "mmlu:anatomy", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "anatomy", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 135, "effective_num_docs": 135, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:astronomy": { "name": "mmlu:astronomy", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "astronomy", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 152, "effective_num_docs": 152, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:business_ethics": { "name": "mmlu:business_ethics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "business_ethics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:clinical_knowledge": { "name": "mmlu:clinical_knowledge", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "clinical_knowledge", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 265, "effective_num_docs": 265, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:college_biology": { "name": "mmlu:college_biology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_biology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 144, "effective_num_docs": 144, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:college_chemistry": { "name": "mmlu:college_chemistry", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_chemistry", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:college_computer_science": { "name": "mmlu:college_computer_science", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_computer_science", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:college_mathematics": { "name": "mmlu:college_mathematics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_mathematics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:college_medicine": { "name": "mmlu:college_medicine", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_medicine", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 173, "effective_num_docs": 173, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:college_physics": { "name": "mmlu:college_physics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_physics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 102, "effective_num_docs": 102, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:computer_security": { "name": "mmlu:computer_security", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "computer_security", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:conceptual_physics": { "name": "mmlu:conceptual_physics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "conceptual_physics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 235, "effective_num_docs": 235, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:econometrics": { "name": "mmlu:econometrics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "econometrics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 114, "effective_num_docs": 114, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:electrical_engineering": { "name": "mmlu:electrical_engineering", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "electrical_engineering", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 145, "effective_num_docs": 145, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:elementary_mathematics": { "name": "mmlu:elementary_mathematics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "elementary_mathematics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 378, "effective_num_docs": 378, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:formal_logic": { "name": "mmlu:formal_logic", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "formal_logic", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 126, "effective_num_docs": 126, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:global_facts": { "name": "mmlu:global_facts", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "global_facts", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_biology": { "name": "mmlu:high_school_biology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_biology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 310, "effective_num_docs": 310, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_chemistry": { "name": "mmlu:high_school_chemistry", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_chemistry", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 203, "effective_num_docs": 203, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_computer_science": { "name": "mmlu:high_school_computer_science", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_computer_science", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_european_history": { "name": "mmlu:high_school_european_history", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_european_history", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 165, "effective_num_docs": 165, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_geography": { "name": "mmlu:high_school_geography", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_geography", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 198, "effective_num_docs": 198, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_government_and_politics": { "name": "mmlu:high_school_government_and_politics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_government_and_politics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 193, "effective_num_docs": 193, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_macroeconomics": { "name": "mmlu:high_school_macroeconomics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_macroeconomics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 390, "effective_num_docs": 390, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_mathematics": { "name": "mmlu:high_school_mathematics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_mathematics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 270, "effective_num_docs": 270, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_microeconomics": { "name": "mmlu:high_school_microeconomics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_microeconomics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 238, "effective_num_docs": 238, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_physics": { "name": "mmlu:high_school_physics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_physics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 151, "effective_num_docs": 151, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_psychology": { "name": "mmlu:high_school_psychology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_psychology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 545, "effective_num_docs": 545, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_statistics": { "name": "mmlu:high_school_statistics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_statistics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 216, "effective_num_docs": 216, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_us_history": { "name": "mmlu:high_school_us_history", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_us_history", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 204, "effective_num_docs": 204, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_world_history": { "name": "mmlu:high_school_world_history", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_world_history", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 237, "effective_num_docs": 237, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:human_aging": { "name": "mmlu:human_aging", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "human_aging", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 223, "effective_num_docs": 223, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:human_sexuality": { "name": "mmlu:human_sexuality", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "human_sexuality", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 131, "effective_num_docs": 131, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:international_law": { "name": "mmlu:international_law", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "international_law", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 121, "effective_num_docs": 121, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:jurisprudence": { "name": "mmlu:jurisprudence", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "jurisprudence", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 108, "effective_num_docs": 108, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:logical_fallacies": { "name": "mmlu:logical_fallacies", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "logical_fallacies", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 163, "effective_num_docs": 163, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:machine_learning": { "name": "mmlu:machine_learning", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "machine_learning", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 112, "effective_num_docs": 112, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:management": { "name": "mmlu:management", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "management", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 103, "effective_num_docs": 103, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:marketing": { "name": "mmlu:marketing", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "marketing", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 234, "effective_num_docs": 234, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:medical_genetics": { "name": "mmlu:medical_genetics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "medical_genetics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:miscellaneous": { "name": "mmlu:miscellaneous", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "miscellaneous", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 783, "effective_num_docs": 783, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:moral_disputes": { "name": "mmlu:moral_disputes", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "moral_disputes", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 346, "effective_num_docs": 346, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:moral_scenarios": { "name": "mmlu:moral_scenarios", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "moral_scenarios", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 895, "effective_num_docs": 895, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:nutrition": { "name": "mmlu:nutrition", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "nutrition", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 306, "effective_num_docs": 306, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:philosophy": { "name": "mmlu:philosophy", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "philosophy", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 311, "effective_num_docs": 311, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:prehistory": { "name": "mmlu:prehistory", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "prehistory", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 324, "effective_num_docs": 324, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:professional_accounting": { "name": "mmlu:professional_accounting", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "professional_accounting", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 282, "effective_num_docs": 282, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:professional_law": { "name": "mmlu:professional_law", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "professional_law", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 1534, "effective_num_docs": 1534, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:professional_medicine": { "name": "mmlu:professional_medicine", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "professional_medicine", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 272, "effective_num_docs": 272, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:professional_psychology": { "name": "mmlu:professional_psychology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "professional_psychology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 612, "effective_num_docs": 612, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:public_relations": { "name": "mmlu:public_relations", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "public_relations", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 110, "effective_num_docs": 110, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:security_studies": { "name": "mmlu:security_studies", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "security_studies", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 245, "effective_num_docs": 245, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:sociology": { "name": "mmlu:sociology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "sociology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 201, "effective_num_docs": 201, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:us_foreign_policy": { "name": "mmlu:us_foreign_policy", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "us_foreign_policy", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:virology": { "name": "mmlu:virology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "virology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 166, "effective_num_docs": 166, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:world_religions": { "name": "mmlu:world_religions", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "world_religions", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 171, "effective_num_docs": 171, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|truthfulqa:mc": { "name": "truthfulqa:mc", "prompt_function": "truthful_qa_multiple_choice", "hf_repo": "truthful_qa", "hf_subset": "multiple_choice", "metric": [ "truthfulqa_mc_metrics" ], "hf_avail_splits": [ "validation" ], "evaluation_splits": [ "validation" ], "few_shots_split": null, "few_shots_select": null, "generation_size": -1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard" ], "original_num_docs": 817, "effective_num_docs": 817, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|winogrande": { "name": "winogrande", "prompt_function": "winogrande", "hf_repo": "winogrande", "hf_subset": "winogrande_xl", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "train", "test", "validation" ], "evaluation_splits": [ "validation" ], "few_shots_split": null, "few_shots_select": "random_sampling", "generation_size": -1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard" ], "original_num_docs": 1267, "effective_num_docs": 1267, "trust_dataset": true, "must_remove_duplicate_docs": null } }, "summary_tasks": { "leaderboard|arc:challenge|25": { "hashes": { "hash_examples": "17b0cae357c0259e", "hash_full_prompts": "4aeb23a740784b86", "hash_input_tokens": "6327b032f3de83c4", "hash_cont_tokens": "c77636140035b318" }, "truncated": 0, "non_truncated": 1172, "padded": 4687, "non_padded": 0, "effective_few_shots": 25.0, "num_truncated_few_shots": 0 }, "leaderboard|hellaswag|10": { "hashes": { "hash_examples": "31985c805c3a737e", "hash_full_prompts": "3c2d3440e190b07b", "hash_input_tokens": "bb027c2cf1da51d3", "hash_cont_tokens": "2d70b9577ac439d0" }, "truncated": 0, "non_truncated": 10042, "padded": 40105, "non_padded": 63, "effective_few_shots": 10.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:abstract_algebra|5": { "hashes": { "hash_examples": "4c76229e00c9c0e9", "hash_full_prompts": "faefa0cccb952fe0", "hash_input_tokens": "c7100cded1fd23c7", "hash_cont_tokens": "a886b3552371a98b" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:anatomy|5": { "hashes": { "hash_examples": "6a1f8104dccbd33b", "hash_full_prompts": "eacd03e46972fa59", "hash_input_tokens": "66c3858c5e24e62f", "hash_cont_tokens": "9be31d13c42ead00" }, "truncated": 0, "non_truncated": 135, "padded": 540, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:astronomy|5": { "hashes": { "hash_examples": "1302effa3a76ce4c", "hash_full_prompts": "826cacbdf1f6bfd0", "hash_input_tokens": "5c83cc7051903092", "hash_cont_tokens": "5da09bc77752f437" }, "truncated": 0, "non_truncated": 152, "padded": 608, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:business_ethics|5": { "hashes": { "hash_examples": "03cb8bce5336419a", "hash_full_prompts": "518511169382ac39", "hash_input_tokens": "7aeea403244c4473", "hash_cont_tokens": "03b2ebbdc5224bb0" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:clinical_knowledge|5": { "hashes": { "hash_examples": "ffbb9c7b2be257f9", "hash_full_prompts": "0b07b0bc774fdfd9", "hash_input_tokens": "ec0c6a5f110eb99d", "hash_cont_tokens": "40dd7263ce5af5de" }, "truncated": 0, "non_truncated": 265, "padded": 1060, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:college_biology|5": { "hashes": { "hash_examples": "3ee77f176f38eb8e", "hash_full_prompts": "22cbe0e8dabf98b1", "hash_input_tokens": "98495e6d43b43601", "hash_cont_tokens": "78048b26c5552ac3" }, "truncated": 0, "non_truncated": 144, "padded": 576, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:college_chemistry|5": { "hashes": { "hash_examples": "ce61a69c46d47aeb", "hash_full_prompts": "9c1288940a4afb59", "hash_input_tokens": "6d15ae51e4fb0734", "hash_cont_tokens": "e27ea803720e4f81" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:college_computer_science|5": { "hashes": { "hash_examples": "32805b52d7d5daab", "hash_full_prompts": "9522781d0cdf1a43", "hash_input_tokens": "d067a9964676ea01", "hash_cont_tokens": "00f531b5784e741a" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:college_mathematics|5": { "hashes": { "hash_examples": "55da1a0a0bd33722", "hash_full_prompts": "72fe6f46a57e6ca4", "hash_input_tokens": "cd2d6c5695665f54", "hash_cont_tokens": "7a6c30f41cc94aa7" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:college_medicine|5": { "hashes": { "hash_examples": "c33e143163049176", "hash_full_prompts": "dee0989b2c8993f4", "hash_input_tokens": "976ce2b55b7907d5", "hash_cont_tokens": "5f84bdb85e243e5d" }, "truncated": 0, "non_truncated": 173, "padded": 692, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:college_physics|5": { "hashes": { "hash_examples": "ebdab1cdb7e555df", "hash_full_prompts": "a1be6b64ea1948c3", "hash_input_tokens": "2bf98ac7bc989c60", "hash_cont_tokens": "f32a0cc41acb4bf8" }, "truncated": 0, "non_truncated": 102, "padded": 408, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:computer_security|5": { "hashes": { "hash_examples": "a24fd7d08a560921", "hash_full_prompts": "01bc3fdfdefe67a4", "hash_input_tokens": "239fad08f7e25672", "hash_cont_tokens": "a886b3552371a98b" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:conceptual_physics|5": { "hashes": { "hash_examples": "8300977a79386993", "hash_full_prompts": "b39315a8ada3ca79", "hash_input_tokens": "8fd1fa091cf77da8", "hash_cont_tokens": "6408f70f3d9ada31" }, "truncated": 0, "non_truncated": 235, "padded": 940, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:econometrics|5": { "hashes": { "hash_examples": "ddde36788a04a46f", "hash_full_prompts": "70bab37ca5fcc48f", "hash_input_tokens": "75797ac68b074a88", "hash_cont_tokens": "2fab100ce81d11e3" }, "truncated": 0, "non_truncated": 114, "padded": 456, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:electrical_engineering|5": { "hashes": { "hash_examples": "acbc5def98c19b3f", "hash_full_prompts": "86a4747481c11c61", "hash_input_tokens": "d30b3949f1a869bc", "hash_cont_tokens": "e75df8f470aa4973" }, "truncated": 0, "non_truncated": 145, "padded": 580, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:elementary_mathematics|5": { "hashes": { "hash_examples": "146e61d07497a9bd", "hash_full_prompts": "1fe56333735325fa", "hash_input_tokens": "b14ababf1fdaf847", "hash_cont_tokens": "4ea4b4978c1fb85a" }, "truncated": 0, "non_truncated": 378, "padded": 1512, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:formal_logic|5": { "hashes": { "hash_examples": "8635216e1909a03f", "hash_full_prompts": "cc83c1ede45f974c", "hash_input_tokens": "0dee944c92ba09fd", "hash_cont_tokens": "bd7b90f7fcc6628b" }, "truncated": 0, "non_truncated": 126, "padded": 504, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:global_facts|5": { "hashes": { "hash_examples": "30b315aa6353ee47", "hash_full_prompts": "3a2ec1e2785c69a5", "hash_input_tokens": "5ba3e5396bf746e6", "hash_cont_tokens": "a886b3552371a98b" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_biology|5": { "hashes": { "hash_examples": "c9136373af2180de", "hash_full_prompts": "27646a569cf2a6f8", "hash_input_tokens": "4f3e8567ca1086f0", "hash_cont_tokens": "d294ad795a4ba989" }, "truncated": 0, "non_truncated": 310, "padded": 1240, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_chemistry|5": { "hashes": { "hash_examples": "b0661bfa1add6404", "hash_full_prompts": "6905c6ca76f7b2b7", "hash_input_tokens": "d06720f4af19fcde", "hash_cont_tokens": "208aff39cfca671a" }, "truncated": 0, "non_truncated": 203, "padded": 812, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_computer_science|5": { "hashes": { "hash_examples": "80fc1d623a3d665f", "hash_full_prompts": "b80092241e8b6c06", "hash_input_tokens": "4b42a8ce6184222f", "hash_cont_tokens": "3b482b98e18c249b" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_european_history|5": { "hashes": { "hash_examples": "854da6e5af0fe1a1", "hash_full_prompts": "a3bc32a5dc022ce7", "hash_input_tokens": "9829b92f11e38c39", "hash_cont_tokens": "7b6f4c22b304c3cc" }, "truncated": 0, "non_truncated": 165, "padded": 656, "non_padded": 4, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_geography|5": { "hashes": { "hash_examples": "7dc963c7acd19ad8", "hash_full_prompts": "53f91beae305905d", "hash_input_tokens": "a6e83c8e9a37451f", "hash_cont_tokens": "1a85c9e696d91a66" }, "truncated": 0, "non_truncated": 198, "padded": 792, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_government_and_politics|5": { "hashes": { "hash_examples": "1f675dcdebc9758f", "hash_full_prompts": "623fd7e3495f243f", "hash_input_tokens": "70d3312474815a5e", "hash_cont_tokens": "a47a4530b8790081" }, "truncated": 0, "non_truncated": 193, "padded": 772, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_macroeconomics|5": { "hashes": { "hash_examples": "2fb32cf2d80f0b35", "hash_full_prompts": "378ac13c8abb6c5f", "hash_input_tokens": "f580d17a3214af15", "hash_cont_tokens": "e71e7c6acf44c3e5" }, "truncated": 0, "non_truncated": 390, "padded": 1560, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_mathematics|5": { "hashes": { "hash_examples": "fd6646fdb5d58a1f", "hash_full_prompts": "14d34e0b34750627", "hash_input_tokens": "361a779f3e9723b0", "hash_cont_tokens": "0a886cdd21b224a6" }, "truncated": 0, "non_truncated": 270, "padded": 1080, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_microeconomics|5": { "hashes": { "hash_examples": "2118f21f71d87d84", "hash_full_prompts": "9ac09e5d4da991c9", "hash_input_tokens": "b5bcfd3df743cee0", "hash_cont_tokens": "a5f61d5beba13cc2" }, "truncated": 0, "non_truncated": 238, "padded": 952, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_physics|5": { "hashes": { "hash_examples": "dc3ce06378548565", "hash_full_prompts": "b4832a554d47d224", "hash_input_tokens": "4caf36cb75ba8552", "hash_cont_tokens": "c4135c191e57e8e6" }, "truncated": 0, "non_truncated": 151, "padded": 604, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_psychology|5": { "hashes": { "hash_examples": "c8d1d98a40e11f2f", "hash_full_prompts": "1e8cd27064546274", "hash_input_tokens": "9f7a7525450c0b5b", "hash_cont_tokens": "287bec936450f9c6" }, "truncated": 0, "non_truncated": 545, "padded": 2180, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_statistics|5": { "hashes": { "hash_examples": "666c8759b98ee4ff", "hash_full_prompts": "e05ab41077ec0afa", "hash_input_tokens": "dbb29057733d0628", "hash_cont_tokens": "7e446857c7d6d869" }, "truncated": 0, "non_truncated": 216, "padded": 864, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_us_history|5": { "hashes": { "hash_examples": "95fef1c4b7d3f81e", "hash_full_prompts": "a4b275996a416b4a", "hash_input_tokens": "d2c8de257e0f76fa", "hash_cont_tokens": "8b827fc7dfd3c1c5" }, "truncated": 0, "non_truncated": 204, "padded": 816, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_world_history|5": { "hashes": { "hash_examples": "7e5085b6184b0322", "hash_full_prompts": "8adf16361f0f320a", "hash_input_tokens": "c5e010d66997c529", "hash_cont_tokens": "74875ba92d6648af" }, "truncated": 0, "non_truncated": 237, "padded": 948, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:human_aging|5": { "hashes": { "hash_examples": "c17333e7c7c10797", "hash_full_prompts": "918d91a3141aac4d", "hash_input_tokens": "05e6f5df9e81a997", "hash_cont_tokens": "ca87074f1dc39668" }, "truncated": 0, "non_truncated": 223, "padded": 892, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:human_sexuality|5": { "hashes": { "hash_examples": "4edd1e9045df5e3d", "hash_full_prompts": "bcee39ecea32fcc8", "hash_input_tokens": "9604ec0f5616cd26", "hash_cont_tokens": "491a0ab53f54aeb9" }, "truncated": 0, "non_truncated": 131, "padded": 524, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:international_law|5": { "hashes": { "hash_examples": "db2fa00d771a062a", "hash_full_prompts": "ffe12a3b5bf350c2", "hash_input_tokens": "727bb86160a250d9", "hash_cont_tokens": "8c75cab59d57904d" }, "truncated": 0, "non_truncated": 121, "padded": 484, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:jurisprudence|5": { "hashes": { "hash_examples": "e956f86b124076fe", "hash_full_prompts": "b4293c3c08bebaf7", "hash_input_tokens": "013c7941768fda49", "hash_cont_tokens": "4c69d7671fa1ab1c" }, "truncated": 0, "non_truncated": 108, "padded": 432, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:logical_fallacies|5": { "hashes": { "hash_examples": "956e0e6365ab79f1", "hash_full_prompts": "8c1b7733e98cbe81", "hash_input_tokens": "8e4f39d6d98efdc5", "hash_cont_tokens": "57e78d3d09b7db81" }, "truncated": 0, "non_truncated": 163, "padded": 652, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:machine_learning|5": { "hashes": { "hash_examples": "397997cc6f4d581e", "hash_full_prompts": "24a206a1c639ab8d", "hash_input_tokens": "202eb581c240b8f3", "hash_cont_tokens": "8669a529b8d281b3" }, "truncated": 0, "non_truncated": 112, "padded": 448, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:management|5": { "hashes": { "hash_examples": "2bcbe6f6ca63d740", "hash_full_prompts": "77e1c79d988beecc", "hash_input_tokens": "5349fe24ec6c3315", "hash_cont_tokens": "79499fecb18f1cb1" }, "truncated": 0, "non_truncated": 103, "padded": 412, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:marketing|5": { "hashes": { "hash_examples": "8ddb20d964a1b065", "hash_full_prompts": "83cec2fa6b681d9d", "hash_input_tokens": "2d35adb4e63840cc", "hash_cont_tokens": "c5e9cd86b1a58fac" }, "truncated": 0, "non_truncated": 234, "padded": 936, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:medical_genetics|5": { "hashes": { "hash_examples": "182a71f4763d2cea", "hash_full_prompts": "195eb7ff99749730", "hash_input_tokens": "012f4687f48a688b", "hash_cont_tokens": "a886b3552371a98b" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:miscellaneous|5": { "hashes": { "hash_examples": "4c404fdbb4ca57fc", "hash_full_prompts": "33539955c9a96851", "hash_input_tokens": "4089d35aa35d7c39", "hash_cont_tokens": "8578b82c42cc7026" }, "truncated": 0, "non_truncated": 783, "padded": 3132, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:moral_disputes|5": { "hashes": { "hash_examples": "60cbd2baa3fea5c9", "hash_full_prompts": "009b7d0e7f819eff", "hash_input_tokens": "92852a9aaaa68ac1", "hash_cont_tokens": "26b0f808ec46464d" }, "truncated": 0, "non_truncated": 346, "padded": 1384, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:moral_scenarios|5": { "hashes": { "hash_examples": "fd8b0431fbdd75ef", "hash_full_prompts": "f6e63c9fb9d3bff0", "hash_input_tokens": "05add168b9a55fbc", "hash_cont_tokens": "24ce197370bb5b07" }, "truncated": 0, "non_truncated": 895, "padded": 3580, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:nutrition|5": { "hashes": { "hash_examples": "71e55e2b829b6528", "hash_full_prompts": "8294d5e3ad435377", "hash_input_tokens": "742231f73012b1e2", "hash_cont_tokens": "4745352f3c85c108" }, "truncated": 0, "non_truncated": 306, "padded": 1224, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:philosophy|5": { "hashes": { "hash_examples": "a6d489a8d208fa4b", "hash_full_prompts": "db68c0f4503e4793", "hash_input_tokens": "cad5ce61a647bc46", "hash_cont_tokens": "8c34ab2fa65c3b6e" }, "truncated": 0, "non_truncated": 311, "padded": 1244, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:prehistory|5": { "hashes": { "hash_examples": "6cc50f032a19acaa", "hash_full_prompts": "3972bcfa8c80e964", "hash_input_tokens": "32a29cc657790558", "hash_cont_tokens": "ab44396c679556f3" }, "truncated": 0, "non_truncated": 324, "padded": 1296, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:professional_accounting|5": { "hashes": { "hash_examples": "50f57ab32f5f6cea", "hash_full_prompts": "25f0becc2483bd32", "hash_input_tokens": "cacacb04b2a59c5a", "hash_cont_tokens": "e3eb8866fd5dce77" }, "truncated": 0, "non_truncated": 282, "padded": 1120, "non_padded": 8, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:professional_law|5": { "hashes": { "hash_examples": "a8fdc85c64f4b215", "hash_full_prompts": "7a6f6c5706f00c7d", "hash_input_tokens": "4b463ba71a1b650f", "hash_cont_tokens": "2ae4ea5b043b942a" }, "truncated": 0, "non_truncated": 1534, "padded": 6136, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:professional_medicine|5": { "hashes": { "hash_examples": "c373a28a3050a73a", "hash_full_prompts": "a74b6ac7c5c545d2", "hash_input_tokens": "b2744b569a6a32fc", "hash_cont_tokens": "fc82ad9eca8a7b98" }, "truncated": 0, "non_truncated": 272, "padded": 1088, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:professional_psychology|5": { "hashes": { "hash_examples": "bf5254fe818356af", "hash_full_prompts": "c53fa139ec25f502", "hash_input_tokens": "3775c049ee940ea3", "hash_cont_tokens": "0cc4c9bd9df094ef" }, "truncated": 0, "non_truncated": 612, "padded": 2448, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:public_relations|5": { "hashes": { "hash_examples": "b66d52e28e7d14e0", "hash_full_prompts": "55b5eff05aa6bf13", "hash_input_tokens": "be078a9672a35a48", "hash_cont_tokens": "680235f5ede0b353" }, "truncated": 0, "non_truncated": 110, "padded": 440, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:security_studies|5": { "hashes": { "hash_examples": "514c14feaf000ad9", "hash_full_prompts": "6690ecdc054f7b0c", "hash_input_tokens": "3022dd1ffded02a9", "hash_cont_tokens": "2119792a6103cc24" }, "truncated": 0, "non_truncated": 245, "padded": 980, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:sociology|5": { "hashes": { "hash_examples": "f6c9bc9d18c80870", "hash_full_prompts": "945fbdd091c72d64", "hash_input_tokens": "4762d7cdcc303fe1", "hash_cont_tokens": "2178ff937c0c1a29" }, "truncated": 0, "non_truncated": 201, "padded": 804, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:us_foreign_policy|5": { "hashes": { "hash_examples": "ed7b78629db6678f", "hash_full_prompts": "ebba6ea6eca4ae53", "hash_input_tokens": "880355a94d9fe5b1", "hash_cont_tokens": "a886b3552371a98b" }, "truncated": 0, "non_truncated": 100, "padded": 392, "non_padded": 8, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:virology|5": { "hashes": { "hash_examples": "bc52ffdc3f9b994a", "hash_full_prompts": "a2ee4984d6877fe3", "hash_input_tokens": "65c8ea545351aa14", "hash_cont_tokens": "ec5c187546c7c842" }, "truncated": 0, "non_truncated": 166, "padded": 660, "non_padded": 4, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:world_religions|5": { "hashes": { "hash_examples": "ecdb4a4f94f62930", "hash_full_prompts": "a89c8dddd1d8ced0", "hash_input_tokens": "0d36fd4bf3b571e1", "hash_cont_tokens": "65bc44ac97c3227a" }, "truncated": 0, "non_truncated": 171, "padded": 684, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|truthfulqa:mc|0": { "hashes": { "hash_examples": "36a6d90e75d92d4a", "hash_full_prompts": "8d9ca0a8bd458a1c", "hash_input_tokens": "89f619d8a8d594e0", "hash_cont_tokens": "8eaf3b80e9854172" }, "truncated": 0, "non_truncated": 817, "padded": 9996, "non_padded": 0, "effective_few_shots": 0.0, "num_truncated_few_shots": 0 }, "leaderboard|winogrande|5": { "hashes": { "hash_examples": "087d5d1a1afd4c7b", "hash_full_prompts": "35da55e47222e0e1", "hash_input_tokens": "25973bc571721c55", "hash_cont_tokens": "39be0da00f68561c" }, "truncated": 0, "non_truncated": 1267, "padded": 2534, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|gsm8k|5": { "hashes": { "hash_examples": "0ed016e24e7512fd", "hash_full_prompts": "f7ab209f6467841e", "hash_input_tokens": "650eb62258948f16", "hash_cont_tokens": "bd3608724a4cf68d" }, "truncated": 1319, "non_truncated": 0, "padded": 487, "non_padded": 832, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 } }, "summary_general": { "hashes": { "hash_examples": "670666fa3a90ce5d", "hash_full_prompts": "56c005e427046302", "hash_input_tokens": "3d48c4bd6b9d4a57", "hash_cont_tokens": "9c01009736bb767d" }, "truncated": 1319, "non_truncated": 27340, "padded": 113953, "non_padded": 919, "num_truncated_few_shots": 0 } }