|
raidhon/coven_7b_128k_orpo_alpha |
|
|
|
| Tasks |Version| Filter |n-shot| Metric | Value | |Stderr| |
|
|---------------------------------------|-------|----------------|-----:|-----------|------:|---|-----:| |
|
|winogrande | 1|none | 0|acc | 0.7782|± |0.0117| |
|
|truthfulqa |N/A |none | 0|rouge1_max |47.8575|± |0.8139| |
|
| | |none | 0|bleu_max |21.9412|± |0.7280| |
|
| | |none | 0|rouge2_max |32.7726|± |0.9228| |
|
| | |none | 0|rougeL_diff|-1.4310|± |0.7806| |
|
| | |none | 0|acc | 0.4955|± |0.0115| |
|
| | |none | 0|bleu_diff |-0.2883|± |0.6228| |
|
| | |none | 0|rouge2_acc | 0.3807|± |0.0170| |
|
| | |none | 0|rougeL_max |44.1785|± |0.8274| |
|
| | |none | 0|rougeL_acc | 0.4443|± |0.0174| |
|
| | |none | 0|rouge2_diff|-1.5603|± |0.8950| |
|
| | |none | 0|bleu_acc | 0.4321|± |0.0173| |
|
| | |none | 0|rouge1_diff|-0.7276|± |0.7721| |
|
| | |none | 0|rouge1_acc | 0.4774|± |0.0175| |
|
| - truthfulqa_gen | 3|none | 0|bleu_max |21.9412|± |0.7280| |
|
| | |none | 0|bleu_acc | 0.4321|± |0.0173| |
|
| | |none | 0|bleu_diff |-0.2883|± |0.6228| |
|
| | |none | 0|rouge1_max |47.8575|± |0.8139| |
|
| | |none | 0|rouge1_acc | 0.4774|± |0.0175| |
|
| | |none | 0|rouge1_diff|-0.7276|± |0.7721| |
|
| | |none | 0|rouge2_max |32.7726|± |0.9228| |
|
| | |none | 0|rouge2_acc | 0.3807|± |0.0170| |
|
| | |none | 0|rouge2_diff|-1.5603|± |0.8950| |
|
| | |none | 0|rougeL_max |44.1785|± |0.8274| |
|
| | |none | 0|rougeL_acc | 0.4443|± |0.0174| |
|
| | |none | 0|rougeL_diff|-1.4310|± |0.7806| |
|
| - truthfulqa_mc1 | 2|none | 0|acc | 0.4174|± |0.0173| |
|
| - truthfulqa_mc2 | 2|none | 0|acc | 0.5736|± |0.0151| |
|
|piqa | 1|none | 0|acc | 0.8205|± |0.0090| |
|
| | |none | 0|acc_norm | 0.8395|± |0.0086| |
|
|openbookqa | 1|none | 0|acc | 0.3460|± |0.0213| |
|
| | |none | 0|acc_norm | 0.4800|± |0.0224| |
|
|mmlu |N/A |none | 0|acc | 0.6300|± |0.0038| |
|
| - humanities |N/A |none | 0|acc | 0.5779|± |0.0066| |
|
| - formal_logic | 0|none | 0|acc | 0.4127|± |0.0440| |
|
| - high_school_european_history | 0|none | 0|acc | 0.8061|± |0.0309| |
|
| - high_school_us_history | 0|none | 0|acc | 0.8480|± |0.0252| |
|
| - high_school_world_history | 0|none | 0|acc | 0.8523|± |0.0231| |
|
| - international_law | 0|none | 0|acc | 0.7934|± |0.0370| |
|
| - jurisprudence | 0|none | 0|acc | 0.7685|± |0.0408| |
|
| - logical_fallacies | 0|none | 0|acc | 0.7730|± |0.0329| |
|
| - moral_disputes | 0|none | 0|acc | 0.7110|± |0.0244| |
|
| - moral_scenarios | 0|none | 0|acc | 0.2894|± |0.0152| |
|
| - philosophy | 0|none | 0|acc | 0.7106|± |0.0258| |
|
| - prehistory | 0|none | 0|acc | 0.7685|± |0.0235| |
|
| - professional_law | 0|none | 0|acc | 0.4824|± |0.0128| |
|
| - world_religions | 0|none | 0|acc | 0.8129|± |0.0299| |
|
| - other |N/A |none | 0|acc | 0.7090|± |0.0078| |
|
| - business_ethics | 0|none | 0|acc | 0.5900|± |0.0494| |
|
| - clinical_knowledge | 0|none | 0|acc | 0.7245|± |0.0275| |
|
| - college_medicine | 0|none | 0|acc | 0.6532|± |0.0363| |
|
| - global_facts | 0|none | 0|acc | 0.3200|± |0.0469| |
|
| - human_aging | 0|none | 0|acc | 0.7040|± |0.0306| |
|
| - management | 0|none | 0|acc | 0.7864|± |0.0406| |
|
| - marketing | 0|none | 0|acc | 0.8632|± |0.0225| |
|
| - medical_genetics | 0|none | 0|acc | 0.7500|± |0.0435| |
|
| - miscellaneous | 0|none | 0|acc | 0.8212|± |0.0137| |
|
| - nutrition | 0|none | 0|acc | 0.7451|± |0.0250| |
|
| - professional_accounting | 0|none | 0|acc | 0.5000|± |0.0298| |
|
| - professional_medicine | 0|none | 0|acc | 0.7059|± |0.0277| |
|
| - virology | 0|none | 0|acc | 0.5301|± |0.0389| |
|
| - social_sciences |N/A |none | 0|acc | 0.7358|± |0.0077| |
|
| - econometrics | 0|none | 0|acc | 0.4474|± |0.0468| |
|
| - high_school_geography | 0|none | 0|acc | 0.7525|± |0.0307| |
|
| - high_school_government_and_politics| 0|none | 0|acc | 0.9016|± |0.0215| |
|
| - high_school_macroeconomics | 0|none | 0|acc | 0.6564|± |0.0241| |
|
| - high_school_microeconomics | 0|none | 0|acc | 0.6807|± |0.0303| |
|
| - high_school_psychology | 0|none | 0|acc | 0.8404|± |0.0157| |
|
| - human_sexuality | 0|none | 0|acc | 0.7405|± |0.0384| |
|
| - professional_psychology | 0|none | 0|acc | 0.6552|± |0.0192| |
|
| - public_relations | 0|none | 0|acc | 0.6727|± |0.0449| |
|
| - security_studies | 0|none | 0|acc | 0.7673|± |0.0270| |
|
| - sociology | 0|none | 0|acc | 0.8358|± |0.0262| |
|
| - us_foreign_policy | 0|none | 0|acc | 0.8600|± |0.0349| |
|
| - stem |N/A |none | 0|acc | 0.5265|± |0.0085| |
|
| - abstract_algebra | 0|none | 0|acc | 0.3100|± |0.0465| |
|
| - anatomy | 0|none | 0|acc | 0.6000|± |0.0423| |
|
| - astronomy | 0|none | 0|acc | 0.6842|± |0.0378| |
|
| - college_biology | 0|none | 0|acc | 0.7292|± |0.0372| |
|
| - college_chemistry | 0|none | 0|acc | 0.4700|± |0.0502| |
|
| - college_computer_science | 0|none | 0|acc | 0.5600|± |0.0499| |
|
| - college_mathematics | 0|none | 0|acc | 0.3500|± |0.0479| |
|
| - college_physics | 0|none | 0|acc | 0.3529|± |0.0476| |
|
| - computer_security | 0|none | 0|acc | 0.7100|± |0.0456| |
|
| - conceptual_physics | 0|none | 0|acc | 0.5574|± |0.0325| |
|
| - electrical_engineering | 0|none | 0|acc | 0.5793|± |0.0411| |
|
| - elementary_mathematics | 0|none | 0|acc | 0.4101|± |0.0253| |
|
| - high_school_biology | 0|none | 0|acc | 0.7903|± |0.0232| |
|
| - high_school_chemistry | 0|none | 0|acc | 0.4828|± |0.0352| |
|
| - high_school_computer_science | 0|none | 0|acc | 0.6600|± |0.0476| |
|
| - high_school_mathematics | 0|none | 0|acc | 0.3444|± |0.0290| |
|
| - high_school_physics | 0|none | 0|acc | 0.3642|± |0.0393| |
|
| - high_school_statistics | 0|none | 0|acc | 0.5000|± |0.0341| |
|
| - machine_learning | 0|none | 0|acc | 0.5268|± |0.0474| |
|
|hellaswag | 1|none | 0|acc | 0.6537|± |0.0047| |
|
| | |none | 0|acc_norm | 0.8429|± |0.0036| |
|
|gsm8k | 3|strict-match | 5|exact_match| 0.7218|± |0.0123| |
|
| | |flexible-extract| 5|exact_match| 0.7263|± |0.0123| |
|
|boolq | 2|none | 0|acc | 0.8743|± |0.0058| |
|
|arc_easy | 1|none | 0|acc | 0.8502|± |0.0073| |
|
| | |none | 0|acc_norm | 0.8295|± |0.0077| |
|
|arc_challenge | 1|none | 0|acc | 0.5964|± |0.0143| |
|
| | |none | 0|acc_norm | 0.6169|± |0.0142| |
|
|
|
| Groups |Version|Filter|n-shot| Metric | Value | |Stderr| |
|
|------------------|-------|------|-----:|-----------|------:|---|-----:| |
|
|truthfulqa |N/A |none | 0|rouge1_max |47.8575|± |0.8139| |
|
| | |none | 0|bleu_max |21.9412|± |0.7280| |
|
| | |none | 0|rouge2_max |32.7726|± |0.9228| |
|
| | |none | 0|rougeL_diff|-1.4310|± |0.7806| |
|
| | |none | 0|acc | 0.4955|± |0.0115| |
|
| | |none | 0|bleu_diff |-0.2883|± |0.6228| |
|
| | |none | 0|rouge2_acc | 0.3807|± |0.0170| |
|
| | |none | 0|rougeL_max |44.1785|± |0.8274| |
|
| | |none | 0|rougeL_acc | 0.4443|± |0.0174| |
|
| | |none | 0|rouge2_diff|-1.5603|± |0.8950| |
|
| | |none | 0|bleu_acc | 0.4321|± |0.0173| |
|
| | |none | 0|rouge1_diff|-0.7276|± |0.7721| |
|
| | |none | 0|rouge1_acc | 0.4774|± |0.0175| |
|
|mmlu |N/A |none | 0|acc | 0.6300|± |0.0038| |
|
| - humanities |N/A |none | 0|acc | 0.5779|± |0.0066| |
|
| - other |N/A |none | 0|acc | 0.7090|± |0.0078| |
|
| - social_sciences|N/A |none | 0|acc | 0.7358|± |0.0077| |
|
| - stem |N/A |none | 0|acc | 0.5265|± |0.0085| |
|
|