zhuohan-7 commited on
Commit
eb7faaf
·
verified ·
1 Parent(s): 0523925

Upload folder using huggingface_hub

Browse files
Files changed (39) hide show
  1. results/cross_lingual/zero_shot/cross_logiqa.csv +3 -2
  2. results/cross_lingual/zero_shot/cross_logiqa_no_prompt.csv +8 -5
  3. results/cross_lingual/zero_shot/cross_mmlu.csv +3 -2
  4. results/cross_lingual/zero_shot/cross_mmlu_no_prompt.csv +7 -2
  5. results/cross_lingual/zero_shot/cross_xquad.csv +2 -2
  6. results/cross_lingual/zero_shot/cross_xquad_no_prompt.csv +7 -2
  7. results/cultural_reasoning/zero_shot/cn_eval.csv +3 -2
  8. results/cultural_reasoning/zero_shot/ph_eval.csv +3 -2
  9. results/cultural_reasoning/zero_shot/sg_eval.csv +3 -2
  10. results/cultural_reasoning/zero_shot/sg_eval_v1_cleaned.csv +3 -2
  11. results/cultural_reasoning/zero_shot/sg_eval_v2_mcq.csv +4 -3
  12. results/cultural_reasoning/zero_shot/sg_eval_v2_mcq_no_prompt.csv +7 -2
  13. results/cultural_reasoning/zero_shot/sg_eval_v2_open.csv +3 -2
  14. results/cultural_reasoning/zero_shot/us_eval.csv +2 -1
  15. results/dialogue/zero_shot/dialogsum.csv +3 -2
  16. results/dialogue/zero_shot/dream.csv +3 -2
  17. results/dialogue/zero_shot/samsum.csv +3 -2
  18. results/emotion/zero_shot/ind_emotion.csv +3 -2
  19. results/emotion/zero_shot/sst2.csv +3 -2
  20. results/flores_translation/zero_shot/ind2eng.csv +3 -2
  21. results/flores_translation/zero_shot/vie2eng.csv +3 -2
  22. results/flores_translation/zero_shot/zho2eng.csv +3 -2
  23. results/flores_translation/zero_shot/zsm2eng.csv +3 -2
  24. results/fundamental_nlp_tasks/zero_shot/c3.csv +2 -2
  25. results/fundamental_nlp_tasks/zero_shot/cola.csv +3 -2
  26. results/fundamental_nlp_tasks/zero_shot/mnli.csv +3 -2
  27. results/fundamental_nlp_tasks/zero_shot/mrpc.csv +3 -2
  28. results/fundamental_nlp_tasks/zero_shot/ocnli.csv +4 -2
  29. results/fundamental_nlp_tasks/zero_shot/qnli.csv +4 -2
  30. results/fundamental_nlp_tasks/zero_shot/qqp.csv +3 -2
  31. results/fundamental_nlp_tasks/zero_shot/rte.csv +3 -2
  32. results/fundamental_nlp_tasks/zero_shot/wnli.csv +3 -2
  33. results/general_reasoning/zero_shot/c_eval.csv +3 -2
  34. results/general_reasoning/zero_shot/cmmlu.csv +4 -2
  35. results/general_reasoning/zero_shot/indommlu.csv +2 -2
  36. results/general_reasoning/zero_shot/indommlu_no_prompt.csv +7 -2
  37. results/general_reasoning/zero_shot/mmlu.csv +4 -2
  38. results/general_reasoning/zero_shot/mmlu_no_prompt.csv +7 -2
  39. results/general_reasoning/zero_shot/zbench.csv +3 -2
results/cross_lingual/zero_shot/cross_logiqa.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino
2
  Qwen2-7B-Instruct,0.564935064935065,0.48279220779220783,0.5206435955861558,0.6590909090909091,0.7045454545454546,0.5340909090909091,0.5738636363636364,0.5397727272727273,0.5113636363636364,0.4318181818181818
3
- Meta-Llama-3.1-8B-Instruct,0.4472402597402597,0.43717532467532455,0.44215052105151864,0.5227272727272727,0.4602272727272727,0.4715909090909091,0.4715909090909091,0.4147727272727273,0.3977272727272727,0.39204545454545453
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.49918831168831174,0.45974025974025984,0.4786528859163277,0.5454545454545454,0.5340909090909091,0.5340909090909091,0.5340909090909091,0.5,0.45454545454545453,0.39204545454545453
5
  Qwen2_5_32B_Instruct,0.6931818181818182,0.6397727272727273,0.6654072695772988,0.7727272727272727,0.7897727272727273,0.6704545454545454,0.6761363636363636,0.6875,0.6875,0.5681818181818182
6
  Qwen2_5_7B_Instruct,0.599025974025974,0.5034090909090908,0.5470709896292291,0.7102272727272727,0.7215909090909091,0.6136363636363636,0.6022727272727273,0.5738636363636364,0.5511363636363636,0.42045454545454547
7
  Qwen2_5_1_5B_Instruct,0.46834415584415584,0.348538961038961,0.3996561615557665,0.5511363636363636,0.5909090909090909,0.4659090909090909,0.5113636363636364,0.4375,0.375,0.3465909090909091
8
  Qwen2-72B-Instruct,0.6728896103896104,0.6762987012987012,0.6745898487968579,0.75,0.8068181818181818,0.6534090909090909,0.6193181818181818,0.625,0.6534090909090909,0.6022727272727273
 
9
  Meta-Llama-3-8B-Instruct,0.4610389610389611,0.45097402597402597,0.4559509553669637,0.5965909090909091,0.48295454545454547,0.5,0.4602272727272727,0.42045454545454547,0.4034090909090909,0.36363636363636365
 
10
  Meta-Llama-3.1-70B-Instruct,0.6566558441558442,0.598051948051948,0.6259852839118454,0.7443181818181818,0.7215909090909091,0.6647727272727273,0.6534090909090909,0.6193181818181818,0.625,0.5681818181818182
11
  Qwen2_5_3B_Instruct,0.4878246753246754,0.3594155844155844,0.41388918606681485,0.6079545454545454,0.6420454545454546,0.45454545454545453,0.4602272727272727,0.48295454545454547,0.42045454545454547,0.3465909090909091
12
  SeaLLMs-v3-7B-Chat,0.5551948051948051,0.5142857142857142,0.5339578453833284,0.6022727272727273,0.6647727272727273,0.5738636363636364,0.5454545454545454,0.5170454545454546,0.5,0.48295454545454547
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.6436688311688312,0.5938311688311688,0.6177464473895627,0.
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.6055194805194805,0.6220779220779219,0.6136870270197391,0.6590909090909091,0.6590909090909091,0.5625,0.6193181818181818,0.5681818181818182,0.5852272727272727,0.5852272727272727
18
  gemma-2-2b-it,0.4780844155844156,0.4448051948051948,0.46084478401384643,0.5568181818181818,0.5,0.5,0.48863636363636365,0.4375,0.4602272727272727,0.4034090909090909
19
  llama3-8b-cpt-sea-lionv2-instruct,0.48538961038961037,0.4472402597402597,0.46553468284769084,0.5284090909090909,0.5113636363636364,0.5227272727272727,0.5227272727272727,0.48863636363636365,0.44886363636363635,0.375
20
- cross_openhermes_llama3_8b_12288_inst,0.48701298701298695,0.46964285714285725,0.4781702261263516,0.5511363636363636,0.5227272727272727,0.4715909090909091,0.4772727272727273,0.4943181818181818,0.5056818181818182,0.38636363636363635
21
  Qwen2_5_0_5B_Instruct,0.3538961038961039,0.1978896103896103,0.25383898238962527,0.45454545454545453,0.39204545454545453,0.3465909090909091,0.375,0.3409090909090909,0.30113636363636365,0.26704545454545453
22
  GPT4o_0513,0.7159090909090909,0.6941558441558444,0.7048646724637749,0.7613636363636364,0.7670454545454546,0.6988636363636364,0.6988636363636364,0.7045454545454546,0.6761363636363636,0.7045454545454546
 
1
  Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino
2
  Qwen2-7B-Instruct,0.564935064935065,0.48279220779220783,0.5206435955861558,0.6590909090909091,0.7045454545454546,0.5340909090909091,0.5738636363636364,0.5397727272727273,0.5113636363636364,0.4318181818181818
3
+ Meta-Llama-3.1-8B-Instruct,0.4862012987012987,0.45876623376623393,0.47208550763939805,0.5909090909090909,0.4715909090909091,0.48863636363636365,0.5340909090909091,0.45454545454545453,0.44886363636363635,0.4147727272727273
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.49918831168831174,0.45974025974025984,0.4786528859163277,0.5454545454545454,0.5340909090909091,0.5340909090909091,0.5340909090909091,0.5,0.45454545454545453,0.39204545454545453
5
  Qwen2_5_32B_Instruct,0.6931818181818182,0.6397727272727273,0.6654072695772988,0.7727272727272727,0.7897727272727273,0.6704545454545454,0.6761363636363636,0.6875,0.6875,0.5681818181818182
6
  Qwen2_5_7B_Instruct,0.599025974025974,0.5034090909090908,0.5470709896292291,0.7102272727272727,0.7215909090909091,0.6136363636363636,0.6022727272727273,0.5738636363636364,0.5511363636363636,0.42045454545454547
7
  Qwen2_5_1_5B_Instruct,0.46834415584415584,0.348538961038961,0.3996561615557665,0.5511363636363636,0.5909090909090909,0.4659090909090909,0.5113636363636364,0.4375,0.375,0.3465909090909091
8
  Qwen2-72B-Instruct,0.6728896103896104,0.6762987012987012,0.6745898487968579,0.75,0.8068181818181818,0.6534090909090909,0.6193181818181818,0.625,0.6534090909090909,0.6022727272727273
9
+ Sailor2-8B-Chat,0.5405844155844156,0.5628246753246753,0.551480408610067,0.625,0.5852272727272727,0.4943181818181818,0.5568181818181818,0.5056818181818182,0.5568181818181818,0.4602272727272727
10
  Meta-Llama-3-8B-Instruct,0.4610389610389611,0.45097402597402597,0.4559509553669637,0.5965909090909091,0.48295454545454547,0.5,0.4602272727272727,0.42045454545454547,0.4034090909090909,0.36363636363636365
11
+ merged_llama3_8b_sg_inst_avg_diff,0.4829545454545454,0.4952922077922078,0.48904557518459746,0.5397727272727273,0.5340909090909091,0.4772727272727273,0.5056818181818182,0.4602272727272727,0.45454545454545453,0.4090909090909091
12
  Meta-Llama-3.1-70B-Instruct,0.6566558441558442,0.598051948051948,0.6259852839118454,0.7443181818181818,0.7215909090909091,0.6647727272727273,0.6534090909090909,0.6193181818181818,0.625,0.5681818181818182
13
  Qwen2_5_3B_Instruct,0.4878246753246754,0.3594155844155844,0.41388918606681485,0.6079545454545454,0.6420454545454546,0.45454545454545453,0.4602272727272727,0.48295454545454547,0.42045454545454547,0.3465909090909091
14
  SeaLLMs-v3-7B-Chat,0.5551948051948051,0.5142857142857142,0.5339578453833284,0.6022727272727273,0.6647727272727273,0.5738636363636364,0.5454545454545454,0.5170454545454546,0.5,0.48295454545454547
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.6055194805194805,0.6220779220779219,0.6136870270197391,0.6590909090909091,0.6590909090909091,0.5625,0.6193181818181818,0.5681818181818182,0.5852272727272727,0.5852272727272727
20
  gemma-2-2b-it,0.4780844155844156,0.4448051948051948,0.46084478401384643,0.5568181818181818,0.5,0.5,0.48863636363636365,0.4375,0.4602272727272727,0.4034090909090909
21
  llama3-8b-cpt-sea-lionv2-instruct,0.48538961038961037,0.4472402597402597,0.46553468284769084,0.5284090909090909,0.5113636363636364,0.5227272727272727,0.5227272727272727,0.48863636363636365,0.44886363636363635,0.375
 
22
  Qwen2_5_0_5B_Instruct,0.3538961038961039,0.1978896103896103,0.25383898238962527,0.45454545454545453,0.39204545454545453,0.3465909090909091,0.375,0.3409090909090909,0.30113636363636365,0.26704545454545453
23
  GPT4o_0513,0.7159090909090909,0.6941558441558444,0.7048646724637749,0.7613636363636364,0.7670454545454546,0.6988636363636364,0.6988636363636364,0.7045454545454546,0.6761363636363636,0.7045454545454546
results/cross_lingual/zero_shot/cross_logiqa_no_prompt.csv CHANGED
@@ -1,6 +1,9 @@
1
  Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino
2
- llama3-8b-cpt-sea-lionv2.1-instruct,0.4375,0.2728896103896104,0.3361231717991198,0.4318181818181818,0.4772727272727273,0.4943181818181818,0.4034090909090909,0.4318181818181818,0.42613636363636365,0.3977272727272727
3
- Meta-Llama-3-8B-Instruct,0.41558441558441567,0.24577922077922076,0.3088830658319437,0.45454545454545453,0.4772727272727273,0.42045454545454547,0.3522727272727273,0.3977272727272727,0.42613636363636365,0.3806818181818182
4
- gemma2-9b-cpt-sea-lionv3-instruct,0.48214285714285715,0.3753246753246754,0.4220803807589114,0.5454545454545454,0.5227272727272727,0.4943181818181818,0.4431818181818182,0.45454545454545453,0.4318181818181818,0.48295454545454547
5
- cross_openhermes_llama3_8b_12288_inst,0.3896103896103896,0.23279220779220777,0.2914456435114937,0.4659090909090909,0.3806818181818182,0.4147727272727273,0.30113636363636365,0.3693181818181818,0.42613636363636365,0.3693181818181818
6
- GPT4o_0513,0.575487012987013,0.4172077922077923,0.48372906728622567,0.6534090909090909,0.6079545454545454,0.6022727272727273,0.4659090909090909,0.5511363636363636,0.5965909090909091,0.5511363636363636
 
 
 
 
1
  Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino
2
+ Meta-Llama-3.1-8B-Instruct,0.512987012987013,0.4394480519480519,0.4733785048611023,0.5852272727272727,0.5852272727272727,0.5454545454545454,0.5,0.45454545454545453,0.5227272727272727,0.3977272727272727
3
+ llama3-8b-cpt-sea-lionv2.1-instruct,0.45779220779220786,0.3751623376623376,0.412378792469608,0.5284090909090909,0.5170454545454546,0.5340909090909091,0.4602272727272727,0.4034090909090909,0.4431818181818182,0.3181818181818182
4
+ Qwen2_5_7B_Instruct,0.6047077922077922,0.47938311688311697,0.5348014705675028,0.6931818181818182,0.7102272727272727,0.6420454545454546,0.5795454545454546,0.6306818181818182,0.5340909090909091,0.4431818181818182
5
+ Sailor2-8B-Chat,0.5503246753246753,0.5363636363636365,0.5432544747850031,0.6136363636363636,0.625,0.5056818181818182,0.5625,0.5113636363636364,0.5511363636363636,0.48295454545454547
6
+ merged_llama3_8b_sg_inst_avg_diff,0.5105519480519481,0.4558441558441559,0.48164954476113636,0.5909090909090909,0.5284090909090909,0.5454545454545454,0.5,0.4943181818181818,0.48863636363636365,0.42613636363636365
7
+ SeaLLMs-v3-7B-Chat,0.5324675324675324,0.41266233766233773,0.46497164802104307,0.5681818181818182,0.5852272727272727,0.5738636363636364,0.5568181818181818,0.4943181818181818,0.5170454545454546,0.4318181818181818
8
+ gemma-2-9b-it,0.6006493506493508,0.5753246753246755,0.587714328691409,0.6590909090909091,0.6363636363636364,0.5511363636363636,0.6022727272727273,0.5852272727272727,0.6022727272727273,0.5681818181818182
9
+ gemma2-9b-cpt-sea-lionv3-instruct,0.5844155844155844,0.605844155844156,0.5949369778657175,0.6363636363636364,0.6420454545454546,0.5625,0.5681818181818182,0.5568181818181818,0.5511363636363636,0.5738636363636364
results/cross_lingual/zero_shot/cross_mmlu.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino
2
  Qwen2-7B-Instruct,0.6628571428571428,0.5257142857142858,0.5863736263242921,0.76,0.6666666666666666,0.72,0.5933333333333334,0.7066666666666667,0.6133333333333333,0.58
3
- Meta-Llama-3.1-8B-Instruct,0.5619047619047618,0.5020952380952383,0.5303189947159841,0.66,0.5266666666666666,0.5733333333333334,0.5266666666666666,0.5533333333333333,0.5533333333333333,0.54
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.6114285714285713,0.5478095238095239,0.5778733392299966,0.72,0.6,0.6066666666666667,0.62,0.6466666666666666,0.56,0.5266666666666666
5
  Qwen2_5_32B_Instruct,0.8019047619047619,0.7386666666666668,0.7689878008073214,0.8533333333333334,0.8533333333333334,0.82,0.7933333333333333,0.8,0.7866666666666666,0.7066666666666667
6
  Qwen2_5_7B_Instruct,0.6733333333333332,0.580952380952381,0.6237408250578389,0.7666666666666667,0.7066666666666667,0.72,0.6666666666666666,0.6866666666666666,0.6266666666666667,0.54
7
  Qwen2_5_1_5B_Instruct,0.5076190476190475,0.3721904761904762,0.42948154099799957,0.6,0.6066666666666667,0.5333333333333333,0.4866666666666667,0.5666666666666667,0.4,0.36
8
  Qwen2-72B-Instruct,0.779047619047619,0.7611428571428573,0.7699911663398871,0.8133333333333334,0.7933333333333333,0.7933333333333333,0.7333333333333333,0.7666666666666667,0.78,0.7733333333333333
 
9
  Meta-Llama-3-8B-Instruct,0.5733333333333334,0.4742857142857144,0.5191272726777197,0.7133333333333334,0.5866666666666667,0.5733333333333334,0.5866666666666667,0.5066666666666667,0.5333333333333333,0.5133333333333333
 
10
  Meta-Llama-3.1-70B-Instruct,0.7638095238095238,0.7716190476190474,0.7676944251955988,0.8,0.74,0.7666666666666667,0.7666666666666667,0.76,0.7666666666666667,0.7466666666666667
11
  Qwen2_5_3B_Instruct,0.5857142857142856,0.48952380952380964,0.533316462053399,0.6933333333333334,0.6666666666666666,0.64,0.5266666666666666,0.6333333333333333,0.5466666666666666,0.3933333333333333
12
  SeaLLMs-v3-7B-Chat,0.6628571428571429,0.6135238095238095,0.6372370860992635,0.74,0.6933333333333334,0.6933333333333334,0.6466666666666666,0.68,0.6,0.5866666666666667
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.7266666666666666,0.680952380952381,0.7030672078887086,0.7
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.7180952380952382,0.7485714285714284,0.7330166975381478,0.78,0.7133333333333334,0.7133333333333334,0.6866666666666666,0.7266666666666667,0.7,0.7066666666666667
18
  gemma-2-2b-it,0.5780952380952381,0.5480000000000002,0.5626454667971265,0.7,0.5866666666666667,0.5866666666666667,0.5333333333333333,0.5666666666666667,0.5333333333333333,0.54
19
  llama3-8b-cpt-sea-lionv2-instruct,0.6104761904761905,0.5685714285714286,0.5887791368067445,0.72,0.6,0.6133333333333333,0.58,0.6333333333333333,0.5933333333333334,0.5333333333333333
20
- cross_openhermes_llama3_8b_12288_inst,0.5733333333333333,0.5337142857142858,0.5528148657449711,0.6666666666666666,0.6,0.5466666666666666,0.5333333333333333,0.5666666666666667,0.5933333333333334,0.5066666666666667
21
  Qwen2_5_0_5B_Instruct,0.4228571428571429,0.2436190476190476,0.3091364879297727,0.6133333333333333,0.5,0.4266666666666667,0.4066666666666667,0.3933333333333333,0.3333333333333333,0.2866666666666667
22
  GPT4o_0513,0.8038095238095239,0.8506666666666668,0.8265745643832277,0.8266666666666667,0.7933333333333333,0.8,0.7666666666666667,0.7933333333333333,0.8266666666666667,0.82
 
1
  Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino
2
  Qwen2-7B-Instruct,0.6628571428571428,0.5257142857142858,0.5863736263242921,0.76,0.6666666666666666,0.72,0.5933333333333334,0.7066666666666667,0.6133333333333333,0.58
3
+ Meta-Llama-3.1-8B-Instruct,0.62,0.5765714285714285,0.5974976121754717,0.72,0.6133333333333333,0.6266666666666667,0.6066666666666667,0.5866666666666667,0.58,0.6066666666666667
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.6114285714285713,0.5478095238095239,0.5778733392299966,0.72,0.6,0.6066666666666667,0.62,0.6466666666666666,0.56,0.5266666666666666
5
  Qwen2_5_32B_Instruct,0.8019047619047619,0.7386666666666668,0.7689878008073214,0.8533333333333334,0.8533333333333334,0.82,0.7933333333333333,0.8,0.7866666666666666,0.7066666666666667
6
  Qwen2_5_7B_Instruct,0.6733333333333332,0.580952380952381,0.6237408250578389,0.7666666666666667,0.7066666666666667,0.72,0.6666666666666666,0.6866666666666666,0.6266666666666667,0.54
7
  Qwen2_5_1_5B_Instruct,0.5076190476190475,0.3721904761904762,0.42948154099799957,0.6,0.6066666666666667,0.5333333333333333,0.4866666666666667,0.5666666666666667,0.4,0.36
8
  Qwen2-72B-Instruct,0.779047619047619,0.7611428571428573,0.7699911663398871,0.8133333333333334,0.7933333333333333,0.7933333333333333,0.7333333333333333,0.7666666666666667,0.78,0.7733333333333333
9
+ Sailor2-8B-Chat,0.6542857142857142,0.6586666666666667,0.6564688814239598,0.7133333333333334,0.6733333333333333,0.6533333333333333,0.6066666666666667,0.62,0.6466666666666666,0.6666666666666666
10
  Meta-Llama-3-8B-Instruct,0.5733333333333334,0.4742857142857144,0.5191272726777197,0.7133333333333334,0.5866666666666667,0.5733333333333334,0.5866666666666667,0.5066666666666667,0.5333333333333333,0.5133333333333333
11
+ merged_llama3_8b_sg_inst_avg_diff,0.5980952380952381,0.5817142857142859,0.5897910419722433,0.76,0.5866666666666667,0.6266666666666667,0.5466666666666666,0.5666666666666667,0.5533333333333333,0.5466666666666666
12
  Meta-Llama-3.1-70B-Instruct,0.7638095238095238,0.7716190476190474,0.7676944251955988,0.8,0.74,0.7666666666666667,0.7666666666666667,0.76,0.7666666666666667,0.7466666666666667
13
  Qwen2_5_3B_Instruct,0.5857142857142856,0.48952380952380964,0.533316462053399,0.6933333333333334,0.6666666666666666,0.64,0.5266666666666666,0.6333333333333333,0.5466666666666666,0.3933333333333333
14
  SeaLLMs-v3-7B-Chat,0.6628571428571429,0.6135238095238095,0.6372370860992635,0.74,0.6933333333333334,0.6933333333333334,0.6466666666666666,0.68,0.6,0.5866666666666667
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.7180952380952382,0.7485714285714284,0.7330166975381478,0.78,0.7133333333333334,0.7133333333333334,0.6866666666666666,0.7266666666666667,0.7,0.7066666666666667
20
  gemma-2-2b-it,0.5780952380952381,0.5480000000000002,0.5626454667971265,0.7,0.5866666666666667,0.5866666666666667,0.5333333333333333,0.5666666666666667,0.5333333333333333,0.54
21
  llama3-8b-cpt-sea-lionv2-instruct,0.6104761904761905,0.5685714285714286,0.5887791368067445,0.72,0.6,0.6133333333333333,0.58,0.6333333333333333,0.5933333333333334,0.5333333333333333
 
22
  Qwen2_5_0_5B_Instruct,0.4228571428571429,0.2436190476190476,0.3091364879297727,0.6133333333333333,0.5,0.4266666666666667,0.4066666666666667,0.3933333333333333,0.3333333333333333,0.2866666666666667
23
  GPT4o_0513,0.8038095238095239,0.8506666666666668,0.8265745643832277,0.8266666666666667,0.7933333333333333,0.8,0.7666666666666667,0.7933333333333333,0.8266666666666667,0.82
results/cross_lingual/zero_shot/cross_mmlu_no_prompt.csv CHANGED
@@ -1,6 +1,11 @@
1
  Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino
2
- llama3-8b-cpt-sea-lionv2.1-instruct,0.6638095238095237,0.5363809523809523,0.5933304614797237,0.78,0.62,0.6933333333333334,0.64,0.66,0.6466666666666666,0.6066666666666667
 
 
 
3
  Meta-Llama-3-8B-Instruct,0.6428571428571429,0.49542857142857133,0.5595955249078094,0.7666666666666667,0.6533333333333333,0.7,0.6466666666666666,0.5733333333333334,0.5733333333333334,0.5866666666666667
 
 
 
4
  gemma2-9b-cpt-sea-lionv3-instruct,0.7809523809523808,0.7506666666666667,0.7655100940510849,0.8466666666666667,0.7866666666666666,0.7733333333333333,0.78,0.7933333333333333,0.7333333333333333,0.7533333333333333
5
- cross_openhermes_llama3_8b_12288_inst,0.6066666666666667,0.4874285714285715,0.54055013922636,0.7266666666666667,0.6,0.6,0.5866666666666667,0.58,0.5733333333333334,0.58
6
  GPT4o_0513,0.8819047619047619,0.8609523809523807,0.8713026281050943,0.9266666666666666,0.8866666666666667,0.9066666666666666,0.7933333333333333,0.88,0.9066666666666666,0.8733333333333333
 
1
  Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino
2
+ Meta-Llama-3.1-8B-Instruct,0.6876190476190477,0.5615238095238096,0.6182070607559236,0.82,0.6333333333333333,0.72,0.6666666666666666,0.66,0.6466666666666666,0.6666666666666666
3
+ llama3-8b-cpt-sea-lionv2.1-instruct,0.6676190476190476,0.5590476190476189,0.6085285418019147,0.7533333333333333,0.6666666666666666,0.68,0.6333333333333333,0.6933333333333334,0.64,0.6066666666666667
4
+ Qwen2_5_7B_Instruct,0.7742857142857141,0.6222857142857142,0.6900140284752591,0.8466666666666667,0.84,0.8266666666666667,0.74,0.7533333333333333,0.7133333333333334,0.7
5
+ Sailor2-8B-Chat,0.6923809523809524,0.6592380952380954,0.6754031781322388,0.7266666666666667,0.7066666666666667,0.7133333333333334,0.6733333333333333,0.6733333333333333,0.6466666666666666,0.7066666666666667
6
  Meta-Llama-3-8B-Instruct,0.6428571428571429,0.49542857142857133,0.5595955249078094,0.7666666666666667,0.6533333333333333,0.7,0.6466666666666666,0.5733333333333334,0.5733333333333334,0.5866666666666667
7
+ merged_llama3_8b_sg_inst_avg_diff,0.6980952380952381,0.5891428571428572,0.6390081595918414,0.8466666666666667,0.6933333333333334,0.6933333333333334,0.6933333333333334,0.7133333333333334,0.6133333333333333,0.6333333333333333
8
+ SeaLLMs-v3-7B-Chat,0.7342857142857142,0.5765714285714287,0.6459409639562039,0.8333333333333334,0.7266666666666667,0.7866666666666666,0.7133333333333334,0.74,0.6866666666666666,0.6533333333333333
9
+ gemma-2-9b-it,0.781904761904762,0.747047619047619,0.7640788528690432,0.84,0.7933333333333333,0.7866666666666666,0.7466666666666667,0.78,0.7466666666666667,0.78
10
  gemma2-9b-cpt-sea-lionv3-instruct,0.7809523809523808,0.7506666666666667,0.7655100940510849,0.8466666666666667,0.7866666666666666,0.7733333333333333,0.78,0.7933333333333333,0.7333333333333333,0.7533333333333333
 
11
  GPT4o_0513,0.8819047619047619,0.8609523809523807,0.8713026281050943,0.9266666666666666,0.8866666666666667,0.9066666666666666,0.7933333333333333,0.88,0.9066666666666666,0.8733333333333333
results/cross_lingual/zero_shot/cross_xquad.csv CHANGED
@@ -1,11 +1,12 @@
1
  Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino
2
  Qwen2-7B-Instruct,0.9418067226890756,0.9046218487394958,0.9228398561109394,0.957983193277311,0.9336134453781513,0.9436974789915966,0.9319327731092437,,,
3
- Meta-Llama-3.1-8B-Instruct,0.9287815126050419,0.8867647058823529,0.9072869161050563,0.9420168067226891,0.9193277310924369,0.9361344537815126,0.9176470588235294,,,
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.9361344537815126,0.9067226890756303,0.9211938673738631,0.9403361344537815,0.9260504201680673,0.9420168067226891,0.9361344537815126,,,
5
  Qwen2_5_7B_Instruct,0.9460084033613445,0.9178571428571428,0.9317201790045005,0.9554621848739496,0.9487394957983193,0.9445378151260504,0.9352941176470588,,,
6
  Qwen2_5_1_5B_Instruct,0.8939075630252101,0.8308823529411764,0.8612434620121144,0.9100840336134454,0.9,0.8957983193277311,0.8697478991596639,,,
7
  Qwen2-72B-Instruct,0.9613445378151261,0.9516806722689075,0.956488195931227,0.9638655462184874,0.9596638655462185,0.9596638655462185,0.9621848739495799,,,
8
  Meta-Llama-3-8B-Instruct,0.9210084033613445,0.880672268907563,0.9003888121913395,0.9411764705882353,0.9033613445378151,0.9260504201680673,0.9134453781512605,,,
 
9
  Meta-Llama-3.1-70B-Instruct,0.9615546218487395,0.9512605042016806,0.9563798632627071,0.9647058823529412,0.9512605042016806,0.9647058823529412,0.965546218487395,,,
10
  Qwen2_5_3B_Instruct,0.9378151260504202,0.8924369747899159,0.9145635113049859,0.9504201680672268,0.9327731092436975,0.9378151260504202,0.9302521008403362,,,
11
  SeaLLMs-v3-7B-Chat,0.9403361344537815,0.917016806722689,0.9285300818164836,0.9537815126050421,0.9378151260504202,0.9394957983193277,0.9302521008403362,,,
@@ -16,6 +17,5 @@ Qwen2_5_14B_Instruct,0.9581932773109244,0.9474789915966386,0.9528060148705768,0.
16
  gemma2-9b-cpt-sea-lionv3-instruct,0.9573529411764706,0.9365546218487395,0.9468395810403457,0.9638655462184874,0.9428571428571428,0.9605042016806723,0.9621848739495799,,,
17
  gemma-2-2b-it,0.917016806722689,0.8665966386554622,0.8910940700869288,0.934453781512605,0.9025210084033614,0.9193277310924369,0.9117647058823529,,,
18
  llama3-8b-cpt-sea-lionv2-instruct,0.9365546218487395,0.9086134453781513,0.9223724784871395,0.9420168067226891,0.926890756302521,0.9436974789915966,0.9336134453781513,,,
19
- cross_openhermes_llama3_8b_12288_inst,0.9273109243697479,0.8850840336134453,0.9057055579353634,0.9394957983193277,0.9252100840336135,0.9218487394957983,0.9226890756302522,,,
20
  Qwen2_5_0_5B_Instruct,0.6584033613445378,0.48172268907563026,0.5563732844778362,0.692436974789916,0.673109243697479,0.653781512605042,0.6142857142857143,,,
21
  GPT4o_0513,0.9605042016806723,0.951890756302521,0.9561780814209724,0.965546218487395,0.9537815126050421,0.9630252100840336,0.9596638655462185,,,
 
1
  Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino
2
  Qwen2-7B-Instruct,0.9418067226890756,0.9046218487394958,0.9228398561109394,0.957983193277311,0.9336134453781513,0.9436974789915966,0.9319327731092437,,,
3
+ Meta-Llama-3.1-8B-Instruct,0.9380252100840336,0.9021008403361345,0.919712353438052,0.9453781512605042,0.926890756302521,0.9411764705882353,0.938655462184874,,,
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.9361344537815126,0.9067226890756303,0.9211938673738631,0.9403361344537815,0.9260504201680673,0.9420168067226891,0.9361344537815126,,,
5
  Qwen2_5_7B_Instruct,0.9460084033613445,0.9178571428571428,0.9317201790045005,0.9554621848739496,0.9487394957983193,0.9445378151260504,0.9352941176470588,,,
6
  Qwen2_5_1_5B_Instruct,0.8939075630252101,0.8308823529411764,0.8612434620121144,0.9100840336134454,0.9,0.8957983193277311,0.8697478991596639,,,
7
  Qwen2-72B-Instruct,0.9613445378151261,0.9516806722689075,0.956488195931227,0.9638655462184874,0.9596638655462185,0.9596638655462185,0.9621848739495799,,,
8
  Meta-Llama-3-8B-Instruct,0.9210084033613445,0.880672268907563,0.9003888121913395,0.9411764705882353,0.9033613445378151,0.9260504201680673,0.9134453781512605,,,
9
+ merged_llama3_8b_sg_inst_avg_diff,0.9369747899159664,0.8936974789915966,0.9148245940061492,0.9470588235294117,0.9218487394957983,0.9403361344537815,0.938655462184874,,,
10
  Meta-Llama-3.1-70B-Instruct,0.9615546218487395,0.9512605042016806,0.9563798632627071,0.9647058823529412,0.9512605042016806,0.9647058823529412,0.965546218487395,,,
11
  Qwen2_5_3B_Instruct,0.9378151260504202,0.8924369747899159,0.9145635113049859,0.9504201680672268,0.9327731092436975,0.9378151260504202,0.9302521008403362,,,
12
  SeaLLMs-v3-7B-Chat,0.9403361344537815,0.917016806722689,0.9285300818164836,0.9537815126050421,0.9378151260504202,0.9394957983193277,0.9302521008403362,,,
 
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.9573529411764706,0.9365546218487395,0.9468395810403457,0.9638655462184874,0.9428571428571428,0.9605042016806723,0.9621848739495799,,,
18
  gemma-2-2b-it,0.917016806722689,0.8665966386554622,0.8910940700869288,0.934453781512605,0.9025210084033614,0.9193277310924369,0.9117647058823529,,,
19
  llama3-8b-cpt-sea-lionv2-instruct,0.9365546218487395,0.9086134453781513,0.9223724784871395,0.9420168067226891,0.926890756302521,0.9436974789915966,0.9336134453781513,,,
 
20
  Qwen2_5_0_5B_Instruct,0.6584033613445378,0.48172268907563026,0.5563732844778362,0.692436974789916,0.673109243697479,0.653781512605042,0.6142857142857143,,,
21
  GPT4o_0513,0.9605042016806723,0.951890756302521,0.9561780814209724,0.965546218487395,0.9537815126050421,0.9630252100840336,0.9596638655462185,,,
results/cross_lingual/zero_shot/cross_xquad_no_prompt.csv CHANGED
@@ -1,6 +1,11 @@
1
  Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino
2
- llama3-8b-cpt-sea-lionv2.1-instruct,0.9191176470588235,0.8352941176470587,0.87520339228777,0.9394957983193277,0.8932773109243698,0.9285714285714286,0.915126050420168,,,
 
 
 
3
  Meta-Llama-3-8B-Instruct,0.9060924369747899,0.8224789915966386,0.8622634639161603,0.9319327731092437,0.8932773109243698,0.9134453781512605,0.8857142857142857,,,
 
 
 
4
  gemma2-9b-cpt-sea-lionv3-instruct,0.9315126050420168,0.8716386554621849,0.9005815677746684,0.9453781512605042,0.9142857142857143,0.9369747899159664,0.9294117647058824,,,
5
- cross_openhermes_llama3_8b_12288_inst,0.9054621848739496,0.8298319327731092,0.8659989418997561,0.9285714285714286,0.892436974789916,0.9134453781512605,0.8873949579831932,,,
6
  GPT4o_0513,0.8941176470588236,0.8014705882352942,0.8452629967360276,0.9302521008403362,0.8857142857142857,0.9168067226890756,0.8436974789915966,,,
 
1
  Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino
2
+ Meta-Llama-3.1-8B-Instruct,0.9168067226890756,0.8292016806722688,0.870806433460842,0.9436974789915966,0.8949579831932774,0.9201680672268907,0.9084033613445378,,,
3
+ llama3-8b-cpt-sea-lionv2.1-instruct,0.928781512605042,0.8592436974789917,0.892660412722869,0.9470588235294117,0.9084033613445378,0.9352941176470588,0.9243697478991597,,,
4
+ Qwen2_5_7B_Instruct,0.9069327731092437,0.8264705882352941,0.8648342089942876,0.9210084033613445,0.8991596638655462,0.9092436974789916,0.8983193277310925,,,
5
+ Sailor2-8B-Chat,0.9086134453781513,0.8378151260504201,0.8717792421413649,0.9252100840336135,0.8949579831932774,0.9117647058823529,0.9025210084033614,,,
6
  Meta-Llama-3-8B-Instruct,0.9060924369747899,0.8224789915966386,0.8622634639161603,0.9319327731092437,0.8932773109243698,0.9134453781512605,0.8857142857142857,,,
7
+ merged_llama3_8b_sg_inst_avg_diff,0.9117647058823529,0.8266806722689075,0.8671405721911006,0.9302521008403362,0.8899159663865546,0.9210084033613445,0.9058823529411765,,,
8
+ SeaLLMs-v3-7B-Chat,0.8943277310924369,0.7991596638655463,0.8440696412045011,0.9210084033613445,0.8773109243697479,0.9,0.8789915966386554,,,
9
+ gemma-2-9b-it,0.8668067226890757,0.7012605042016806,0.7752949732453414,0.8773109243697479,0.8529411764705882,0.8714285714285714,0.865546218487395,,,
10
  gemma2-9b-cpt-sea-lionv3-instruct,0.9315126050420168,0.8716386554621849,0.9005815677746684,0.9453781512605042,0.9142857142857143,0.9369747899159664,0.9294117647058824,,,
 
11
  GPT4o_0513,0.8941176470588236,0.8014705882352942,0.8452629967360276,0.9302521008403362,0.8857142857142857,0.9168067226890756,0.8436974789915966,,,
results/cultural_reasoning/zero_shot/cn_eval.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.8285714285714286
3
- Meta-Llama-3.1-8B-Instruct,0.4857142857142857
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.5047619047619047
5
  Qwen2_5_32B_Instruct,0.8476190476190476
6
  Qwen2_5_7B_Instruct,0.8
7
  Qwen2_5_1_5B_Instruct,0.5523809523809524
8
  Qwen2-72B-Instruct,0.8285714285714286
 
9
  Meta-Llama-3-8B-Instruct,0.4666666666666667
 
10
  Meta-Llama-3.1-70B-Instruct,0.5428571428571428
11
  Qwen2_5_3B_Instruct,0.7142857142857143
12
  SeaLLMs-v3-7B-Chat,0.819047619047619
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.8285714285714286
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.5904761904761905
18
  gemma-2-2b-it,0.3619047619047619
19
  llama3-8b-cpt-sea-lionv2-instruct,0.49523809523809526
20
- cross_openhermes_llama3_8b_12288_inst,0.5523809523809524
21
  Qwen2_5_0_5B_Instruct,0.3619047619047619
22
  GPT4o_0513,0.8095238095238095
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.8285714285714286
3
+ Meta-Llama-3.1-8B-Instruct,0.45714285714285713
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.5047619047619047
5
  Qwen2_5_32B_Instruct,0.8476190476190476
6
  Qwen2_5_7B_Instruct,0.8
7
  Qwen2_5_1_5B_Instruct,0.5523809523809524
8
  Qwen2-72B-Instruct,0.8285714285714286
9
+ Sailor2-8B-Chat,0.7142857142857143
10
  Meta-Llama-3-8B-Instruct,0.4666666666666667
11
+ merged_llama3_8b_sg_inst_avg_diff,0.5142857142857142
12
  Meta-Llama-3.1-70B-Instruct,0.5428571428571428
13
  Qwen2_5_3B_Instruct,0.7142857142857143
14
  SeaLLMs-v3-7B-Chat,0.819047619047619
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.5904761904761905
20
  gemma-2-2b-it,0.3619047619047619
21
  llama3-8b-cpt-sea-lionv2-instruct,0.49523809523809526
 
22
  Qwen2_5_0_5B_Instruct,0.3619047619047619
23
  GPT4o_0513,0.8095238095238095
results/cultural_reasoning/zero_shot/ph_eval.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.52
3
- Meta-Llama-3.1-8B-Instruct,0.6
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.56
5
  Qwen2_5_32B_Instruct,0.7
6
  Qwen2_5_7B_Instruct,0.55
7
  Qwen2_5_1_5B_Instruct,0.37
8
  Qwen2-72B-Instruct,0.62
 
9
  Meta-Llama-3-8B-Instruct,0.58
 
10
  Meta-Llama-3.1-70B-Instruct,0.68
11
  Qwen2_5_3B_Instruct,0.4
12
  SeaLLMs-v3-7B-Chat,0.47
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.6
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.59
18
  gemma-2-2b-it,0.4
19
  llama3-8b-cpt-sea-lionv2-instruct,0.56
20
- cross_openhermes_llama3_8b_12288_inst,0.52
21
  Qwen2_5_0_5B_Instruct,0.27
22
  GPT4o_0513,0.77
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.52
3
+ Meta-Llama-3.1-8B-Instruct,0.57
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.56
5
  Qwen2_5_32B_Instruct,0.7
6
  Qwen2_5_7B_Instruct,0.55
7
  Qwen2_5_1_5B_Instruct,0.37
8
  Qwen2-72B-Instruct,0.62
9
+ Sailor2-8B-Chat,0.53
10
  Meta-Llama-3-8B-Instruct,0.58
11
+ merged_llama3_8b_sg_inst_avg_diff,0.54
12
  Meta-Llama-3.1-70B-Instruct,0.68
13
  Qwen2_5_3B_Instruct,0.4
14
  SeaLLMs-v3-7B-Chat,0.47
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.59
20
  gemma-2-2b-it,0.4
21
  llama3-8b-cpt-sea-lionv2-instruct,0.56
 
22
  Qwen2_5_0_5B_Instruct,0.27
23
  GPT4o_0513,0.77
results/cultural_reasoning/zero_shot/sg_eval.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.6796116504854369
3
- Meta-Llama-3.1-8B-Instruct,0.5728155339805825
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.6601941747572816
5
  Qwen2_5_32B_Instruct,0.7184466019417476
6
  Qwen2_5_7B_Instruct,0.6699029126213593
7
  Qwen2_5_1_5B_Instruct,0.5048543689320388
8
  Qwen2-72B-Instruct,0.7378640776699029
 
9
  Meta-Llama-3-8B-Instruct,0.6504854368932039
 
10
  Meta-Llama-3.1-70B-Instruct,0.7184466019417476
11
  Qwen2_5_3B_Instruct,0.6310679611650486
12
  SeaLLMs-v3-7B-Chat,0.7184466019417476
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.7669902912621359
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.6796116504854369
18
  gemma-2-2b-it,0.5533980582524272
19
  llama3-8b-cpt-sea-lionv2-instruct,0.6504854368932039
20
- cross_openhermes_llama3_8b_12288_inst,0.6310679611650486
21
  Qwen2_5_0_5B_Instruct,0.4077669902912621
22
  GPT4o_0513,0.8446601941747572
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.6796116504854369
3
+ Meta-Llama-3.1-8B-Instruct,0.6407766990291263
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.6601941747572816
5
  Qwen2_5_32B_Instruct,0.7184466019417476
6
  Qwen2_5_7B_Instruct,0.6699029126213593
7
  Qwen2_5_1_5B_Instruct,0.5048543689320388
8
  Qwen2-72B-Instruct,0.7378640776699029
9
+ Sailor2-8B-Chat,0.6019417475728155
10
  Meta-Llama-3-8B-Instruct,0.6504854368932039
11
+ merged_llama3_8b_sg_inst_avg_diff,0.6019417475728155
12
  Meta-Llama-3.1-70B-Instruct,0.7184466019417476
13
  Qwen2_5_3B_Instruct,0.6310679611650486
14
  SeaLLMs-v3-7B-Chat,0.7184466019417476
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.6796116504854369
20
  gemma-2-2b-it,0.5533980582524272
21
  llama3-8b-cpt-sea-lionv2-instruct,0.6504854368932039
 
22
  Qwen2_5_0_5B_Instruct,0.4077669902912621
23
  GPT4o_0513,0.8446601941747572
results/cultural_reasoning/zero_shot/sg_eval_v1_cleaned.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.6323529411764706
3
- Meta-Llama-3.1-8B-Instruct,0.5294117647058824
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.6617647058823529
5
  Qwen2_5_32B_Instruct,0.6470588235294118
6
  Qwen2_5_7B_Instruct,0.5882352941176471
7
  Qwen2_5_1_5B_Instruct,0.47058823529411764
8
  Qwen2-72B-Instruct,0.6764705882352942
 
9
  Meta-Llama-3-8B-Instruct,0.5882352941176471
 
10
  Meta-Llama-3.1-70B-Instruct,0.6617647058823529
11
  Qwen2_5_3B_Instruct,0.5882352941176471
12
  SeaLLMs-v3-7B-Chat,0.5882352941176471
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.6911764705882353
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.6029411764705882
18
  gemma-2-2b-it,0.5147058823529411
19
  llama3-8b-cpt-sea-lionv2-instruct,0.6617647058823529
20
- cross_openhermes_llama3_8b_12288_inst,0.6029411764705882
21
  Qwen2_5_0_5B_Instruct,0.36764705882352944
22
  GPT4o_0513,0.8088235294117647
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.6323529411764706
3
+ Meta-Llama-3.1-8B-Instruct,0.5735294117647058
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.6617647058823529
5
  Qwen2_5_32B_Instruct,0.6470588235294118
6
  Qwen2_5_7B_Instruct,0.5882352941176471
7
  Qwen2_5_1_5B_Instruct,0.47058823529411764
8
  Qwen2-72B-Instruct,0.6764705882352942
9
+ Sailor2-8B-Chat,0.5735294117647058
10
  Meta-Llama-3-8B-Instruct,0.5882352941176471
11
+ merged_llama3_8b_sg_inst_avg_diff,0.5441176470588235
12
  Meta-Llama-3.1-70B-Instruct,0.6617647058823529
13
  Qwen2_5_3B_Instruct,0.5882352941176471
14
  SeaLLMs-v3-7B-Chat,0.5882352941176471
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.6029411764705882
20
  gemma-2-2b-it,0.5147058823529411
21
  llama3-8b-cpt-sea-lionv2-instruct,0.6617647058823529
 
22
  Qwen2_5_0_5B_Instruct,0.36764705882352944
23
  GPT4o_0513,0.8088235294117647
results/cultural_reasoning/zero_shot/sg_eval_v2_mcq.csv CHANGED
@@ -1,15 +1,17 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.7872727272727272
3
- Meta-Llama-3.1-8B-Instruct,0.7854545454545454
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.7836363636363637
5
  Qwen2_5_32B_Instruct,0.8436363636363636
6
  Qwen2_5_7B_Instruct,0.78
7
  Qwen2_5_1_5B_Instruct,0.6636363636363637
8
  Qwen2-72B-Instruct,0.8581818181818182
 
9
  Meta-Llama-3-8B-Instruct,0.7909090909090909
 
10
  Meta-Llama-3.1-70B-Instruct,0.8763636363636363
11
  Qwen2_5_3B_Instruct,0.72
12
- SeaLLMs-v3-7B-Chat,0.7836363636363637
13
  Qwen2_5_72B_Instruct,0.8618181818181818
14
  gemma-2-9b-it,0.8036363636363636
15
  Meta-Llama-3-70B-Instruct,0.8381818181818181
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.8345454545454546
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.8090909090909091
18
  gemma-2-2b-it,0.7163636363636363
19
  llama3-8b-cpt-sea-lionv2-instruct,0.7763636363636364
20
- cross_openhermes_llama3_8b_12288_inst,0.7890909090909091
21
  Qwen2_5_0_5B_Instruct,0.5727272727272728
22
  GPT4o_0513,0.8709090909090909
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.7872727272727272
3
+ Meta-Llama-3.1-8B-Instruct,0.8127272727272727
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.7836363636363637
5
  Qwen2_5_32B_Instruct,0.8436363636363636
6
  Qwen2_5_7B_Instruct,0.78
7
  Qwen2_5_1_5B_Instruct,0.6636363636363637
8
  Qwen2-72B-Instruct,0.8581818181818182
9
+ Sailor2-8B-Chat,0.730909090909091
10
  Meta-Llama-3-8B-Instruct,0.7909090909090909
11
+ merged_llama3_8b_sg_inst_avg_diff,0.8109090909090909
12
  Meta-Llama-3.1-70B-Instruct,0.8763636363636363
13
  Qwen2_5_3B_Instruct,0.72
14
+ SeaLLMs-v3-7B-Chat,0.7909090909090909
15
  Qwen2_5_72B_Instruct,0.8618181818181818
16
  gemma-2-9b-it,0.8036363636363636
17
  Meta-Llama-3-70B-Instruct,0.8381818181818181
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.8090909090909091
20
  gemma-2-2b-it,0.7163636363636363
21
  llama3-8b-cpt-sea-lionv2-instruct,0.7763636363636364
 
22
  Qwen2_5_0_5B_Instruct,0.5727272727272728
23
  GPT4o_0513,0.8709090909090909
results/cultural_reasoning/zero_shot/sg_eval_v2_mcq_no_prompt.csv CHANGED
@@ -1,6 +1,11 @@
1
  Model,Accuracy
2
- llama3-8b-cpt-sea-lionv2.1-instruct,0.8
 
 
 
3
  Meta-Llama-3-8B-Instruct,0.8054545454545454
 
 
 
4
  gemma2-9b-cpt-sea-lionv3-instruct,0.7818181818181819
5
- cross_openhermes_llama3_8b_12288_inst,0.7945454545454546
6
  GPT4o_0513,0.9072727272727272
 
1
  Model,Accuracy
2
+ Meta-Llama-3.1-8B-Instruct,0.7418181818181818
3
+ llama3-8b-cpt-sea-lionv2.1-instruct,0.7945454545454546
4
+ Qwen2_5_7B_Instruct,0.7654545454545455
5
+ Sailor2-8B-Chat,0.7145454545454546
6
  Meta-Llama-3-8B-Instruct,0.8054545454545454
7
+ merged_llama3_8b_sg_inst_avg_diff,0.7854545454545454
8
+ SeaLLMs-v3-7B-Chat,0.7581818181818182
9
+ gemma-2-9b-it,0.7618181818181818
10
  gemma2-9b-cpt-sea-lionv3-instruct,0.7818181818181819
 
11
  GPT4o_0513,0.9072727272727272
results/cultural_reasoning/zero_shot/sg_eval_v2_open.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,56.559999999999995
3
- Meta-Llama-3.1-8B-Instruct,49.2
4
  llama3-8b-cpt-sea-lionv2.1-instruct,50.03999999999999
5
  Qwen2_5_32B_Instruct,53.2
6
  Qwen2_5_7B_Instruct,50.279999999999994
7
  Qwen2_5_1_5B_Instruct,44.480000000000004
8
  Qwen2-72B-Instruct,54.080000000000005
 
9
  Meta-Llama-3-8B-Instruct,51.120000000000005
 
10
  Meta-Llama-3.1-70B-Instruct,51.31999999999999
11
  Qwen2_5_3B_Instruct,47.24
12
  SeaLLMs-v3-7B-Chat,55.0
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,53.2
17
  gemma2-9b-cpt-sea-lionv3-instruct,55.0
18
  gemma-2-2b-it,52.08
19
  llama3-8b-cpt-sea-lionv2-instruct,50.03999999999999
20
- cross_openhermes_llama3_8b_12288_inst,52.480000000000004
21
  Qwen2_5_0_5B_Instruct,35.28
22
  GPT4o_0513,57.28
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,56.559999999999995
3
+ Meta-Llama-3.1-8B-Instruct,47.839999999999996
4
  llama3-8b-cpt-sea-lionv2.1-instruct,50.03999999999999
5
  Qwen2_5_32B_Instruct,53.2
6
  Qwen2_5_7B_Instruct,50.279999999999994
7
  Qwen2_5_1_5B_Instruct,44.480000000000004
8
  Qwen2-72B-Instruct,54.080000000000005
9
+ Sailor2-8B-Chat,54.36
10
  Meta-Llama-3-8B-Instruct,51.120000000000005
11
+ merged_llama3_8b_sg_inst_avg_diff,49.2
12
  Meta-Llama-3.1-70B-Instruct,51.31999999999999
13
  Qwen2_5_3B_Instruct,47.24
14
  SeaLLMs-v3-7B-Chat,55.0
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,55.0
20
  gemma-2-2b-it,52.08
21
  llama3-8b-cpt-sea-lionv2-instruct,50.03999999999999
 
22
  Qwen2_5_0_5B_Instruct,35.28
23
  GPT4o_0513,57.28
results/cultural_reasoning/zero_shot/us_eval.csv CHANGED
@@ -6,7 +6,9 @@ Qwen2_5_32B_Instruct,0.8411214953271028
6
  Qwen2_5_7B_Instruct,0.7663551401869159
7
  Qwen2_5_1_5B_Instruct,0.5981308411214953
8
  Qwen2-72B-Instruct,0.8785046728971962
 
9
  Meta-Llama-3-8B-Instruct,0.7009345794392523
 
10
  Meta-Llama-3.1-70B-Instruct,0.8411214953271028
11
  Qwen2_5_3B_Instruct,0.6728971962616822
12
  SeaLLMs-v3-7B-Chat,0.6915887850467289
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.822429906542056
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.8037383177570093
18
  gemma-2-2b-it,0.6915887850467289
19
  llama3-8b-cpt-sea-lionv2-instruct,0.7009345794392523
20
- cross_openhermes_llama3_8b_12288_inst,0.7663551401869159
21
  Qwen2_5_0_5B_Instruct,0.37383177570093457
22
  GPT4o_0513,0.8691588785046729
 
6
  Qwen2_5_7B_Instruct,0.7663551401869159
7
  Qwen2_5_1_5B_Instruct,0.5981308411214953
8
  Qwen2-72B-Instruct,0.8785046728971962
9
+ Sailor2-8B-Chat,0.7009345794392523
10
  Meta-Llama-3-8B-Instruct,0.7009345794392523
11
+ merged_llama3_8b_sg_inst_avg_diff,0.7383177570093458
12
  Meta-Llama-3.1-70B-Instruct,0.8411214953271028
13
  Qwen2_5_3B_Instruct,0.6728971962616822
14
  SeaLLMs-v3-7B-Chat,0.6915887850467289
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.8037383177570093
20
  gemma-2-2b-it,0.6915887850467289
21
  llama3-8b-cpt-sea-lionv2-instruct,0.7009345794392523
 
22
  Qwen2_5_0_5B_Instruct,0.37383177570093457
23
  GPT4o_0513,0.8691588785046729
results/dialogue/zero_shot/dialogsum.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,Average,ROUGE-1,ROUGE-2,ROUGE-L
2
  Qwen2-7B-Instruct,0.2092663759873139,0.30486100228371826,0.09413830506038247,0.22879982061784096
3
- Meta-Llama-3.1-8B-Instruct,0.24990743661648132,0.3515557454075673,0.12563120411564133,0.2725353603262354
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.2538187048721643,0.3556160487203703,0.12835761178300684,0.27748245411311584
5
  Qwen2_5_32B_Instruct,0.2393912015484827,0.3451081398022419,0.11160543395371676,0.26146003088948944
6
  Qwen2_5_7B_Instruct,0.2502928721533066,0.35566069744050016,0.12210269253668227,0.27311522648273734
7
  Qwen2_5_1_5B_Instruct,0.20263242988485167,0.30002072253966694,0.08416670238558713,0.22370986472930096
8
  Qwen2-72B-Instruct,0.2183280630214023,0.316174552903144,0.10156543495268992,0.23724420120837297
 
9
  Meta-Llama-3-8B-Instruct,0.23978455271183616,0.33971099717559883,0.1203340311564728,0.2593086298034369
 
10
  Meta-Llama-3.1-70B-Instruct,0.2526239717396146,0.35714386898604744,0.1258832921736473,0.27484475405914904
11
  Qwen2_5_3B_Instruct,0.22107390172674926,0.32206286484028823,0.10065030710901035,0.24050853323094928
12
  SeaLLMs-v3-7B-Chat,0.24891094210680076,0.35393482223136147,0.12172072639345373,0.27107727769558715
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.2343478938479703,0.3386251381162625,0.10742381514017992,0
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.23790909190042164,0.34294544984076464,0.1078722585042388,0.26290956735626153
18
  gemma-2-2b-it,0.2597323674875989,0.36848124762381895,0.12622684440269072,0.2844890104362872
19
  llama3-8b-cpt-sea-lionv2-instruct,0.25777587511641403,0.35911990072292727,0.13269121463917308,0.2815165099871418
20
- cross_openhermes_llama3_8b_12288_inst,0.27081377092899106,0.3746700335717668,0.1422316280821482,0.2955396511330582
21
  Qwen2_5_0_5B_Instruct,0.19408176276624156,0.28989753303423227,0.07842728643649079,0.21392046882800164
22
  GPT4o_0513,0.2375730297294346,0.3364674648846549,0.11718194476069822,0.25906967954295057
 
1
  Model,Average,ROUGE-1,ROUGE-2,ROUGE-L
2
  Qwen2-7B-Instruct,0.2092663759873139,0.30486100228371826,0.09413830506038247,0.22879982061784096
3
+ Meta-Llama-3.1-8B-Instruct,0.25246410428426797,0.3568915790041648,0.1259705585621535,0.27453017528648554
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.2538187048721643,0.3556160487203703,0.12835761178300684,0.27748245411311584
5
  Qwen2_5_32B_Instruct,0.2393912015484827,0.3451081398022419,0.11160543395371676,0.26146003088948944
6
  Qwen2_5_7B_Instruct,0.2502928721533066,0.35566069744050016,0.12210269253668227,0.27311522648273734
7
  Qwen2_5_1_5B_Instruct,0.20263242988485167,0.30002072253966694,0.08416670238558713,0.22370986472930096
8
  Qwen2-72B-Instruct,0.2183280630214023,0.316174552903144,0.10156543495268992,0.23724420120837297
9
+ Sailor2-8B-Chat,0.19777087324327317,0.2970393044008424,0.07701994204737679,0.21925337328160027
10
  Meta-Llama-3-8B-Instruct,0.23978455271183616,0.33971099717559883,0.1203340311564728,0.2593086298034369
11
+ merged_llama3_8b_sg_inst_avg_diff,0.25236243090492,0.3573462392196718,0.125506438977953,0.27423461451713527
12
  Meta-Llama-3.1-70B-Instruct,0.2526239717396146,0.35714386898604744,0.1258832921736473,0.27484475405914904
13
  Qwen2_5_3B_Instruct,0.22107390172674926,0.32206286484028823,0.10065030710901035,0.24050853323094928
14
  SeaLLMs-v3-7B-Chat,0.24891094210680076,0.35393482223136147,0.12172072639345373,0.27107727769558715
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.23790909190042164,0.34294544984076464,0.1078722585042388,0.26290956735626153
20
  gemma-2-2b-it,0.2597323674875989,0.36848124762381895,0.12622684440269072,0.2844890104362872
21
  llama3-8b-cpt-sea-lionv2-instruct,0.25777587511641403,0.35911990072292727,0.13269121463917308,0.2815165099871418
 
22
  Qwen2_5_0_5B_Instruct,0.19408176276624156,0.28989753303423227,0.07842728643649079,0.21392046882800164
23
  GPT4o_0513,0.2375730297294346,0.3364674648846549,0.11718194476069822,0.25906967954295057
results/dialogue/zero_shot/dream.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.9353258206761391
3
- Meta-Llama-3.1-8B-Instruct,0.9039686428221461
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.8838804507594317
5
  Qwen2_5_32B_Instruct,0.9559039686428221
6
  Qwen2_5_7B_Instruct,0.9348358647721705
7
  Qwen2_5_1_5B_Instruct,0.8314551690347869
8
  Qwen2-72B-Instruct,0.9612934835864773
 
9
  Meta-Llama-3-8B-Instruct,0.8946594806467418
 
10
  Meta-Llama-3.1-70B-Instruct,0.9559039686428221
11
  Qwen2_5_3B_Instruct,0.9029887310142087
12
  SeaLLMs-v3-7B-Chat,0.9265066144047036
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.9461048505634493
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.9407153356197943
18
  gemma-2-2b-it,0.8510534051935326
19
  llama3-8b-cpt-sea-lionv2-instruct,0.8858402743753062
20
- cross_openhermes_llama3_8b_12288_inst,0.8829005389514943
21
  Qwen2_5_0_5B_Instruct,0.6526212640862322
22
  GPT4o_0513,0.9583537481626654
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.9353258206761391
3
+ Meta-Llama-3.1-8B-Instruct,0.9054385105340519
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.8838804507594317
5
  Qwen2_5_32B_Instruct,0.9559039686428221
6
  Qwen2_5_7B_Instruct,0.9348358647721705
7
  Qwen2_5_1_5B_Instruct,0.8314551690347869
8
  Qwen2-72B-Instruct,0.9612934835864773
9
+ Sailor2-8B-Chat,0.9054385105340519
10
  Meta-Llama-3-8B-Instruct,0.8946594806467418
11
+ merged_llama3_8b_sg_inst_avg_diff,0.9103380695737384
12
  Meta-Llama-3.1-70B-Instruct,0.9559039686428221
13
  Qwen2_5_3B_Instruct,0.9029887310142087
14
  SeaLLMs-v3-7B-Chat,0.9265066144047036
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.9407153356197943
20
  gemma-2-2b-it,0.8510534051935326
21
  llama3-8b-cpt-sea-lionv2-instruct,0.8858402743753062
 
22
  Qwen2_5_0_5B_Instruct,0.6526212640862322
23
  GPT4o_0513,0.9583537481626654
results/dialogue/zero_shot/samsum.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,Average,ROUGE-1,ROUGE-2,ROUGE-L
2
  Qwen2-7B-Instruct,0.25668781132950264,0.36375948458827556,0.12939804942125302,0.27690589997897935
3
- Meta-Llama-3.1-8B-Instruct,0.2891505262763006,0.4001228010515775,0.15677431231732958,0.31055446545999466
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.3049906423685726,0.42002411403511675,0.16877419641049218,0.32617361666010874
5
  Qwen2_5_32B_Instruct,0.2844232627209405,0.3986263552639068,0.14766658533002341,0.3069768475688912
6
  Qwen2_5_7B_Instruct,0.2987576845890178,0.4163299367235864,0.1599063413842216,0.32003677565924527
7
  Qwen2_5_1_5B_Instruct,0.2333120091694482,0.34339111721032756,0.10195887716459845,0.25458603313341865
8
  Qwen2-72B-Instruct,0.2800906719573321,0.3887231369098802,0.15237661526996754,0.29917226369214855
 
9
  Meta-Llama-3-8B-Instruct,0.2846315092346869,0.39397110152251813,0.154320846916639,0.30560257926490364
 
10
  Meta-Llama-3.1-70B-Instruct,0.28934874612070227,0.4036295731242805,0.15211190810296196,0.31230475713486433
11
  Qwen2_5_3B_Instruct,0.26935624341081515,0.380865832002109,0.13872106416227833,0.28848183406805816
12
  SeaLLMs-v3-7B-Chat,0.2959981719045788,0.4078820748825196,0.16338306782652476,0.316729373004692
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.2713801253928723,0.3836253496005304,0.13683087953788298,0
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.29509358497900623,0.4129497614150914,0.1502573048316353,0.32207368869029196
18
  gemma-2-2b-it,0.31118787136959813,0.4324251755711466,0.16441328335793207,0.33672515517971563
19
  llama3-8b-cpt-sea-lionv2-instruct,0.306997595680581,0.4214048099551701,0.1709790451938523,0.3286089318927205
20
- cross_openhermes_llama3_8b_12288_inst,0.30043920936284546,0.41309659421156786,0.16636483587009585,0.3218561980068726
21
  Qwen2_5_0_5B_Instruct,0.20766179544894214,0.3105872033328297,0.08726222085933319,0.22513596215466355
22
  GPT4o_0513,0.27736679291505306,0.386750207633093,0.14889081847621596,0.2964593526358502
 
1
  Model,Average,ROUGE-1,ROUGE-2,ROUGE-L
2
  Qwen2-7B-Instruct,0.25668781132950264,0.36375948458827556,0.12939804942125302,0.27690589997897935
3
+ Meta-Llama-3.1-8B-Instruct,0.2820716862174726,0.39415911433028605,0.148351765041582,0.30370417928054966
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.3049906423685726,0.42002411403511675,0.16877419641049218,0.32617361666010874
5
  Qwen2_5_32B_Instruct,0.2844232627209405,0.3986263552639068,0.14766658533002341,0.3069768475688912
6
  Qwen2_5_7B_Instruct,0.2987576845890178,0.4163299367235864,0.1599063413842216,0.32003677565924527
7
  Qwen2_5_1_5B_Instruct,0.2333120091694482,0.34339111721032756,0.10195887716459845,0.25458603313341865
8
  Qwen2-72B-Instruct,0.2800906719573321,0.3887231369098802,0.15237661526996754,0.29917226369214855
9
+ Sailor2-8B-Chat,0.23525560304744508,0.34567892481583223,0.10170204161284628,0.2583858427136568
10
  Meta-Llama-3-8B-Instruct,0.2846315092346869,0.39397110152251813,0.154320846916639,0.30560257926490364
11
+ merged_llama3_8b_sg_inst_avg_diff,0.2827552959388026,0.3953429193664384,0.14797005050571224,0.30495291794425716
12
  Meta-Llama-3.1-70B-Instruct,0.28934874612070227,0.4036295731242805,0.15211190810296196,0.31230475713486433
13
  Qwen2_5_3B_Instruct,0.26935624341081515,0.380865832002109,0.13872106416227833,0.28848183406805816
14
  SeaLLMs-v3-7B-Chat,0.2959981719045788,0.4078820748825196,0.16338306782652476,0.316729373004692
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.29509358497900623,0.4129497614150914,0.1502573048316353,0.32207368869029196
20
  gemma-2-2b-it,0.31118787136959813,0.4324251755711466,0.16441328335793207,0.33672515517971563
21
  llama3-8b-cpt-sea-lionv2-instruct,0.306997595680581,0.4214048099551701,0.1709790451938523,0.3286089318927205
 
22
  Qwen2_5_0_5B_Instruct,0.20766179544894214,0.3105872033328297,0.08726222085933319,0.22513596215466355
23
  GPT4o_0513,0.27736679291505306,0.386750207633093,0.14889081847621596,0.2964593526358502
results/emotion/zero_shot/ind_emotion.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.6545454545454545
3
- Meta-Llama-3.1-8B-Instruct,0.6545454545454545
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.6681818181818182
5
  Qwen2_5_32B_Instruct,0.6909090909090909
6
  Qwen2_5_7B_Instruct,0.6636363636363637
7
  Qwen2_5_1_5B_Instruct,0.5795454545454546
8
  Qwen2-72B-Instruct,0.675
 
9
  Meta-Llama-3-8B-Instruct,0.6522727272727272
 
10
  Meta-Llama-3.1-70B-Instruct,0.7159090909090909
11
  Qwen2_5_3B_Instruct,0.5522727272727272
12
  SeaLLMs-v3-7B-Chat,0.6454545454545455
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.6954545454545454
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.7340909090909091
18
  gemma-2-2b-it,0.6636363636363637
19
  llama3-8b-cpt-sea-lionv2-instruct,0.6613636363636364
20
- cross_openhermes_llama3_8b_12288_inst,0.7045454545454546
21
  Qwen2_5_0_5B_Instruct,0.37727272727272726
22
  GPT4o_0513,0.7068181818181818
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.6545454545454545
3
+ Meta-Llama-3.1-8B-Instruct,0.6772727272727272
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.6681818181818182
5
  Qwen2_5_32B_Instruct,0.6909090909090909
6
  Qwen2_5_7B_Instruct,0.6636363636363637
7
  Qwen2_5_1_5B_Instruct,0.5795454545454546
8
  Qwen2-72B-Instruct,0.675
9
+ Sailor2-8B-Chat,0.7363636363636363
10
  Meta-Llama-3-8B-Instruct,0.6522727272727272
11
+ merged_llama3_8b_sg_inst_avg_diff,0.7
12
  Meta-Llama-3.1-70B-Instruct,0.7159090909090909
13
  Qwen2_5_3B_Instruct,0.5522727272727272
14
  SeaLLMs-v3-7B-Chat,0.6454545454545455
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.7340909090909091
20
  gemma-2-2b-it,0.6636363636363637
21
  llama3-8b-cpt-sea-lionv2-instruct,0.6613636363636364
 
22
  Qwen2_5_0_5B_Instruct,0.37727272727272726
23
  GPT4o_0513,0.7068181818181818
results/emotion/zero_shot/sst2.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.9346330275229358
3
- Meta-Llama-3.1-8B-Instruct,0.8646788990825688
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.9174311926605505
5
  Qwen2_5_32B_Instruct,0.9472477064220184
6
  Qwen2_5_7B_Instruct,0.9254587155963303
7
  Qwen2_5_1_5B_Instruct,0.9231651376146789
8
  Qwen2-72B-Instruct,0.9346330275229358
 
9
  Meta-Llama-3-8B-Instruct,0.8784403669724771
 
10
  Meta-Llama-3.1-70B-Instruct,0.9529816513761468
11
  Qwen2_5_3B_Instruct,0.8245412844036697
12
  SeaLLMs-v3-7B-Chat,0.9403669724770642
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.9311926605504587
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.9311926605504587
18
  gemma-2-2b-it,0.9243119266055045
19
  llama3-8b-cpt-sea-lionv2-instruct,0.9128440366972477
20
- cross_openhermes_llama3_8b_12288_inst,0.9288990825688074
21
  Qwen2_5_0_5B_Instruct,0.7889908256880734
22
  GPT4o_0513,0.9415137614678899
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.9346330275229358
3
+ Meta-Llama-3.1-8B-Instruct,0.8956422018348624
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.9174311926605505
5
  Qwen2_5_32B_Instruct,0.9472477064220184
6
  Qwen2_5_7B_Instruct,0.9254587155963303
7
  Qwen2_5_1_5B_Instruct,0.9231651376146789
8
  Qwen2-72B-Instruct,0.9346330275229358
9
+ Sailor2-8B-Chat,0.9461009174311926
10
  Meta-Llama-3-8B-Instruct,0.8784403669724771
11
+ merged_llama3_8b_sg_inst_avg_diff,0.8841743119266054
12
  Meta-Llama-3.1-70B-Instruct,0.9529816513761468
13
  Qwen2_5_3B_Instruct,0.8245412844036697
14
  SeaLLMs-v3-7B-Chat,0.9403669724770642
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.9311926605504587
20
  gemma-2-2b-it,0.9243119266055045
21
  llama3-8b-cpt-sea-lionv2-instruct,0.9128440366972477
 
22
  Qwen2_5_0_5B_Instruct,0.7889908256880734
23
  GPT4o_0513,0.9415137614678899
results/flores_translation/zero_shot/ind2eng.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,BLEU
2
  Qwen2-7B-Instruct,0.29408553325533265
3
- Meta-Llama-3.1-8B-Instruct,0.3765752579792989
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.38890283520513874
5
  Qwen2_5_32B_Instruct,0.3923422946746861
6
  Qwen2_5_7B_Instruct,0.36472669481333536
7
  Qwen2_5_1_5B_Instruct,0.2624938515155373
8
  Qwen2-72B-Instruct,0.4043588265556185
 
9
  Meta-Llama-3-8B-Instruct,0.33079891679041123
 
10
  Meta-Llama-3.1-70B-Instruct,0.43366494500251235
11
  Qwen2_5_3B_Instruct,0.3316936422167389
12
  SeaLLMs-v3-7B-Chat,0.3594829412574955
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.3901044620348051
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.3922444585880475
18
  gemma-2-2b-it,0.3482500758113138
19
  llama3-8b-cpt-sea-lionv2-instruct,0.3916108972514423
20
- cross_openhermes_llama3_8b_12288_inst,0.3900675406718024
21
  Qwen2_5_0_5B_Instruct,0.15776662800152338
22
  GPT4o_0513,0.42589589086974855
 
1
  Model,BLEU
2
  Qwen2-7B-Instruct,0.29408553325533265
3
+ Meta-Llama-3.1-8B-Instruct,0.37357029870421904
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.38890283520513874
5
  Qwen2_5_32B_Instruct,0.3923422946746861
6
  Qwen2_5_7B_Instruct,0.36472669481333536
7
  Qwen2_5_1_5B_Instruct,0.2624938515155373
8
  Qwen2-72B-Instruct,0.4043588265556185
9
+ Sailor2-8B-Chat,0.2487972955646591
10
  Meta-Llama-3-8B-Instruct,0.33079891679041123
11
+ merged_llama3_8b_sg_inst_avg_diff,0.38376586000725804
12
  Meta-Llama-3.1-70B-Instruct,0.43366494500251235
13
  Qwen2_5_3B_Instruct,0.3316936422167389
14
  SeaLLMs-v3-7B-Chat,0.3594829412574955
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.3922444585880475
20
  gemma-2-2b-it,0.3482500758113138
21
  llama3-8b-cpt-sea-lionv2-instruct,0.3916108972514423
 
22
  Qwen2_5_0_5B_Instruct,0.15776662800152338
23
  GPT4o_0513,0.42589589086974855
results/flores_translation/zero_shot/vie2eng.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,BLEU
2
  Qwen2-7B-Instruct,0.24106736560355876
3
- Meta-Llama-3.1-8B-Instruct,0.31019605539004524
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.32831099820283755
5
  Qwen2_5_32B_Instruct,0.33791529833420336
6
  Qwen2_5_7B_Instruct,0.3027564749728372
7
  Qwen2_5_1_5B_Instruct,0.21935649300365245
8
  Qwen2-72B-Instruct,0.33005323227052946
 
9
  Meta-Llama-3-8B-Instruct,0.2647448190950291
 
10
  Meta-Llama-3.1-70B-Instruct,0.37244508311079816
11
  Qwen2_5_3B_Instruct,0.27312609009801636
12
  SeaLLMs-v3-7B-Chat,0.30981028289420137
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.32198218156960645
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.3176282835521885
18
  gemma-2-2b-it,0.27518909199172303
19
  llama3-8b-cpt-sea-lionv2-instruct,0.327781936019637
20
- cross_openhermes_llama3_8b_12288_inst,0.29952664743728336
21
  Qwen2_5_0_5B_Instruct,0.14677375445859656
22
  GPT4o_0513,0.36219303373759176
 
1
  Model,BLEU
2
  Qwen2-7B-Instruct,0.24106736560355876
3
+ Meta-Llama-3.1-8B-Instruct,0.30308791998756773
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.32831099820283755
5
  Qwen2_5_32B_Instruct,0.33791529833420336
6
  Qwen2_5_7B_Instruct,0.3027564749728372
7
  Qwen2_5_1_5B_Instruct,0.21935649300365245
8
  Qwen2-72B-Instruct,0.33005323227052946
9
+ Sailor2-8B-Chat,0.1825857920682635
10
  Meta-Llama-3-8B-Instruct,0.2647448190950291
11
+ merged_llama3_8b_sg_inst_avg_diff,0.30900856944791294
12
  Meta-Llama-3.1-70B-Instruct,0.37244508311079816
13
  Qwen2_5_3B_Instruct,0.27312609009801636
14
  SeaLLMs-v3-7B-Chat,0.30981028289420137
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.3176282835521885
20
  gemma-2-2b-it,0.27518909199172303
21
  llama3-8b-cpt-sea-lionv2-instruct,0.327781936019637
 
22
  Qwen2_5_0_5B_Instruct,0.14677375445859656
23
  GPT4o_0513,0.36219303373759176
results/flores_translation/zero_shot/zho2eng.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,BLEU
2
  Qwen2-7B-Instruct,0.2113761361724575
3
- Meta-Llama-3.1-8B-Instruct,0.23889886925287113
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.2378480031503388
5
  Qwen2_5_32B_Instruct,0.26924811164378015
6
  Qwen2_5_7B_Instruct,0.2437311220019033
7
  Qwen2_5_1_5B_Instruct,0.18420680441018222
8
  Qwen2-72B-Instruct,0.23893268538329387
 
9
  Meta-Llama-3-8B-Instruct,0.199495011482748
 
10
  Meta-Llama-3.1-70B-Instruct,0.2832594176173152
11
  Qwen2_5_3B_Instruct,0.2245195134637718
12
  SeaLLMs-v3-7B-Chat,0.2516593644617717
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.2627781200417998
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.25110750921976727
18
  gemma-2-2b-it,0.21164036008441425
19
  llama3-8b-cpt-sea-lionv2-instruct,0.2381535278220489
20
- cross_openhermes_llama3_8b_12288_inst,0.2437964546132799
21
  Qwen2_5_0_5B_Instruct,0.13846648470535672
22
  GPT4o_0513,0.27722306559544163
 
1
  Model,BLEU
2
  Qwen2-7B-Instruct,0.2113761361724575
3
+ Meta-Llama-3.1-8B-Instruct,0.2429198421592902
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.2378480031503388
5
  Qwen2_5_32B_Instruct,0.26924811164378015
6
  Qwen2_5_7B_Instruct,0.2437311220019033
7
  Qwen2_5_1_5B_Instruct,0.18420680441018222
8
  Qwen2-72B-Instruct,0.23893268538329387
9
+ Sailor2-8B-Chat,0.16539980828035464
10
  Meta-Llama-3-8B-Instruct,0.199495011482748
11
+ merged_llama3_8b_sg_inst_avg_diff,0.24133164017585856
12
  Meta-Llama-3.1-70B-Instruct,0.2832594176173152
13
  Qwen2_5_3B_Instruct,0.2245195134637718
14
  SeaLLMs-v3-7B-Chat,0.2516593644617717
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.25110750921976727
20
  gemma-2-2b-it,0.21164036008441425
21
  llama3-8b-cpt-sea-lionv2-instruct,0.2381535278220489
 
22
  Qwen2_5_0_5B_Instruct,0.13846648470535672
23
  GPT4o_0513,0.27722306559544163
results/flores_translation/zero_shot/zsm2eng.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,BLEU
2
  Qwen2-7B-Instruct,0.28031997065822994
3
- Meta-Llama-3.1-8B-Instruct,0.3700921225177551
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.39042133634273773
5
  Qwen2_5_32B_Instruct,0.40310877536446654
6
  Qwen2_5_7B_Instruct,0.3466422765302921
7
  Qwen2_5_1_5B_Instruct,0.22890805100949677
8
  Qwen2-72B-Instruct,0.40796892621611885
 
9
  Meta-Llama-3-8B-Instruct,0.31625368345049
 
10
  Meta-Llama-3.1-70B-Instruct,0.4462132282683508
11
  Qwen2_5_3B_Instruct,0.31056841204320457
12
  SeaLLMs-v3-7B-Chat,0.3484133510670942
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.3841042767934729
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.4059485540822735
18
  gemma-2-2b-it,0.33737270487369614
19
  llama3-8b-cpt-sea-lionv2-instruct,0.38799258214381604
20
- cross_openhermes_llama3_8b_12288_inst,0.39589080400186966
21
  Qwen2_5_0_5B_Instruct,0.1194369315142997
22
  GPT4o_0513,0.451496635720668
 
1
  Model,BLEU
2
  Qwen2-7B-Instruct,0.28031997065822994
3
+ Meta-Llama-3.1-8B-Instruct,0.36579667849635744
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.39042133634273773
5
  Qwen2_5_32B_Instruct,0.40310877536446654
6
  Qwen2_5_7B_Instruct,0.3466422765302921
7
  Qwen2_5_1_5B_Instruct,0.22890805100949677
8
  Qwen2-72B-Instruct,0.40796892621611885
9
+ Sailor2-8B-Chat,0.269986448536842
10
  Meta-Llama-3-8B-Instruct,0.31625368345049
11
+ merged_llama3_8b_sg_inst_avg_diff,0.3729790018011108
12
  Meta-Llama-3.1-70B-Instruct,0.4462132282683508
13
  Qwen2_5_3B_Instruct,0.31056841204320457
14
  SeaLLMs-v3-7B-Chat,0.3484133510670942
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.4059485540822735
20
  gemma-2-2b-it,0.33737270487369614
21
  llama3-8b-cpt-sea-lionv2-instruct,0.38799258214381604
 
22
  Qwen2_5_0_5B_Instruct,0.1194369315142997
23
  GPT4o_0513,0.451496635720668
results/fundamental_nlp_tasks/zero_shot/c3.csv CHANGED
@@ -1,12 +1,13 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.9244577412116679
3
- Meta-Llama-3.1-8B-Instruct,0.8672400897531788
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.8676140613313388
5
  Qwen2_5_32B_Instruct,0.9603590127150337
6
  Qwen2_5_7B_Instruct,0.9121166791323859
7
  Qwen2_5_1_5B_Instruct,0.793941660433807
8
  Qwen2-72B-Instruct,0.9611069558713538
9
  Meta-Llama-3-8B-Instruct,0.8515332834704562
 
10
  Meta-Llama-3.1-70B-Instruct,0.9603590127150337
11
  Qwen2_5_3B_Instruct,0.8668661181750187
12
  SeaLLMs-v3-7B-Chat,0.9143605086013463
@@ -18,6 +19,5 @@ Meta-Llama-3.1-70B,0.7786088257292446
18
  gemma2-9b-cpt-sea-lionv3-instruct,0.9255796559461481
19
  gemma-2-2b-it,0.7700074794315632
20
  llama3-8b-cpt-sea-lionv2-instruct,0.8672400897531788
21
- cross_openhermes_llama3_8b_12288_inst,0.8485415108451758
22
  Qwen2_5_0_5B_Instruct,0.612939416604338
23
  GPT4o_0513,0.9648466716529543
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.9244577412116679
3
+ Meta-Llama-3.1-8B-Instruct,0.8814510097232611
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.8676140613313388
5
  Qwen2_5_32B_Instruct,0.9603590127150337
6
  Qwen2_5_7B_Instruct,0.9121166791323859
7
  Qwen2_5_1_5B_Instruct,0.793941660433807
8
  Qwen2-72B-Instruct,0.9611069558713538
9
  Meta-Llama-3-8B-Instruct,0.8515332834704562
10
+ merged_llama3_8b_sg_inst_avg_diff,0.8706058339566193
11
  Meta-Llama-3.1-70B-Instruct,0.9603590127150337
12
  Qwen2_5_3B_Instruct,0.8668661181750187
13
  SeaLLMs-v3-7B-Chat,0.9143605086013463
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.9255796559461481
20
  gemma-2-2b-it,0.7700074794315632
21
  llama3-8b-cpt-sea-lionv2-instruct,0.8672400897531788
 
22
  Qwen2_5_0_5B_Instruct,0.612939416604338
23
  GPT4o_0513,0.9648466716529543
results/fundamental_nlp_tasks/zero_shot/cola.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.7871524448705657
3
- Meta-Llama-3.1-8B-Instruct,0.6673058485139022
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.613614573346117
5
  Qwen2_5_32B_Instruct,0.8427612655800575
6
  Qwen2_5_7B_Instruct,0.7909875359539789
7
  Qwen2_5_1_5B_Instruct,0.7497603068072867
8
  Qwen2-72B-Instruct,0.8341323106423778
 
9
  Meta-Llama-3-8B-Instruct,0.6548418024928092
 
10
  Meta-Llama-3.1-70B-Instruct,0.850431447746884
11
  Qwen2_5_3B_Instruct,0.6644295302013423
12
  SeaLLMs-v3-7B-Chat,0.785234899328859
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.8063279002876318
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.825503355704698
18
  gemma-2-2b-it,0.6749760306807286
19
  llama3-8b-cpt-sea-lionv2-instruct,0.6078619367209971
20
- cross_openhermes_llama3_8b_12288_inst,0.8207094918504314
21
  Qwen2_5_0_5B_Instruct,0.6116970278044104
22
  GPT4o_0513,0.8398849472674976
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.7871524448705657
3
+ Meta-Llama-3.1-8B-Instruct,0.6519654841802492
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.613614573346117
5
  Qwen2_5_32B_Instruct,0.8427612655800575
6
  Qwen2_5_7B_Instruct,0.7909875359539789
7
  Qwen2_5_1_5B_Instruct,0.7497603068072867
8
  Qwen2-72B-Instruct,0.8341323106423778
9
+ Sailor2-8B-Chat,0.7900287631831256
10
  Meta-Llama-3-8B-Instruct,0.6548418024928092
11
+ merged_llama3_8b_sg_inst_avg_diff,0.6174496644295302
12
  Meta-Llama-3.1-70B-Instruct,0.850431447746884
13
  Qwen2_5_3B_Instruct,0.6644295302013423
14
  SeaLLMs-v3-7B-Chat,0.785234899328859
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.825503355704698
20
  gemma-2-2b-it,0.6749760306807286
21
  llama3-8b-cpt-sea-lionv2-instruct,0.6078619367209971
 
22
  Qwen2_5_0_5B_Instruct,0.6116970278044104
23
  GPT4o_0513,0.8398849472674976
results/fundamental_nlp_tasks/zero_shot/mnli.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.7295
3
- Meta-Llama-3.1-8B-Instruct,0.4825
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.5775
5
  Qwen2_5_32B_Instruct,0.8715
6
  Qwen2_5_7B_Instruct,0.8105
7
  Qwen2_5_1_5B_Instruct,0.6045
8
  Qwen2-72B-Instruct,0.7925
 
9
  Meta-Llama-3-8B-Instruct,0.546
 
10
  Meta-Llama-3.1-70B-Instruct,0.7015
11
  Qwen2_5_3B_Instruct,0.7465
12
  SeaLLMs-v3-7B-Chat,0.653
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.818
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.7445
18
  gemma-2-2b-it,0.6185
19
  llama3-8b-cpt-sea-lionv2-instruct,0.5765
20
- cross_openhermes_llama3_8b_12288_inst,0.6485
21
  Qwen2_5_0_5B_Instruct,0.5095
22
  GPT4o_0513,0.8335
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.7295
3
+ Meta-Llama-3.1-8B-Instruct,0.5295
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.5775
5
  Qwen2_5_32B_Instruct,0.8715
6
  Qwen2_5_7B_Instruct,0.8105
7
  Qwen2_5_1_5B_Instruct,0.6045
8
  Qwen2-72B-Instruct,0.7925
9
+ Sailor2-8B-Chat,0.664
10
  Meta-Llama-3-8B-Instruct,0.546
11
+ merged_llama3_8b_sg_inst_avg_diff,0.5375
12
  Meta-Llama-3.1-70B-Instruct,0.7015
13
  Qwen2_5_3B_Instruct,0.7465
14
  SeaLLMs-v3-7B-Chat,0.653
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.7445
20
  gemma-2-2b-it,0.6185
21
  llama3-8b-cpt-sea-lionv2-instruct,0.5765
 
22
  Qwen2_5_0_5B_Instruct,0.5095
23
  GPT4o_0513,0.8335
results/fundamental_nlp_tasks/zero_shot/mrpc.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.7867647058823529
3
- Meta-Llama-3.1-8B-Instruct,0.6740196078431373
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.5808823529411765
5
  Qwen2_5_32B_Instruct,0.7745098039215687
6
  Qwen2_5_7B_Instruct,0.7058823529411765
7
  Qwen2_5_1_5B_Instruct,0.6838235294117647
8
  Qwen2-72B-Instruct,0.8063725490196079
 
9
  Meta-Llama-3-8B-Instruct,0.678921568627451
 
10
  Meta-Llama-3.1-70B-Instruct,0.7696078431372549
11
  Qwen2_5_3B_Instruct,0.5661764705882353
12
  SeaLLMs-v3-7B-Chat,0.7475490196078431
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.7794117647058824
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.7794117647058824
18
  gemma-2-2b-it,0.7083333333333334
19
  llama3-8b-cpt-sea-lionv2-instruct,0.5833333333333334
20
- cross_openhermes_llama3_8b_12288_inst,0.6985294117647058
21
  Qwen2_5_0_5B_Instruct,0.5759803921568627
22
  GPT4o_0513,0.7377450980392157
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.7867647058823529
3
+ Meta-Llama-3.1-8B-Instruct,0.6666666666666666
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.5808823529411765
5
  Qwen2_5_32B_Instruct,0.7745098039215687
6
  Qwen2_5_7B_Instruct,0.7058823529411765
7
  Qwen2_5_1_5B_Instruct,0.6838235294117647
8
  Qwen2-72B-Instruct,0.8063725490196079
9
+ Sailor2-8B-Chat,0.7769607843137255
10
  Meta-Llama-3-8B-Instruct,0.678921568627451
11
+ merged_llama3_8b_sg_inst_avg_diff,0.6274509803921569
12
  Meta-Llama-3.1-70B-Instruct,0.7696078431372549
13
  Qwen2_5_3B_Instruct,0.5661764705882353
14
  SeaLLMs-v3-7B-Chat,0.7475490196078431
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.7794117647058824
20
  gemma-2-2b-it,0.7083333333333334
21
  llama3-8b-cpt-sea-lionv2-instruct,0.5833333333333334
 
22
  Qwen2_5_0_5B_Instruct,0.5759803921568627
23
  GPT4o_0513,0.7377450980392157
results/fundamental_nlp_tasks/zero_shot/ocnli.csv CHANGED
@@ -1,11 +1,14 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.6542372881355932
3
- Meta-Llama-3.1-8B-Instruct,0.40983050847457625
 
4
  Qwen2_5_32B_Instruct,0.7742372881355932
5
  Qwen2_5_7B_Instruct,0.6732203389830509
6
  Qwen2_5_1_5B_Instruct,0.5135593220338983
7
  Qwen2-72B-Instruct,0.7820338983050847
 
8
  Meta-Llama-3-8B-Instruct,0.44033898305084745
 
9
  Meta-Llama-3.1-70B-Instruct,0.6423728813559322
10
  Qwen2_5_3B_Instruct,0.6145762711864406
11
  SeaLLMs-v3-7B-Chat,0.5698305084745763
@@ -16,6 +19,5 @@ Qwen2_5_14B_Instruct,0.7538983050847458
16
  gemma2-9b-cpt-sea-lionv3-instruct,0.6488135593220339
17
  gemma-2-2b-it,0.43322033898305085
18
  llama3-8b-cpt-sea-lionv2-instruct,0.45559322033898303
19
- cross_openhermes_llama3_8b_12288_inst,0.5925423728813559
20
  Qwen2_5_0_5B_Instruct,0.3847457627118644
21
  GPT4o_0513,0.7308474576271187
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.6542372881355932
3
+ Meta-Llama-3.1-8B-Instruct,0.4359322033898305
4
+ llama3-8b-cpt-sea-lionv2.1-instruct,0.4559322033898305
5
  Qwen2_5_32B_Instruct,0.7742372881355932
6
  Qwen2_5_7B_Instruct,0.6732203389830509
7
  Qwen2_5_1_5B_Instruct,0.5135593220338983
8
  Qwen2-72B-Instruct,0.7820338983050847
9
+ Sailor2-8B-Chat,0.5569491525423729
10
  Meta-Llama-3-8B-Instruct,0.44033898305084745
11
+ merged_llama3_8b_sg_inst_avg_diff,0.4633898305084746
12
  Meta-Llama-3.1-70B-Instruct,0.6423728813559322
13
  Qwen2_5_3B_Instruct,0.6145762711864406
14
  SeaLLMs-v3-7B-Chat,0.5698305084745763
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.6488135593220339
20
  gemma-2-2b-it,0.43322033898305085
21
  llama3-8b-cpt-sea-lionv2-instruct,0.45559322033898303
 
22
  Qwen2_5_0_5B_Instruct,0.3847457627118644
23
  GPT4o_0513,0.7308474576271187
results/fundamental_nlp_tasks/zero_shot/qnli.csv CHANGED
@@ -1,11 +1,14 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.8154859967051071
3
- Meta-Llama-3.1-8B-Instruct,0.5777045579352005
 
4
  Qwen2_5_32B_Instruct,0.9062786015010068
5
  Qwen2_5_7B_Instruct,0.8652754896576972
6
  Qwen2_5_1_5B_Instruct,0.6148636280431997
7
  Qwen2-72B-Instruct,0.8887058392824455
 
8
  Meta-Llama-3-8B-Instruct,0.6025993044114956
 
9
  Meta-Llama-3.1-70B-Instruct,0.9026176093721399
10
  Qwen2_5_3B_Instruct,0.7645982061138569
11
  SeaLLMs-v3-7B-Chat,0.7159070107999268
@@ -16,6 +19,5 @@ Qwen2_5_14B_Instruct,0.9079260479589969
16
  gemma2-9b-cpt-sea-lionv3-instruct,0.9055464030752334
17
  gemma-2-2b-it,0.7792421746293245
18
  llama3-8b-cpt-sea-lionv2-instruct,0.6101043382756727
19
- cross_openhermes_llama3_8b_12288_inst,0.8282994691561413
20
  Qwen2_5_0_5B_Instruct,0.5464030752333883
21
  GPT4o_0513,0.9304411495515285
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.8154859967051071
3
+ Meta-Llama-3.1-8B-Instruct,0.6128500823723229
4
+ llama3-8b-cpt-sea-lionv2.1-instruct,0.610287387882116
5
  Qwen2_5_32B_Instruct,0.9062786015010068
6
  Qwen2_5_7B_Instruct,0.8652754896576972
7
  Qwen2_5_1_5B_Instruct,0.6148636280431997
8
  Qwen2-72B-Instruct,0.8887058392824455
9
+ Sailor2-8B-Chat,0.6822258832143511
10
  Meta-Llama-3-8B-Instruct,0.6025993044114956
11
+ merged_llama3_8b_sg_inst_avg_diff,0.6522057477576423
12
  Meta-Llama-3.1-70B-Instruct,0.9026176093721399
13
  Qwen2_5_3B_Instruct,0.7645982061138569
14
  SeaLLMs-v3-7B-Chat,0.7159070107999268
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.9055464030752334
20
  gemma-2-2b-it,0.7792421746293245
21
  llama3-8b-cpt-sea-lionv2-instruct,0.6101043382756727
 
22
  Qwen2_5_0_5B_Instruct,0.5464030752333883
23
  GPT4o_0513,0.9304411495515285
results/fundamental_nlp_tasks/zero_shot/qqp.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.781
3
- Meta-Llama-3.1-8B-Instruct,0.5645
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.625
5
  Qwen2_5_32B_Instruct,0.8315
6
  Qwen2_5_7B_Instruct,0.76
7
  Qwen2_5_1_5B_Instruct,0.731
8
  Qwen2-72B-Instruct,0.8065
 
9
  Meta-Llama-3-8B-Instruct,0.563
 
10
  Meta-Llama-3.1-70B-Instruct,0.815
11
  Qwen2_5_3B_Instruct,0.7415
12
  SeaLLMs-v3-7B-Chat,0.7625
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.8255
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.803
18
  gemma-2-2b-it,0.761
19
  llama3-8b-cpt-sea-lionv2-instruct,0.6225
20
- cross_openhermes_llama3_8b_12288_inst,0.792
21
  Qwen2_5_0_5B_Instruct,0.619
22
  GPT4o_0513,0.8085
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.781
3
+ Meta-Llama-3.1-8B-Instruct,0.6175
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.625
5
  Qwen2_5_32B_Instruct,0.8315
6
  Qwen2_5_7B_Instruct,0.76
7
  Qwen2_5_1_5B_Instruct,0.731
8
  Qwen2-72B-Instruct,0.8065
9
+ Sailor2-8B-Chat,0.8205
10
  Meta-Llama-3-8B-Instruct,0.563
11
+ merged_llama3_8b_sg_inst_avg_diff,0.597
12
  Meta-Llama-3.1-70B-Instruct,0.815
13
  Qwen2_5_3B_Instruct,0.7415
14
  SeaLLMs-v3-7B-Chat,0.7625
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.803
20
  gemma-2-2b-it,0.761
21
  llama3-8b-cpt-sea-lionv2-instruct,0.6225
 
22
  Qwen2_5_0_5B_Instruct,0.619
23
  GPT4o_0513,0.8085
results/fundamental_nlp_tasks/zero_shot/rte.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.8231046931407943
3
- Meta-Llama-3.1-8B-Instruct,0.6750902527075813
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.6859205776173285
5
  Qwen2_5_32B_Instruct,0.9097472924187726
6
  Qwen2_5_7B_Instruct,0.8592057761732852
7
  Qwen2_5_1_5B_Instruct,0.703971119133574
8
  Qwen2-72B-Instruct,0.8447653429602888
 
9
  Meta-Llama-3-8B-Instruct,0.6173285198555957
 
10
  Meta-Llama-3.1-70B-Instruct,0.8483754512635379
11
  Qwen2_5_3B_Instruct,0.779783393501805
12
  SeaLLMs-v3-7B-Chat,0.7870036101083032
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.8664259927797834
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.8483754512635379
18
  gemma-2-2b-it,0.7292418772563177
19
  llama3-8b-cpt-sea-lionv2-instruct,0.6859205776173285
20
- cross_openhermes_llama3_8b_12288_inst,0.8050541516245487
21
  Qwen2_5_0_5B_Instruct,0.5992779783393501
22
  GPT4o_0513,0.8700361010830325
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.8231046931407943
3
+ Meta-Llama-3.1-8B-Instruct,0.6389891696750902
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.6859205776173285
5
  Qwen2_5_32B_Instruct,0.9097472924187726
6
  Qwen2_5_7B_Instruct,0.8592057761732852
7
  Qwen2_5_1_5B_Instruct,0.703971119133574
8
  Qwen2-72B-Instruct,0.8447653429602888
9
+ Sailor2-8B-Chat,0.8122743682310469
10
  Meta-Llama-3-8B-Instruct,0.6173285198555957
11
+ merged_llama3_8b_sg_inst_avg_diff,0.6606498194945848
12
  Meta-Llama-3.1-70B-Instruct,0.8483754512635379
13
  Qwen2_5_3B_Instruct,0.779783393501805
14
  SeaLLMs-v3-7B-Chat,0.7870036101083032
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.8483754512635379
20
  gemma-2-2b-it,0.7292418772563177
21
  llama3-8b-cpt-sea-lionv2-instruct,0.6859205776173285
 
22
  Qwen2_5_0_5B_Instruct,0.5992779783393501
23
  GPT4o_0513,0.8700361010830325
results/fundamental_nlp_tasks/zero_shot/wnli.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.7183098591549296
3
- Meta-Llama-3.1-8B-Instruct,0.49295774647887325
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.5915492957746479
5
  Qwen2_5_32B_Instruct,0.8732394366197183
6
  Qwen2_5_7B_Instruct,0.7605633802816901
7
  Qwen2_5_1_5B_Instruct,0.4647887323943662
8
  Qwen2-72B-Instruct,0.8873239436619719
 
9
  Meta-Llama-3-8B-Instruct,0.4788732394366197
 
10
  Meta-Llama-3.1-70B-Instruct,0.8450704225352113
11
  Qwen2_5_3B_Instruct,0.647887323943662
12
  SeaLLMs-v3-7B-Chat,0.5915492957746479
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.8309859154929577
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.7605633802816901
18
  gemma-2-2b-it,0.43661971830985913
19
  llama3-8b-cpt-sea-lionv2-instruct,0.5774647887323944
20
- cross_openhermes_llama3_8b_12288_inst,0.5211267605633803
21
  Qwen2_5_0_5B_Instruct,0.43661971830985913
22
  GPT4o_0513,0.9295774647887324
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.7183098591549296
3
+ Meta-Llama-3.1-8B-Instruct,0.6197183098591549
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.5915492957746479
5
  Qwen2_5_32B_Instruct,0.8732394366197183
6
  Qwen2_5_7B_Instruct,0.7605633802816901
7
  Qwen2_5_1_5B_Instruct,0.4647887323943662
8
  Qwen2-72B-Instruct,0.8873239436619719
9
+ Sailor2-8B-Chat,0.5492957746478874
10
  Meta-Llama-3-8B-Instruct,0.4788732394366197
11
+ merged_llama3_8b_sg_inst_avg_diff,0.5492957746478874
12
  Meta-Llama-3.1-70B-Instruct,0.8450704225352113
13
  Qwen2_5_3B_Instruct,0.647887323943662
14
  SeaLLMs-v3-7B-Chat,0.5915492957746479
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.7605633802816901
20
  gemma-2-2b-it,0.43661971830985913
21
  llama3-8b-cpt-sea-lionv2-instruct,0.5774647887323944
 
22
  Qwen2_5_0_5B_Instruct,0.43661971830985913
23
  GPT4o_0513,0.9295774647887324
results/general_reasoning/zero_shot/c_eval.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.7615193026151931
3
- Meta-Llama-3.1-8B-Instruct,0.5149439601494396
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.50186799501868
5
  Qwen2_5_32B_Instruct,0.8262764632627646
6
  Qwen2_5_7B_Instruct,0.7459526774595268
7
  Qwen2_5_1_5B_Instruct,0.5971357409713575
8
  Qwen2-72B-Instruct,0.8312577833125778
 
9
  Meta-Llama-3-8B-Instruct,0.4775840597758406
 
10
  Meta-Llama-3.1-70B-Instruct,0.6612702366127023
11
  Qwen2_5_3B_Instruct,0.6537982565379825
12
  SeaLLMs-v3-7B-Chat,0.7658779576587795
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.7839352428393525
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.5722291407222914
18
  gemma-2-2b-it,0.4352428393524284
19
  llama3-8b-cpt-sea-lionv2-instruct,0.49813200498132004
20
- cross_openhermes_llama3_8b_12288_inst,0.4863013698630137
21
  Qwen2_5_0_5B_Instruct,0.41718555417185554
22
  GPT4o_0513,0.7073474470734745
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.7615193026151931
3
+ Meta-Llama-3.1-8B-Instruct,0.5180572851805728
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.50186799501868
5
  Qwen2_5_32B_Instruct,0.8262764632627646
6
  Qwen2_5_7B_Instruct,0.7459526774595268
7
  Qwen2_5_1_5B_Instruct,0.5971357409713575
8
  Qwen2-72B-Instruct,0.8312577833125778
9
+ Sailor2-8B-Chat,0.5946450809464509
10
  Meta-Llama-3-8B-Instruct,0.4775840597758406
11
+ merged_llama3_8b_sg_inst_avg_diff,0.5205479452054794
12
  Meta-Llama-3.1-70B-Instruct,0.6612702366127023
13
  Qwen2_5_3B_Instruct,0.6537982565379825
14
  SeaLLMs-v3-7B-Chat,0.7658779576587795
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.5722291407222914
20
  gemma-2-2b-it,0.4352428393524284
21
  llama3-8b-cpt-sea-lionv2-instruct,0.49813200498132004
 
22
  Qwen2_5_0_5B_Instruct,0.41718555417185554
23
  GPT4o_0513,0.7073474470734745
results/general_reasoning/zero_shot/cmmlu.csv CHANGED
@@ -1,11 +1,14 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.7727508202383008
3
- Meta-Llama-3.1-8B-Instruct,0.5246934898981178
 
4
  Qwen2_5_32B_Instruct,0.8273182524607149
5
  Qwen2_5_7B_Instruct,0.7486617164565705
6
  Qwen2_5_1_5B_Instruct,0.5975651873596961
7
  Qwen2-72B-Instruct,0.8293904334311863
 
8
  Meta-Llama-3-8B-Instruct,0.4839405974788465
 
9
  Meta-Llama-3.1-70B-Instruct,0.6814885166637886
10
  Qwen2_5_3B_Instruct,0.6621481609393887
11
  SeaLLMs-v3-7B-Chat,0.7684337765498187
@@ -16,6 +19,5 @@ Qwen2_5_14B_Instruct,0.7807805214988776
16
  gemma2-9b-cpt-sea-lionv3-instruct,0.5796062856156105
17
  gemma-2-2b-it,0.4412882058366431
18
  llama3-8b-cpt-sea-lionv2-instruct,0.48929373165256435
19
- cross_openhermes_llama3_8b_12288_inst,0.48877568640994645
20
  Qwen2_5_0_5B_Instruct,0.42056639613192887
21
  GPT4o_0513,0.7414954239336902
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.7727508202383008
3
+ Meta-Llama-3.1-8B-Instruct,0.5183042652391642
4
+ llama3-8b-cpt-sea-lionv2.1-instruct,0.49222932136073216
5
  Qwen2_5_32B_Instruct,0.8273182524607149
6
  Qwen2_5_7B_Instruct,0.7486617164565705
7
  Qwen2_5_1_5B_Instruct,0.5975651873596961
8
  Qwen2-72B-Instruct,0.8293904334311863
9
+ Sailor2-8B-Chat,0.6416853738559835
10
  Meta-Llama-3-8B-Instruct,0.4839405974788465
11
+ merged_llama3_8b_sg_inst_avg_diff,0.5171818338801588
12
  Meta-Llama-3.1-70B-Instruct,0.6814885166637886
13
  Qwen2_5_3B_Instruct,0.6621481609393887
14
  SeaLLMs-v3-7B-Chat,0.7684337765498187
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.5796062856156105
20
  gemma-2-2b-it,0.4412882058366431
21
  llama3-8b-cpt-sea-lionv2-instruct,0.48929373165256435
 
22
  Qwen2_5_0_5B_Instruct,0.42056639613192887
23
  GPT4o_0513,0.7414954239336902
results/general_reasoning/zero_shot/indommlu.csv CHANGED
@@ -1,12 +1,13 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.5385539755657921
3
- Meta-Llama-3.1-8B-Instruct,0.5252687095266707
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.5269377127979171
5
  Qwen2_5_32B_Instruct,0.6314840777087923
6
  Qwen2_5_7B_Instruct,0.5600507376994459
7
  Qwen2_5_1_5B_Instruct,0.4295346818879765
8
  Qwen2-72B-Instruct,0.6385606515788771
9
  Meta-Llama-3-8B-Instruct,0.5264703918819681
 
10
  Meta-Llama-3.1-70B-Instruct,0.6740770411910008
11
  Qwen2_5_3B_Instruct,0.49656185326123237
12
  SeaLLMs-v3-7B-Chat,0.5267374324053675
@@ -17,6 +18,5 @@ Qwen2_5_14B_Instruct,0.6009746979104079
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.6196007744175178
18
  gemma-2-2b-it,0.48220842512851325
19
  llama3-8b-cpt-sea-lionv2-instruct,0.5252687095266707
20
- cross_openhermes_llama3_8b_12288_inst,0.5533747246144602
21
  Qwen2_5_0_5B_Instruct,0.3279925228653448
22
  GPT4o_0513,0.7584618465852193
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.5385539755657921
3
+ Meta-Llama-3.1-8B-Instruct,0.5605180586153948
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.5269377127979171
5
  Qwen2_5_32B_Instruct,0.6314840777087923
6
  Qwen2_5_7B_Instruct,0.5600507376994459
7
  Qwen2_5_1_5B_Instruct,0.4295346818879765
8
  Qwen2-72B-Instruct,0.6385606515788771
9
  Meta-Llama-3-8B-Instruct,0.5264703918819681
10
+ merged_llama3_8b_sg_inst_avg_diff,0.5724013619066693
11
  Meta-Llama-3.1-70B-Instruct,0.6740770411910008
12
  Qwen2_5_3B_Instruct,0.49656185326123237
13
  SeaLLMs-v3-7B-Chat,0.5267374324053675
 
18
  gemma2-9b-cpt-sea-lionv3-instruct,0.6196007744175178
19
  gemma-2-2b-it,0.48220842512851325
20
  llama3-8b-cpt-sea-lionv2-instruct,0.5252687095266707
 
21
  Qwen2_5_0_5B_Instruct,0.3279925228653448
22
  GPT4o_0513,0.7584618465852193
results/general_reasoning/zero_shot/indommlu_no_prompt.csv CHANGED
@@ -1,6 +1,11 @@
1
  Model,Accuracy
2
- llama3-8b-cpt-sea-lionv2.1-instruct,0.5561786501101542
 
 
 
3
  Meta-Llama-3-8B-Instruct,0.5207957807597303
 
 
 
4
  gemma2-9b-cpt-sea-lionv3-instruct,0.6258762267174044
5
- cross_openhermes_llama3_8b_12288_inst,0.5279391147606649
6
  GPT4o_0513,0.7599305694639161
 
1
  Model,Accuracy
2
+ Meta-Llama-3.1-8B-Instruct,0.5483009546698712
3
+ llama3-8b-cpt-sea-lionv2.1-instruct,0.559516656652647
4
+ Qwen2_5_7B_Instruct,0.581814540356499
5
+ Sailor2-8B-Chat,0.6342212430736365
6
  Meta-Llama-3-8B-Instruct,0.5207957807597303
7
+ merged_llama3_8b_sg_inst_avg_diff,0.575806128580012
8
+ SeaLLMs-v3-7B-Chat,0.5406235396221376
9
+ gemma-2-9b-it,0.6210027371653648
10
  gemma2-9b-cpt-sea-lionv3-instruct,0.6258762267174044
 
11
  GPT4o_0513,0.7599305694639161
results/general_reasoning/zero_shot/mmlu.csv CHANGED
@@ -1,11 +1,14 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.672506256703611
3
- Meta-Llama-3.1-8B-Instruct,0.6037182695745441
 
4
  Qwen2_5_32B_Instruct,0.7996424740793707
5
  Qwen2_5_7B_Instruct,0.6935287808366106
6
  Qwen2_5_1_5B_Instruct,0.5646764390418305
7
  Qwen2-72B-Instruct,0.7922774401144083
 
8
  Meta-Llama-3-8B-Instruct,0.6005720414730068
 
9
  Meta-Llama-3.1-70B-Instruct,0.8058634250983197
10
  Qwen2_5_3B_Instruct,0.6118698605648909
11
  SeaLLMs-v3-7B-Chat,0.6670003575259207
@@ -16,6 +19,5 @@ Qwen2_5_14B_Instruct,0.7542366821594566
16
  gemma2-9b-cpt-sea-lionv3-instruct,0.7079013228459063
17
  gemma-2-2b-it,0.5706828745084018
18
  llama3-8b-cpt-sea-lionv2-instruct,0.6130854486950303
19
- cross_openhermes_llama3_8b_12288_inst,0.5870575616732213
20
  Qwen2_5_0_5B_Instruct,0.461136932427601
21
  GPT4o_0513,0.8308187343582409
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.672506256703611
3
+ Meta-Llama-3.1-8B-Instruct,0.6387558097962103
4
+ llama3-8b-cpt-sea-lionv2.1-instruct,0.6117268501966393
5
  Qwen2_5_32B_Instruct,0.7996424740793707
6
  Qwen2_5_7B_Instruct,0.6935287808366106
7
  Qwen2_5_1_5B_Instruct,0.5646764390418305
8
  Qwen2-72B-Instruct,0.7922774401144083
9
+ Sailor2-8B-Chat,0.6202359671076153
10
  Meta-Llama-3-8B-Instruct,0.6005720414730068
11
+ merged_llama3_8b_sg_inst_avg_diff,0.631748301751877
12
  Meta-Llama-3.1-70B-Instruct,0.8058634250983197
13
  Qwen2_5_3B_Instruct,0.6118698605648909
14
  SeaLLMs-v3-7B-Chat,0.6670003575259207
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.7079013228459063
20
  gemma-2-2b-it,0.5706828745084018
21
  llama3-8b-cpt-sea-lionv2-instruct,0.6130854486950303
 
22
  Qwen2_5_0_5B_Instruct,0.461136932427601
23
  GPT4o_0513,0.8308187343582409
results/general_reasoning/zero_shot/mmlu_no_prompt.csv CHANGED
@@ -1,5 +1,10 @@
1
  Model,Accuracy
2
- llama3-8b-cpt-sea-lionv2.1-instruct,0.6551304969610297
 
 
3
  Meta-Llama-3-8B-Instruct,0.6618519842688595
4
- cross_openhermes_llama3_8b_12288_inst,0.6010010725777619
 
 
 
5
  GPT4o_0513,0.871576689309975
 
1
  Model,Accuracy
2
+ Meta-Llama-3.1-8B-Instruct,0.7056131569538792
3
+ llama3-8b-cpt-sea-lionv2.1-instruct,0.6454057919199142
4
+ Qwen2_5_7B_Instruct,0.73936360386128
5
  Meta-Llama-3-8B-Instruct,0.6618519842688595
6
+ merged_llama3_8b_sg_inst_avg_diff,0.6988916696460493
7
+ SeaLLMs-v3-7B-Chat,0.6913836253128351
8
+ gemma-2-9b-it,0.740293171254916
9
+ gemma2-9b-cpt-sea-lionv3-instruct,0.7372899535216303
10
  GPT4o_0513,0.871576689309975
results/general_reasoning/zero_shot/zbench.csv CHANGED
@@ -1,12 +1,14 @@
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.7272727272727273
3
- Meta-Llama-3.1-8B-Instruct,0.42424242424242425
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.2727272727272727
5
  Qwen2_5_32B_Instruct,0.6060606060606061
6
  Qwen2_5_7B_Instruct,0.6666666666666666
7
  Qwen2_5_1_5B_Instruct,0.42424242424242425
8
  Qwen2-72B-Instruct,0.5757575757575758
 
9
  Meta-Llama-3-8B-Instruct,0.3333333333333333
 
10
  Meta-Llama-3.1-70B-Instruct,0.48484848484848486
11
  Qwen2_5_3B_Instruct,0.5757575757575758
12
  SeaLLMs-v3-7B-Chat,0.5454545454545454
@@ -17,6 +19,5 @@ Qwen2_5_14B_Instruct,0.6666666666666666
17
  gemma2-9b-cpt-sea-lionv3-instruct,0.42424242424242425
18
  gemma-2-2b-it,0.24242424242424243
19
  llama3-8b-cpt-sea-lionv2-instruct,0.30303030303030304
20
- cross_openhermes_llama3_8b_12288_inst,0.42424242424242425
21
  Qwen2_5_0_5B_Instruct,0.36363636363636365
22
  GPT4o_0513,0.696969696969697
 
1
  Model,Accuracy
2
  Qwen2-7B-Instruct,0.7272727272727273
3
+ Meta-Llama-3.1-8B-Instruct,0.3939393939393939
4
  llama3-8b-cpt-sea-lionv2.1-instruct,0.2727272727272727
5
  Qwen2_5_32B_Instruct,0.6060606060606061
6
  Qwen2_5_7B_Instruct,0.6666666666666666
7
  Qwen2_5_1_5B_Instruct,0.42424242424242425
8
  Qwen2-72B-Instruct,0.5757575757575758
9
+ Sailor2-8B-Chat,0.5151515151515151
10
  Meta-Llama-3-8B-Instruct,0.3333333333333333
11
+ merged_llama3_8b_sg_inst_avg_diff,0.42424242424242425
12
  Meta-Llama-3.1-70B-Instruct,0.48484848484848486
13
  Qwen2_5_3B_Instruct,0.5757575757575758
14
  SeaLLMs-v3-7B-Chat,0.5454545454545454
 
19
  gemma2-9b-cpt-sea-lionv3-instruct,0.42424242424242425
20
  gemma-2-2b-it,0.24242424242424243
21
  llama3-8b-cpt-sea-lionv2-instruct,0.30303030303030304
 
22
  Qwen2_5_0_5B_Instruct,0.36363636363636365
23
  GPT4o_0513,0.696969696969697