Update README.md
Browse files
README.md
CHANGED
@@ -70,4 +70,43 @@ Here there are some, but we also submitted it to the HF eval queue....
|
|
70 |
|pubmedqa |Yaml |none | 0|acc |0.7920|± |0.0182|
|
71 |
|sciq |Yaml |none | 0|acc |0.9630|± |0.0060|
|
72 |
| | |none | 0|acc_norm |0.9370|± |0.0077|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
```
|
|
|
70 |
|pubmedqa |Yaml |none | 0|acc |0.7920|± |0.0182|
|
71 |
|sciq |Yaml |none | 0|acc |0.9630|± |0.0060|
|
72 |
| | |none | 0|acc_norm |0.9370|± |0.0077|
|
73 |
+
```
|
74 |
+
|
75 |
+
## BBH
|
76 |
+
```
|
77 |
+
vllm (pretrained=fblgit/UNAversal-8x7B-v1beta,tensor_parallel_size=2,data_parallel_size=4,gpu_memory_utilization=0.8,dtype=float16), gen_kwargs: (None), limit: None, num_fewshot: 0, batch_size: auto
|
78 |
+
| Tasks |Version| Filter |n-shot| Metric |Value | |Stderr|
|
79 |
+
|----------------------------------------------------------|-------|----------|-----:|-----------|-----:|---|-----:|
|
80 |
+
|bbh |N/A |get-answer| 0|exact_match|0.6752|± |0.1772|
|
81 |
+
| - bbh_cot_fewshot_boolean_expressions |Yaml |get-answer| 0|exact_match|0.8840|± |0.0203|
|
82 |
+
| - bbh_cot_fewshot_causal_judgement |Yaml |get-answer| 0|exact_match|0.6417|± |0.0352|
|
83 |
+
| - bbh_cot_fewshot_date_understanding |Yaml |get-answer| 0|exact_match|0.7600|± |0.0271|
|
84 |
+
| - bbh_cot_fewshot_disambiguation_qa |Yaml |get-answer| 0|exact_match|0.7160|± |0.0286|
|
85 |
+
| - bbh_cot_fewshot_dyck_languages |Yaml |get-answer| 0|exact_match|0.1800|± |0.0243|
|
86 |
+
| - bbh_cot_fewshot_formal_fallacies |Yaml |get-answer| 0|exact_match|0.6520|± |0.0302|
|
87 |
+
| - bbh_cot_fewshot_geometric_shapes |Yaml |get-answer| 0|exact_match|0.3880|± |0.0309|
|
88 |
+
| - bbh_cot_fewshot_hyperbaton |Yaml |get-answer| 0|exact_match|0.9600|± |0.0124|
|
89 |
+
| - bbh_cot_fewshot_logical_deduction_five_objects |Yaml |get-answer| 0|exact_match|0.5360|± |0.0316|
|
90 |
+
| - bbh_cot_fewshot_logical_deduction_seven_objects |Yaml |get-answer| 0|exact_match|0.5040|± |0.0317|
|
91 |
+
| - bbh_cot_fewshot_logical_deduction_three_objects |Yaml |get-answer| 0|exact_match|0.8600|± |0.0220|
|
92 |
+
| - bbh_cot_fewshot_movie_recommendation |Yaml |get-answer| 0|exact_match|0.7840|± |0.0261|
|
93 |
+
| - bbh_cot_fewshot_multistep_arithmetic_two |Yaml |get-answer| 0|exact_match|0.6600|± |0.0300|
|
94 |
+
| - bbh_cot_fewshot_navigate |Yaml |get-answer| 0|exact_match|0.8160|± |0.0246|
|
95 |
+
| - bbh_cot_fewshot_object_counting |Yaml |get-answer| 0|exact_match|0.8360|± |0.0235|
|
96 |
+
| - bbh_cot_fewshot_penguins_in_a_table |Yaml |get-answer| 0|exact_match|0.7329|± |0.0367|
|
97 |
+
| - bbh_cot_fewshot_reasoning_about_colored_objects |Yaml |get-answer| 0|exact_match|0.8120|± |0.0248|
|
98 |
+
| - bbh_cot_fewshot_ruin_names |Yaml |get-answer| 0|exact_match|0.4440|± |0.0315|
|
99 |
+
| - bbh_cot_fewshot_salient_translation_error_detection |Yaml |get-answer| 0|exact_match|0.5200|± |0.0317|
|
100 |
+
| - bbh_cot_fewshot_snarks |Yaml |get-answer| 0|exact_match|0.7135|± |0.0340|
|
101 |
+
| - bbh_cot_fewshot_sports_understanding |Yaml |get-answer| 0|exact_match|0.9400|± |0.0151|
|
102 |
+
| - bbh_cot_fewshot_temporal_sequences |Yaml |get-answer| 0|exact_match|0.7560|± |0.0272|
|
103 |
+
| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects |Yaml |get-answer| 0|exact_match|0.5680|± |0.0314|
|
104 |
+
| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects|Yaml |get-answer| 0|exact_match|0.6280|± |0.0306|
|
105 |
+
| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects|Yaml |get-answer| 0|exact_match|0.6280|± |0.0306|
|
106 |
+
| - bbh_cot_fewshot_web_of_lies |Yaml |get-answer| 0|exact_match|0.9560|± |0.0130|
|
107 |
+
| - bbh_cot_fewshot_word_sorting |Yaml |get-answer| 0|exact_match|0.3800|± |0.0308|
|
108 |
+
|
109 |
+
|Groups|Version| Filter |n-shot| Metric |Value | |Stderr|
|
110 |
+
|------|-------|----------|-----:|-----------|-----:|---|-----:|
|
111 |
+
|bbh |N/A |get-answer| 0|exact_match|0.6752|± |0.1772|
|
112 |
```
|