Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Sean Cho
committed on
Commit
·
3967c9e
1
Parent(s):
af4234d
add 6 new tasks
Browse files
- src/display/about.py +9 -3
- src/display/utils.py +7 -0
- src/leaderboard/read_evals.py +2 -15
- src/tools/plots.py +1 -6
src/display/about.py
CHANGED
@@ -33,7 +33,7 @@ Please provide information about the model through an issue! 🤩
|
|
33 |
|
34 |
📈 We evaluate models using the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), a unified framework to test generative language models on a large number of different evaluation tasks.
|
35 |
|
36 |
-
We have set up a benchmark using datasets translated into Korean, and applied variations by human experts, from the six tasks (HellaSwag, MMLU, Arc, Truthful QA, Winogrande, GSM8k) operated by
|
37 |
- Ko-HellaSwag (provided by __[Upstage](https://www.upstage.ai/)__, machine translation)
|
38 |
- Ko-MMLU (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
|
39 |
- Ko-Arc (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
|
@@ -41,8 +41,14 @@ We have set up a benchmark using datasets translated into Korean, and applied va
|
|
41 |
- Ko-Winogrande (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
|
42 |
- Ko-GSM8k (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
|
43 |
- Ko-CommonGen V2 (provided by __[Korea University NLP&AI Lab](http://nlp.korea.ac.kr/)__, created from scratch)
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
GPUs are provided by __[KT](https://cloud.kt.com/)__ for the evaluations.
|
48 |
|
|
|
33 |
|
34 |
📈 We evaluate models using the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), a unified framework to test generative language models on a large number of different evaluation tasks.
|
35 |
|
36 |
+
We have set up a benchmark using datasets translated into Korean, and applied variations by human experts, from the six tasks (HellaSwag, MMLU, Arc, Truthful QA, Winogrande, GSM8k) operated by __HuggingFace [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)__. We have also added a new dataset prepared from scratch.
|
37 |
- Ko-HellaSwag (provided by __[Upstage](https://www.upstage.ai/)__, machine translation)
|
38 |
- Ko-MMLU (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
|
39 |
- Ko-Arc (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
|
|
|
41 |
- Ko-Winogrande (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
|
42 |
- Ko-GSM8k (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
|
43 |
- Ko-CommonGen V2 (provided by __[Korea University NLP&AI Lab](http://nlp.korea.ac.kr/)__, created from scratch)
|
44 |
+
- Ko-EQ Bench (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
|
45 |
+
- Ko-InstFollow (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
|
46 |
+
- KorNAT-CKA (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
|
47 |
+
- KorNAT-SVA (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
|
48 |
+
- Ko-Harmlessness (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
|
49 |
+
- Ko-Helpfulness (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
|
50 |
+
|
51 |
+
To provide an evaluation befitting the LLM era, we've selected benchmark datasets suitable for assessing these elements: expertise, inference, hallucination, truthfulness and common sense. The final score is the average of the scores from each evaluation dataset.
|
52 |
|
53 |
GPUs are provided by __[KT](https://cloud.kt.com/)__ for the evaluations.
|
54 |
|
src/display/utils.py
CHANGED
@@ -21,6 +21,13 @@ class Tasks(Enum):
|
|
21 |
winogrande = Task("ko_winogrande", "acc_norm", "Ko-Winogrande")
|
22 |
gsm8k = Task("ko_gsm8k", "acc_norm", "Ko-GSM8k")
|
23 |
commongen_v2 = Task("ko_commongen_v2", "acc_norm", "Ko-CommonGen V2")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
# These classes are for user facing column names,
|
26 |
# to avoid having to change them all around the code
|
|
|
21 |
winogrande = Task("ko_winogrande", "acc_norm", "Ko-Winogrande")
|
22 |
gsm8k = Task("ko_gsm8k", "acc_norm", "Ko-GSM8k")
|
23 |
commongen_v2 = Task("ko_commongen_v2", "acc_norm", "Ko-CommonGen V2")
|
24 |
+
eqBench = Task("ko_eq_bench", "acc_norm", "Ko-EQ Bench")
|
25 |
+
instFollow = Task("ko_inst_follow", "acc_norm", "Ko-InstFollow")
|
26 |
+
korNatCka = Task("kor_nat_cka", "acc_norm", "KorNAT-CKA")
|
27 |
+
korNatSva = Task("kor_nat_sva", "acc_norm", "KorNAT-SVA")
|
28 |
+
harmlessness = Task("ko_harmlessness", "acc_norm", "Ko-Harmlessness")
|
29 |
+
helpfulness = Task("ko_helpfulness", "acc_norm", "Ko-Helpfulness")
|
30 |
+
|
31 |
|
32 |
# These classes are for user facing column names,
|
33 |
# to avoid having to change them all around the code
|
src/leaderboard/read_evals.py
CHANGED
@@ -103,11 +103,6 @@ class EvalResult:
|
|
103 |
results[task.benchmark] = 0.0
|
104 |
continue
|
105 |
|
106 |
-
# Two new tasks have been added, we need to skip them for now
|
107 |
-
if task.benchmark == "ko_winogrande" or task.benchmark == "ko_gsm8k":
|
108 |
-
results[task.benchmark] = 0.0
|
109 |
-
continue
|
110 |
-
|
111 |
# We average all scores of a given metric (mostly for mmlu)
|
112 |
accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
|
113 |
if accs.size == 0 or any([acc is None for acc in accs]):
|
@@ -148,16 +143,8 @@ class EvalResult:
|
|
148 |
|
149 |
def to_dict(self):
|
150 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
151 |
-
|
152 |
-
|
153 |
-
# TODO: safely remove this code when the task results are added
|
154 |
-
skip_avg_len = 0
|
155 |
-
if self.results['ko_winogrande'] == 0.0:
|
156 |
-
skip_avg_len += 1
|
157 |
-
if self.results['ko_gsm8k'] == 0.0:
|
158 |
-
skip_avg_len += 1
|
159 |
-
|
160 |
-
average = sum([v for v in self.results.values() if v is not None]) / (len(Tasks) - skip_avg_len)
|
161 |
data_dict = {
|
162 |
"eval_name": self.eval_name, # not a column, just a save name,
|
163 |
AutoEvalColumn.precision.name: self.precision.value.name,
|
|
|
103 |
results[task.benchmark] = 0.0
|
104 |
continue
|
105 |
|
|
|
|
|
|
|
|
|
|
|
106 |
# We average all scores of a given metric (mostly for mmlu)
|
107 |
accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
|
108 |
if accs.size == 0 or any([acc is None for acc in accs]):
|
|
|
143 |
|
144 |
def to_dict(self):
|
145 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
146 |
+
|
147 |
+
average = sum([v for v in self.results.values() if v is not None]) / sum([1 for v in self.results.values() if v is not None])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
data_dict = {
|
149 |
"eval_name": self.eval_name, # not a column, just a save name,
|
150 |
AutoEvalColumn.precision.name: self.precision.value.name,
|
src/tools/plots.py
CHANGED
@@ -36,12 +36,7 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
|
|
36 |
|
37 |
current_date = row["date"]
|
38 |
if task.benchmark == "Average":
|
39 |
-
|
40 |
-
if row["results"]["ko_winogrande"] == 0.0:
|
41 |
-
avg_skip_len += 1
|
42 |
-
if row["results"]["ko_gsm8k"] == 0.0:
|
43 |
-
avg_skip_len += 1
|
44 |
-
current_score = np.sum(list(row["results"].values())) / (len(row["results"]) - avg_skip_len)
|
45 |
else:
|
46 |
current_score = row["results"][task.benchmark]
|
47 |
|
|
|
36 |
|
37 |
current_date = row["date"]
|
38 |
if task.benchmark == "Average":
|
39 |
+
current_score = np.mean(list(row["results"].values()))
|
|
|
|
|
|
|
|
|
|
|
40 |
else:
|
41 |
current_score = row["results"][task.benchmark]
|
42 |
|