Spaces:
Running
Running
update
Browse files
tasks.py
CHANGED
@@ -6,11 +6,20 @@ from typing import Any, Optional, Protocol, Iterable, Callable
|
|
6 |
import logging
|
7 |
import pandas as pd
|
8 |
from functools import partial
|
|
|
9 |
|
10 |
from .utils import *
|
11 |
|
12 |
from evaluate import load
|
13 |
from collections import defaultdict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
|
16 |
def fake_pipeline(prompts: Iterable[str]) -> list[str]:
|
@@ -100,6 +109,7 @@ class Task:
|
|
100 |
)
|
101 |
return metric
|
102 |
|
|
|
103 |
def run(
|
104 |
self,
|
105 |
pipeline,
|
@@ -129,14 +139,20 @@ def multichoice(responses: Any, references: list[str]):
|
|
129 |
else:
|
130 |
responses = decode_choice(responses)
|
131 |
|
132 |
-
|
133 |
-
|
134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
return responses, references
|
136 |
|
137 |
|
138 |
class Metrics:
|
139 |
-
cmmlu =
|
140 |
mmlu = multichoice
|
141 |
|
142 |
def gsm8k(responses: list[str], answers: list[str | int]):
|
@@ -299,6 +315,7 @@ class CMMLU:
|
|
299 |
.to_dict()
|
300 |
)
|
301 |
suite = defaultdict(list)
|
|
|
302 |
for k, v in cls.categories.items():
|
303 |
for subject in v:
|
304 |
suite[k].extend(
|
@@ -429,6 +446,7 @@ class MMLU:
|
|
429 |
.to_dict()
|
430 |
)
|
431 |
suite = defaultdict(list)
|
|
|
432 |
for k, v in cls.categories.items():
|
433 |
for subject in v:
|
434 |
suite[k].extend(
|
|
|
6 |
import logging
|
7 |
import pandas as pd
|
8 |
from functools import partial
|
9 |
+
from datasets.utils.logging import disable_progress_bar
|
10 |
|
11 |
from .utils import *
|
12 |
|
13 |
from evaluate import load
|
14 |
from collections import defaultdict
|
15 |
+
import sys
|
16 |
+
|
17 |
+
# if sys.version_info >= (3, 9):
|
18 |
+
# from functools import cache
|
19 |
+
# else:
|
20 |
+
# from functools import lru_cache as cache
|
21 |
+
|
22 |
+
disable_progress_bar()
|
23 |
|
24 |
|
25 |
def fake_pipeline(prompts: Iterable[str]) -> list[str]:
|
|
|
109 |
)
|
110 |
return metric
|
111 |
|
112 |
+
# @cache
|
113 |
def run(
|
114 |
self,
|
115 |
pipeline,
|
|
|
139 |
else:
|
140 |
responses = decode_choice(responses)
|
141 |
|
142 |
+
return responses, references
|
143 |
+
|
144 |
+
|
145 |
+
def multichoice_zh(responses: Any, references: list[str]):
    """Normalize Chinese multiple-choice model outputs for scoring.

    Args:
        responses: Either a list of free-text strings (reduced to a single
            choice letter via ``extract_choice_zh``) or non-string model
            outputs handed to ``decode_choice``.
        references: Gold choice labels; passed through unchanged.

    Returns:
        Tuple ``(responses, references)`` with responses normalized to
        choice letters.
    """
    # Only the first element is inspected: responses are assumed homogeneous.
    if isinstance(responses[0], str):
        responses = [extract_choice_zh(response) for response in responses]
    else:
        # NOTE(review): decode_choice is defined elsewhere in the project —
        # presumably maps logits/token ids to letters; confirm at call site.
        responses = decode_choice(responses)

    return responses, references
|
152 |
|
153 |
|
154 |
class Metrics:
|
155 |
+
cmmlu = multichoice_zh
|
156 |
mmlu = multichoice
|
157 |
|
158 |
def gsm8k(responses: list[str], answers: list[str | int]):
|
|
|
315 |
.to_dict()
|
316 |
)
|
317 |
suite = defaultdict(list)
|
318 |
+
cls.categories["all"] = list(finer_categories.keys())
|
319 |
for k, v in cls.categories.items():
|
320 |
for subject in v:
|
321 |
suite[k].extend(
|
|
|
446 |
.to_dict()
|
447 |
)
|
448 |
suite = defaultdict(list)
|
449 |
+
cls.categories["all"] = list(finer_categories.keys())
|
450 |
for k, v in cls.categories.items():
|
451 |
for subject in v:
|
452 |
suite[k].extend(
|
tlem.py
CHANGED
@@ -16,32 +16,7 @@ import pandas as pd
|
|
16 |
from .tasks import *
|
17 |
from .utils import is_equiv
|
18 |
|
19 |
-
# %%
|
20 |
-
|
21 |
-
# %cd ../tlem
|
22 |
-
|
23 |
-
# %load_ext ipytorch
|
24 |
-
# %ls
|
25 |
-
|
26 |
-
|
27 |
-
# TODO: Add BibTeX citation
|
28 |
-
_CITATION = """\
|
29 |
-
"""
|
30 |
-
|
31 |
-
# TODO: Add description of the module here
|
32 |
-
_DESCRIPTION = """\
|
33 |
-
"""
|
34 |
|
35 |
-
|
36 |
-
# TODO: Add description of the arguments of the module here
|
37 |
-
_KWARGS_DESCRIPTION = """
|
38 |
-
"""
|
39 |
-
|
40 |
-
# TODO: Define external resources urls if needed
|
41 |
-
BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
|
42 |
-
|
43 |
-
|
44 |
-
# @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
|
45 |
class ReasoningMetric(evaluate.Metric):
|
46 |
"""TODO: Short description of my evaluation module."""
|
47 |
|
@@ -59,9 +34,9 @@ class ReasoningMetric(evaluate.Metric):
|
|
59 |
return evaluate.EvaluationModuleInfo(
|
60 |
# This is the description that will appear on the modules page.
|
61 |
# module_type="measurement",
|
62 |
-
description=
|
63 |
-
citation=
|
64 |
-
inputs_description=
|
65 |
# This defines the format of each prediction and reference
|
66 |
features=features,
|
67 |
# Homepage of the module for documentation
|
@@ -106,26 +81,30 @@ class Suite(EvaluationSuite):
|
|
106 |
def run(
|
107 |
self,
|
108 |
model_or_pipeline: Any,
|
109 |
-
name="tlem",
|
110 |
) -> dict[str, float]:
|
111 |
self.assert_suite_nonempty()
|
112 |
|
113 |
def run_tasks(tasks):
|
114 |
-
for task in tqdm(tasks):
|
|
|
115 |
if task.name not in self.cached_result:
|
116 |
self.cached_result[task.name] = task.run(model_or_pipeline)
|
117 |
results = [self.cached_result[task.name] for task in tasks]
|
118 |
return pd.DataFrame(results).mean().to_dict()
|
119 |
|
120 |
if isinstance(self.suite, dict):
|
121 |
-
for category, tasks in tqdm(self.suite.items()):
|
122 |
-
|
|
|
123 |
else:
|
124 |
logging.warning(f"Combined results: {run_tasks(self.suite)}")
|
125 |
|
126 |
return self.cached_result
|
127 |
|
128 |
def add(self, name):
|
|
|
|
|
|
|
129 |
chat = False
|
130 |
match name:
|
131 |
case _ if "chat" in name:
|
@@ -146,8 +125,4 @@ class Suite(EvaluationSuite):
|
|
146 |
def __init__(self, name="tlem"):
|
147 |
super().__init__(name)
|
148 |
self.cached_result = {}
|
149 |
-
|
150 |
-
self.suite = [
|
151 |
-
# TASK_REGISTRY["gsm8k"],
|
152 |
-
# TASK_REGISTRY["competition_math"],
|
153 |
-
]
|
|
|
16 |
from .tasks import *
|
17 |
from .utils import is_equiv
|
18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
class ReasoningMetric(evaluate.Metric):
|
21 |
"""TODO: Short description of my evaluation module."""
|
22 |
|
|
|
34 |
return evaluate.EvaluationModuleInfo(
|
35 |
# This is the description that will appear on the modules page.
|
36 |
# module_type="measurement",
|
37 |
+
description="",
|
38 |
+
citation="",
|
39 |
+
inputs_description="",
|
40 |
# This defines the format of each prediction and reference
|
41 |
features=features,
|
42 |
# Homepage of the module for documentation
|
|
|
81 |
def run(
|
82 |
self,
|
83 |
model_or_pipeline: Any,
|
|
|
84 |
) -> dict[str, float]:
|
85 |
self.assert_suite_nonempty()
|
86 |
|
87 |
def run_tasks(tasks):
|
88 |
+
for task in (bar := tqdm(tasks, leave=False)):
|
89 |
+
bar.desc = f"complete {task.name}."
|
90 |
if task.name not in self.cached_result:
|
91 |
self.cached_result[task.name] = task.run(model_or_pipeline)
|
92 |
results = [self.cached_result[task.name] for task in tasks]
|
93 |
return pd.DataFrame(results).mean().to_dict()
|
94 |
|
95 |
if isinstance(self.suite, dict):
|
96 |
+
for category, tasks in (bar := tqdm(self.suite.items())):
|
97 |
+
bar.desc = f"complete {category}."
|
98 |
+
logging.warning(f"Combined results {category}: {run_tasks(tasks)}")
|
99 |
else:
|
100 |
logging.warning(f"Combined results: {run_tasks(self.suite)}")
|
101 |
|
102 |
return self.cached_result
|
103 |
|
104 |
def add(self, name):
|
105 |
+
self.load(name)
|
106 |
+
|
107 |
+
def load(self, name):
|
108 |
chat = False
|
109 |
match name:
|
110 |
case _ if "chat" in name:
|
|
|
125 |
def __init__(self, name="tlem"):
|
126 |
super().__init__(name)
|
127 |
self.cached_result = {}
|
128 |
+
self.suite = []
|
|
|
|
|
|
|
|
utils.py
CHANGED
@@ -9,6 +9,34 @@ NUMERIC_IN_ZH = (
|
|
9 |
)
|
10 |
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
def extract_choice(gen):
|
13 |
# answer is A | choice is A | choose A
|
14 |
res = re.search(
|
|
|
9 |
)
|
10 |
|
11 |
|
12 |
+
def extract_choice_zh(gen):
|
13 |
+
# 答案是A | 选项是A | 应该选A选项
|
14 |
+
res = re.search(
|
15 |
+
r"(?:(?:选|选择|选定)[::]?\s*|(?:(?:答案|选项)(?![^ABCD]{0,10}?(?:不|非)[^ABCD]{0,10}?(?:是|选|为|:|:|】))[^ABCD]{0,10}?(?:是|选|为|:|:|】))[^ABCD]{0,10}?)(A|B|C|D)(?:选项)?(?:\)|。|\.|,|,|.|、|A|B|C|D|$|:|:|\)|))",
|
16 |
+
gen,
|
17 |
+
)
|
18 |
+
|
19 |
+
# A选项正确 | A选项符合题意
|
20 |
+
if res is None:
|
21 |
+
res = re.search(
|
22 |
+
r"(A|B|C|D)(?:选?项)?(?![^ABCD]{0,4}?(?:不|非)[^ABCD]{0,4}?(?:正确|对[的,。:]|符合))[^ABCD]{0,4}?(?:正确|对[的,。:]|符合)",
|
23 |
+
gen,
|
24 |
+
)
|
25 |
+
|
26 |
+
# 直接输出 A
|
27 |
+
if res is None:
|
28 |
+
res = re.search(r"^[\((]?(A|B|C|D)(?:。|\)|)|\.|,|,|.|:|:|$)", gen)
|
29 |
+
|
30 |
+
# 获取第一个出现的字母
|
31 |
+
if res is None:
|
32 |
+
res = re.search(r"(?<![a-zA-Z])(A|B|C|D)(?![a-zA-Z=])", gen)
|
33 |
+
if res is None:
|
34 |
+
res = "A"
|
35 |
+
if isinstance(res, str):
|
36 |
+
return res
|
37 |
+
return res.group(1)
|
38 |
+
|
39 |
+
|
40 |
def extract_choice(gen):
|
41 |
# answer is A | choice is A | choose A
|
42 |
res = re.search(
|