Spaces:
Running
Running
update
Browse files
tasks.py
CHANGED
@@ -85,6 +85,9 @@ class Task:
|
|
85 |
}
|
86 |
self.label_column = self.label_column or self.input_column
|
87 |
|
|
|
|
|
|
|
88 |
@cached_property
|
89 |
def samples(self):
|
90 |
return self.dataset[self.input_column]
|
|
|
85 |
}
|
86 |
self.label_column = self.label_column or self.input_column
|
87 |
|
88 |
+
def __eq__(self, __value: object) -> bool:
|
89 |
+
return self.name == __value.name
|
90 |
+
|
91 |
@cached_property
def samples(self):
    """Return the dataset column selected by ``input_column``.

    Wrapped in ``cached_property``, so the lookup runs once per instance
    and the result is reused on later accesses.
    """
    dataset = self.dataset
    return dataset[self.input_column]
|
tlem.py
CHANGED
@@ -13,6 +13,7 @@ import pandas as pd
|
|
13 |
from .tasks import *
|
14 |
from .utils import *
|
15 |
from itertools import chain
|
|
|
16 |
|
17 |
|
18 |
class ReasoningMetric(evaluate.Metric):
|
@@ -78,26 +79,29 @@ class Suite(EvaluationSuite):
|
|
78 |
# case _:
|
79 |
# return list(chain(*self.suite.values()))[key]
|
80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
def run(
|
82 |
self,
|
83 |
model_or_pipeline: Any,
|
84 |
-
suite=None,
|
85 |
) -> dict[str, float]:
|
86 |
self.assert_suite_nonempty()
|
87 |
-
if suite is None:
|
88 |
-
suite = self.suite
|
89 |
|
90 |
self.suite: dict[str, list[Task]]
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
results[category] = self.run(model_or_pipeline, tasks)
|
96 |
-
else:
|
97 |
-
for task in tasks:
|
98 |
-
results[category].update(task.run(model_or_pipeline))
|
99 |
-
results[category] = np.mean(list(results[category].values()))
|
100 |
-
return results
|
101 |
|
102 |
def get_suite(self, name) -> dict[str, Task]:
|
103 |
chat = False
|
@@ -144,6 +148,20 @@ class Suite(EvaluationSuite):
|
|
144 |
input_column="problem",
|
145 |
label_column="solution",
|
146 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
if isinstance(suite, Task):
|
148 |
suite = [suite]
|
149 |
if isinstance(suite, list):
|
|
|
13 |
from .tasks import *
|
14 |
from .utils import *
|
15 |
from itertools import chain
|
16 |
+
from copy import deepcopy
|
17 |
|
18 |
|
19 |
class ReasoningMetric(evaluate.Metric):
|
|
|
79 |
# case _:
|
80 |
# return list(chain(*self.suite.values()))[key]
|
81 |
|
82 |
+
def aggregate(self, suite):
    """Collapse a (possibly nested) suite mapping into mean scores.

    Each value of ``suite`` is either a dict, which is aggregated
    recursively, or an iterable of tasks, which is replaced by the mean of
    all values found in each task's ``result`` mapping.

    Args:
        suite: Mapping from category name to a nested mapping or to an
            iterable of objects exposing a ``result`` dict. It is
            modified in place.

    Returns:
        The same ``suite`` object, with every task collection replaced by
        its mean score. A category with no scores maps to NaN.
    """
    # Only values of existing keys are reassigned, so the dict size never
    # changes while it is being iterated.
    for category, tasks in suite.items():
        if isinstance(tasks, dict):
            suite[category] = self.aggregate(tasks)
            continue

        scores = [score for task in tasks for score in task.result.values()]
        # np.mean([]) returns NaN but also emits a RuntimeWarning; produce
        # the same value explicitly for an empty category.
        suite[category] = np.mean(scores) if scores else float("nan")

    return suite
|
93 |
+
|
94 |
def run(
    self,
    model_or_pipeline: Any,
) -> dict[str, float]:
    """Run every task of the suite and return the aggregated scores.

    Args:
        model_or_pipeline: Passed unchanged to each task's ``run`` method.

    Returns:
        The value of ``self.aggregate`` applied to a copy of the layout of
        ``self.suite``; ``self.suite`` itself is left untouched.
    """
    self.assert_suite_nonempty()

    self.suite: dict[str, list[Task]]
    for task in (bar := tqdm(self.tasks)):
        # NOTE(review): ``desc`` is assigned directly rather than through
        # ``set_description``; presumably so the text only shows up on the
        # bar's next refresh, i.e. once the task has finished -- confirm.
        bar.desc = f"complete {task.name}."
        task.run(model_or_pipeline)

    def _copy_layout(node):
        """Copy nested dicts only; non-dict leaves are shared, not cloned."""
        if isinstance(node, dict):
            return {key: _copy_layout(value) for key, value in node.items()}
        return node

    # self.aggregate overwrites the values of the dict it receives, so it
    # must not be handed self.suite directly. Only the dict layout needs
    # copying: the tasks are merely read, so a deep copy of every task
    # would be wasted work.
    return self.aggregate(_copy_layout(self.suite))
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
def get_suite(self, name) -> dict[str, Task]:
|
107 |
chat = False
|
|
|
148 |
input_column="problem",
|
149 |
label_column="solution",
|
150 |
)
|
151 |
+
|
152 |
+
case "open-leaderboard":
|
153 |
+
suite = {}
|
154 |
+
for name in [
|
155 |
+
"arc",
|
156 |
+
"hellaswag",
|
157 |
+
"mmlu-chat",
|
158 |
+
"winogrande",
|
159 |
+
"gsm8k",
|
160 |
+
# "truthful_qa",
|
161 |
+
"drop",
|
162 |
+
]:
|
163 |
+
suite[name] = self.get_suite(name)
|
164 |
+
|
165 |
if isinstance(suite, Task):
|
166 |
suite = [suite]
|
167 |
if isinstance(suite, list):
|