Delete lighteval_tasks.py
lighteval_tasks.py  DELETED  (+0 -251 lines)
@@ -1,251 +0,0 @@
import re
from typing import List, Tuple

from lighteval.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES

_TASKS_STRINGS: List[Tuple[LightevalTaskConfig, str]] = []
_TASKS: List[LightevalTaskConfig] = []

## COMMON_SENSE_REASONING_TASKS ##
COMMON_SENSE_REASONING_TASKS = [
    LightevalTaskConfig(
        name="hellaswag",
        prompt_function="hellaswag_prompt",
        hf_repo="hellaswag",
        hf_subset="default",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    LightevalTaskConfig(
        name="winogrande",
        prompt_function="winogrande",
        hf_repo="winogrande",
        hf_subset="winogrande_xl",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    LightevalTaskConfig(
        name="piqa",
        prompt_function="piqa_harness",
        hf_repo="piqa",
        hf_subset="plain_text",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    LightevalTaskConfig(
        name="siqa",
        prompt_function="siqa_prompt",
        hf_repo="lighteval/siqa",
        hf_subset="default",
        hf_avail_splits=["train", "validation"],
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    LightevalTaskConfig(
        name="openbookqa",
        prompt_function="openbookqa",
        hf_repo="openbookqa",
        hf_subset="main",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    LightevalTaskConfig(
        name="arc:easy",
        prompt_function="arc",
        hf_repo="ai2_arc",
        hf_subset="ARC-Easy",
        evaluation_splits=["test"],
        generation_size=1,
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    LightevalTaskConfig(
        name="arc:challenge",
        prompt_function="arc",
        hf_repo="ai2_arc",
        hf_subset="ARC-Challenge",
        evaluation_splits=["test"],
        generation_size=1,
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    LightevalTaskConfig(
        name="commonsense_qa",
        prompt_function="commonsense_qa_prompt",
        hf_repo="commonsense_qa",
        hf_subset="default",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
]


def commonsense_qa_prompt(line, task_name: str = None):
    return Doc(
        task_name=task_name,
        query=line["question"],
        choices=[f" {c}" for c in line["choices"]["text"]],
        gold_index=LETTER_INDICES.index(line["answerKey"].strip()),
        instruction="",
    )


def siqa_prompt(line, task_name: str = None):
    return Doc(
        task_name=task_name,
        query=line["context"] + " " + line["question"],
        choices=[f" {c}" for c in [line["answerA"], line["answerB"], line["answerC"]]],
        gold_index=int(line["label"]) - 1,
        instruction="",
    )


def hellaswag_prompt(line, task_name: str = None):
    def preprocess(text):
        """Comes from AiHarness"""
        # text = text.strip()
        # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
        text = text.replace(" [title]", ". ")
        text = re.sub("\\[.*?\\]", "", text)
        text = text.replace("  ", " ")
        return text

    ctx = f"{line['ctx_a']} {line['ctx_b'].capitalize()} "
    return Doc(
        task_name=task_name,
        query=preprocess(line["activity_label"] + ": " + ctx),
        choices=[" " + preprocess(ending) for ending in line["endings"]],
        gold_index=int(line["label"]) if line["label"] != "" else -1,  # -1 for test
        # "metric": "choices_loglikelihood",
    )


# 0-shot for common sense
COMMON_SENSE_REASONING_STRING = [(t, f"custom|{t.name}|0|1") for t in COMMON_SENSE_REASONING_TASKS]
_TASKS_STRINGS.extend(COMMON_SENSE_REASONING_STRING)
_TASKS += COMMON_SENSE_REASONING_TASKS


## MMLU ##
class CustomMMLUEvaluationTask(LightevalTaskConfig):
    def __init__(
        self,
        name,
        prompt_function="mmlu_prompt",
        hf_repo="lighteval/mmlu",
        hf_subset=None,
        # metric=[Metrics.loglikelihood_acc_single_token],
        metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
        hf_avail_splits=None,
        evaluation_splits=["test"],
        few_shots_split="dev",
        few_shots_select=None,
        suite=None,
        generation_size=-1,
        stop_sequence=None,
        output_regex=None,
        frozen=False,
    ):
        super().__init__(
            name=name,
            prompt_function=prompt_function,
            hf_repo=hf_repo,
            hf_subset=hf_subset,
            metric=metric,
            hf_avail_splits=hf_avail_splits,
            evaluation_splits=evaluation_splits,
            few_shots_split=few_shots_split,
            few_shots_select=few_shots_select,
            suite=suite,
            generation_size=generation_size,
            stop_sequence=stop_sequence,
            output_regex=output_regex,
            frozen=frozen,
        )


MMLU_TASKS = [
    CustomMMLUEvaluationTask(name="mmlu:abstract_algebra", hf_subset="abstract_algebra"),
    CustomMMLUEvaluationTask(name="mmlu:anatomy", hf_subset="anatomy"),
    CustomMMLUEvaluationTask(name="mmlu:astronomy", hf_subset="astronomy"),
    CustomMMLUEvaluationTask(name="mmlu:business_ethics", hf_subset="business_ethics"),
    CustomMMLUEvaluationTask(name="mmlu:clinical_knowledge", hf_subset="clinical_knowledge"),
    CustomMMLUEvaluationTask(name="mmlu:college_biology", hf_subset="college_biology"),
    CustomMMLUEvaluationTask(name="mmlu:college_chemistry", hf_subset="college_chemistry"),
    CustomMMLUEvaluationTask(name="mmlu:college_computer_science", hf_subset="college_computer_science"),
    CustomMMLUEvaluationTask(name="mmlu:college_mathematics", hf_subset="college_mathematics"),
    CustomMMLUEvaluationTask(name="mmlu:college_medicine", hf_subset="college_medicine"),
    CustomMMLUEvaluationTask(name="mmlu:college_physics", hf_subset="college_physics"),
    CustomMMLUEvaluationTask(name="mmlu:computer_security", hf_subset="computer_security"),
    CustomMMLUEvaluationTask(name="mmlu:conceptual_physics", hf_subset="conceptual_physics"),
    CustomMMLUEvaluationTask(name="mmlu:econometrics", hf_subset="econometrics"),
    CustomMMLUEvaluationTask(name="mmlu:electrical_engineering", hf_subset="electrical_engineering"),
    CustomMMLUEvaluationTask(name="mmlu:elementary_mathematics", hf_subset="elementary_mathematics"),
    CustomMMLUEvaluationTask(name="mmlu:formal_logic", hf_subset="formal_logic"),
    CustomMMLUEvaluationTask(name="mmlu:global_facts", hf_subset="global_facts"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_biology", hf_subset="high_school_biology"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_chemistry", hf_subset="high_school_chemistry"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_computer_science", hf_subset="high_school_computer_science"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_european_history", hf_subset="high_school_european_history"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_geography", hf_subset="high_school_geography"),
    CustomMMLUEvaluationTask(
        name="mmlu:high_school_government_and_politics", hf_subset="high_school_government_and_politics"
    ),
    CustomMMLUEvaluationTask(name="mmlu:high_school_macroeconomics", hf_subset="high_school_macroeconomics"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_mathematics", hf_subset="high_school_mathematics"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_microeconomics", hf_subset="high_school_microeconomics"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_physics", hf_subset="high_school_physics"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_psychology", hf_subset="high_school_psychology"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_statistics", hf_subset="high_school_statistics"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_us_history", hf_subset="high_school_us_history"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_world_history", hf_subset="high_school_world_history"),
    CustomMMLUEvaluationTask(name="mmlu:human_aging", hf_subset="human_aging"),
    CustomMMLUEvaluationTask(name="mmlu:human_sexuality", hf_subset="human_sexuality"),
    CustomMMLUEvaluationTask(name="mmlu:international_law", hf_subset="international_law"),
    CustomMMLUEvaluationTask(name="mmlu:jurisprudence", hf_subset="jurisprudence"),
    CustomMMLUEvaluationTask(name="mmlu:logical_fallacies", hf_subset="logical_fallacies"),
    CustomMMLUEvaluationTask(name="mmlu:machine_learning", hf_subset="machine_learning"),
    CustomMMLUEvaluationTask(name="mmlu:management", hf_subset="management"),
    CustomMMLUEvaluationTask(name="mmlu:marketing", hf_subset="marketing"),
    CustomMMLUEvaluationTask(name="mmlu:medical_genetics", hf_subset="medical_genetics"),
    CustomMMLUEvaluationTask(name="mmlu:miscellaneous", hf_subset="miscellaneous"),
    CustomMMLUEvaluationTask(name="mmlu:moral_disputes", hf_subset="moral_disputes"),
    CustomMMLUEvaluationTask(name="mmlu:moral_scenarios", hf_subset="moral_scenarios"),
    CustomMMLUEvaluationTask(name="mmlu:nutrition", hf_subset="nutrition"),
    CustomMMLUEvaluationTask(name="mmlu:philosophy", hf_subset="philosophy"),
    CustomMMLUEvaluationTask(name="mmlu:prehistory", hf_subset="prehistory"),
    CustomMMLUEvaluationTask(name="mmlu:professional_accounting", hf_subset="professional_accounting"),
    CustomMMLUEvaluationTask(name="mmlu:professional_law", hf_subset="professional_law"),
    CustomMMLUEvaluationTask(name="mmlu:professional_medicine", hf_subset="professional_medicine"),
    CustomMMLUEvaluationTask(name="mmlu:professional_psychology", hf_subset="professional_psychology"),
    CustomMMLUEvaluationTask(name="mmlu:public_relations", hf_subset="public_relations"),
    CustomMMLUEvaluationTask(name="mmlu:security_studies", hf_subset="security_studies"),
    CustomMMLUEvaluationTask(name="mmlu:sociology", hf_subset="sociology"),
    CustomMMLUEvaluationTask(name="mmlu:us_foreign_policy", hf_subset="us_foreign_policy"),
    CustomMMLUEvaluationTask(name="mmlu:virology", hf_subset="virology"),
    CustomMMLUEvaluationTask(name="mmlu:world_religions", hf_subset="world_religions"),
]


def mmlu_prompt(line, task_name: str = None):
    """MMLU prompt without letters"""
    topic = line["subject"]
    prompt = f"The following are questions about {topic.replace('_', ' ')}.\nQuestion: "
    prompt += line["question"] + "\nAnswer:"

    return Doc(
        task_name=task_name,
        query=prompt,
        choices=[f" {c}" for c in line["choices"]],
        gold_index=line["answer"],
        instruction=f"The following are questions about {topic.replace('_', ' ')}.\n",
    )


MMLU_STRING = [(t, f"custom|{t.name}|0|1") for t in MMLU_TASKS]
_TASKS_STRINGS.extend(MMLU_STRING)
_TASKS += MMLU_TASKS

# common sense reasoning + mmlu
EARLY_SIGNAL_TASKS = ",".join([t[1] for t in COMMON_SENSE_REASONING_STRING] + [t[1] for t in MMLU_STRING])

# Convert to dict for lighteval
TASKS_TABLE = [task.as_dict() for task in _TASKS]
# You can have a few pre-organised groups of tasks
TASKS_GROUPS = {
    "early-signal": EARLY_SIGNAL_TASKS,
}
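For context, the two module-level exports at the end of the deleted file, TASKS_TABLE and TASKS_GROUPS, are what lighteval reads when pointed at a custom-tasks module. Below is a minimal sketch of loading and inspecting the file outside of lighteval; the loader is an illustration only (it is not lighteval's own discovery code), and it assumes the file above is saved locally as lighteval_tasks.py with lighteval installed so its imports resolve.

import importlib.util

# Load the custom-tasks module from a local copy (hypothetical path).
spec = importlib.util.spec_from_file_location("custom_tasks", "lighteval_tasks.py")
custom_tasks = importlib.util.module_from_spec(spec)
spec.loader.exec_module(custom_tasks)

# 8 common-sense tasks + 57 MMLU subsets = 65 task configs serialized to dicts.
print(len(custom_tasks.TASKS_TABLE))

# The "early-signal" group is a comma-separated list of task strings such as
# "custom|hellaswag|0|1", i.e. suite|task|num_fewshot|truncation flag in the
# lighteval task-string format this file appears to target (assumption).
print(custom_tasks.TASKS_GROUPS["early-signal"].split(",")[:3])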