khulnasoft committed (verified)
Commit 91c4ae7 · 1 Parent(s): a9d4a31

Delete lighteval_tasks.py

Files changed (1):
  1. lighteval_tasks.py +0 -251
lighteval_tasks.py DELETED
@@ -1,251 +0,0 @@
- import re
- from typing import List, Tuple
-
- from lighteval.metrics import Metrics
- from lighteval.tasks.lighteval_task import LightevalTaskConfig
- from lighteval.tasks.requests import Doc
- from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
-
- _TASKS_STRINGS: List[Tuple[LightevalTaskConfig, str]] = []
- _TASKS: List[LightevalTaskConfig] = []
-
- ## COMMON_SENSE_REASONING_TASKS ##
- COMMON_SENSE_REASONING_TASKS = [
-     LightevalTaskConfig(
-         name="hellaswag",
-         prompt_function="hellaswag_prompt",
-         hf_repo="hellaswag",
-         hf_subset="default",
-         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-     ),
-     LightevalTaskConfig(
-         name="winogrande",
-         prompt_function="winogrande",
-         hf_repo="winogrande",
-         hf_subset="winogrande_xl",
-         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-     ),
-     LightevalTaskConfig(
-         name="piqa",
-         prompt_function="piqa_harness",
-         hf_repo="piqa",
-         hf_subset="plain_text",
-         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-     ),
-     LightevalTaskConfig(
-         name="siqa",
-         prompt_function="siqa_prompt",
-         hf_repo="lighteval/siqa",
-         hf_subset="default",
-         hf_avail_splits=["train", "validation"],
-         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-     ),
-     LightevalTaskConfig(
-         name="openbookqa",
-         prompt_function="openbookqa",
-         hf_repo="openbookqa",
-         hf_subset="main",
-         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-     ),
-     LightevalTaskConfig(
-         name="arc:easy",
-         prompt_function="arc",
-         hf_repo="ai2_arc",
-         hf_subset="ARC-Easy",
-         evaluation_splits=["test"],
-         generation_size=1,
-         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-     ),
-     LightevalTaskConfig(
-         name="arc:challenge",
-         prompt_function="arc",
-         hf_repo="ai2_arc",
-         hf_subset="ARC-Challenge",
-         evaluation_splits=["test"],
-         generation_size=1,
-         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-     ),
-     LightevalTaskConfig(
-         name="commonsense_qa",
-         prompt_function="commonsense_qa_prompt",
-         hf_repo="commonsense_qa",
-         hf_subset="default",
-         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-     ),
- ]
-
-
- def commonsense_qa_prompt(line, task_name: str = None):
-     return Doc(
-         task_name=task_name,
-         query=line["question"],
-         choices=[f" {c}" for c in line["choices"]["text"]],
-         gold_index=LETTER_INDICES.index(line["answerKey"].strip()),
-         instruction="",
-     )
-
-
- def siqa_prompt(line, task_name: str = None):
-     return Doc(
-         task_name=task_name,
-         query=line["context"] + " " + line["question"],
-         choices=[f" {c}" for c in [line["answerA"], line["answerB"], line["answerC"]]],
-         gold_index=int(line["label"]) - 1,
-         instruction="",
-     )
-
-
- def hellaswag_prompt(line, task_name: str = None):
-     def preprocess(text):
-         """Comes from AiHarness"""
-         # text = text.strip()
-         # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
-         text = text.replace(" [title]", ". ")
-         text = re.sub("\\[.*?\\]", "", text)
-         text = text.replace("  ", " ")
-         return text
-
-     ctx = f"{line['ctx_a']} {line['ctx_b'].capitalize()} "
-     return Doc(
-         task_name=task_name,
-         query=preprocess(line["activity_label"] + ": " + ctx),
-         choices=[" " + preprocess(ending) for ending in line["endings"]],
-         gold_index=int(line["label"]) if line["label"] != "" else -1,  # -1 for test
-         # "metric": "choices_loglikelihood",
-     )
-
-
- # 0 short for common sense
- COMMON_SENSE_REASONING_STRING = [(t, f"custom|{t.name}|0|1") for t in COMMON_SENSE_REASONING_TASKS]
- _TASKS_STRINGS.extend(COMMON_SENSE_REASONING_STRING)
- _TASKS += COMMON_SENSE_REASONING_TASKS
-
- ## MMLU ##
- class CustomMMLUEvaluationTask(LightevalTaskConfig):
-     def __init__(
-         self,
-         name,
-         prompt_function="mmlu_prompt",
-         hf_repo="lighteval/mmlu",
-         hf_subset=None,
-         # metric=[Metrics.loglikelihood_acc_single_token],
-         metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
-         hf_avail_splits=None,
-         evaluation_splits=["test"],
-         few_shots_split="dev",
-         few_shots_select=None,
-         suite=None,
-         generation_size=-1,
-         stop_sequence=None,
-         output_regex=None,
-         frozen=False,
-     ):
-         super().__init__(
-             name=name,
-             prompt_function=prompt_function,
-             hf_repo=hf_repo,
-             hf_subset=hf_subset,
-             metric=metric,
-             hf_avail_splits=hf_avail_splits,
-             evaluation_splits=evaluation_splits,
-             few_shots_split=few_shots_split,
-             few_shots_select=few_shots_select,
-             suite=suite,
-             generation_size=generation_size,
-             stop_sequence=stop_sequence,
-             output_regex=output_regex,
-             frozen=frozen,
-         )
-
-
- MMLU_TASKS = [
-     CustomMMLUEvaluationTask(name="mmlu:abstract_algebra", hf_subset="abstract_algebra"),
-     CustomMMLUEvaluationTask(name="mmlu:anatomy", hf_subset="anatomy"),
-     CustomMMLUEvaluationTask(name="mmlu:astronomy", hf_subset="astronomy"),
-     CustomMMLUEvaluationTask(name="mmlu:business_ethics", hf_subset="business_ethics"),
-     CustomMMLUEvaluationTask(name="mmlu:clinical_knowledge", hf_subset="clinical_knowledge"),
-     CustomMMLUEvaluationTask(name="mmlu:college_biology", hf_subset="college_biology"),
-     CustomMMLUEvaluationTask(name="mmlu:college_chemistry", hf_subset="college_chemistry"),
-     CustomMMLUEvaluationTask(name="mmlu:college_computer_science", hf_subset="college_computer_science"),
-     CustomMMLUEvaluationTask(name="mmlu:college_mathematics", hf_subset="college_mathematics"),
-     CustomMMLUEvaluationTask(name="mmlu:college_medicine", hf_subset="college_medicine"),
-     CustomMMLUEvaluationTask(name="mmlu:college_physics", hf_subset="college_physics"),
-     CustomMMLUEvaluationTask(name="mmlu:computer_security", hf_subset="computer_security"),
-     CustomMMLUEvaluationTask(name="mmlu:conceptual_physics", hf_subset="conceptual_physics"),
-     CustomMMLUEvaluationTask(name="mmlu:econometrics", hf_subset="econometrics"),
-     CustomMMLUEvaluationTask(name="mmlu:electrical_engineering", hf_subset="electrical_engineering"),
-     CustomMMLUEvaluationTask(name="mmlu:elementary_mathematics", hf_subset="elementary_mathematics"),
-     CustomMMLUEvaluationTask(name="mmlu:formal_logic", hf_subset="formal_logic"),
-     CustomMMLUEvaluationTask(name="mmlu:global_facts", hf_subset="global_facts"),
-     CustomMMLUEvaluationTask(name="mmlu:high_school_biology", hf_subset="high_school_biology"),
-     CustomMMLUEvaluationTask(name="mmlu:high_school_chemistry", hf_subset="high_school_chemistry"),
-     CustomMMLUEvaluationTask(name="mmlu:high_school_computer_science", hf_subset="high_school_computer_science"),
-     CustomMMLUEvaluationTask(name="mmlu:high_school_european_history", hf_subset="high_school_european_history"),
-     CustomMMLUEvaluationTask(name="mmlu:high_school_geography", hf_subset="high_school_geography"),
-     CustomMMLUEvaluationTask(
-         name="mmlu:high_school_government_and_politics", hf_subset="high_school_government_and_politics"
-     ),
-     CustomMMLUEvaluationTask(name="mmlu:high_school_macroeconomics", hf_subset="high_school_macroeconomics"),
-     CustomMMLUEvaluationTask(name="mmlu:high_school_mathematics", hf_subset="high_school_mathematics"),
-     CustomMMLUEvaluationTask(name="mmlu:high_school_microeconomics", hf_subset="high_school_microeconomics"),
-     CustomMMLUEvaluationTask(name="mmlu:high_school_physics", hf_subset="high_school_physics"),
-     CustomMMLUEvaluationTask(name="mmlu:high_school_psychology", hf_subset="high_school_psychology"),
-     CustomMMLUEvaluationTask(name="mmlu:high_school_statistics", hf_subset="high_school_statistics"),
-     CustomMMLUEvaluationTask(name="mmlu:high_school_us_history", hf_subset="high_school_us_history"),
-     CustomMMLUEvaluationTask(name="mmlu:high_school_world_history", hf_subset="high_school_world_history"),
-     CustomMMLUEvaluationTask(name="mmlu:human_aging", hf_subset="human_aging"),
-     CustomMMLUEvaluationTask(name="mmlu:human_sexuality", hf_subset="human_sexuality"),
-     CustomMMLUEvaluationTask(name="mmlu:international_law", hf_subset="international_law"),
-     CustomMMLUEvaluationTask(name="mmlu:jurisprudence", hf_subset="jurisprudence"),
-     CustomMMLUEvaluationTask(name="mmlu:logical_fallacies", hf_subset="logical_fallacies"),
-     CustomMMLUEvaluationTask(name="mmlu:machine_learning", hf_subset="machine_learning"),
-     CustomMMLUEvaluationTask(name="mmlu:management", hf_subset="management"),
-     CustomMMLUEvaluationTask(name="mmlu:marketing", hf_subset="marketing"),
-     CustomMMLUEvaluationTask(name="mmlu:medical_genetics", hf_subset="medical_genetics"),
-     CustomMMLUEvaluationTask(name="mmlu:miscellaneous", hf_subset="miscellaneous"),
-     CustomMMLUEvaluationTask(name="mmlu:moral_disputes", hf_subset="moral_disputes"),
-     CustomMMLUEvaluationTask(name="mmlu:moral_scenarios", hf_subset="moral_scenarios"),
-     CustomMMLUEvaluationTask(name="mmlu:nutrition", hf_subset="nutrition"),
-     CustomMMLUEvaluationTask(name="mmlu:philosophy", hf_subset="philosophy"),
-     CustomMMLUEvaluationTask(name="mmlu:prehistory", hf_subset="prehistory"),
-     CustomMMLUEvaluationTask(name="mmlu:professional_accounting", hf_subset="professional_accounting"),
-     CustomMMLUEvaluationTask(name="mmlu:professional_law", hf_subset="professional_law"),
-     CustomMMLUEvaluationTask(name="mmlu:professional_medicine", hf_subset="professional_medicine"),
-     CustomMMLUEvaluationTask(name="mmlu:professional_psychology", hf_subset="professional_psychology"),
-     CustomMMLUEvaluationTask(name="mmlu:public_relations", hf_subset="public_relations"),
-     CustomMMLUEvaluationTask(name="mmlu:security_studies", hf_subset="security_studies"),
-     CustomMMLUEvaluationTask(name="mmlu:sociology", hf_subset="sociology"),
-     CustomMMLUEvaluationTask(name="mmlu:us_foreign_policy", hf_subset="us_foreign_policy"),
-     CustomMMLUEvaluationTask(name="mmlu:virology", hf_subset="virology"),
-     CustomMMLUEvaluationTask(name="mmlu:world_religions", hf_subset="world_religions"),
- ]
-
-
- def mmlu_prompt(line, task_name: str = None):
-     """MMLU prompt without letters"""
-     topic = line["subject"]
-     prompt = f"The following are questions about {topic.replace('_', ' ')}.\nQuestion: "
-     prompt += line["question"] + "\nAnswer:"
-
-     return Doc(
-         task_name=task_name,
-         query=prompt,
-         choices=[f" {c}" for c in line["choices"]],
-         gold_index=line["answer"],
-         instruction=f"The following are questions about {topic.replace('_', ' ')}.\n",
-     )
-
-
- MMLU_STRING = [(t, f"custom|{t.name}|0|1") for t in MMLU_TASKS]
- _TASKS_STRINGS.extend(MMLU_STRING)
- _TASKS += MMLU_TASKS
-
- # common sense reasoning + mmlu
- EARLY_SIGNAL_TASKS = ",".join([t[1] for t in COMMON_SENSE_REASONING_STRING] + [t[1] for t in MMLU_STRING])
-
- # Convert to dict for lighteval
- TASKS_TABLE = [task.as_dict() for task in _TASKS]
- # You can have a few pre-organised groups of tasks
- TASKS_GROUPS = {
-     "early-signal": EARLY_SIGNAL_TASKS,
- }
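
For reference, a minimal sketch (not part of this commit) of how the deleted module's public exports could be inspected locally; it assumes a copy of lighteval_tasks.py is importable from the working directory and that a lighteval version compatible with its imports (the older custom-task interface that reads TASKS_TABLE and TASKS_GROUPS) is installed:

    # inspect_tasks.py -- hypothetical helper, not part of the repository
    import importlib

    custom = importlib.import_module("lighteval_tasks")

    # TASKS_TABLE is a list of dicts, one per task config (built via .as_dict())
    print(len(custom.TASKS_TABLE), "tasks defined")

    # TASKS_GROUPS maps a group label to a comma-separated task-spec string,
    # e.g. "custom|hellaswag|0|1,custom|winogrande|0|1,..."
    print(custom.TASKS_GROUPS["early-signal"].split(",")[:3])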