pminervini committed • Commit 73d1e6e • 1 Parent(s): b2aa5d0

update
cli/fix-requests-cli.py CHANGED

@@ -41,7 +41,7 @@ for path in json_files:
         data["model_type"] = "fine-tuned"
         to_overwrite = True
 
-    is_instruction_tuned = 'nstruct' in model_id
+    is_instruction_tuned = ('nstruct' in model_id) or ('chat' in model_id)
     if is_instruction_tuned:
         data["model_type"] = "instruction-tuned"
         to_overwrite = True
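The widened check now flags a request as instruction-tuned when the model id contains either the substring 'nstruct' (so it matches both "Instruct" and "instruct") or 'chat'. A minimal sketch of the heuristic in isolation; the model ids below are illustrative examples only, not taken from the request queue:

```python
# Sketch of the substring heuristic introduced in this commit.
# The example model ids are illustrative; the real script reads them from request JSON files.
def looks_instruction_tuned(model_id: str) -> bool:
    return ('nstruct' in model_id) or ('chat' in model_id)

assert looks_instruction_tuned("mistralai/Mistral-7B-Instruct-v0.1")  # matches 'nstruct'
assert looks_instruction_tuned("meta-llama/Llama-2-7b-chat-hf")       # matches 'chat'
assert not looks_instruction_tuned("meta-llama/Llama-2-7b-hf")        # stays "fine-tuned"
```

Note that the comparison is case-sensitive, so an id spelling "Chat" with a capital letter would still be labelled fine-tuned.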
cli/halueval-cli.py CHANGED

@@ -7,6 +7,8 @@ from src.backend.manage_requests import get_eval_requests
 from src.backend.manage_requests import EvalRequest
 from src.backend.run_eval_suite import run_evaluation
 
+from src.backend.tasks.xsum.task import XSum
+
 from lm_eval.tasks import initialize_tasks, include_task_folder
 from lm_eval import tasks, evaluator, utils
 
@@ -15,7 +17,7 @@ from src.envs import QUEUE_REPO
 
 
 def main():
-    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
+    # snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
 
     PENDING_STATUS = "PENDING"
     RUNNING_STATUS = "RUNNING"
@@ -28,7 +30,10 @@ def main():
     eval_requests: list[EvalRequest] = get_eval_requests(job_status=status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
     eval_request = [r for r in eval_requests if 'bloom-560m' in r.model][0]
 
-
+    # my_task = Task("memo-trap", "acc", "memo-trap", 0)
+    my_task = Task("xsum", "rougeLsum", "XSum", 2)
+
+    TASKS_HARNESS = [my_task]
     # task_names = ['triviaqa']
     # TASKS_HARNESS = [task.value for task in Tasks]
 
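The CLI now skips the request-queue snapshot download and evaluates a single hand-built task: XSum, scored with rougeLsum, using 2 few-shot examples (the memo-trap alternative is left commented out). A minimal sketch of the Task record being built here, assuming the same (benchmark, metric, col_name, num_fewshot) field layout used in src/backend/envs.py; the namedtuple below is only a stand-in for the project's own Task class:

```python
# Hypothetical stand-in for the project's Task class, assuming the
# (benchmark, metric, col_name, num_fewshot) field order seen in src/backend/envs.py.
from collections import namedtuple

Task = namedtuple("Task", ["benchmark", "metric", "col_name", "num_fewshot"])

my_task = Task("xsum", "rougeLsum", "XSum", 2)
TASKS_HARNESS = [my_task]

task_names = [t.benchmark for t in TASKS_HARNESS]                       # -> ["xsum"]
fewshot_by_task = {t.benchmark: t.num_fewshot for t in TASKS_HARNESS}   # -> {"xsum": 2}
```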
src/backend/tasks/xsum/xsum.yaml.bak → snippets/xsum.yaml RENAMED
File without changes
src/backend/envs.py CHANGED

@@ -23,12 +23,18 @@ class Tasks(Enum):
     task0 = Task("nq_open", "em", "NQ Open", 64) # 64, as in the ATLAS paper
     task1 = Task("triviaqa", "em", "TriviaQA", 64) # 64, as in the ATLAS paper
     # TruthfulQA is intended as a zero-shot benchmark [5, 47]. https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
+
     # task2 = Task("truthfulqa_gen", "rougeL_acc", "TruthfulQA Gen", 0)
     task3 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1", 0)
     task4 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2", 0)
+
     task5 = Task("halueval_qa", "acc", "HaluEval QA", 0)
+    # task6 = Task("halueval_dialogue", "acc", "HaluEval Dialogue", 0)
+    # task7 = Task("halueval_summarization", "acc", "HaluEval Summarization", 0)
+
     # task6 = Task("xsum", "rougeL_acc", "XSum", 8)
-
+
+    task8 = Task("memo-trap", "acc", "memo-trap", 0)
 
     # NUM_FEWSHOT = 64 # Change with your few shot
 
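With the extra HaluEval variants left commented out and memo-trap added as task8, the Tasks enum remains the list the backend iterates over to build its benchmark set; the commented TASKS_HARNESS = [task.value for task in Tasks] line in cli/halueval-cli.py shows the pattern. A small sketch of that iteration, again assuming the (benchmark, metric, col_name, num_fewshot) field names for Task:

```python
# Sketch of iterating the Tasks enum as the backend does; the Task field names
# (benchmark, num_fewshot) are assumed from the constructor calls in the diff above.
from src.backend.envs import Tasks  # enum defined in src/backend/envs.py

TASKS_HARNESS = [task.value for task in Tasks]   # one Task record per enum member
for t in TASKS_HARNESS:
    print(t.benchmark, t.num_fewshot)            # e.g. "halueval_qa" 0, "memo-trap" 0
```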