Shaltiel committed on
Commit
9e82c5f
·
1 Parent(s): b128812

Added ilfacts task (not for eval yet) and updated backend with fallback

Browse files
custom_tasks.py CHANGED
@@ -11,11 +11,12 @@ from src.custom_tasks.sentiment_task import *
11
  from src.custom_tasks.winograd_task import *
12
  from src.custom_tasks.translation_task import *
13
  from src.custom_tasks.snli_task import *
 
14
 
15
  ## MODULE LOGIC
16
  # You should not need to touch this
17
  # Convert to dict for lighteval
18
- TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task, winograd_task, translation_task, snli_task]]
19
 
20
  if __name__ == "__main__":
21
  print(t["name"] for t in TASKS_TABLE)
 
11
  from src.custom_tasks.winograd_task import *
12
  from src.custom_tasks.translation_task import *
13
  from src.custom_tasks.snli_task import *
14
+ from src.custom_tasks.ilfacts_task import *
15
 
16
  ## MODULE LOGIC
17
  # You should not need to touch this
18
  # Convert to dict for lighteval
19
+ TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task, winograd_task, translation_task, snli_task, ilfacts_task]]
20
 
21
  if __name__ == "__main__":
22
  print(t["name"] for t in TASKS_TABLE)
src/backend/run_eval_suite_lighteval.py CHANGED
@@ -3,6 +3,7 @@ import os
3
  import logging
4
  from datetime import datetime
5
  from argparse import Namespace
 
6
 
7
  from lighteval.main_accelerate import main, EnvConfig, create_model_config, load_model
8
  from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN, OWNER
@@ -10,6 +11,7 @@ from src.backend.manage_requests import EvalRequest
10
  from lighteval.logging.evaluation_tracker import EnhancedJSONEncoder
11
  from lighteval.models.model_loader import ModelInfo
12
  from huggingface_hub.errors import InferenceEndpointTimeoutError
 
13
 
14
  logging.getLogger("openai").setLevel(logging.WARNING)
15
 
@@ -21,70 +23,81 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
21
  if limit:
22
  print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
23
 
24
- args = DefaultNamespace(**{
25
- "model_config": dict(model=dict(
26
- type="endpoint",
27
- base_params=dict(
28
- endpoint_name=f'{eval_request.model.split("/")[1].replace(".", "-").replace("_", "-").lower()}-lighteval'[-32:].strip('-'),
29
- model=eval_request.model,
30
- revision=eval_request.revision,
31
- dtype=eval_request.precision,
32
- reuse_existing=False
33
- ),
34
- instance=dict(
35
- accelerator=accelerator,
36
- region=region,
37
- vendor=vendor,
38
- instance_size=instance_size,
39
- instance_type=instance_type,
40
- framework='pytorch',
41
- endpoint_type='protected',
42
- namespace=OWNER
43
- ),
44
- generation=dict(
45
- add_special_tokens=True
46
- )
47
- )),
48
- "max_samples": limit,
49
- "job_id": str(datetime.now()),
50
- "push_results_to_hub": True,
51
- "save_details": False,
52
- "push_details_to_hub": False,
53
- "public_run": False,
54
- "cache_dir": CACHE_PATH,
55
- "results_org": OWNER,
56
- "output_dir": local_dir,
57
- "override_batch_size": batch_size,
58
- "custom_tasks": "custom_tasks.py",
59
- "tasks": task_names,
60
- "dataset_loading_processes": 24,
61
- "num_fewshot_seeds": 0
62
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
- try:
65
- # in case of timeout, try it again with reuse_existing
66
- for i in range(3):
 
 
 
 
 
 
 
 
 
 
 
67
  try:
68
- results = main(args)
69
- # if we are i>0, then raise an error so that we call clean up
70
- if i > 0: raise Exception()
71
- except InferenceEndpointTimeoutError:
72
- if i < 3:
73
- print('Timed out, trying again...')
74
- args.model_config['model']['base_params']['reuse_existing'] = True
75
 
76
- dumped = json.dumps(results, cls=EnhancedJSONEncoder, indent=2)
77
- print(dumped)
78
- except Exception as ex: # if eval failed, we force a cleanup
79
- import traceback
80
- traceback.print_exception(ex)
81
- env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
82
- args.model_config['model']['base_params']['reuse_existing'] = True
83
- model_config = create_model_config(args=args, accelerator=accelerator)
84
- model, _ = load_model(config=model_config, env_config=env_config)
85
- print('Cleaning up')
86
- model.reuse_existing = False # force it to clean up
87
- model.cleanup()
88
- results = None
89
 
90
  return results
 
3
  import logging
4
  from datetime import datetime
5
  from argparse import Namespace
6
+ import traceback
7
 
8
  from lighteval.main_accelerate import main, EnvConfig, create_model_config, load_model
9
  from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN, OWNER
 
11
  from lighteval.logging.evaluation_tracker import EnhancedJSONEncoder
12
  from lighteval.models.model_loader import ModelInfo
13
  from huggingface_hub.errors import InferenceEndpointTimeoutError
14
+ from huggingface_hub import HfApi
15
 
16
  logging.getLogger("openai").setLevel(logging.WARNING)
17
 
 
23
  if limit:
24
  print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
25
 
26
+ api = HfApi(token=TOKEN)
27
+
28
+ completed = False
29
+ for img_version in ['1.4.5', '2.0.3']:
30
+ args = DefaultNamespace(**{
31
+ "model_config": dict(model=dict(
32
+ type="endpoint",
33
+ base_params=dict(
34
+ endpoint_name=f'{eval_request.model.split("/")[1].replace(".", "-").replace("_", "-").lower()}-lighteval'[-32:].strip('-'),
35
+ model=eval_request.model,
36
+ revision=eval_request.revision,
37
+ dtype=eval_request.precision,
38
+ reuse_existing=False
39
+ ),
40
+ instance=dict(
41
+ accelerator=accelerator,
42
+ region=region,
43
+ vendor=vendor,
44
+ instance_size=instance_size,
45
+ instance_type=instance_type,
46
+ framework='pytorch',
47
+ endpoint_type='protected',
48
+ namespace=OWNER,
49
+ image_url='ghcr.io/huggingface/text-generation-inference:' + img_version
50
+ ),
51
+ generation=dict(
52
+ add_special_tokens=True
53
+ )
54
+ )),
55
+ "max_samples": limit,
56
+ "job_id": str(datetime.now()),
57
+ "push_results_to_hub": True,
58
+ "save_details": False,
59
+ "push_details_to_hub": False,
60
+ "public_run": False,
61
+ "cache_dir": CACHE_PATH,
62
+ "results_org": OWNER,
63
+ "output_dir": local_dir,
64
+ "override_batch_size": batch_size,
65
+ "custom_tasks": "custom_tasks.py",
66
+ "tasks": task_names,
67
+ "dataset_loading_processes": 24,
68
+ "num_fewshot_seeds": 0
69
+ })
70
+
71
+
72
+ try:
73
+ # in case of timeout, try it again with reuse_existing
74
+ for i in range(3):
75
+ try:
76
+ results = main(args)
77
+ completed = True # success!
78
 
79
+ dumped = json.dumps(results, cls=EnhancedJSONEncoder, indent=2)
80
+ print(dumped)
81
+
82
+ # if we are i>0, then raise an error so that we call clean up
83
+ if i > 0: raise Exception()
84
+ break # no need to loop twice if we completed
85
+ except InferenceEndpointTimeoutError:
86
+ if i < 3:
87
+ print('Timed out, trying again...')
88
+ args.model_config['model']['base_params']['reuse_existing'] = True
89
+ # loop around and try again, for timeout
90
+
91
+ except Exception as ex: # if eval failed, we force a cleanup
92
+ traceback.print_exception(ex)
93
  try:
94
+ api.delete_inference_endpoint(
95
+ name=args.model_config['model']['base_params']['endpoint_name'],
96
+ namespace=args.model_config['model']['instance']['namespace']
97
+ )
98
+ except Exception as ex:
99
+ traceback.print_exception(ex)
 
100
 
101
+ if completed: break # no need to try with a different image version
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
  return results
src/custom_tasks/ilfacts_task.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import string
3
+ from lighteval.tasks.lighteval_task import LightevalTaskConfig
4
+ from lighteval.metrics import Metrics, MetricCategory
5
+ from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
6
+ from aenum import extend_enum
7
+ import numpy as np
8
+ from lighteval.tasks.requests import Doc
9
+ from Levenshtein import distance
10
+ import collections
11
+ from lighteval.utils import as_list
12
+
13
+ def ilfacts_eval_fn(golds: list[str], predictions: list[str], formatted_doc: Doc = None):
14
+ if len(predictions) > 1:
15
+ raise ValueError("Predictions should have one item")
16
+ # do some sanitization, since some models produce more info
17
+ pred = re.sub('<[^>]+>', '', predictions[0]).strip() # remove xml tags
18
+ return 1 if pred == golds[0] else 0
19
+
20
+ ilfacts_acc_metric = CorpusLevelMetric(
21
+ metric="ilfacts_acc",
22
+ higher_is_better=True,
23
+ category=MetricCategory.GENERATIVE,
24
+ use_case=MetricUseCase.ACCURACY,
25
+ corpus_level_fn=np.mean,
26
+ sample_level_fn=ilfacts_eval_fn
27
+ )
28
+ extend_enum(Metrics, 'ilfacts_acc_metric', ilfacts_acc_metric)
29
+
30
+ def ilfacts_prompt_fn(line, task_name: str = None):
31
+ """Defines how to go from a dataset line to a doc object.
32
+ Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
33
+ about what this function should do in the README.
34
+ """
35
+ return Doc(
36
+ task_name=task_name,
37
+ query=line["prompt"].strip(),
38
+ choices=[resp.strip() for resp in line["response"]],
39
+ gold_index=0,
40
+ instruction="",
41
+ )
42
+
43
+ # This is how you create a simple task (like hellaswag) which has one single subset
44
+ # attached to it, and one evaluation possible.
45
+ ilfacts_task = LightevalTaskConfig(
46
+ name="ilfacts-acc",
47
+ prompt_function="ilfacts_prompt_fn", # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
48
+ suite=["custom"],
49
+ hf_repo="hebrew-llm-leaderboard/tests",
50
+ hf_subset="default",
51
+ hf_avail_splits=["ilfacts"],
52
+ evaluation_splits=["ilfacts"],
53
+ metric=['ilfacts_acc_metric'],
54
+ stop_sequence=['\n'],
55
+ generation_size=48
56
+ )