Makefile CHANGED
@@ -1,13 +1,18 @@
-.PHONY: style format
-
+.PHONY: style format quality all
 
+# Applies code style fixes to the specified file or directory
 style:
-	python -m black --line-length 119 .
-	python -m isort .
-	ruff check --fix .
+	@echo "Applying style fixes to $(file)"
+	ruff format $(file)
+	ruff check --fix $(file) --line-length 119
-
 
+# Checks code quality for the specified file or directory
 quality:
-	python -m black --check --line-length 119 .
-	python -m isort --check-only .
-	ruff check .
+	@echo "Checking code quality for $(file)"
+	ruff check $(file) --line-length 119
+
+# Applies PEP8 formatting and checks the entire codebase
+all:
+	@echo "Formatting and checking the entire codebase"
+	ruff format .
+	ruff check --fix . --line-length 119
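
Note: the rewritten `style` and `quality` targets are parameterized by a `file` variable instead of being hard-coded to `.`, so a typical (illustrative) invocation would be `make style file=src/populate.py` or `make quality file=.`, while `make all` runs `ruff format` and `ruff check --fix` over the whole repository.
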
app.py CHANGED
@@ -1,5 +1,4 @@
 import os
-import pandas as pd
 import logging
 import time
 import gradio as gr
@@ -23,8 +22,6 @@ from src.display.utils import (
     COLS,
     EVAL_COLS,
     EVAL_TYPES,
-    NUMERIC_INTERVALS,
-    TYPES,
     AutoEvalColumn,
     ModelType,
     Precision,
@@ -51,11 +48,12 @@ from src.tools.collections import update_collections
 from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
 
 # Configure logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 
 # Start ephemeral Spaces on PRs (see config in README.md)
 enable_space_ci()
 
+
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 
@@ -68,6 +66,7 @@ def time_diff_wrapper(func):
         diff = end_time - start_time
         logging.info(f"Time taken for {func.__name__}: {diff} seconds")
         return result
+
     return wrapper
 
 
@@ -89,12 +88,13 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
             logging.info("Download successful")
             return
         except Exception as e:
-            wait_time = backoff_factor ** attempt
+            wait_time = backoff_factor**attempt
             logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
             time.sleep(wait_time)
             attempt += 1
     raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
 
+
 def init_space(full_init: bool = True):
     """Initializes the application space, loading only necessary data."""
     if full_init:
@@ -120,12 +120,13 @@ def init_space(full_init: bool = True):
         update_collections(original_df)
 
     leaderboard_df = original_df.copy()
-
+
     # Evaluation queue DataFrame retrieval is independent of initialization detail level
     eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
     return leaderboard_df, raw_data, original_df, eval_queue_dfs
 
+
 # Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
 # This controls whether a full initialization should be performed.
 do_full_init = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
@@ -153,36 +154,34 @@ with demo:
            value=leaderboard_df,
            datatype=[c.type for c in fields(AutoEvalColumn)],
            select_columns=SelectColumns(
-               default_selection=[
-                   c.name
-                   for c in fields(AutoEvalColumn)
-                   if c.displayed_by_default
-               ],
+               default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
                cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
                label="Select Columns to Display:",
            ),
-           search_columns=[
-               AutoEvalColumn.model.name,
-               AutoEvalColumn.fullname.name,
-               AutoEvalColumn.license.name
-           ],
-           hide_columns=[
-               c.name
-               for c in fields(AutoEvalColumn)
-               if c.hidden
-           ],
+           search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
+           hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
            filter_columns=[
                ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
                ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-               ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
-               ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Private or deleted", default=True),
-               ColumnFilter(AutoEvalColumn.merged.name, type="boolean", label="Contains a merge/moerge", default=True),
+               ColumnFilter(
+                   AutoEvalColumn.params.name,
+                   type="slider",
+                   min=0,
+                   max=150,
+                   label="Select the number of parameters (B)",
+               ),
+               ColumnFilter(
+                   AutoEvalColumn.still_on_hub.name, type="boolean", label="Private or deleted", default=True
+               ),
+               ColumnFilter(
+                   AutoEvalColumn.merged.name, type="boolean", label="Contains a merge/moerge", default=True
+               ),
                ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
                ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
            ],
-           bool_checkboxgroup_label="Hide models"
+           bool_checkboxgroup_label="Hide models",
        )
-
+
        with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
            with gr.Row():
                with gr.Column():
@@ -313,4 +312,4 @@ scheduler.add_job(restart_space, "interval", hours=3) # restarted every 3h
 scheduler.add_job(update_dynamic_files, "interval", hours=2) # launched every 2 hour
 scheduler.start()
 
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
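
A side note on the `download_dataset` hunk above: the retry loop sleeps `backoff_factor**attempt` seconds between failed attempts, so the first retry waits `backoff_factor**0` = 1 second and later waits grow geometrically. Below is a minimal, self-contained sketch of that retry-with-exponential-backoff pattern; the helper name and arguments are illustrative, not part of app.py.

```python
import logging
import time


def retry_with_backoff(operation, max_attempts=3, backoff_factor=1.5):
    """Call `operation` until it succeeds, sleeping backoff_factor**attempt between failures."""
    attempt = 0
    while attempt < max_attempts:
        try:
            return operation()
        except Exception as e:
            wait_time = backoff_factor**attempt  # same backoff formula as in download_dataset
            logging.error(f"Attempt {attempt + 1} failed: {e}, retrying in {wait_time}s")
            time.sleep(wait_time)
            attempt += 1
    raise Exception(f"Operation failed after {max_attempts} attempts")
```
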
src/display/utils.py CHANGED
@@ -7,7 +7,8 @@ import pandas as pd
 
 
 # Configure logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
 
 def parse_datetime(datetime_str):
     formats = [
@@ -15,7 +16,7 @@ def parse_datetime(datetime_str):
         "%Y-%m-%dT%H:%M:%S.%f", # Standard format with colons
         "%Y-%m-%dT%H %M %S.%f", # Spaces as separator
     ]
-
+
     for fmt in formats:
         try:
             return datetime.strptime(datetime_str, fmt)
@@ -25,6 +26,7 @@ def parse_datetime(datetime_str):
     logging.error(f"No valid date format found for: {datetime_str}")
     return datetime(1970, 1, 1)
 
+
 def load_json_data(file_path):
     """Safely load JSON data from a file."""
     try:
@@ -98,7 +100,6 @@ auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
 
-
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
     model = ColumnContent("model", "markdown", True)
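
For context on the `parse_datetime` hunks above: the function tries each accepted format string in turn and falls back to the Unix epoch when none match. A minimal sketch of that behaviour, assuming the usual try/except-ValueError loop body (only the two formats visible in the diff are shown):

```python
from datetime import datetime


def parse_datetime(datetime_str):
    formats = [
        "%Y-%m-%dT%H:%M:%S.%f",  # standard format with colons
        "%Y-%m-%dT%H %M %S.%f",  # spaces as separator
    ]
    for fmt in formats:
        try:
            return datetime.strptime(datetime_str, fmt)
        except ValueError:
            continue
    # Sentinel value when no format matches, as in the module above
    return datetime(1970, 1, 1)


print(parse_datetime("2024-03-01T12:30:45.123456"))  # 2024-03-01 12:30:45.123456
print(parse_datetime("not a date"))                  # 1970-01-01 00:00:00
```
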
src/envs.py CHANGED
@@ -1,6 +1,4 @@
 import os
-import logging
-
 from huggingface_hub import HfApi
 
 # clone / pull the lmeh eval data
src/leaderboard/filter_models.py CHANGED
@@ -137,9 +137,9 @@ def flag_models(leaderboard_data: list[dict]):
         if model_data[AutoEvalColumn.not_flagged.name]:
             flag_key = model_data[AutoEvalColumn.fullname.name]
         else:
-        # Merges and moes are flagged
+            # Merges and moes are flagged
             flag_key = "merged"
-
+
         # Reverse the logic: Check for non-flagged models instead
         if flag_key in FLAGGED_MODELS:
             issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
@@ -171,4 +171,3 @@ def remove_forbidden_models(leaderboard_data: list[dict]):
 def filter_models_flags(leaderboard_data: list[dict]):
     leaderboard_data = remove_forbidden_models(leaderboard_data)
     flag_models(leaderboard_data)
-
 
src/leaderboard/read_evals.py CHANGED
@@ -16,36 +16,36 @@ from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime
 
 # Configure logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
 
 @dataclass
 class EvalResult:
     # Also see src.display.utils.AutoEvalColumn for what will be displayed.
-    eval_name: str # org_model_precision (uid)
-    full_model: str # org/model (path on hub)
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
     org: Optional[str]
     model: str
-    revision: str # commit hash, "" if main
+    revision: str  # commit hash, "" if main
     results: Dict[str, float]
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original
-    architecture: str = "Unknown" # From config file
+    architecture: str = "Unknown"  # From config file
     license: str = "?"
     likes: int = 0
    num_params: int = 0
-    date: str = "" # submission date of request file
+    date: str = ""  # submission date of request file
     still_on_hub: bool = True
     is_merge: bool = False
     not_flagged: bool = False
     status: str = "FINISHED"
     # List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
     tags: List[str] = field(default_factory=list)
-
-
+
     @classmethod
-    def init_from_json_file(cls, json_filepath: str) -> 'EvalResult':
-        with open(json_filepath, 'r') as fp:
+    def init_from_json_file(cls, json_filepath: str) -> "EvalResult":
+        with open(json_filepath, "r") as fp:
             data = json.load(fp)
 
         config = data.get("config_general", {})
@@ -72,7 +72,7 @@ class EvalResult:
             model=model,
             results=results,
             precision=precision,
-            revision=config.get("model_sha", "")
+            revision=config.get("model_sha", ""),
         )
 
     @staticmethod
@@ -118,9 +118,8 @@ class EvalResult:
 
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
-
-        return results
 
+        return results
 
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it."""
@@ -130,17 +129,17 @@ class EvalResult:
                 logging.warning(f"No request file for {self.org}/{self.model}")
                 self.status = "FAILED"
                 return
-
+
             with open(request_file, "r") as f:
                 request = json.load(f)
-
+
             self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
             self.weight_type = WeightType[request.get("weight_type", "Original")]
             self.num_params = int(request.get("params", 0)) # Ensuring type safety
             self.date = request.get("submitted_time", "")
             self.architecture = request.get("architectures", "Unknown")
             self.status = request.get("status", "FAILED")
-
+
         except FileNotFoundError:
             self.status = "FAILED"
             logging.error(f"Request file: {request_file} not found for {self.org}/{self.model}")
@@ -154,7 +153,6 @@ class EvalResult:
             self.status = "FAILED"
             logging.error(f"Unexpected error {e} for {self.org}/{self.model}")
 
-
     def update_with_dynamic_file_dict(self, file_dict):
         """Update object attributes based on the provided dictionary, with error handling for missing keys and type validation."""
         # Default values set for optional or potentially missing keys.
@@ -162,11 +160,10 @@ class EvalResult:
         self.likes = int(file_dict.get("likes", 0)) # Ensure likes is treated as an integer
         self.still_on_hub = file_dict.get("still_on_hub", False) # Default to False if key is missing
         self.tags = file_dict.get("tags", [])
-
+
         # Calculate `flagged` only if 'tags' is not empty and avoid calculating each time
         self.not_flagged = not (any("flagged" in tag for tag in self.tags))
 
-
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
@@ -185,8 +182,10 @@ class EvalResult:
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-            AutoEvalColumn.merged.name: not( "merge" in self.tags if self.tags else False),
-            AutoEvalColumn.moe.name: not ( ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower()) ,
+            AutoEvalColumn.merged.name: not ("merge" in self.tags if self.tags else False),
+            AutoEvalColumn.moe.name: not (
+                ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower()
+            ),
             AutoEvalColumn.not_flagged.name: self.not_flagged,
         }
 
@@ -194,16 +193,16 @@ class EvalResult:
             data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
         return data_dict
-
+
 
 def get_request_file_for_model(requests_path, model_name, precision):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
     requests_path = Path(requests_path)
     pattern = f"{model_name}_eval_request_*.json"
-
+
     # Using pathlib to find files matching the pattern
     request_files = list(requests_path.glob(pattern))
-
+
     # Sort the files by name in descending order to mimic 'reverse=True'
     request_files.sort(reverse=True)
 
@@ -214,7 +213,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
             req_content = json.load(f)
             if req_content["status"] == "FINISHED" and req_content["precision"] == precision.split(".")[-1]:
                 request_file = str(request_file)
-
+
     # Return empty string if no file found that matches criteria
     return request_file
 
@@ -223,9 +222,9 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
     """From the path of the results folder root, extract all needed info for results"""
    with open(dynamic_path) as f:
        dynamic_data = json.load(f)
-
+
    results_path = Path(results_path)
-    model_files = list(results_path.rglob('results_*.json'))
+    model_files = list(results_path.rglob("results_*.json"))
    model_files.sort(key=lambda file: parse_datetime(file.stem.removeprefix("results_")))
 
    eval_results = {}
@@ -260,4 +259,3 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
             continue
 
    return results
-
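
One detail worth calling out in the `EvalResult` dataclass above: `tags` uses `field(default_factory=list)` so that every instance gets its own fresh list, exactly as the inline comment says. A small self-contained illustration of why that matters (toy dataclass, not EvalResult itself):

```python
from dataclasses import dataclass, field
from typing import List


@dataclass
class WithFactory:
    # Each instance gets its own fresh list, as EvalResult.tags does above.
    tags: List[str] = field(default_factory=list)


a, b = WithFactory(), WithFactory()
a.tags.append("merge")
print(b.tags)  # [] -- the two lists are independent

# A bare mutable default (tags: List[str] = []) is rejected by dataclasses with a
# ValueError at class-creation time, and a plain function default like def f(tags=[])
# would silently share one list across all calls -- the pitfall the comment refers to.
```
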
 
src/populate.py CHANGED
@@ -1,5 +1,3 @@
-import json
-import os
 import pathlib
 import pandas as pd
 from src.display.formatting import has_no_nan_values, make_clickable_model
@@ -21,7 +19,7 @@ def get_evaluation_queue_df(save_path, cols):
     save_path = pathlib.Path(save_path)
     all_evals = []
 
-    for path in save_path.rglob('*.json'):
+    for path in save_path.rglob("*.json"):
         data = load_json_data(path)
         if data:
             all_evals.append(_process_model_data(data))
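
Finally, for the `get_evaluation_queue_df` hunk above: the loop recursively globs `*.json` under the save path and keeps whatever the JSON loader returns truthy. A minimal sketch of that traversal pattern; `load_json_data` here is a simplified stand-in for the project's helper of the same name, and the real code post-processes each entry via `_process_model_data`:

```python
import json
import pathlib


def load_json_data(path):
    # Stand-in for the project's load_json_data helper: return None on bad or missing JSON.
    try:
        return json.loads(pathlib.Path(path).read_text())
    except (OSError, json.JSONDecodeError):
        return None


def collect_evals(save_path):
    save_path = pathlib.Path(save_path)
    all_evals = []
    for path in save_path.rglob("*.json"):  # recursive glob, as in get_evaluation_queue_df
        data = load_json_data(path)
        if data:
            all_evals.append(data)
    return all_evals
```
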