rusticluftig committed · Commit 9b87de8 · 1 Parent(s): 20e459a

Add benchmark data to the LB

Files changed (3)
  1. app.py +76 -43
  2. requirements.txt +1 -0
  3. utils.py +94 -163
app.py CHANGED

@@ -7,6 +7,7 @@ import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler
from dotenv import load_dotenv
from huggingface_hub import HfApi
+ import matplotlib.pyplot as plt

import competitions
import utils
@@ -54,9 +55,8 @@ def main():
vali_runs = state_vars["vali_runs"]
scores = state_vars["scores"]
validator_df = state_vars["validator_df"]
- benchmarks = state_vars.get("benchmarks", None)
- benchmark_timestamp = state_vars.get("benchmark_timestamp", None)
- losses_1 = state_vars["losses_1"]
+ benchmarks_df = state_vars["benchmarks_df"]
+ benchmarks_targets = state_vars["benchmarks_targets"]
losses_2 = state_vars["losses_2"]

demo = gr.Blocks(css=".typewriter {font-family: 'JMH Typewriter', sans-serif;}")
@@ -74,51 +74,44 @@ def main():
},
num_top_classes=10,
)
- if benchmarks is not None:
- with gr.Accordion("Top Model Benchmarks"):
- gr.components.Dataframe(benchmarks)
- gr.HTML("""<div>PPL computed using a stride of 512. See <a href='https://github.com/macrocosm-os/finetuning/blob/dev/scripts/run_benchmarks.py'>here</a> for the full code.</div>""")
- gr.HTML(f"""<div>Last Updated: {benchmark_timestamp.strftime("%Y-%m-%d %H:%M:%S")} (UTC)</div>""")

with gr.Accordion("Competition Results"):
gr.HTML(EVALUATION_HEADER)
show_stale = gr.Checkbox(label="Show Stale", interactive=True)
competition_leaderboards = []
- comp_1 = competitions.COMPETITION_DETAILS[1]
- with gr.Accordion(f"{comp_1.name} Competition"):
- gr.HTML(comp_1.html_description)
- competition_leaderboards.append(gr.components.Dataframe(
- value=utils.leaderboard_data(model_data, scores, 1, show_stale.value),
- headers=["Name", "Win Rate", "Average Loss", "Weight", "UID", "Block"],
- datatype=["markdown", "number", "number", "number", "number", "number"],
- elem_id="comp1-table",
- interactive=False,
- visible=True,
- ))
- gr.LinePlot(
- losses_1,
- x="timestamp",
- x_title="Date",
- y="losses",
- y_title="Average Loss",
- interactive=True,
- visible=True,
- width=1024,
- title="Best Average Loss Over Time",
- )
comp_2 = competitions.COMPETITION_DETAILS[2]
# Covert the losses into % of correct answers.
- losses_2["losses"] = losses_2["losses"].apply(lambda x: 1 - x if x else None)
+ losses_2["losses"] = losses_2["losses"].apply(
+ lambda x: 1 - x if x else None
+ )
with gr.Accordion(f"{comp_2.name} Competition"):
gr.HTML(comp_2.html_description)
- competition_leaderboards.append(gr.components.Dataframe(
- value=utils.leaderboard_data(model_data, scores, 2, show_stale.value),
- headers=["Name", "Win Rate", "MC Score", "Weight", "UID", "Block"],
- datatype=["markdown", "number", "number", "number", "number", "number"],
- elem_id="comp2-table",
- interactive=False,
- visible=True,
- ))
+ competition_leaderboards.append(
+ gr.components.Dataframe(
+ value=utils.leaderboard_data(
+ model_data, scores, 2, show_stale.value
+ ),
+ headers=[
+ "Name",
+ "Win Rate",
+ "MC Score",
+ "Weight",
+ "UID",
+ "Block",
+ ],
+ datatype=[
+ "markdown",
+ "number",
+ "number",
+ "number",
+ "number",
+ "number",
+ ],
+ elem_id="comp2-table",
+ interactive=False,
+ visible=True,
+ )
+ )
gr.LinePlot(
losses_2,
x="timestamp",
@@ -130,19 +123,59 @@ def main():
width=1024,
title="Best MC Score Over Time",
)
- gr.HTML("""
+ gr.HTML(
+ """
<ul><li><b>Name:</b> the 🤗 Hugging Face repo (click to go to the model card)</li>
<li><b>Win Rate:</b> % of head-to-head evals won vs. other eval'd models, given an epsilon advantage or disadvantage</li>
<li><b>Average Loss:</b> the last loss value on the evaluation data for the model as calculated by the OTF validator (lower is better)</li>
<li><b>MC Score:</b> the % of correct multiple choice answers given by the model as calculated by the OTF validator (higher is better)</li>
<li><b>UID:</b> the Bittensor UID of the miner</li>
<li><b>Weight:</b> the bittensor weight set for this model</li>
- <li><b>Block:</b> the Bittensor block that the model was submitted in</li></ul><br/>More stats on <a href="https://taostats.io/subnets/netuid-37/" target="_blank">taostats</a>.""")
+ <li><b>Block:</b> the Bittensor block that the model was submitted in</li></ul><br/>More stats on <a href="https://taostats.io/subnets/netuid-37/" target="_blank">taostats</a>."""
+ )
show_stale.change(
- lambda stale: [utils.leaderboard_data(model_data, scores, 1, stale), utils.leaderboard_data(model_data, scores, 2, stale)],
+ lambda stale: [utils.leaderboard_data(model_data, scores, 2, stale)],
inputs=[show_stale],
outputs=competition_leaderboards,
- )
+ )
+
+ if benchmarks_df is not None:
+
+ def create_benchmark_plot(benchmark: str):
+ fig = plt.figure(figsize=(10, 8))
+
+ plt.plot(benchmarks_df["timestamp"], benchmarks_df[benchmark])
+
+ # Adding horizontal dotted lines for various benchmark targets (well-known models)
+ for model, score in benchmarks_targets[benchmark].items():
+ plt.axhline(y=score, linestyle="--", label=f"{model}")
+ plt.text(
+ benchmarks_df["timestamp"].max(),
+ score,
+ f"{model}",
+ va="center",
+ ha="right",
+ backgroundcolor="white",
+ )
+
+ # Adding labels and title
+ plt.ylabel(benchmark.upper())
+ plt.title(f"{benchmark.upper()} Over Time")
+ plt.xticks(rotation=45)
+
+ return fig
+
+ with gr.Accordion("Top Model Benchmarks"):
+ mmlu = create_benchmark_plot("mmlu")
+ mmlu_pro = create_benchmark_plot("mmlu_pro")
+ gr.Plot(mmlu)
+ gr.Plot(mmlu_pro)
+ gr.HTML(
+ """<div>Benchmarks computed using <a href='https://github.com/EleutherAI/lm-evaluation-harness'>lm-eval harness</a></div>"""
+ )
+ gr.HTML(
+ """<ul><li>MMLU: Raw score</li><li>MMLU Pro: Normalized score using <a href='https://huggingface.co/docs/leaderboards/open_llm_leaderboard/normalization'>this</a> method</li></ul>"""
+ )

with gr.Accordion("Validator Stats"):
gr.components.Dataframe(
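
Note (not part of the commit): the new benchmark accordion assumes benchmarks_df is a pandas DataFrame with a timestamp column plus one column per benchmark, and benchmarks_targets maps each benchmark name to reference scores for well-known models; both come from utils.get_benchmarks() below. A minimal runnable sketch of those shapes, with hypothetical placeholder scores (only the Llama-3.1-8B-Instruct targets are taken from the diff):

# Minimal sketch of the data create_benchmark_plot expects. The per-model
# scores below are hypothetical placeholders, not real benchmark results.
import datetime

import matplotlib.pyplot as plt
import pandas as pd

benchmarks_df = pd.DataFrame(
    {
        "timestamp": [datetime.datetime(2024, 8, 1), datetime.datetime(2024, 8, 8)],
        "uid": [253, 101],  # hypothetical miner UIDs
        "model": ["org/model-a", "org/model-b"],  # hypothetical repos
        "mmlu": [0.60, 0.63],  # raw MMLU accuracy of the top model over time
        "mmlu_pro": [25.0, 27.5],  # normalized MMLU-Pro score
    }
)
benchmarks_targets = {
    "mmlu": {"Llama-3.1-8B-Instruct": 0.681},
    "mmlu_pro": {"Llama-3.1-8B-Instruct": 30.68},
}


def create_benchmark_plot(benchmark: str):
    # Trimmed version of the helper added in app.py: the top model's score over
    # time, plus dashed horizontal reference lines for well-known models.
    fig = plt.figure(figsize=(10, 8))
    plt.plot(benchmarks_df["timestamp"], benchmarks_df[benchmark])
    for model, score in benchmarks_targets[benchmark].items():
        plt.axhline(y=score, linestyle="--", label=model)
    plt.ylabel(benchmark.upper())
    plt.title(f"{benchmark.upper()} Over Time")
    return fig


fig = create_benchmark_plot("mmlu")  # app.py hands the figure to gr.Plot(fig)

Drawing the targets with plt.axhline keeps the top model's trajectory directly comparable to fixed baselines on the same axes.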
requirements.txt CHANGED

@@ -8,4 +8,5 @@ huggingface-hub
gradio
pandas
flask
+ matplotlib

utils.py CHANGED

@@ -15,7 +15,7 @@ import pandas as pd
import wandb
from bittensor.extrinsics.serving import get_metadata
from dotenv import load_dotenv
- from wandb.apis.public.history import HistoryScan
+ from wandb.apis.public.history import SampledHistoryScan

NETUID = 37
DELAY_SECS = 3
@@ -26,8 +26,7 @@ load_dotenv()
WANDB_TOKEN = os.environ.get("WANDB_API_KEY", None)
SUBTENSOR_ENDPOINT = os.environ.get("SUBTENSOR_ENDPOINT", None)
VALIDATOR_WANDB_PROJECT = "rusticluftig/finetuning"
- BENCHMARK_WANDB_PROJECT = ""
- BENCHMARK_FLAG = os.environ.get("BENCHMARK_FLAG", None)
+ BENCHMARK_WANDB_PROJECT = "rusticluftig/test-benchmarks"


@dataclass(frozen=True)
@@ -146,19 +145,26 @@ def get_subnet_data(
return result


- def get_wandb_runs(project: str, filters: Dict[str, Any]) -> List:
+ def get_wandb_runs(
+ project: str, filters: Dict[str, Any], order: str = "-created_at"
+ ) -> List:
"""Get the latest runs from Wandb, retrying infinitely until we get them.

+ Args:
+ project (str): The Wandb project to get runs from.
+ filters (Dict[str, Any]): Filters to apply to the runs.
+ order (str): Order to sort the runs by. Defaults to "-created_at" (newest first)
+
Returns:
- List: List of runs matching the provided filters, newest run (by creation time) first.
+ List: List of runs matching the provided filters
"""
while True:
- api = wandb.Api(api_key=WANDB_TOKEN)
+ api = wandb.Api(api_key=WANDB_TOKEN, timeout=100)
runs = list(
api.runs(
project,
filters=filters,
- order="-created_at",
+ order=order,
)
)
if len(runs) > 0:
@@ -178,12 +184,13 @@ def get_scores(
uids (List[int]): List of UIDs to get scores for.
wandb_runs (List): List of validator runs from Wandb. Requires the runs are provided in descending order.
"""
+
def _maybe_convert_loss(loss: float, comp_id: int) -> float:
"""Converts loss to score for competitions that require it."""
if comp_id == 2:
return 1 - loss if loss else None
return loss
-
+
result = {}
previous_timestamp = None
seen_competitions = set()
@@ -209,7 +216,9 @@
# Only the most recent run per competition is fresh.
is_fresh = comp_id not in seen_competitions
result[uid] = {
- "avg_loss": _maybe_convert_loss(uid_data.get("average_loss", None), comp_id),
+ "avg_loss": _maybe_convert_loss(
+ uid_data.get("average_loss", None), comp_id
+ ),
"win_rate": uid_data.get("win_rate", None),
"win_total": uid_data.get("win_total", None),
"weight": uid_data.get("weight", None),
@@ -245,32 +254,35 @@ def get_losses_over_time(wandb_runs: List, competition_id: int) -> pd.DataFrame:
"""Returns a dataframe of the best average model loss over time."""
timestamps = []
losses = []
-
+
for run in wandb_runs:
# For each run, check the 10 most recent steps.
best_loss = math.inf
should_add_datapoint = False
min_step = max(0, run.lastHistoryStep - 10)
- history_scan = HistoryScan(
- run.client, run, min_step, run.lastHistoryStep, page_size=10
+ history_scan = SampledHistoryScan(
+ run.client,
+ run,
+ ["original_format_json"],
+ min_step,
+ run.lastHistoryStep,
+ page_size=10,
)
max_timestamp = None
for step in history_scan:
- if "original_format_json" not in step:
- continue
data = json.loads(step["original_format_json"])
all_uid_data = data["uid_data"]
timestamp = datetime.datetime.fromtimestamp(data["timestamp"])
if max_timestamp is None:
max_timestamp = timestamp
max_timestamp = max(max_timestamp, timestamp)
-
+
for _, uid_data in all_uid_data.items():
loss = uid_data.get("average_loss", math.inf)
c_id = uid_data.get("competition_id", None)
if c_id is None or c_id != competition_id:
continue
-
+
if loss < best_loss:
best_loss = loss
should_add_datapoint = True
@@ -278,15 +290,8 @@ def get_losses_over_time(wandb_runs: List, competition_id: int) -> pd.DataFrame:
if should_add_datapoint:
timestamps.append(max_timestamp)
losses.append(best_loss)
-
- return pd.DataFrame({"timestamp": timestamps, "losses": losses })

+ return pd.DataFrame({"timestamp": timestamps, "losses": losses})
- def next_epoch(subtensor: bt.subtensor, block: int) -> int:
- return (
- block
- + subtensor.get_subnet_hyperparameters(NETUID).tempo
- - subtensor.blocks_since_epoch(NETUID, block)
- )


def is_floatable(x) -> bool:
@@ -321,26 +326,65 @@ def leaderboard_data(
c.block,
]
for c in leaderboard
- if c.competition_id == competition_id and ((c.uid in scores and scores[c.uid]["fresh"]) or show_stale)
+ if c.competition_id == competition_id
+ and ((c.uid in scores and scores[c.uid]["fresh"]) or show_stale)
]


- def get_benchmarks() -> Tuple[pd.DataFrame, datetime.datetime]:
+ def get_benchmarks() -> Tuple[pd.DataFrame, Dict[str, Dict[str, float]]]:
"""Returns the latest benchmarks and the time they were run."""
if not BENCHMARK_WANDB_PROJECT:
print("No benchmark project set.")
return None, None
- runs = get_wandb_runs(project=BENCHMARK_WANDB_PROJECT, filters=None)
+ runs = get_wandb_runs(
+ project=BENCHMARK_WANDB_PROJECT, filters=None, order="+created_at"
+ )
+ timestamps, uids, models, mmlu, mmlu_pro = [], [], [], [], []
for run in runs:
- artifacts = list(run.logged_artifacts())
- if artifacts:
- table = artifacts[-1].get("benchmarks")
- if table:
- return table.get_dataframe(), datetime.datetime.strptime(
- run.metadata["startedAt"], "%Y-%m-%dT%H:%M:%S.%f"
- )
- print("Failed to get benchmarks from Wandb.")
- return None, None
+ uid = run.config.get("uid", None)
+ model = run.config.get("model", None)
+ if not uid or not model:
+ continue
+ samples = list(
+ SampledHistoryScan(
+ run.client,
+ run,
+ ["_timestamp", "mmlu.acc,none", "mmlu_pro"],
+ 0,
+ 1,
+ )
+ )
+ if not samples:
+ continue
+ sample = samples[0]
+ timestamps.append(datetime.datetime.fromtimestamp(sample["_timestamp"]))
+ mmlu.append(sample["mmlu.acc,none"])
+ mmlu_pro.append(sample["mmlu_pro"])
+ uids.append(uid)
+ models.append(model)
+ return (
+ pd.DataFrame(
+ {
+ "timestamp": timestamps,
+ "uid": uids,
+ "model": models,
+ "mmlu": mmlu,
+ "mmlu_pro": mmlu_pro,
+ }
+ ),
+ {
+ "mmlu": {
+ "Llama-3.1-8B-Instruct": 0.681,
+ "Mistral-7B-Instruct-v0.3": 0.597,
+ "gemma-2-9b-it": 0.719,
+ },
+ "mmlu_pro": {
+ "Llama-3.1-8B-Instruct": 30.68,
+ "Mistral-7B-Instruct-v0.3": 23.06,
+ "gemma-2-9b-it": 31.95,
+ },
+ },
+ )


def make_validator_dataframe(
@@ -406,31 +450,32 @@ def load_state_vars() -> dict[Any]:

model_data: List[ModelData] = get_subnet_data(subtensor, metagraph)
model_data.sort(key=lambda x: x.incentive, reverse=True)
-
print(f"Loaded {len(model_data)} models")
+
vali_runs = get_wandb_runs(
- project=VALIDATOR_WANDB_PROJECT,
- filters={"$and": [{"config.type": "validator"}], "$or": [{"config.uid": 28}, {"config.uid": 16}]},
+ project=VALIDATOR_WANDB_PROJECT,
+ filters={
+ "$and": [{"config.type": "validator"}],
+ "$or": [{"config.uid": 28}, {"config.uid": 16}],
+ },
)
-
print(f"Loaded {len(vali_runs)} validator runs")

scores = get_scores([x.uid for x in model_data], vali_runs)
+ print(f"Loaded {len(scores)} scores")

validator_df = get_validator_weights(metagraph)
weight_keys = set()
for uid, stats in validator_df.items():
weight_keys.update(stats[-1].keys())
-
+ print("Loaded validator weights")
+
# Compute loss over time for all competitions.
- losses_1 = get_losses_over_time(vali_runs, 1)
losses_2 = get_losses_over_time(vali_runs, 2)
+ print("Loaded losses over time for comp 2")

- # Enable benchmark if the flag is set
- if BENCHMARK_FLAG:
- benchmarks, benchmark_timestamp = get_benchmarks()
- else:
- benchmarks, benchmark_timestamp = None, None
+ benchmarks_df, benchmarks_targets = get_benchmarks()
+ print("Loaded benchmarks")
break

except KeyboardInterrupt:
@@ -447,121 +492,7 @@ def load_state_vars() -> dict[Any]:
"vali_runs": vali_runs,
"scores": scores,
"validator_df": validator_df,
- "benchmarks": benchmarks,
- "benchmark_timestamp": benchmark_timestamp,
- "losses_1": losses_1,
+ "benchmarks_df": benchmarks_df,
+ "benchmarks_targets": benchmarks_targets,
"losses_2": losses_2,
}
-
-
- def test_load_state_vars():
- # TODO: Change to finetuning data.
- subtensor = bt.subtensor("finney")
- metagraph = subtensor.metagraph(NETUID, lite=True)
- model_data = [
- ModelData(
- uid=253,
- hotkey="5DjoPAgZ54Zf6NsuiVYh8RjonnWWWREE2iXBNzM2VDBMQDPm",
- namespace="jw-hf-test",
- name="jw2",
- commit="aad131f6b02219964e6dcf749c2a23e75a7ceca8",
- secure_hash="L1ImYzWJwV+9KSnZ2TYW0Iy2KMcVjJVTd30YJoRkpbw=",
- block=3131103,
- incentive=1.0,
- emission=209.06051635742188,
- ),
- ModelData(
- uid=1,
- hotkey="5CccVtjk4yamCao6QYgEg7jc8vktdj16RbLKNUftHfEsjuJS",
- namespace="borggAI",
- name="bittensor-subnet9-models",
- commit="d373864bc6c972872edb8db95eed570958054bac",
- secure_hash="+drdTIKYEGYClW2FFVVID6A2Dh//4rLmExRFCJsH6Y4=",
- block=2081837,
- incentive=0.0,
- emission=0.0,
- ),
- ModelData(
- uid=2,
- hotkey="5HYwoXaczs3jAptbb5mk4aUCkgZqeNcNzJKxSec97GwasfLy",
- namespace="jungiebeen",
- name="pretrain1",
- commit="4c0c6bfd0f92e243d6c8a82209142e7204c852c3",
- secure_hash="ld/agc0XIWICom/Cpj0fkQLcMogMNj/F65MJogK5RLY=",
- block=2467482,
- incentive=0.0,
- emission=0.0,
- ),
- ModelData(
- uid=3,
- hotkey="5Dnb6edh9yTeEp5aasRPZVPRAkxvQ6qnERVcXw22awMZ5rxm",
- namespace="jungiebeen",
- name="pretrain2",
- commit="e827b7281c92224adb11124489cc45356553a87a",
- secure_hash="ld/agc0XIWICom/Cpj0fkQLcMogMNj/F65MJogK5RLY=",
- block=2467497,
- incentive=0.0,
- emission=0.0,
- ),
- ModelData(
- uid=4,
- hotkey="5FRfca8NbnH424WaX43PMhKBnbLA1bZpRRoXXiVs6HgsxN4K",
- namespace="ZainAli60",
- name="mine_modeles",
- commit="8a4ed4ad1f1fb58d424fd22e8e9874b87d32917c",
- secure_hash="tVcbZAFoNIOF+Ntxq31OQ2NrLXf5iFCmmPUJlpkMYYo=",
- block=2508509,
- incentive=0.0,
- emission=0.0,
- ),
- ]
- vali_runs = get_wandb_runs(
- project=VALIDATOR_WANDB_PROJECT,
- filters={"config.type": "validator", "config.uid": 238},
- )
-
- scores = get_scores([x.uid for x in model_data], vali_runs)
-
- validator_df = {
- 28: (1.0, 33273.4453125, {253: 1.0}),
- 49: (
- 0.9127794504165649,
- 10401.677734375,
- {
- 7: 0.0867,
- 217: 0.0001,
- 219: 0.0001,
- 241: 0.0001,
- 248: 0.0001,
- 253: 0.9128,
- },
- ),
- 78: (1.0, 26730.37109375, {253: 1.0}),
- 116: (1.0, 629248.4375, {253: 1.0}),
- 150: (1.0, 272634.53125, {253: 1.0}),
- 161: (1.0, 280212.53125, {253: 1.0}),
- 180: (1.0, 16838.0, {253: 1.0}),
- 184: (1.0, 47969.3984375, {253: 1.0}),
- 210: (1.0, 262846.28125, {253: 1.0}),
- 213: (1.0, 119462.734375, {253: 1.0}),
- 215: (1.0, 274747.46875, {253: 1.0}),
- 234: (1.0, 38831.6953125, {253: 1.0}),
- 236: (1.0, 183966.9375, {253: 1.0}),
- 238: (1.0, 1293707.25, {253: 1.0}),
- 240: (1.0, 106461.6015625, {253: 1.0}),
- 243: (1.0, 320271.5, {253: 1.0}),
- 244: (1.0, 116138.9609375, {253: 1.0}),
- 247: (0.9527428150177002, 119812.390625, {7: 0.0472, 253: 0.9528}),
- 249: (1.0, 478127.3125, {253: 1.0}),
- 252: (1.0, 442395.03125, {253: 1.0}),
- 254: (1.0, 46845.2109375, {253: 1.0}),
- 255: (1.0, 28977.56640625, {253: 1.0}),
- }
-
- return {
- "metagraph": metagraph,
- "model_data": model_data,
- "vali_runs": vali_runs,
- "scores": scores,
- "validator_df": validator_df,
- }
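
Note on the W&B access pattern (not part of the commit): both the reworked get_losses_over_time and the new get_benchmarks read run history through SampledHistoryScan and fetch runs via the extended get_wandb_runs, whose new order argument stays "-created_at" for validator runs but is "+created_at" for benchmark runs so the points come back oldest-first, ready for plotting. A hedged sketch of that read path, assuming the wandb internals keep the positional signature (client, run, keys, min_step, max_step) used in the diff above:

# Sketch only: mirrors how get_benchmarks() reads one logged row per benchmark run.
# SampledHistoryScan is an internal wandb API; its signature here is assumed to
# match the usage in this commit rather than documented public behavior.
import datetime
import os

import wandb
from wandb.apis.public.history import SampledHistoryScan

api = wandb.Api(api_key=os.environ.get("WANDB_API_KEY"), timeout=100)
# Oldest-first, so the collected points are already in chronological order.
runs = api.runs("rusticluftig/test-benchmarks", filters=None, order="+created_at")

for run in runs:
    samples = list(
        SampledHistoryScan(
            run.client,
            run,
            ["_timestamp", "mmlu.acc,none", "mmlu_pro"],  # keys logged by the benchmark script
            0,
            1,  # only the first history step is needed per run
        )
    )
    if not samples:
        continue
    sample = samples[0]
    print(
        datetime.datetime.fromtimestamp(sample["_timestamp"]),
        sample["mmlu.acc,none"],
        sample["mmlu_pro"],
    )

The previous get_benchmarks pulled a logged artifact table per run; reading a single sampled history row per run is a lighter-weight way to rebuild the benchmark time series for the leaderboard.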