rusticluftig committed · Commit 191e77c · 1 Parent(s): 1681195

Add leaderboard for competition 2

Files changed (4):
  1. .gitignore (+2 -1)
  2. app.py (+58 -34)
  3. competitions.py (+5 -0)
  4. utils.py (+21 -29)
.gitignore CHANGED
@@ -1,3 +1,4 @@
.venv
__pycache__/
-.env
+.env
+**.ipynb
app.py CHANGED
@@ -1,13 +1,12 @@
# Code adapted from: https://huggingface.co/spaces/RaoFoundation/pretraining-leaderboard/blob/main/app.py

-import os
import datetime
-from typing import Dict
-import gradio as gr
+import os

+import gradio as gr
+from apscheduler.schedulers.background import BackgroundScheduler
from dotenv import load_dotenv
from huggingface_hub import HfApi
-from apscheduler.schedulers.background import BackgroundScheduler

import competitions
import utils
@@ -17,8 +16,7 @@ FONT = (
)
TITLE = """<h1 align="center" id="space-title" class="typewriter">Finetuning Subnet Leaderboard</h1>"""
HEADER = """<h2 align="center" class="typewriter"><a href="https://github.com/macrocosm-os/finetuning" target="_blank">Finetuning</a> is a <a href="https://bittensor.com/" target="_blank">Bittensor</a> subnet that rewards miners for producing finetuned models in defined competitions. The model with the best head-to-head score in each competition receive a steady emission of TAO.</h3>"""
-EVALUATION_DETAILS = """<ul><li><b>Name:</b> the 🤗 Hugging Face model name (click to go to the model card)</li><li><b>Rewards / Day:</b> the expected rewards per day based on current ranking.</li><li><b>Last Average Loss:</b> the last loss value on the evaluation data for the model as calculated by a validator (lower is better)</li><li><b>UID:</b> the Bittensor UID of the miner</li><li><b>Block:</b> the Bittensor block that the model was submitted in</li></ul><br/>More stats on <a href="https://taostats.io/subnets/netuid-37/" target="_blank">taostats</a>."""
-EVALUATION_HEADER = """<h3 align="center">Shows the latest internal evaluation statistics as calculated by the Opentensor validator</h3>"""
+EVALUATION_HEADER = """<h3 align="center">Shows the latest per-competition evaluation statistics as calculated by the Taoverse validator</h3>"""

HF_REPO_ID = "macrocosm-os/finetuning-leaderboard"
SECONDS_PER_BLOCK = 12
@@ -65,11 +63,8 @@ def main():
        gr.HTML(TITLE)
        gr.HTML(HEADER)

-        # TODO: Re-enable once ""SubtensorModule.BlocksSinceEpoch" not found" issue is resolved.
-        # gr.HTML(value=get_next_update_div(current_block, next_epoch_block))
-
-        # TODO: Figure out the best approach to showing the per competition rewards.
        gr.Label(
+            label="Emissions",
            value={
                f"{c.namespace}/{c.name} ({c.commit[0:8]}) · (τ{round(c.emission, 2):,})": c.incentive
                for c in model_data
@@ -83,42 +78,71 @@ def main():
            gr.HTML("""<div>PPL computed using a stride of 512. See <a href='https://github.com/macrocosm-os/finetuning/blob/dev/scripts/run_benchmarks.py'>here</a> for the full code.</div>""")
            gr.HTML(f"""<div>Last Updated: {benchmark_timestamp.strftime("%Y-%m-%d %H:%M:%S")} (UTC)</div>""")

-        with gr.Accordion("Evaluation Stats"):
+        with gr.Accordion("Competition Results"):
            gr.HTML(EVALUATION_HEADER)
            show_stale = gr.Checkbox(label="Show Stale", interactive=True)
            competition_leaderboards = []
-            # TODO: Dynamically generate per-competition leaderboards based on model_data.
-            competition_details = competitions.COMPETITION_DETAILS[1]
-            with gr.Accordion(f"{competition_details.name} competition"):
-                gr.HTML(competition_details.html_description)
+            losses_1 = utils.get_losses_over_time(vali_runs, 1)
+            comp_1 = competitions.COMPETITION_DETAILS[1]
+            with gr.Accordion(f"{comp_1.name} Competition"):
+                gr.HTML(comp_1.html_description)
                competition_leaderboards.append(gr.components.Dataframe(
-                    value=utils.leaderboard_data(model_data, scores, show_stale.value),
+                    value=utils.leaderboard_data(model_data, scores, 1, show_stale.value),
                    headers=["Name", "Win Rate", "Average Loss", "Weight", "UID", "Block"],
                    datatype=["markdown", "number", "number", "number", "number", "number"],
-                    elem_id="leaderboard-table",
+                    elem_id="comp1-table",
+                    interactive=False,
+                    visible=True,
+                ))
+                gr.LinePlot(
+                    losses_1,
+                    x="timestamp",
+                    x_title="Date",
+                    y="losses",
+                    y_title="Average Loss",
+                    interactive=True,
+                    visible=True,
+                    width=1024,
+                    title="Best Average Loss Over Time",
+                )
+            comp_2 = competitions.COMPETITION_DETAILS[2]
+            losses_2 = utils.get_losses_over_time(vali_runs, 2)
+            # Covert the losses into % of correct answers.
+            losses_2["losses"] = losses_2["losses"].apply(lambda x: 1 - x if x else None)
+            with gr.Accordion(f"{comp_2.name} Competition"):
+                gr.HTML(comp_2.html_description)
+                competition_leaderboards.append(gr.components.Dataframe(
+                    value=utils.leaderboard_data(model_data, scores, 2, show_stale.value),
+                    headers=["Name", "Win Rate", "MC Score", "Weight", "UID", "Block"],
+                    datatype=["markdown", "number", "number", "number", "number", "number"],
+                    elem_id="comp2-table",
                    interactive=False,
                    visible=True,
                ))
-            gr.HTML(EVALUATION_DETAILS)
+                gr.LinePlot(
+                    losses_2,
+                    x="timestamp",
+                    x_title="Date",
+                    y="losses",
+                    y_title="MC Score",
+                    interactive=True,
+                    visible=True,
+                    width=1024,
+                    title="Best MC Score Over Time",
+                )
+            gr.HTML("""
+                <ul><li><b>Name:</b> the 🤗 Hugging Face repo (click to go to the model card)</li>
+                <li><b>Win Rate:</b> % of head-to-head evals won vs. other eval'd models, given an epsilon advantage or disadvantage</li>
+                <li><b>Average Loss:</b> the last loss value on the evaluation data for the model as calculated by the OTF validator (lower is better)</li>
+                <li><b>MC Score:</b> the % of correct multiple choice answers given by the model as calculated by the OTF validator (higher is better)</li>
+                <li><b>UID:</b> the Bittensor UID of the miner</li>
+                <li><b>Weight:</b> the bittensor weight set for this model</li>
+                <li><b>Block:</b> the Bittensor block that the model was submitted in</li></ul><br/>More stats on <a href="https://taostats.io/subnets/netuid-37/" target="_blank">taostats</a>.""")
            show_stale.change(
-                lambda stale: utils.leaderboard_data(model_data, scores, stale),
+                lambda stale: [utils.leaderboard_data(model_data, scores, 1, stale), utils.leaderboard_data(model_data, scores, 2, stale)],
                inputs=[show_stale],
                outputs=competition_leaderboards,
-            )
-
-            # TODO: Make this a multi-competition line plot
-            gr.LinePlot(
-                utils.get_losses_over_time(vali_runs),
-                x="timestamp",
-                x_title="Date",
-                y="SN9_MODEL",
-                y_title="Average Loss",
-                tooltip="SN9_MODEL",
-                interactive=True,
-                visible=True,
-                width=1024,
-                title="Best Average Loss Over Time",
-            )
+            )

        with gr.Accordion("Validator Stats"):
            gr.components.Dataframe(
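A note on the competition 2 plot added above: the validator logs an "average_loss" for every competition, and for the multiple-choice competition app.py treats that value as an error rate and flips it into a score before plotting. Below is a minimal, self-contained sketch of that transform on a toy DataFrame; the timestamps and loss values are invented, and the real frame would come from utils.get_losses_over_time(vali_runs, 2).

import pandas as pd

# Toy stand-in for the frame returned by utils.get_losses_over_time(vali_runs, 2):
# one row per validator run, holding the best "average_loss" seen for competition 2.
losses_2 = pd.DataFrame(
    {
        "timestamp": pd.to_datetime(["2024-07-01", "2024-07-02", "2024-07-03"]),
        "losses": [0.42, 0.35, None],  # None models a run with no competition 2 data
    }
)

# The same transform app.py applies before plotting: a loss of 0.35 becomes an
# MC score of 0.65. Missing values stay missing (NaN propagates through 1 - x).
losses_2["losses"] = losses_2["losses"].apply(lambda x: 1 - x if x else None)

print(losses_2["losses"].tolist())  # approximately [0.58, 0.65, nan]

Note that the expression from the diff, `1 - x if x else None`, would also map a logged loss of exactly 0 to None rather than to a perfect score of 1.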
competitions.py CHANGED
@@ -16,5 +16,10 @@ COMPETITION_DETAILS: Dict[int, CompetitionDetails] = {
    1: CompetitionDetails(
        name="SN9_MODEL",
        html_description="""<b>Competition ID 1</b><br/>Produce the best fine-tuned model from a Subnet 9 pretrained model. Models are evaluated using synthetic prompt/response data from Subnet 18.""",
+    ),
+    2: CompetitionDetails(
+        name="General Knowledge Chat-bot",
+        # TODO: Add link to SN1 dataset details.
+        html_description="""<b>Competition ID 2</b><br/>Produce the best general knowledge chat-bot. Models are evaluated using synthetic MMLU-like dataset from Subnet 1.""",
    )
}
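For reference, app.py only ever reads two fields from these entries (name for the accordion title, html_description for the body) and looks them up by integer competition ID. The sketch below mirrors that shape; the CompetitionDetails class itself is defined earlier in competitions.py, so the dataclass here is an assumption kept to just the fields this commit relies on.

from dataclasses import dataclass
from typing import Dict


@dataclass(frozen=True)
class CompetitionDetails:
    # Assumed shape: only the two fields app.py reads in this commit.
    name: str
    html_description: str


COMPETITION_DETAILS: Dict[int, CompetitionDetails] = {
    1: CompetitionDetails(name="SN9_MODEL", html_description="<b>Competition ID 1</b> ..."),
    2: CompetitionDetails(name="General Knowledge Chat-bot", html_description="<b>Competition ID 2</b> ..."),
}

# The lookup pattern used in app.py: fetch an entry by ID and read its display fields.
comp = COMPETITION_DETAILS[2]
print(comp.name)              # General Knowledge Chat-bot
print(comp.html_description)  # <b>Competition ID 2</b> ...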
utils.py CHANGED
@@ -80,7 +80,7 @@ def run_with_retries(func, *args, **kwargs):
        try:
            return func(*args, **kwargs)
        except (Exception, RuntimeError):
-            bt.logging.error(f"Failed to run function: {traceback.format_exc()}")
+            print(f"Failed to run function: {traceback.format_exc()}")
            if i == RETRIES - 1:
                raise
            time.sleep(DELAY_SECS)
@@ -167,7 +167,7 @@ def get_wandb_runs(project: str, filters: Dict[str, Any]) -> List:
        if len(runs) > 0:
            return runs
        # WandDB API is quite unreliable. Wait another minute and try again.
-        bt.logging.error("Failed to get runs from Wandb. Trying again in 60 seconds.")
+        print("Failed to get runs from Wandb. Trying again in 60 seconds.")
        time.sleep(60)


@@ -235,14 +235,14 @@ def get_validator_weights(
    return ret


-def get_losses_over_time(wandb_runs: List) -> pd.DataFrame:
+def get_losses_over_time(wandb_runs: List, competition_id: int) -> pd.DataFrame:
    """Returns a dataframe of the best average model loss over time."""
    timestamps = []
-    datapoints_per_comp_id = {id: [] for id in competitions.COMPETITION_DETAILS}
+    losses = []

    for run in wandb_runs:
        # For each run, check the 10 most recent steps.
-        best_loss_per_competition_id = defaultdict(lambda: math.inf)
+        best_loss = math.inf
        should_add_datapoint = False
        min_step = max(0, run.lastHistoryStep - 10)
        history_scan = HistoryScan(
@@ -261,26 +261,19 @@ def get_losses_over_time(wandb_runs: List) -> pd.DataFrame:

            for _, uid_data in all_uid_data.items():
                loss = uid_data.get("average_loss", math.inf)
-                competition_id = uid_data.get("competition_id", None)
-                if not competition_id:
+                c_id = uid_data.get("competition_id", None)
+                if c_id is None or c_id != competition_id:
                    continue

-                if loss < best_loss_per_competition_id[competition_id]:
-                    best_loss_per_competition_id[competition_id] = uid_data["average_loss"]
+                if loss < best_loss:
+                    best_loss = uid_data["average_loss"]
                    should_add_datapoint = True
        # Now that we've processed the run's most recent steps, check if we should add a datapoint.
        if should_add_datapoint:
            timestamps.append(max_timestamp)
-            # Iterate through all possible competitions and add the best loss for each.
-            # Set None for any that aren't active during this run.
-            for id, losses in datapoints_per_comp_id.items():
-                losses.append(best_loss_per_competition_id.get(id, None))
-
-    # Create a dictionary of competitions to lists of losses.
-    output_columns = {competitions.COMPETITION_DETAILS[id].name: losses for id, losses in datapoints_per_comp_id.items()}
+            losses.append(best_loss)

-    return pd.DataFrame({"timestamp": timestamps, **output_columns})
-
+    return pd.DataFrame({"timestamp": timestamps, "losses": losses })

def next_epoch(subtensor: bt.subtensor, block: int) -> int:
    return (
@@ -308,6 +301,7 @@ def format_score(uid: int, scores, key) -> Optional[float]:
def leaderboard_data(
    leaderboard: List[ModelData],
    scores: Dict[int, Dict[str, Optional[float]]],
+    competition_id: int,
    show_stale: bool,
) -> List[List[Any]]:
    """Returns the leaderboard data, based on models data and UID scores."""
@@ -321,14 +315,14 @@
            c.block,
        ]
        for c in leaderboard
-        if (c.uid in scores and scores[c.uid]["fresh"]) or show_stale
+        if c.competition_id == competition_id and ((c.uid in scores and scores[c.uid]["fresh"]) or show_stale)
    ]


def get_benchmarks() -> Tuple[pd.DataFrame, datetime.datetime]:
    """Returns the latest benchmarks and the time they were run."""
    if not BENCHMARK_WANDB_PROJECT:
-        bt.logging.error("No benchmark project set.")
+        print("No benchmark project set.")
        return None, None
    runs = get_wandb_runs(project=BENCHMARK_WANDB_PROJECT, filters=None)
    for run in runs:
@@ -339,7 +333,7 @@ def get_benchmarks() -> Tuple[pd.DataFrame, datetime.datetime]:
        return table.get_dataframe(), datetime.datetime.strptime(
            run.metadata["startedAt"], "%Y-%m-%dT%H:%M:%S.%f"
        )
-    bt.logging.error("Failed to get benchmarks from Wandb.")
+    print("Failed to get benchmarks from Wandb.")
    return None, None


@@ -402,24 +396,22 @@ def load_state_vars() -> dict[Any]:
        try:
            subtensor, metagraph = get_subtensor_and_metagraph()

-            bt.logging.success("Loaded subtensor and metagraph")
+            print(f"Loaded subtensor and metagraph: {metagraph}")

            model_data: List[ModelData] = get_subnet_data(subtensor, metagraph)
            model_data.sort(key=lambda x: x.incentive, reverse=True)

-            bt.logging.success(f"Loaded {len(model_data)} models")
+            print(f"Loaded {len(model_data)} models")
            vali_runs = get_wandb_runs(
                project=VALIDATOR_WANDB_PROJECT,
-                # TODO: Update to point to the OTF vali on finetuning
                filters={"config.type": "validator", "config.uid": 28},
+                # filters={"$and": [{"config.type": "validator"}], "$or": [{"config.uid": 28}, {"config.uid": 252}]},
            )
+
+            print(f"Loaded {len(vali_runs)} validator runs")

            scores = get_scores([x.uid for x in model_data], vali_runs)

-            # TODO: Re-enable once ""SubtensorModule.BlocksSinceEpoch" not found" issue is resolved.
-            # current_block = metagraph.block.item()
-            # next_epoch_block = next_epoch(subtensor, current_block)
-
            validator_df = get_validator_weights(metagraph)
            weight_keys = set()
            for uid, stats in validator_df.items():
@@ -433,7 +425,7 @@ def load_state_vars() -> dict[Any]:
                break

        except KeyboardInterrupt:
-            bt.logging.error("Exiting...")
+            print("Exiting...")
            break

        except Exception as e:
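To make the reworked leaderboard_data filter concrete, here is a minimal sketch of just the row-selection predicate, with a stand-in for ModelData that carries only the fields the predicate touches (the real class has more). Rows are kept when the model belongs to the requested competition and either has a fresh score entry or show_stale is set.

from dataclasses import dataclass
from typing import Any, Dict, List


@dataclass
class FakeModel:
    # Stand-in for ModelData with only the fields the filter reads.
    uid: int
    competition_id: int


def select_uids(
    leaderboard: List[FakeModel],
    scores: Dict[int, Dict[str, Any]],
    competition_id: int,
    show_stale: bool,
) -> List[int]:
    """Mirrors the new predicate in utils.leaderboard_data."""
    return [
        c.uid
        for c in leaderboard
        if c.competition_id == competition_id
        and ((c.uid in scores and scores[c.uid]["fresh"]) or show_stale)
    ]


models = [FakeModel(uid=10, competition_id=1), FakeModel(uid=11, competition_id=2)]
scores = {10: {"fresh": True}, 11: {"fresh": False}}

print(select_uids(models, scores, competition_id=1, show_stale=False))  # [10]
print(select_uids(models, scores, competition_id=2, show_stale=False))  # []
print(select_uids(models, scores, competition_id=2, show_stale=True))   # [11]

The short-circuiting `c.uid in scores and scores[c.uid]["fresh"]` means models without any score entry are hidden unless "Show Stale" is checked, which matches how the two per-competition tables in app.py are refreshed.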