Commit · 191e77c
1 Parent(s): 1681195
Add leaderboard for competition 2

Files changed:
- .gitignore +2 -1
- app.py +58 -34
- competitions.py +5 -0
- utils.py +21 -29
.gitignore CHANGED
@@ -1,3 +1,4 @@
 .venv
 __pycache__/
-.env
+.env
+**.ipynb
app.py CHANGED
@@ -1,13 +1,12 @@
 # Code adapted from: https://huggingface.co/spaces/RaoFoundation/pretraining-leaderboard/blob/main/app.py
 
-import os
 import datetime
-
-import gradio as gr
+import os
 
+import gradio as gr
+from apscheduler.schedulers.background import BackgroundScheduler
 from dotenv import load_dotenv
 from huggingface_hub import HfApi
-from apscheduler.schedulers.background import BackgroundScheduler
 
 import competitions
 import utils
@@ -17,8 +16,7 @@ FONT = (
 )
 TITLE = """<h1 align="center" id="space-title" class="typewriter">Finetuning Subnet Leaderboard</h1>"""
 HEADER = """<h2 align="center" class="typewriter"><a href="https://github.com/macrocosm-os/finetuning" target="_blank">Finetuning</a> is a <a href="https://bittensor.com/" target="_blank">Bittensor</a> subnet that rewards miners for producing finetuned models in defined competitions. The model with the best head-to-head score in each competition receive a steady emission of TAO.</h3>"""
-
-EVALUATION_HEADER = """<h3 align="center">Shows the latest internal evaluation statistics as calculated by the Opentensor validator</h3>"""
+EVALUATION_HEADER = """<h3 align="center">Shows the latest per-competition evaluation statistics as calculated by the Taoverse validator</h3>"""
 
 HF_REPO_ID = "macrocosm-os/finetuning-leaderboard"
 SECONDS_PER_BLOCK = 12
@@ -65,11 +63,8 @@ def main():
 gr.HTML(TITLE)
 gr.HTML(HEADER)
 
-# TODO: Re-enable once ""SubtensorModule.BlocksSinceEpoch" not found" issue is resolved.
-# gr.HTML(value=get_next_update_div(current_block, next_epoch_block))
-
-# TODO: Figure out the best approach to showing the per competition rewards.
 gr.Label(
+label="Emissions",
 value={
 f"{c.namespace}/{c.name} ({c.commit[0:8]}) · (τ{round(c.emission, 2):,})": c.incentive
 for c in model_data
@@ -83,42 +78,71 @@ def main():
 gr.HTML("""<div>PPL computed using a stride of 512. See <a href='https://github.com/macrocosm-os/finetuning/blob/dev/scripts/run_benchmarks.py'>here</a> for the full code.</div>""")
 gr.HTML(f"""<div>Last Updated: {benchmark_timestamp.strftime("%Y-%m-%d %H:%M:%S")} (UTC)</div>""")
 
-with gr.Accordion("
+with gr.Accordion("Competition Results"):
 gr.HTML(EVALUATION_HEADER)
 show_stale = gr.Checkbox(label="Show Stale", interactive=True)
 competition_leaderboards = []
-
-
-with gr.Accordion(f"{
-gr.HTML(
+losses_1 = utils.get_losses_over_time(vali_runs, 1)
+comp_1 = competitions.COMPETITION_DETAILS[1]
+with gr.Accordion(f"{comp_1.name} Competition"):
+gr.HTML(comp_1.html_description)
 competition_leaderboards.append(gr.components.Dataframe(
-value=utils.leaderboard_data(model_data, scores, show_stale.value),
+value=utils.leaderboard_data(model_data, scores, 1, show_stale.value),
 headers=["Name", "Win Rate", "Average Loss", "Weight", "UID", "Block"],
 datatype=["markdown", "number", "number", "number", "number", "number"],
-elem_id="
+elem_id="comp1-table",
+interactive=False,
+visible=True,
+))
+gr.LinePlot(
+losses_1,
+x="timestamp",
+x_title="Date",
+y="losses",
+y_title="Average Loss",
+interactive=True,
+visible=True,
+width=1024,
+title="Best Average Loss Over Time",
+)
+comp_2 = competitions.COMPETITION_DETAILS[2]
+losses_2 = utils.get_losses_over_time(vali_runs, 2)
+# Covert the losses into % of correct answers.
+losses_2["losses"] = losses_2["losses"].apply(lambda x: 1 - x if x else None)
+with gr.Accordion(f"{comp_2.name} Competition"):
+gr.HTML(comp_2.html_description)
+competition_leaderboards.append(gr.components.Dataframe(
+value=utils.leaderboard_data(model_data, scores, 2, show_stale.value),
+headers=["Name", "Win Rate", "MC Score", "Weight", "UID", "Block"],
+datatype=["markdown", "number", "number", "number", "number", "number"],
+elem_id="comp2-table",
 interactive=False,
 visible=True,
 ))
-
+gr.LinePlot(
+losses_2,
+x="timestamp",
+x_title="Date",
+y="losses",
+y_title="MC Score",
+interactive=True,
+visible=True,
+width=1024,
+title="Best MC Score Over Time",
+)
+gr.HTML("""
+<ul><li><b>Name:</b> the 🤗 Hugging Face repo (click to go to the model card)</li>
+<li><b>Win Rate:</b> % of head-to-head evals won vs. other eval'd models, given an epsilon advantage or disadvantage</li>
+<li><b>Average Loss:</b> the last loss value on the evaluation data for the model as calculated by the OTF validator (lower is better)</li>
+<li><b>MC Score:</b> the % of correct multiple choice answers given by the model as calculated by the OTF validator (higher is better)</li>
+<li><b>UID:</b> the Bittensor UID of the miner</li>
+<li><b>Weight:</b> the bittensor weight set for this model</li>
+<li><b>Block:</b> the Bittensor block that the model was submitted in</li></ul><br/>More stats on <a href="https://taostats.io/subnets/netuid-37/" target="_blank">taostats</a>.""")
 show_stale.change(
-lambda stale: utils.leaderboard_data(model_data, scores, stale),
+lambda stale: [utils.leaderboard_data(model_data, scores, 1, stale), utils.leaderboard_data(model_data, scores, 2, stale)],
 inputs=[show_stale],
 outputs=competition_leaderboards,
-)
-
-# TODO: Make this a multi-competition line plot
-gr.LinePlot(
-utils.get_losses_over_time(vali_runs),
-x="timestamp",
-x_title="Date",
-y="SN9_MODEL",
-y_title="Average Loss",
-tooltip="SN9_MODEL",
-interactive=True,
-visible=True,
-width=1024,
-title="Best Average Loss Over Time",
-)
+)
 
 with gr.Accordion("Validator Stats"):
 gr.components.Dataframe(
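Note: the change above wires a single "Show Stale" checkbox to several per-competition tables through one change callback that returns a list of values, one per output component. The standalone sketch below illustrates that Gradio wiring with made-up placeholder data; fake_leaderboard and the sample rows are illustrative stand-ins for utils.leaderboard_data, not code from this Space.

    import gradio as gr

    # Illustrative stand-in for utils.leaderboard_data(model_data, scores, competition_id, show_stale).
    def fake_leaderboard(competition_id: int, show_stale: bool):
        rows = [
            ["model-a", 0.61, 1.92, 0.40, 12, 1000],
            ["model-b", 0.39, 2.05, 0.10, 37, 2000],
        ]
        return rows if show_stale else rows[:1]

    with gr.Blocks() as demo:
        show_stale = gr.Checkbox(label="Show Stale", interactive=True)
        leaderboards = []
        for comp_id in (1, 2):
            with gr.Accordion(f"Competition {comp_id}"):
                leaderboards.append(
                    gr.Dataframe(
                        value=fake_leaderboard(comp_id, show_stale.value),
                        headers=["Name", "Win Rate", "Score", "Weight", "UID", "Block"],
                        interactive=False,
                    )
                )
        # One callback updates every registered table: the returned list is matched
        # positionally against the `outputs` components.
        show_stale.change(
            lambda stale: [fake_leaderboard(c, stale) for c in (1, 2)],
            inputs=[show_stale],
            outputs=leaderboards,
        )

    if __name__ == "__main__":
        demo.launch()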
competitions.py CHANGED
@@ -16,5 +16,10 @@ COMPETITION_DETAILS: Dict[int, CompetitionDetails] = {
 1: CompetitionDetails(
 name="SN9_MODEL",
 html_description="""<b>Competition ID 1</b><br/>Produce the best fine-tuned model from a Subnet 9 pretrained model. Models are evaluated using synthetic prompt/response data from Subnet 18.""",
+),
+2: CompetitionDetails(
+name="General Knowledge Chat-bot",
+# TODO: Add link to SN1 dataset details.
+html_description="""<b>Competition ID 2</b><br/>Produce the best general knowledge chat-bot. Models are evaluated using synthetic MMLU-like dataset from Subnet 1.""",
 )
 }
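The diff only adds a new entry to COMPETITION_DETAILS; the class itself is defined earlier in competitions.py and is not shown in this change. Judging from the fields used here, it is presumably a small record along the lines of the sketch below (an assumption for readability, not the file's actual definition):

    from dataclasses import dataclass
    from typing import Dict

    @dataclass(frozen=True)
    class CompetitionDetails:
        # Assumed shape, inferred from usage in this diff.
        name: str              # shown in accordion titles and plot series, e.g. "SN9_MODEL"
        html_description: str  # rendered via gr.HTML() above the competition's leaderboard

    COMPETITION_DETAILS: Dict[int, CompetitionDetails] = {
        1: CompetitionDetails(name="SN9_MODEL", html_description="<b>Competition ID 1</b> ..."),
        2: CompetitionDetails(name="General Knowledge Chat-bot", html_description="<b>Competition ID 2</b> ..."),
    }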
utils.py CHANGED
@@ -80,7 +80,7 @@ def run_with_retries(func, *args, **kwargs):
 try:
 return func(*args, **kwargs)
 except (Exception, RuntimeError):
-
+print(f"Failed to run function: {traceback.format_exc()}")
 if i == RETRIES - 1:
 raise
 time.sleep(DELAY_SECS)
@@ -167,7 +167,7 @@ def get_wandb_runs(project: str, filters: Dict[str, Any]) -> List:
 if len(runs) > 0:
 return runs
 # WandDB API is quite unreliable. Wait another minute and try again.
-
+print("Failed to get runs from Wandb. Trying again in 60 seconds.")
 time.sleep(60)
 
 
@@ -235,14 +235,14 @@ def get_validator_weights(
 return ret
 
 
-def get_losses_over_time(wandb_runs: List) -> pd.DataFrame:
+def get_losses_over_time(wandb_runs: List, competition_id: int) -> pd.DataFrame:
 """Returns a dataframe of the best average model loss over time."""
 timestamps = []
-
+losses = []
 
 for run in wandb_runs:
 # For each run, check the 10 most recent steps.
-
+best_loss = math.inf
 should_add_datapoint = False
 min_step = max(0, run.lastHistoryStep - 10)
 history_scan = HistoryScan(
@@ -261,26 +261,19 @@ def get_losses_over_time(wandb_runs: List) -> pd.DataFrame:
 
 for _, uid_data in all_uid_data.items():
 loss = uid_data.get("average_loss", math.inf)
-
-if
+c_id = uid_data.get("competition_id", None)
+if c_id is None or c_id != competition_id:
 continue
 
-if loss <
-
+if loss < best_loss:
+best_loss = uid_data["average_loss"]
 should_add_datapoint = True
 # Now that we've processed the run's most recent steps, check if we should add a datapoint.
 if should_add_datapoint:
 timestamps.append(max_timestamp)
-
-# Set None for any that aren't active during this run.
-for id, losses in datapoints_per_comp_id.items():
-losses.append(best_loss_per_competition_id.get(id, None))
-
-# Create a dictionary of competitions to lists of losses.
-output_columns = {competitions.COMPETITION_DETAILS[id].name: losses for id, losses in datapoints_per_comp_id.items()}
+losses.append(best_loss)
 
-return pd.DataFrame({"timestamp": timestamps,
-
+return pd.DataFrame({"timestamp": timestamps, "losses": losses })
 
 def next_epoch(subtensor: bt.subtensor, block: int) -> int:
 return (
@@ -308,6 +301,7 @@ def format_score(uid: int, scores, key) -> Optional[float]:
 def leaderboard_data(
 leaderboard: List[ModelData],
 scores: Dict[int, Dict[str, Optional[float]]],
+competition_id: int,
 show_stale: bool,
 ) -> List[List[Any]]:
 """Returns the leaderboard data, based on models data and UID scores."""
@@ -321,14 +315,14 @@
 c.block,
 ]
 for c in leaderboard
-if (c.uid in scores and scores[c.uid]["fresh"]) or show_stale
+if c.competition_id == competition_id and ((c.uid in scores and scores[c.uid]["fresh"]) or show_stale)
 ]
 
 
 def get_benchmarks() -> Tuple[pd.DataFrame, datetime.datetime]:
 """Returns the latest benchmarks and the time they were run."""
 if not BENCHMARK_WANDB_PROJECT:
-
+print("No benchmark project set.")
 return None, None
 runs = get_wandb_runs(project=BENCHMARK_WANDB_PROJECT, filters=None)
 for run in runs:
@@ -339,7 +333,7 @@ def get_benchmarks() -> Tuple[pd.DataFrame, datetime.datetime]:
 return table.get_dataframe(), datetime.datetime.strptime(
 run.metadata["startedAt"], "%Y-%m-%dT%H:%M:%S.%f"
 )
-
+print("Failed to get benchmarks from Wandb.")
 return None, None
 
 
@@ -402,24 +396,22 @@ def load_state_vars() -> dict[Any]:
 try:
 subtensor, metagraph = get_subtensor_and_metagraph()
 
-
+print(f"Loaded subtensor and metagraph: {metagraph}")
 
 model_data: List[ModelData] = get_subnet_data(subtensor, metagraph)
 model_data.sort(key=lambda x: x.incentive, reverse=True)
 
-
+print(f"Loaded {len(model_data)} models")
 vali_runs = get_wandb_runs(
 project=VALIDATOR_WANDB_PROJECT,
-# TODO: Update to point to the OTF vali on finetuning
 filters={"config.type": "validator", "config.uid": 28},
+# filters={"$and": [{"config.type": "validator"}], "$or": [{"config.uid": 28}, {"config.uid": 252}]},
 )
+
+print(f"Loaded {len(vali_runs)} validator runs")
 
 scores = get_scores([x.uid for x in model_data], vali_runs)
 
-# TODO: Re-enable once ""SubtensorModule.BlocksSinceEpoch" not found" issue is resolved.
-# current_block = metagraph.block.item()
-# next_epoch_block = next_epoch(subtensor, current_block)
-
 validator_df = get_validator_weights(metagraph)
 weight_keys = set()
 for uid, stats in validator_df.items():
@@ -433,7 +425,7 @@ def load_state_vars() -> dict[Any]:
 break
 
 except KeyboardInterrupt:
-
+print("Exiting...")
 break
 
 except Exception as e:
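Note: after this change, get_losses_over_time returns one (timestamp, best value) series for a single competition instead of one column per competition. The sketch below mirrors that per-run aggregation on plain dictionaries so it runs without wandb; best_losses_over_time, the runs structure, and the sample data are illustrative assumptions, not the validator's real history format.

    import datetime
    import math

    import pandas as pd


    def best_losses_over_time(runs, competition_id: int) -> pd.DataFrame:
        """Simplified stand-in for utils.get_losses_over_time.

        `runs` is a list of runs, each a list of step dicts shaped like
        {"timestamp": datetime, "uid_data": {uid: {"average_loss": float, "competition_id": int}}}.
        The real function pulls these rows from wandb via HistoryScan.
        """
        timestamps, losses = [], []
        for run in runs:
            best_loss = math.inf
            should_add_datapoint = False
            max_timestamp = None
            for step in run:  # the real code only scans the ~10 most recent steps of each run
                max_timestamp = step["timestamp"]
                for uid_data in step["uid_data"].values():
                    if uid_data.get("competition_id") != competition_id:
                        continue
                    loss = uid_data.get("average_loss", math.inf)
                    if loss < best_loss:
                        best_loss = loss
                        should_add_datapoint = True
            if should_add_datapoint:
                timestamps.append(max_timestamp)
                losses.append(best_loss)
        return pd.DataFrame({"timestamp": timestamps, "losses": losses})


    # Competition 2 reports a multiple-choice score, so app.py flips each value into a
    # fraction of correct answers before plotting:
    df = best_losses_over_time(
        [[{"timestamp": datetime.datetime(2024, 7, 1),
           "uid_data": {12: {"average_loss": 0.35, "competition_id": 2}}}]],
        competition_id=2,
    )
    df["losses"] = df["losses"].apply(lambda x: 1 - x if x else None)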