Commit · 191e77c
1 Parent(s): 1681195
Add leaderboard for competition 2

Files changed:
- .gitignore +2 -1
- app.py +58 -34
- competitions.py +5 -0
- utils.py +21 -29
.gitignore CHANGED
@@ -1,3 +1,4 @@
 .venv
 __pycache__/
-.env
+.env
+**.ipynb
app.py CHANGED
@@ -1,13 +1,12 @@
 # Code adapted from: https://huggingface.co/spaces/RaoFoundation/pretraining-leaderboard/blob/main/app.py
 
-import os
 import datetime
-
-import gradio as gr
+import os
 
+import gradio as gr
+from apscheduler.schedulers.background import BackgroundScheduler
 from dotenv import load_dotenv
 from huggingface_hub import HfApi
-from apscheduler.schedulers.background import BackgroundScheduler
 
 import competitions
 import utils
@@ -17,8 +16,7 @@ FONT = (
 )
 TITLE = """<h1 align="center" id="space-title" class="typewriter">Finetuning Subnet Leaderboard</h1>"""
 HEADER = """<h2 align="center" class="typewriter"><a href="https://github.com/macrocosm-os/finetuning" target="_blank">Finetuning</a> is a <a href="https://bittensor.com/" target="_blank">Bittensor</a> subnet that rewards miners for producing finetuned models in defined competitions. The model with the best head-to-head score in each competition receive a steady emission of TAO.</h3>"""
-
-EVALUATION_HEADER = """<h3 align="center">Shows the latest internal evaluation statistics as calculated by the Opentensor validator</h3>"""
+EVALUATION_HEADER = """<h3 align="center">Shows the latest per-competition evaluation statistics as calculated by the Taoverse validator</h3>"""
 
 HF_REPO_ID = "macrocosm-os/finetuning-leaderboard"
 SECONDS_PER_BLOCK = 12
@@ -65,11 +63,8 @@ def main():
 gr.HTML(TITLE)
 gr.HTML(HEADER)
 
-# TODO: Re-enable once ""SubtensorModule.BlocksSinceEpoch" not found" issue is resolved.
-# gr.HTML(value=get_next_update_div(current_block, next_epoch_block))
-
-# TODO: Figure out the best approach to showing the per competition rewards.
 gr.Label(
+label="Emissions",
 value={
 f"{c.namespace}/{c.name} ({c.commit[0:8]}) · (τ{round(c.emission, 2):,})": c.incentive
 for c in model_data
@@ -83,42 +78,71 @@ def main():
 gr.HTML("""<div>PPL computed using a stride of 512. See <a href='https://github.com/macrocosm-os/finetuning/blob/dev/scripts/run_benchmarks.py'>here</a> for the full code.</div>""")
 gr.HTML(f"""<div>Last Updated: {benchmark_timestamp.strftime("%Y-%m-%d %H:%M:%S")} (UTC)</div>""")
 
-with gr.Accordion("
+with gr.Accordion("Competition Results"):
 gr.HTML(EVALUATION_HEADER)
 show_stale = gr.Checkbox(label="Show Stale", interactive=True)
 competition_leaderboards = []
-
-
-with gr.Accordion(f"{
-gr.HTML(
+losses_1 = utils.get_losses_over_time(vali_runs, 1)
+comp_1 = competitions.COMPETITION_DETAILS[1]
+with gr.Accordion(f"{comp_1.name} Competition"):
+gr.HTML(comp_1.html_description)
 competition_leaderboards.append(gr.components.Dataframe(
-value=utils.leaderboard_data(model_data, scores, show_stale.value),
+value=utils.leaderboard_data(model_data, scores, 1, show_stale.value),
 headers=["Name", "Win Rate", "Average Loss", "Weight", "UID", "Block"],
 datatype=["markdown", "number", "number", "number", "number", "number"],
-elem_id="
+elem_id="comp1-table",
+interactive=False,
+visible=True,
+))
+gr.LinePlot(
+losses_1,
+x="timestamp",
+x_title="Date",
+y="losses",
+y_title="Average Loss",
+interactive=True,
+visible=True,
+width=1024,
+title="Best Average Loss Over Time",
+)
+comp_2 = competitions.COMPETITION_DETAILS[2]
+losses_2 = utils.get_losses_over_time(vali_runs, 2)
+# Covert the losses into % of correct answers.
+losses_2["losses"] = losses_2["losses"].apply(lambda x: 1 - x if x else None)
+with gr.Accordion(f"{comp_2.name} Competition"):
+gr.HTML(comp_2.html_description)
+competition_leaderboards.append(gr.components.Dataframe(
+value=utils.leaderboard_data(model_data, scores, 2, show_stale.value),
+headers=["Name", "Win Rate", "MC Score", "Weight", "UID", "Block"],
+datatype=["markdown", "number", "number", "number", "number", "number"],
+elem_id="comp2-table",
 interactive=False,
 visible=True,
 ))
-
+gr.LinePlot(
+losses_2,
+x="timestamp",
+x_title="Date",
+y="losses",
+y_title="MC Score",
+interactive=True,
+visible=True,
+width=1024,
+title="Best MC Score Over Time",
+)
+gr.HTML("""
+<ul><li><b>Name:</b> the 🤗 Hugging Face repo (click to go to the model card)</li>
+<li><b>Win Rate:</b> % of head-to-head evals won vs. other eval'd models, given an epsilon advantage or disadvantage</li>
+<li><b>Average Loss:</b> the last loss value on the evaluation data for the model as calculated by the OTF validator (lower is better)</li>
+<li><b>MC Score:</b> the % of correct multiple choice answers given by the model as calculated by the OTF validator (higher is better)</li>
+<li><b>UID:</b> the Bittensor UID of the miner</li>
+<li><b>Weight:</b> the bittensor weight set for this model</li>
+<li><b>Block:</b> the Bittensor block that the model was submitted in</li></ul><br/>More stats on <a href="https://taostats.io/subnets/netuid-37/" target="_blank">taostats</a>.""")
 show_stale.change(
-lambda stale: utils.leaderboard_data(model_data, scores, stale),
+lambda stale: [utils.leaderboard_data(model_data, scores, 1, stale), utils.leaderboard_data(model_data, scores, 2, stale)],
 inputs=[show_stale],
 outputs=competition_leaderboards,
-)
-
-# TODO: Make this a multi-competition line plot
-gr.LinePlot(
-utils.get_losses_over_time(vali_runs),
-x="timestamp",
-x_title="Date",
-y="SN9_MODEL",
-y_title="Average Loss",
-tooltip="SN9_MODEL",
-interactive=True,
-visible=True,
-width=1024,
-title="Best Average Loss Over Time",
-)
+)
 
 with gr.Accordion("Validator Stats"):
 gr.components.Dataframe(
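Note: the change above wires a single "Show Stale" checkbox to several per-competition tables through one change callback that returns a list of values, one per output component. The standalone sketch below illustrates that Gradio wiring with made-up placeholder data; fake_leaderboard and the sample rows are illustrative stand-ins for utils.leaderboard_data, not code from this Space.

    import gradio as gr

    # Illustrative stand-in for utils.leaderboard_data(model_data, scores, competition_id, show_stale).
    def fake_leaderboard(competition_id: int, show_stale: bool):
        rows = [
            ["model-a", 0.61, 1.92, 0.40, 12, 1000],
            ["model-b", 0.39, 2.05, 0.10, 37, 2000],
        ]
        return rows if show_stale else rows[:1]

    with gr.Blocks() as demo:
        show_stale = gr.Checkbox(label="Show Stale", interactive=True)
        leaderboards = []
        for comp_id in (1, 2):
            with gr.Accordion(f"Competition {comp_id}"):
                leaderboards.append(
                    gr.Dataframe(
                        value=fake_leaderboard(comp_id, show_stale.value),
                        headers=["Name", "Win Rate", "Score", "Weight", "UID", "Block"],
                        interactive=False,
                    )
                )
        # One callback updates every registered table: the returned list is matched
        # positionally against the `outputs` components.
        show_stale.change(
            lambda stale: [fake_leaderboard(c, stale) for c in (1, 2)],
            inputs=[show_stale],
            outputs=leaderboards,
        )

    if __name__ == "__main__":
        demo.launch()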
competitions.py CHANGED
@@ -16,5 +16,10 @@ COMPETITION_DETAILS: Dict[int, CompetitionDetails] = {
 1: CompetitionDetails(
 name="SN9_MODEL",
 html_description="""<b>Competition ID 1</b><br/>Produce the best fine-tuned model from a Subnet 9 pretrained model. Models are evaluated using synthetic prompt/response data from Subnet 18.""",
+),
+2: CompetitionDetails(
+name="General Knowledge Chat-bot",
+# TODO: Add link to SN1 dataset details.
+html_description="""<b>Competition ID 2</b><br/>Produce the best general knowledge chat-bot. Models are evaluated using synthetic MMLU-like dataset from Subnet 1.""",
 )
 }
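The diff only adds a new entry to COMPETITION_DETAILS; the class itself is defined earlier in competitions.py and is not shown in this change. Judging from the fields used here, it is presumably a small record along the lines of the sketch below (an assumption for readability, not the file's actual definition):

    from dataclasses import dataclass
    from typing import Dict

    @dataclass(frozen=True)
    class CompetitionDetails:
        # Assumed shape, inferred from usage in this diff.
        name: str              # shown in accordion titles and plot series, e.g. "SN9_MODEL"
        html_description: str  # rendered via gr.HTML() above the competition's leaderboard

    COMPETITION_DETAILS: Dict[int, CompetitionDetails] = {
        1: CompetitionDetails(name="SN9_MODEL", html_description="<b>Competition ID 1</b> ..."),
        2: CompetitionDetails(name="General Knowledge Chat-bot", html_description="<b>Competition ID 2</b> ..."),
    }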
utils.py CHANGED
@@ -80,7 +80,7 @@ def run_with_retries(func, *args, **kwargs):
 try:
 return func(*args, **kwargs)
 except (Exception, RuntimeError):
-
+print(f"Failed to run function: {traceback.format_exc()}")
 if i == RETRIES - 1:
 raise
 time.sleep(DELAY_SECS)
@@ -167,7 +167,7 @@ def get_wandb_runs(project: str, filters: Dict[str, Any]) -> List:
 if len(runs) > 0:
 return runs
 # WandDB API is quite unreliable. Wait another minute and try again.
-
+print("Failed to get runs from Wandb. Trying again in 60 seconds.")
 time.sleep(60)
 
 
@@ -235,14 +235,14 @@ def get_validator_weights(
 return ret
 
 
-def get_losses_over_time(wandb_runs: List) -> pd.DataFrame:
+def get_losses_over_time(wandb_runs: List, competition_id: int) -> pd.DataFrame:
 """Returns a dataframe of the best average model loss over time."""
 timestamps = []
-
+losses = []
 
 for run in wandb_runs:
 # For each run, check the 10 most recent steps.
-
+best_loss = math.inf
 should_add_datapoint = False
 min_step = max(0, run.lastHistoryStep - 10)
 history_scan = HistoryScan(
@@ -261,26 +261,19 @@ def get_losses_over_time(wandb_runs: List) -> pd.DataFrame:
 
 for _, uid_data in all_uid_data.items():
 loss = uid_data.get("average_loss", math.inf)
-
-if
+c_id = uid_data.get("competition_id", None)
+if c_id is None or c_id != competition_id:
 continue
 
-if loss <
-
+if loss < best_loss:
+best_loss = uid_data["average_loss"]
 should_add_datapoint = True
 # Now that we've processed the run's most recent steps, check if we should add a datapoint.
 if should_add_datapoint:
 timestamps.append(max_timestamp)
-
-# Set None for any that aren't active during this run.
-for id, losses in datapoints_per_comp_id.items():
-losses.append(best_loss_per_competition_id.get(id, None))
-
-# Create a dictionary of competitions to lists of losses.
-output_columns = {competitions.COMPETITION_DETAILS[id].name: losses for id, losses in datapoints_per_comp_id.items()}
+losses.append(best_loss)
 
-return pd.DataFrame({"timestamp": timestamps,
-
+return pd.DataFrame({"timestamp": timestamps, "losses": losses })
 
 def next_epoch(subtensor: bt.subtensor, block: int) -> int:
 return (
@@ -308,6 +301,7 @@ def format_score(uid: int, scores, key) -> Optional[float]:
 def leaderboard_data(
 leaderboard: List[ModelData],
 scores: Dict[int, Dict[str, Optional[float]]],
+competition_id: int,
 show_stale: bool,
 ) -> List[List[Any]]:
 """Returns the leaderboard data, based on models data and UID scores."""
@@ -321,14 +315,14 @@
 c.block,
 ]
 for c in leaderboard
-if (c.uid in scores and scores[c.uid]["fresh"]) or show_stale
+if c.competition_id == competition_id and ((c.uid in scores and scores[c.uid]["fresh"]) or show_stale)
 ]
 
 
 def get_benchmarks() -> Tuple[pd.DataFrame, datetime.datetime]:
 """Returns the latest benchmarks and the time they were run."""
 if not BENCHMARK_WANDB_PROJECT:
-
+print("No benchmark project set.")
 return None, None
 runs = get_wandb_runs(project=BENCHMARK_WANDB_PROJECT, filters=None)
 for run in runs:
@@ -339,7 +333,7 @@ def get_benchmarks() -> Tuple[pd.DataFrame, datetime.datetime]:
 return table.get_dataframe(), datetime.datetime.strptime(
 run.metadata["startedAt"], "%Y-%m-%dT%H:%M:%S.%f"
 )
-
+print("Failed to get benchmarks from Wandb.")
 return None, None
 
 
@@ -402,24 +396,22 @@ def load_state_vars() -> dict[Any]:
 try:
 subtensor, metagraph = get_subtensor_and_metagraph()
 
-
+print(f"Loaded subtensor and metagraph: {metagraph}")
 
 model_data: List[ModelData] = get_subnet_data(subtensor, metagraph)
 model_data.sort(key=lambda x: x.incentive, reverse=True)
 
-
+print(f"Loaded {len(model_data)} models")
 vali_runs = get_wandb_runs(
 project=VALIDATOR_WANDB_PROJECT,
-# TODO: Update to point to the OTF vali on finetuning
 filters={"config.type": "validator", "config.uid": 28},
+# filters={"$and": [{"config.type": "validator"}], "$or": [{"config.uid": 28}, {"config.uid": 252}]},
 )
+
+print(f"Loaded {len(vali_runs)} validator runs")
 
 scores = get_scores([x.uid for x in model_data], vali_runs)
 
-# TODO: Re-enable once ""SubtensorModule.BlocksSinceEpoch" not found" issue is resolved.
-# current_block = metagraph.block.item()
-# next_epoch_block = next_epoch(subtensor, current_block)
-
 validator_df = get_validator_weights(metagraph)
 weight_keys = set()
 for uid, stats in validator_df.items():
@@ -433,7 +425,7 @@ def load_state_vars() -> dict[Any]:
 break
 
 except KeyboardInterrupt:
-
+print("Exiting...")
 break
 
 except Exception as e:
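Note: after this change, get_losses_over_time returns one (timestamp, best value) series for a single competition instead of one column per competition. The sketch below mirrors that per-run aggregation on plain dictionaries so it runs without wandb; best_losses_over_time, the runs structure, and the sample data are illustrative assumptions, not the validator's real history format.

    import datetime
    import math

    import pandas as pd


    def best_losses_over_time(runs, competition_id: int) -> pd.DataFrame:
        """Simplified stand-in for utils.get_losses_over_time.

        `runs` is a list of runs, each a list of step dicts shaped like
        {"timestamp": datetime, "uid_data": {uid: {"average_loss": float, "competition_id": int}}}.
        The real function pulls these rows from wandb via HistoryScan.
        """
        timestamps, losses = [], []
        for run in runs:
            best_loss = math.inf
            should_add_datapoint = False
            max_timestamp = None
            for step in run:  # the real code only scans the ~10 most recent steps of each run
                max_timestamp = step["timestamp"]
                for uid_data in step["uid_data"].values():
                    if uid_data.get("competition_id") != competition_id:
                        continue
                    loss = uid_data.get("average_loss", math.inf)
                    if loss < best_loss:
                        best_loss = loss
                        should_add_datapoint = True
            if should_add_datapoint:
                timestamps.append(max_timestamp)
                losses.append(best_loss)
        return pd.DataFrame({"timestamp": timestamps, "losses": losses})


    # Competition 2 reports a multiple-choice score, so app.py flips each value into a
    # fraction of correct answers before plotting:
    df = best_losses_over_time(
        [[{"timestamp": datetime.datetime(2024, 7, 1),
           "uid_data": {12: {"average_loss": 0.35, "competition_id": 2}}}]],
        competition_id=2,
    )
    df["losses"] = df["losses"].apply(lambda x: 1 - x if x else None)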