# NOTE: Hugging Face file-viewer chrome was captured along with this source
# (author "rusticluftig", commit message "Add benchmark data to the LB",
# commit 9b87de8, file size 8.15 kB). Preserved here as a comment; it is not
# part of the program.
# Code adapted from: https://huggingface.co./spaces/RaoFoundation/pretraining-leaderboard/blob/main/app.py
import datetime
import os
import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler
from dotenv import load_dotenv
from huggingface_hub import HfApi
import matplotlib.pyplot as plt
import competitions
import utils
# Stylesheet for the typewriter font used by the .typewriter CSS class in main().
FONT = (
    """<link href="https://fonts.cdnfonts.com/css/jmh-typewriter" rel="stylesheet">"""
)
TITLE = """<h1 align="center" id="space-title" class="typewriter">Finetuning Subnet Leaderboard</h1>"""
# Fixed: the heading opened with <h2> but closed with </h3> (malformed HTML),
# and "The model ... receive" was ungrammatical ("receives").
HEADER = """<h2 align="center" class="typewriter"><a href="https://github.com/macrocosm-os/finetuning" target="_blank">Finetuning</a> is a <a href="https://bittensor.com/" target="_blank">Bittensor</a> subnet that rewards miners for producing finetuned models in defined competitions. The model with the best head-to-head score in each competition receives a steady emission of TAO.</h2>"""
EVALUATION_HEADER = """<h3 align="center">Shows the latest per-competition evaluation statistics as calculated by the Taoverse validator</h3>"""
# The Hugging Face Space that restart_space() restarts on a schedule.
HF_REPO_ID = "macrocosm-os/finetuning-leaderboard"
# Bittensor block time; used to convert block counts into wall-clock estimates.
SECONDS_PER_BLOCK = 12
# Load environment variables from a local .env file, if one exists.
load_dotenv()
# HF token for restarting the Space; None falls back to anonymous access.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
API = HfApi(token=HF_TOKEN)
def get_next_update_div(current_block: int, next_update_block: int) -> str:
    """Return an HTML div describing when the next reward update is expected.

    Args:
        current_block: The chain's current block height.
        next_update_block: The block height of the next reward update.

    Returns:
        A centered HTML ``<div>`` with the remaining block count and an
        approximate wall-clock estimate in minutes.
    """
    blocks_to_go = next_update_block - current_block
    # The original built two datetimes and subtracted them; the delta is just
    # blocks_to_go * SECONDS_PER_BLOCK seconds, so compute minutes directly
    # (floor division matches int(total_seconds() // 60) for all signs).
    minutes = blocks_to_go * SECONDS_PER_BLOCK // 60
    return f"""<div align="center" style="font-size: larger;">Next reward update: <b>{blocks_to_go}</b> blocks (~{minutes} minutes)</div>"""
def get_last_updated_div() -> str:
    """Return an HTML div with the current UTC timestamp.

    Uses the timezone-aware ``datetime.now(timezone.utc)`` rather than
    ``datetime.utcnow()``, which is deprecated since Python 3.12; the
    rendered text is unchanged.
    """
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    return f"""<div>Last Updated: {now_utc.strftime("%Y-%m-%d %H:%M:%S")} (UTC)</div>"""
def restart_space():
    """Restart the leaderboard Space so it reloads fresh data.

    Invoked periodically by the BackgroundScheduler set up in main().
    """
    API.restart_space(repo_id=HF_REPO_ID, token=HF_TOKEN)
def main() -> None:
    """Build and launch the Gradio leaderboard, and schedule periodic Space restarts."""
    # To avoid leaderboard failures, infinitely try until we get all data
    # needed to populate the dashboard.
    # NOTE(review): the retry behavior lives inside utils.load_state_vars — not
    # visible in this file; confirm against utils.
    state_vars = utils.load_state_vars()
    model_data = state_vars["model_data"]
    vali_runs = state_vars["vali_runs"]  # NOTE(review): unused below — confirm it's still needed
    scores = state_vars["scores"]
    validator_df = state_vars["validator_df"]
    benchmarks_df = state_vars["benchmarks_df"]  # may be None; guarded below
    benchmarks_targets = state_vars["benchmarks_targets"]
    losses_2 = state_vars["losses_2"]  # presumably a DataFrame with "timestamp"/"losses" columns — it is used with .apply and plotted below

    demo = gr.Blocks(css=".typewriter {font-family: 'JMH Typewriter', sans-serif;}")
    with demo:
        gr.HTML(FONT)
        gr.HTML(TITLE)
        gr.HTML(HEADER)
        # Top-emission models: "<repo> (<short commit>) · (τ<emission>)" -> incentive.
        gr.Label(
            label="Emissions",
            value={
                f"{c.namespace}/{c.name} ({c.commit[0:8]}) · (τ{round(c.emission, 2):,})": c.incentive
                for c in model_data
                if c.incentive
            },
            num_top_classes=10,
        )
        with gr.Accordion("Competition Results"):
            gr.HTML(EVALUATION_HEADER)
            show_stale = gr.Checkbox(label="Show Stale", interactive=True)
            competition_leaderboards = []
            comp_2 = competitions.COMPETITION_DETAILS[2]
            # Convert the losses into % of correct answers: score = 1 - loss.
            # Falsy losses (0 or None) map to None so they are dropped from the plot.
            losses_2["losses"] = losses_2["losses"].apply(
                lambda x: 1 - x if x else None
            )
            with gr.Accordion(f"{comp_2.name} Competition"):
                gr.HTML(comp_2.html_description)
                competition_leaderboards.append(
                    gr.components.Dataframe(
                        value=utils.leaderboard_data(
                            model_data, scores, 2, show_stale.value
                        ),
                        headers=[
                            "Name",
                            "Win Rate",
                            "MC Score",
                            "Weight",
                            "UID",
                            "Block",
                        ],
                        datatype=[
                            "markdown",
                            "number",
                            "number",
                            "number",
                            "number",
                            "number",
                        ],
                        elem_id="comp2-table",
                        interactive=False,
                        visible=True,
                    )
                )
                gr.LinePlot(
                    losses_2,
                    x="timestamp",
                    x_title="Date",
                    y="losses",
                    y_title="MC Score",
                    interactive=True,
                    visible=True,
                    width=1024,
                    title="Best MC Score Over Time",
                )
                # Legend explaining the leaderboard columns.
                gr.HTML(
                    """
<ul><li><b>Name:</b> the 🤗 Hugging Face repo (click to go to the model card)</li>
<li><b>Win Rate:</b> % of head-to-head evals won vs. other eval'd models, given an epsilon advantage or disadvantage</li>
<li><b>Average Loss:</b> the last loss value on the evaluation data for the model as calculated by the OTF validator (lower is better)</li>
<li><b>MC Score:</b> the % of correct multiple choice answers given by the model as calculated by the OTF validator (higher is better)</li>
<li><b>UID:</b> the Bittensor UID of the miner</li>
<li><b>Weight:</b> the bittensor weight set for this model</li>
<li><b>Block:</b> the Bittensor block that the model was submitted in</li></ul><br/>More stats on <a href="https://taostats.io/subnets/netuid-37/" target="_blank">taostats</a>."""
                )
            # Rebuild the leaderboard table whenever "Show Stale" is toggled.
            show_stale.change(
                lambda stale: [utils.leaderboard_data(model_data, scores, 2, stale)],
                inputs=[show_stale],
                outputs=competition_leaderboards,
            )

        if benchmarks_df is not None:

            def create_benchmark_plot(benchmark: str):
                """Plot the given benchmark column over time with dotted target
                lines for well-known reference models; returns the figure."""
                fig = plt.figure(figsize=(10, 8))
                plt.plot(benchmarks_df["timestamp"], benchmarks_df[benchmark])
                # Adding horizontal dotted lines for various benchmark targets (well-known models)
                for model, score in benchmarks_targets[benchmark].items():
                    plt.axhline(y=score, linestyle="--", label=f"{model}")
                    # Label each target line at the right edge of the plot.
                    plt.text(
                        benchmarks_df["timestamp"].max(),
                        score,
                        f"{model}",
                        va="center",
                        ha="right",
                        backgroundcolor="white",
                    )
                # Adding labels and title
                plt.ylabel(benchmark.upper())
                plt.title(f"{benchmark.upper()} Over Time")
                plt.xticks(rotation=45)
                return fig

            with gr.Accordion("Top Model Benchmarks"):
                mmlu = create_benchmark_plot("mmlu")
                mmlu_pro = create_benchmark_plot("mmlu_pro")
                gr.Plot(mmlu)
                gr.Plot(mmlu_pro)
                gr.HTML(
                    """<div>Benchmarks computed using <a href='https://github.com/EleutherAI/lm-evaluation-harness'>lm-eval harness</a></div>"""
                )
                gr.HTML(
                    """<ul><li>MMLU: Raw score</li><li>MMLU Pro: Normalized score using <a href='https://huggingface.co./docs/leaderboards/open_llm_leaderboard/normalization'>this</a> method</li></ul>"""
                )

        with gr.Accordion("Validator Stats"):
            gr.components.Dataframe(
                utils.make_validator_dataframe(validator_df, model_data),
                interactive=False,
                visible=True,
            )
        gr.HTML(value=get_last_updated_div())

    # Periodically restart the Space so the dashboard reloads fresh data.
    scheduler = BackgroundScheduler()
    scheduler.add_job(
        restart_space, "interval", seconds=60 * 30
    )  # restart every 30 minutes (60 * 30 seconds)
    scheduler.start()

    demo.launch()


main()