"""Flask viewer for benchmark problems and per-model generated solutions.

At import time the module loads ``problems.json`` (a list of problem records)
and ``all_outputs.json`` (a mapping of model name -> per-problem results), then
precomputes the tables rendered by the index and problem-detail templates.
"""
import os
import random
import glob
import json

import numpy as np
from flask import Flask, abort, render_template, request

app = Flask(__name__)

with open("problems.json", encoding="utf-8") as f:
    problems = json.load(f)
problem_choices = [q["question_title"] for q in problems]

# Display order of the problems: position in this list is the index used in
# URLs, the value is the index into the JSON files. Currently the identity
# permutation; uncomment the two lines below for a reproducible shuffle.
random_idxs = list(range(len(problems)))
# random.seed(42)
# random.shuffle(random_idxs)

# From here on ``problems`` is in display order.
problems = [problems[idx] for idx in random_idxs]

with open("all_outputs.json", encoding="utf-8") as f:
    all_outputs = json.load(f)
all_models = list(all_outputs.keys())
num_questions_filtered = len(problems)

# Keyed by the ORIGINAL problem index: model name -> mean of its "pass1_list".
all_correctness_by_problem = {
    idx: {
        model: np.mean(all_outputs[model][idx]["pass1_list"])
        for model in all_models
    }
    for idx in random_idxs
}


def calculate_color(performance):
    """Return the CSS ``rgba(...)`` string used to shade a correctness cell.

    Args:
        performance: Mean correctness of a model on one problem. The
            thresholds below assume a value in the range 0..1.

    Returns:
        A green shade above 0.5 and a red shade otherwise. In the two middle
        bands the alpha channel is derived from ``performance`` itself; the
        outer bands use a fixed alpha of 0.5.
    """
    if performance > 0.75:
        return "rgba(0, 150, 0, 0.5)"
    if performance > 0.5:
        return f"rgba(50, 150, 0, {performance})"
    if performance > 0.25:
        return f"rgba(150, 50, 0, {1-performance})"
    # Also reached for NaN, since every comparison above is then False.
    return "rgba(150, 0, 0, 0.5)"


# One tuple per displayed problem, in display order:
# (display index, {model: {"correctness", "correctness_color"}},
#  difficulty, question_id)
all_evaluations_by_problem_colored = [
    (
        trueidx,
        {
            model: {
                "correctness": f"{all_correctness_by_problem[idx][model]*100:.1f}",
                "correctness_color": calculate_color(
                    all_correctness_by_problem[idx][model]
                ),
            }
            for model in all_models
        },
        # ``problems`` is already in display order, so it is indexed by the
        # display position; indexing by ``idx`` would apply the permutation
        # twice once shuffling is enabled.
        problems[trueidx]["difficulty"],
        problems[trueidx]["question_id"],
    )
    for trueidx, idx in enumerate(random_idxs)
]

# model name -> list (display order) of per-sample records for each problem.
all_data_for_view_formatted = {
    model: [
        [
            {"code": code, "pass1": passed, "metadata": metadata}
            for code, passed, metadata in zip(
                resp[idx]["code_list"],
                resp[idx]["pass1_list"],
                resp[idx]["metadata_list"],
            )
        ]
        for idx in random_idxs
    ]
    for model, resp in all_outputs.items()
}

# Subset of models shown by the "mini" views. Commented-out names are kept as
# a menu of candidates that can be re-enabled.
# NOTE(review): every enabled name must be a key of all_outputs.json,
# otherwise the mini problem view raises KeyError - confirm against the data.
mini_models = [
    # "DeepSeek-V2",
    "DeepSeek-V3",
    "DeepSeek-R1-Preview",
    # "DSCoder-33b-Ins",
    # "GPT-4-Turbo-2024-04-09",
    "GPT-4O-2024-05-13",
    "Claude-3.5-Sonnet-20240620",
    "Gemini-Flash-2.0-Thinking",
    # "Gemini-Exp-1206",
    # "Claude-3-Sonnet",
    "O1-2024-12-17 (N=1) (High)",
    "QwQ-32B-Preview (N=1)",
]


def _render_problem(template_name, models, problem_idx):
    """Render a problem-detail template for the given models.

    Args:
        template_name: Template file to render.
        models: Model names whose samples are passed to the template.
        problem_idx: Display index of the problem.

    Returns:
        The rendered page. Aborts with 404 when ``problem_idx`` does not
        refer to a loaded problem.
    """
    if not 0 <= problem_idx < len(all_evaluations_by_problem_colored):
        abort(404)

    _, evaluation, _, question_id = all_evaluations_by_problem_colored[problem_idx]
    data = {
        model: all_data_for_view_formatted[model][problem_idx]
        for model in models
    }
    return render_template(
        template_name,
        problem_idx=problem_idx,
        question_id=question_id,
        evaluation=evaluation,
        models=models,
        question=problems[problem_idx],
        data=data,
    )


@app.route("/")
def home():
    """Render the overview table for every model."""
    print(all_models)
    return render_template(
        "index.html", models=all_models, problems=all_evaluations_by_problem_colored
    )


# The URL must carry the index: without a ``<int:problem_idx>`` variable Flask
# calls the view with no arguments and the request fails with TypeError.
@app.route("/problem/<int:problem_idx>")
def problem(problem_idx):
    """Render the detail page of one problem for every model."""
    return _render_problem("problem.html", all_models, problem_idx)


@app.route("/mini")
def mini():
    """Render the overview table restricted to ``mini_models``."""
    return render_template(
        "index_mini.html",
        models=mini_models,
        problems=all_evaluations_by_problem_colored,
    )


@app.route("/problem_mini/<int:problem_idx>")
def problem_mini(problem_idx):
    """Render the detail page of one problem restricted to ``mini_models``."""
    return _render_problem("problem_mini.html", mini_models, problem_idx)


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)