# Spaces: Running  (hosting-page status text captured with the source; not part of the module)
import json
from typing import Any, Dict, Tuple

import pandas as pd
# Keep all the constant mappings outside the class.
# MODEL_NAME_MAP translates the internal model identifiers (the keys used by
# the model groups and looked up by get_display_model_name) into the names
# shown to the user. Insertion order is preserved on purpose: the "All" model
# group is seeded from these keys.
MODEL_NAME_MAP = {
    "Claude_3.5_new": "Claude-3.5-Sonnet (1022)",
    "GPT_4o": "GPT-4o (0513)",
    "Claude_3.5": "Claude-3.5-Sonnet (0622)",
    "Gemini_1.5_pro_002": "Gemini-1.5-Pro-002",
    "InternVL2_76B": "InternVL2-Llama3-76B",
    "Qwen2_VL_72B": "Qwen2-VL-72B",
    "llava_onevision_72B": "Llava-OneVision-72B",
    "NVLM": "NVLM-72B",
    "GPT_4o_mini": "GPT-4o mini",
    "Gemini_1.5_flash_002": "Gemini-1.5-Flash-002",
    "Pixtral_12B": "Pixtral 12B",
    "Aria": "Aria-MoE-25B",
    "Qwen2_VL_7B": "Qwen2-VL-7B",
    "InternVL2_8B": "InternVL2-8B",
    "llava_onevision_7B": "Llava-OneVision-7B",
    "Llama_3_2_11B": "Llama-3.2-11B",
    "Phi-3.5-vision": "Phi-3.5-Vision",
    "MiniCPM_v2.6": "MiniCPM-V2.6",
    "Idefics3": "Idefics3-8B-Llama3",
    "Aquila_VL_2B": "Aquila-VL-2B-llava-qwen",
    "POINTS_7B": "POINTS-Qwen2.5-7B",
    "Qwen2_VL_2B": "Qwen2-VL-2B",
    "InternVL2_2B": "InternVL2-2B",
    "Molmo_7B_D": "Molmo-7B-D-0924",
    "Molmo_72B": "Molmo-72B-0924",
}
# Raw dimension key (as found in the per-model keyword stats) -> display name.
# get_original_dimension performs the reverse lookup, so the display names
# must stay unique.
DIMENSION_NAME_MAP = {
    "skills": "Skills",
    "input_format": "Input Format",
    "output_format": "Output Format",
    "input_num": "Visual Input Number",
    "app": "Application",
}
# Raw keyword (as found in the per-model keyword stats) -> short display label.
# get_original_keyword performs the reverse lookup by value, so the labels
# must stay unique across all sections.
KEYWORD_NAME_MAP = {
    # Skills
    "Object Recognition and Classification": "Object Recognition",
    "Text Recognition (OCR)": "OCR",
    "Language Understanding and Generation": "Language",
    "Scene and Event Understanding": "Scene/Event",
    "Mathematical and Logical Reasoning": "Math/Logic",
    "Commonsense and Social Reasoning": "Commonsense",
    "Ethical and Safety Reasoning": "Ethics/Safety",
    "Domain-Specific Knowledge and Skills": "Domain-Specific",
    "Spatial and Temporal Reasoning": "Spatial/Temporal",
    "Planning and Decision Making": "Planning/Decision",
    # Input Format
    "User Interface Screenshots": "UI related",
    "Text-Based Images and Documents": "Documents",
    "Diagrams and Data Visualizations": "Infographics",
    "Videos": "Videos",
    "Artistic and Creative Content": "Arts/Creative",
    "Photographs": "Photographs",
    "3D Models and Aerial Imagery": "3D related",
    # Application
    "Information_Extraction": "Info Extraction",
    "Planning": "Planning",
    "Coding": "Coding",
    "Perception": "Perception",
    "Metrics": "Metrics",
    "Science": "Science",
    "Knowledge": "Knowledge",
    "Mathematics": "Math",
    # Output format
    # NOTE(review): "Contexual" looks like a typo for "Contextual"; it is kept
    # unchanged because the label is user-facing and may be referenced
    # elsewhere - confirm before renaming.
    "contextual_formatted_text": "Contexual",
    "structured_output": "Structured",
    "exact_text": "Exact",
    "numerical_data": "Numerical",
    "open_ended_output": "Open-ended",
    "multiple_choice": "MC",
    # Visual Input Number
    "6-8 images": "6-8 imgs",
    "1-image": "1 img",
    "2-3 images": "2-3 imgs",
    "4-5 images": "4-5 imgs",
    "9-image or more": "9+ imgs",
    "video": "Video",
}
class BaseDataLoader:
    """Common scaffolding for the leaderboard data loaders.

    On construction the loader reads the per-model keyword statistics and the
    per-model summaries (both supplied by subclasses) and derives two lookup
    tables from them:

    * ``SUPER_GROUPS``: display dimension name -> list of display keywords.
    * ``MODEL_GROUPS``: group name -> list of model identifiers that are
      actually present in the loaded data.

    Subclasses must implement ``_load_model_data``, ``_load_summary_data``,
    ``get_df`` and ``get_leaderboard_data``.
    """

    # Define the base MODEL_GROUPS structure
    BASE_MODEL_GROUPS = {
        "All": list(MODEL_NAME_MAP),
        "Flagship Models": [
            "Claude_3.5_new",
            "GPT_4o",
            "Claude_3.5",
            "Gemini_1.5_pro_002",
            "Qwen2_VL_72B",
            "InternVL2_76B",
            "llava_onevision_72B",
            "NVLM",
            "Molmo_72B",
        ],
        "Efficiency Models": [
            "Gemini_1.5_flash_002",
            "GPT_4o_mini",
            "Qwen2_VL_7B",
            "Pixtral_12B",
            "Aria",
            "InternVL2_8B",
            "Phi-3.5-vision",
            "MiniCPM_v2.6",
            "llava_onevision_7B",
            "Llama_3_2_11B",
            "Idefics3",
            "Molmo_7B_D",
            "Aquila_VL_2B",
            "POINTS_7B",
            "Qwen2_VL_2B",
            "InternVL2_2B",
        ],
        "Proprietary Flagship models": [
            "Claude_3.5_new",
            "GPT_4o",
            "Claude_3.5",
            "Gemini_1.5_pro_002",
        ],
        "Proprietary Efficiency Models": [
            "Gemini_1.5_flash_002",
            "GPT_4o_mini",
        ],
        "Open-source Flagship Models": [
            "Qwen2_VL_72B",
            "InternVL2_76B",
            "llava_onevision_72B",
            "NVLM",
            "Molmo_72B",
        ],
        "Open-source Efficiency Models": [
            "Qwen2_VL_7B",
            "Pixtral_12B",
            "Aria",
            "InternVL2_8B",
            "Phi-3.5-vision",
            "MiniCPM_v2.6",
            "llava_onevision_7B",
            "Llama_3_2_11B",
            "Idefics3",
            "Molmo_7B_D",
            "Aquila_VL_2B",
            "POINTS_7B",
            "Qwen2_VL_2B",
            "InternVL2_2B",
        ],
    }

    def __init__(self):
        # Order matters: the derived tables below read MODEL_DATA.
        self.MODEL_DATA = self._load_model_data()
        self.SUMMARY_DATA = self._load_summary_data()
        self.SUPER_GROUPS = self._initialize_super_groups()
        self.MODEL_GROUPS = self._initialize_model_groups()

    def _initialize_super_groups(self):
        """Build the display-dimension -> display-keywords table.

        The first model's entry in ``MODEL_DATA`` is used as the template for
        the dimension/keyword layout. Returns an empty dict when no model data
        is loaded.
        """
        template = next(iter(self.MODEL_DATA.values()), None)
        if not template:
            return {}

        # Dimensions without a display name are skipped; they could never
        # survive the ordering filter below anyway.
        groups = {
            DIMENSION_NAME_MAP[dim]: [KEYWORD_NAME_MAP.get(k, k) for k in keywords]
            for dim, keywords in template.items()
            if dim in DIMENSION_NAME_MAP
        }

        # Define the desired order of super groups
        order = ["Skills", "Application", "Output Format", "Input Format", "Visual Input Number"]
        # Sort the dictionary based on the predefined order
        return {k: groups[k] for k in order if k in groups}

    def _initialize_model_groups(self) -> Dict[str, list]:
        """Restrict ``BASE_MODEL_GROUPS`` to the models present in ``MODEL_DATA``.

        "All" becomes the sorted list of every loaded model (including models
        that have no entry in ``MODEL_NAME_MAP``); every other group keeps its
        declared order and is dropped entirely when none of its models loaded.
        """
        # Get the list of available models from the loaded data
        available_models = set(self.MODEL_DATA)

        # Create filtered groups based on available models
        filtered_groups = {}
        for group_name, models in self.BASE_MODEL_GROUPS.items():
            if group_name == "All":
                filtered_groups[group_name] = sorted(available_models)
                continue
            filtered_models = [model for model in models if model in available_models]
            if filtered_models:  # Only include group if it has models
                filtered_groups[group_name] = filtered_models
        return filtered_groups

    def _load_model_data(self) -> Dict[str, Any]:
        """Return the per-model keyword statistics, keyed by model identifier."""
        raise NotImplementedError("Subclasses must implement _load_model_data")

    def _load_summary_data(self) -> Dict[str, Any]:
        """Return the per-model summary scores, keyed by model identifier."""
        raise NotImplementedError("Subclasses must implement _load_summary_data")

    def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame:
        """Return the score table for one super group and one model group."""
        raise NotImplementedError("Subclasses must implement get_df")

    def get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]:
        """Return ``(headers, rows)`` for one super group and one model group."""
        raise NotImplementedError("Subclasses must implement get_leaderboard_data")
class DefaultDataLoader(BaseDataLoader):
    """Loader for the result files under ``./static/eval_results/Default``.

    Each table row carries the model's display name, four summary scores and
    one score per keyword of the selected super group. All scores are scaled
    by 100 and rounded to two decimals.
    """

    # Columns filled from the per-model summary rather than the keyword stats.
    _SUMMARY_COLUMNS = ("Models", "Overall", "Core(w/o CoT)", "Core(w/ CoT)", "Open-ended")

    def _load_model_data(self) -> Dict[str, Any]:
        """Read the per-model keyword statistics from disk."""
        # Explicit encoding: do not depend on the platform's default codec.
        with open("./static/eval_results/Default/all_model_keywords_stats.json", "r", encoding="utf-8") as f:
            return json.load(f)

    def _load_summary_data(self) -> Dict[str, Any]:
        """Read the per-model summary scores from disk."""
        with open("./static/eval_results/Default/all_summary.json", "r", encoding="utf-8") as f:
            return json.load(f)

    def _keyword_columns(self, selected_super_group: str) -> list:
        """Return ``(column_label, original_keyword)`` pairs for a super group.

        A keyword whose display name equals a summary column name (e.g.
        "Open-ended") would overwrite that summary score in the row and
        duplicate the header, so such a keyword is labelled
        ``"<keyword> (<super group>)"`` instead.
        """
        columns = []
        for keyword in self.SUPER_GROUPS[selected_super_group]:
            label = keyword
            if label in self._SUMMARY_COLUMNS:
                label = f"{keyword} ({selected_super_group})"
            columns.append((label, get_original_keyword(keyword)))
        return columns

    def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame:
        """Build the score table, sorted by "Overall" in descending order.

        Args:
            selected_super_group: display name of a dimension (a key of
                ``SUPER_GROUPS``).
            selected_model_group: a key of ``MODEL_GROUPS``.

        Returns:
            One row per model in the group. A keyword the model has no stats
            for is stored as ``None``.
        """
        original_dimension = get_original_dimension(selected_super_group)
        keyword_columns = self._keyword_columns(selected_super_group)

        data = []
        for model in self.MODEL_GROUPS[selected_model_group]:
            model_data = self.MODEL_DATA[model]
            summary = self.SUMMARY_DATA[model]
            row = {
                "Models": get_display_model_name(model),
                "Overall": round(summary["overall_score"] * 100, 2),
                "Core(w/o CoT)": round(summary["core_noncot"]["macro_mean_score"] * 100, 2),
                "Core(w/ CoT)": round(summary["core_cot"]["macro_mean_score"] * 100, 2),
                "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2),
            }
            dimension_stats = model_data.get(original_dimension, {})
            for label, original_keyword in keyword_columns:
                if original_keyword in dimension_stats:
                    row[label] = round(dimension_stats[original_keyword]["average_score"] * 100, 2)
                else:
                    row[label] = None
            data.append(row)

        # Passing the columns explicitly keeps the frame well-formed (and
        # sortable) even when the group contains no models.
        columns = list(self._SUMMARY_COLUMNS) + [label for label, _ in keyword_columns]
        df = pd.DataFrame(data, columns=columns)
        return df.sort_values(by="Overall", ascending=False)

    def get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]:
        """Return ``(headers, rows)`` for the table produced by ``get_df``.

        The headers are the summary columns followed by the keyword columns
        of the selected super group; rows are plain lists in header order.
        """
        df = self.get_df(selected_super_group, selected_model_group)
        headers = list(df.columns)
        data = df.values.tolist()
        return headers, data
class CoreSingleDataLoader(BaseDataLoader):
    """Loader for the result files under ``./static/eval_results/Core_SI``.

    Each table row carries the model's display name, its "Core SI" score and
    one score per keyword of the selected super group. All scores are scaled
    by 100 and rounded to two decimals.
    """

    def _load_model_data(self) -> Dict[str, Any]:
        """Read the per-model keyword statistics from disk."""
        # Explicit encoding: do not depend on the platform's default codec.
        with open("./static/eval_results/Core_SI/all_model_keywords_stats.json", "r", encoding="utf-8") as f:
            return json.load(f)

    def _load_summary_data(self) -> Dict[str, Any]:
        """Read the per-model summary scores from disk."""
        with open("./static/eval_results/Core_SI/all_summary.json", "r", encoding="utf-8") as f:
            return json.load(f)

    def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame:
        """Build the score table, sorted by "Core SI" in descending order.

        Args:
            selected_super_group: display name of a dimension (a key of
                ``SUPER_GROUPS``).
            selected_model_group: a key of ``MODEL_GROUPS``.

        Returns:
            One row per model in the group. A keyword the model has no stats
            for is stored as ``None``.
        """
        original_dimension = get_original_dimension(selected_super_group)
        keywords = self.SUPER_GROUPS[selected_super_group]
        # Resolve the raw keyword names once instead of once per model.
        original_keywords = [get_original_keyword(keyword) for keyword in keywords]

        data = []
        for model in self.MODEL_GROUPS[selected_model_group]:
            model_data = self.MODEL_DATA[model]
            summary = self.SUMMARY_DATA[model]
            row = {
                "Models": get_display_model_name(model),
                "Core SI": round(summary["macro_mean_score"] * 100, 2),
            }
            dimension_stats = model_data.get(original_dimension, {})
            for keyword, original_keyword in zip(keywords, original_keywords):
                if original_keyword in dimension_stats:
                    row[keyword] = round(dimension_stats[original_keyword]["average_score"] * 100, 2)
                else:
                    row[keyword] = None
            data.append(row)

        # Passing the columns explicitly keeps the frame well-formed (and
        # sortable) even when the group contains no models.
        df = pd.DataFrame(data, columns=["Models", "Core SI"] + list(keywords))
        return df.sort_values(by="Core SI", ascending=False)

    def get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]:
        """Return ``(headers, rows)`` for the table produced by ``get_df``.

        The headers are "Models" and "Core SI" followed by the keywords of
        the selected super group; rows are plain lists in header order.
        """
        df = self.get_df(selected_super_group, selected_model_group)
        headers = ["Models", "Core SI"] + self.SUPER_GROUPS[selected_super_group]
        data = df[headers].values.tolist()
        return headers, data
# Helper functions translating between raw keys and display names
def get_original_dimension(mapped_dimension):
    """Return the raw dimension key whose display name is *mapped_dimension*.

    This is the reverse lookup of ``DIMENSION_NAME_MAP``.

    Raises:
        ValueError: if no dimension has that display name. (A bare
            ``StopIteration`` from ``next()`` would be misleading and can be
            silently swallowed by enclosing iteration machinery.)
    """
    for original, display in DIMENSION_NAME_MAP.items():
        if display == mapped_dimension:
            return original
    raise ValueError(f"Unknown dimension display name: {mapped_dimension!r}")
def get_original_keyword(mapped_keyword):
    """Return the raw keyword whose display label is *mapped_keyword*.

    This is the reverse lookup of ``KEYWORD_NAME_MAP``. A label with no entry
    in the map is returned unchanged, mirroring the forward lookup's
    fall-through for keywords that have no short label.
    """
    for original, display in KEYWORD_NAME_MAP.items():
        if display == mapped_keyword:
            return original
    return mapped_keyword
def get_display_model_name(model_name):
    """Return the display name for *model_name*.

    Falls back to *model_name* itself when it has no entry in
    ``MODEL_NAME_MAP``.
    """
    return MODEL_NAME_MAP.get(model_name, model_name)