CultriX committed
Commit 90cb3d2 · verified · 1 Parent(s): 7b7eb30

Update app.py

Files changed (1)
  1. app.py +117 -200
app.py CHANGED
@@ -1,208 +1,125 @@
- import gradio as gr
  import pandas as pd
  import matplotlib.pyplot as plt
- import seaborn as sns
- import plotly.graph_objs as go
- import plotly.io as pio
- from io import StringIO
- import base64
-
- # Read the data from the file
- def parse_data(file_content):
-     lines = file_content.splitlines()
-
-     model_data = []
-     current_model = None
-
-     for line in lines:
-         line = line.strip()
-         if line.startswith('hf (pretrained='):
-             current_model = line.split('pretrained=')[1].split(',')[0]
-         elif line and current_model:
-             if '|' in line:
-                 # Parse table row
-                 parts = [p.strip() for p in line.split('|')]
-                 if len(parts) >= 2:  # Ensure the correct number of columns
-                     try:
-                         task_name = parts[0]
-                         value = float(parts[1])  # Extract the numeric value
-                         model_data.append([
-                             current_model,
-                             task_name,  # Task name
-                             value
-                         ])
-                     except ValueError:
-                         print(f"Skipping row due to invalid value: {parts}")
-     if not model_data:
-         print("No valid data found in the file.")
-     return pd.DataFrame(model_data, columns=['Model', 'Task', 'Value'])
-
- # Calculate average performance
- def calculate_averages(data):
-     if data.empty:
-         print("No data available to calculate averages.")
-         return pd.DataFrame(columns=['Model', 'Average Performance'])
-     return data.groupby('Model')['Value'].mean().reset_index().rename(columns={'Value': 'Average Performance'})
-
- def create_bar_chart(df, category):
-     """Create a horizontal bar chart for the specified category."""
-     sorted_df = df[['Model', category]].sort_values(by=category, ascending=True)
-     fig = go.Figure(go.Bar(
-         x=sorted_df[category],
-         y=sorted_df['Model'],
-         orientation='h',
-         marker=dict(color=sorted_df[category], colorscale='Viridis'),
-         hoverinfo='x+y',
-         text=sorted_df[category],
-         textposition='auto'
-     ))
-     fig.update_layout(
-         margin=dict(l=20, r=20, t=20, b=20),
-         title=f"Leaderboard for {category} Scores"
-     )
-     return fig
-
- def generate_visualizations(data, averages):
-     sns.set(style='whitegrid')
-
-     if averages.empty:
-         print("No averages to visualize.")
-         return None, None, None, None, None, None
-
-     averages = averages.sort_values(by='Average Performance')
-
-     # Matplotlib average performance plot
      plt.figure(figsize=(12, 8))
-     sns.barplot(data=averages, x='Average Performance', y='Model', palette='viridis')
-     plt.title('Average Performance of Models', fontsize=16)
-     plt.xlabel('Average Performance', fontsize=12)
-     plt.ylabel('Model', fontsize=12)
      plt.tight_layout()
-
-     # Save the plot to a buffer
-     buffer_avg = StringIO()
-     plt.savefig(buffer_avg, format='png')
-     buffer_avg.seek(0)
-     image_avg = base64.b64encode(buffer_avg.read()).decode('utf-8')
-     plt.close()
-
-     # Line plot for task performance by model
-     sorted_models = averages['Model'].tolist()
-     data['Model'] = pd.Categorical(data['Model'], categories=sorted_models, ordered=True)
-     data = data.sort_values(by=['Model', 'Task'])
-
-     if data.empty:
-         print("No data available for line plot.")
-         return image_avg, None, None, None, None, None
-
      plt.figure(figsize=(14, 10))
-     sns.lineplot(data=data, x='Task', y='Value', hue='Model', marker='o')
-     plt.title('Task Performance by Model', fontsize=16)
-     plt.xlabel('Task', fontsize=12)
-     plt.ylabel('Performance', fontsize=12)
-     plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title='Model')
      plt.xticks(rotation=45)
      plt.tight_layout()
-
-     # Save the line plot to a buffer
-     buffer_line = StringIO()
-     plt.savefig(buffer_line, format='png')
-     buffer_line.seek(0)
-     image_line = base64.b64encode(buffer_line.read()).decode('utf-8')
-     plt.close()
-
-     # Heatmap of task performance
-     pivot_table = data.pivot_table(index='Task', columns='Model', values='Value')
-     plt.figure(figsize=(12, 10))
-     sns.heatmap(pivot_table, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)
-     plt.title('Task Performance Heatmap', fontsize=16)
-     plt.xlabel('Model', fontsize=12)
-     plt.ylabel('Task', fontsize=12)
-     plt.tight_layout()
-
-     # Save the heatmap to a buffer
-     buffer_heatmap = StringIO()
-     plt.savefig(buffer_heatmap, format='png')
-     buffer_heatmap.seek(0)
-     image_heatmap = base64.b64encode(buffer_heatmap.read()).decode('utf-8')
-     plt.close()
-
-     # Boxplot of performance distribution per model
-     plt.figure(figsize=(12, 8))
-     sns.boxplot(data=data, x='Model', y='Value', palette='Set2')
-     plt.title('Performance Distribution per Model', fontsize=16)
-     plt.xlabel('Model', fontsize=12)
-     plt.ylabel('Performance', fontsize=12)
-     plt.xticks(rotation=45)
      plt.tight_layout()
-
-     # Save the boxplot to a buffer
-     buffer_boxplot = StringIO()
-     plt.savefig(buffer_boxplot, format='png')
-     buffer_boxplot.seek(0)
-     image_boxplot = base64.b64encode(buffer_boxplot.read()).decode('utf-8')
-     plt.close()
-
-     # Create plotly bar charts
-     fig1 = create_bar_chart(averages, 'Average Performance')
-     plotly_avg = pio.to_html(fig1, full_html=False)
-
-     plotly_tasks = {}
-     # Assuming you have tasks in the dataframe and want to display it
-     tasks = data['Task'].unique()
-     for task in tasks:
-         task_data = data[data['Task'] == task]
-         fig2 = create_bar_chart(task_data, 'Value')
-         fig2.update_layout(title=f"Leaderboard for {task} Scores")
-         plotly_tasks[task] = pio.to_html(fig2, full_html=False)
-
-     return image_avg, image_line, image_heatmap, image_boxplot, plotly_avg, plotly_tasks
-
- def process_and_visualize(file_content):
-     data = parse_data(file_content)
-     averages = calculate_averages(data)
-
-     image_avg, image_line, image_heatmap, image_boxplot, plotly_avg, plotly_tasks = generate_visualizations(data, averages)
-
-     output_text = f"Average Performance per Model:\n{averages.sort_values(by='Average Performance').to_string()}"
-
-     return output_text, image_avg, image_line, image_heatmap, image_boxplot, plotly_avg, plotly_tasks
-
- if __name__ == "__main__":
-
-     task_names = ['tinyArc', 'tinyHellaswag', 'tinyMMLU', 'tinyTruthfulQA', 'tinyTruthfulQA_mc1', 'tinyWinogrande']
-
-     with gr.Blocks(title="LLM Benchmark Visualizer") as demo:
-         gr.Markdown("Upload your LLM benchmark data and visualize the results.")
-
-         with gr.Row():
-             input_text = gr.Textbox(lines=10, label="Paste your data here")
-
-         with gr.Row():
-             output_text = gr.Textbox(label="Average Performance per Model")
-
-         with gr.Row():
-             with gr.Column():
-                 image_avg = gr.Image(label="Matplotlib Average Performance Chart")
-                 image_line = gr.Image(label="Matplotlib Task Performance Line Chart")
-             with gr.Column():
-                 image_heatmap = gr.Image(label="Matplotlib Task Performance Heatmap")
-                 image_boxplot = gr.Image(label="Matplotlib Performance Distribution Boxplot")
-         with gr.Row():
-             plotly_avg = gr.HTML(label="Plotly Average Performance Chart")
-
-         task_tabs = gr.TabbedInterface([])
-
-         def update_tabs(file_content):
-             _, _, _, _, _, _, plotly_tasks = process_and_visualize(file_content)
-             return [gr.HTML(value=html, label=task) for task, html in plotly_tasks.items()]
-
-         input_text.change(
-             fn=process_and_visualize,
-             inputs=input_text,
-             outputs=[output_text, image_avg, image_line, image_heatmap, image_boxplot, plotly_avg],
-         )
-
-         input_text.change(fn=update_tabs, inputs=input_text, outputs=[task_tabs])
-
-     demo.launch(share=True)

  import pandas as pd
  import matplotlib.pyplot as plt
+ import gradio as gr
+
+ # Input data
+ data_full = [
+     ["CultriX/Qwen2.5-14B-SLERPv7", 0.7205, 0.8272, 0.7541, 0.6581, 0.5000, 0.7290],
+     ["djuna/Q2.5-Veltha-14B-0.5", 0.7492, 0.8386, 0.7305, 0.5980, 0.4300, 0.7817],
+     ["CultriX/Qwen2.5-14B-FinalMerge", 0.7248, 0.8277, 0.7113, 0.7052, 0.5700, 0.7001],
+     ["CultriX/Qwen2.5-14B-MultiCultyv2", 0.7295, 0.8359, 0.7363, 0.5767, 0.4400, 0.7316],
+     ["CultriX/Qwen2.5-14B-Brocav7", 0.7445, 0.8353, 0.7508, 0.6292, 0.4600, 0.7629],
+     ["CultriX/Qwen2.5-14B-Broca", 0.7456, 0.8352, 0.7480, 0.6034, 0.4400, 0.7716],
+     ["CultriX/Qwen2.5-14B-Brocav3", 0.7395, 0.8388, 0.7393, 0.6405, 0.4700, 0.7659],
+     ["CultriX/Qwen2.5-14B-Brocav4", 0.7432, 0.8377, 0.7444, 0.6277, 0.4800, 0.7580],
+     ["CultriX/Qwen2.5-14B-Brocav2", 0.7492, 0.8302, 0.7508, 0.6377, 0.5100, 0.7478],
+     ["CultriX/Qwen2.5-14B-Brocav5", 0.7445, 0.8313, 0.7547, 0.6376, 0.5000, 0.7304],
+     ["CultriX/Qwen2.5-14B-Brocav6", 0.7179, 0.8354, 0.7531, 0.6378, 0.4900, 0.7524],
+     ["CultriX/Qwenfinity-2.5-14B", 0.7347, 0.8254, 0.7279, 0.7267, 0.5600, 0.6970],
+     ["CultriX/Qwen2.5-14B-Emergedv2", 0.7137, 0.8335, 0.7363, 0.5836, 0.4400, 0.7344],
+     ["CultriX/Qwen2.5-14B-Unity", 0.7063, 0.8343, 0.7423, 0.6820, 0.5700, 0.7498],
+     ["CultriX/Qwen2.5-14B-MultiCultyv3", 0.7132, 0.8216, 0.7395, 0.6792, 0.5500, 0.7120],
+     ["CultriX/Qwen2.5-14B-Emergedv3", 0.7436, 0.8312, 0.7519, 0.6585, 0.5500, 0.7068],
+     ["CultriX/SeQwence-14Bv1", 0.7278, 0.8410, 0.7541, 0.6816, 0.5200, 0.7539],
+     ["CultriX/Qwen2.5-14B-Wernickev2", 0.7391, 0.8168, 0.7273, 0.6220, 0.4500, 0.7572],
+     ["CultriX/Qwen2.5-14B-Wernickev3", 0.7357, 0.8148, 0.7245, 0.7023, 0.5500, 0.7869],
+     ["CultriX/Qwen2.5-14B-Wernickev4", 0.7355, 0.8290, 0.7497, 0.6306, 0.4800, 0.7635],
+     ["CultriX/SeQwential-14B-v1", 0.7355, 0.8205, 0.7549, 0.6367, 0.4800, 0.7626],
+     ["CultriX/Qwen2.5-14B-Wernickev5", 0.7224, 0.8272, 0.7541, 0.6790, 0.5100, 0.7578],
+     ["CultriX/Qwen2.5-14B-Wernickev6", 0.6994, 0.7549, 0.5816, 0.6991, 0.5800, 0.7267],
+     ["CultriX/Qwen2.5-14B-Wernickev7", 0.7147, 0.7599, 0.6097, 0.7056, 0.5700, 0.7164],
+     ["CultriX/Qwen2.5-14B-FinalMerge-tmp2", 0.7255, 0.8192, 0.7535, 0.6671, 0.5000, 0.7612],
+ ]
+
+ columns = ["Model Configuration", "tinyArc", "tinyHellaswag", "tinyMMLU", "tinyTruthfulQA", "tinyTruthfulQA_mc1", "tinyWinogrande"]
+
+ # Convert to DataFrame
+ df_full = pd.DataFrame(data_full, columns=columns)
+
+ def plot_average_scores():
+     df_full["Average Score"] = df_full.iloc[:, 1:].mean(axis=1)
+     df_avg_sorted = df_full.sort_values(by="Average Score", ascending=False)
+
      plt.figure(figsize=(12, 8))
+     plt.barh(df_avg_sorted["Model Configuration"], df_avg_sorted["Average Score"])
+     plt.title("Average Performance of Models Across Tasks", fontsize=16)
+     plt.xlabel("Average Score", fontsize=14)
+     plt.ylabel("Model Configuration", fontsize=14)
+     plt.gca().invert_yaxis()
+     plt.grid(axis='x', linestyle='--', alpha=0.7)
      plt.tight_layout()
+     plt.savefig("average_performance.png")
+     return "average_performance.png"
+
+ def plot_task_performance():
+     df_full_melted = df_full.melt(id_vars="Model Configuration", var_name="Task", value_name="Score")
+
      plt.figure(figsize=(14, 10))
+     for model in df_full["Model Configuration"]:
+         model_data = df_full_melted[df_full_melted["Model Configuration"] == model]
+         plt.plot(model_data["Task"], model_data["Score"], marker="o", label=model)
+
+     plt.title("Performance of All Models Across Tasks", fontsize=16)
+     plt.xlabel("Task", fontsize=14)
+     plt.ylabel("Score", fontsize=14)
      plt.xticks(rotation=45)
+     plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=9)
+     plt.grid(axis='y', linestyle='--', alpha=0.7)
      plt.tight_layout()
+     plt.savefig("task_performance.png")
+     return "task_performance.png"
+
+ def plot_task_specific_top_models():
+     top_models = df_full.iloc[:, :-1].set_index("Model Configuration").idxmax()
+     top_scores = df_full.iloc[:, :-1].set_index("Model Configuration").max()
+
+     results = pd.DataFrame({"Top Model": top_models, "Score": top_scores}).reset_index().rename(columns={"index": "Task"})
+
+     plt.figure(figsize=(12, 6))
+     plt.bar(results["Task"], results["Score"])
+     plt.title("Task-Specific Top Models", fontsize=16)
+     plt.xlabel("Task", fontsize=14)
+     plt.ylabel("Score", fontsize=14)
+     plt.grid(axis="y", linestyle="--", alpha=0.7)
      plt.tight_layout()
+     plt.savefig("task_specific_top_models.png")
+     return "task_specific_top_models.png"
+
+ def top_3_models_per_task():
+     top_3_data = {
+         task: df_full.nlargest(3, task)[["Model Configuration", task]].values.tolist()
+         for task in df_full.columns[1:-1]
+     }
+     top_3_results = pd.DataFrame({
+         task: {
+             "Top 3 Models": [entry[0] for entry in top_3_data[task]],
+             "Scores": [entry[1] for entry in top_3_data[task]],
+         }
+         for task in top_3_data
+     }).T.rename_axis("Task").reset_index()
+     return top_3_results
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# Model Performance Analysis")
+
+     with gr.Row():
+         btn1 = gr.Button("Show Average Performance")
+         img1 = gr.Image(type="filepath")
+         btn1.click(plot_average_scores, outputs=img1)
+
+     with gr.Row():
+         btn2 = gr.Button("Show Task Performance")
+         img2 = gr.Image(type="filepath")
+         btn2.click(plot_task_performance, outputs=img2)
+
+     with gr.Row():
+         btn3 = gr.Button("Task-Specific Top Models")
+         img3 = gr.Image(type="filepath")
+         btn3.click(plot_task_specific_top_models, outputs=img3)
+
+     with gr.Row():
+         btn4 = gr.Button("Top 3 Models Per Task")
+         output4 = gr.Dataframe()
+         btn4.click(top_3_models_per_task, outputs=output4)
+
+ demo.launch()
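
A note on the new code: plot_average_scores writes an "Average Score" column into the module-level df_full, which makes the positional column slices elsewhere call-order-dependent. Before that button is clicked, df_full.iloc[:, :-1] in plot_task_specific_top_models and df_full.columns[1:-1] in top_3_models_per_task silently drop tinyWinogrande; after it is clicked, plot_task_performance melts "Average Score" in as if it were a seventh task, and a second click folds the previous average into the new mean. A minimal call-order-safe sketch (the TASK_COLUMNS, with_average, and task_leaders names are illustrative, not part of this commit):

import pandas as pd

# Name the task columns explicitly so positional slices cannot drift
# when extra columns (e.g. "Average Score") are appended to the frame.
# These names mirror the `columns` list defined in the commit.
TASK_COLUMNS = ["tinyArc", "tinyHellaswag", "tinyMMLU",
                "tinyTruthfulQA", "tinyTruthfulQA_mc1", "tinyWinogrande"]

def with_average(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of df with an 'Average Score' column; df is left untouched."""
    out = df.copy()
    out["Average Score"] = out[TASK_COLUMNS].mean(axis=1)
    return out

def task_leaders(df: pd.DataFrame) -> pd.DataFrame:
    """Best model and score per task, independent of which button ran first."""
    scores = df.set_index("Model Configuration")[TASK_COLUMNS]
    return (pd.DataFrame({"Top Model": scores.idxmax(), "Score": scores.max()})
            .rename_axis("Task")
            .reset_index())

Selecting tasks by name keeps every view consistent regardless of click order; plot_average_scores would then sort with_average(df_full) locally instead of mutating the shared frame.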