StringChaos
committed on
Commit
·
6fe41a3
1
Parent(s):
b3bc767
explorer updated evals
Browse files
- all_outputs.json +2 -2
- app.py +3 -1
- templates/index.html +3 -3
- templates/index_mini.html +3 -3
all_outputs.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b8fa8293294ed03c607e4c0d861b50203bd6aee395414d8f5d9d7e9a7853acc9
|
3 |
+
size 284487753
|
app.py
CHANGED
@@ -56,6 +56,7 @@ all_evaluations_by_problem_colored = [
|
|
56 |
for model in all_models
|
57 |
},
|
58 |
problems[idx]["difficulty"],
|
|
|
59 |
)
|
60 |
for trueidx, idx in enumerate(random_idxs)
|
61 |
]
|
@@ -110,6 +111,7 @@ def problem(problem_idx):
|
|
110 |
mini_models = [
|
111 |
# "DeepSeek-V2",
|
112 |
"DeepSeek-V3",
|
|
|
113 |
# "DSCoder-33b-Ins",
|
114 |
# "GPT-4-Turbo-2024-04-09",
|
115 |
"GPT-4O-2024-05-13",
|
@@ -117,8 +119,8 @@ mini_models = [
|
|
117 |
"Gemini-Flash-2.0-Thinking",
|
118 |
# "Gemini-Exp-1206",
|
119 |
# "Claude-3-Sonnet",
|
120 |
-
"O1-Mini-2024-09-12 (N=1)",
|
121 |
"O1-2024-12-17 (N=1) (High)",
|
|
|
122 |
]
|
123 |
|
124 |
|
|
|
56 |
for model in all_models
|
57 |
},
|
58 |
problems[idx]["difficulty"],
|
59 |
+
problems[idx]["question_id"],
|
60 |
)
|
61 |
for trueidx, idx in enumerate(random_idxs)
|
62 |
]
|
|
|
111 |
mini_models = [
|
112 |
# "DeepSeek-V2",
|
113 |
"DeepSeek-V3",
|
114 |
+
"DeepSeek-R1-Preview",
|
115 |
# "DSCoder-33b-Ins",
|
116 |
# "GPT-4-Turbo-2024-04-09",
|
117 |
"GPT-4O-2024-05-13",
|
|
|
119 |
"Gemini-Flash-2.0-Thinking",
|
120 |
# "Gemini-Exp-1206",
|
121 |
# "Claude-3-Sonnet",
|
|
|
122 |
"O1-2024-12-17 (N=1) (High)",
|
123 |
+
"QwQ-32B-Preview (N=1)",
|
124 |
]
|
125 |
|
126 |
|
templates/index.html
CHANGED
@@ -86,7 +86,7 @@
|
|
86 |
<table id='model-table' style="align:center">
|
87 |
<thead>
|
88 |
<tr>
|
89 |
-
<th>
|
90 |
<th>Difficulty</th>
|
91 |
{% for model in models %}
|
92 |
<th class="column-{{ model }}">{{ model }}</th>
|
@@ -94,9 +94,9 @@
|
|
94 |
</tr>
|
95 |
</thead>
|
96 |
<tbody>
|
97 |
-
{% for problem_idx, problem, difficulty in problems %}
|
98 |
<tr>
|
99 |
-
<td> <a href="{{ url_for('problem', problem_idx=problem_idx) }}"> {{
|
100 |
<td> {{ difficulty }} </td>
|
101 |
{% for model in models %}
|
102 |
<td style="background-color: {{ problem[model]['correctness_color'] }};" class="column-{{ model }}">
|
|
|
86 |
<table id='model-table' style="align:center">
|
87 |
<thead>
|
88 |
<tr>
|
89 |
+
<th>Question ID</th>
|
90 |
<th>Difficulty</th>
|
91 |
{% for model in models %}
|
92 |
<th class="column-{{ model }}">{{ model }}</th>
|
|
|
94 |
</tr>
|
95 |
</thead>
|
96 |
<tbody>
|
97 |
+
{% for problem_idx, problem, difficulty, question_id in problems %}
|
98 |
<tr>
|
99 |
+
<td> <a href="{{ url_for('problem', problem_idx=problem_idx) }}"> {{ question_id }} </a> </td>
|
100 |
<td> {{ difficulty }} </td>
|
101 |
{% for model in models %}
|
102 |
<td style="background-color: {{ problem[model]['correctness_color'] }};" class="column-{{ model }}">
|
templates/index_mini.html
CHANGED
@@ -85,7 +85,7 @@
|
|
85 |
<table id='model-table' style="align:center">
|
86 |
<thead>
|
87 |
<tr>
|
88 |
-
<th>
|
89 |
<th>Difficulty</th>
|
90 |
{% for model in models %}
|
91 |
<th class="column-{{ model }}">{{ model }}</th>
|
@@ -93,9 +93,9 @@
|
|
93 |
</tr>
|
94 |
</thead>
|
95 |
<tbody>
|
96 |
-
{% for problem_idx, problem, difficulty in problems %}
|
97 |
<tr>
|
98 |
-
<td> <a href="{{ url_for('problem_mini', problem_idx=problem_idx) }}"> {{
|
99 |
<td> {{ difficulty }} </td>
|
100 |
{% for model in models %}
|
101 |
<td style="background-color: {{ problem[model]['correctness_color'] }};" class="column-{{ model }}">
|
|
|
85 |
<table id='model-table' style="align:center">
|
86 |
<thead>
|
87 |
<tr>
|
88 |
+
<th>Question ID</th>
|
89 |
<th>Difficulty</th>
|
90 |
{% for model in models %}
|
91 |
<th class="column-{{ model }}">{{ model }}</th>
|
|
|
93 |
</tr>
|
94 |
</thead>
|
95 |
<tbody>
|
96 |
+
{% for problem_idx, problem, difficulty, question_id in problems %}
|
97 |
<tr>
|
98 |
+
<td> <a href="{{ url_for('problem_mini', problem_idx=problem_idx) }}"> {{ question_id }} </a> </td>
|
99 |
<td> {{ difficulty }} </td>
|
100 |
{% for model in models %}
|
101 |
<td style="background-color: {{ problem[model]['correctness_color'] }};" class="column-{{ model }}">
|