Spaces:
Running
Running
Moved overall results. Showing README.
Browse files
README.md
CHANGED
@@ -1,13 +1,17 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
emoji: 🤔
|
4 |
colorFrom: purple
|
5 |
colorTo: blue
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 5.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: bsd-3-clause
|
11 |
---
|
12 |
|
13 |
-
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: Verbal Reasoning Challenge
|
3 |
emoji: 🤔
|
4 |
colorFrom: purple
|
5 |
colorTo: blue
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 5.15.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: bsd-3-clause
|
11 |
---
|
12 |
|
13 |
+
# PhD Knowledge Not Required: A Reasoning Challenge for Large Language Models
|
14 |
+
|
15 |
+
This application presents the results of several models that we have
|
16 |
+
evaluated on the verbal reasoning challenge. The overall results are below.
|
17 |
+
Use the tabs above to explore the results in more detail.
|
app.py
CHANGED
@@ -23,7 +23,7 @@ import pandas as pd
|
|
23 |
import numpy as np
|
24 |
from metrics import load_results, accuracy_by_model_and_time
|
25 |
import metrics
|
26 |
-
|
27 |
|
28 |
def get_model_response(prompt_id, model_name):
|
29 |
query = f"""
|
@@ -226,14 +226,24 @@ def all_challenges_view():
|
|
226 |
|
227 |
|
228 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
229 |
|
230 |
def create_interface():
|
231 |
with gr.Blocks() as demo:
|
232 |
with gr.Tabs():
|
|
|
|
|
233 |
with gr.TabItem("All Challenges"):
|
234 |
all_challenges_view()
|
235 |
-
with gr.TabItem("Accuracy by Model"):
|
236 |
-
gr.DataFrame(metrics.accuracy_by_model(conn).to_df())
|
237 |
with gr.TabItem("Accuracy Over Time"):
|
238 |
summary_view()
|
239 |
with gr.TabItem("DeepSeek R1 Analysis"):
|
|
|
23 |
import numpy as np
|
24 |
from metrics import load_results, accuracy_by_model_and_time
|
25 |
import metrics
|
26 |
+
from pathlib import Path
|
27 |
|
28 |
def get_model_response(prompt_id, model_name):
|
29 |
query = f"""
|
|
|
226 |
|
227 |
|
228 |
|
229 |
+
def overview_view():
    """Render the Overview tab: README prose followed by the overall accuracy table."""
    with gr.Blocks(fill_height=True):
        with gr.Row():
            readme_text = Path("README.md").read_text()
            # Strip the YAML front matter: keep only the text *after* the second
            # "---" delimiter. maxsplit=2 ensures any "---" appearing later in
            # the README body is preserved rather than truncating the content.
            readme_text = readme_text.split("---", 2)[2]
            gr.Markdown(readme_text)
        with gr.Row():
            # Overall accuracy-by-model summary table (conn is the module-level
            # DB connection defined elsewhere in app.py).
            gr.DataFrame(metrics.accuracy_by_model(conn).to_df())
238 |
+
|
239 |
|
240 |
def create_interface():
|
241 |
with gr.Blocks() as demo:
|
242 |
with gr.Tabs():
|
243 |
+
with gr.TabItem("Overview"):
|
244 |
+
overview_view()
|
245 |
with gr.TabItem("All Challenges"):
|
246 |
all_challenges_view()
|
|
|
|
|
247 |
with gr.TabItem("Accuracy Over Time"):
|
248 |
summary_view()
|
249 |
with gr.TabItem("DeepSeek R1 Analysis"):
|