Cosmetic changes, update results
- app.py +61 -23
- results/Bgym-Claude-3.5-Sonnet/README.md +1 -0
- results/Bgym-Claude-3.5-Sonnet/workarena-l1.json +16 -0
- results/Bgym-Claude-3.5-Sonnet/workarena-l2.json +16 -0
- results/{test-agent → Bgym-Claude-3.5-Sonnet}/workarena-l3.json +3 -3
- results/Bgym-GPT-3.5/workarena-l1.json +0 -28
- results/Bgym-GPT-4o-V/config.json +0 -4
- results/Bgym-GPT-4o-mini/README.md +1 -0
- results/{test-agent → Bgym-GPT-4o-mini}/miniwob.json +3 -3
- results/{test-agent → Bgym-GPT-4o-mini}/workarena-l1.json +3 -3
- results/{test-agent → Bgym-GPT-4o-mini}/workarena-l2.json +3 -3
- results/Bgym-GPT-4o/config.json +0 -4
- results/Bgym-GPT-4o/miniwob.json +2 -2
- results/Bgym-GPT-4o/workarena-l1.json +2 -2
- results/Bgym-GPT-4o/workarena-l2.json +2 -2
- results/Bgym-GPT-o1-mini/README.md +1 -0
- results/{test-agent/webarena.json → Bgym-GPT-o1-mini/workarena-l1.json} +4 -4
- results/Bgym-GPT-o1-mini/workarena-l2.json +16 -0
- results/Bgym-Llama-3-70b/config.json +0 -4
- results/Bgym-Llama-3-70b/workarena-l1.json +0 -42
- results/Bgym-Llama-3.1-70b/README.md +1 -0
- results/Bgym-Llama-3.1-70b/workarena-l1.json +16 -0
- results/Bgym-Llama-3.1-70b/workarena-l2.json +16 -0
- results/Bgym-Mixtral-8x22b/config.json +0 -4
- results/Bgym-Mixtral-8x22b/workarena-l1.json +0 -28
- results/test-agent/README.md +0 -1
app.py
CHANGED

@@ -9,6 +9,7 @@ import plotly.graph_objs as go
 from huggingface_hub import HfApi
 from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
 import streamlit.components.v1 as components
+from datetime import datetime
 
 from urllib.parse import quote
 from pathlib import Path
@@ -49,6 +50,26 @@ def sanitize_cell_value(value: Any) -> str:
     return html.escape(str(value))
 
 def create_html_table_main(df):
+    col1, col2 = st.columns([2,6])
+    with col1:
+        sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("WebArena"), key="main_sort_column")
+    with col2:
+        sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key="main_sort_order")
+
+    def get_sort_value(row):
+        if row == "-":
+            return float('-inf')
+        else:
+            try:
+                return float(row)
+            except ValueError:
+                return row
+
+    # Sort dataframe
+    if sort_order == "Ascending":
+        df = df.sort_values(by=sort_column, key=lambda x: x.apply(get_sort_value))
+    else:
+        df = df.sort_values(by=sort_column, ascending=False, key=lambda x: x.apply(get_sort_value))
     html = '''
     <style>
     table {
@@ -87,7 +108,28 @@ def create_html_table_main(df):
     html += '</div>'
     return html
 
-def create_html_table_benchmark(df):
+def create_html_table_benchmark(df, benchmark):
+    col1, col2 = st.columns([2,6])
+    with col1:
+        sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("Score"), key=f"benchmark_sort_column_{benchmark}")
+    with col2:
+        sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key=f"benchmark_sort_order_{benchmark}")
+
+    def get_sort_value(row):
+        if row == "-":
+            return float('-inf')
+        else:
+            try:
+                return float(row)
+            except ValueError:
+                return row
+
+    # Sort dataframe
+    if sort_order == "Ascending":
+        df = df.sort_values(by=sort_column, key=lambda x: x.apply(get_sort_value))
+    else:
+        df = df.sort_values(by=sort_column, ascending=False, key=lambda x: x.apply(get_sort_value))
+
     html = '''
     <style>
     table {
@@ -127,6 +169,8 @@ def create_html_table_benchmark(df):
             html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
         elif column == "Reproduced_all":
             continue
+        # elif column == "Score":
+        #     html += f'<td>{row[column]}</td>'
         else:
             html += f'<td>{sanitize_cell_value(row[column])}</td>'
     html += '</tr>'
@@ -183,7 +227,10 @@ def main():
            continue
        agent_results = []
        for benchmark in BENCHMARKS:
-
+            file_path = safe_path_join(agent, f"{benchmark.lower()}.json")
+            if not file_path.is_file():
+                continue
+            with open(file_path) as f:
                agent_results.extend(json.load(f))
        all_results[agent] = agent_results
 
@@ -217,11 +264,9 @@ def main():
    if dfs_to_concat:
        df = pd.concat(dfs_to_concat, ignore_index=True)
 
-
-
-
-        df = df.sort_values(by='WebArena', ascending=False)
-
+        for benchmark in BENCHMARKS:
+            df[benchmark] = df[benchmark].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
+            df[benchmark] = df[benchmark].astype(str)
    # Add a search bar
    search_query = st.text_input("Search agents", "", key="search_main")
 
@@ -240,14 +285,6 @@ def main():
            return ""
 
    df['Agent'] = df['Agent'].apply(make_hyperlink)
-    # st.dataframe(
-    #     df[['Agent'] + BENCHMARKS],
-    #     use_container_width=True,
-    #     column_config={benchmark: {'alignment': 'center'} for benchmark in BENCHMARKS},
-    #     hide_index=True,
-    #     # height=int(len(df) * 36.2),
-    # )
-    # st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)
    html_table = create_html_table_main(df)
    st.markdown(html_table, unsafe_allow_html=True)
 
@@ -395,18 +432,21 @@ MIT
        for value in values:
            if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Original":
                result_dict["Score"] = value["score"]
+                result_dict["std_err"] = value["std_err"]
                result_dict["Benchmark Specific"] = value["benchmark_specific"]
                result_dict["Benchmark Tuned"] = value["benchmark_tuned"]
                result_dict["Followed Evaluation Protocol"] = value["followed_evaluation_protocol"]
                result_dict["Reproducible"] = value["reproducible"]
                result_dict["Comments"] = value["comments"]
                result_dict["Study ID"] = value["study_id"]
+                value["date_time"] = datetime.strptime(value["date_time"], "%Y-%m-%d %H:%M:%S").strftime("%B %d, %Y %I:%M %p")
                result_dict["Date"] = value["date_time"]
                result_dict["Reproduced"] = []
                result_dict["Reproduced_all"] = []
                flag = 1
            if not flag:
                result_dict["Score"] = "-"
+                result_dict["std_err"] = "-"
                result_dict["Benchmark Specific"] = "-"
                result_dict["Benchmark Tuned"] = "-"
                result_dict["Followed Evaluation Protocol"] = "-"
@@ -418,6 +458,7 @@ MIT
                result_dict["Reproduced_all"] = []
            if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Reproduced":
                result_dict["Reproduced"].append(value["score"])
+                value["date_time"] = datetime.strptime(value["date_time"], "%Y-%m-%d %H:%M:%S").strftime("%B %d, %Y %I:%M %p")
                result_dict["Reproduced_all"].append(", ".join([str(value["score"]), str(value["date_time"])]))
            if result_dict["Reproduced"]:
                result_dict["Reproduced"] = str(min(result_dict["Reproduced"])) + " - " + str(max(result_dict["Reproduced"]))
@@ -435,14 +476,11 @@ MIT
        # Concatenate the DataFrames
        if dfs_to_concat:
            df_ = pd.concat(dfs_to_concat, ignore_index=True)
-
-
-
-
-
-            #     hide_index=True,
-            # )
-            html_table = create_html_table_benchmark(df_)
+            df_['Score'] = df_['Score'].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
+            df_['Score'] = df_['Score'].astype(str)
+            df_['Score'] = df_.apply(lambda row: f"{row['Score']} ± {row['std_err']}", axis=1)
+            df_ = df_.drop(columns=['std_err'])
+            html_table = create_html_table_benchmark(df_, benchmark)
            st.markdown(html_table, unsafe_allow_html=True)
 
 
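For reference, the snippet below is a minimal, self-contained sketch (not part of the commit) of the sorting and formatting behaviour the app.py diff introduces: the `get_sort_value` helper that pushes "-" placeholders to the bottom of a sort, the `score ± std_err` rendering, and the date reformatting. The column names and "-" placeholder follow the diff above; the sample DataFrame contents are invented purely for illustration.

```python
from datetime import datetime

import pandas as pd


def get_sort_value(cell):
    # "-" marks a missing score and should sort below every real number.
    if cell == "-":
        return float("-inf")
    try:
        return float(cell)
    except ValueError:
        return cell  # non-numeric cells fall back to plain string ordering


# Invented sample data, shaped like one benchmark column of the leaderboard.
df = pd.DataFrame({
    "Agent": ["A", "B", "C"],
    "WorkArena-L1": [45.5, "-", 27.9],
    "std_err": [2.7, "-", 2.5],
})

# Same pattern as the new create_html_table_* code: sort on a chosen column,
# mapping each cell through get_sort_value so "-" never wins.
df = df.sort_values(
    by="WorkArena-L1", ascending=False, key=lambda s: s.apply(get_sort_value)
)

# Score formatting as in the benchmark table: "45.50 ± 2.7"; "-" stays "-".
df["WorkArena-L1"] = df.apply(
    lambda row: f"{row['WorkArena-L1']:.2f} ± {row['std_err']}"
    if row["WorkArena-L1"] != "-" else "-",
    axis=1,
)

# Date rendering as in the diff: "2021-01-01 12:00:00" -> "January 01, 2021 12:00 PM".
print(datetime.strptime("2021-01-01 12:00:00", "%Y-%m-%d %H:%M:%S")
      .strftime("%B %d, %Y %I:%M %p"))
print(df[["Agent", "WorkArena-L1"]])
```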
results/Bgym-Claude-3.5-Sonnet/README.md
ADDED

@@ -0,0 +1 @@
+### Claude 3.5 Sonnet model
results/Bgym-Claude-3.5-Sonnet/workarena-l1.json
ADDED

@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-Claude-3.5-Sonnet",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 56.4,
+        "std_err": 2.7,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    }
+]
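As an aside, the JSON files added in this commit all share the record shape shown above. The sketch below is a hypothetical helper (not code from app.py) that loads every per-benchmark file for one agent and checks that the fields the leaderboard reads are present; the field list is taken from the files in this commit, while `load_results` and `REQUIRED_FIELDS` are illustrative names.

```python
import json
from pathlib import Path

# Field names as they appear in results/<agent>/<benchmark>.json in this commit.
REQUIRED_FIELDS = {
    "agent_name", "study_id", "benchmark", "score", "std_err",
    "benchmark_specific", "benchmark_tuned", "followed_evaluation_protocol",
    "reproducible", "comments", "original_or_reproduced", "date_time",
}


def load_results(agent_dir: Path) -> list:
    """Collect every benchmark entry for one agent, checking the schema."""
    entries = []
    for path in sorted(agent_dir.glob("*.json")):
        with open(path) as f:
            records = json.load(f)  # each file holds a JSON array of entries
        for record in records:
            missing = REQUIRED_FIELDS - record.keys()
            if missing:
                raise ValueError(f"{path}: missing fields {sorted(missing)}")
            entries.append(record)
    return entries


# Example (path exists in this repo after the commit):
# entries = load_results(Path("results/Bgym-Claude-3.5-Sonnet"))
```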
results/Bgym-Claude-3.5-Sonnet/workarena-l2.json
ADDED

@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-Claude-3.5-Sonnet",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L2",
+        "score": 39.1,
+        "std_err": 3.2,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
results/{test-agent → Bgym-Claude-3.5-Sonnet}/workarena-l3.json
RENAMED

@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "
+        "agent_name": "Bgym-Claude-3.5-Sonnet",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L3",
-        "score": 0.
-        "std_err": 0.
+        "score": 0.4,
+        "std_err": 0.4,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-3.5/workarena-l1.json
CHANGED

@@ -12,33 +12,5 @@
         "reproducible": "Yes",
         "comments": "NA",
         "original_or_reproduced": "Original"
-    },
-    {
-        "agent_name": "Bgym-GPT-3.5",
-        "study_id": "study_id",
-        "benchmark": "WorkArena-L1",
-        "score": 5.7,
-        "std_err": 0.3,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Reproduced",
-        "date_time": "2021-01-04 12:06:00"
-    },
-    {
-        "benchmark": "WorkArena-L1",
-        "agent_name": "Bgym-GPT-3.5",
-        "study_id": "study_id",
-        "score": 5.1,
-        "std_err": 0.3,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Reproduced",
-        "date_time": "2021-01-04 12:06:00"
     }
 ]
results/Bgym-GPT-4o-V/config.json
DELETED

@@ -1,4 +0,0 @@
-{
-    "agent_name": "GPT-4o-V",
-    "backend_llm": "GPT-4o-V"
-}
results/Bgym-GPT-4o-mini/README.md
ADDED

@@ -0,0 +1 @@
+## GPT-4o-mini model
results/{test-agent → Bgym-GPT-4o-mini}/miniwob.json
RENAMED

@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "
+        "agent_name": "Bgym-GPT-4o-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "MiniWoB",
-        "score":
-        "std_err":
+        "score": 58.8,
+        "std_err": 1.4,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
results/{test-agent → Bgym-GPT-4o-mini}/workarena-l1.json
RENAMED

@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "
+        "agent_name": "Bgym-GPT-4o-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L1",
-        "score":
-        "std_err":
+        "score": 27,
+        "std_err": 2.4,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
results/{test-agent → Bgym-GPT-4o-mini}/workarena-l2.json
RENAMED

@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "
+        "agent_name": "Bgym-GPT-4o-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L2",
-        "score":
-        "std_err": 0.
+        "score": 1.3,
+        "std_err": 0.7,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-4o/config.json
DELETED

@@ -1,4 +0,0 @@
-{
-    "agent_name": "GPT-4o",
-    "backend_llm": "GPT-4o"
-}
results/Bgym-GPT-4o/miniwob.json
CHANGED

@@ -4,8 +4,8 @@
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "MiniWoB",
-        "score":
-        "std_err":
+        "score": 65.6,
+        "std_err": 1.9,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-4o/workarena-l1.json
CHANGED

@@ -4,8 +4,8 @@
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L1",
-        "score":
-        "std_err":
+        "score": 45.5,
+        "std_err": 2.7,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-4o/workarena-l2.json
CHANGED

@@ -4,8 +4,8 @@
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L2",
-        "score":
-        "std_err":
+        "score": 8.5,
+        "std_err": 1.8,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-o1-mini/README.md
ADDED

@@ -0,0 +1 @@
+## GPT-o1-mini model
results/{test-agent/webarena.json → Bgym-GPT-o1-mini/workarena-l1.json}
RENAMED

@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "
+        "agent_name": "Bgym-GPT-o1-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
-        "benchmark": "
-        "score":
-        "std_err":
+        "benchmark": "WorkArena-L1",
+        "score": 56.7,
+        "std_err": 2.7,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-o1-mini/workarena-l2.json
ADDED

@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-GPT-o1-mini",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L2",
+        "score": 14.9,
+        "std_err": 2.3,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
results/Bgym-Llama-3-70b/config.json
DELETED

@@ -1,4 +0,0 @@
-{
-    "agent_name": "Llama-3-70B",
-    "backend_llm": "Llama-3-70B"
-}
results/Bgym-Llama-3-70b/workarena-l1.json
CHANGED

@@ -12,47 +12,5 @@
         "comments": "NA",
         "original_or_reproduced": "Original",
         "date_time": "2021-01-01 12:00:00"
-    },
-    {
-        "agent_name": "Bgym-Llama-3-70b",
-        "study_id": "study_id",
-        "benchmark": "WorkArena-L1",
-        "score": 15.9,
-        "std_err": 0.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Reproduced",
-        "date_time": "2021-01-04 12:06:00"
-    },
-    {
-        "agent_name": "Bgym-Llama-3-70b",
-        "study_id": "study_id",
-        "benchmark": "WorkArena-L1",
-        "score": 19.9,
-        "std_err": 0.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Reproduced",
-        "date_time": "2021-01-05 2:07:00"
-    },
-    {
-        "agent_name": "Bgym-Llama-3-70b",
-        "study_id": "study_id",
-        "benchmark": "WorkArena-L1",
-        "score": 17.9,
-        "std_err": 0.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Reproduced",
-        "date_time": "2021-01-12 12:00:00"
     }
 ]
results/Bgym-Llama-3.1-70b/README.md
ADDED

@@ -0,0 +1 @@
+### Llama-3.1-70B
results/Bgym-Llama-3.1-70b/workarena-l1.json
ADDED

@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-Llama-3.1-70b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 27.9,
+        "std_err": 2.5,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    }
+]
results/Bgym-Llama-3.1-70b/workarena-l2.json
ADDED

@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-Llama-3.1-70b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L2",
+        "score": 2.1,
+        "std_err": 0.9,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
results/Bgym-Mixtral-8x22b/config.json
DELETED

@@ -1,4 +0,0 @@
-{
-    "agent_name": "Mixtral-8x22B",
-    "backend_llm": "Mixtral-8x22B"
-}
results/Bgym-Mixtral-8x22b/workarena-l1.json
CHANGED

@@ -12,33 +12,5 @@
         "comments": "NA",
         "original_or_reproduced": "Original",
         "date_time": "2021-01-04 12:06:00"
-    },
-    {
-        "agent_name": "Bgym-Mixtral-8x22b",
-        "study_id": "study_id",
-        "benchmark": "WorkArena-L1",
-        "score": 11.4,
-        "std_err": 0.7,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Reproduced",
-        "date_time": "2021-01-04 12:06:00"
-    },
-    {
-        "agent_name": "Bgym-Mixtral-8x22b",
-        "study_id": "study_id",
-        "benchmark": "WorkArena-L1",
-        "score": 13.4,
-        "std_err": 0.7,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Reproduced",
-        "date_time": "2021-01-04 12:06:00"
     }
 ]
results/test-agent/README.md
DELETED

@@ -1 +0,0 @@
-### Test agent