meghsn committed on
Commit
f4d95d8
·
1 Parent(s): e80279f

Cosmetic changes, update results

Browse files
app.py CHANGED
@@ -9,6 +9,7 @@ import plotly.graph_objs as go
9
  from huggingface_hub import HfApi
10
  from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
11
  import streamlit.components.v1 as components
 
12
 
13
  from urllib.parse import quote
14
  from pathlib import Path
@@ -49,6 +50,26 @@ def sanitize_cell_value(value: Any) -> str:
49
  return html.escape(str(value))
50
 
51
  def create_html_table_main(df):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  html = '''
53
  <style>
54
  table {
@@ -87,7 +108,28 @@ def create_html_table_main(df):
87
  html += '</div>'
88
  return html
89
 
90
- def create_html_table_benchmark(df):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  html = '''
92
  <style>
93
  table {
@@ -127,6 +169,8 @@ def create_html_table_benchmark(df):
127
  html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
128
  elif column == "Reproduced_all":
129
  continue
 
 
130
  else:
131
  html += f'<td>{sanitize_cell_value(row[column])}</td>'
132
  html += '</tr>'
@@ -183,7 +227,10 @@ def main():
183
  continue
184
  agent_results = []
185
  for benchmark in BENCHMARKS:
186
- with open(f"results/{agent}/{benchmark.lower()}.json") as f:
 
 
 
187
  agent_results.extend(json.load(f))
188
  all_results[agent] = agent_results
189
 
@@ -217,11 +264,9 @@ def main():
217
  if dfs_to_concat:
218
  df = pd.concat(dfs_to_concat, ignore_index=True)
219
 
220
- # df['Average'] = sum(df[column] for column in BENCHMARKS)/len(BENCHMARKS)
221
- # df['Average'] = df['Average'].round(2)
222
- # Sort values
223
- df = df.sort_values(by='WebArena', ascending=False)
224
-
225
  # Add a search bar
226
  search_query = st.text_input("Search agents", "", key="search_main")
227
 
@@ -240,14 +285,6 @@ def main():
240
  return ""
241
 
242
  df['Agent'] = df['Agent'].apply(make_hyperlink)
243
- # st.dataframe(
244
- # df[['Agent'] + BENCHMARKS],
245
- # use_container_width=True,
246
- # column_config={benchmark: {'alignment': 'center'} for benchmark in BENCHMARKS},
247
- # hide_index=True,
248
- # # height=int(len(df) * 36.2),
249
- # )
250
- # st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)
251
  html_table = create_html_table_main(df)
252
  st.markdown(html_table, unsafe_allow_html=True)
253
 
@@ -395,18 +432,21 @@ MIT
395
  for value in values:
396
  if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Original":
397
  result_dict["Score"] = value["score"]
 
398
  result_dict["Benchmark Specific"] = value["benchmark_specific"]
399
  result_dict["Benchmark Tuned"] = value["benchmark_tuned"]
400
  result_dict["Followed Evaluation Protocol"] = value["followed_evaluation_protocol"]
401
  result_dict["Reproducible"] = value["reproducible"]
402
  result_dict["Comments"] = value["comments"]
403
  result_dict["Study ID"] = value["study_id"]
 
404
  result_dict["Date"] = value["date_time"]
405
  result_dict["Reproduced"] = []
406
  result_dict["Reproduced_all"] = []
407
  flag = 1
408
  if not flag:
409
  result_dict["Score"] = "-"
 
410
  result_dict["Benchmark Specific"] = "-"
411
  result_dict["Benchmark Tuned"] = "-"
412
  result_dict["Followed Evaluation Protocol"] = "-"
@@ -418,6 +458,7 @@ MIT
418
  result_dict["Reproduced_all"] = []
419
  if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Reproduced":
420
  result_dict["Reproduced"].append(value["score"])
 
421
  result_dict["Reproduced_all"].append(", ".join([str(value["score"]), str(value["date_time"])]))
422
  if result_dict["Reproduced"]:
423
  result_dict["Reproduced"] = str(min(result_dict["Reproduced"])) + " - " + str(max(result_dict["Reproduced"]))
@@ -435,14 +476,11 @@ MIT
435
  # Concatenate the DataFrames
436
  if dfs_to_concat:
437
  df_ = pd.concat(dfs_to_concat, ignore_index=True)
438
- # st.markdown(f"<h2 id='{benchmark.lower()}'>{benchmark}</h2>", unsafe_allow_html=True)
439
- # st.dataframe(
440
- # df_,
441
- # use_container_width=True,
442
- # column_config={benchmark: {'alignment': 'center'}},
443
- # hide_index=True,
444
- # )
445
- html_table = create_html_table_benchmark(df_)
446
  st.markdown(html_table, unsafe_allow_html=True)
447
 
448
 
 
9
  from huggingface_hub import HfApi
10
  from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
11
  import streamlit.components.v1 as components
12
+ from datetime import datetime
13
 
14
  from urllib.parse import quote
15
  from pathlib import Path
 
50
  return html.escape(str(value))
51
 
52
  def create_html_table_main(df):
53
+ col1, col2 = st.columns([2,6])
54
+ with col1:
55
+ sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("WebArena"), key="main_sort_column")
56
+ with col2:
57
+ sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key="main_sort_order")
58
+
59
+ def get_sort_value(row):
60
+ if row == "-":
61
+ return float('-inf')
62
+ else:
63
+ try:
64
+ return float(row)
65
+ except ValueError:
66
+ return row
67
+
68
+ # Sort dataframe
69
+ if sort_order == "Ascending":
70
+ df = df.sort_values(by=sort_column, key=lambda x: x.apply(get_sort_value))
71
+ else:
72
+ df = df.sort_values(by=sort_column, ascending=False, key=lambda x: x.apply(get_sort_value))
73
  html = '''
74
  <style>
75
  table {
 
108
  html += '</div>'
109
  return html
110
 
111
+ def create_html_table_benchmark(df, benchmark):
112
+ col1, col2 = st.columns([2,6])
113
+ with col1:
114
+ sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("Score"), key=f"benchmark_sort_column_{benchmark}")
115
+ with col2:
116
+ sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key=f"benchmark_sort_order_{benchmark}")
117
+
118
+ def get_sort_value(row):
119
+ if row == "-":
120
+ return float('-inf')
121
+ else:
122
+ try:
123
+ return float(row)
124
+ except ValueError:
125
+ return row
126
+
127
+ # Sort dataframe
128
+ if sort_order == "Ascending":
129
+ df = df.sort_values(by=sort_column, key=lambda x: x.apply(get_sort_value))
130
+ else:
131
+ df = df.sort_values(by=sort_column, ascending=False, key=lambda x: x.apply(get_sort_value))
132
+
133
  html = '''
134
  <style>
135
  table {
 
169
  html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
170
  elif column == "Reproduced_all":
171
  continue
172
+ # elif column == "Score":
173
+ # html += f'<td>{row[column]}</td>'
174
  else:
175
  html += f'<td>{sanitize_cell_value(row[column])}</td>'
176
  html += '</tr>'
 
227
  continue
228
  agent_results = []
229
  for benchmark in BENCHMARKS:
230
+ file_path = safe_path_join(agent, f"{benchmark.lower()}.json")
231
+ if not file_path.is_file():
232
+ continue
233
+ with open(file_path) as f:
234
  agent_results.extend(json.load(f))
235
  all_results[agent] = agent_results
236
 
 
264
  if dfs_to_concat:
265
  df = pd.concat(dfs_to_concat, ignore_index=True)
266
 
267
+ for benchmark in BENCHMARKS:
268
+ df[benchmark] = df[benchmark].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
269
+ df[benchmark] = df[benchmark].astype(str)
 
 
270
  # Add a search bar
271
  search_query = st.text_input("Search agents", "", key="search_main")
272
 
 
285
  return ""
286
 
287
  df['Agent'] = df['Agent'].apply(make_hyperlink)
 
 
 
 
 
 
 
 
288
  html_table = create_html_table_main(df)
289
  st.markdown(html_table, unsafe_allow_html=True)
290
 
 
432
  for value in values:
433
  if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Original":
434
  result_dict["Score"] = value["score"]
435
+ result_dict["std_err"] = value["std_err"]
436
  result_dict["Benchmark Specific"] = value["benchmark_specific"]
437
  result_dict["Benchmark Tuned"] = value["benchmark_tuned"]
438
  result_dict["Followed Evaluation Protocol"] = value["followed_evaluation_protocol"]
439
  result_dict["Reproducible"] = value["reproducible"]
440
  result_dict["Comments"] = value["comments"]
441
  result_dict["Study ID"] = value["study_id"]
442
+ value["date_time"] = datetime.strptime(value["date_time"], "%Y-%m-%d %H:%M:%S").strftime("%B %d, %Y %I:%M %p")
443
  result_dict["Date"] = value["date_time"]
444
  result_dict["Reproduced"] = []
445
  result_dict["Reproduced_all"] = []
446
  flag = 1
447
  if not flag:
448
  result_dict["Score"] = "-"
449
+ result_dict["std_err"] = "-"
450
  result_dict["Benchmark Specific"] = "-"
451
  result_dict["Benchmark Tuned"] = "-"
452
  result_dict["Followed Evaluation Protocol"] = "-"
 
458
  result_dict["Reproduced_all"] = []
459
  if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Reproduced":
460
  result_dict["Reproduced"].append(value["score"])
461
+ value["date_time"] = datetime.strptime(value["date_time"], "%Y-%m-%d %H:%M:%S").strftime("%B %d, %Y %I:%M %p")
462
  result_dict["Reproduced_all"].append(", ".join([str(value["score"]), str(value["date_time"])]))
463
  if result_dict["Reproduced"]:
464
  result_dict["Reproduced"] = str(min(result_dict["Reproduced"])) + " - " + str(max(result_dict["Reproduced"]))
 
476
  # Concatenate the DataFrames
477
  if dfs_to_concat:
478
  df_ = pd.concat(dfs_to_concat, ignore_index=True)
479
+ df_['Score'] = df_['Score'].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
480
+ df_['Score'] = df_['Score'].astype(str)
481
+ df_['Score'] = df_.apply(lambda row: f"{row['Score']} Β± {row['std_err']}", axis=1)
482
+ df_ = df_.drop(columns=['std_err'])
483
+ html_table = create_html_table_benchmark(df_, benchmark)
 
 
 
484
  st.markdown(html_table, unsafe_allow_html=True)
485
 
486
 
results/Bgym-Claude-3.5-Sonnet/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ ### Claude 3.5 Sonnet model
results/Bgym-Claude-3.5-Sonnet/workarena-l1.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Claude-3.5-Sonnet",
4
+ "study_id": "study_id",
5
+ "benchmark": "WorkArena-L1",
6
+ "score": 56.4,
7
+ "std_err": 2.7,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-01 12:00:00"
15
+ }
16
+ ]
results/Bgym-Claude-3.5-Sonnet/workarena-l2.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Claude-3.5-Sonnet",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena-L2",
7
+ "score": 39.1,
8
+ "std_err": 3.2,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/{test-agent → Bgym-Claude-3.5-Sonnet}/workarena-l3.json RENAMED
@@ -1,11 +1,11 @@
1
  [
2
  {
3
- "agent_name": "test-agent",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L3",
7
- "score": 0.0,
8
- "std_err": 0.0,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
 
1
  [
2
  {
3
+ "agent_name": "Bgym-Claude-3.5-Sonnet",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L3",
7
+ "score": 0.4,
8
+ "std_err": 0.4,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-3.5/workarena-l1.json CHANGED
@@ -12,33 +12,5 @@
12
  "reproducible": "Yes",
13
  "comments": "NA",
14
  "original_or_reproduced": "Original"
15
- },
16
- {
17
- "agent_name": "Bgym-GPT-3.5",
18
- "study_id": "study_id",
19
- "benchmark": "WorkArena-L1",
20
- "score": 5.7,
21
- "std_err": 0.3,
22
- "benchmark_specific": "No",
23
- "benchmark_tuned": "No",
24
- "followed_evaluation_protocol": "Yes",
25
- "reproducible": "Yes",
26
- "comments": "NA",
27
- "original_or_reproduced": "Reproduced",
28
- "date_time": "2021-01-04 12:06:00"
29
- },
30
- {
31
- "benchmark": "WorkArena-L1",
32
- "agent_name": "Bgym-GPT-3.5",
33
- "study_id": "study_id",
34
- "score": 5.1,
35
- "std_err": 0.3,
36
- "benchmark_specific": "No",
37
- "benchmark_tuned": "No",
38
- "followed_evaluation_protocol": "Yes",
39
- "reproducible": "Yes",
40
- "comments": "NA",
41
- "original_or_reproduced": "Reproduced",
42
- "date_time": "2021-01-04 12:06:00"
43
  }
44
  ]
 
12
  "reproducible": "Yes",
13
  "comments": "NA",
14
  "original_or_reproduced": "Original"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  }
16
  ]
results/Bgym-GPT-4o-V/config.json DELETED
@@ -1,4 +0,0 @@
1
- {
2
- "agent_name": "GPT-4o-V",
3
- "backend_llm": "GPT-4o-V"
4
- }
 
 
 
 
 
results/Bgym-GPT-4o-mini/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ ## GPT-4o-mini model
results/{test-agent → Bgym-GPT-4o-mini}/miniwob.json RENAMED
@@ -1,11 +1,11 @@
1
  [
2
  {
3
- "agent_name": "test-agent",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "MiniWoB",
7
- "score": 43.4,
8
- "std_err": 0.1,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
 
1
  [
2
  {
3
+ "agent_name": "Bgym-GPT-4o-mini",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "MiniWoB",
7
+ "score": 58.8,
8
+ "std_err": 1.4,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
results/{test-agent → Bgym-GPT-4o-mini}/workarena-l1.json RENAMED
@@ -1,11 +1,11 @@
1
  [
2
  {
3
- "agent_name": "test-agent",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L1",
7
- "score": 6.1,
8
- "std_err": 0.3,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
 
1
  [
2
  {
3
+ "agent_name": "Bgym-GPT-4o-mini",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L1",
7
+ "score": 27,
8
+ "std_err": 2.4,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
results/{test-agent → Bgym-GPT-4o-mini}/workarena-l2.json RENAMED
@@ -1,11 +1,11 @@
1
  [
2
  {
3
- "agent_name": "test-agent",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
7
- "score": 0.0,
8
- "std_err": 0.0,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
 
1
  [
2
  {
3
+ "agent_name": "Bgym-GPT-4o-mini",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
7
+ "score": 1.3,
8
+ "std_err": 0.7,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-4o/config.json DELETED
@@ -1,4 +0,0 @@
1
- {
2
- "agent_name": "GPT-4o",
3
- "backend_llm": "GPT-4o"
4
- }
 
 
 
 
 
results/Bgym-GPT-4o/miniwob.json CHANGED
@@ -4,8 +4,8 @@
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "MiniWoB",
7
- "score": 71.3,
8
- "std_err": 0.5,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
 
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "MiniWoB",
7
+ "score": 65.6,
8
+ "std_err": 1.9,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-4o/workarena-l1.json CHANGED
@@ -4,8 +4,8 @@
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L1",
7
- "score": 42.7,
8
- "std_err": 0.4,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
 
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L1",
7
+ "score": 45.5,
8
+ "std_err": 2.7,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-4o/workarena-l2.json CHANGED
@@ -4,8 +4,8 @@
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
7
- "score": 3.0,
8
- "std_err": 0.6,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
 
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
7
+ "score": 8.5,
8
+ "std_err": 1.8,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-o1-mini/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ ## GPT-o1-mini model
results/{test-agent/webarena.json → Bgym-GPT-o1-mini/workarena-l1.json} RENAMED
@@ -1,11 +1,11 @@
1
  [
2
  {
3
- "agent_name": "test-agent",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
- "benchmark": "WebArena",
7
- "score": 6.7,
8
- "std_err": 0.2,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
 
1
  [
2
  {
3
+ "agent_name": "Bgym-GPT-o1-mini",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena-L1",
7
+ "score": 56.7,
8
+ "std_err": 2.7,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-o1-mini/workarena-l2.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-o1-mini",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena-L2",
7
+ "score": 14.9,
8
+ "std_err": 2.3,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-Llama-3-70b/config.json DELETED
@@ -1,4 +0,0 @@
1
- {
2
- "agent_name": "Llama-3-70B",
3
- "backend_llm": "Llama-3-70B"
4
- }
 
 
 
 
 
results/Bgym-Llama-3-70b/workarena-l1.json CHANGED
@@ -12,47 +12,5 @@
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
  "date_time": "2021-01-01 12:00:00"
15
- },
16
- {
17
- "agent_name": "Bgym-Llama-3-70b",
18
- "study_id": "study_id",
19
- "benchmark": "WorkArena-L1",
20
- "score": 15.9,
21
- "std_err": 0.6,
22
- "benchmark_specific": "No",
23
- "benchmark_tuned": "No",
24
- "followed_evaluation_protocol": "Yes",
25
- "reproducible": "Yes",
26
- "comments": "NA",
27
- "original_or_reproduced": "Reproduced",
28
- "date_time": "2021-01-04 12:06:00"
29
- },
30
- {
31
- "agent_name": "Bgym-Llama-3-70b",
32
- "study_id": "study_id",
33
- "benchmark": "WorkArena-L1",
34
- "score": 19.9,
35
- "std_err": 0.6,
36
- "benchmark_specific": "No",
37
- "benchmark_tuned": "No",
38
- "followed_evaluation_protocol": "Yes",
39
- "reproducible": "Yes",
40
- "comments": "NA",
41
- "original_or_reproduced": "Reproduced",
42
- "date_time": "2021-01-05 2:07:00"
43
- },
44
- {
45
- "agent_name": "Bgym-Llama-3-70b",
46
- "study_id": "study_id",
47
- "benchmark": "WorkArena-L1",
48
- "score": 17.9,
49
- "std_err": 0.6,
50
- "benchmark_specific": "No",
51
- "benchmark_tuned": "No",
52
- "followed_evaluation_protocol": "Yes",
53
- "reproducible": "Yes",
54
- "comments": "NA",
55
- "original_or_reproduced": "Reproduced",
56
- "date_time": "2021-01-12 12:00:00"
57
  }
58
  ]
 
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
  "date_time": "2021-01-01 12:00:00"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  }
16
  ]
results/Bgym-Llama-3.1-70b/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ ### Llama-3.1-70B
results/Bgym-Llama-3.1-70b/workarena-l1.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3.1-70b",
4
+ "study_id": "study_id",
5
+ "benchmark": "WorkArena-L1",
6
+ "score": 27.9,
7
+ "std_err": 2.5,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-01 12:00:00"
15
+ }
16
+ ]
results/Bgym-Llama-3.1-70b/workarena-l2.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3.1-70b",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena-L2",
7
+ "score": 2.1,
8
+ "std_err": 0.9,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-Mixtral-8x22b/config.json DELETED
@@ -1,4 +0,0 @@
1
- {
2
- "agent_name": "Mixtral-8x22B",
3
- "backend_llm": "Mixtral-8x22B"
4
- }
 
 
 
 
 
results/Bgym-Mixtral-8x22b/workarena-l1.json CHANGED
@@ -12,33 +12,5 @@
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
  "date_time": "2021-01-04 12:06:00"
15
- },
16
- {
17
- "agent_name": "Bgym-Mixtral-8x22b",
18
- "study_id": "study_id",
19
- "benchmark": "WorkArena-L1",
20
- "score": 11.4,
21
- "std_err": 0.7,
22
- "benchmark_specific": "No",
23
- "benchmark_tuned": "No",
24
- "followed_evaluation_protocol": "Yes",
25
- "reproducible": "Yes",
26
- "comments": "NA",
27
- "original_or_reproduced": "Reproduced",
28
- "date_time": "2021-01-04 12:06:00"
29
- },
30
- {
31
- "agent_name": "Bgym-Mixtral-8x22b",
32
- "study_id": "study_id",
33
- "benchmark": "WorkArena-L1",
34
- "score": 13.4,
35
- "std_err": 0.7,
36
- "benchmark_specific": "No",
37
- "benchmark_tuned": "No",
38
- "followed_evaluation_protocol": "Yes",
39
- "reproducible": "Yes",
40
- "comments": "NA",
41
- "original_or_reproduced": "Reproduced",
42
- "date_time": "2021-01-04 12:06:00"
43
  }
44
  ]
 
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
  "date_time": "2021-01-04 12:06:00"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  }
16
  ]
results/test-agent/README.md DELETED
@@ -1 +0,0 @@
1
- ### Test agent