Cosmetic changes, update results
- app.py +61 -23
- results/Bgym-Claude-3.5-Sonnet/README.md +1 -0
- results/Bgym-Claude-3.5-Sonnet/workarena-l1.json +16 -0
- results/Bgym-Claude-3.5-Sonnet/workarena-l2.json +16 -0
- results/{test-agent → Bgym-Claude-3.5-Sonnet}/workarena-l3.json +3 -3
- results/Bgym-GPT-3.5/workarena-l1.json +0 -28
- results/Bgym-GPT-4o-V/config.json +0 -4
- results/Bgym-GPT-4o-mini/README.md +1 -0
- results/{test-agent → Bgym-GPT-4o-mini}/miniwob.json +3 -3
- results/{test-agent → Bgym-GPT-4o-mini}/workarena-l1.json +3 -3
- results/{test-agent → Bgym-GPT-4o-mini}/workarena-l2.json +3 -3
- results/Bgym-GPT-4o/config.json +0 -4
- results/Bgym-GPT-4o/miniwob.json +2 -2
- results/Bgym-GPT-4o/workarena-l1.json +2 -2
- results/Bgym-GPT-4o/workarena-l2.json +2 -2
- results/Bgym-GPT-o1-mini/README.md +1 -0
- results/{test-agent/webarena.json → Bgym-GPT-o1-mini/workarena-l1.json} +4 -4
- results/Bgym-GPT-o1-mini/workarena-l2.json +16 -0
- results/Bgym-Llama-3-70b/config.json +0 -4
- results/Bgym-Llama-3-70b/workarena-l1.json +0 -42
- results/Bgym-Llama-3.1-70b/README.md +1 -0
- results/Bgym-Llama-3.1-70b/workarena-l1.json +16 -0
- results/Bgym-Llama-3.1-70b/workarena-l2.json +16 -0
- results/Bgym-Mixtral-8x22b/config.json +0 -4
- results/Bgym-Mixtral-8x22b/workarena-l1.json +0 -28
- results/test-agent/README.md +0 -1
app.py
CHANGED

@@ -9,6 +9,7 @@ import plotly.graph_objs as go
 from huggingface_hub import HfApi
 from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
 import streamlit.components.v1 as components
+from datetime import datetime
 
 from urllib.parse import quote
 from pathlib import Path
@@ -49,6 +50,26 @@ def sanitize_cell_value(value: Any) -> str:
     return html.escape(str(value))
 
 def create_html_table_main(df):
+    col1, col2 = st.columns([2,6])
+    with col1:
+        sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("WebArena"), key="main_sort_column")
+    with col2:
+        sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key="main_sort_order")
+
+    def get_sort_value(row):
+        if row == "-":
+            return float('-inf')
+        else:
+            try:
+                return float(row)
+            except ValueError:
+                return row
+
+    # Sort dataframe
+    if sort_order == "Ascending":
+        df = df.sort_values(by=sort_column, key=lambda x: x.apply(get_sort_value))
+    else:
+        df = df.sort_values(by=sort_column, ascending=False, key=lambda x: x.apply(get_sort_value))
     html = '''
     <style>
     table {
@@ -87,7 +108,28 @@ def create_html_table_main(df):
     html += '</div>'
     return html
 
-def create_html_table_benchmark(df):
+def create_html_table_benchmark(df, benchmark):
+    col1, col2 = st.columns([2,6])
+    with col1:
+        sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("Score"), key=f"benchmark_sort_column_{benchmark}")
+    with col2:
+        sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key=f"benchmark_sort_order_{benchmark}")
+
+    def get_sort_value(row):
+        if row == "-":
+            return float('-inf')
+        else:
+            try:
+                return float(row)
+            except ValueError:
+                return row
+
+    # Sort dataframe
+    if sort_order == "Ascending":
+        df = df.sort_values(by=sort_column, key=lambda x: x.apply(get_sort_value))
+    else:
+        df = df.sort_values(by=sort_column, ascending=False, key=lambda x: x.apply(get_sort_value))
+
     html = '''
     <style>
     table {
@@ -127,6 +169,8 @@ def create_html_table_benchmark(df):
             html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
         elif column == "Reproduced_all":
             continue
+        # elif column == "Score":
+        #     html += f'<td>{row[column]}</td>'
         else:
             html += f'<td>{sanitize_cell_value(row[column])}</td>'
     html += '</tr>'
@@ -183,7 +227,10 @@ def main():
            continue
        agent_results = []
        for benchmark in BENCHMARKS:
-
+            file_path = safe_path_join(agent, f"{benchmark.lower()}.json")
+            if not file_path.is_file():
+                continue
+            with open(file_path) as f:
                agent_results.extend(json.load(f))
        all_results[agent] = agent_results
 
@@ -217,11 +264,9 @@ def main():
    if dfs_to_concat:
        df = pd.concat(dfs_to_concat, ignore_index=True)
 
-
-
-
-        df = df.sort_values(by='WebArena', ascending=False)
-
+        for benchmark in BENCHMARKS:
+            df[benchmark] = df[benchmark].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
+            df[benchmark] = df[benchmark].astype(str)
    # Add a search bar
    search_query = st.text_input("Search agents", "", key="search_main")
 
@@ -240,14 +285,6 @@ def main():
            return ""
 
    df['Agent'] = df['Agent'].apply(make_hyperlink)
-    # st.dataframe(
-    #     df[['Agent'] + BENCHMARKS],
-    #     use_container_width=True,
-    #     column_config={benchmark: {'alignment': 'center'} for benchmark in BENCHMARKS},
-    #     hide_index=True,
-    #     # height=int(len(df) * 36.2),
-    # )
-    # st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)
    html_table = create_html_table_main(df)
    st.markdown(html_table, unsafe_allow_html=True)
 
@@ -395,18 +432,21 @@ MIT
        for value in values:
            if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Original":
                result_dict["Score"] = value["score"]
+                result_dict["std_err"] = value["std_err"]
                result_dict["Benchmark Specific"] = value["benchmark_specific"]
                result_dict["Benchmark Tuned"] = value["benchmark_tuned"]
                result_dict["Followed Evaluation Protocol"] = value["followed_evaluation_protocol"]
                result_dict["Reproducible"] = value["reproducible"]
                result_dict["Comments"] = value["comments"]
                result_dict["Study ID"] = value["study_id"]
+                value["date_time"] = datetime.strptime(value["date_time"], "%Y-%m-%d %H:%M:%S").strftime("%B %d, %Y %I:%M %p")
                result_dict["Date"] = value["date_time"]
                result_dict["Reproduced"] = []
                result_dict["Reproduced_all"] = []
                flag = 1
            if not flag:
                result_dict["Score"] = "-"
+                result_dict["std_err"] = "-"
                result_dict["Benchmark Specific"] = "-"
                result_dict["Benchmark Tuned"] = "-"
                result_dict["Followed Evaluation Protocol"] = "-"
@@ -418,6 +458,7 @@ MIT
                result_dict["Reproduced_all"] = []
            if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Reproduced":
                result_dict["Reproduced"].append(value["score"])
+                value["date_time"] = datetime.strptime(value["date_time"], "%Y-%m-%d %H:%M:%S").strftime("%B %d, %Y %I:%M %p")
                result_dict["Reproduced_all"].append(", ".join([str(value["score"]), str(value["date_time"])]))
            if result_dict["Reproduced"]:
                result_dict["Reproduced"] = str(min(result_dict["Reproduced"])) + " - " + str(max(result_dict["Reproduced"]))
@@ -435,14 +476,11 @@ MIT
        # Concatenate the DataFrames
        if dfs_to_concat:
            df_ = pd.concat(dfs_to_concat, ignore_index=True)
-
-
-
-
-
-            #     hide_index=True,
-            # )
-            html_table = create_html_table_benchmark(df_)
+            df_['Score'] = df_['Score'].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
+            df_['Score'] = df_['Score'].astype(str)
+            df_['Score'] = df_.apply(lambda row: f"{row['Score']} ± {row['std_err']}", axis=1)
+            df_ = df_.drop(columns=['std_err'])
+            html_table = create_html_table_benchmark(df_, benchmark)
            st.markdown(html_table, unsafe_allow_html=True)
 
 
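For reference, the snippet below is a minimal, self-contained sketch (not part of the commit) of the sorting and formatting behaviour the app.py diff introduces: the `get_sort_value` helper that pushes "-" placeholders to the bottom of a sort, the `score ± std_err` rendering, and the date reformatting. The column names and "-" placeholder follow the diff above; the sample DataFrame contents are invented purely for illustration.

```python
from datetime import datetime

import pandas as pd


def get_sort_value(cell):
    # "-" marks a missing score and should sort below every real number.
    if cell == "-":
        return float("-inf")
    try:
        return float(cell)
    except ValueError:
        return cell  # non-numeric cells fall back to plain string ordering


# Invented sample data, shaped like one benchmark column of the leaderboard.
df = pd.DataFrame({
    "Agent": ["A", "B", "C"],
    "WorkArena-L1": [45.5, "-", 27.9],
    "std_err": [2.7, "-", 2.5],
})

# Same pattern as the new create_html_table_* code: sort on a chosen column,
# mapping each cell through get_sort_value so "-" never wins.
df = df.sort_values(
    by="WorkArena-L1", ascending=False, key=lambda s: s.apply(get_sort_value)
)

# Score formatting as in the benchmark table: "45.50 ± 2.7"; "-" stays "-".
df["WorkArena-L1"] = df.apply(
    lambda row: f"{row['WorkArena-L1']:.2f} ± {row['std_err']}"
    if row["WorkArena-L1"] != "-" else "-",
    axis=1,
)

# Date rendering as in the diff: "2021-01-01 12:00:00" -> "January 01, 2021 12:00 PM".
print(datetime.strptime("2021-01-01 12:00:00", "%Y-%m-%d %H:%M:%S")
      .strftime("%B %d, %Y %I:%M %p"))
print(df[["Agent", "WorkArena-L1"]])
```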
results/Bgym-Claude-3.5-Sonnet/README.md
ADDED

@@ -0,0 +1 @@
+### Claude 3.5 Sonnet model
results/Bgym-Claude-3.5-Sonnet/workarena-l1.json
ADDED

@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-Claude-3.5-Sonnet",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 56.4,
+        "std_err": 2.7,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    }
+]
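As an aside, the JSON files added in this commit all share the record shape shown above. The sketch below is a hypothetical helper (not code from app.py) that loads every per-benchmark file for one agent and checks that the fields the leaderboard reads are present; the field list is taken from the files in this commit, while `load_results` and `REQUIRED_FIELDS` are illustrative names.

```python
import json
from pathlib import Path

# Field names as they appear in results/<agent>/<benchmark>.json in this commit.
REQUIRED_FIELDS = {
    "agent_name", "study_id", "benchmark", "score", "std_err",
    "benchmark_specific", "benchmark_tuned", "followed_evaluation_protocol",
    "reproducible", "comments", "original_or_reproduced", "date_time",
}


def load_results(agent_dir: Path) -> list:
    """Collect every benchmark entry for one agent, checking the schema."""
    entries = []
    for path in sorted(agent_dir.glob("*.json")):
        with open(path) as f:
            records = json.load(f)  # each file holds a JSON array of entries
        for record in records:
            missing = REQUIRED_FIELDS - record.keys()
            if missing:
                raise ValueError(f"{path}: missing fields {sorted(missing)}")
            entries.append(record)
    return entries


# Example (path exists in this repo after the commit):
# entries = load_results(Path("results/Bgym-Claude-3.5-Sonnet"))
```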
results/Bgym-Claude-3.5-Sonnet/workarena-l2.json
ADDED

@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-Claude-3.5-Sonnet",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L2",
+        "score": 39.1,
+        "std_err": 3.2,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
results/{test-agent → Bgym-Claude-3.5-Sonnet}/workarena-l3.json
RENAMED

@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "
+        "agent_name": "Bgym-Claude-3.5-Sonnet",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L3",
-        "score": 0.
-        "std_err": 0.
+        "score": 0.4,
+        "std_err": 0.4,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-3.5/workarena-l1.json
CHANGED

@@ -12,33 +12,5 @@
         "reproducible": "Yes",
         "comments": "NA",
         "original_or_reproduced": "Original"
-    },
-    {
-        "agent_name": "Bgym-GPT-3.5",
-        "study_id": "study_id",
-        "benchmark": "WorkArena-L1",
-        "score": 5.7,
-        "std_err": 0.3,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Reproduced",
-        "date_time": "2021-01-04 12:06:00"
-    },
-    {
-        "benchmark": "WorkArena-L1",
-        "agent_name": "Bgym-GPT-3.5",
-        "study_id": "study_id",
-        "score": 5.1,
-        "std_err": 0.3,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Reproduced",
-        "date_time": "2021-01-04 12:06:00"
     }
 ]
results/Bgym-GPT-4o-V/config.json
DELETED

@@ -1,4 +0,0 @@
-{
-    "agent_name": "GPT-4o-V",
-    "backend_llm": "GPT-4o-V"
-}
results/Bgym-GPT-4o-mini/README.md
ADDED

@@ -0,0 +1 @@
+## GPT-4o-mini model
results/{test-agent → Bgym-GPT-4o-mini}/miniwob.json
RENAMED

@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "
+        "agent_name": "Bgym-GPT-4o-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "MiniWoB",
-        "score":
-        "std_err":
+        "score": 58.8,
+        "std_err": 1.4,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
results/{test-agent → Bgym-GPT-4o-mini}/workarena-l1.json
RENAMED

@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "
+        "agent_name": "Bgym-GPT-4o-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L1",
-        "score":
-        "std_err":
+        "score": 27,
+        "std_err": 2.4,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
results/{test-agent → Bgym-GPT-4o-mini}/workarena-l2.json
RENAMED

@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "
+        "agent_name": "Bgym-GPT-4o-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L2",
-        "score":
-        "std_err": 0.
+        "score": 1.3,
+        "std_err": 0.7,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-4o/config.json
DELETED

@@ -1,4 +0,0 @@
-{
-    "agent_name": "GPT-4o",
-    "backend_llm": "GPT-4o"
-}
results/Bgym-GPT-4o/miniwob.json
CHANGED

@@ -4,8 +4,8 @@
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "MiniWoB",
-        "score":
-        "std_err":
+        "score": 65.6,
+        "std_err": 1.9,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-4o/workarena-l1.json
CHANGED

@@ -4,8 +4,8 @@
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L1",
-        "score":
-        "std_err":
+        "score": 45.5,
+        "std_err": 2.7,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-4o/workarena-l2.json
CHANGED

@@ -4,8 +4,8 @@
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L2",
-        "score":
-        "std_err":
+        "score": 8.5,
+        "std_err": 1.8,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-o1-mini/README.md
ADDED

@@ -0,0 +1 @@
+## GPT-o1-mini model
results/{test-agent/webarena.json → Bgym-GPT-o1-mini/workarena-l1.json}
RENAMED

@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "
+        "agent_name": "Bgym-GPT-o1-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
-        "benchmark": "
-        "score":
-        "std_err":
+        "benchmark": "WorkArena-L1",
+        "score": 56.7,
+        "std_err": 2.7,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-o1-mini/workarena-l2.json
ADDED

@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-GPT-o1-mini",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L2",
+        "score": 14.9,
+        "std_err": 2.3,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
results/Bgym-Llama-3-70b/config.json
DELETED

@@ -1,4 +0,0 @@
-{
-    "agent_name": "Llama-3-70B",
-    "backend_llm": "Llama-3-70B"
-}
results/Bgym-Llama-3-70b/workarena-l1.json
CHANGED

@@ -12,47 +12,5 @@
         "comments": "NA",
         "original_or_reproduced": "Original",
         "date_time": "2021-01-01 12:00:00"
-    },
-    {
-        "agent_name": "Bgym-Llama-3-70b",
-        "study_id": "study_id",
-        "benchmark": "WorkArena-L1",
-        "score": 15.9,
-        "std_err": 0.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Reproduced",
-        "date_time": "2021-01-04 12:06:00"
-    },
-    {
-        "agent_name": "Bgym-Llama-3-70b",
-        "study_id": "study_id",
-        "benchmark": "WorkArena-L1",
-        "score": 19.9,
-        "std_err": 0.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Reproduced",
-        "date_time": "2021-01-05 2:07:00"
-    },
-    {
-        "agent_name": "Bgym-Llama-3-70b",
-        "study_id": "study_id",
-        "benchmark": "WorkArena-L1",
-        "score": 17.9,
-        "std_err": 0.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Reproduced",
-        "date_time": "2021-01-12 12:00:00"
     }
 ]
results/Bgym-Llama-3.1-70b/README.md
ADDED

@@ -0,0 +1 @@
+### Llama-3.1-70B
results/Bgym-Llama-3.1-70b/workarena-l1.json
ADDED

@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-Llama-3.1-70b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 27.9,
+        "std_err": 2.5,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    }
+]
results/Bgym-Llama-3.1-70b/workarena-l2.json
ADDED

@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-Llama-3.1-70b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L2",
+        "score": 2.1,
+        "std_err": 0.9,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
results/Bgym-Mixtral-8x22b/config.json
DELETED

@@ -1,4 +0,0 @@
-{
-    "agent_name": "Mixtral-8x22B",
-    "backend_llm": "Mixtral-8x22B"
-}
results/Bgym-Mixtral-8x22b/workarena-l1.json
CHANGED

@@ -12,33 +12,5 @@
         "comments": "NA",
         "original_or_reproduced": "Original",
         "date_time": "2021-01-04 12:06:00"
-    },
-    {
-        "agent_name": "Bgym-Mixtral-8x22b",
-        "study_id": "study_id",
-        "benchmark": "WorkArena-L1",
-        "score": 11.4,
-        "std_err": 0.7,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Reproduced",
-        "date_time": "2021-01-04 12:06:00"
-    },
-    {
-        "agent_name": "Bgym-Mixtral-8x22b",
-        "study_id": "study_id",
-        "benchmark": "WorkArena-L1",
-        "score": 13.4,
-        "std_err": 0.7,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Reproduced",
-        "date_time": "2021-01-04 12:06:00"
     }
 ]
results/test-agent/README.md
DELETED

@@ -1 +0,0 @@
-### Test agent