jardinet-souffleton committed on
Commit
d2de681
·
1 Parent(s): 6d59540

Add README and workarena-l1.json for GenericAgent-o1-mini and GenericAgent-o3-mini

Browse files
results/GenericAgent-o1-mini/README.md ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### GenericAgent-o1-mini
2
+
3
+ This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab).
4
+
5
+ It uses o1-mini as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/agent_configs.py):
6
+ ```python
7
+ BASE_FLAGS = FLAGS_GPT_4o = GenericPromptFlags(
8
+ obs=dp.ObsFlags(
9
+ use_html=False,
10
+ use_ax_tree=True,
11
+ use_focused_element=True,
12
+ use_error_logs=True,
13
+ use_history=True,
14
+ use_past_error_logs=False,
15
+ use_action_history=True,
16
+ use_think_history=False,
17
+ use_diff=False,
18
+ html_type="pruned_html",
19
+ use_screenshot=False,
20
+ use_som=False,
21
+ extract_visible_tag=True,
22
+ extract_clickable_tag=True,
23
+ extract_coords="False",
24
+ filter_visible_elements_only=False,
25
+ ),
26
+ action=dp.ActionFlags(
27
+ action_set=bgym.HighLevelActionSetArgs(
28
+ subsets=["bid"],
29
+ multiaction=False,
30
+ ),
31
+ long_description=False,
32
+ individual_examples=False,
33
+ ),
34
+ use_plan=False,
35
+ use_criticise=False,
36
+ use_thinking=True,
37
+ use_memory=False,
38
+ use_concrete_example=True,
39
+ use_abstract_example=True,
40
+ use_hints=True,
41
+ enable_chat=False,
42
+ max_prompt_tokens=40_000,
43
+ be_cautious=True,
44
+ extra_instructions=None,
45
+ )
46
+ ```
results/GenericAgent-o1-mini/workarena-l1.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "GenericAgent-o1-mini",
4
+ "study_id": "f3e1fcb8-5fc5-4115-9e00-27251508e2c7",
5
+ "date_time": "2025-02-07 14:00:00",
6
+ "benchmark": "WorkArena-L1",
7
+ "score": 51.8,
8
+ "std_err": 2.80,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "Additional details",
14
+ "original_or_reproduced": "Reproduced"
15
+ }
16
+ ]
results/GenericAgent-o3-mini/README.md ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### GenericAgent-o3-mini
2
+
3
+ This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab).
4
+
5
+ It uses o3-mini as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/agent_configs.py):
6
+ ```python
7
+ BASE_FLAGS = FLAGS_GPT_4o = GenericPromptFlags(
8
+ obs=dp.ObsFlags(
9
+ use_html=False,
10
+ use_ax_tree=True,
11
+ use_focused_element=True,
12
+ use_error_logs=True,
13
+ use_history=True,
14
+ use_past_error_logs=False,
15
+ use_action_history=True,
16
+ use_think_history=False,
17
+ use_diff=False,
18
+ html_type="pruned_html",
19
+ use_screenshot=False,
20
+ use_som=False,
21
+ extract_visible_tag=True,
22
+ extract_clickable_tag=True,
23
+ extract_coords="False",
24
+ filter_visible_elements_only=False,
25
+ ),
26
+ action=dp.ActionFlags(
27
+ action_set=bgym.HighLevelActionSetArgs(
28
+ subsets=["bid"],
29
+ multiaction=False,
30
+ ),
31
+ long_description=False,
32
+ individual_examples=False,
33
+ ),
34
+ use_plan=False,
35
+ use_criticise=False,
36
+ use_thinking=True,
37
+ use_memory=False,
38
+ use_concrete_example=True,
39
+ use_abstract_example=True,
40
+ use_hints=True,
41
+ enable_chat=False,
42
+ max_prompt_tokens=40_000,
43
+ be_cautious=True,
44
+ extra_instructions=None,
45
+ )
46
+ ```
results/GenericAgent-o3-mini/workarena-l1.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "GenericAgent-o3-mini",
4
+ "study_id": "f3e1fcb8-5fc5-4115-9e00-27251508e2c7",
5
+ "date_time": "2025-02-07 14:00:00",
6
+ "benchmark": "WorkArena-L1",
7
+ "score": 48.2,
8
+ "std_err": 2.80,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "Additional details",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]