Commit
·
d2de681
1
Parent(s):
6d59540
Add README and workarena-l1.json for GenericAgent-o1-mini and GenericAgent-o3-mini
Browse files
results/GenericAgent-o1-mini/README.md
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### GenericAgent-o1-mini
|
2 |
+
|
3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab).
|
4 |
+
|
5 |
+
It uses o1-mini as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/agent_configs.py):
|
6 |
+
```python
|
7 |
+
BASE_FLAGS = FLAGS_GPT_4o = GenericPromptFlags(
|
8 |
+
obs=dp.ObsFlags(
|
9 |
+
use_html=False,
|
10 |
+
use_ax_tree=True,
|
11 |
+
use_focused_element=True,
|
12 |
+
use_error_logs=True,
|
13 |
+
use_history=True,
|
14 |
+
use_past_error_logs=False,
|
15 |
+
use_action_history=True,
|
16 |
+
use_think_history=False,
|
17 |
+
use_diff=False,
|
18 |
+
html_type="pruned_html",
|
19 |
+
use_screenshot=False,
|
20 |
+
use_som=False,
|
21 |
+
extract_visible_tag=True,
|
22 |
+
extract_clickable_tag=True,
|
23 |
+
extract_coords="False",
|
24 |
+
filter_visible_elements_only=False,
|
25 |
+
),
|
26 |
+
action=dp.ActionFlags(
|
27 |
+
action_set=bgym.HighLevelActionSetArgs(
|
28 |
+
subsets=["bid"],
|
29 |
+
multiaction=False,
|
30 |
+
),
|
31 |
+
long_description=False,
|
32 |
+
individual_examples=False,
|
33 |
+
),
|
34 |
+
use_plan=False,
|
35 |
+
use_criticise=False,
|
36 |
+
use_thinking=True,
|
37 |
+
use_memory=False,
|
38 |
+
use_concrete_example=True,
|
39 |
+
use_abstract_example=True,
|
40 |
+
use_hints=True,
|
41 |
+
enable_chat=False,
|
42 |
+
max_prompt_tokens=40_000,
|
43 |
+
be_cautious=True,
|
44 |
+
extra_instructions=None,
|
45 |
+
)
|
46 |
+
```
|
results/GenericAgent-o1-mini/workarena-l1.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-o1-mini",
|
4 |
+
"study_id": "f3e1fcb8-5fc5-4115-9e00-27251508e2c7",
|
5 |
+
"date_time": "2025-02-07 14:00:00",
|
6 |
+
"benchmark": "WorkArena-L1",
|
7 |
+
"score": 51.8,
|
8 |
+
"std_err": 2.80,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "Additional details",
|
14 |
+
"original_or_reproduced": "Reproduced"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-o3-mini/README.md
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### GenericAgent-o3-mini
|
2 |
+
|
3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab).
|
4 |
+
|
5 |
+
It uses o3-mini as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/agent_configs.py):
|
6 |
+
```python
|
7 |
+
BASE_FLAGS = FLAGS_GPT_4o = GenericPromptFlags(
|
8 |
+
obs=dp.ObsFlags(
|
9 |
+
use_html=False,
|
10 |
+
use_ax_tree=True,
|
11 |
+
use_focused_element=True,
|
12 |
+
use_error_logs=True,
|
13 |
+
use_history=True,
|
14 |
+
use_past_error_logs=False,
|
15 |
+
use_action_history=True,
|
16 |
+
use_think_history=False,
|
17 |
+
use_diff=False,
|
18 |
+
html_type="pruned_html",
|
19 |
+
use_screenshot=False,
|
20 |
+
use_som=False,
|
21 |
+
extract_visible_tag=True,
|
22 |
+
extract_clickable_tag=True,
|
23 |
+
extract_coords="False",
|
24 |
+
filter_visible_elements_only=False,
|
25 |
+
),
|
26 |
+
action=dp.ActionFlags(
|
27 |
+
action_set=bgym.HighLevelActionSetArgs(
|
28 |
+
subsets=["bid"],
|
29 |
+
multiaction=False,
|
30 |
+
),
|
31 |
+
long_description=False,
|
32 |
+
individual_examples=False,
|
33 |
+
),
|
34 |
+
use_plan=False,
|
35 |
+
use_criticise=False,
|
36 |
+
use_thinking=True,
|
37 |
+
use_memory=False,
|
38 |
+
use_concrete_example=True,
|
39 |
+
use_abstract_example=True,
|
40 |
+
use_hints=True,
|
41 |
+
enable_chat=False,
|
42 |
+
max_prompt_tokens=40_000,
|
43 |
+
be_cautious=True,
|
44 |
+
extra_instructions=None,
|
45 |
+
)
|
46 |
+
```
|
results/GenericAgent-o3-mini/workarena-l1.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-o3-mini",
|
4 |
+
"study_id": "f3e1fcb8-5fc5-4115-9e00-27251508e2c7",
|
5 |
+
"date_time": "2025-02-07 14:00:00",
|
6 |
+
"benchmark": "WorkArena-L1",
|
7 |
+
"score": 48.2,
|
8 |
+
"std_err": 2.80,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "Additional details",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|