benbogin's picture
leaderboard
507ce38
[
{
"Agent": "SWE-Agent",
"Base model": "gpt-4o-2024-08-06",
"Expert (Accuracy)": "16.3",
"Expert (Landmarks)": "36.8",
"Masked (Accuracy)": "46.1",
"Masked (Landmarks)": "74.9"
},
{
"Agent": "React",
"Base model": "gpt-4o-2024-08-06",
"Expert (Accuracy)": "12.2",
"Expert (Landmarks)": "33.6",
"Masked (Accuracy)": "37.0",
"Masked (Landmarks)": "65.7"
},
{
"Agent": "React-Super",
"Base model": "gpt-4o-2024-08-06",
"Expert (Accuracy)": "14.4",
"Expert (Landmarks)": "42.6",
"Masked (Accuracy)": "41.6",
"Masked (Landmarks)": "72.5"
},
{
"Agent": "SWE-Agent",
"Base model": "gpt-4o-mini-2024-07-18",
"Expert (Accuracy)": "3.3",
"Expert (Landmarks)": "16.1",
"Masked (Accuracy)": "27.0",
"Masked (Landmarks)": "51.8"
},
{
"Agent": "React-Super",
"Base model": "gpt-4o-mini-2024-07-18",
"Expert (Accuracy)": "5.6",
"Expert (Landmarks)": "20.6",
"Masked (Accuracy)": "31.5",
"Masked (Landmarks)": "58.3"
},
{
"Agent": "SWE-Agent",
"Base model": "Llama 3.1 70B",
"Expert (Accuracy)": "5.6",
"Expert (Landmarks)": "4.8",
"Masked (Accuracy)": "17.4",
"Masked (Landmarks)": "35.0"
},
{
"Agent": "React-Super",
"Base model": "Llama 3.1 70B",
"Expert (Accuracy)": "6.1",
"Expert (Landmarks)": "9.6",
"Masked (Accuracy)": "22.8",
"Masked (Landmarks)": "38.3"
},
{
"Agent": "SWE-Agent",
"Base model": "Mixtral-8x22B-Instruct-v0.1",
"Expert (Accuracy)": "1.1",
"Expert (Landmarks)": "0.0",
"Masked (Accuracy)": "9.5",
"Masked (Landmarks)": "26.6"
},
{
"Agent": "React-Super",
"Base model": "Mixtral-8x22B-Instruct-v0.1",
"Expert (Accuracy)": "3.3",
"Expert (Landmarks)": "3.7",
"Masked (Accuracy)": "7.0",
"Masked (Landmarks)": "13.2"
}
]