open-agent-leaderboard / src /overall_math_score.json
liaojiajia
update sc-cot scores
be9cdf5
{
"time": "2025-01-24 15:10:27",
"results": {
"IO": {
"META": {
"Algorithm": "IO",
"LLM": "gpt-3.5-turbo",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 37.83,
"Cost($)": 0.3328
},
"AQuA": {
"Score": 38.98,
"Cost($)": 0.038
}
},
"ReAct-Pro*": {
"META": {
"Algorithm": "ReAct-Pro*",
"LLM": "gpt-3.5-turbo",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 74.91,
"Cost($)": 3.4633
},
"AQuA": {
"Score": 64.57,
"Cost($)": 0.4928
}
},
"PoT": {
"META": {
"Algorithm": "PoT",
"LLM": "gpt-3.5-turbo",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 76.88,
"Cost($)": 0.6902
},
"AQuA": {
"Score": 59.45,
"Cost($)": 0.1748
}
},
"CoT": {
"META": {
"Algorithm": "CoT",
"LLM": "gpt-3.5-turbo",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 78.7,
"Cost($)": 0.6788
},
"AQuA": {
"Score": 61.02,
"Cost($)": 0.0957
}
},
"SC-CoT": {
"META": {
"Algorithm": "SC-CoT",
"LLM": "gpt-3.5-turbo",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 79.91,
"Cost($)": 3.3938
},
"AQuA": {
"Score": 66.14,
"Cost($)": 0.7888
}
},
"IO-Doubao-lite-32k": {
"META": {
"Algorithm": "IO",
"LLM": "Doubao-lite-32k",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 72.02,
"Cost($)": 0.0354
},
"AQuA": {
"Score": 79.13,
"Cost($)": 0.0058
}
},
"ReAct-Pro*-Doubao-lite-32k": {
"META": {
"Algorithm": "ReAct-Pro*",
"LLM": "Doubao-lite-32k",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 85.6,
"Cost($)": 0.2512
},
"AQuA": {
"Score": 77.56,
"Cost($)": 0.0445
}
},
"PoT-Doubao-lite-32k": {
"META": {
"Algorithm": "PoT",
"LLM": "Doubao-lite-32k",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 79.61,
"Cost($)": 0.0576
},
"AQuA": {
"Score": 71.65,
"Cost($)": 0.0147
}
},
"CoT-Doubao-lite-32k": {
"META": {
"Algorithm": "CoT",
"LLM": "Doubao-lite-32k",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 89.31,
"Cost($)": 0.0558
},
"AQuA": {
"Score": 82.68,
"Cost($)": 0.0066
}
},
"SC-CoT-Doubao-lite-32k": {
"META": {
"Algorithm": "SC-CoT",
"LLM": "Doubao-lite-32k",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 87.26,
"Cost($)": 0.2083
},
"AQuA": {
"Score": 81.1,
"Cost($)": 0.0519
}
},
"IO-gpt-4o": {
"META": {
"Algorithm": "IO",
"LLM": "gpt-4o",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 88.4,
"Cost($)": 3.3463
},
"AQuA": {
"Score": 75.59,
"Cost($)": 1.1453
}
},
"ReAct-Pro*-gpt-4o": {
"META": {
"Algorithm": "ReAct-Pro*",
"LLM": "gpt-4o",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 63.31,
"Cost($)": 39.0751
},
"AQuA": {
"Score": 57.48,
"Cost($)": 2.304
}
},
"PoT-gpt-4o": {
"META": {
"Algorithm": "PoT",
"LLM": "gpt-4o",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 93.1,
"Cost($)": 4.2166
},
"AQuA": {
"Score": 75.2,
"Cost($)": 1.6087
}
},
"CoT-gpt-4o": {
"META": {
"Algorithm": "CoT",
"LLM": "gpt-4o",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 94.09,
"Cost($)": 4.5367
},
"AQuA": {
"Score": 82.68,
"Cost($)": 1.0417
}
},
"SC-CoT-gpt-4o": {
"META": {
"Algorithm": "SC-CoT",
"LLM": "gpt-4o",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 90.3,
"Cost($)": 31.0542
},
"AQuA": {
"Score": 86.61,
"Cost($)": 8.1485
}
},
"IO-Qwen2.5-72B-Instruct": {
"META": {
"Algorithm": "IO",
"LLM": "Qwen2.5-72B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 86.58,
"Cost($)": 0.4899
},
"AQuA": {
"Score": 84.25,
"Cost($)": 0.0742
}
},
"ReAct-Pro*-Qwen2.5-72B-Instruct": {
"META": {
"Algorithm": "ReAct-Pro*",
"LLM": "Qwen2.5-72B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 87.26,
"Cost($)": 10.5479
},
"AQuA": {
"Score": 73.23,
"Cost($)": 0.3177
}
},
"PoT-Qwen2.5-72B-Instruct": {
"META": {
"Algorithm": "PoT",
"LLM": "Qwen2.5-72B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 92.34,
"Cost($)": 0.7054
},
"AQuA": {
"Score": 75.2,
"Cost($)": 0.1645
}
},
"CoT-Qwen2.5-72B-Instruct": {
"META": {
"Algorithm": "CoT",
"LLM": "Qwen2.5-72B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 92.87,
"Cost($)": 0.7195
},
"AQuA": {
"Score": 86.22,
"Cost($)": 0.0808
}
},
"SC-CoT-Qwen2.5-72B-Instruct": {
"META": {
"Algorithm": "SC-CoT",
"LLM": "Qwen2.5-72B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 93.86,
"Cost($)": 5.9858
},
"AQuA": {
"Score": 85.04,
"Cost($)": 1.0348
}
},
"IO-Llama-3.3-70B-Instruct": {
"META": {
"Algorithm": "IO",
"LLM": "Llama-3.3-70B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 92.27,
"Cost($)": 0.4709
},
"AQuA": {
"Score": 82.68,
"Cost($)": 0.0798
}
},
"ReAct-Pro*-Llama-3.3-70B-Instruct": {
"META": {
"Algorithm": "ReAct-Pro*",
"LLM": "Llama-3.3-70B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 87.64,
"Cost($)": 10.1124
},
"AQuA": {
"Score": 79.13,
"Cost($)": 0.768
}
},
"PoT-Llama-3.3-70B-Instruct": {
"META": {
"Algorithm": "PoT",
"LLM": "Llama-3.3-70B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 73.09,
"Cost($)": 0.9736
},
"AQuA": {
"Score": 79.53,
"Cost($)": 0.1746
}
},
"CoT-Llama-3.3-70B-Instruct": {
"META": {
"Algorithm": "CoT",
"LLM": "Llama-3.3-70B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 93.93,
"Cost($)": 0.687
},
"AQuA": {
"Score": 83.46,
"Cost($)": 0.0927
}
},
"SC-CoT-Llama-3.3-70B-Instruct": {
"META": {
"Algorithm": "SC-CoT",
"LLM": "Llama-3.3-70B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 95.07,
"Cost($)": 6.2005
},
"AQuA": {
"Score": 82.28,
"Cost($)": 1.0756
}
},
"IO-Qwen2.5-7B-Instruct": {
"META": {
"Algorithm": "IO",
"LLM": "Qwen2.5-7B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 57.24,
"Cost($)": 0.0
},
"AQuA": {
"Score": 78.74,
"Cost($)": 0.0
}
},
"ReAct-Pro*-Qwen2.5-7B-Instruct": {
"META": {
"Algorithm": "ReAct-Pro*",
"LLM": "Qwen2.5-7B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 82.87,
"Cost($)": 0.0
},
"AQuA": {
"Score": 74.41,
"Cost($)": 0.0
}
},
"PoT-Qwen2.5-7B-Instruct": {
"META": {
"Algorithm": "PoT",
"LLM": "Qwen2.5-7B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 58.83,
"Cost($)": 0.0
},
"AQuA": {
"Score": 68.11,
"Cost($)": 0.0
}
},
"CoT-Qwen2.5-7B-Instruct": {
"META": {
"Algorithm": "CoT",
"LLM": "Qwen2.5-7B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 85.67,
"Cost($)": 0.0
},
"AQuA": {
"Score": 80.71,
"Cost($)": 0.0
}
},
"SC-CoT-Qwen2.5-7B-Instruct": {
"META": {
"Algorithm": "SC-CoT",
"LLM": "Qwen2.5-7B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 91.13,
"Cost($)": 0.0
},
"AQuA": {
"Score": 79.92,
"Cost($)": 0.0
}
},
"IO-Llama-3.1-8B-Instruct": {
"META": {
"Algorithm": "IO",
"LLM": "Llama-3.1-8B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 57.16,
"Cost($)": 0.0
},
"AQuA": {
"Score": 51.18,
"Cost($)": 0.0
}
},
"ReAct-Pro*-Llama-3.1-8B-Instruct": {
"META": {
"Algorithm": "ReAct-Pro*",
"LLM": "Llama-3.1-8B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 67.78,
"Cost($)": 0.0
},
"AQuA": {
"Score": 55.51,
"Cost($)": 0.0
}
},
"PoT-Llama-3.1-8B-Instruct": {
"META": {
"Algorithm": "PoT",
"LLM": "Llama-3.1-8B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 38.67,
"Cost($)": 0.0
},
"AQuA": {
"Score": 36.61,
"Cost($)": 0.0
}
},
"CoT-Llama-3.1-8B-Instruct": {
"META": {
"Algorithm": "CoT",
"LLM": "Llama-3.1-8B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 75.44,
"Cost($)": 0.0
},
"AQuA": {
"Score": 60.63,
"Cost($)": 0.0
}
},
"SC-CoT-Llama-3.1-8B-Instruct": {
"META": {
"Algorithm": "SC-CoT",
"LLM": "Llama-3.1-8B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 73.46,
"Cost($)": 0.0
},
"AQuA": {
"Score": 59.45,
"Cost($)": 0.0
}
},
"IO-Internllm2_5-7B": {
"META": {
"Algorithm": "IO",
"LLM": "Internllm2_5-7B",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 11.6,
"Cost($)": 0.0
},
"AQuA": {
"Score": 47.64,
"Cost($)": 0.0
}
},
"ReAct-Pro*-Internllm2_5-7B": {
"META": {
"Algorithm": "ReAct-Pro*",
"LLM": "Internllm2_5-7B",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 33.51,
"Cost($)": 0.0
},
"AQuA": {
"Score": 40.94,
"Cost($)": 0.0
}
},
"PoT-Internllm2_5-7B": {
"META": {
"Algorithm": "PoT",
"LLM": "Internllm2_5-7B",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 38.21,
"Cost($)": 0.0
},
"AQuA": {
"Score": 36.61,
"Cost($)": 0.0
}
},
"CoT-Internllm2_5-7B": {
"META": {
"Algorithm": "CoT",
"LLM": "Internllm2_5-7B",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 77.71,
"Cost($)": 0.0
},
"AQuA": {
"Score": 52.76,
"Cost($)": 0.0
}
},
"SC-CoT-Internllm2_5-7B": {
"META": {
"Algorithm": "SC-CoT",
"LLM": "Internllm2_5-7B",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 48.22,
"Cost($)": 0.0
},
"AQuA": {
"Score": 39.37,
"Cost($)": 0.0
}
},
"IO-Qwen2-1.5B-Instruct": {
"META": {
"Algorithm": "IO",
"LLM": "Qwen2-1.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 16.68,
"Cost($)": 0.0
},
"AQuA": {
"Score": 29.13,
"Cost($)": 0.0
}
},
"ReAct-Pro*-Qwen2-1.5B-Instruct": {
"META": {
"Algorithm": "ReAct-Pro*",
"LLM": "Qwen2-1.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 24.87,
"Cost($)": 0.0
},
"AQuA": {
"Score": 25.59,
"Cost($)": 0.0
}
},
"PoT-Qwen2-1.5B-Instruct": {
"META": {
"Algorithm": "PoT",
"LLM": "Qwen2-1.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 18.5,
"Cost($)": 0.0
},
"AQuA": {
"Score": 30.71,
"Cost($)": 0.0
}
},
"CoT-Qwen2-1.5B-Instruct": {
"META": {
"Algorithm": "CoT",
"LLM": "Qwen2-1.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 55.5,
"Cost($)": 0.0
},
"AQuA": {
"Score": 40.55,
"Cost($)": 0.0
}
},
"SC-CoT-Qwen2-1.5B-Instruct": {
"META": {
"Algorithm": "SC-CoT",
"LLM": "Qwen2-1.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 11.75,
"Cost($)": 0.0
},
"AQuA": {
"Score": 23.62,
"Cost($)": 0.0
}
},
"IO-Qwen2-0.5B-Instruct": {
"META": {
"Algorithm": "IO",
"LLM": "Qwen2-0.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 14.71,
"Cost($)": 0.0
},
"AQuA": {
"Score": 27.17,
"Cost($)": 0.0
}
},
"ReAct-Pro*-Qwen2-0.5B-Instruct": {
"META": {
"Algorithm": "ReAct-Pro*",
"LLM": "Qwen2-0.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 7.66,
"Cost($)": 0.0
},
"AQuA": {
"Score": 24.02,
"Cost($)": 0.0
}
},
"PoT-Qwen2-0.5B-Instruct": {
"META": {
"Algorithm": "PoT",
"LLM": "Qwen2-0.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 9.62,
"Cost($)": 0.0
},
"AQuA": {
"Score": 17.32,
"Cost($)": 0.0
}
},
"CoT-Qwen2-0.5B-Instruct": {
"META": {
"Algorithm": "CoT",
"LLM": "Qwen2-0.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 35.94,
"Cost($)": 0.0
},
"AQuA": {
"Score": 33.07,
"Cost($)": 0.0
}
},
"SC-CoT-Qwen2-0.5B-Instruct": {
"META": {
"Algorithm": "SC-CoT",
"LLM": "Qwen2-0.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 1.67,
"Cost($)": 0.0
},
"AQuA": {
"Score": 22.83,
"Cost($)": 0.0
}
}
}
}