{ "time": "2025-01-24 15:10:27", "results": { "IO": { "META": { "Algorithm": "IO", "LLM": "gpt-3.5-turbo", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 37.83, "Cost($)": 0.3328 }, "AQuA": { "Score": 38.98, "Cost($)": 0.038 } }, "ReAct-Pro*": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "gpt-3.5-turbo", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 74.91, "Cost($)": 3.4633 }, "AQuA": { "Score": 64.57, "Cost($)": 0.4928 } }, "PoT": { "META": { "Algorithm": "PoT", "LLM": "gpt-3.5-turbo", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 76.88, "Cost($)": 0.6902 }, "AQuA": { "Score": 59.45, "Cost($)": 0.1748 } }, "CoT": { "META": { "Algorithm": "CoT", "LLM": "gpt-3.5-turbo", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 78.7, "Cost($)": 0.6788 }, "AQuA": { "Score": 61.02, "Cost($)": 0.0957 } }, "SC-CoT": { "META": { "Algorithm": "SC-CoT", "LLM": "gpt-3.5-turbo", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 79.91, "Cost($)": 3.3938 }, "AQuA": { "Score": 66.14, "Cost($)": 0.7888 } }, "IO-Doubao-lite-32k": { "META": { "Algorithm": "IO", "LLM": "Doubao-lite-32k", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 72.02, "Cost($)": 0.0354 }, "AQuA": { "Score": 79.13, "Cost($)": 0.0058 } }, "ReAct-Pro*-Doubao-lite-32k": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Doubao-lite-32k", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 85.6, "Cost($)": 0.2512 }, "AQuA": { "Score": 77.56, "Cost($)": 0.0445 } }, "PoT-Doubao-lite-32k": { "META": { "Algorithm": "PoT", "LLM": "Doubao-lite-32k", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 79.61, "Cost($)": 0.0576 }, "AQuA": { "Score": 71.65, "Cost($)": 0.0147 } }, "CoT-Doubao-lite-32k": { "META": { "Algorithm": "CoT", "LLM": "Doubao-lite-32k", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 89.31, "Cost($)": 0.0558 }, "AQuA": { "Score": 82.68, "Cost($)": 0.0066 } }, "SC-CoT-Doubao-lite-32k": { "META": { "Algorithm": "SC-CoT", "LLM": "Doubao-lite-32k", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 87.26, "Cost($)": 0.2083 }, "AQuA": { "Score": 81.1, "Cost($)": 0.0519 } }, "IO-gpt-4o": { "META": { "Algorithm": "IO", "LLM": "gpt-4o", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 88.4, "Cost($)": 3.3463 }, "AQuA": { "Score": 75.59, "Cost($)": 1.1453 } }, "ReAct-Pro*-gpt-4o": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "gpt-4o", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 63.31, "Cost($)": 39.0751 }, "AQuA": { "Score": 57.48, "Cost($)": 2.304 } }, "PoT-gpt-4o": { "META": { "Algorithm": "PoT", "LLM": "gpt-4o", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 93.1, "Cost($)": 4.2166 }, "AQuA": { "Score": 75.2, "Cost($)": 1.6087 } }, "CoT-gpt-4o": { "META": { "Algorithm": "CoT", "LLM": "gpt-4o", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 94.09, "Cost($)": 4.5367 }, "AQuA": { "Score": 82.68, "Cost($)": 1.0417 } }, "SC-CoT-gpt-4o": { "META": { "Algorithm": "SC-CoT", "LLM": "gpt-4o", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 90.3, "Cost($)": 31.0542 }, "AQuA": { "Score": 86.61, "Cost($)": 8.1485 } }, "IO-Qwen2.5-72B-Instruct": { "META": { "Algorithm": "IO", "LLM": "Qwen2.5-72B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 86.58, "Cost($)": 0.4899 }, "AQuA": { "Score": 84.25, "Cost($)": 0.0742 } }, "ReAct-Pro*-Qwen2.5-72B-Instruct": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Qwen2.5-72B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 87.26, "Cost($)": 10.5479 }, "AQuA": { "Score": 73.23, "Cost($)": 0.3177 } }, "PoT-Qwen2.5-72B-Instruct": { "META": { "Algorithm": "PoT", "LLM": "Qwen2.5-72B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 92.34, "Cost($)": 0.7054 }, "AQuA": { "Score": 75.2, "Cost($)": 0.1645 } }, "CoT-Qwen2.5-72B-Instruct": { "META": { "Algorithm": "CoT", "LLM": "Qwen2.5-72B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 92.87, "Cost($)": 0.7195 }, "AQuA": { "Score": 86.22, "Cost($)": 0.0808 } }, "SC-CoT-Qwen2.5-72B-Instruct": { "META": { "Algorithm": "SC-CoT", "LLM": "Qwen2.5-72B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 93.86, "Cost($)": 5.9858 }, "AQuA": { "Score": 85.04, "Cost($)": 1.0348 } }, "IO-Llama-3.3-70B-Instruct": { "META": { "Algorithm": "IO", "LLM": "Llama-3.3-70B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 92.27, "Cost($)": 0.4709 }, "AQuA": { "Score": 82.68, "Cost($)": 0.0798 } }, "ReAct-Pro*-Llama-3.3-70B-Instruct": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Llama-3.3-70B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 87.64, "Cost($)": 10.1124 }, "AQuA": { "Score": 79.13, "Cost($)": 0.768 } }, "PoT-Llama-3.3-70B-Instruct": { "META": { "Algorithm": "PoT", "LLM": "Llama-3.3-70B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 73.09, "Cost($)": 0.9736 }, "AQuA": { "Score": 79.53, "Cost($)": 0.1746 } }, "CoT-Llama-3.3-70B-Instruct": { "META": { "Algorithm": "CoT", "LLM": "Llama-3.3-70B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 93.93, "Cost($)": 0.687 }, "AQuA": { "Score": 83.46, "Cost($)": 0.0927 } }, "SC-CoT-Llama-3.3-70B-Instruct": { "META": { "Algorithm": "SC-CoT", "LLM": "Llama-3.3-70B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 95.07, "Cost($)": 6.2005 }, "AQuA": { "Score": 82.28, "Cost($)": 1.0756 } }, "IO-Qwen2.5-7B-Instruct": { "META": { "Algorithm": "IO", "LLM": "Qwen2.5-7B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 57.24, "Cost($)": 0.0 }, "AQuA": { "Score": 78.74, "Cost($)": 0.0 } }, "ReAct-Pro*-Qwen2.5-7B-Instruct": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Qwen2.5-7B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 82.87, "Cost($)": 0.0 }, "AQuA": { "Score": 74.41, "Cost($)": 0.0 } }, "PoT-Qwen2.5-7B-Instruct": { "META": { "Algorithm": "PoT", "LLM": "Qwen2.5-7B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 58.83, "Cost($)": 0.0 }, "AQuA": { "Score": 68.11, "Cost($)": 0.0 } }, "CoT-Qwen2.5-7B-Instruct": { "META": { "Algorithm": "CoT", "LLM": "Qwen2.5-7B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 85.67, "Cost($)": 0.0 }, "AQuA": { "Score": 80.71, "Cost($)": 0.0 } }, "SC-CoT-Qwen2.5-7B-Instruct": { "META": { "Algorithm": "SC-CoT", "LLM": "Qwen2.5-7B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 91.13, "Cost($)": 0.0 }, "AQuA": { "Score": 79.92, "Cost($)": 0.0 } }, "IO-Llama-3.1-8B-Instruct": { "META": { "Algorithm": "IO", "LLM": "Llama-3.1-8B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 57.16, "Cost($)": 0.0 }, "AQuA": { "Score": 51.18, "Cost($)": 0.0 } }, "ReAct-Pro*-Llama-3.1-8B-Instruct": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Llama-3.1-8B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 67.78, "Cost($)": 0.0 }, "AQuA": { "Score": 55.51, "Cost($)": 0.0 } }, "PoT-Llama-3.1-8B-Instruct": { "META": { "Algorithm": "PoT", "LLM": "Llama-3.1-8B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 38.67, "Cost($)": 0.0 }, "AQuA": { "Score": 36.61, "Cost($)": 0.0 } }, "CoT-Llama-3.1-8B-Instruct": { "META": { "Algorithm": "CoT", "LLM": "Llama-3.1-8B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 75.44, "Cost($)": 0.0 }, "AQuA": { "Score": 60.63, "Cost($)": 0.0 } }, "SC-CoT-Llama-3.1-8B-Instruct": { "META": { "Algorithm": "SC-CoT", "LLM": "Llama-3.1-8B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 73.46, "Cost($)": 0.0 }, "AQuA": { "Score": 59.45, "Cost($)": 0.0 } }, "IO-Internllm2_5-7B": { "META": { "Algorithm": "IO", "LLM": "Internllm2_5-7B", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 11.6, "Cost($)": 0.0 }, "AQuA": { "Score": 47.64, "Cost($)": 0.0 } }, "ReAct-Pro*-Internllm2_5-7B": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Internllm2_5-7B", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 33.51, "Cost($)": 0.0 }, "AQuA": { "Score": 40.94, "Cost($)": 0.0 } }, "PoT-Internllm2_5-7B": { "META": { "Algorithm": "PoT", "LLM": "Internllm2_5-7B", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 38.21, "Cost($)": 0.0 }, "AQuA": { "Score": 36.61, "Cost($)": 0.0 } }, "CoT-Internllm2_5-7B": { "META": { "Algorithm": "CoT", "LLM": "Internllm2_5-7B", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 77.71, "Cost($)": 0.0 }, "AQuA": { "Score": 52.76, "Cost($)": 0.0 } }, "SC-CoT-Internllm2_5-7B": { "META": { "Algorithm": "SC-CoT", "LLM": "Internllm2_5-7B", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 48.22, "Cost($)": 0.0 }, "AQuA": { "Score": 39.37, "Cost($)": 0.0 } }, "IO-Qwen2-1.5B-Instruct": { "META": { "Algorithm": "IO", "LLM": "Qwen2-1.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 16.68, "Cost($)": 0.0 }, "AQuA": { "Score": 29.13, "Cost($)": 0.0 } }, "ReAct-Pro*-Qwen2-1.5B-Instruct": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Qwen2-1.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 24.87, "Cost($)": 0.0 }, "AQuA": { "Score": 25.59, "Cost($)": 0.0 } }, "PoT-Qwen2-1.5B-Instruct": { "META": { "Algorithm": "PoT", "LLM": "Qwen2-1.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 18.5, "Cost($)": 0.0 }, "AQuA": { "Score": 30.71, "Cost($)": 0.0 } }, "CoT-Qwen2-1.5B-Instruct": { "META": { "Algorithm": "CoT", "LLM": "Qwen2-1.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 55.5, "Cost($)": 0.0 }, "AQuA": { "Score": 40.55, "Cost($)": 0.0 } }, "SC-CoT-Qwen2-1.5B-Instruct": { "META": { "Algorithm": "SC-CoT", "LLM": "Qwen2-1.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 11.75, "Cost($)": 0.0 }, "AQuA": { "Score": 23.62, "Cost($)": 0.0 } }, "IO-Qwen2-0.5B-Instruct": { "META": { "Algorithm": "IO", "LLM": "Qwen2-0.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 14.71, "Cost($)": 0.0 }, "AQuA": { "Score": 27.17, "Cost($)": 0.0 } }, "ReAct-Pro*-Qwen2-0.5B-Instruct": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Qwen2-0.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 7.66, "Cost($)": 0.0 }, "AQuA": { "Score": 24.02, "Cost($)": 0.0 } }, "PoT-Qwen2-0.5B-Instruct": { "META": { "Algorithm": "PoT", "LLM": "Qwen2-0.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 9.62, "Cost($)": 0.0 }, "AQuA": { "Score": 17.32, "Cost($)": 0.0 } }, "CoT-Qwen2-0.5B-Instruct": { "META": { "Algorithm": "CoT", "LLM": "Qwen2-0.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 35.94, "Cost($)": 0.0 }, "AQuA": { "Score": 33.07, "Cost($)": 0.0 } }, "SC-CoT-Qwen2-0.5B-Instruct": { "META": { "Algorithm": "SC-CoT", "LLM": "Qwen2-0.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 1.67, "Cost($)": 0.0 }, "AQuA": { "Score": 22.83, "Cost($)": 0.0 } } } }