Spaces:
Running
Running
{ | |
"time": "2025-01-24 15:10:27", | |
"results": { | |
"IO": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "gpt-3.5-turbo", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 37.83, | |
"Cost($)": 0.3328 | |
}, | |
"AQuA": { | |
"Score": 38.98, | |
"Cost($)": 0.038 | |
} | |
}, | |
"ReAct-Pro*": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "gpt-3.5-turbo", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 74.91, | |
"Cost($)": 3.4633 | |
}, | |
"AQuA": { | |
"Score": 64.57, | |
"Cost($)": 0.4928 | |
} | |
}, | |
"PoT": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "gpt-3.5-turbo", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 76.88, | |
"Cost($)": 0.6902 | |
}, | |
"AQuA": { | |
"Score": 59.45, | |
"Cost($)": 0.1748 | |
} | |
}, | |
"CoT": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "gpt-3.5-turbo", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 78.7, | |
"Cost($)": 0.6788 | |
}, | |
"AQuA": { | |
"Score": 61.02, | |
"Cost($)": 0.0957 | |
} | |
}, | |
"SC-CoT": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "gpt-3.5-turbo", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 79.91, | |
"Cost($)": 3.3938 | |
}, | |
"AQuA": { | |
"Score": 66.14, | |
"Cost($)": 0.7888 | |
} | |
}, | |
"IO-Doubao-lite-32k": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Doubao-lite-32k", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 72.02, | |
"Cost($)": 0.0354 | |
}, | |
"AQuA": { | |
"Score": 79.13, | |
"Cost($)": 0.0058 | |
} | |
}, | |
"ReAct-Pro*-Doubao-lite-32k": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Doubao-lite-32k", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 85.6, | |
"Cost($)": 0.2512 | |
}, | |
"AQuA": { | |
"Score": 77.56, | |
"Cost($)": 0.0445 | |
} | |
}, | |
"PoT-Doubao-lite-32k": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Doubao-lite-32k", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 79.61, | |
"Cost($)": 0.0576 | |
}, | |
"AQuA": { | |
"Score": 71.65, | |
"Cost($)": 0.0147 | |
} | |
}, | |
"CoT-Doubao-lite-32k": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Doubao-lite-32k", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 89.31, | |
"Cost($)": 0.0558 | |
}, | |
"AQuA": { | |
"Score": 82.68, | |
"Cost($)": 0.0066 | |
} | |
}, | |
"SC-CoT-Doubao-lite-32k": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Doubao-lite-32k", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 87.26, | |
"Cost($)": 0.2083 | |
}, | |
"AQuA": { | |
"Score": 81.1, | |
"Cost($)": 0.0519 | |
} | |
}, | |
"IO-gpt-4o": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "gpt-4o", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 88.4, | |
"Cost($)": 3.3463 | |
}, | |
"AQuA": { | |
"Score": 75.59, | |
"Cost($)": 1.1453 | |
} | |
}, | |
"ReAct-Pro*-gpt-4o": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "gpt-4o", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 63.31, | |
"Cost($)": 39.0751 | |
}, | |
"AQuA": { | |
"Score": 57.48, | |
"Cost($)": 2.304 | |
} | |
}, | |
"PoT-gpt-4o": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "gpt-4o", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 93.1, | |
"Cost($)": 4.2166 | |
}, | |
"AQuA": { | |
"Score": 75.2, | |
"Cost($)": 1.6087 | |
} | |
}, | |
"CoT-gpt-4o": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "gpt-4o", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 94.09, | |
"Cost($)": 4.5367 | |
}, | |
"AQuA": { | |
"Score": 82.68, | |
"Cost($)": 1.0417 | |
} | |
}, | |
"SC-CoT-gpt-4o": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "gpt-4o", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 90.3, | |
"Cost($)": 31.0542 | |
}, | |
"AQuA": { | |
"Score": 86.61, | |
"Cost($)": 8.1485 | |
} | |
}, | |
"IO-Qwen2.5-72B-Instruct": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Qwen2.5-72B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 86.58, | |
"Cost($)": 0.4899 | |
}, | |
"AQuA": { | |
"Score": 84.25, | |
"Cost($)": 0.0742 | |
} | |
}, | |
"ReAct-Pro*-Qwen2.5-72B-Instruct": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Qwen2.5-72B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 87.26, | |
"Cost($)": 10.5479 | |
}, | |
"AQuA": { | |
"Score": 73.23, | |
"Cost($)": 0.3177 | |
} | |
}, | |
"PoT-Qwen2.5-72B-Instruct": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Qwen2.5-72B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 92.34, | |
"Cost($)": 0.7054 | |
}, | |
"AQuA": { | |
"Score": 75.2, | |
"Cost($)": 0.1645 | |
} | |
}, | |
"CoT-Qwen2.5-72B-Instruct": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Qwen2.5-72B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 92.87, | |
"Cost($)": 0.7195 | |
}, | |
"AQuA": { | |
"Score": 86.22, | |
"Cost($)": 0.0808 | |
} | |
}, | |
"SC-CoT-Qwen2.5-72B-Instruct": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Qwen2.5-72B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 93.86, | |
"Cost($)": 5.9858 | |
}, | |
"AQuA": { | |
"Score": 85.04, | |
"Cost($)": 1.0348 | |
} | |
}, | |
"IO-Llama-3.3-70B-Instruct": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Llama-3.3-70B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 92.27, | |
"Cost($)": 0.4709 | |
}, | |
"AQuA": { | |
"Score": 82.68, | |
"Cost($)": 0.0798 | |
} | |
}, | |
"ReAct-Pro*-Llama-3.3-70B-Instruct": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Llama-3.3-70B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 87.64, | |
"Cost($)": 10.1124 | |
}, | |
"AQuA": { | |
"Score": 79.13, | |
"Cost($)": 0.768 | |
} | |
}, | |
"PoT-Llama-3.3-70B-Instruct": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Llama-3.3-70B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 73.09, | |
"Cost($)": 0.9736 | |
}, | |
"AQuA": { | |
"Score": 79.53, | |
"Cost($)": 0.1746 | |
} | |
}, | |
"CoT-Llama-3.3-70B-Instruct": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Llama-3.3-70B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 93.93, | |
"Cost($)": 0.687 | |
}, | |
"AQuA": { | |
"Score": 83.46, | |
"Cost($)": 0.0927 | |
} | |
}, | |
"SC-CoT-Llama-3.3-70B-Instruct": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Llama-3.3-70B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 95.07, | |
"Cost($)": 6.2005 | |
}, | |
"AQuA": { | |
"Score": 82.28, | |
"Cost($)": 1.0756 | |
} | |
}, | |
"IO-Qwen2.5-7B-Instruct": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Qwen2.5-7B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 57.24, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 78.74, | |
"Cost($)": 0.0 | |
} | |
}, | |
"ReAct-Pro*-Qwen2.5-7B-Instruct": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Qwen2.5-7B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 82.87, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 74.41, | |
"Cost($)": 0.0 | |
} | |
}, | |
"PoT-Qwen2.5-7B-Instruct": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Qwen2.5-7B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 58.83, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 68.11, | |
"Cost($)": 0.0 | |
} | |
}, | |
"CoT-Qwen2.5-7B-Instruct": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Qwen2.5-7B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 85.67, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 80.71, | |
"Cost($)": 0.0 | |
} | |
}, | |
"SC-CoT-Qwen2.5-7B-Instruct": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Qwen2.5-7B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 91.13, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 79.92, | |
"Cost($)": 0.0 | |
} | |
}, | |
"IO-Llama-3.1-8B-Instruct": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Llama-3.1-8B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 57.16, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 51.18, | |
"Cost($)": 0.0 | |
} | |
}, | |
"ReAct-Pro*-Llama-3.1-8B-Instruct": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Llama-3.1-8B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 67.78, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 55.51, | |
"Cost($)": 0.0 | |
} | |
}, | |
"PoT-Llama-3.1-8B-Instruct": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Llama-3.1-8B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 38.67, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 36.61, | |
"Cost($)": 0.0 | |
} | |
}, | |
"CoT-Llama-3.1-8B-Instruct": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Llama-3.1-8B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 75.44, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 60.63, | |
"Cost($)": 0.0 | |
} | |
}, | |
"SC-CoT-Llama-3.1-8B-Instruct": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Llama-3.1-8B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 73.46, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 59.45, | |
"Cost($)": 0.0 | |
} | |
}, | |
"IO-Internllm2_5-7B": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Internllm2_5-7B", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 11.6, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 47.64, | |
"Cost($)": 0.0 | |
} | |
}, | |
"ReAct-Pro*-Internllm2_5-7B": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Internllm2_5-7B", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 33.51, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 40.94, | |
"Cost($)": 0.0 | |
} | |
}, | |
"PoT-Internllm2_5-7B": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Internllm2_5-7B", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 38.21, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 36.61, | |
"Cost($)": 0.0 | |
} | |
}, | |
"CoT-Internllm2_5-7B": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Internllm2_5-7B", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 77.71, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 52.76, | |
"Cost($)": 0.0 | |
} | |
}, | |
"SC-CoT-Internllm2_5-7B": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Internllm2_5-7B", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 48.22, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 39.37, | |
"Cost($)": 0.0 | |
} | |
}, | |
"IO-Qwen2-1.5B-Instruct": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Qwen2-1.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 16.68, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 29.13, | |
"Cost($)": 0.0 | |
} | |
}, | |
"ReAct-Pro*-Qwen2-1.5B-Instruct": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Qwen2-1.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 24.87, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 25.59, | |
"Cost($)": 0.0 | |
} | |
}, | |
"PoT-Qwen2-1.5B-Instruct": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Qwen2-1.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 18.5, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 30.71, | |
"Cost($)": 0.0 | |
} | |
}, | |
"CoT-Qwen2-1.5B-Instruct": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Qwen2-1.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 55.5, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 40.55, | |
"Cost($)": 0.0 | |
} | |
}, | |
"SC-CoT-Qwen2-1.5B-Instruct": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Qwen2-1.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 11.75, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 23.62, | |
"Cost($)": 0.0 | |
} | |
}, | |
"IO-Qwen2-0.5B-Instruct": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Qwen2-0.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 14.71, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 27.17, | |
"Cost($)": 0.0 | |
} | |
}, | |
"ReAct-Pro*-Qwen2-0.5B-Instruct": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Qwen2-0.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 7.66, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 24.02, | |
"Cost($)": 0.0 | |
} | |
}, | |
"PoT-Qwen2-0.5B-Instruct": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Qwen2-0.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 9.62, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 17.32, | |
"Cost($)": 0.0 | |
} | |
}, | |
"CoT-Qwen2-0.5B-Instruct": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Qwen2-0.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 35.94, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 33.07, | |
"Cost($)": 0.0 | |
} | |
}, | |
"SC-CoT-Qwen2-0.5B-Instruct": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Qwen2-0.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 1.67, | |
"Cost($)": 0.0 | |
}, | |
"AQuA": { | |
"Score": 22.83, | |
"Cost($)": 0.0 | |
} | |
} | |
} | |
} |