lewtun HF staff committed on
Commit
43975e6
·
1 Parent(s): 33a8591
app.py CHANGED
@@ -31,7 +31,7 @@ def get_leaderboard_df(merge_values: bool = True):
31
  path_parts = Path(filepath).parts
32
  date = filepath.stem.split("_")[-1][:-3].split("T")[0]
33
  model_revision = "_".join(path_parts[1:4]) + "_" + date
34
- task = path_parts[4].capitalize()
35
  df.loc[model_revision, "Date"] = date
36
 
37
  with open(filepath, "r") as file:
@@ -82,15 +82,25 @@ def get_leaderboard_df(merge_values: bool = True):
82
  for k, v in data["results"].items():
83
  if k != "all" and "_average" not in k:
84
  version = k.split("|")[1].split(":")[-1]
85
- value = v["qem"]
86
  df.loc[model_revision, f"{task}_{version}"] = value
87
  # For kaggle_pot we report N metrics, one for each prompt, and store each one as a separate column in the dataframe
88
  elif task.lower() in ["aimo_kaggle_hard_pot"]:
89
  for k, v in data["results"].items():
90
  if k != "all" and "_average" not in k:
91
  version = k.split("|")[1].split(":")[-1]
92
- value = v["qem"]
93
  df.loc[model_revision, f"{task}_{version}"] = value
 
 
 
 
 
 
 
 
 
 
94
  # For AlpacaEval we report the base win rate and the length-controlled one
95
  elif task.lower() == "alpaca_eval":
96
  value = data["results"][first_result_key]["win_rate"]
@@ -98,27 +108,24 @@ def get_leaderboard_df(merge_values: bool = True):
98
  value = data["results"][first_result_key]["length_controlled_winrate"]
99
  df.loc[model_revision, "Alpaca_eval_lc"] = value / 100.0
100
  else:
101
- df.loc[model_revision, task] = value
102
-
103
- # Put IFEval / BBH / AGIEval / AlpacaEval in first columns
104
- alpaca_col = df.pop("Alpaca_eval")
105
- df.insert(1, "Alpaca_eval", alpaca_col)
106
- alpaca_col = df.pop("Alpaca_eval_lc")
107
- df.insert(2, "Alpaca_eval_lc", alpaca_col)
108
- ifeval_col = df.pop("Ifeval")
109
- df.insert(3, "Ifeval", ifeval_col)
110
- bbh_col = df.pop("Bbh")
111
- df.insert(4, "Bbh", bbh_col)
112
- agieval_col = df.pop("Agieval")
113
- df.insert(5, "Agieval", agieval_col)
114
- gsm8k_col = df.pop("Gsm8k")
115
- df.insert(6, "Gsm8k", gsm8k_col)
116
- mmlu_col = df.pop("Mmlu")
117
- df.insert(7, "Mmlu", mmlu_col)
118
 
119
  # Drop rows where every entry is NaN
120
  df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c != "Date"])
 
 
 
 
 
 
 
 
 
 
 
 
121
  df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
 
122
  # Convert all values to percentage
123
  df[df.select_dtypes(include=["number"]).columns] *= 100.0
124
  df = df.sort_values(by=["Average"], ascending=False)
@@ -132,12 +139,6 @@ def get_leaderboard_df(merge_values: bool = True):
132
  df = df[["Model", "Date"]].merge(merged_df, on="Model", how="left")
133
  df.drop_duplicates(subset=["Model"], inplace=True)
134
  df = df.sort_values(by=["Average"], ascending=False).round(2)
135
-
136
- # Trim minimath column names
137
- df.columns = [c.replace("_level_", "_l") for c in df.columns]
138
-
139
- # Trim AIMO column names
140
- df.columns = [c.replace("Aimo_", "") for c in df.columns]
141
  return df
142
 
143
 
@@ -153,6 +154,8 @@ def update_table(search_query):
153
  search_terms = [term.strip().lower() for term in search_terms]
154
  pattern = "|".join(search_terms)
155
  df = df[df["Model"].str.lower().str.contains(pattern, regex=True)]
 
 
156
  return df
157
 
158
 
 
31
  path_parts = Path(filepath).parts
32
  date = filepath.stem.split("_")[-1][:-3].split("T")[0]
33
  model_revision = "_".join(path_parts[1:4]) + "_" + date
34
+ task = path_parts[4] # .capitalize()
35
  df.loc[model_revision, "Date"] = date
36
 
37
  with open(filepath, "r") as file:
 
82
  for k, v in data["results"].items():
83
  if k != "all" and "_average" not in k:
84
  version = k.split("|")[1].split(":")[-1]
85
+ value = v["qem"] if "qem" in v else v["score"]
86
  df.loc[model_revision, f"{task}_{version}"] = value
87
  # For kaggle_pot we report N metrics, one for each prompt, and store each one as a separate column in the dataframe
88
  elif task.lower() in ["aimo_kaggle_hard_pot"]:
89
  for k, v in data["results"].items():
90
  if k != "all" and "_average" not in k:
91
  version = k.split("|")[1].split(":")[-1]
92
+ value = v["qem"] if "qem" in v else v["score"]
93
  df.loc[model_revision, f"{task}_{version}"] = value
94
+ # For kaggle_tora we report accuracy, so we need to divide by 100
95
+ elif task.lower() in [
96
+ "aimo_tora_eval_kaggle_medium",
97
+ "aimo_kaggle_fast_eval_hard",
98
+ "aimo_kaggle_tora_medium",
99
+ "aimo_kaggle_tora_hard",
100
+ ]:
101
+ for k, v in data["results"].items():
102
+ value = float(v["qem"]) / 100.0
103
+ df.loc[model_revision, f"{task}"] = value
104
  # For AlpacaEval we report the base win rate and the length-controlled one
105
  elif task.lower() == "alpaca_eval":
106
  value = data["results"][first_result_key]["win_rate"]
 
108
  value = data["results"][first_result_key]["length_controlled_winrate"]
109
  df.loc[model_revision, "Alpaca_eval_lc"] = value / 100.0
110
  else:
111
+ df.loc[model_revision, task] = float(value)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  # Drop rows where every entry is NaN
114
  df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c != "Date"])
115
+ # Trim minimath column names
116
+ df.columns = [c.replace("_level_", "_l") for c in df.columns]
117
+
118
+ # Trim AIMO column names
119
+ df.columns = [c.replace("aimo_", "") for c in df.columns]
120
+
121
+ # Rename old AIMO columns
122
+ df.rename(
123
+ columns={"tora_eval_kaggle_medium": "kaggle_tora_medium", "kaggle_fast_eval_hard": "kaggle_tora_hard"},
124
+ inplace=True,
125
+ )
126
+
127
  df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
128
+
129
  # Convert all values to percentage
130
  df[df.select_dtypes(include=["number"]).columns] *= 100.0
131
  df = df.sort_values(by=["Average"], ascending=False)
 
139
  df = df[["Model", "Date"]].merge(merged_df, on="Model", how="left")
140
  df.drop_duplicates(subset=["Model"], inplace=True)
141
  df = df.sort_values(by=["Average"], ascending=False).round(2)
 
 
 
 
 
 
142
  return df
143
 
144
 
 
154
  search_terms = [term.strip().lower() for term in search_terms]
155
  pattern = "|".join(search_terms)
156
  df = df[df["Model"].str.lower().str.contains(pattern, regex=True)]
157
+ # Drop any columns which are all NaN
158
+ df = df.dropna(how="all", axis=1)
159
  return df
160
 
161
 
eval_results/data/deepseek-math-7b-kto-v00.00/checkpoint-1100/main/aimo_kaggle_hard_pot/results_2024-05-29T11-53-40.101219.json DELETED
@@ -1,187 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 158474.057687477,
9
- "end_time": 159098.894280397,
10
- "total_evaluation_time_secondes": "624.8365929200081",
11
- "model_name": "data_deepseek-math-7b-kto-v00.00_checkpoint-1100",
12
- "model_sha": "",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "12.93 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard_pot:v0|0": {
19
- "qem": 0.02,
20
- "qem_stderr": 0.02
21
- },
22
- "custom|aimo_kaggle_hard_pot:v1|0": {
23
- "qem": 0.04,
24
- "qem_stderr": 0.02799416848895062
25
- },
26
- "custom|aimo_kaggle_hard_pot:v2|0": {
27
- "qem": 0.06,
28
- "qem_stderr": 0.0339266916772512
29
- },
30
- "custom|aimo_kaggle_hard_pot:_average|0": {
31
- "qem": 0.04,
32
- "qem_stderr": 0.02730695338873394
33
- },
34
- "all": {
35
- "qem": 0.04,
36
- "qem_stderr": 0.02730695338873394
37
- }
38
- },
39
- "versions": {
40
- "custom|aimo_kaggle_hard_pot:v0|0": 0,
41
- "custom|aimo_kaggle_hard_pot:v1|0": 0,
42
- "custom|aimo_kaggle_hard_pot:v2|0": 0
43
- },
44
- "config_tasks": {
45
- "custom|aimo_kaggle_hard_pot:v0": {
46
- "name": "aimo_kaggle_hard_pot:v0",
47
- "prompt_function": "kaggle_hard_pot_prompt_fn_v0",
48
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
49
- "hf_subset": "v0",
50
- "metric": [
51
- "quasi_exact_match_code_and_math"
52
- ],
53
- "hf_avail_splits": [
54
- "train"
55
- ],
56
- "evaluation_splits": [
57
- "train"
58
- ],
59
- "few_shots_split": null,
60
- "few_shots_select": null,
61
- "generation_size": 2048,
62
- "stop_sequence": null,
63
- "output_regex": null,
64
- "frozen": false,
65
- "suite": [
66
- "custom"
67
- ],
68
- "original_num_docs": 50,
69
- "effective_num_docs": 50,
70
- "trust_dataset": null,
71
- "must_remove_duplicate_docs": null
72
- },
73
- "custom|aimo_kaggle_hard_pot:v1": {
74
- "name": "aimo_kaggle_hard_pot:v1",
75
- "prompt_function": "kaggle_hard_pot_prompt_fn_v1",
76
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
77
- "hf_subset": "v0",
78
- "metric": [
79
- "quasi_exact_match_code_and_math"
80
- ],
81
- "hf_avail_splits": [
82
- "train"
83
- ],
84
- "evaluation_splits": [
85
- "train"
86
- ],
87
- "few_shots_split": null,
88
- "few_shots_select": null,
89
- "generation_size": 2048,
90
- "stop_sequence": null,
91
- "output_regex": null,
92
- "frozen": false,
93
- "suite": [
94
- "custom"
95
- ],
96
- "original_num_docs": 50,
97
- "effective_num_docs": 50,
98
- "trust_dataset": null,
99
- "must_remove_duplicate_docs": null
100
- },
101
- "custom|aimo_kaggle_hard_pot:v2": {
102
- "name": "aimo_kaggle_hard_pot:v2",
103
- "prompt_function": "kaggle_hard_pot_prompt_fn_v2",
104
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
105
- "hf_subset": "v0",
106
- "metric": [
107
- "quasi_exact_match_code_and_math"
108
- ],
109
- "hf_avail_splits": [
110
- "train"
111
- ],
112
- "evaluation_splits": [
113
- "train"
114
- ],
115
- "few_shots_split": null,
116
- "few_shots_select": null,
117
- "generation_size": 2048,
118
- "stop_sequence": null,
119
- "output_regex": null,
120
- "frozen": false,
121
- "suite": [
122
- "custom"
123
- ],
124
- "original_num_docs": 50,
125
- "effective_num_docs": 50,
126
- "trust_dataset": null,
127
- "must_remove_duplicate_docs": null
128
- }
129
- },
130
- "summary_tasks": {
131
- "custom|aimo_kaggle_hard_pot:v0|0": {
132
- "hashes": {
133
- "hash_examples": "303213a38d9f7512",
134
- "hash_full_prompts": "bfd073e62f180246",
135
- "hash_input_tokens": "a647c539b203fcea",
136
- "hash_cont_tokens": "8af712c383b42f4f"
137
- },
138
- "truncated": 50,
139
- "non_truncated": 0,
140
- "padded": 37,
141
- "non_padded": 13,
142
- "effective_few_shots": 0.0,
143
- "num_truncated_few_shots": 0
144
- },
145
- "custom|aimo_kaggle_hard_pot:v1|0": {
146
- "hashes": {
147
- "hash_examples": "e4234b97ad92862f",
148
- "hash_full_prompts": "4747f0cd9a10355c",
149
- "hash_input_tokens": "646c64e34f75a472",
150
- "hash_cont_tokens": "e6d6557050167ea8"
151
- },
152
- "truncated": 50,
153
- "non_truncated": 0,
154
- "padded": 32,
155
- "non_padded": 18,
156
- "effective_few_shots": 0.0,
157
- "num_truncated_few_shots": 0
158
- },
159
- "custom|aimo_kaggle_hard_pot:v2|0": {
160
- "hashes": {
161
- "hash_examples": "6396eb8833e13ba0",
162
- "hash_full_prompts": "0584a2707d3a5d56",
163
- "hash_input_tokens": "e1aae06d528de511",
164
- "hash_cont_tokens": "e7f276640ec93b73"
165
- },
166
- "truncated": 50,
167
- "non_truncated": 0,
168
- "padded": 30,
169
- "non_padded": 20,
170
- "effective_few_shots": 0.0,
171
- "num_truncated_few_shots": 0
172
- }
173
- },
174
- "summary_general": {
175
- "hashes": {
176
- "hash_examples": "648c9a107d279e1e",
177
- "hash_full_prompts": "c13504bf8e62491b",
178
- "hash_input_tokens": "753d23e4899a9f2d",
179
- "hash_cont_tokens": "d6c8fd284cb6c9a9"
180
- },
181
- "truncated": 150,
182
- "non_truncated": 0,
183
- "padded": 99,
184
- "non_padded": 51,
185
- "num_truncated_few_shots": 0
186
- }
187
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/data/deepseek-math-7b-kto-v00.00/checkpoint-1100/main/aimo_kaggle_medium_pot/results_2024-05-29T11-48-26.048131.json DELETED
@@ -1,187 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 4221015.208132972,
9
- "end_time": 4221347.983842688,
10
- "total_evaluation_time_secondes": "332.77570971567184",
11
- "model_name": "data_deepseek-math-7b-kto-v00.00_checkpoint-1100",
12
- "model_sha": "",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "12.93 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium_pot:v0|0": {
19
- "qem": 0.0,
20
- "qem_stderr": 0.0
21
- },
22
- "custom|aimo_kaggle_medium_pot:v1|0": {
23
- "qem": 0.05,
24
- "qem_stderr": 0.03489912202260563
25
- },
26
- "custom|aimo_kaggle_medium_pot:v2|0": {
27
- "qem": 0.125,
28
- "qem_stderr": 0.05295740910852021
29
- },
30
- "custom|aimo_kaggle_medium_pot:_average|0": {
31
- "qem": 0.05833333333333333,
32
- "qem_stderr": 0.02928551037704195
33
- },
34
- "all": {
35
- "qem": 0.05833333333333333,
36
- "qem_stderr": 0.02928551037704195
37
- }
38
- },
39
- "versions": {
40
- "custom|aimo_kaggle_medium_pot:v0|0": 0,
41
- "custom|aimo_kaggle_medium_pot:v1|0": 0,
42
- "custom|aimo_kaggle_medium_pot:v2|0": 0
43
- },
44
- "config_tasks": {
45
- "custom|aimo_kaggle_medium_pot:v0": {
46
- "name": "aimo_kaggle_medium_pot:v0",
47
- "prompt_function": "kaggle_medium_pot_prompt_fn_v0",
48
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
49
- "hf_subset": "v0",
50
- "metric": [
51
- "quasi_exact_match_code_and_math"
52
- ],
53
- "hf_avail_splits": [
54
- "train"
55
- ],
56
- "evaluation_splits": [
57
- "train"
58
- ],
59
- "few_shots_split": null,
60
- "few_shots_select": null,
61
- "generation_size": 2048,
62
- "stop_sequence": null,
63
- "output_regex": null,
64
- "frozen": false,
65
- "suite": [
66
- "custom"
67
- ],
68
- "original_num_docs": 40,
69
- "effective_num_docs": 40,
70
- "trust_dataset": null,
71
- "must_remove_duplicate_docs": null
72
- },
73
- "custom|aimo_kaggle_medium_pot:v1": {
74
- "name": "aimo_kaggle_medium_pot:v1",
75
- "prompt_function": "kaggle_medium_pot_prompt_fn_v1",
76
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
77
- "hf_subset": "v0",
78
- "metric": [
79
- "quasi_exact_match_code_and_math"
80
- ],
81
- "hf_avail_splits": [
82
- "train"
83
- ],
84
- "evaluation_splits": [
85
- "train"
86
- ],
87
- "few_shots_split": null,
88
- "few_shots_select": null,
89
- "generation_size": 2048,
90
- "stop_sequence": null,
91
- "output_regex": null,
92
- "frozen": false,
93
- "suite": [
94
- "custom"
95
- ],
96
- "original_num_docs": 40,
97
- "effective_num_docs": 40,
98
- "trust_dataset": null,
99
- "must_remove_duplicate_docs": null
100
- },
101
- "custom|aimo_kaggle_medium_pot:v2": {
102
- "name": "aimo_kaggle_medium_pot:v2",
103
- "prompt_function": "kaggle_medium_pot_prompt_fn_v2",
104
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
105
- "hf_subset": "v0",
106
- "metric": [
107
- "quasi_exact_match_code_and_math"
108
- ],
109
- "hf_avail_splits": [
110
- "train"
111
- ],
112
- "evaluation_splits": [
113
- "train"
114
- ],
115
- "few_shots_split": null,
116
- "few_shots_select": null,
117
- "generation_size": 2048,
118
- "stop_sequence": null,
119
- "output_regex": null,
120
- "frozen": false,
121
- "suite": [
122
- "custom"
123
- ],
124
- "original_num_docs": 40,
125
- "effective_num_docs": 40,
126
- "trust_dataset": null,
127
- "must_remove_duplicate_docs": null
128
- }
129
- },
130
- "summary_tasks": {
131
- "custom|aimo_kaggle_medium_pot:v0|0": {
132
- "hashes": {
133
- "hash_examples": "2799c24461029dc3",
134
- "hash_full_prompts": "2af864fbfc2e0a79",
135
- "hash_input_tokens": "324d7ae112f05205",
136
- "hash_cont_tokens": "982187434eb2bfe6"
137
- },
138
- "truncated": 40,
139
- "non_truncated": 0,
140
- "padded": 29,
141
- "non_padded": 11,
142
- "effective_few_shots": 0.0,
143
- "num_truncated_few_shots": 0
144
- },
145
- "custom|aimo_kaggle_medium_pot:v1|0": {
146
- "hashes": {
147
- "hash_examples": "806b2e2056b41f84",
148
- "hash_full_prompts": "8123a0d96a6ceb9d",
149
- "hash_input_tokens": "b1f2be384f5fe5f1",
150
- "hash_cont_tokens": "8bf35f908f11fca1"
151
- },
152
- "truncated": 40,
153
- "non_truncated": 0,
154
- "padded": 26,
155
- "non_padded": 14,
156
- "effective_few_shots": 0.0,
157
- "num_truncated_few_shots": 0
158
- },
159
- "custom|aimo_kaggle_medium_pot:v2|0": {
160
- "hashes": {
161
- "hash_examples": "d8534375acc5d427",
162
- "hash_full_prompts": "71ba7c8172fec45a",
163
- "hash_input_tokens": "3e80be021360103e",
164
- "hash_cont_tokens": "368e802422b88fee"
165
- },
166
- "truncated": 40,
167
- "non_truncated": 0,
168
- "padded": 31,
169
- "non_padded": 9,
170
- "effective_few_shots": 0.0,
171
- "num_truncated_few_shots": 0
172
- }
173
- },
174
- "summary_general": {
175
- "hashes": {
176
- "hash_examples": "623505a45a4910c2",
177
- "hash_full_prompts": "0ee7c8ef786b9aa3",
178
- "hash_input_tokens": "e7650dee68c62549",
179
- "hash_cont_tokens": "529792dd876efcf8"
180
- },
181
- "truncated": 120,
182
- "non_truncated": 0,
183
- "padded": 86,
184
- "non_padded": 34,
185
- "num_truncated_few_shots": 0
186
- }
187
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/data/deepseek-math-7b-kto-v01.00/checkpoint-500/main/aimo_kaggle_hard_pot/results_2024-05-30T09-30-22.587150.json DELETED
@@ -1,187 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 1769817.680840058,
9
- "end_time": 1770501.834820281,
10
- "total_evaluation_time_secondes": "684.1539802229963",
11
- "model_name": "data_deepseek-math-7b-kto-v01.00_checkpoint-500",
12
- "model_sha": "",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "12.93 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard_pot:v0|0": {
19
- "qem": 0.02,
20
- "qem_stderr": 0.02
21
- },
22
- "custom|aimo_kaggle_hard_pot:v1|0": {
23
- "qem": 0.02,
24
- "qem_stderr": 0.02
25
- },
26
- "custom|aimo_kaggle_hard_pot:v2|0": {
27
- "qem": 0.1,
28
- "qem_stderr": 0.04285714285714283
29
- },
30
- "custom|aimo_kaggle_hard_pot:_average|0": {
31
- "qem": 0.04666666666666667,
32
- "qem_stderr": 0.02761904761904761
33
- },
34
- "all": {
35
- "qem": 0.04666666666666667,
36
- "qem_stderr": 0.02761904761904761
37
- }
38
- },
39
- "versions": {
40
- "custom|aimo_kaggle_hard_pot:v0|0": 0,
41
- "custom|aimo_kaggle_hard_pot:v1|0": 0,
42
- "custom|aimo_kaggle_hard_pot:v2|0": 0
43
- },
44
- "config_tasks": {
45
- "custom|aimo_kaggle_hard_pot:v0": {
46
- "name": "aimo_kaggle_hard_pot:v0",
47
- "prompt_function": "kaggle_hard_pot_prompt_fn_v0",
48
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
49
- "hf_subset": "v0",
50
- "metric": [
51
- "quasi_exact_match_code_and_math"
52
- ],
53
- "hf_avail_splits": [
54
- "train"
55
- ],
56
- "evaluation_splits": [
57
- "train"
58
- ],
59
- "few_shots_split": null,
60
- "few_shots_select": null,
61
- "generation_size": 2048,
62
- "stop_sequence": null,
63
- "output_regex": null,
64
- "frozen": false,
65
- "suite": [
66
- "custom"
67
- ],
68
- "original_num_docs": 50,
69
- "effective_num_docs": 50,
70
- "trust_dataset": null,
71
- "must_remove_duplicate_docs": null
72
- },
73
- "custom|aimo_kaggle_hard_pot:v1": {
74
- "name": "aimo_kaggle_hard_pot:v1",
75
- "prompt_function": "kaggle_hard_pot_prompt_fn_v1",
76
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
77
- "hf_subset": "v0",
78
- "metric": [
79
- "quasi_exact_match_code_and_math"
80
- ],
81
- "hf_avail_splits": [
82
- "train"
83
- ],
84
- "evaluation_splits": [
85
- "train"
86
- ],
87
- "few_shots_split": null,
88
- "few_shots_select": null,
89
- "generation_size": 2048,
90
- "stop_sequence": null,
91
- "output_regex": null,
92
- "frozen": false,
93
- "suite": [
94
- "custom"
95
- ],
96
- "original_num_docs": 50,
97
- "effective_num_docs": 50,
98
- "trust_dataset": null,
99
- "must_remove_duplicate_docs": null
100
- },
101
- "custom|aimo_kaggle_hard_pot:v2": {
102
- "name": "aimo_kaggle_hard_pot:v2",
103
- "prompt_function": "kaggle_hard_pot_prompt_fn_v2",
104
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
105
- "hf_subset": "v0",
106
- "metric": [
107
- "quasi_exact_match_code_and_math"
108
- ],
109
- "hf_avail_splits": [
110
- "train"
111
- ],
112
- "evaluation_splits": [
113
- "train"
114
- ],
115
- "few_shots_split": null,
116
- "few_shots_select": null,
117
- "generation_size": 2048,
118
- "stop_sequence": null,
119
- "output_regex": null,
120
- "frozen": false,
121
- "suite": [
122
- "custom"
123
- ],
124
- "original_num_docs": 50,
125
- "effective_num_docs": 50,
126
- "trust_dataset": null,
127
- "must_remove_duplicate_docs": null
128
- }
129
- },
130
- "summary_tasks": {
131
- "custom|aimo_kaggle_hard_pot:v0|0": {
132
- "hashes": {
133
- "hash_examples": "303213a38d9f7512",
134
- "hash_full_prompts": "bfd073e62f180246",
135
- "hash_input_tokens": "a647c539b203fcea",
136
- "hash_cont_tokens": "a42fb77baa626887"
137
- },
138
- "truncated": 50,
139
- "non_truncated": 0,
140
- "padded": 37,
141
- "non_padded": 13,
142
- "effective_few_shots": 0.0,
143
- "num_truncated_few_shots": 0
144
- },
145
- "custom|aimo_kaggle_hard_pot:v1|0": {
146
- "hashes": {
147
- "hash_examples": "e4234b97ad92862f",
148
- "hash_full_prompts": "4747f0cd9a10355c",
149
- "hash_input_tokens": "646c64e34f75a472",
150
- "hash_cont_tokens": "389ba1911e6ab503"
151
- },
152
- "truncated": 50,
153
- "non_truncated": 0,
154
- "padded": 32,
155
- "non_padded": 18,
156
- "effective_few_shots": 0.0,
157
- "num_truncated_few_shots": 0
158
- },
159
- "custom|aimo_kaggle_hard_pot:v2|0": {
160
- "hashes": {
161
- "hash_examples": "6396eb8833e13ba0",
162
- "hash_full_prompts": "0584a2707d3a5d56",
163
- "hash_input_tokens": "e1aae06d528de511",
164
- "hash_cont_tokens": "95d69487d0b81a0a"
165
- },
166
- "truncated": 50,
167
- "non_truncated": 0,
168
- "padded": 30,
169
- "non_padded": 20,
170
- "effective_few_shots": 0.0,
171
- "num_truncated_few_shots": 0
172
- }
173
- },
174
- "summary_general": {
175
- "hashes": {
176
- "hash_examples": "648c9a107d279e1e",
177
- "hash_full_prompts": "c13504bf8e62491b",
178
- "hash_input_tokens": "753d23e4899a9f2d",
179
- "hash_cont_tokens": "1d7af82972d43dbe"
180
- },
181
- "truncated": 150,
182
- "non_truncated": 0,
183
- "padded": 99,
184
- "non_padded": 51,
185
- "num_truncated_few_shots": 0
186
- }
187
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/data/deepseek-math-7b-kto-v01.00/checkpoint-500/main/aimo_kaggle_hard_pot/results_2024-05-30T11-56-47.491725.json DELETED
@@ -1,187 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 218990.585347064,
9
- "end_time": 219650.665919556,
10
- "total_evaluation_time_secondes": "660.0805724919774",
11
- "model_name": "data_deepseek-math-7b-kto-v01.00_checkpoint-500",
12
- "model_sha": "",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "12.93 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard_pot:v0|0": {
19
- "score": 0.02,
20
- "score_stderr": 0.02
21
- },
22
- "custom|aimo_kaggle_hard_pot:v1|0": {
23
- "score": 0.02,
24
- "score_stderr": 0.02
25
- },
26
- "custom|aimo_kaggle_hard_pot:v2|0": {
27
- "score": 0.1,
28
- "score_stderr": 0.04285714285714283
29
- },
30
- "custom|aimo_kaggle_hard_pot:_average|0": {
31
- "score": 0.04666666666666667,
32
- "score_stderr": 0.02761904761904761
33
- },
34
- "all": {
35
- "score": 0.04666666666666667,
36
- "score_stderr": 0.02761904761904761
37
- }
38
- },
39
- "versions": {
40
- "custom|aimo_kaggle_hard_pot:v0|0": 0,
41
- "custom|aimo_kaggle_hard_pot:v1|0": 0,
42
- "custom|aimo_kaggle_hard_pot:v2|0": 0
43
- },
44
- "config_tasks": {
45
- "custom|aimo_kaggle_hard_pot:v0": {
46
- "name": "aimo_kaggle_hard_pot:v0",
47
- "prompt_function": "kaggle_hard_pot_prompt_fn_v0",
48
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
49
- "hf_subset": "v0",
50
- "metric": [
51
- "quasi_exact_match_code_and_math"
52
- ],
53
- "hf_avail_splits": [
54
- "train"
55
- ],
56
- "evaluation_splits": [
57
- "train"
58
- ],
59
- "few_shots_split": null,
60
- "few_shots_select": null,
61
- "generation_size": 2048,
62
- "stop_sequence": null,
63
- "output_regex": null,
64
- "frozen": false,
65
- "suite": [
66
- "custom"
67
- ],
68
- "original_num_docs": 50,
69
- "effective_num_docs": 50,
70
- "trust_dataset": null,
71
- "must_remove_duplicate_docs": null
72
- },
73
- "custom|aimo_kaggle_hard_pot:v1": {
74
- "name": "aimo_kaggle_hard_pot:v1",
75
- "prompt_function": "kaggle_hard_pot_prompt_fn_v1",
76
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
77
- "hf_subset": "v0",
78
- "metric": [
79
- "quasi_exact_match_code_and_math"
80
- ],
81
- "hf_avail_splits": [
82
- "train"
83
- ],
84
- "evaluation_splits": [
85
- "train"
86
- ],
87
- "few_shots_split": null,
88
- "few_shots_select": null,
89
- "generation_size": 2048,
90
- "stop_sequence": null,
91
- "output_regex": null,
92
- "frozen": false,
93
- "suite": [
94
- "custom"
95
- ],
96
- "original_num_docs": 50,
97
- "effective_num_docs": 50,
98
- "trust_dataset": null,
99
- "must_remove_duplicate_docs": null
100
- },
101
- "custom|aimo_kaggle_hard_pot:v2": {
102
- "name": "aimo_kaggle_hard_pot:v2",
103
- "prompt_function": "kaggle_hard_pot_prompt_fn_v2",
104
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
105
- "hf_subset": "v0",
106
- "metric": [
107
- "quasi_exact_match_code_and_math"
108
- ],
109
- "hf_avail_splits": [
110
- "train"
111
- ],
112
- "evaluation_splits": [
113
- "train"
114
- ],
115
- "few_shots_split": null,
116
- "few_shots_select": null,
117
- "generation_size": 2048,
118
- "stop_sequence": null,
119
- "output_regex": null,
120
- "frozen": false,
121
- "suite": [
122
- "custom"
123
- ],
124
- "original_num_docs": 50,
125
- "effective_num_docs": 50,
126
- "trust_dataset": null,
127
- "must_remove_duplicate_docs": null
128
- }
129
- },
130
- "summary_tasks": {
131
- "custom|aimo_kaggle_hard_pot:v0|0": {
132
- "hashes": {
133
- "hash_examples": "303213a38d9f7512",
134
- "hash_full_prompts": "bfd073e62f180246",
135
- "hash_input_tokens": "a647c539b203fcea",
136
- "hash_cont_tokens": "a42fb77baa626887"
137
- },
138
- "truncated": 50,
139
- "non_truncated": 0,
140
- "padded": 37,
141
- "non_padded": 13,
142
- "effective_few_shots": 0.0,
143
- "num_truncated_few_shots": 0
144
- },
145
- "custom|aimo_kaggle_hard_pot:v1|0": {
146
- "hashes": {
147
- "hash_examples": "e4234b97ad92862f",
148
- "hash_full_prompts": "4747f0cd9a10355c",
149
- "hash_input_tokens": "646c64e34f75a472",
150
- "hash_cont_tokens": "389ba1911e6ab503"
151
- },
152
- "truncated": 50,
153
- "non_truncated": 0,
154
- "padded": 32,
155
- "non_padded": 18,
156
- "effective_few_shots": 0.0,
157
- "num_truncated_few_shots": 0
158
- },
159
- "custom|aimo_kaggle_hard_pot:v2|0": {
160
- "hashes": {
161
- "hash_examples": "6396eb8833e13ba0",
162
- "hash_full_prompts": "0584a2707d3a5d56",
163
- "hash_input_tokens": "e1aae06d528de511",
164
- "hash_cont_tokens": "95d69487d0b81a0a"
165
- },
166
- "truncated": 50,
167
- "non_truncated": 0,
168
- "padded": 30,
169
- "non_padded": 20,
170
- "effective_few_shots": 0.0,
171
- "num_truncated_few_shots": 0
172
- }
173
- },
174
- "summary_general": {
175
- "hashes": {
176
- "hash_examples": "648c9a107d279e1e",
177
- "hash_full_prompts": "c13504bf8e62491b",
178
- "hash_input_tokens": "753d23e4899a9f2d",
179
- "hash_cont_tokens": "1d7af82972d43dbe"
180
- },
181
- "truncated": 150,
182
- "non_truncated": 0,
183
- "padded": 99,
184
- "non_padded": 51,
185
- "num_truncated_few_shots": 0
186
- }
187
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/data/deepseek-math-7b-kto-v01.00/checkpoint-500/main/aimo_kaggle_hard_pot/results_2024-05-30T14-47-24.379216.json DELETED
@@ -1,187 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 229226.755021449,
9
- "end_time": 229887.551242834,
10
- "total_evaluation_time_secondes": "660.7962213849823",
11
- "model_name": "data_deepseek-math-7b-kto-v01.00_checkpoint-500",
12
- "model_sha": "",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "12.93 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard_pot:v0|0": {
19
- "score": 0.02,
20
- "score_stderr": 0.02
21
- },
22
- "custom|aimo_kaggle_hard_pot:v1|0": {
23
- "score": 0.02,
24
- "score_stderr": 0.02
25
- },
26
- "custom|aimo_kaggle_hard_pot:v2|0": {
27
- "score": 0.1,
28
- "score_stderr": 0.04285714285714283
29
- },
30
- "custom|aimo_kaggle_hard_pot:_average|0": {
31
- "score": 0.04666666666666667,
32
- "score_stderr": 0.02761904761904761
33
- },
34
- "all": {
35
- "score": 0.04666666666666667,
36
- "score_stderr": 0.02761904761904761
37
- }
38
- },
39
- "versions": {
40
- "custom|aimo_kaggle_hard_pot:v0|0": 0,
41
- "custom|aimo_kaggle_hard_pot:v1|0": 0,
42
- "custom|aimo_kaggle_hard_pot:v2|0": 0
43
- },
44
- "config_tasks": {
45
- "custom|aimo_kaggle_hard_pot:v0": {
46
- "name": "aimo_kaggle_hard_pot:v0",
47
- "prompt_function": "kaggle_hard_pot_prompt_fn_v0",
48
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
49
- "hf_subset": "v0",
50
- "metric": [
51
- "quasi_exact_match_code_and_math"
52
- ],
53
- "hf_avail_splits": [
54
- "train"
55
- ],
56
- "evaluation_splits": [
57
- "train"
58
- ],
59
- "few_shots_split": null,
60
- "few_shots_select": null,
61
- "generation_size": 2048,
62
- "stop_sequence": null,
63
- "output_regex": null,
64
- "frozen": false,
65
- "suite": [
66
- "custom"
67
- ],
68
- "original_num_docs": 50,
69
- "effective_num_docs": 50,
70
- "trust_dataset": null,
71
- "must_remove_duplicate_docs": null
72
- },
73
- "custom|aimo_kaggle_hard_pot:v1": {
74
- "name": "aimo_kaggle_hard_pot:v1",
75
- "prompt_function": "kaggle_hard_pot_prompt_fn_v1",
76
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
77
- "hf_subset": "v0",
78
- "metric": [
79
- "quasi_exact_match_code_and_math"
80
- ],
81
- "hf_avail_splits": [
82
- "train"
83
- ],
84
- "evaluation_splits": [
85
- "train"
86
- ],
87
- "few_shots_split": null,
88
- "few_shots_select": null,
89
- "generation_size": 2048,
90
- "stop_sequence": null,
91
- "output_regex": null,
92
- "frozen": false,
93
- "suite": [
94
- "custom"
95
- ],
96
- "original_num_docs": 50,
97
- "effective_num_docs": 50,
98
- "trust_dataset": null,
99
- "must_remove_duplicate_docs": null
100
- },
101
- "custom|aimo_kaggle_hard_pot:v2": {
102
- "name": "aimo_kaggle_hard_pot:v2",
103
- "prompt_function": "kaggle_hard_pot_prompt_fn_v2",
104
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
105
- "hf_subset": "v0",
106
- "metric": [
107
- "quasi_exact_match_code_and_math"
108
- ],
109
- "hf_avail_splits": [
110
- "train"
111
- ],
112
- "evaluation_splits": [
113
- "train"
114
- ],
115
- "few_shots_split": null,
116
- "few_shots_select": null,
117
- "generation_size": 2048,
118
- "stop_sequence": null,
119
- "output_regex": null,
120
- "frozen": false,
121
- "suite": [
122
- "custom"
123
- ],
124
- "original_num_docs": 50,
125
- "effective_num_docs": 50,
126
- "trust_dataset": null,
127
- "must_remove_duplicate_docs": null
128
- }
129
- },
130
- "summary_tasks": {
131
- "custom|aimo_kaggle_hard_pot:v0|0": {
132
- "hashes": {
133
- "hash_examples": "303213a38d9f7512",
134
- "hash_full_prompts": "bfd073e62f180246",
135
- "hash_input_tokens": "a647c539b203fcea",
136
- "hash_cont_tokens": "a42fb77baa626887"
137
- },
138
- "truncated": 50,
139
- "non_truncated": 0,
140
- "padded": 37,
141
- "non_padded": 13,
142
- "effective_few_shots": 0.0,
143
- "num_truncated_few_shots": 0
144
- },
145
- "custom|aimo_kaggle_hard_pot:v1|0": {
146
- "hashes": {
147
- "hash_examples": "e4234b97ad92862f",
148
- "hash_full_prompts": "4747f0cd9a10355c",
149
- "hash_input_tokens": "646c64e34f75a472",
150
- "hash_cont_tokens": "389ba1911e6ab503"
151
- },
152
- "truncated": 50,
153
- "non_truncated": 0,
154
- "padded": 32,
155
- "non_padded": 18,
156
- "effective_few_shots": 0.0,
157
- "num_truncated_few_shots": 0
158
- },
159
- "custom|aimo_kaggle_hard_pot:v2|0": {
160
- "hashes": {
161
- "hash_examples": "6396eb8833e13ba0",
162
- "hash_full_prompts": "0584a2707d3a5d56",
163
- "hash_input_tokens": "e1aae06d528de511",
164
- "hash_cont_tokens": "95d69487d0b81a0a"
165
- },
166
- "truncated": 50,
167
- "non_truncated": 0,
168
- "padded": 30,
169
- "non_padded": 20,
170
- "effective_few_shots": 0.0,
171
- "num_truncated_few_shots": 0
172
- }
173
- },
174
- "summary_general": {
175
- "hashes": {
176
- "hash_examples": "648c9a107d279e1e",
177
- "hash_full_prompts": "c13504bf8e62491b",
178
- "hash_input_tokens": "753d23e4899a9f2d",
179
- "hash_cont_tokens": "1d7af82972d43dbe"
180
- },
181
- "truncated": 150,
182
- "non_truncated": 0,
183
- "padded": 99,
184
- "non_padded": 51,
185
- "num_truncated_few_shots": 0
186
- }
187
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/data/deepseek-math-7b-kto-v01.00/checkpoint-500/main/aimo_kaggle_hard_pot/results_2024-05-30T15-40-17.606001.json DELETED
@@ -1,187 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 351189.55692062,
9
- "end_time": 351799.590479848,
10
- "total_evaluation_time_secondes": "610.0335592280026",
11
- "model_name": "data_deepseek-math-7b-kto-v01.00_checkpoint-500",
12
- "model_sha": "",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "12.93 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard_pot:v0|0": {
19
- "score": 0.02,
20
- "score_stderr": 0.02
21
- },
22
- "custom|aimo_kaggle_hard_pot:v1|0": {
23
- "score": 0.02,
24
- "score_stderr": 0.02
25
- },
26
- "custom|aimo_kaggle_hard_pot:v2|0": {
27
- "score": 0.1,
28
- "score_stderr": 0.04285714285714283
29
- },
30
- "custom|aimo_kaggle_hard_pot:_average|0": {
31
- "score": 0.04666666666666667,
32
- "score_stderr": 0.02761904761904761
33
- },
34
- "all": {
35
- "score": 0.04666666666666667,
36
- "score_stderr": 0.02761904761904761
37
- }
38
- },
39
- "versions": {
40
- "custom|aimo_kaggle_hard_pot:v0|0": 0,
41
- "custom|aimo_kaggle_hard_pot:v1|0": 0,
42
- "custom|aimo_kaggle_hard_pot:v2|0": 0
43
- },
44
- "config_tasks": {
45
- "custom|aimo_kaggle_hard_pot:v0": {
46
- "name": "aimo_kaggle_hard_pot:v0",
47
- "prompt_function": "kaggle_hard_pot_prompt_fn_v0",
48
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
49
- "hf_subset": "v0",
50
- "metric": [
51
- "quasi_exact_match_code_and_math"
52
- ],
53
- "hf_avail_splits": [
54
- "train"
55
- ],
56
- "evaluation_splits": [
57
- "train"
58
- ],
59
- "few_shots_split": null,
60
- "few_shots_select": null,
61
- "generation_size": 2048,
62
- "stop_sequence": null,
63
- "output_regex": null,
64
- "frozen": false,
65
- "suite": [
66
- "custom"
67
- ],
68
- "original_num_docs": 50,
69
- "effective_num_docs": 50,
70
- "trust_dataset": null,
71
- "must_remove_duplicate_docs": null
72
- },
73
- "custom|aimo_kaggle_hard_pot:v1": {
74
- "name": "aimo_kaggle_hard_pot:v1",
75
- "prompt_function": "kaggle_hard_pot_prompt_fn_v1",
76
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
77
- "hf_subset": "v0",
78
- "metric": [
79
- "quasi_exact_match_code_and_math"
80
- ],
81
- "hf_avail_splits": [
82
- "train"
83
- ],
84
- "evaluation_splits": [
85
- "train"
86
- ],
87
- "few_shots_split": null,
88
- "few_shots_select": null,
89
- "generation_size": 2048,
90
- "stop_sequence": null,
91
- "output_regex": null,
92
- "frozen": false,
93
- "suite": [
94
- "custom"
95
- ],
96
- "original_num_docs": 50,
97
- "effective_num_docs": 50,
98
- "trust_dataset": null,
99
- "must_remove_duplicate_docs": null
100
- },
101
- "custom|aimo_kaggle_hard_pot:v2": {
102
- "name": "aimo_kaggle_hard_pot:v2",
103
- "prompt_function": "kaggle_hard_pot_prompt_fn_v2",
104
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
105
- "hf_subset": "v0",
106
- "metric": [
107
- "quasi_exact_match_code_and_math"
108
- ],
109
- "hf_avail_splits": [
110
- "train"
111
- ],
112
- "evaluation_splits": [
113
- "train"
114
- ],
115
- "few_shots_split": null,
116
- "few_shots_select": null,
117
- "generation_size": 2048,
118
- "stop_sequence": null,
119
- "output_regex": null,
120
- "frozen": false,
121
- "suite": [
122
- "custom"
123
- ],
124
- "original_num_docs": 50,
125
- "effective_num_docs": 50,
126
- "trust_dataset": null,
127
- "must_remove_duplicate_docs": null
128
- }
129
- },
130
- "summary_tasks": {
131
- "custom|aimo_kaggle_hard_pot:v0|0": {
132
- "hashes": {
133
- "hash_examples": "303213a38d9f7512",
134
- "hash_full_prompts": "bfd073e62f180246",
135
- "hash_input_tokens": "a647c539b203fcea",
136
- "hash_cont_tokens": "a42fb77baa626887"
137
- },
138
- "truncated": 50,
139
- "non_truncated": 0,
140
- "padded": 37,
141
- "non_padded": 13,
142
- "effective_few_shots": 0.0,
143
- "num_truncated_few_shots": 0
144
- },
145
- "custom|aimo_kaggle_hard_pot:v1|0": {
146
- "hashes": {
147
- "hash_examples": "e4234b97ad92862f",
148
- "hash_full_prompts": "4747f0cd9a10355c",
149
- "hash_input_tokens": "646c64e34f75a472",
150
- "hash_cont_tokens": "389ba1911e6ab503"
151
- },
152
- "truncated": 50,
153
- "non_truncated": 0,
154
- "padded": 32,
155
- "non_padded": 18,
156
- "effective_few_shots": 0.0,
157
- "num_truncated_few_shots": 0
158
- },
159
- "custom|aimo_kaggle_hard_pot:v2|0": {
160
- "hashes": {
161
- "hash_examples": "6396eb8833e13ba0",
162
- "hash_full_prompts": "0584a2707d3a5d56",
163
- "hash_input_tokens": "e1aae06d528de511",
164
- "hash_cont_tokens": "95d69487d0b81a0a"
165
- },
166
- "truncated": 50,
167
- "non_truncated": 0,
168
- "padded": 30,
169
- "non_padded": 20,
170
- "effective_few_shots": 0.0,
171
- "num_truncated_few_shots": 0
172
- }
173
- },
174
- "summary_general": {
175
- "hashes": {
176
- "hash_examples": "648c9a107d279e1e",
177
- "hash_full_prompts": "c13504bf8e62491b",
178
- "hash_input_tokens": "753d23e4899a9f2d",
179
- "hash_cont_tokens": "1d7af82972d43dbe"
180
- },
181
- "truncated": 150,
182
- "non_truncated": 0,
183
- "padded": 99,
184
- "non_padded": 51,
185
- "num_truncated_few_shots": 0
186
- }
187
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/data/deepseek-math-7b-kto-v01.00/checkpoint-500/main/aimo_kaggle_medium_pot/results_2024-05-30T09-22-56.144851.json DELETED
@@ -1,187 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 345938.21674439,
9
- "end_time": 346259.093267297,
10
- "total_evaluation_time_secondes": "320.8765229069977",
11
- "model_name": "data_deepseek-math-7b-kto-v01.00_checkpoint-500",
12
- "model_sha": "",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "12.93 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium_pot:v0|0": {
19
- "qem": 0.025,
20
- "qem_stderr": 0.024999999999999998
21
- },
22
- "custom|aimo_kaggle_medium_pot:v1|0": {
23
- "qem": 0.025,
24
- "qem_stderr": 0.02499999999999999
25
- },
26
- "custom|aimo_kaggle_medium_pot:v2|0": {
27
- "qem": 0.225,
28
- "qem_stderr": 0.06686668711812967
29
- },
30
- "custom|aimo_kaggle_medium_pot:_average|0": {
31
- "qem": 0.09166666666666667,
32
- "qem_stderr": 0.03895556237270988
33
- },
34
- "all": {
35
- "qem": 0.09166666666666667,
36
- "qem_stderr": 0.03895556237270988
37
- }
38
- },
39
- "versions": {
40
- "custom|aimo_kaggle_medium_pot:v0|0": 0,
41
- "custom|aimo_kaggle_medium_pot:v1|0": 0,
42
- "custom|aimo_kaggle_medium_pot:v2|0": 0
43
- },
44
- "config_tasks": {
45
- "custom|aimo_kaggle_medium_pot:v0": {
46
- "name": "aimo_kaggle_medium_pot:v0",
47
- "prompt_function": "kaggle_medium_pot_prompt_fn_v0",
48
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
49
- "hf_subset": "v0",
50
- "metric": [
51
- "quasi_exact_match_code_and_math"
52
- ],
53
- "hf_avail_splits": [
54
- "train"
55
- ],
56
- "evaluation_splits": [
57
- "train"
58
- ],
59
- "few_shots_split": null,
60
- "few_shots_select": null,
61
- "generation_size": 2048,
62
- "stop_sequence": null,
63
- "output_regex": null,
64
- "frozen": false,
65
- "suite": [
66
- "custom"
67
- ],
68
- "original_num_docs": 40,
69
- "effective_num_docs": 40,
70
- "trust_dataset": null,
71
- "must_remove_duplicate_docs": null
72
- },
73
- "custom|aimo_kaggle_medium_pot:v1": {
74
- "name": "aimo_kaggle_medium_pot:v1",
75
- "prompt_function": "kaggle_medium_pot_prompt_fn_v1",
76
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
77
- "hf_subset": "v0",
78
- "metric": [
79
- "quasi_exact_match_code_and_math"
80
- ],
81
- "hf_avail_splits": [
82
- "train"
83
- ],
84
- "evaluation_splits": [
85
- "train"
86
- ],
87
- "few_shots_split": null,
88
- "few_shots_select": null,
89
- "generation_size": 2048,
90
- "stop_sequence": null,
91
- "output_regex": null,
92
- "frozen": false,
93
- "suite": [
94
- "custom"
95
- ],
96
- "original_num_docs": 40,
97
- "effective_num_docs": 40,
98
- "trust_dataset": null,
99
- "must_remove_duplicate_docs": null
100
- },
101
- "custom|aimo_kaggle_medium_pot:v2": {
102
- "name": "aimo_kaggle_medium_pot:v2",
103
- "prompt_function": "kaggle_medium_pot_prompt_fn_v2",
104
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
105
- "hf_subset": "v0",
106
- "metric": [
107
- "quasi_exact_match_code_and_math"
108
- ],
109
- "hf_avail_splits": [
110
- "train"
111
- ],
112
- "evaluation_splits": [
113
- "train"
114
- ],
115
- "few_shots_split": null,
116
- "few_shots_select": null,
117
- "generation_size": 2048,
118
- "stop_sequence": null,
119
- "output_regex": null,
120
- "frozen": false,
121
- "suite": [
122
- "custom"
123
- ],
124
- "original_num_docs": 40,
125
- "effective_num_docs": 40,
126
- "trust_dataset": null,
127
- "must_remove_duplicate_docs": null
128
- }
129
- },
130
- "summary_tasks": {
131
- "custom|aimo_kaggle_medium_pot:v0|0": {
132
- "hashes": {
133
- "hash_examples": "2799c24461029dc3",
134
- "hash_full_prompts": "2af864fbfc2e0a79",
135
- "hash_input_tokens": "324d7ae112f05205",
136
- "hash_cont_tokens": "c722f0ca923e4e78"
137
- },
138
- "truncated": 40,
139
- "non_truncated": 0,
140
- "padded": 29,
141
- "non_padded": 11,
142
- "effective_few_shots": 0.0,
143
- "num_truncated_few_shots": 0
144
- },
145
- "custom|aimo_kaggle_medium_pot:v1|0": {
146
- "hashes": {
147
- "hash_examples": "806b2e2056b41f84",
148
- "hash_full_prompts": "8123a0d96a6ceb9d",
149
- "hash_input_tokens": "b1f2be384f5fe5f1",
150
- "hash_cont_tokens": "95497768ab3d9728"
151
- },
152
- "truncated": 40,
153
- "non_truncated": 0,
154
- "padded": 26,
155
- "non_padded": 14,
156
- "effective_few_shots": 0.0,
157
- "num_truncated_few_shots": 0
158
- },
159
- "custom|aimo_kaggle_medium_pot:v2|0": {
160
- "hashes": {
161
- "hash_examples": "d8534375acc5d427",
162
- "hash_full_prompts": "71ba7c8172fec45a",
163
- "hash_input_tokens": "3e80be021360103e",
164
- "hash_cont_tokens": "45304647229c8ee3"
165
- },
166
- "truncated": 40,
167
- "non_truncated": 0,
168
- "padded": 31,
169
- "non_padded": 9,
170
- "effective_few_shots": 0.0,
171
- "num_truncated_few_shots": 0
172
- }
173
- },
174
- "summary_general": {
175
- "hashes": {
176
- "hash_examples": "623505a45a4910c2",
177
- "hash_full_prompts": "0ee7c8ef786b9aa3",
178
- "hash_input_tokens": "e7650dee68c62549",
179
- "hash_cont_tokens": "8aa3f2652e7ee4e4"
180
- },
181
- "truncated": 120,
182
- "non_truncated": 0,
183
- "padded": 86,
184
- "non_padded": 34,
185
- "num_truncated_few_shots": 0
186
- }
187
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/data/deepseek-math-7b-kto-v01.00/checkpoint-500/main/aimo_kaggle_medium_pot/results_2024-05-30T15-35-30.139225.json DELETED
@@ -1,187 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 1507691.053685872,
9
- "end_time": 1508013.679683445,
10
- "total_evaluation_time_secondes": "322.6259975731373",
11
- "model_name": "data_deepseek-math-7b-kto-v01.00_checkpoint-500",
12
- "model_sha": "",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "12.93 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium_pot:v0|0": {
19
- "score": 0.025,
20
- "score_stderr": 0.024999999999999998
21
- },
22
- "custom|aimo_kaggle_medium_pot:v1|0": {
23
- "score": 0.025,
24
- "score_stderr": 0.02499999999999999
25
- },
26
- "custom|aimo_kaggle_medium_pot:v2|0": {
27
- "score": 0.225,
28
- "score_stderr": 0.06686668711812967
29
- },
30
- "custom|aimo_kaggle_medium_pot:_average|0": {
31
- "score": 0.09166666666666667,
32
- "score_stderr": 0.03895556237270988
33
- },
34
- "all": {
35
- "score": 0.09166666666666667,
36
- "score_stderr": 0.03895556237270988
37
- }
38
- },
39
- "versions": {
40
- "custom|aimo_kaggle_medium_pot:v0|0": 0,
41
- "custom|aimo_kaggle_medium_pot:v1|0": 0,
42
- "custom|aimo_kaggle_medium_pot:v2|0": 0
43
- },
44
- "config_tasks": {
45
- "custom|aimo_kaggle_medium_pot:v0": {
46
- "name": "aimo_kaggle_medium_pot:v0",
47
- "prompt_function": "kaggle_medium_pot_prompt_fn_v0",
48
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
49
- "hf_subset": "v0",
50
- "metric": [
51
- "quasi_exact_match_code_and_math"
52
- ],
53
- "hf_avail_splits": [
54
- "train"
55
- ],
56
- "evaluation_splits": [
57
- "train"
58
- ],
59
- "few_shots_split": null,
60
- "few_shots_select": null,
61
- "generation_size": 2048,
62
- "stop_sequence": null,
63
- "output_regex": null,
64
- "frozen": false,
65
- "suite": [
66
- "custom"
67
- ],
68
- "original_num_docs": 40,
69
- "effective_num_docs": 40,
70
- "trust_dataset": null,
71
- "must_remove_duplicate_docs": null
72
- },
73
- "custom|aimo_kaggle_medium_pot:v1": {
74
- "name": "aimo_kaggle_medium_pot:v1",
75
- "prompt_function": "kaggle_medium_pot_prompt_fn_v1",
76
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
77
- "hf_subset": "v0",
78
- "metric": [
79
- "quasi_exact_match_code_and_math"
80
- ],
81
- "hf_avail_splits": [
82
- "train"
83
- ],
84
- "evaluation_splits": [
85
- "train"
86
- ],
87
- "few_shots_split": null,
88
- "few_shots_select": null,
89
- "generation_size": 2048,
90
- "stop_sequence": null,
91
- "output_regex": null,
92
- "frozen": false,
93
- "suite": [
94
- "custom"
95
- ],
96
- "original_num_docs": 40,
97
- "effective_num_docs": 40,
98
- "trust_dataset": null,
99
- "must_remove_duplicate_docs": null
100
- },
101
- "custom|aimo_kaggle_medium_pot:v2": {
102
- "name": "aimo_kaggle_medium_pot:v2",
103
- "prompt_function": "kaggle_medium_pot_prompt_fn_v2",
104
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
105
- "hf_subset": "v0",
106
- "metric": [
107
- "quasi_exact_match_code_and_math"
108
- ],
109
- "hf_avail_splits": [
110
- "train"
111
- ],
112
- "evaluation_splits": [
113
- "train"
114
- ],
115
- "few_shots_split": null,
116
- "few_shots_select": null,
117
- "generation_size": 2048,
118
- "stop_sequence": null,
119
- "output_regex": null,
120
- "frozen": false,
121
- "suite": [
122
- "custom"
123
- ],
124
- "original_num_docs": 40,
125
- "effective_num_docs": 40,
126
- "trust_dataset": null,
127
- "must_remove_duplicate_docs": null
128
- }
129
- },
130
- "summary_tasks": {
131
- "custom|aimo_kaggle_medium_pot:v0|0": {
132
- "hashes": {
133
- "hash_examples": "2799c24461029dc3",
134
- "hash_full_prompts": "2af864fbfc2e0a79",
135
- "hash_input_tokens": "324d7ae112f05205",
136
- "hash_cont_tokens": "c722f0ca923e4e78"
137
- },
138
- "truncated": 40,
139
- "non_truncated": 0,
140
- "padded": 29,
141
- "non_padded": 11,
142
- "effective_few_shots": 0.0,
143
- "num_truncated_few_shots": 0
144
- },
145
- "custom|aimo_kaggle_medium_pot:v1|0": {
146
- "hashes": {
147
- "hash_examples": "806b2e2056b41f84",
148
- "hash_full_prompts": "8123a0d96a6ceb9d",
149
- "hash_input_tokens": "b1f2be384f5fe5f1",
150
- "hash_cont_tokens": "95497768ab3d9728"
151
- },
152
- "truncated": 40,
153
- "non_truncated": 0,
154
- "padded": 26,
155
- "non_padded": 14,
156
- "effective_few_shots": 0.0,
157
- "num_truncated_few_shots": 0
158
- },
159
- "custom|aimo_kaggle_medium_pot:v2|0": {
160
- "hashes": {
161
- "hash_examples": "d8534375acc5d427",
162
- "hash_full_prompts": "71ba7c8172fec45a",
163
- "hash_input_tokens": "3e80be021360103e",
164
- "hash_cont_tokens": "45304647229c8ee3"
165
- },
166
- "truncated": 40,
167
- "non_truncated": 0,
168
- "padded": 31,
169
- "non_padded": 9,
170
- "effective_few_shots": 0.0,
171
- "num_truncated_few_shots": 0
172
- }
173
- },
174
- "summary_general": {
175
- "hashes": {
176
- "hash_examples": "623505a45a4910c2",
177
- "hash_full_prompts": "0ee7c8ef786b9aa3",
178
- "hash_input_tokens": "e7650dee68c62549",
179
- "hash_cont_tokens": "8aa3f2652e7ee4e4"
180
- },
181
- "truncated": 120,
182
- "non_truncated": 0,
183
- "padded": 86,
184
- "non_padded": 34,
185
- "num_truncated_few_shots": 0
186
- }
187
- }