|
{ |
|
"best_metric": 1.0909315347671509, |
|
"best_model_checkpoint": "outputs/checkpoint-602", |
|
"epoch": 14.898785425101215, |
|
"eval_steps": 500, |
|
"global_step": 690, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.4318488529014845, |
|
"grad_norm": 0.9229329228401184, |
|
"learning_rate": 6e-06, |
|
"loss": 2.3402, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.863697705802969, |
|
"grad_norm": 0.6567079424858093, |
|
"learning_rate": 1.2e-05, |
|
"loss": 2.2949, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.9932523616734144, |
|
"eval_loss": 2.04097318649292, |
|
"eval_runtime": 12.0995, |
|
"eval_samples_per_second": 30.745, |
|
"eval_steps_per_second": 3.884, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.2955465587044535, |
|
"grad_norm": 0.5602224469184875, |
|
"learning_rate": 1.8e-05, |
|
"loss": 2.1934, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.7273954116059378, |
|
"grad_norm": 0.5213916301727295, |
|
"learning_rate": 2.4e-05, |
|
"loss": 2.0095, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.9865047233468287, |
|
"eval_loss": 1.7542632818222046, |
|
"eval_runtime": 12.0873, |
|
"eval_samples_per_second": 30.776, |
|
"eval_steps_per_second": 3.888, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.1592442645074224, |
|
"grad_norm": 0.6186114549636841, |
|
"learning_rate": 3e-05, |
|
"loss": 1.9523, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.591093117408907, |
|
"grad_norm": 0.5964342951774597, |
|
"learning_rate": 2.9915022003152058e-05, |
|
"loss": 1.8469, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.979757085020243, |
|
"eval_loss": 1.581282138824463, |
|
"eval_runtime": 12.0784, |
|
"eval_samples_per_second": 30.799, |
|
"eval_steps_per_second": 3.891, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 3.0229419703103915, |
|
"grad_norm": 0.9454211592674255, |
|
"learning_rate": 2.9661050847268002e-05, |
|
"loss": 1.8366, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 3.454790823211876, |
|
"grad_norm": 0.9509153962135315, |
|
"learning_rate": 2.924096412702572e-05, |
|
"loss": 1.7299, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.8866396761133606, |
|
"grad_norm": 0.9890360832214355, |
|
"learning_rate": 2.8659521592823702e-05, |
|
"loss": 1.6954, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.9946018893387314, |
|
"eval_loss": 1.4414902925491333, |
|
"eval_runtime": 12.0827, |
|
"eval_samples_per_second": 30.788, |
|
"eval_steps_per_second": 3.89, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 4.318488529014845, |
|
"grad_norm": 1.2653647661209106, |
|
"learning_rate": 2.792331122090709e-05, |
|
"loss": 1.6064, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.75033738191633, |
|
"grad_norm": 1.5335606336593628, |
|
"learning_rate": 2.7040674568964454e-05, |
|
"loss": 1.5995, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.987854251012146, |
|
"eval_loss": 1.3295938968658447, |
|
"eval_runtime": 12.0736, |
|
"eval_samples_per_second": 30.811, |
|
"eval_steps_per_second": 3.893, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 5.182186234817814, |
|
"grad_norm": 1.3195627927780151, |
|
"learning_rate": 2.6021612262946008e-05, |
|
"loss": 1.5563, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 5.614035087719298, |
|
"grad_norm": 1.599996566772461, |
|
"learning_rate": 2.487767068597558e-05, |
|
"loss": 1.4811, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 5.98110661268556, |
|
"eval_loss": 1.2465661764144897, |
|
"eval_runtime": 12.0805, |
|
"eval_samples_per_second": 30.793, |
|
"eval_steps_per_second": 3.891, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 6.045883940620783, |
|
"grad_norm": 1.6401485204696655, |
|
"learning_rate": 2.3621811153216105e-05, |
|
"loss": 1.4402, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 6.477732793522267, |
|
"grad_norm": 1.6716389656066895, |
|
"learning_rate": 2.2268263054989755e-05, |
|
"loss": 1.3571, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 6.909581646423752, |
|
"grad_norm": 1.7910152673721313, |
|
"learning_rate": 2.0832362632099814e-05, |
|
"loss": 1.4024, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 6.995951417004049, |
|
"eval_loss": 1.1861686706542969, |
|
"eval_runtime": 12.072, |
|
"eval_samples_per_second": 30.815, |
|
"eval_steps_per_second": 3.893, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 7.341430499325236, |
|
"grad_norm": 2.179117441177368, |
|
"learning_rate": 1.9330379210094315e-05, |
|
"loss": 1.3156, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 7.77327935222672, |
|
"grad_norm": 1.932990550994873, |
|
"learning_rate": 1.7779330861306716e-05, |
|
"loss": 1.3311, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 7.989203778677463, |
|
"eval_loss": 1.1523733139038086, |
|
"eval_runtime": 12.0776, |
|
"eval_samples_per_second": 30.801, |
|
"eval_steps_per_second": 3.892, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 8.205128205128204, |
|
"grad_norm": 2.1301991939544678, |
|
"learning_rate": 1.6196791583296248e-05, |
|
"loss": 1.2848, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 8.63697705802969, |
|
"grad_norm": 2.5514206886291504, |
|
"learning_rate": 1.460069217843338e-05, |
|
"loss": 1.2557, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 8.982456140350877, |
|
"eval_loss": 1.1161961555480957, |
|
"eval_runtime": 12.0718, |
|
"eval_samples_per_second": 30.816, |
|
"eval_steps_per_second": 3.893, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 9.068825910931174, |
|
"grad_norm": 1.910114049911499, |
|
"learning_rate": 1.3009117090744171e-05, |
|
"loss": 1.254, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 9.50067476383266, |
|
"grad_norm": 2.328728437423706, |
|
"learning_rate": 1.1440099501933278e-05, |
|
"loss": 1.2017, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 9.932523616734143, |
|
"grad_norm": 2.1410059928894043, |
|
"learning_rate": 9.911417008229545e-06, |
|
"loss": 1.2285, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 9.997300944669366, |
|
"eval_loss": 1.1040430068969727, |
|
"eval_runtime": 12.0718, |
|
"eval_samples_per_second": 30.816, |
|
"eval_steps_per_second": 3.893, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 10.364372469635628, |
|
"grad_norm": 2.478320837020874, |
|
"learning_rate": 8.44039019311717e-06, |
|
"loss": 1.1817, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 10.796221322537113, |
|
"grad_norm": 2.545358180999756, |
|
"learning_rate": 7.043686378203864e-06, |
|
"loss": 1.1507, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 10.99055330634278, |
|
"eval_loss": 1.0961867570877075, |
|
"eval_runtime": 12.0643, |
|
"eval_samples_per_second": 30.835, |
|
"eval_steps_per_second": 3.896, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 11.228070175438596, |
|
"grad_norm": 2.47627854347229, |
|
"learning_rate": 5.7371307758071225e-06, |
|
"loss": 1.207, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 11.65991902834008, |
|
"grad_norm": 2.8737947940826416, |
|
"learning_rate": 4.535527182975231e-06, |
|
"loss": 1.169, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 11.983805668016194, |
|
"eval_loss": 1.094117522239685, |
|
"eval_runtime": 12.0615, |
|
"eval_samples_per_second": 30.842, |
|
"eval_steps_per_second": 3.897, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 12.091767881241566, |
|
"grad_norm": 2.6336514949798584, |
|
"learning_rate": 3.4524902485514042e-06, |
|
"loss": 1.1437, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 12.523616734143049, |
|
"grad_norm": 2.0161590576171875, |
|
"learning_rate": 2.500291213762274e-06, |
|
"loss": 1.1425, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 12.955465587044534, |
|
"grad_norm": 3.3482580184936523, |
|
"learning_rate": 1.6897188741514285e-06, |
|
"loss": 1.1293, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 12.998650472334683, |
|
"eval_loss": 1.0909315347671509, |
|
"eval_runtime": 12.0609, |
|
"eval_samples_per_second": 30.844, |
|
"eval_steps_per_second": 3.897, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 13.387314439946019, |
|
"grad_norm": 2.3085474967956543, |
|
"learning_rate": 1.0299573382149235e-06, |
|
"loss": 1.1141, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 13.819163292847504, |
|
"grad_norm": 2.5478107929229736, |
|
"learning_rate": 5.284819677822611e-07, |
|
"loss": 1.18, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 13.991902834008098, |
|
"eval_loss": 1.0923157930374146, |
|
"eval_runtime": 12.0657, |
|
"eval_samples_per_second": 30.831, |
|
"eval_steps_per_second": 3.895, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 14.251012145748987, |
|
"grad_norm": 2.4361941814422607, |
|
"learning_rate": 1.909746791798317e-07, |
|
"loss": 1.151, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 14.682860998650472, |
|
"grad_norm": 2.4427309036254883, |
|
"learning_rate": 2.1259564848570835e-08, |
|
"loss": 1.1257, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 14.898785425101215, |
|
"eval_loss": 1.0921934843063354, |
|
"eval_runtime": 12.0392, |
|
"eval_samples_per_second": 30.899, |
|
"eval_steps_per_second": 3.904, |
|
"step": 690 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 690, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 15, |
|
"save_steps": 500, |
|
"total_flos": 6.133360120720589e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|