{ "best_metric": 1.0909315347671509, "best_model_checkpoint": "outputs/checkpoint-602", "epoch": 14.898785425101215, "eval_steps": 500, "global_step": 690, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.4318488529014845, "grad_norm": 0.9229329228401184, "learning_rate": 6e-06, "loss": 2.3402, "step": 20 }, { "epoch": 0.863697705802969, "grad_norm": 0.6567079424858093, "learning_rate": 1.2e-05, "loss": 2.2949, "step": 40 }, { "epoch": 0.9932523616734144, "eval_loss": 2.04097318649292, "eval_runtime": 12.0995, "eval_samples_per_second": 30.745, "eval_steps_per_second": 3.884, "step": 46 }, { "epoch": 1.2955465587044535, "grad_norm": 0.5602224469184875, "learning_rate": 1.8e-05, "loss": 2.1934, "step": 60 }, { "epoch": 1.7273954116059378, "grad_norm": 0.5213916301727295, "learning_rate": 2.4e-05, "loss": 2.0095, "step": 80 }, { "epoch": 1.9865047233468287, "eval_loss": 1.7542632818222046, "eval_runtime": 12.0873, "eval_samples_per_second": 30.776, "eval_steps_per_second": 3.888, "step": 92 }, { "epoch": 2.1592442645074224, "grad_norm": 0.6186114549636841, "learning_rate": 3e-05, "loss": 1.9523, "step": 100 }, { "epoch": 2.591093117408907, "grad_norm": 0.5964342951774597, "learning_rate": 2.9915022003152058e-05, "loss": 1.8469, "step": 120 }, { "epoch": 2.979757085020243, "eval_loss": 1.581282138824463, "eval_runtime": 12.0784, "eval_samples_per_second": 30.799, "eval_steps_per_second": 3.891, "step": 138 }, { "epoch": 3.0229419703103915, "grad_norm": 0.9454211592674255, "learning_rate": 2.9661050847268002e-05, "loss": 1.8366, "step": 140 }, { "epoch": 3.454790823211876, "grad_norm": 0.9509153962135315, "learning_rate": 2.924096412702572e-05, "loss": 1.7299, "step": 160 }, { "epoch": 3.8866396761133606, "grad_norm": 0.9890360832214355, "learning_rate": 2.8659521592823702e-05, "loss": 1.6954, "step": 180 }, { "epoch": 3.9946018893387314, "eval_loss": 1.4414902925491333, "eval_runtime": 12.0827, "eval_samples_per_second": 30.788, "eval_steps_per_second": 3.89, "step": 185 }, { "epoch": 4.318488529014845, "grad_norm": 1.2653647661209106, "learning_rate": 2.792331122090709e-05, "loss": 1.6064, "step": 200 }, { "epoch": 4.75033738191633, "grad_norm": 1.5335606336593628, "learning_rate": 2.7040674568964454e-05, "loss": 1.5995, "step": 220 }, { "epoch": 4.987854251012146, "eval_loss": 1.3295938968658447, "eval_runtime": 12.0736, "eval_samples_per_second": 30.811, "eval_steps_per_second": 3.893, "step": 231 }, { "epoch": 5.182186234817814, "grad_norm": 1.3195627927780151, "learning_rate": 2.6021612262946008e-05, "loss": 1.5563, "step": 240 }, { "epoch": 5.614035087719298, "grad_norm": 1.599996566772461, "learning_rate": 2.487767068597558e-05, "loss": 1.4811, "step": 260 }, { "epoch": 5.98110661268556, "eval_loss": 1.2465661764144897, "eval_runtime": 12.0805, "eval_samples_per_second": 30.793, "eval_steps_per_second": 3.891, "step": 277 }, { "epoch": 6.045883940620783, "grad_norm": 1.6401485204696655, "learning_rate": 2.3621811153216105e-05, "loss": 1.4402, "step": 280 }, { "epoch": 6.477732793522267, "grad_norm": 1.6716389656066895, "learning_rate": 2.2268263054989755e-05, "loss": 1.3571, "step": 300 }, { "epoch": 6.909581646423752, "grad_norm": 1.7910152673721313, "learning_rate": 2.0832362632099814e-05, "loss": 1.4024, "step": 320 }, { "epoch": 6.995951417004049, "eval_loss": 1.1861686706542969, "eval_runtime": 12.072, "eval_samples_per_second": 30.815, "eval_steps_per_second": 3.893, "step": 324 }, { "epoch": 7.341430499325236, "grad_norm": 2.179117441177368, "learning_rate": 1.9330379210094315e-05, "loss": 1.3156, "step": 340 }, { "epoch": 7.77327935222672, "grad_norm": 1.932990550994873, "learning_rate": 1.7779330861306716e-05, "loss": 1.3311, "step": 360 }, { "epoch": 7.989203778677463, "eval_loss": 1.1523733139038086, "eval_runtime": 12.0776, "eval_samples_per_second": 30.801, "eval_steps_per_second": 3.892, "step": 370 }, { "epoch": 8.205128205128204, "grad_norm": 2.1301991939544678, "learning_rate": 1.6196791583296248e-05, "loss": 1.2848, "step": 380 }, { "epoch": 8.63697705802969, "grad_norm": 2.5514206886291504, "learning_rate": 1.460069217843338e-05, "loss": 1.2557, "step": 400 }, { "epoch": 8.982456140350877, "eval_loss": 1.1161961555480957, "eval_runtime": 12.0718, "eval_samples_per_second": 30.816, "eval_steps_per_second": 3.893, "step": 416 }, { "epoch": 9.068825910931174, "grad_norm": 1.910114049911499, "learning_rate": 1.3009117090744171e-05, "loss": 1.254, "step": 420 }, { "epoch": 9.50067476383266, "grad_norm": 2.328728437423706, "learning_rate": 1.1440099501933278e-05, "loss": 1.2017, "step": 440 }, { "epoch": 9.932523616734143, "grad_norm": 2.1410059928894043, "learning_rate": 9.911417008229545e-06, "loss": 1.2285, "step": 460 }, { "epoch": 9.997300944669366, "eval_loss": 1.1040430068969727, "eval_runtime": 12.0718, "eval_samples_per_second": 30.816, "eval_steps_per_second": 3.893, "step": 463 }, { "epoch": 10.364372469635628, "grad_norm": 2.478320837020874, "learning_rate": 8.44039019311717e-06, "loss": 1.1817, "step": 480 }, { "epoch": 10.796221322537113, "grad_norm": 2.545358180999756, "learning_rate": 7.043686378203864e-06, "loss": 1.1507, "step": 500 }, { "epoch": 10.99055330634278, "eval_loss": 1.0961867570877075, "eval_runtime": 12.0643, "eval_samples_per_second": 30.835, "eval_steps_per_second": 3.896, "step": 509 }, { "epoch": 11.228070175438596, "grad_norm": 2.47627854347229, "learning_rate": 5.7371307758071225e-06, "loss": 1.207, "step": 520 }, { "epoch": 11.65991902834008, "grad_norm": 2.8737947940826416, "learning_rate": 4.535527182975231e-06, "loss": 1.169, "step": 540 }, { "epoch": 11.983805668016194, "eval_loss": 1.094117522239685, "eval_runtime": 12.0615, "eval_samples_per_second": 30.842, "eval_steps_per_second": 3.897, "step": 555 }, { "epoch": 12.091767881241566, "grad_norm": 2.6336514949798584, "learning_rate": 3.4524902485514042e-06, "loss": 1.1437, "step": 560 }, { "epoch": 12.523616734143049, "grad_norm": 2.0161590576171875, "learning_rate": 2.500291213762274e-06, "loss": 1.1425, "step": 580 }, { "epoch": 12.955465587044534, "grad_norm": 3.3482580184936523, "learning_rate": 1.6897188741514285e-06, "loss": 1.1293, "step": 600 }, { "epoch": 12.998650472334683, "eval_loss": 1.0909315347671509, "eval_runtime": 12.0609, "eval_samples_per_second": 30.844, "eval_steps_per_second": 3.897, "step": 602 }, { "epoch": 13.387314439946019, "grad_norm": 2.3085474967956543, "learning_rate": 1.0299573382149235e-06, "loss": 1.1141, "step": 620 }, { "epoch": 13.819163292847504, "grad_norm": 2.5478107929229736, "learning_rate": 5.284819677822611e-07, "loss": 1.18, "step": 640 }, { "epoch": 13.991902834008098, "eval_loss": 1.0923157930374146, "eval_runtime": 12.0657, "eval_samples_per_second": 30.831, "eval_steps_per_second": 3.895, "step": 648 }, { "epoch": 14.251012145748987, "grad_norm": 2.4361941814422607, "learning_rate": 1.909746791798317e-07, "loss": 1.151, "step": 660 }, { "epoch": 14.682860998650472, "grad_norm": 2.4427309036254883, "learning_rate": 2.1259564848570835e-08, "loss": 1.1257, "step": 680 }, { "epoch": 14.898785425101215, "eval_loss": 1.0921934843063354, "eval_runtime": 12.0392, "eval_samples_per_second": 30.899, "eval_steps_per_second": 3.904, "step": 690 } ], "logging_steps": 20, "max_steps": 690, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, "total_flos": 6.133360120720589e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }