{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6206896551724138, "eval_steps": 1, "global_step": 45, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013793103448275862, "grad_norm": 6.989287853240967, "learning_rate": 2.2727272727272728e-06, "loss": 1.8926, "step": 1 }, { "epoch": 0.013793103448275862, "eval_loss": 2.144650459289551, "eval_runtime": 17.2592, "eval_samples_per_second": 1.159, "eval_steps_per_second": 0.579, "step": 1 }, { "epoch": 0.027586206896551724, "grad_norm": 6.281332492828369, "learning_rate": 4.5454545454545455e-06, "loss": 1.986, "step": 2 }, { "epoch": 0.027586206896551724, "eval_loss": 2.0905685424804688, "eval_runtime": 17.6184, "eval_samples_per_second": 1.135, "eval_steps_per_second": 0.568, "step": 2 }, { "epoch": 0.041379310344827586, "grad_norm": 4.347537040710449, "learning_rate": 6.818181818181818e-06, "loss": 1.9355, "step": 3 }, { "epoch": 0.041379310344827586, "eval_loss": 1.9983774423599243, "eval_runtime": 17.5928, "eval_samples_per_second": 1.137, "eval_steps_per_second": 0.568, "step": 3 }, { "epoch": 0.05517241379310345, "grad_norm": 4.809764385223389, "learning_rate": 9.090909090909091e-06, "loss": 1.7509, "step": 4 }, { "epoch": 0.05517241379310345, "eval_loss": 1.8737837076187134, "eval_runtime": 17.6375, "eval_samples_per_second": 1.134, "eval_steps_per_second": 0.567, "step": 4 }, { "epoch": 0.06896551724137931, "grad_norm": 3.4548702239990234, "learning_rate": 1.1363636363636365e-05, "loss": 1.8838, "step": 5 }, { "epoch": 0.06896551724137931, "eval_loss": 1.7746165990829468, "eval_runtime": 18.0257, "eval_samples_per_second": 1.11, "eval_steps_per_second": 0.555, "step": 5 }, { "epoch": 0.08275862068965517, "grad_norm": 3.1943702697753906, "learning_rate": 1.3636363636363637e-05, "loss": 1.7707, "step": 6 }, { "epoch": 0.08275862068965517, "eval_loss": 1.6792665719985962, "eval_runtime": 17.7498, "eval_samples_per_second": 1.127, "eval_steps_per_second": 0.563, "step": 6 }, { "epoch": 0.09655172413793103, "grad_norm": 3.318288564682007, "learning_rate": 1.590909090909091e-05, "loss": 1.7171, "step": 7 }, { "epoch": 0.09655172413793103, "eval_loss": 1.5874873399734497, "eval_runtime": 17.6295, "eval_samples_per_second": 1.134, "eval_steps_per_second": 0.567, "step": 7 }, { "epoch": 0.1103448275862069, "grad_norm": 3.210330009460449, "learning_rate": 1.8181818181818182e-05, "loss": 1.5734, "step": 8 }, { "epoch": 0.1103448275862069, "eval_loss": 1.535287618637085, "eval_runtime": 17.6232, "eval_samples_per_second": 1.135, "eval_steps_per_second": 0.567, "step": 8 }, { "epoch": 0.12413793103448276, "grad_norm": 3.2319107055664062, "learning_rate": 2.0454545454545457e-05, "loss": 1.7986, "step": 9 }, { "epoch": 0.12413793103448276, "eval_loss": 1.467301607131958, "eval_runtime": 17.5824, "eval_samples_per_second": 1.138, "eval_steps_per_second": 0.569, "step": 9 }, { "epoch": 0.13793103448275862, "grad_norm": 2.79286789894104, "learning_rate": 2.272727272727273e-05, "loss": 1.5025, "step": 10 }, { "epoch": 0.13793103448275862, "eval_loss": 1.3961191177368164, "eval_runtime": 18.3446, "eval_samples_per_second": 1.09, "eval_steps_per_second": 0.545, "step": 10 }, { "epoch": 0.15172413793103448, "grad_norm": 2.885422706604004, "learning_rate": 2.5e-05, "loss": 1.5477, "step": 11 }, { "epoch": 0.15172413793103448, "eval_loss": 1.3420469760894775, "eval_runtime": 17.7683, "eval_samples_per_second": 1.126, "eval_steps_per_second": 0.563, "step": 11 }, { "epoch": 0.16551724137931034, "grad_norm": 2.7671327590942383, "learning_rate": 2.7272727272727273e-05, "loss": 1.6921, "step": 12 }, { "epoch": 0.16551724137931034, "eval_loss": 1.3071445226669312, "eval_runtime": 17.652, "eval_samples_per_second": 1.133, "eval_steps_per_second": 0.567, "step": 12 }, { "epoch": 0.1793103448275862, "grad_norm": 2.9047963619232178, "learning_rate": 2.954545454545455e-05, "loss": 1.5365, "step": 13 }, { "epoch": 0.1793103448275862, "eval_loss": 1.2601890563964844, "eval_runtime": 17.5232, "eval_samples_per_second": 1.141, "eval_steps_per_second": 0.571, "step": 13 }, { "epoch": 0.19310344827586207, "grad_norm": 2.6054675579071045, "learning_rate": 3.181818181818182e-05, "loss": 1.6621, "step": 14 }, { "epoch": 0.19310344827586207, "eval_loss": 1.2506535053253174, "eval_runtime": 17.6295, "eval_samples_per_second": 1.134, "eval_steps_per_second": 0.567, "step": 14 }, { "epoch": 0.20689655172413793, "grad_norm": 2.538036823272705, "learning_rate": 3.409090909090909e-05, "loss": 1.6763, "step": 15 }, { "epoch": 0.20689655172413793, "eval_loss": 1.2366451025009155, "eval_runtime": 18.6172, "eval_samples_per_second": 1.074, "eval_steps_per_second": 0.537, "step": 15 }, { "epoch": 0.2206896551724138, "grad_norm": 2.5125789642333984, "learning_rate": 3.6363636363636364e-05, "loss": 1.668, "step": 16 }, { "epoch": 0.2206896551724138, "eval_loss": 1.2205184698104858, "eval_runtime": 17.7529, "eval_samples_per_second": 1.127, "eval_steps_per_second": 0.563, "step": 16 }, { "epoch": 0.23448275862068965, "grad_norm": 5.055665969848633, "learning_rate": 3.8636363636363636e-05, "loss": 1.5703, "step": 17 }, { "epoch": 0.23448275862068965, "eval_loss": 1.167407751083374, "eval_runtime": 17.5902, "eval_samples_per_second": 1.137, "eval_steps_per_second": 0.568, "step": 17 }, { "epoch": 0.2482758620689655, "grad_norm": 2.567411422729492, "learning_rate": 4.0909090909090915e-05, "loss": 1.4859, "step": 18 }, { "epoch": 0.2482758620689655, "eval_loss": 1.1367636919021606, "eval_runtime": 17.4832, "eval_samples_per_second": 1.144, "eval_steps_per_second": 0.572, "step": 18 }, { "epoch": 0.2620689655172414, "grad_norm": 2.3214948177337646, "learning_rate": 4.318181818181819e-05, "loss": 1.4511, "step": 19 }, { "epoch": 0.2620689655172414, "eval_loss": 1.1296402215957642, "eval_runtime": 17.6655, "eval_samples_per_second": 1.132, "eval_steps_per_second": 0.566, "step": 19 }, { "epoch": 0.27586206896551724, "grad_norm": 2.390448570251465, "learning_rate": 4.545454545454546e-05, "loss": 1.7181, "step": 20 }, { "epoch": 0.27586206896551724, "eval_loss": 1.126497507095337, "eval_runtime": 17.9736, "eval_samples_per_second": 1.113, "eval_steps_per_second": 0.556, "step": 20 }, { "epoch": 0.2896551724137931, "grad_norm": 2.3728342056274414, "learning_rate": 4.772727272727273e-05, "loss": 1.4155, "step": 21 }, { "epoch": 0.2896551724137931, "eval_loss": 1.09345281124115, "eval_runtime": 17.8283, "eval_samples_per_second": 1.122, "eval_steps_per_second": 0.561, "step": 21 }, { "epoch": 0.30344827586206896, "grad_norm": 2.4872097969055176, "learning_rate": 5e-05, "loss": 1.3752, "step": 22 }, { "epoch": 0.30344827586206896, "eval_loss": 1.0705276727676392, "eval_runtime": 17.6481, "eval_samples_per_second": 1.133, "eval_steps_per_second": 0.567, "step": 22 }, { "epoch": 0.31724137931034485, "grad_norm": 2.953234910964966, "learning_rate": 4.999672209164081e-05, "loss": 1.4449, "step": 23 }, { "epoch": 0.31724137931034485, "eval_loss": 1.0468412637710571, "eval_runtime": 17.6053, "eval_samples_per_second": 1.136, "eval_steps_per_second": 0.568, "step": 23 }, { "epoch": 0.3310344827586207, "grad_norm": 2.47603702545166, "learning_rate": 4.998688922613788e-05, "loss": 1.4286, "step": 24 }, { "epoch": 0.3310344827586207, "eval_loss": 1.0292497873306274, "eval_runtime": 17.5777, "eval_samples_per_second": 1.138, "eval_steps_per_second": 0.569, "step": 24 }, { "epoch": 0.3448275862068966, "grad_norm": 2.2879106998443604, "learning_rate": 4.997050398198977e-05, "loss": 1.5164, "step": 25 }, { "epoch": 0.3448275862068966, "eval_loss": 1.0196115970611572, "eval_runtime": 17.9439, "eval_samples_per_second": 1.115, "eval_steps_per_second": 0.557, "step": 25 }, { "epoch": 0.3586206896551724, "grad_norm": 2.319134473800659, "learning_rate": 4.9947570655942796e-05, "loss": 1.5282, "step": 26 }, { "epoch": 0.3586206896551724, "eval_loss": 1.013381004333496, "eval_runtime": 17.7628, "eval_samples_per_second": 1.126, "eval_steps_per_second": 0.563, "step": 26 }, { "epoch": 0.3724137931034483, "grad_norm": 2.259608745574951, "learning_rate": 4.991809526186424e-05, "loss": 1.4901, "step": 27 }, { "epoch": 0.3724137931034483, "eval_loss": 1.0091207027435303, "eval_runtime": 17.619, "eval_samples_per_second": 1.135, "eval_steps_per_second": 0.568, "step": 27 }, { "epoch": 0.38620689655172413, "grad_norm": 2.2252631187438965, "learning_rate": 4.988208552916535e-05, "loss": 1.5518, "step": 28 }, { "epoch": 0.38620689655172413, "eval_loss": 1.0063353776931763, "eval_runtime": 17.4778, "eval_samples_per_second": 1.144, "eval_steps_per_second": 0.572, "step": 28 }, { "epoch": 0.4, "grad_norm": 2.2154901027679443, "learning_rate": 4.983955090077444e-05, "loss": 1.4682, "step": 29 }, { "epoch": 0.4, "eval_loss": 0.99261075258255, "eval_runtime": 17.4894, "eval_samples_per_second": 1.144, "eval_steps_per_second": 0.572, "step": 29 }, { "epoch": 0.41379310344827586, "grad_norm": 2.320786237716675, "learning_rate": 4.9790502530660635e-05, "loss": 1.4691, "step": 30 }, { "epoch": 0.41379310344827586, "eval_loss": 0.9836109280586243, "eval_runtime": 16.9043, "eval_samples_per_second": 1.183, "eval_steps_per_second": 0.592, "step": 30 }, { "epoch": 0.42758620689655175, "grad_norm": 2.1385531425476074, "learning_rate": 4.9734953280908904e-05, "loss": 1.4696, "step": 31 }, { "epoch": 0.42758620689655175, "eval_loss": 0.976610541343689, "eval_runtime": 17.3486, "eval_samples_per_second": 1.153, "eval_steps_per_second": 0.576, "step": 31 }, { "epoch": 0.4413793103448276, "grad_norm": 2.2254769802093506, "learning_rate": 4.967291771834727e-05, "loss": 1.531, "step": 32 }, { "epoch": 0.4413793103448276, "eval_loss": 0.9718761444091797, "eval_runtime": 17.5285, "eval_samples_per_second": 1.141, "eval_steps_per_second": 0.57, "step": 32 }, { "epoch": 0.45517241379310347, "grad_norm": 2.34843373298645, "learning_rate": 4.960441211072686e-05, "loss": 1.5484, "step": 33 }, { "epoch": 0.45517241379310347, "eval_loss": 0.9682589769363403, "eval_runtime": 17.5952, "eval_samples_per_second": 1.137, "eval_steps_per_second": 0.568, "step": 33 }, { "epoch": 0.4689655172413793, "grad_norm": 1.9610539674758911, "learning_rate": 4.9529454422455976e-05, "loss": 1.3204, "step": 34 }, { "epoch": 0.4689655172413793, "eval_loss": 0.9610344767570496, "eval_runtime": 17.6076, "eval_samples_per_second": 1.136, "eval_steps_per_second": 0.568, "step": 34 }, { "epoch": 0.4827586206896552, "grad_norm": 2.2027809619903564, "learning_rate": 4.944806430988927e-05, "loss": 1.3801, "step": 35 }, { "epoch": 0.4827586206896552, "eval_loss": 0.9546059370040894, "eval_runtime": 17.5811, "eval_samples_per_second": 1.138, "eval_steps_per_second": 0.569, "step": 35 }, { "epoch": 0.496551724137931, "grad_norm": 2.3457250595092773, "learning_rate": 4.936026311617316e-05, "loss": 1.4401, "step": 36 }, { "epoch": 0.496551724137931, "eval_loss": 0.9482511281967163, "eval_runtime": 17.8351, "eval_samples_per_second": 1.121, "eval_steps_per_second": 0.561, "step": 36 }, { "epoch": 0.5103448275862069, "grad_norm": 2.161039352416992, "learning_rate": 4.926607386564898e-05, "loss": 1.4067, "step": 37 }, { "epoch": 0.5103448275862069, "eval_loss": 0.9448164701461792, "eval_runtime": 17.6014, "eval_samples_per_second": 1.136, "eval_steps_per_second": 0.568, "step": 37 }, { "epoch": 0.5241379310344828, "grad_norm": 2.1683900356292725, "learning_rate": 4.916552125781528e-05, "loss": 1.3806, "step": 38 }, { "epoch": 0.5241379310344828, "eval_loss": 0.9402996897697449, "eval_runtime": 17.6524, "eval_samples_per_second": 1.133, "eval_steps_per_second": 0.566, "step": 38 }, { "epoch": 0.5379310344827586, "grad_norm": 2.2735962867736816, "learning_rate": 4.9058631660850765e-05, "loss": 1.4937, "step": 39 }, { "epoch": 0.5379310344827586, "eval_loss": 0.9291872978210449, "eval_runtime": 17.5838, "eval_samples_per_second": 1.137, "eval_steps_per_second": 0.569, "step": 39 }, { "epoch": 0.5655172413793104, "grad_norm": 2.2170450687408447, "learning_rate": 2.2727272727272728e-06, "loss": 1.316, "step": 41 }, { "epoch": 0.5655172413793104, "eval_loss": 0.9163956642150879, "eval_runtime": 15.7145, "eval_samples_per_second": 1.273, "eval_steps_per_second": 0.636, "step": 41 }, { "epoch": 0.5793103448275863, "grad_norm": 2.2266974449157715, "learning_rate": 4.5454545454545455e-06, "loss": 1.3854, "step": 42 }, { "epoch": 0.5793103448275863, "eval_loss": 0.9137259721755981, "eval_runtime": 15.7133, "eval_samples_per_second": 1.273, "eval_steps_per_second": 0.636, "step": 42 }, { "epoch": 0.593103448275862, "grad_norm": 2.3451268672943115, "learning_rate": 6.818181818181818e-06, "loss": 1.4208, "step": 43 }, { "epoch": 0.593103448275862, "eval_loss": 0.9096618890762329, "eval_runtime": 15.7895, "eval_samples_per_second": 1.267, "eval_steps_per_second": 0.633, "step": 43 }, { "epoch": 0.6068965517241379, "grad_norm": 2.0125885009765625, "learning_rate": 9.090909090909091e-06, "loss": 1.4302, "step": 44 }, { "epoch": 0.6068965517241379, "eval_loss": 0.9058458209037781, "eval_runtime": 15.6899, "eval_samples_per_second": 1.275, "eval_steps_per_second": 0.637, "step": 44 } ], "logging_steps": 1, "max_steps": 216, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.643858048835584e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }