{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8275862068965517, "eval_steps": 1, "global_step": 60, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013793103448275862, "grad_norm": 6.989287853240967, "learning_rate": 2.2727272727272728e-06, "loss": 1.8926, "step": 1 }, { "epoch": 0.013793103448275862, "eval_loss": 2.144650459289551, "eval_runtime": 17.2592, "eval_samples_per_second": 1.159, "eval_steps_per_second": 0.579, "step": 1 }, { "epoch": 0.027586206896551724, "grad_norm": 6.281332492828369, "learning_rate": 4.5454545454545455e-06, "loss": 1.986, "step": 2 }, { "epoch": 0.027586206896551724, "eval_loss": 2.0905685424804688, "eval_runtime": 17.6184, "eval_samples_per_second": 1.135, "eval_steps_per_second": 0.568, "step": 2 }, { "epoch": 0.041379310344827586, "grad_norm": 4.347537040710449, "learning_rate": 6.818181818181818e-06, "loss": 1.9355, "step": 3 }, { "epoch": 0.041379310344827586, "eval_loss": 1.9983774423599243, "eval_runtime": 17.5928, "eval_samples_per_second": 1.137, "eval_steps_per_second": 0.568, "step": 3 }, { "epoch": 0.05517241379310345, "grad_norm": 4.809764385223389, "learning_rate": 9.090909090909091e-06, "loss": 1.7509, "step": 4 }, { "epoch": 0.05517241379310345, "eval_loss": 1.8737837076187134, "eval_runtime": 17.6375, "eval_samples_per_second": 1.134, "eval_steps_per_second": 0.567, "step": 4 }, { "epoch": 0.06896551724137931, "grad_norm": 3.4548702239990234, "learning_rate": 1.1363636363636365e-05, "loss": 1.8838, "step": 5 }, { "epoch": 0.06896551724137931, "eval_loss": 1.7746165990829468, "eval_runtime": 18.0257, "eval_samples_per_second": 1.11, "eval_steps_per_second": 0.555, "step": 5 }, { "epoch": 0.08275862068965517, "grad_norm": 3.1943702697753906, "learning_rate": 1.3636363636363637e-05, "loss": 1.7707, "step": 6 }, { "epoch": 0.08275862068965517, "eval_loss": 1.6792665719985962, "eval_runtime": 17.7498, "eval_samples_per_second": 1.127, "eval_steps_per_second": 0.563, "step": 6 }, { "epoch": 0.09655172413793103, "grad_norm": 3.318288564682007, "learning_rate": 1.590909090909091e-05, "loss": 1.7171, "step": 7 }, { "epoch": 0.09655172413793103, "eval_loss": 1.5874873399734497, "eval_runtime": 17.6295, "eval_samples_per_second": 1.134, "eval_steps_per_second": 0.567, "step": 7 }, { "epoch": 0.1103448275862069, "grad_norm": 3.210330009460449, "learning_rate": 1.8181818181818182e-05, "loss": 1.5734, "step": 8 }, { "epoch": 0.1103448275862069, "eval_loss": 1.535287618637085, "eval_runtime": 17.6232, "eval_samples_per_second": 1.135, "eval_steps_per_second": 0.567, "step": 8 }, { "epoch": 0.12413793103448276, "grad_norm": 3.2319107055664062, "learning_rate": 2.0454545454545457e-05, "loss": 1.7986, "step": 9 }, { "epoch": 0.12413793103448276, "eval_loss": 1.467301607131958, "eval_runtime": 17.5824, "eval_samples_per_second": 1.138, "eval_steps_per_second": 0.569, "step": 9 }, { "epoch": 0.13793103448275862, "grad_norm": 2.79286789894104, "learning_rate": 2.272727272727273e-05, "loss": 1.5025, "step": 10 }, { "epoch": 0.13793103448275862, "eval_loss": 1.3961191177368164, "eval_runtime": 18.3446, "eval_samples_per_second": 1.09, "eval_steps_per_second": 0.545, "step": 10 }, { "epoch": 0.15172413793103448, "grad_norm": 2.885422706604004, "learning_rate": 2.5e-05, "loss": 1.5477, "step": 11 }, { "epoch": 0.15172413793103448, "eval_loss": 1.3420469760894775, "eval_runtime": 17.7683, "eval_samples_per_second": 1.126, "eval_steps_per_second": 0.563, "step": 11 }, { "epoch": 0.16551724137931034, "grad_norm": 2.7671327590942383, "learning_rate": 2.7272727272727273e-05, "loss": 1.6921, "step": 12 }, { "epoch": 0.16551724137931034, "eval_loss": 1.3071445226669312, "eval_runtime": 17.652, "eval_samples_per_second": 1.133, "eval_steps_per_second": 0.567, "step": 12 }, { "epoch": 0.1793103448275862, "grad_norm": 2.9047963619232178, "learning_rate": 2.954545454545455e-05, "loss": 1.5365, "step": 13 }, { "epoch": 0.1793103448275862, "eval_loss": 1.2601890563964844, "eval_runtime": 17.5232, "eval_samples_per_second": 1.141, "eval_steps_per_second": 0.571, "step": 13 }, { "epoch": 0.19310344827586207, "grad_norm": 2.6054675579071045, "learning_rate": 3.181818181818182e-05, "loss": 1.6621, "step": 14 }, { "epoch": 0.19310344827586207, "eval_loss": 1.2506535053253174, "eval_runtime": 17.6295, "eval_samples_per_second": 1.134, "eval_steps_per_second": 0.567, "step": 14 }, { "epoch": 0.20689655172413793, "grad_norm": 2.538036823272705, "learning_rate": 3.409090909090909e-05, "loss": 1.6763, "step": 15 }, { "epoch": 0.20689655172413793, "eval_loss": 1.2366451025009155, "eval_runtime": 18.6172, "eval_samples_per_second": 1.074, "eval_steps_per_second": 0.537, "step": 15 }, { "epoch": 0.2206896551724138, "grad_norm": 2.5125789642333984, "learning_rate": 3.6363636363636364e-05, "loss": 1.668, "step": 16 }, { "epoch": 0.2206896551724138, "eval_loss": 1.2205184698104858, "eval_runtime": 17.7529, "eval_samples_per_second": 1.127, "eval_steps_per_second": 0.563, "step": 16 }, { "epoch": 0.23448275862068965, "grad_norm": 5.055665969848633, "learning_rate": 3.8636363636363636e-05, "loss": 1.5703, "step": 17 }, { "epoch": 0.23448275862068965, "eval_loss": 1.167407751083374, "eval_runtime": 17.5902, "eval_samples_per_second": 1.137, "eval_steps_per_second": 0.568, "step": 17 }, { "epoch": 0.2482758620689655, "grad_norm": 2.567411422729492, "learning_rate": 4.0909090909090915e-05, "loss": 1.4859, "step": 18 }, { "epoch": 0.2482758620689655, "eval_loss": 1.1367636919021606, "eval_runtime": 17.4832, "eval_samples_per_second": 1.144, "eval_steps_per_second": 0.572, "step": 18 }, { "epoch": 0.2620689655172414, "grad_norm": 2.3214948177337646, "learning_rate": 4.318181818181819e-05, "loss": 1.4511, "step": 19 }, { "epoch": 0.2620689655172414, "eval_loss": 1.1296402215957642, "eval_runtime": 17.6655, "eval_samples_per_second": 1.132, "eval_steps_per_second": 0.566, "step": 19 }, { "epoch": 0.27586206896551724, "grad_norm": 2.390448570251465, "learning_rate": 4.545454545454546e-05, "loss": 1.7181, "step": 20 }, { "epoch": 0.27586206896551724, "eval_loss": 1.126497507095337, "eval_runtime": 17.9736, "eval_samples_per_second": 1.113, "eval_steps_per_second": 0.556, "step": 20 }, { "epoch": 0.2896551724137931, "grad_norm": 2.3728342056274414, "learning_rate": 4.772727272727273e-05, "loss": 1.4155, "step": 21 }, { "epoch": 0.2896551724137931, "eval_loss": 1.09345281124115, "eval_runtime": 17.8283, "eval_samples_per_second": 1.122, "eval_steps_per_second": 0.561, "step": 21 }, { "epoch": 0.30344827586206896, "grad_norm": 2.4872097969055176, "learning_rate": 5e-05, "loss": 1.3752, "step": 22 }, { "epoch": 0.30344827586206896, "eval_loss": 1.0705276727676392, "eval_runtime": 17.6481, "eval_samples_per_second": 1.133, "eval_steps_per_second": 0.567, "step": 22 }, { "epoch": 0.31724137931034485, "grad_norm": 2.953234910964966, "learning_rate": 4.999672209164081e-05, "loss": 1.4449, "step": 23 }, { "epoch": 0.31724137931034485, "eval_loss": 1.0468412637710571, "eval_runtime": 17.6053, "eval_samples_per_second": 1.136, "eval_steps_per_second": 0.568, "step": 23 }, { "epoch": 0.3310344827586207, "grad_norm": 2.47603702545166, "learning_rate": 4.998688922613788e-05, "loss": 1.4286, "step": 24 }, { "epoch": 0.3310344827586207, "eval_loss": 1.0292497873306274, "eval_runtime": 17.5777, "eval_samples_per_second": 1.138, "eval_steps_per_second": 0.569, "step": 24 }, { "epoch": 0.3448275862068966, "grad_norm": 2.2879106998443604, "learning_rate": 4.997050398198977e-05, "loss": 1.5164, "step": 25 }, { "epoch": 0.3448275862068966, "eval_loss": 1.0196115970611572, "eval_runtime": 17.9439, "eval_samples_per_second": 1.115, "eval_steps_per_second": 0.557, "step": 25 }, { "epoch": 0.3586206896551724, "grad_norm": 2.319134473800659, "learning_rate": 4.9947570655942796e-05, "loss": 1.5282, "step": 26 }, { "epoch": 0.3586206896551724, "eval_loss": 1.013381004333496, "eval_runtime": 17.7628, "eval_samples_per_second": 1.126, "eval_steps_per_second": 0.563, "step": 26 }, { "epoch": 0.3724137931034483, "grad_norm": 2.259608745574951, "learning_rate": 4.991809526186424e-05, "loss": 1.4901, "step": 27 }, { "epoch": 0.3724137931034483, "eval_loss": 1.0091207027435303, "eval_runtime": 17.619, "eval_samples_per_second": 1.135, "eval_steps_per_second": 0.568, "step": 27 }, { "epoch": 0.38620689655172413, "grad_norm": 2.2252631187438965, "learning_rate": 4.988208552916535e-05, "loss": 1.5518, "step": 28 }, { "epoch": 0.38620689655172413, "eval_loss": 1.0063353776931763, "eval_runtime": 17.4778, "eval_samples_per_second": 1.144, "eval_steps_per_second": 0.572, "step": 28 }, { "epoch": 0.4, "grad_norm": 2.2154901027679443, "learning_rate": 4.983955090077444e-05, "loss": 1.4682, "step": 29 }, { "epoch": 0.4, "eval_loss": 0.99261075258255, "eval_runtime": 17.4894, "eval_samples_per_second": 1.144, "eval_steps_per_second": 0.572, "step": 29 }, { "epoch": 0.41379310344827586, "grad_norm": 2.320786237716675, "learning_rate": 4.9790502530660635e-05, "loss": 1.4691, "step": 30 }, { "epoch": 0.41379310344827586, "eval_loss": 0.9836109280586243, "eval_runtime": 16.9043, "eval_samples_per_second": 1.183, "eval_steps_per_second": 0.592, "step": 30 }, { "epoch": 0.42758620689655175, "grad_norm": 2.1385531425476074, "learning_rate": 4.9734953280908904e-05, "loss": 1.4696, "step": 31 }, { "epoch": 0.42758620689655175, "eval_loss": 0.976610541343689, "eval_runtime": 17.3486, "eval_samples_per_second": 1.153, "eval_steps_per_second": 0.576, "step": 31 }, { "epoch": 0.4413793103448276, "grad_norm": 2.2254769802093506, "learning_rate": 4.967291771834727e-05, "loss": 1.531, "step": 32 }, { "epoch": 0.4413793103448276, "eval_loss": 0.9718761444091797, "eval_runtime": 17.5285, "eval_samples_per_second": 1.141, "eval_steps_per_second": 0.57, "step": 32 }, { "epoch": 0.45517241379310347, "grad_norm": 2.34843373298645, "learning_rate": 4.960441211072686e-05, "loss": 1.5484, "step": 33 }, { "epoch": 0.45517241379310347, "eval_loss": 0.9682589769363403, "eval_runtime": 17.5952, "eval_samples_per_second": 1.137, "eval_steps_per_second": 0.568, "step": 33 }, { "epoch": 0.4689655172413793, "grad_norm": 1.9610539674758911, "learning_rate": 4.9529454422455976e-05, "loss": 1.3204, "step": 34 }, { "epoch": 0.4689655172413793, "eval_loss": 0.9610344767570496, "eval_runtime": 17.6076, "eval_samples_per_second": 1.136, "eval_steps_per_second": 0.568, "step": 34 }, { "epoch": 0.4827586206896552, "grad_norm": 2.2027809619903564, "learning_rate": 4.944806430988927e-05, "loss": 1.3801, "step": 35 }, { "epoch": 0.4827586206896552, "eval_loss": 0.9546059370040894, "eval_runtime": 17.5811, "eval_samples_per_second": 1.138, "eval_steps_per_second": 0.569, "step": 35 }, { "epoch": 0.496551724137931, "grad_norm": 2.3457250595092773, "learning_rate": 4.936026311617316e-05, "loss": 1.4401, "step": 36 }, { "epoch": 0.496551724137931, "eval_loss": 0.9482511281967163, "eval_runtime": 17.8351, "eval_samples_per_second": 1.121, "eval_steps_per_second": 0.561, "step": 36 }, { "epoch": 0.5103448275862069, "grad_norm": 2.161039352416992, "learning_rate": 4.926607386564898e-05, "loss": 1.4067, "step": 37 }, { "epoch": 0.5103448275862069, "eval_loss": 0.9448164701461792, "eval_runtime": 17.6014, "eval_samples_per_second": 1.136, "eval_steps_per_second": 0.568, "step": 37 }, { "epoch": 0.5241379310344828, "grad_norm": 2.1683900356292725, "learning_rate": 4.916552125781528e-05, "loss": 1.3806, "step": 38 }, { "epoch": 0.5241379310344828, "eval_loss": 0.9402996897697449, "eval_runtime": 17.6524, "eval_samples_per_second": 1.133, "eval_steps_per_second": 0.566, "step": 38 }, { "epoch": 0.5379310344827586, "grad_norm": 2.2735962867736816, "learning_rate": 4.9058631660850765e-05, "loss": 1.4937, "step": 39 }, { "epoch": 0.5379310344827586, "eval_loss": 0.9291872978210449, "eval_runtime": 17.5838, "eval_samples_per_second": 1.137, "eval_steps_per_second": 0.569, "step": 39 }, { "epoch": 0.5655172413793104, "grad_norm": 2.2170450687408447, "learning_rate": 2.2727272727272728e-06, "loss": 1.316, "step": 41 }, { "epoch": 0.5655172413793104, "eval_loss": 0.9163956642150879, "eval_runtime": 15.7145, "eval_samples_per_second": 1.273, "eval_steps_per_second": 0.636, "step": 41 }, { "epoch": 0.5793103448275863, "grad_norm": 2.2266974449157715, "learning_rate": 4.5454545454545455e-06, "loss": 1.3854, "step": 42 }, { "epoch": 0.5793103448275863, "eval_loss": 0.9137259721755981, "eval_runtime": 15.7133, "eval_samples_per_second": 1.273, "eval_steps_per_second": 0.636, "step": 42 }, { "epoch": 0.593103448275862, "grad_norm": 2.3451268672943115, "learning_rate": 6.818181818181818e-06, "loss": 1.4208, "step": 43 }, { "epoch": 0.593103448275862, "eval_loss": 0.9096618890762329, "eval_runtime": 15.7895, "eval_samples_per_second": 1.267, "eval_steps_per_second": 0.633, "step": 43 }, { "epoch": 0.6068965517241379, "grad_norm": 2.0125885009765625, "learning_rate": 9.090909090909091e-06, "loss": 1.4302, "step": 44 }, { "epoch": 0.6068965517241379, "eval_loss": 0.9058458209037781, "eval_runtime": 15.6899, "eval_samples_per_second": 1.275, "eval_steps_per_second": 0.637, "step": 44 }, { "epoch": 0.6206896551724138, "grad_norm": 2.1096601486206055, "learning_rate": 1.1363636363636365e-05, "loss": 1.3981, "step": 45 }, { "epoch": 0.6206896551724138, "eval_loss": 0.8982122540473938, "eval_runtime": 15.707, "eval_samples_per_second": 1.273, "eval_steps_per_second": 0.637, "step": 45 }, { "epoch": 0.6344827586206897, "grad_norm": 1.971846342086792, "learning_rate": 1.3636363636363637e-05, "loss": 1.263, "step": 46 }, { "epoch": 0.6344827586206897, "eval_loss": 0.891434371471405, "eval_runtime": 15.7993, "eval_samples_per_second": 1.266, "eval_steps_per_second": 0.633, "step": 46 }, { "epoch": 0.6482758620689655, "grad_norm": 1.9724080562591553, "learning_rate": 1.590909090909091e-05, "loss": 1.355, "step": 47 }, { "epoch": 0.6482758620689655, "eval_loss": 0.8870094418525696, "eval_runtime": 15.6828, "eval_samples_per_second": 1.275, "eval_steps_per_second": 0.638, "step": 47 }, { "epoch": 0.6620689655172414, "grad_norm": 2.0631349086761475, "learning_rate": 1.8181818181818182e-05, "loss": 1.3375, "step": 48 }, { "epoch": 0.6620689655172414, "eval_loss": 0.8790606260299683, "eval_runtime": 15.6783, "eval_samples_per_second": 1.276, "eval_steps_per_second": 0.638, "step": 48 }, { "epoch": 0.6758620689655173, "grad_norm": 2.1942760944366455, "learning_rate": 2.0454545454545457e-05, "loss": 1.3937, "step": 49 }, { "epoch": 0.6758620689655173, "eval_loss": 0.8732376098632812, "eval_runtime": 15.6854, "eval_samples_per_second": 1.275, "eval_steps_per_second": 0.638, "step": 49 }, { "epoch": 0.6896551724137931, "grad_norm": 2.119081497192383, "learning_rate": 2.272727272727273e-05, "loss": 1.5447, "step": 50 }, { "epoch": 0.6896551724137931, "eval_loss": 0.8692445755004883, "eval_runtime": 15.6827, "eval_samples_per_second": 1.275, "eval_steps_per_second": 0.638, "step": 50 }, { "epoch": 0.7034482758620689, "grad_norm": 1.9801068305969238, "learning_rate": 2.5e-05, "loss": 1.2777, "step": 51 }, { "epoch": 0.7034482758620689, "eval_loss": 0.8668963313102722, "eval_runtime": 15.7049, "eval_samples_per_second": 1.273, "eval_steps_per_second": 0.637, "step": 51 }, { "epoch": 0.7172413793103448, "grad_norm": 2.0645248889923096, "learning_rate": 2.7272727272727273e-05, "loss": 1.3444, "step": 52 }, { "epoch": 0.7172413793103448, "eval_loss": 0.8615155220031738, "eval_runtime": 15.6899, "eval_samples_per_second": 1.275, "eval_steps_per_second": 0.637, "step": 52 }, { "epoch": 0.7310344827586207, "grad_norm": 2.1377453804016113, "learning_rate": 2.954545454545455e-05, "loss": 1.4174, "step": 53 }, { "epoch": 0.7310344827586207, "eval_loss": 0.8575263023376465, "eval_runtime": 15.6427, "eval_samples_per_second": 1.279, "eval_steps_per_second": 0.639, "step": 53 }, { "epoch": 0.7448275862068966, "grad_norm": 2.1462454795837402, "learning_rate": 3.181818181818182e-05, "loss": 1.429, "step": 54 }, { "epoch": 0.7448275862068966, "eval_loss": 0.8533774614334106, "eval_runtime": 15.7668, "eval_samples_per_second": 1.268, "eval_steps_per_second": 0.634, "step": 54 }, { "epoch": 0.7724137931034483, "grad_norm": 2.1803667545318604, "learning_rate": 2.2727272727272728e-06, "loss": 1.4271, "step": 56 }, { "epoch": 0.7724137931034483, "eval_loss": 0.8433731198310852, "eval_runtime": 17.9885, "eval_samples_per_second": 1.112, "eval_steps_per_second": 0.556, "step": 56 }, { "epoch": 0.7862068965517242, "grad_norm": 2.3162448406219482, "learning_rate": 4.5454545454545455e-06, "loss": 1.4689, "step": 57 }, { "epoch": 0.7862068965517242, "eval_loss": 0.8418852090835571, "eval_runtime": 18.2763, "eval_samples_per_second": 1.094, "eval_steps_per_second": 0.547, "step": 57 }, { "epoch": 0.8, "grad_norm": 1.9732853174209595, "learning_rate": 6.818181818181818e-06, "loss": 1.2825, "step": 58 }, { "epoch": 0.8, "eval_loss": 0.8386393785476685, "eval_runtime": 18.1184, "eval_samples_per_second": 1.104, "eval_steps_per_second": 0.552, "step": 58 }, { "epoch": 0.8137931034482758, "grad_norm": 2.0547423362731934, "learning_rate": 9.090909090909091e-06, "loss": 1.2972, "step": 59 }, { "epoch": 0.8137931034482758, "eval_loss": 0.8355510830879211, "eval_runtime": 18.2216, "eval_samples_per_second": 1.098, "eval_steps_per_second": 0.549, "step": 59 } ], "logging_steps": 1, "max_steps": 216, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.477855969291469e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }