{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.524137931034483, "eval_steps": 1, "global_step": 110, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013793103448275862, "grad_norm": 6.989287853240967, "learning_rate": 2.2727272727272728e-06, "loss": 1.8926, "step": 1 }, { "epoch": 0.013793103448275862, "eval_loss": 2.144650459289551, "eval_runtime": 17.2592, "eval_samples_per_second": 1.159, "eval_steps_per_second": 0.579, "step": 1 }, { "epoch": 0.027586206896551724, "grad_norm": 6.281332492828369, "learning_rate": 4.5454545454545455e-06, "loss": 1.986, "step": 2 }, { "epoch": 0.027586206896551724, "eval_loss": 2.0905685424804688, "eval_runtime": 17.6184, "eval_samples_per_second": 1.135, "eval_steps_per_second": 0.568, "step": 2 }, { "epoch": 0.041379310344827586, "grad_norm": 4.347537040710449, "learning_rate": 6.818181818181818e-06, "loss": 1.9355, "step": 3 }, { "epoch": 0.041379310344827586, "eval_loss": 1.9983774423599243, "eval_runtime": 17.5928, "eval_samples_per_second": 1.137, "eval_steps_per_second": 0.568, "step": 3 }, { "epoch": 0.05517241379310345, "grad_norm": 4.809764385223389, "learning_rate": 9.090909090909091e-06, "loss": 1.7509, "step": 4 }, { "epoch": 0.05517241379310345, "eval_loss": 1.8737837076187134, "eval_runtime": 17.6375, "eval_samples_per_second": 1.134, "eval_steps_per_second": 0.567, "step": 4 }, { "epoch": 0.06896551724137931, "grad_norm": 3.4548702239990234, "learning_rate": 1.1363636363636365e-05, "loss": 1.8838, "step": 5 }, { "epoch": 0.06896551724137931, "eval_loss": 1.7746165990829468, "eval_runtime": 18.0257, "eval_samples_per_second": 1.11, "eval_steps_per_second": 0.555, "step": 5 }, { "epoch": 0.08275862068965517, "grad_norm": 3.1943702697753906, "learning_rate": 1.3636363636363637e-05, "loss": 1.7707, "step": 6 }, { "epoch": 0.08275862068965517, "eval_loss": 1.6792665719985962, "eval_runtime": 17.7498, "eval_samples_per_second": 1.127, "eval_steps_per_second": 0.563, "step": 6 }, { "epoch": 0.09655172413793103, "grad_norm": 3.318288564682007, "learning_rate": 1.590909090909091e-05, "loss": 1.7171, "step": 7 }, { "epoch": 0.09655172413793103, "eval_loss": 1.5874873399734497, "eval_runtime": 17.6295, "eval_samples_per_second": 1.134, "eval_steps_per_second": 0.567, "step": 7 }, { "epoch": 0.1103448275862069, "grad_norm": 3.210330009460449, "learning_rate": 1.8181818181818182e-05, "loss": 1.5734, "step": 8 }, { "epoch": 0.1103448275862069, "eval_loss": 1.535287618637085, "eval_runtime": 17.6232, "eval_samples_per_second": 1.135, "eval_steps_per_second": 0.567, "step": 8 }, { "epoch": 0.12413793103448276, "grad_norm": 3.2319107055664062, "learning_rate": 2.0454545454545457e-05, "loss": 1.7986, "step": 9 }, { "epoch": 0.12413793103448276, "eval_loss": 1.467301607131958, "eval_runtime": 17.5824, "eval_samples_per_second": 1.138, "eval_steps_per_second": 0.569, "step": 9 }, { "epoch": 0.13793103448275862, "grad_norm": 2.79286789894104, "learning_rate": 2.272727272727273e-05, "loss": 1.5025, "step": 10 }, { "epoch": 0.13793103448275862, "eval_loss": 1.3961191177368164, "eval_runtime": 18.3446, "eval_samples_per_second": 1.09, "eval_steps_per_second": 0.545, "step": 10 }, { "epoch": 0.15172413793103448, "grad_norm": 2.885422706604004, "learning_rate": 2.5e-05, "loss": 1.5477, "step": 11 }, { "epoch": 0.15172413793103448, "eval_loss": 1.3420469760894775, "eval_runtime": 17.7683, "eval_samples_per_second": 1.126, "eval_steps_per_second": 0.563, "step": 11 }, { "epoch": 0.16551724137931034, "grad_norm": 2.7671327590942383, "learning_rate": 2.7272727272727273e-05, "loss": 1.6921, "step": 12 }, { "epoch": 0.16551724137931034, "eval_loss": 1.3071445226669312, "eval_runtime": 17.652, "eval_samples_per_second": 1.133, "eval_steps_per_second": 0.567, "step": 12 }, { "epoch": 0.1793103448275862, "grad_norm": 2.9047963619232178, "learning_rate": 2.954545454545455e-05, "loss": 1.5365, "step": 13 }, { "epoch": 0.1793103448275862, "eval_loss": 1.2601890563964844, "eval_runtime": 17.5232, "eval_samples_per_second": 1.141, "eval_steps_per_second": 0.571, "step": 13 }, { "epoch": 0.19310344827586207, "grad_norm": 2.6054675579071045, "learning_rate": 3.181818181818182e-05, "loss": 1.6621, "step": 14 }, { "epoch": 0.19310344827586207, "eval_loss": 1.2506535053253174, "eval_runtime": 17.6295, "eval_samples_per_second": 1.134, "eval_steps_per_second": 0.567, "step": 14 }, { "epoch": 0.20689655172413793, "grad_norm": 2.538036823272705, "learning_rate": 3.409090909090909e-05, "loss": 1.6763, "step": 15 }, { "epoch": 0.20689655172413793, "eval_loss": 1.2366451025009155, "eval_runtime": 18.6172, "eval_samples_per_second": 1.074, "eval_steps_per_second": 0.537, "step": 15 }, { "epoch": 0.2206896551724138, "grad_norm": 2.5125789642333984, "learning_rate": 3.6363636363636364e-05, "loss": 1.668, "step": 16 }, { "epoch": 0.2206896551724138, "eval_loss": 1.2205184698104858, "eval_runtime": 17.7529, "eval_samples_per_second": 1.127, "eval_steps_per_second": 0.563, "step": 16 }, { "epoch": 0.23448275862068965, "grad_norm": 5.055665969848633, "learning_rate": 3.8636363636363636e-05, "loss": 1.5703, "step": 17 }, { "epoch": 0.23448275862068965, "eval_loss": 1.167407751083374, "eval_runtime": 17.5902, "eval_samples_per_second": 1.137, "eval_steps_per_second": 0.568, "step": 17 }, { "epoch": 0.2482758620689655, "grad_norm": 2.567411422729492, "learning_rate": 4.0909090909090915e-05, "loss": 1.4859, "step": 18 }, { "epoch": 0.2482758620689655, "eval_loss": 1.1367636919021606, "eval_runtime": 17.4832, "eval_samples_per_second": 1.144, "eval_steps_per_second": 0.572, "step": 18 }, { "epoch": 0.2620689655172414, "grad_norm": 2.3214948177337646, "learning_rate": 4.318181818181819e-05, "loss": 1.4511, "step": 19 }, { "epoch": 0.2620689655172414, "eval_loss": 1.1296402215957642, "eval_runtime": 17.6655, "eval_samples_per_second": 1.132, "eval_steps_per_second": 0.566, "step": 19 }, { "epoch": 0.27586206896551724, "grad_norm": 2.390448570251465, "learning_rate": 4.545454545454546e-05, "loss": 1.7181, "step": 20 }, { "epoch": 0.27586206896551724, "eval_loss": 1.126497507095337, "eval_runtime": 17.9736, "eval_samples_per_second": 1.113, "eval_steps_per_second": 0.556, "step": 20 }, { "epoch": 0.2896551724137931, "grad_norm": 2.3728342056274414, "learning_rate": 4.772727272727273e-05, "loss": 1.4155, "step": 21 }, { "epoch": 0.2896551724137931, "eval_loss": 1.09345281124115, "eval_runtime": 17.8283, "eval_samples_per_second": 1.122, "eval_steps_per_second": 0.561, "step": 21 }, { "epoch": 0.30344827586206896, "grad_norm": 2.4872097969055176, "learning_rate": 5e-05, "loss": 1.3752, "step": 22 }, { "epoch": 0.30344827586206896, "eval_loss": 1.0705276727676392, "eval_runtime": 17.6481, "eval_samples_per_second": 1.133, "eval_steps_per_second": 0.567, "step": 22 }, { "epoch": 0.31724137931034485, "grad_norm": 2.953234910964966, "learning_rate": 4.999672209164081e-05, "loss": 1.4449, "step": 23 }, { "epoch": 0.31724137931034485, "eval_loss": 1.0468412637710571, "eval_runtime": 17.6053, "eval_samples_per_second": 1.136, "eval_steps_per_second": 0.568, "step": 23 }, { "epoch": 0.3310344827586207, "grad_norm": 2.47603702545166, "learning_rate": 4.998688922613788e-05, "loss": 1.4286, "step": 24 }, { "epoch": 0.3310344827586207, "eval_loss": 1.0292497873306274, "eval_runtime": 17.5777, "eval_samples_per_second": 1.138, "eval_steps_per_second": 0.569, "step": 24 }, { "epoch": 0.3448275862068966, "grad_norm": 2.2879106998443604, "learning_rate": 4.997050398198977e-05, "loss": 1.5164, "step": 25 }, { "epoch": 0.3448275862068966, "eval_loss": 1.0196115970611572, "eval_runtime": 17.9439, "eval_samples_per_second": 1.115, "eval_steps_per_second": 0.557, "step": 25 }, { "epoch": 0.3586206896551724, "grad_norm": 2.319134473800659, "learning_rate": 4.9947570655942796e-05, "loss": 1.5282, "step": 26 }, { "epoch": 0.3586206896551724, "eval_loss": 1.013381004333496, "eval_runtime": 17.7628, "eval_samples_per_second": 1.126, "eval_steps_per_second": 0.563, "step": 26 }, { "epoch": 0.3724137931034483, "grad_norm": 2.259608745574951, "learning_rate": 4.991809526186424e-05, "loss": 1.4901, "step": 27 }, { "epoch": 0.3724137931034483, "eval_loss": 1.0091207027435303, "eval_runtime": 17.619, "eval_samples_per_second": 1.135, "eval_steps_per_second": 0.568, "step": 27 }, { "epoch": 0.38620689655172413, "grad_norm": 2.2252631187438965, "learning_rate": 4.988208552916535e-05, "loss": 1.5518, "step": 28 }, { "epoch": 0.38620689655172413, "eval_loss": 1.0063353776931763, "eval_runtime": 17.4778, "eval_samples_per_second": 1.144, "eval_steps_per_second": 0.572, "step": 28 }, { "epoch": 0.4, "grad_norm": 2.2154901027679443, "learning_rate": 4.983955090077444e-05, "loss": 1.4682, "step": 29 }, { "epoch": 0.4, "eval_loss": 0.99261075258255, "eval_runtime": 17.4894, "eval_samples_per_second": 1.144, "eval_steps_per_second": 0.572, "step": 29 }, { "epoch": 0.41379310344827586, "grad_norm": 2.320786237716675, "learning_rate": 4.9790502530660635e-05, "loss": 1.4691, "step": 30 }, { "epoch": 0.41379310344827586, "eval_loss": 0.9836109280586243, "eval_runtime": 16.9043, "eval_samples_per_second": 1.183, "eval_steps_per_second": 0.592, "step": 30 }, { "epoch": 0.42758620689655175, "grad_norm": 2.1385531425476074, "learning_rate": 4.9734953280908904e-05, "loss": 1.4696, "step": 31 }, { "epoch": 0.42758620689655175, "eval_loss": 0.976610541343689, "eval_runtime": 17.3486, "eval_samples_per_second": 1.153, "eval_steps_per_second": 0.576, "step": 31 }, { "epoch": 0.4413793103448276, "grad_norm": 2.2254769802093506, "learning_rate": 4.967291771834727e-05, "loss": 1.531, "step": 32 }, { "epoch": 0.4413793103448276, "eval_loss": 0.9718761444091797, "eval_runtime": 17.5285, "eval_samples_per_second": 1.141, "eval_steps_per_second": 0.57, "step": 32 }, { "epoch": 0.45517241379310347, "grad_norm": 2.34843373298645, "learning_rate": 4.960441211072686e-05, "loss": 1.5484, "step": 33 }, { "epoch": 0.45517241379310347, "eval_loss": 0.9682589769363403, "eval_runtime": 17.5952, "eval_samples_per_second": 1.137, "eval_steps_per_second": 0.568, "step": 33 }, { "epoch": 0.4689655172413793, "grad_norm": 1.9610539674758911, "learning_rate": 4.9529454422455976e-05, "loss": 1.3204, "step": 34 }, { "epoch": 0.4689655172413793, "eval_loss": 0.9610344767570496, "eval_runtime": 17.6076, "eval_samples_per_second": 1.136, "eval_steps_per_second": 0.568, "step": 34 }, { "epoch": 0.4827586206896552, "grad_norm": 2.2027809619903564, "learning_rate": 4.944806430988927e-05, "loss": 1.3801, "step": 35 }, { "epoch": 0.4827586206896552, "eval_loss": 0.9546059370040894, "eval_runtime": 17.5811, "eval_samples_per_second": 1.138, "eval_steps_per_second": 0.569, "step": 35 }, { "epoch": 0.496551724137931, "grad_norm": 2.3457250595092773, "learning_rate": 4.936026311617316e-05, "loss": 1.4401, "step": 36 }, { "epoch": 0.496551724137931, "eval_loss": 0.9482511281967163, "eval_runtime": 17.8351, "eval_samples_per_second": 1.121, "eval_steps_per_second": 0.561, "step": 36 }, { "epoch": 0.5103448275862069, "grad_norm": 2.161039352416992, "learning_rate": 4.926607386564898e-05, "loss": 1.4067, "step": 37 }, { "epoch": 0.5103448275862069, "eval_loss": 0.9448164701461792, "eval_runtime": 17.6014, "eval_samples_per_second": 1.136, "eval_steps_per_second": 0.568, "step": 37 }, { "epoch": 0.5241379310344828, "grad_norm": 2.1683900356292725, "learning_rate": 4.916552125781528e-05, "loss": 1.3806, "step": 38 }, { "epoch": 0.5241379310344828, "eval_loss": 0.9402996897697449, "eval_runtime": 17.6524, "eval_samples_per_second": 1.133, "eval_steps_per_second": 0.566, "step": 38 }, { "epoch": 0.5379310344827586, "grad_norm": 2.2735962867736816, "learning_rate": 4.9058631660850765e-05, "loss": 1.4937, "step": 39 }, { "epoch": 0.5379310344827586, "eval_loss": 0.9291872978210449, "eval_runtime": 17.5838, "eval_samples_per_second": 1.137, "eval_steps_per_second": 0.569, "step": 39 }, { "epoch": 0.5655172413793104, "grad_norm": 2.2170450687408447, "learning_rate": 2.2727272727272728e-06, "loss": 1.316, "step": 41 }, { "epoch": 0.5655172413793104, "eval_loss": 0.9163956642150879, "eval_runtime": 15.7145, "eval_samples_per_second": 1.273, "eval_steps_per_second": 0.636, "step": 41 }, { "epoch": 0.5793103448275863, "grad_norm": 2.2266974449157715, "learning_rate": 4.5454545454545455e-06, "loss": 1.3854, "step": 42 }, { "epoch": 0.5793103448275863, "eval_loss": 0.9137259721755981, "eval_runtime": 15.7133, "eval_samples_per_second": 1.273, "eval_steps_per_second": 0.636, "step": 42 }, { "epoch": 0.593103448275862, "grad_norm": 2.3451268672943115, "learning_rate": 6.818181818181818e-06, "loss": 1.4208, "step": 43 }, { "epoch": 0.593103448275862, "eval_loss": 0.9096618890762329, "eval_runtime": 15.7895, "eval_samples_per_second": 1.267, "eval_steps_per_second": 0.633, "step": 43 }, { "epoch": 0.6068965517241379, "grad_norm": 2.0125885009765625, "learning_rate": 9.090909090909091e-06, "loss": 1.4302, "step": 44 }, { "epoch": 0.6068965517241379, "eval_loss": 0.9058458209037781, "eval_runtime": 15.6899, "eval_samples_per_second": 1.275, "eval_steps_per_second": 0.637, "step": 44 }, { "epoch": 0.6206896551724138, "grad_norm": 2.1096601486206055, "learning_rate": 1.1363636363636365e-05, "loss": 1.3981, "step": 45 }, { "epoch": 0.6206896551724138, "eval_loss": 0.8982122540473938, "eval_runtime": 15.707, "eval_samples_per_second": 1.273, "eval_steps_per_second": 0.637, "step": 45 }, { "epoch": 0.6344827586206897, "grad_norm": 1.971846342086792, "learning_rate": 1.3636363636363637e-05, "loss": 1.263, "step": 46 }, { "epoch": 0.6344827586206897, "eval_loss": 0.891434371471405, "eval_runtime": 15.7993, "eval_samples_per_second": 1.266, "eval_steps_per_second": 0.633, "step": 46 }, { "epoch": 0.6482758620689655, "grad_norm": 1.9724080562591553, "learning_rate": 1.590909090909091e-05, "loss": 1.355, "step": 47 }, { "epoch": 0.6482758620689655, "eval_loss": 0.8870094418525696, "eval_runtime": 15.6828, "eval_samples_per_second": 1.275, "eval_steps_per_second": 0.638, "step": 47 }, { "epoch": 0.6620689655172414, "grad_norm": 2.0631349086761475, "learning_rate": 1.8181818181818182e-05, "loss": 1.3375, "step": 48 }, { "epoch": 0.6620689655172414, "eval_loss": 0.8790606260299683, "eval_runtime": 15.6783, "eval_samples_per_second": 1.276, "eval_steps_per_second": 0.638, "step": 48 }, { "epoch": 0.6758620689655173, "grad_norm": 2.1942760944366455, "learning_rate": 2.0454545454545457e-05, "loss": 1.3937, "step": 49 }, { "epoch": 0.6758620689655173, "eval_loss": 0.8732376098632812, "eval_runtime": 15.6854, "eval_samples_per_second": 1.275, "eval_steps_per_second": 0.638, "step": 49 }, { "epoch": 0.6896551724137931, "grad_norm": 2.119081497192383, "learning_rate": 2.272727272727273e-05, "loss": 1.5447, "step": 50 }, { "epoch": 0.6896551724137931, "eval_loss": 0.8692445755004883, "eval_runtime": 15.6827, "eval_samples_per_second": 1.275, "eval_steps_per_second": 0.638, "step": 50 }, { "epoch": 0.7034482758620689, "grad_norm": 1.9801068305969238, "learning_rate": 2.5e-05, "loss": 1.2777, "step": 51 }, { "epoch": 0.7034482758620689, "eval_loss": 0.8668963313102722, "eval_runtime": 15.7049, "eval_samples_per_second": 1.273, "eval_steps_per_second": 0.637, "step": 51 }, { "epoch": 0.7172413793103448, "grad_norm": 2.0645248889923096, "learning_rate": 2.7272727272727273e-05, "loss": 1.3444, "step": 52 }, { "epoch": 0.7172413793103448, "eval_loss": 0.8615155220031738, "eval_runtime": 15.6899, "eval_samples_per_second": 1.275, "eval_steps_per_second": 0.637, "step": 52 }, { "epoch": 0.7310344827586207, "grad_norm": 2.1377453804016113, "learning_rate": 2.954545454545455e-05, "loss": 1.4174, "step": 53 }, { "epoch": 0.7310344827586207, "eval_loss": 0.8575263023376465, "eval_runtime": 15.6427, "eval_samples_per_second": 1.279, "eval_steps_per_second": 0.639, "step": 53 }, { "epoch": 0.7448275862068966, "grad_norm": 2.1462454795837402, "learning_rate": 3.181818181818182e-05, "loss": 1.429, "step": 54 }, { "epoch": 0.7448275862068966, "eval_loss": 0.8533774614334106, "eval_runtime": 15.7668, "eval_samples_per_second": 1.268, "eval_steps_per_second": 0.634, "step": 54 }, { "epoch": 0.7724137931034483, "grad_norm": 2.1803667545318604, "learning_rate": 2.2727272727272728e-06, "loss": 1.4271, "step": 56 }, { "epoch": 0.7724137931034483, "eval_loss": 0.8433731198310852, "eval_runtime": 17.9885, "eval_samples_per_second": 1.112, "eval_steps_per_second": 0.556, "step": 56 }, { "epoch": 0.7862068965517242, "grad_norm": 2.3162448406219482, "learning_rate": 4.5454545454545455e-06, "loss": 1.4689, "step": 57 }, { "epoch": 0.7862068965517242, "eval_loss": 0.8418852090835571, "eval_runtime": 18.2763, "eval_samples_per_second": 1.094, "eval_steps_per_second": 0.547, "step": 57 }, { "epoch": 0.8, "grad_norm": 1.9732853174209595, "learning_rate": 6.818181818181818e-06, "loss": 1.2825, "step": 58 }, { "epoch": 0.8, "eval_loss": 0.8386393785476685, "eval_runtime": 18.1184, "eval_samples_per_second": 1.104, "eval_steps_per_second": 0.552, "step": 58 }, { "epoch": 0.8137931034482758, "grad_norm": 2.0547423362731934, "learning_rate": 9.090909090909091e-06, "loss": 1.2972, "step": 59 }, { "epoch": 0.8137931034482758, "eval_loss": 0.8355510830879211, "eval_runtime": 18.2216, "eval_samples_per_second": 1.098, "eval_steps_per_second": 0.549, "step": 59 }, { "epoch": 0.8275862068965517, "grad_norm": 2.0684103965759277, "learning_rate": 1.1363636363636365e-05, "loss": 1.3615, "step": 60 }, { "epoch": 0.8275862068965517, "eval_loss": 0.8328086137771606, "eval_runtime": 18.8073, "eval_samples_per_second": 1.063, "eval_steps_per_second": 0.532, "step": 60 }, { "epoch": 0.8413793103448276, "grad_norm": 2.0212347507476807, "learning_rate": 1.3636363636363637e-05, "loss": 1.3648, "step": 61 }, { "epoch": 0.8413793103448276, "eval_loss": 0.8308294415473938, "eval_runtime": 18.376, "eval_samples_per_second": 1.088, "eval_steps_per_second": 0.544, "step": 61 }, { "epoch": 0.8551724137931035, "grad_norm": 1.9967029094696045, "learning_rate": 1.590909090909091e-05, "loss": 1.4334, "step": 62 }, { "epoch": 0.8551724137931035, "eval_loss": 0.8297985792160034, "eval_runtime": 18.24, "eval_samples_per_second": 1.096, "eval_steps_per_second": 0.548, "step": 62 }, { "epoch": 0.8689655172413793, "grad_norm": 1.956730842590332, "learning_rate": 1.8181818181818182e-05, "loss": 1.246, "step": 63 }, { "epoch": 0.8689655172413793, "eval_loss": 0.8276138305664062, "eval_runtime": 18.1099, "eval_samples_per_second": 1.104, "eval_steps_per_second": 0.552, "step": 63 }, { "epoch": 0.8827586206896552, "grad_norm": 1.8840367794036865, "learning_rate": 2.0454545454545457e-05, "loss": 1.2346, "step": 64 }, { "epoch": 0.8827586206896552, "eval_loss": 0.8268927335739136, "eval_runtime": 18.2242, "eval_samples_per_second": 1.097, "eval_steps_per_second": 0.549, "step": 64 }, { "epoch": 0.896551724137931, "grad_norm": 1.9588886499404907, "learning_rate": 2.272727272727273e-05, "loss": 1.3699, "step": 65 }, { "epoch": 0.896551724137931, "eval_loss": 0.8241379857063293, "eval_runtime": 18.6162, "eval_samples_per_second": 1.074, "eval_steps_per_second": 0.537, "step": 65 }, { "epoch": 0.9103448275862069, "grad_norm": 2.001984119415283, "learning_rate": 2.5e-05, "loss": 1.4399, "step": 66 }, { "epoch": 0.9103448275862069, "eval_loss": 0.8220138549804688, "eval_runtime": 18.4936, "eval_samples_per_second": 1.081, "eval_steps_per_second": 0.541, "step": 66 }, { "epoch": 0.9241379310344827, "grad_norm": 1.9502840042114258, "learning_rate": 2.7272727272727273e-05, "loss": 1.1969, "step": 67 }, { "epoch": 0.9241379310344827, "eval_loss": 0.8098680377006531, "eval_runtime": 18.2406, "eval_samples_per_second": 1.096, "eval_steps_per_second": 0.548, "step": 67 }, { "epoch": 0.9379310344827586, "grad_norm": 1.8572745323181152, "learning_rate": 2.954545454545455e-05, "loss": 1.1968, "step": 68 }, { "epoch": 0.9379310344827586, "eval_loss": 0.7934565544128418, "eval_runtime": 18.2457, "eval_samples_per_second": 1.096, "eval_steps_per_second": 0.548, "step": 68 }, { "epoch": 0.9517241379310345, "grad_norm": 2.0354831218719482, "learning_rate": 3.181818181818182e-05, "loss": 1.2528, "step": 69 }, { "epoch": 0.9517241379310345, "eval_loss": 0.7829666137695312, "eval_runtime": 18.2217, "eval_samples_per_second": 1.098, "eval_steps_per_second": 0.549, "step": 69 }, { "epoch": 0.9655172413793104, "grad_norm": 2.1164538860321045, "learning_rate": 3.409090909090909e-05, "loss": 1.3873, "step": 70 }, { "epoch": 0.9655172413793104, "eval_loss": 0.7753366231918335, "eval_runtime": 18.5302, "eval_samples_per_second": 1.079, "eval_steps_per_second": 0.54, "step": 70 }, { "epoch": 0.9793103448275862, "grad_norm": 2.032721996307373, "learning_rate": 3.6363636363636364e-05, "loss": 1.232, "step": 71 }, { "epoch": 0.9793103448275862, "eval_loss": 0.7628229856491089, "eval_runtime": 18.4062, "eval_samples_per_second": 1.087, "eval_steps_per_second": 0.543, "step": 71 }, { "epoch": 0.993103448275862, "grad_norm": 2.1039462089538574, "learning_rate": 3.8636363636363636e-05, "loss": 1.2715, "step": 72 }, { "epoch": 0.993103448275862, "eval_loss": 0.751362681388855, "eval_runtime": 18.2628, "eval_samples_per_second": 1.095, "eval_steps_per_second": 0.548, "step": 72 }, { "epoch": 1.006896551724138, "grad_norm": 2.1415343284606934, "learning_rate": 4.0909090909090915e-05, "loss": 1.3012, "step": 73 }, { "epoch": 1.006896551724138, "eval_loss": 0.7407116293907166, "eval_runtime": 18.1993, "eval_samples_per_second": 1.099, "eval_steps_per_second": 0.549, "step": 73 }, { "epoch": 1.0206896551724138, "grad_norm": 1.9539107084274292, "learning_rate": 4.318181818181819e-05, "loss": 1.1411, "step": 74 }, { "epoch": 1.0206896551724138, "eval_loss": 0.7367935180664062, "eval_runtime": 18.2237, "eval_samples_per_second": 1.097, "eval_steps_per_second": 0.549, "step": 74 }, { "epoch": 1.0344827586206897, "grad_norm": 2.0641109943389893, "learning_rate": 4.545454545454546e-05, "loss": 1.0793, "step": 75 }, { "epoch": 1.0344827586206897, "eval_loss": 0.7307212948799133, "eval_runtime": 18.5348, "eval_samples_per_second": 1.079, "eval_steps_per_second": 0.54, "step": 75 }, { "epoch": 1.0482758620689656, "grad_norm": 1.918042778968811, "learning_rate": 4.772727272727273e-05, "loss": 1.0897, "step": 76 }, { "epoch": 1.0482758620689656, "eval_loss": 0.7253277897834778, "eval_runtime": 18.4554, "eval_samples_per_second": 1.084, "eval_steps_per_second": 0.542, "step": 76 }, { "epoch": 1.0620689655172413, "grad_norm": 2.216691493988037, "learning_rate": 5e-05, "loss": 1.2309, "step": 77 }, { "epoch": 1.0620689655172413, "eval_loss": 0.7224608659744263, "eval_runtime": 18.0728, "eval_samples_per_second": 1.107, "eval_steps_per_second": 0.553, "step": 77 }, { "epoch": 1.0758620689655172, "grad_norm": 2.304621934890747, "learning_rate": 4.999672209164081e-05, "loss": 1.1722, "step": 78 }, { "epoch": 1.0758620689655172, "eval_loss": 0.7266848683357239, "eval_runtime": 18.2053, "eval_samples_per_second": 1.099, "eval_steps_per_second": 0.549, "step": 78 }, { "epoch": 1.089655172413793, "grad_norm": 2.0087103843688965, "learning_rate": 4.998688922613788e-05, "loss": 1.105, "step": 79 }, { "epoch": 1.089655172413793, "eval_loss": 0.7276325225830078, "eval_runtime": 18.0661, "eval_samples_per_second": 1.107, "eval_steps_per_second": 0.554, "step": 79 }, { "epoch": 1.103448275862069, "grad_norm": 2.047912836074829, "learning_rate": 4.997050398198977e-05, "loss": 1.0507, "step": 80 }, { "epoch": 1.103448275862069, "eval_loss": 0.7239590883255005, "eval_runtime": 18.4343, "eval_samples_per_second": 1.085, "eval_steps_per_second": 0.542, "step": 80 }, { "epoch": 1.1172413793103448, "grad_norm": 2.004422664642334, "learning_rate": 4.9947570655942796e-05, "loss": 0.9516, "step": 81 }, { "epoch": 1.1172413793103448, "eval_loss": 0.7144821882247925, "eval_runtime": 18.3983, "eval_samples_per_second": 1.087, "eval_steps_per_second": 0.544, "step": 81 }, { "epoch": 1.1310344827586207, "grad_norm": 2.013328790664673, "learning_rate": 4.991809526186424e-05, "loss": 1.0593, "step": 82 }, { "epoch": 1.1310344827586207, "eval_loss": 0.7059406638145447, "eval_runtime": 18.2362, "eval_samples_per_second": 1.097, "eval_steps_per_second": 0.548, "step": 82 }, { "epoch": 1.1448275862068966, "grad_norm": 2.068134069442749, "learning_rate": 4.988208552916535e-05, "loss": 1.1188, "step": 83 }, { "epoch": 1.1448275862068966, "eval_loss": 0.7021835446357727, "eval_runtime": 18.1868, "eval_samples_per_second": 1.1, "eval_steps_per_second": 0.55, "step": 83 }, { "epoch": 1.1586206896551725, "grad_norm": 2.2628672122955322, "learning_rate": 4.983955090077444e-05, "loss": 1.1473, "step": 84 }, { "epoch": 1.1586206896551725, "eval_loss": 0.6942790150642395, "eval_runtime": 18.1494, "eval_samples_per_second": 1.102, "eval_steps_per_second": 0.551, "step": 84 }, { "epoch": 1.1724137931034484, "grad_norm": 2.1747775077819824, "learning_rate": 4.9790502530660635e-05, "loss": 1.1778, "step": 85 }, { "epoch": 1.1724137931034484, "eval_loss": 0.6942981481552124, "eval_runtime": 18.8763, "eval_samples_per_second": 1.06, "eval_steps_per_second": 0.53, "step": 85 }, { "epoch": 1.186206896551724, "grad_norm": 2.152348041534424, "learning_rate": 4.9734953280908904e-05, "loss": 1.331, "step": 86 }, { "epoch": 1.186206896551724, "eval_loss": 0.6978840827941895, "eval_runtime": 18.349, "eval_samples_per_second": 1.09, "eval_steps_per_second": 0.545, "step": 86 }, { "epoch": 1.2, "grad_norm": 2.069314956665039, "learning_rate": 4.967291771834727e-05, "loss": 1.1638, "step": 87 }, { "epoch": 1.2, "eval_loss": 0.6983293294906616, "eval_runtime": 18.1961, "eval_samples_per_second": 1.099, "eval_steps_per_second": 0.55, "step": 87 }, { "epoch": 1.2137931034482758, "grad_norm": 2.037853717803955, "learning_rate": 4.960441211072686e-05, "loss": 1.1118, "step": 88 }, { "epoch": 1.2137931034482758, "eval_loss": 0.6962876915931702, "eval_runtime": 18.1105, "eval_samples_per_second": 1.104, "eval_steps_per_second": 0.552, "step": 88 }, { "epoch": 1.2275862068965517, "grad_norm": 1.9961076974868774, "learning_rate": 4.9529454422455976e-05, "loss": 1.0972, "step": 89 }, { "epoch": 1.2275862068965517, "eval_loss": 0.6896785497665405, "eval_runtime": 18.0553, "eval_samples_per_second": 1.108, "eval_steps_per_second": 0.554, "step": 89 }, { "epoch": 1.2413793103448276, "grad_norm": 2.28176212310791, "learning_rate": 4.944806430988927e-05, "loss": 1.3304, "step": 90 }, { "epoch": 1.2413793103448276, "eval_loss": 0.6826642751693726, "eval_runtime": 18.5055, "eval_samples_per_second": 1.081, "eval_steps_per_second": 0.54, "step": 90 }, { "epoch": 1.2551724137931035, "grad_norm": 1.894646406173706, "learning_rate": 4.936026311617316e-05, "loss": 1.0935, "step": 91 }, { "epoch": 1.2551724137931035, "eval_loss": 0.678307831287384, "eval_runtime": 18.3532, "eval_samples_per_second": 1.09, "eval_steps_per_second": 0.545, "step": 91 }, { "epoch": 1.2689655172413792, "grad_norm": 2.0475075244903564, "learning_rate": 4.926607386564898e-05, "loss": 1.2393, "step": 92 }, { "epoch": 1.2689655172413792, "eval_loss": 0.6765857934951782, "eval_runtime": 18.2689, "eval_samples_per_second": 1.095, "eval_steps_per_second": 0.547, "step": 92 }, { "epoch": 1.282758620689655, "grad_norm": 2.140949249267578, "learning_rate": 4.916552125781528e-05, "loss": 1.0277, "step": 93 }, { "epoch": 1.282758620689655, "eval_loss": 0.6735562682151794, "eval_runtime": 18.1407, "eval_samples_per_second": 1.102, "eval_steps_per_second": 0.551, "step": 93 }, { "epoch": 1.296551724137931, "grad_norm": 2.235147476196289, "learning_rate": 4.9058631660850765e-05, "loss": 1.2081, "step": 94 }, { "epoch": 1.296551724137931, "eval_loss": 0.6619122624397278, "eval_runtime": 18.2145, "eval_samples_per_second": 1.098, "eval_steps_per_second": 0.549, "step": 94 }, { "epoch": 1.3103448275862069, "grad_norm": 2.077143669128418, "learning_rate": 4.894543310469968e-05, "loss": 1.2378, "step": 95 }, { "epoch": 1.3103448275862069, "eval_loss": 0.6547893285751343, "eval_runtime": 18.7488, "eval_samples_per_second": 1.067, "eval_steps_per_second": 0.533, "step": 95 }, { "epoch": 1.3241379310344827, "grad_norm": 1.9517972469329834, "learning_rate": 4.882595527372152e-05, "loss": 1.0997, "step": 96 }, { "epoch": 1.3241379310344827, "eval_loss": 0.6498640775680542, "eval_runtime": 18.4304, "eval_samples_per_second": 1.085, "eval_steps_per_second": 0.543, "step": 96 }, { "epoch": 1.3379310344827586, "grad_norm": 2.0447959899902344, "learning_rate": 4.870022949890676e-05, "loss": 0.9613, "step": 97 }, { "epoch": 1.3379310344827586, "eval_loss": 0.6370054483413696, "eval_runtime": 18.252, "eval_samples_per_second": 1.096, "eval_steps_per_second": 0.548, "step": 97 }, { "epoch": 1.3517241379310345, "grad_norm": 2.078657865524292, "learning_rate": 4.856828874966086e-05, "loss": 1.1216, "step": 98 }, { "epoch": 1.3517241379310345, "eval_loss": 0.6291982531547546, "eval_runtime": 18.2386, "eval_samples_per_second": 1.097, "eval_steps_per_second": 0.548, "step": 98 }, { "epoch": 1.3655172413793104, "grad_norm": 2.0556623935699463, "learning_rate": 4.8430167625158595e-05, "loss": 1.0718, "step": 99 }, { "epoch": 1.3655172413793104, "eval_loss": 0.6218433380126953, "eval_runtime": 18.1671, "eval_samples_per_second": 1.101, "eval_steps_per_second": 0.55, "step": 99 }, { "epoch": 1.4, "grad_norm": 2.0707523822784424, "learning_rate": 2.2727272727272728e-06, "loss": 1.2557, "step": 101 }, { "epoch": 1.4, "eval_loss": 0.6193124055862427, "eval_runtime": 15.3335, "eval_samples_per_second": 1.304, "eval_steps_per_second": 0.652, "step": 101 }, { "epoch": 1.4137931034482758, "grad_norm": 2.099379539489746, "learning_rate": 4.5454545454545455e-06, "loss": 1.1953, "step": 102 }, { "epoch": 1.4137931034482758, "eval_loss": 0.6185603141784668, "eval_runtime": 15.3458, "eval_samples_per_second": 1.303, "eval_steps_per_second": 0.652, "step": 102 }, { "epoch": 1.4275862068965517, "grad_norm": 1.9917728900909424, "learning_rate": 6.818181818181818e-06, "loss": 1.2443, "step": 103 }, { "epoch": 1.4275862068965517, "eval_loss": 0.6176949739456177, "eval_runtime": 15.2437, "eval_samples_per_second": 1.312, "eval_steps_per_second": 0.656, "step": 103 }, { "epoch": 1.4413793103448276, "grad_norm": 1.8723604679107666, "learning_rate": 9.090909090909091e-06, "loss": 1.0864, "step": 104 }, { "epoch": 1.4413793103448276, "eval_loss": 0.6164005994796753, "eval_runtime": 15.2753, "eval_samples_per_second": 1.309, "eval_steps_per_second": 0.655, "step": 104 }, { "epoch": 1.4551724137931035, "grad_norm": 1.98838472366333, "learning_rate": 1.1363636363636365e-05, "loss": 1.2297, "step": 105 }, { "epoch": 1.4551724137931035, "eval_loss": 0.6155186891555786, "eval_runtime": 15.2373, "eval_samples_per_second": 1.313, "eval_steps_per_second": 0.656, "step": 105 }, { "epoch": 1.4689655172413794, "grad_norm": 1.8609561920166016, "learning_rate": 1.3636363636363637e-05, "loss": 0.9922, "step": 106 }, { "epoch": 1.4689655172413794, "eval_loss": 0.6143234372138977, "eval_runtime": 15.4977, "eval_samples_per_second": 1.291, "eval_steps_per_second": 0.645, "step": 106 }, { "epoch": 1.4827586206896552, "grad_norm": 2.010931968688965, "learning_rate": 1.590909090909091e-05, "loss": 1.1338, "step": 107 }, { "epoch": 1.4827586206896552, "eval_loss": 0.610894501209259, "eval_runtime": 15.4008, "eval_samples_per_second": 1.299, "eval_steps_per_second": 0.649, "step": 107 }, { "epoch": 1.4965517241379311, "grad_norm": 1.9721729755401611, "learning_rate": 1.8181818181818182e-05, "loss": 1.0459, "step": 108 }, { "epoch": 1.4965517241379311, "eval_loss": 0.606325089931488, "eval_runtime": 15.2962, "eval_samples_per_second": 1.308, "eval_steps_per_second": 0.654, "step": 108 }, { "epoch": 1.510344827586207, "grad_norm": 2.1253089904785156, "learning_rate": 2.0454545454545457e-05, "loss": 1.285, "step": 109 }, { "epoch": 1.510344827586207, "eval_loss": 0.6013532876968384, "eval_runtime": 15.3278, "eval_samples_per_second": 1.305, "eval_steps_per_second": 0.652, "step": 109 } ], "logging_steps": 1, "max_steps": 216, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3916678204158771e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }