{ "best_metric": 0.020535213872790337, "best_model_checkpoint": "/home/paperspace/Data/models/brasingh_publicis_f5f/llm3br256/checkpoint-410", "epoch": 4.96969696969697, "eval_steps": 5, "global_step": 410, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012121212121212121, "grad_norm": 0.17719489336013794, "learning_rate": 2.4390243902439027e-06, "loss": 0.103, "step": 1 }, { "epoch": 0.024242424242424242, "grad_norm": 0.1567779779434204, "learning_rate": 4.8780487804878055e-06, "loss": 0.11, "step": 2 }, { "epoch": 0.03636363636363636, "grad_norm": 0.1601039469242096, "learning_rate": 7.317073170731707e-06, "loss": 0.1104, "step": 3 }, { "epoch": 0.048484848484848485, "grad_norm": 0.16313816606998444, "learning_rate": 9.756097560975611e-06, "loss": 0.1076, "step": 4 }, { "epoch": 0.06060606060606061, "grad_norm": 0.15266162157058716, "learning_rate": 1.2195121951219513e-05, "loss": 0.1038, "step": 5 }, { "epoch": 0.06060606060606061, "eval_loss": 0.09789121896028519, "eval_runtime": 8.116, "eval_samples_per_second": 6.161, "eval_steps_per_second": 1.602, "step": 5 }, { "epoch": 0.07272727272727272, "grad_norm": 0.1329907476902008, "learning_rate": 1.4634146341463415e-05, "loss": 0.0995, "step": 6 }, { "epoch": 0.08484848484848485, "grad_norm": 0.09588994085788727, "learning_rate": 1.707317073170732e-05, "loss": 0.0833, "step": 7 }, { "epoch": 0.09696969696969697, "grad_norm": 0.07421080023050308, "learning_rate": 1.9512195121951222e-05, "loss": 0.0756, "step": 8 }, { "epoch": 0.10909090909090909, "grad_norm": 0.0636032298207283, "learning_rate": 2.1951219512195124e-05, "loss": 0.0681, "step": 9 }, { "epoch": 0.12121212121212122, "grad_norm": 0.07186830043792725, "learning_rate": 2.4390243902439026e-05, "loss": 0.0759, "step": 10 }, { "epoch": 0.12121212121212122, "eval_loss": 0.07647334039211273, "eval_runtime": 6.2195, "eval_samples_per_second": 8.039, "eval_steps_per_second": 2.09, "step": 10 }, { "epoch": 0.13333333333333333, "grad_norm": 0.07592587172985077, "learning_rate": 2.682926829268293e-05, "loss": 0.0757, "step": 11 }, { "epoch": 0.14545454545454545, "grad_norm": 0.057555243372917175, "learning_rate": 2.926829268292683e-05, "loss": 0.0733, "step": 12 }, { "epoch": 0.15757575757575756, "grad_norm": 0.04685232415795326, "learning_rate": 3.170731707317073e-05, "loss": 0.0751, "step": 13 }, { "epoch": 0.1696969696969697, "grad_norm": 0.04220229387283325, "learning_rate": 3.414634146341464e-05, "loss": 0.0784, "step": 14 }, { "epoch": 0.18181818181818182, "grad_norm": 0.050287775695323944, "learning_rate": 3.6585365853658535e-05, "loss": 0.069, "step": 15 }, { "epoch": 0.18181818181818182, "eval_loss": 0.06831522285938263, "eval_runtime": 6.1929, "eval_samples_per_second": 8.074, "eval_steps_per_second": 2.099, "step": 15 }, { "epoch": 0.19393939393939394, "grad_norm": 0.04325024411082268, "learning_rate": 3.9024390243902444e-05, "loss": 0.0691, "step": 16 }, { "epoch": 0.20606060606060606, "grad_norm": 0.037937626242637634, "learning_rate": 4.146341463414634e-05, "loss": 0.0813, "step": 17 }, { "epoch": 0.21818181818181817, "grad_norm": 0.03867847099900246, "learning_rate": 4.390243902439025e-05, "loss": 0.065, "step": 18 }, { "epoch": 0.23030303030303031, "grad_norm": 0.03792285919189453, "learning_rate": 4.634146341463415e-05, "loss": 0.0617, "step": 19 }, { "epoch": 0.24242424242424243, "grad_norm": 0.03528020903468132, "learning_rate": 4.878048780487805e-05, "loss": 0.0729, "step": 20 }, { "epoch": 0.24242424242424243, "eval_loss": 0.062009546905756, "eval_runtime": 6.1891, "eval_samples_per_second": 8.079, "eval_steps_per_second": 2.1, "step": 20 }, { "epoch": 0.2545454545454545, "grad_norm": 0.031467072665691376, "learning_rate": 5.121951219512195e-05, "loss": 0.0602, "step": 21 }, { "epoch": 0.26666666666666666, "grad_norm": 0.03714953735470772, "learning_rate": 5.365853658536586e-05, "loss": 0.0772, "step": 22 }, { "epoch": 0.2787878787878788, "grad_norm": 0.03779144585132599, "learning_rate": 5.6097560975609764e-05, "loss": 0.0584, "step": 23 }, { "epoch": 0.2909090909090909, "grad_norm": 0.030055589973926544, "learning_rate": 5.853658536585366e-05, "loss": 0.0568, "step": 24 }, { "epoch": 0.30303030303030304, "grad_norm": 0.029797468334436417, "learning_rate": 6.097560975609756e-05, "loss": 0.0545, "step": 25 }, { "epoch": 0.30303030303030304, "eval_loss": 0.057142239063978195, "eval_runtime": 6.2052, "eval_samples_per_second": 8.058, "eval_steps_per_second": 2.095, "step": 25 }, { "epoch": 0.3151515151515151, "grad_norm": 0.029303744435310364, "learning_rate": 6.341463414634146e-05, "loss": 0.0591, "step": 26 }, { "epoch": 0.32727272727272727, "grad_norm": 0.03735222667455673, "learning_rate": 6.585365853658538e-05, "loss": 0.0836, "step": 27 }, { "epoch": 0.3393939393939394, "grad_norm": 0.02950606681406498, "learning_rate": 6.829268292682928e-05, "loss": 0.0574, "step": 28 }, { "epoch": 0.3515151515151515, "grad_norm": 0.02479255013167858, "learning_rate": 7.073170731707317e-05, "loss": 0.0506, "step": 29 }, { "epoch": 0.36363636363636365, "grad_norm": 0.030447915196418762, "learning_rate": 7.317073170731707e-05, "loss": 0.0589, "step": 30 }, { "epoch": 0.36363636363636365, "eval_loss": 0.05275052413344383, "eval_runtime": 6.1946, "eval_samples_per_second": 8.072, "eval_steps_per_second": 2.099, "step": 30 }, { "epoch": 0.37575757575757573, "grad_norm": 0.029138660058379173, "learning_rate": 7.560975609756099e-05, "loss": 0.0536, "step": 31 }, { "epoch": 0.3878787878787879, "grad_norm": 0.029026566073298454, "learning_rate": 7.804878048780489e-05, "loss": 0.0454, "step": 32 }, { "epoch": 0.4, "grad_norm": 0.03538930043578148, "learning_rate": 8.048780487804879e-05, "loss": 0.0727, "step": 33 }, { "epoch": 0.4121212121212121, "grad_norm": 0.028354594483971596, "learning_rate": 8.292682926829268e-05, "loss": 0.0557, "step": 34 }, { "epoch": 0.42424242424242425, "grad_norm": 0.02743169106543064, "learning_rate": 8.53658536585366e-05, "loss": 0.0461, "step": 35 }, { "epoch": 0.42424242424242425, "eval_loss": 0.05005570873618126, "eval_runtime": 6.192, "eval_samples_per_second": 8.075, "eval_steps_per_second": 2.099, "step": 35 }, { "epoch": 0.43636363636363634, "grad_norm": 0.03530753031373024, "learning_rate": 8.78048780487805e-05, "loss": 0.0627, "step": 36 }, { "epoch": 0.4484848484848485, "grad_norm": 0.02797996811568737, "learning_rate": 9.02439024390244e-05, "loss": 0.0527, "step": 37 }, { "epoch": 0.46060606060606063, "grad_norm": 0.022809529677033424, "learning_rate": 9.26829268292683e-05, "loss": 0.0509, "step": 38 }, { "epoch": 0.4727272727272727, "grad_norm": 0.02468150481581688, "learning_rate": 9.51219512195122e-05, "loss": 0.0488, "step": 39 }, { "epoch": 0.48484848484848486, "grad_norm": 0.030917035415768623, "learning_rate": 9.75609756097561e-05, "loss": 0.0522, "step": 40 }, { "epoch": 0.48484848484848486, "eval_loss": 0.049276672303676605, "eval_runtime": 6.1874, "eval_samples_per_second": 8.081, "eval_steps_per_second": 2.101, "step": 40 }, { "epoch": 0.49696969696969695, "grad_norm": 0.026523206382989883, "learning_rate": 0.0001, "loss": 0.0463, "step": 41 }, { "epoch": 0.509090909090909, "grad_norm": 0.028745442628860474, "learning_rate": 9.999818789066165e-05, "loss": 0.0433, "step": 42 }, { "epoch": 0.5212121212121212, "grad_norm": 0.026402153074741364, "learning_rate": 9.999275169399614e-05, "loss": 0.0393, "step": 43 }, { "epoch": 0.5333333333333333, "grad_norm": 0.02671145275235176, "learning_rate": 9.998369180404283e-05, "loss": 0.044, "step": 44 }, { "epoch": 0.5454545454545454, "grad_norm": 0.034986190497875214, "learning_rate": 9.997100887750215e-05, "loss": 0.052, "step": 45 }, { "epoch": 0.5454545454545454, "eval_loss": 0.04825693741440773, "eval_runtime": 6.1973, "eval_samples_per_second": 8.068, "eval_steps_per_second": 2.098, "step": 45 }, { "epoch": 0.5575757575757576, "grad_norm": 0.029590139165520668, "learning_rate": 9.995470383368808e-05, "loss": 0.0436, "step": 46 }, { "epoch": 0.5696969696969697, "grad_norm": 0.03095312975347042, "learning_rate": 9.99347778544615e-05, "loss": 0.0431, "step": 47 }, { "epoch": 0.5818181818181818, "grad_norm": 0.030565602704882622, "learning_rate": 9.991123238414455e-05, "loss": 0.0526, "step": 48 }, { "epoch": 0.593939393939394, "grad_norm": 0.027898119762539864, "learning_rate": 9.98840691294159e-05, "loss": 0.0447, "step": 49 }, { "epoch": 0.6060606060606061, "grad_norm": 0.03219461813569069, "learning_rate": 9.985329005918702e-05, "loss": 0.0459, "step": 50 }, { "epoch": 0.6060606060606061, "eval_loss": 0.045825447887182236, "eval_runtime": 6.1916, "eval_samples_per_second": 8.075, "eval_steps_per_second": 2.1, "step": 50 }, { "epoch": 0.6181818181818182, "grad_norm": 0.02641221322119236, "learning_rate": 9.981889740445958e-05, "loss": 0.0417, "step": 51 }, { "epoch": 0.6303030303030303, "grad_norm": 0.028501464053988457, "learning_rate": 9.978089365816357e-05, "loss": 0.0446, "step": 52 }, { "epoch": 0.6424242424242425, "grad_norm": 0.0260939784348011, "learning_rate": 9.973928157497674e-05, "loss": 0.0451, "step": 53 }, { "epoch": 0.6545454545454545, "grad_norm": 0.029564740136265755, "learning_rate": 9.969406417112489e-05, "loss": 0.0416, "step": 54 }, { "epoch": 0.6666666666666666, "grad_norm": 0.027353042736649513, "learning_rate": 9.964524472416319e-05, "loss": 0.0363, "step": 55 }, { "epoch": 0.6666666666666666, "eval_loss": 0.0433911457657814, "eval_runtime": 6.2376, "eval_samples_per_second": 8.016, "eval_steps_per_second": 2.084, "step": 55 }, { "epoch": 0.6787878787878788, "grad_norm": 0.03154386952519417, "learning_rate": 9.95928267727387e-05, "loss": 0.0457, "step": 56 }, { "epoch": 0.6909090909090909, "grad_norm": 0.0249126385897398, "learning_rate": 9.953681411633376e-05, "loss": 0.0367, "step": 57 }, { "epoch": 0.703030303030303, "grad_norm": 0.02522316575050354, "learning_rate": 9.947721081499068e-05, "loss": 0.0428, "step": 58 }, { "epoch": 0.7151515151515152, "grad_norm": 0.028446340933442116, "learning_rate": 9.941402118901744e-05, "loss": 0.0456, "step": 59 }, { "epoch": 0.7272727272727273, "grad_norm": 0.0324234701693058, "learning_rate": 9.934724981867446e-05, "loss": 0.0553, "step": 60 }, { "epoch": 0.7272727272727273, "eval_loss": 0.04182567819952965, "eval_runtime": 6.2053, "eval_samples_per_second": 8.058, "eval_steps_per_second": 2.095, "step": 60 }, { "epoch": 0.7393939393939394, "grad_norm": 0.027509605512022972, "learning_rate": 9.927690154384273e-05, "loss": 0.0443, "step": 61 }, { "epoch": 0.7515151515151515, "grad_norm": 0.025798741728067398, "learning_rate": 9.920298146367286e-05, "loss": 0.0423, "step": 62 }, { "epoch": 0.7636363636363637, "grad_norm": 0.029940692707896233, "learning_rate": 9.912549493621554e-05, "loss": 0.0469, "step": 63 }, { "epoch": 0.7757575757575758, "grad_norm": 0.032555170357227325, "learning_rate": 9.904444757803321e-05, "loss": 0.0428, "step": 64 }, { "epoch": 0.7878787878787878, "grad_norm": 0.03051156736910343, "learning_rate": 9.895984526379281e-05, "loss": 0.0444, "step": 65 }, { "epoch": 0.7878787878787878, "eval_loss": 0.0403163880109787, "eval_runtime": 6.1935, "eval_samples_per_second": 8.073, "eval_steps_per_second": 2.099, "step": 65 }, { "epoch": 0.8, "grad_norm": 0.02734997309744358, "learning_rate": 9.887169412584011e-05, "loss": 0.0389, "step": 66 }, { "epoch": 0.8121212121212121, "grad_norm": 0.026902060955762863, "learning_rate": 9.878000055375512e-05, "loss": 0.0397, "step": 67 }, { "epoch": 0.8242424242424242, "grad_norm": 0.03240904584527016, "learning_rate": 9.868477119388896e-05, "loss": 0.0387, "step": 68 }, { "epoch": 0.8363636363636363, "grad_norm": 0.02606021985411644, "learning_rate": 9.858601294888213e-05, "loss": 0.0344, "step": 69 }, { "epoch": 0.8484848484848485, "grad_norm": 0.029814746230840683, "learning_rate": 9.848373297716414e-05, "loss": 0.0469, "step": 70 }, { "epoch": 0.8484848484848485, "eval_loss": 0.03973233327269554, "eval_runtime": 6.2308, "eval_samples_per_second": 8.025, "eval_steps_per_second": 2.086, "step": 70 }, { "epoch": 0.8606060606060606, "grad_norm": 0.025392569601535797, "learning_rate": 9.837793869243468e-05, "loss": 0.0388, "step": 71 }, { "epoch": 0.8727272727272727, "grad_norm": 0.03046100027859211, "learning_rate": 9.82686377631262e-05, "loss": 0.0415, "step": 72 }, { "epoch": 0.8848484848484849, "grad_norm": 0.02428356185555458, "learning_rate": 9.815583811184808e-05, "loss": 0.037, "step": 73 }, { "epoch": 0.896969696969697, "grad_norm": 0.029197214171290398, "learning_rate": 9.803954791481239e-05, "loss": 0.0408, "step": 74 }, { "epoch": 0.9090909090909091, "grad_norm": 0.027502721175551414, "learning_rate": 9.791977560124119e-05, "loss": 0.0417, "step": 75 }, { "epoch": 0.9090909090909091, "eval_loss": 0.038558006286621094, "eval_runtime": 6.2, "eval_samples_per_second": 8.065, "eval_steps_per_second": 2.097, "step": 75 }, { "epoch": 0.9212121212121213, "grad_norm": 0.030016757547855377, "learning_rate": 9.779652985275562e-05, "loss": 0.0427, "step": 76 }, { "epoch": 0.9333333333333333, "grad_norm": 0.029366502538323402, "learning_rate": 9.766981960274653e-05, "loss": 0.0312, "step": 77 }, { "epoch": 0.9454545454545454, "grad_norm": 0.02805924229323864, "learning_rate": 9.753965403572703e-05, "loss": 0.0424, "step": 78 }, { "epoch": 0.9575757575757575, "grad_norm": 0.027496378868818283, "learning_rate": 9.740604258666668e-05, "loss": 0.0368, "step": 79 }, { "epoch": 0.9696969696969697, "grad_norm": 0.02711924910545349, "learning_rate": 9.726899494030768e-05, "loss": 0.0388, "step": 80 }, { "epoch": 0.9696969696969697, "eval_loss": 0.037164073437452316, "eval_runtime": 6.2151, "eval_samples_per_second": 8.045, "eval_steps_per_second": 2.092, "step": 80 }, { "epoch": 0.9818181818181818, "grad_norm": 0.02877042628824711, "learning_rate": 9.71285210304628e-05, "loss": 0.0367, "step": 81 }, { "epoch": 0.9939393939393939, "grad_norm": 0.029804140329360962, "learning_rate": 9.698463103929542e-05, "loss": 0.0399, "step": 82 }, { "epoch": 1.006060606060606, "grad_norm": 0.04405470937490463, "learning_rate": 9.683733539658139e-05, "loss": 0.0545, "step": 83 }, { "epoch": 1.018181818181818, "grad_norm": 0.0315798744559288, "learning_rate": 9.66866447789531e-05, "loss": 0.048, "step": 84 }, { "epoch": 1.0303030303030303, "grad_norm": 0.02551027573645115, "learning_rate": 9.653257010912559e-05, "loss": 0.0309, "step": 85 }, { "epoch": 1.0303030303030303, "eval_loss": 0.03581343591213226, "eval_runtime": 6.2558, "eval_samples_per_second": 7.993, "eval_steps_per_second": 2.078, "step": 85 }, { "epoch": 1.0424242424242425, "grad_norm": 0.03550685569643974, "learning_rate": 9.637512255510475e-05, "loss": 0.0659, "step": 86 }, { "epoch": 1.0545454545454545, "grad_norm": 0.03085348755121231, "learning_rate": 9.621431352937789e-05, "loss": 0.0502, "step": 87 }, { "epoch": 1.0666666666666667, "grad_norm": 0.02470513805747032, "learning_rate": 9.605015468808651e-05, "loss": 0.0318, "step": 88 }, { "epoch": 1.0787878787878789, "grad_norm": 0.02803831174969673, "learning_rate": 9.58826579301814e-05, "loss": 0.0446, "step": 89 }, { "epoch": 1.0909090909090908, "grad_norm": 0.03941066190600395, "learning_rate": 9.571183539656011e-05, "loss": 0.0487, "step": 90 }, { "epoch": 1.0909090909090908, "eval_loss": 0.0354202575981617, "eval_runtime": 6.1921, "eval_samples_per_second": 8.075, "eval_steps_per_second": 2.099, "step": 90 }, { "epoch": 1.103030303030303, "grad_norm": 0.029008885845541954, "learning_rate": 9.553769946918697e-05, "loss": 0.0403, "step": 91 }, { "epoch": 1.1151515151515152, "grad_norm": 0.025633882731199265, "learning_rate": 9.536026277019561e-05, "loss": 0.032, "step": 92 }, { "epoch": 1.1272727272727272, "grad_norm": 0.02955947443842888, "learning_rate": 9.517953816097396e-05, "loss": 0.0366, "step": 93 }, { "epoch": 1.1393939393939394, "grad_norm": 0.029836708679795265, "learning_rate": 9.499553874123212e-05, "loss": 0.0383, "step": 94 }, { "epoch": 1.1515151515151516, "grad_norm": 0.030258659273386, "learning_rate": 9.480827784805278e-05, "loss": 0.0348, "step": 95 }, { "epoch": 1.1515151515151516, "eval_loss": 0.034031517803668976, "eval_runtime": 6.1911, "eval_samples_per_second": 8.076, "eval_steps_per_second": 2.1, "step": 95 }, { "epoch": 1.1636363636363636, "grad_norm": 0.02571636624634266, "learning_rate": 9.461776905492446e-05, "loss": 0.0322, "step": 96 }, { "epoch": 1.1757575757575758, "grad_norm": 0.025425300002098083, "learning_rate": 9.442402617075765e-05, "loss": 0.0302, "step": 97 }, { "epoch": 1.187878787878788, "grad_norm": 0.02790471538901329, "learning_rate": 9.422706323888397e-05, "loss": 0.0305, "step": 98 }, { "epoch": 1.2, "grad_norm": 0.031999390572309494, "learning_rate": 9.402689453603815e-05, "loss": 0.0384, "step": 99 }, { "epoch": 1.2121212121212122, "grad_norm": 0.02810075506567955, "learning_rate": 9.382353457132317e-05, "loss": 0.0308, "step": 100 }, { "epoch": 1.2121212121212122, "eval_loss": 0.03338392823934555, "eval_runtime": 6.1917, "eval_samples_per_second": 8.075, "eval_steps_per_second": 2.1, "step": 100 }, { "epoch": 1.2242424242424241, "grad_norm": 0.0302734412252903, "learning_rate": 9.361699808515876e-05, "loss": 0.0341, "step": 101 }, { "epoch": 1.2363636363636363, "grad_norm": 0.033730726689100266, "learning_rate": 9.340730004821266e-05, "loss": 0.0346, "step": 102 }, { "epoch": 1.2484848484848485, "grad_norm": 0.03323773667216301, "learning_rate": 9.31944556603157e-05, "loss": 0.0408, "step": 103 }, { "epoch": 1.2606060606060607, "grad_norm": 0.027124911546707153, "learning_rate": 9.297848034936006e-05, "loss": 0.0332, "step": 104 }, { "epoch": 1.2727272727272727, "grad_norm": 0.026853900402784348, "learning_rate": 9.275938977018081e-05, "loss": 0.0318, "step": 105 }, { "epoch": 1.2727272727272727, "eval_loss": 0.03301350772380829, "eval_runtime": 6.1968, "eval_samples_per_second": 8.069, "eval_steps_per_second": 2.098, "step": 105 }, { "epoch": 1.284848484848485, "grad_norm": 0.027320127934217453, "learning_rate": 9.253719980342135e-05, "loss": 0.0339, "step": 106 }, { "epoch": 1.2969696969696969, "grad_norm": 0.0313449464738369, "learning_rate": 9.231192655438221e-05, "loss": 0.0336, "step": 107 }, { "epoch": 1.309090909090909, "grad_norm": 0.029063764959573746, "learning_rate": 9.208358635185373e-05, "loss": 0.0324, "step": 108 }, { "epoch": 1.3212121212121213, "grad_norm": 0.03135693818330765, "learning_rate": 9.185219574693242e-05, "loss": 0.0332, "step": 109 }, { "epoch": 1.3333333333333333, "grad_norm": 0.0317191518843174, "learning_rate": 9.161777151182136e-05, "loss": 0.028, "step": 110 }, { "epoch": 1.3333333333333333, "eval_loss": 0.03218723088502884, "eval_runtime": 6.2118, "eval_samples_per_second": 8.049, "eval_steps_per_second": 2.093, "step": 110 }, { "epoch": 1.3454545454545455, "grad_norm": 0.031457044184207916, "learning_rate": 9.138033063861436e-05, "loss": 0.0346, "step": 111 }, { "epoch": 1.3575757575757577, "grad_norm": 0.031810589134693146, "learning_rate": 9.113989033806434e-05, "loss": 0.0283, "step": 112 }, { "epoch": 1.3696969696969696, "grad_norm": 0.030629124492406845, "learning_rate": 9.089646803833589e-05, "loss": 0.0246, "step": 113 }, { "epoch": 1.3818181818181818, "grad_norm": 0.030411459505558014, "learning_rate": 9.065008138374189e-05, "loss": 0.0317, "step": 114 }, { "epoch": 1.393939393939394, "grad_norm": 0.029815878719091415, "learning_rate": 9.040074823346465e-05, "loss": 0.0311, "step": 115 }, { "epoch": 1.393939393939394, "eval_loss": 0.032092493027448654, "eval_runtime": 6.1885, "eval_samples_per_second": 8.08, "eval_steps_per_second": 2.101, "step": 115 }, { "epoch": 1.406060606060606, "grad_norm": 0.030812319368124008, "learning_rate": 9.014848666026138e-05, "loss": 0.0389, "step": 116 }, { "epoch": 1.4181818181818182, "grad_norm": 0.02588343806564808, "learning_rate": 8.989331494915417e-05, "loss": 0.0287, "step": 117 }, { "epoch": 1.4303030303030302, "grad_norm": 0.02780727669596672, "learning_rate": 8.963525159610465e-05, "loss": 0.0265, "step": 118 }, { "epoch": 1.4424242424242424, "grad_norm": 0.026163380593061447, "learning_rate": 8.937431530667328e-05, "loss": 0.0262, "step": 119 }, { "epoch": 1.4545454545454546, "grad_norm": 0.0316736213862896, "learning_rate": 8.911052499466357e-05, "loss": 0.0382, "step": 120 }, { "epoch": 1.4545454545454546, "eval_loss": 0.031465690582990646, "eval_runtime": 6.1922, "eval_samples_per_second": 8.075, "eval_steps_per_second": 2.099, "step": 120 }, { "epoch": 1.4666666666666668, "grad_norm": 0.03706022724509239, "learning_rate": 8.884389978075098e-05, "loss": 0.0336, "step": 121 }, { "epoch": 1.4787878787878788, "grad_norm": 0.027684392407536507, "learning_rate": 8.857445899109715e-05, "loss": 0.0267, "step": 122 }, { "epoch": 1.490909090909091, "grad_norm": 0.02498454973101616, "learning_rate": 8.83022221559489e-05, "loss": 0.0258, "step": 123 }, { "epoch": 1.503030303030303, "grad_norm": 0.03206618130207062, "learning_rate": 8.80272090082227e-05, "loss": 0.0344, "step": 124 }, { "epoch": 1.5151515151515151, "grad_norm": 0.03329097852110863, "learning_rate": 8.774943948207426e-05, "loss": 0.0316, "step": 125 }, { "epoch": 1.5151515151515151, "eval_loss": 0.030392121523618698, "eval_runtime": 6.2076, "eval_samples_per_second": 8.055, "eval_steps_per_second": 2.094, "step": 125 }, { "epoch": 1.5272727272727273, "grad_norm": 0.029471345245838165, "learning_rate": 8.746893371145366e-05, "loss": 0.0279, "step": 126 }, { "epoch": 1.5393939393939395, "grad_norm": 0.030292104929685593, "learning_rate": 8.718571202864598e-05, "loss": 0.0292, "step": 127 }, { "epoch": 1.5515151515151515, "grad_norm": 0.028025031089782715, "learning_rate": 8.689979496279746e-05, "loss": 0.0296, "step": 128 }, { "epoch": 1.5636363636363635, "grad_norm": 0.027177123352885246, "learning_rate": 8.661120323842751e-05, "loss": 0.0286, "step": 129 }, { "epoch": 1.5757575757575757, "grad_norm": 0.03291260078549385, "learning_rate": 8.631995777392645e-05, "loss": 0.0278, "step": 130 }, { "epoch": 1.5757575757575757, "eval_loss": 0.029901880770921707, "eval_runtime": 6.1931, "eval_samples_per_second": 8.073, "eval_steps_per_second": 2.099, "step": 130 }, { "epoch": 1.587878787878788, "grad_norm": 0.027456866577267647, "learning_rate": 8.602607968003935e-05, "loss": 0.0277, "step": 131 }, { "epoch": 1.6, "grad_norm": 0.02367628738284111, "learning_rate": 8.572959025833573e-05, "loss": 0.023, "step": 132 }, { "epoch": 1.612121212121212, "grad_norm": 0.030250705778598785, "learning_rate": 8.543051099966558e-05, "loss": 0.0253, "step": 133 }, { "epoch": 1.6242424242424243, "grad_norm": 0.02687668427824974, "learning_rate": 8.512886358260162e-05, "loss": 0.0249, "step": 134 }, { "epoch": 1.6363636363636362, "grad_norm": 0.032938696444034576, "learning_rate": 8.482466987186785e-05, "loss": 0.0285, "step": 135 }, { "epoch": 1.6363636363636362, "eval_loss": 0.029229959473013878, "eval_runtime": 6.1897, "eval_samples_per_second": 8.078, "eval_steps_per_second": 2.1, "step": 135 }, { "epoch": 1.6484848484848484, "grad_norm": 0.02894946001470089, "learning_rate": 8.451795191675488e-05, "loss": 0.0268, "step": 136 }, { "epoch": 1.6606060606060606, "grad_norm": 0.03599061071872711, "learning_rate": 8.420873194952152e-05, "loss": 0.0351, "step": 137 }, { "epoch": 1.6727272727272728, "grad_norm": 0.031365521252155304, "learning_rate": 8.389703238378339e-05, "loss": 0.0309, "step": 138 }, { "epoch": 1.6848484848484848, "grad_norm": 0.024763284251093864, "learning_rate": 8.358287581288822e-05, "loss": 0.0244, "step": 139 }, { "epoch": 1.696969696969697, "grad_norm": 0.026635024696588516, "learning_rate": 8.326628500827826e-05, "loss": 0.0257, "step": 140 }, { "epoch": 1.696969696969697, "eval_loss": 0.02854442596435547, "eval_runtime": 6.2016, "eval_samples_per_second": 8.062, "eval_steps_per_second": 2.096, "step": 140 }, { "epoch": 1.709090909090909, "grad_norm": 0.03084694594144821, "learning_rate": 8.294728291783966e-05, "loss": 0.0301, "step": 141 }, { "epoch": 1.7212121212121212, "grad_norm": 0.024888882413506508, "learning_rate": 8.262589266423908e-05, "loss": 0.0254, "step": 142 }, { "epoch": 1.7333333333333334, "grad_norm": 0.02867315709590912, "learning_rate": 8.230213754324773e-05, "loss": 0.0338, "step": 143 }, { "epoch": 1.7454545454545456, "grad_norm": 0.030578091740608215, "learning_rate": 8.197604102205271e-05, "loss": 0.0265, "step": 144 }, { "epoch": 1.7575757575757576, "grad_norm": 0.025194313377141953, "learning_rate": 8.16476267375561e-05, "loss": 0.0244, "step": 145 }, { "epoch": 1.7575757575757576, "eval_loss": 0.028112677857279778, "eval_runtime": 6.1903, "eval_samples_per_second": 8.077, "eval_steps_per_second": 2.1, "step": 145 }, { "epoch": 1.7696969696969695, "grad_norm": 0.03511481732130051, "learning_rate": 8.131691849466153e-05, "loss": 0.0351, "step": 146 }, { "epoch": 1.7818181818181817, "grad_norm": 0.034265320748090744, "learning_rate": 8.098394026454885e-05, "loss": 0.0318, "step": 147 }, { "epoch": 1.793939393939394, "grad_norm": 0.02798490971326828, "learning_rate": 8.064871618293646e-05, "loss": 0.0258, "step": 148 }, { "epoch": 1.8060606060606061, "grad_norm": 0.031277846544981, "learning_rate": 8.03112705483319e-05, "loss": 0.0322, "step": 149 }, { "epoch": 1.8181818181818183, "grad_norm": 0.025352105498313904, "learning_rate": 7.997162782027061e-05, "loss": 0.0256, "step": 150 }, { "epoch": 1.8181818181818183, "eval_loss": 0.027805332094430923, "eval_runtime": 6.1948, "eval_samples_per_second": 8.071, "eval_steps_per_second": 2.099, "step": 150 }, { "epoch": 1.8303030303030303, "grad_norm": 0.03179726377129555, "learning_rate": 7.962981261754294e-05, "loss": 0.0265, "step": 151 }, { "epoch": 1.8424242424242423, "grad_norm": 0.02985468879342079, "learning_rate": 7.928584971640974e-05, "loss": 0.0302, "step": 152 }, { "epoch": 1.8545454545454545, "grad_norm": 0.031871821731328964, "learning_rate": 7.893976404880643e-05, "loss": 0.0331, "step": 153 }, { "epoch": 1.8666666666666667, "grad_norm": 0.028416186571121216, "learning_rate": 7.859158070053577e-05, "loss": 0.0245, "step": 154 }, { "epoch": 1.878787878787879, "grad_norm": 0.03054559975862503, "learning_rate": 7.824132490944967e-05, "loss": 0.0338, "step": 155 }, { "epoch": 1.878787878787879, "eval_loss": 0.027029650285840034, "eval_runtime": 6.235, "eval_samples_per_second": 8.019, "eval_steps_per_second": 2.085, "step": 155 }, { "epoch": 1.8909090909090909, "grad_norm": 0.028330031782388687, "learning_rate": 7.788902206361973e-05, "loss": 0.0241, "step": 156 }, { "epoch": 1.903030303030303, "grad_norm": 0.031616389751434326, "learning_rate": 7.7534697699497e-05, "loss": 0.0301, "step": 157 }, { "epoch": 1.915151515151515, "grad_norm": 0.027048081159591675, "learning_rate": 7.717837750006106e-05, "loss": 0.0274, "step": 158 }, { "epoch": 1.9272727272727272, "grad_norm": 0.028316281735897064, "learning_rate": 7.682008729295833e-05, "loss": 0.026, "step": 159 }, { "epoch": 1.9393939393939394, "grad_norm": 0.02987455017864704, "learning_rate": 7.645985304863003e-05, "loss": 0.0309, "step": 160 }, { "epoch": 1.9393939393939394, "eval_loss": 0.02624826692044735, "eval_runtime": 6.1867, "eval_samples_per_second": 8.082, "eval_steps_per_second": 2.101, "step": 160 }, { "epoch": 1.9515151515151516, "grad_norm": 0.02562532387673855, "learning_rate": 7.609770087842969e-05, "loss": 0.0275, "step": 161 }, { "epoch": 1.9636363636363636, "grad_norm": 0.026776108890771866, "learning_rate": 7.573365703273046e-05, "loss": 0.0263, "step": 162 }, { "epoch": 1.9757575757575756, "grad_norm": 0.031301844865083694, "learning_rate": 7.536774789902246e-05, "loss": 0.0293, "step": 163 }, { "epoch": 1.9878787878787878, "grad_norm": 0.02761393040418625, "learning_rate": 7.500000000000001e-05, "loss": 0.0292, "step": 164 }, { "epoch": 2.0, "grad_norm": 0.04325617477297783, "learning_rate": 7.463043999163919e-05, "loss": 0.0378, "step": 165 }, { "epoch": 2.0, "eval_loss": 0.02608395926654339, "eval_runtime": 6.2208, "eval_samples_per_second": 8.038, "eval_steps_per_second": 2.09, "step": 165 }, { "epoch": 2.012121212121212, "grad_norm": 0.026866400614380836, "learning_rate": 7.425909466126568e-05, "loss": 0.024, "step": 166 }, { "epoch": 2.0242424242424244, "grad_norm": 0.026734622195363045, "learning_rate": 7.388599092561315e-05, "loss": 0.0238, "step": 167 }, { "epoch": 2.036363636363636, "grad_norm": 0.02514388971030712, "learning_rate": 7.351115582887211e-05, "loss": 0.0218, "step": 168 }, { "epoch": 2.0484848484848484, "grad_norm": 0.02405986562371254, "learning_rate": 7.313461654072973e-05, "loss": 0.0199, "step": 169 }, { "epoch": 2.0606060606060606, "grad_norm": 0.030505580827593803, "learning_rate": 7.275640035440045e-05, "loss": 0.0275, "step": 170 }, { "epoch": 2.0606060606060606, "eval_loss": 0.026318900287151337, "eval_runtime": 6.2393, "eval_samples_per_second": 8.014, "eval_steps_per_second": 2.084, "step": 170 }, { "epoch": 2.0727272727272728, "grad_norm": 0.03722088038921356, "learning_rate": 7.237653468464756e-05, "loss": 0.0256, "step": 171 }, { "epoch": 2.084848484848485, "grad_norm": 0.03724412992596626, "learning_rate": 7.199504706579617e-05, "loss": 0.0226, "step": 172 }, { "epoch": 2.096969696969697, "grad_norm": 0.030355574563145638, "learning_rate": 7.161196514973734e-05, "loss": 0.0188, "step": 173 }, { "epoch": 2.109090909090909, "grad_norm": 0.03693992272019386, "learning_rate": 7.12273167039238e-05, "loss": 0.0232, "step": 174 }, { "epoch": 2.121212121212121, "grad_norm": 0.03164402395486832, "learning_rate": 7.084112960935716e-05, "loss": 0.0225, "step": 175 }, { "epoch": 2.121212121212121, "eval_loss": 0.025883661583065987, "eval_runtime": 6.2414, "eval_samples_per_second": 8.011, "eval_steps_per_second": 2.083, "step": 175 }, { "epoch": 2.1333333333333333, "grad_norm": 0.031605158001184464, "learning_rate": 7.045343185856701e-05, "loss": 0.0248, "step": 176 }, { "epoch": 2.1454545454545455, "grad_norm": 0.0310862734913826, "learning_rate": 7.006425155358195e-05, "loss": 0.0244, "step": 177 }, { "epoch": 2.1575757575757577, "grad_norm": 0.031485848128795624, "learning_rate": 6.967361690389258e-05, "loss": 0.0242, "step": 178 }, { "epoch": 2.16969696969697, "grad_norm": 0.03367177024483681, "learning_rate": 6.92815562244068e-05, "loss": 0.0246, "step": 179 }, { "epoch": 2.1818181818181817, "grad_norm": 0.028202077373862267, "learning_rate": 6.88880979333973e-05, "loss": 0.0232, "step": 180 }, { "epoch": 2.1818181818181817, "eval_loss": 0.025616060942411423, "eval_runtime": 6.1872, "eval_samples_per_second": 8.081, "eval_steps_per_second": 2.101, "step": 180 }, { "epoch": 2.193939393939394, "grad_norm": 0.03502137213945389, "learning_rate": 6.849327055044183e-05, "loss": 0.0251, "step": 181 }, { "epoch": 2.206060606060606, "grad_norm": 0.029362250119447708, "learning_rate": 6.809710269435589e-05, "loss": 0.022, "step": 182 }, { "epoch": 2.2181818181818183, "grad_norm": 0.033701106905937195, "learning_rate": 6.769962308111839e-05, "loss": 0.0234, "step": 183 }, { "epoch": 2.2303030303030305, "grad_norm": 0.03379302844405174, "learning_rate": 6.730086052179004e-05, "loss": 0.0221, "step": 184 }, { "epoch": 2.242424242424242, "grad_norm": 0.027100039646029472, "learning_rate": 6.690084392042513e-05, "loss": 0.0193, "step": 185 }, { "epoch": 2.242424242424242, "eval_loss": 0.025547849014401436, "eval_runtime": 6.2367, "eval_samples_per_second": 8.017, "eval_steps_per_second": 2.084, "step": 185 }, { "epoch": 2.2545454545454544, "grad_norm": 0.03181413188576698, "learning_rate": 6.649960227197647e-05, "loss": 0.0217, "step": 186 }, { "epoch": 2.2666666666666666, "grad_norm": 0.03648809716105461, "learning_rate": 6.609716466019356e-05, "loss": 0.0239, "step": 187 }, { "epoch": 2.278787878787879, "grad_norm": 0.0302013847976923, "learning_rate": 6.569356025551454e-05, "loss": 0.0232, "step": 188 }, { "epoch": 2.290909090909091, "grad_norm": 0.028094977140426636, "learning_rate": 6.528881831295188e-05, "loss": 0.02, "step": 189 }, { "epoch": 2.303030303030303, "grad_norm": 0.03214862942695618, "learning_rate": 6.488296816997173e-05, "loss": 0.0251, "step": 190 }, { "epoch": 2.303030303030303, "eval_loss": 0.02527759224176407, "eval_runtime": 6.19, "eval_samples_per_second": 8.078, "eval_steps_per_second": 2.1, "step": 190 }, { "epoch": 2.315151515151515, "grad_norm": 0.033984988927841187, "learning_rate": 6.447603924436744e-05, "loss": 0.0243, "step": 191 }, { "epoch": 2.327272727272727, "grad_norm": 0.027719179168343544, "learning_rate": 6.406806103212725e-05, "loss": 0.0204, "step": 192 }, { "epoch": 2.3393939393939394, "grad_norm": 0.029257657006382942, "learning_rate": 6.36590631052963e-05, "loss": 0.0232, "step": 193 }, { "epoch": 2.3515151515151516, "grad_norm": 0.050508007407188416, "learning_rate": 6.32490751098331e-05, "loss": 0.0324, "step": 194 }, { "epoch": 2.3636363636363638, "grad_norm": 0.029407154768705368, "learning_rate": 6.283812676346063e-05, "loss": 0.0228, "step": 195 }, { "epoch": 2.3636363636363638, "eval_loss": 0.024870626628398895, "eval_runtime": 6.191, "eval_samples_per_second": 8.076, "eval_steps_per_second": 2.1, "step": 195 }, { "epoch": 2.375757575757576, "grad_norm": 0.0258539617061615, "learning_rate": 6.242624785351236e-05, "loss": 0.0231, "step": 196 }, { "epoch": 2.3878787878787877, "grad_norm": 0.02586168795824051, "learning_rate": 6.201346823477303e-05, "loss": 0.0193, "step": 197 }, { "epoch": 2.4, "grad_norm": 0.029741084203124046, "learning_rate": 6.159981782731474e-05, "loss": 0.0227, "step": 198 }, { "epoch": 2.412121212121212, "grad_norm": 0.029881663620471954, "learning_rate": 6.118532661432812e-05, "loss": 0.0224, "step": 199 }, { "epoch": 2.4242424242424243, "grad_norm": 0.027224918827414513, "learning_rate": 6.0770024639949074e-05, "loss": 0.0195, "step": 200 }, { "epoch": 2.4242424242424243, "eval_loss": 0.024939175695180893, "eval_runtime": 6.1942, "eval_samples_per_second": 8.072, "eval_steps_per_second": 2.099, "step": 200 }, { "epoch": 2.4363636363636365, "grad_norm": 0.028513159602880478, "learning_rate": 6.0353942007081046e-05, "loss": 0.0198, "step": 201 }, { "epoch": 2.4484848484848483, "grad_norm": 0.028778916224837303, "learning_rate": 5.993710887521302e-05, "loss": 0.0184, "step": 202 }, { "epoch": 2.4606060606060605, "grad_norm": 0.03407447412610054, "learning_rate": 5.951955545823342e-05, "loss": 0.0207, "step": 203 }, { "epoch": 2.4727272727272727, "grad_norm": 0.033413201570510864, "learning_rate": 5.9101312022240106e-05, "loss": 0.0217, "step": 204 }, { "epoch": 2.484848484848485, "grad_norm": 0.031220227479934692, "learning_rate": 5.868240888334653e-05, "loss": 0.0219, "step": 205 }, { "epoch": 2.484848484848485, "eval_loss": 0.024136777967214584, "eval_runtime": 6.2, "eval_samples_per_second": 8.065, "eval_steps_per_second": 2.097, "step": 205 }, { "epoch": 2.496969696969697, "grad_norm": 0.0299720149487257, "learning_rate": 5.826287640548425e-05, "loss": 0.0231, "step": 206 }, { "epoch": 2.509090909090909, "grad_norm": 0.030199084430933, "learning_rate": 5.784274499820214e-05, "loss": 0.0243, "step": 207 }, { "epoch": 2.5212121212121215, "grad_norm": 0.03225167095661163, "learning_rate": 5.742204511446203e-05, "loss": 0.0241, "step": 208 }, { "epoch": 2.533333333333333, "grad_norm": 0.02794428914785385, "learning_rate": 5.700080724843147e-05, "loss": 0.0217, "step": 209 }, { "epoch": 2.5454545454545454, "grad_norm": 0.026055919006466866, "learning_rate": 5.657906193327325e-05, "loss": 0.0184, "step": 210 }, { "epoch": 2.5454545454545454, "eval_loss": 0.023839673027396202, "eval_runtime": 6.1855, "eval_samples_per_second": 8.083, "eval_steps_per_second": 2.102, "step": 210 }, { "epoch": 2.5575757575757576, "grad_norm": 0.03009297326207161, "learning_rate": 5.6156839738932343e-05, "loss": 0.0233, "step": 211 }, { "epoch": 2.56969696969697, "grad_norm": 0.038690801709890366, "learning_rate": 5.573417126992003e-05, "loss": 0.0419, "step": 212 }, { "epoch": 2.581818181818182, "grad_norm": 0.03184739127755165, "learning_rate": 5.531108716309547e-05, "loss": 0.0208, "step": 213 }, { "epoch": 2.5939393939393938, "grad_norm": 0.04226066172122955, "learning_rate": 5.4887618085445094e-05, "loss": 0.0356, "step": 214 }, { "epoch": 2.606060606060606, "grad_norm": 0.02787015587091446, "learning_rate": 5.446379473185972e-05, "loss": 0.0199, "step": 215 }, { "epoch": 2.606060606060606, "eval_loss": 0.023647097870707512, "eval_runtime": 6.1981, "eval_samples_per_second": 8.067, "eval_steps_per_second": 2.097, "step": 215 }, { "epoch": 2.618181818181818, "grad_norm": 0.028222182765603065, "learning_rate": 5.4039647822909624e-05, "loss": 0.0185, "step": 216 }, { "epoch": 2.6303030303030304, "grad_norm": 0.03137464076280594, "learning_rate": 5.361520810261779e-05, "loss": 0.0212, "step": 217 }, { "epoch": 2.6424242424242426, "grad_norm": 0.028826339170336723, "learning_rate": 5.319050633623142e-05, "loss": 0.0208, "step": 218 }, { "epoch": 2.6545454545454543, "grad_norm": 0.048953138291835785, "learning_rate": 5.2765573307992036e-05, "loss": 0.0343, "step": 219 }, { "epoch": 2.6666666666666665, "grad_norm": 0.03138812631368637, "learning_rate": 5.234043981890394e-05, "loss": 0.023, "step": 220 }, { "epoch": 2.6666666666666665, "eval_loss": 0.02315612882375717, "eval_runtime": 6.1852, "eval_samples_per_second": 8.084, "eval_steps_per_second": 2.102, "step": 220 }, { "epoch": 2.6787878787878787, "grad_norm": 0.03604348748922348, "learning_rate": 5.191513668450178e-05, "loss": 0.0208, "step": 221 }, { "epoch": 2.690909090909091, "grad_norm": 0.028721556067466736, "learning_rate": 5.14896947326168e-05, "loss": 0.0178, "step": 222 }, { "epoch": 2.703030303030303, "grad_norm": 0.02585718221962452, "learning_rate": 5.1064144801142374e-05, "loss": 0.019, "step": 223 }, { "epoch": 2.7151515151515153, "grad_norm": 0.02729875221848488, "learning_rate": 5.0638517735798696e-05, "loss": 0.0184, "step": 224 }, { "epoch": 2.7272727272727275, "grad_norm": 0.029812021180987358, "learning_rate": 5.021284438789694e-05, "loss": 0.0227, "step": 225 }, { "epoch": 2.7272727272727275, "eval_loss": 0.023449590429663658, "eval_runtime": 6.1841, "eval_samples_per_second": 8.085, "eval_steps_per_second": 2.102, "step": 225 }, { "epoch": 2.7393939393939393, "grad_norm": 0.02997618354856968, "learning_rate": 4.9787155612103074e-05, "loss": 0.0205, "step": 226 }, { "epoch": 2.7515151515151515, "grad_norm": 0.028398435562849045, "learning_rate": 4.936148226420132e-05, "loss": 0.0171, "step": 227 }, { "epoch": 2.7636363636363637, "grad_norm": 0.030046509578824043, "learning_rate": 4.893585519885764e-05, "loss": 0.0197, "step": 228 }, { "epoch": 2.775757575757576, "grad_norm": 0.029226917773485184, "learning_rate": 4.851030526738321e-05, "loss": 0.0204, "step": 229 }, { "epoch": 2.787878787878788, "grad_norm": 0.03432171046733856, "learning_rate": 4.8084863315498234e-05, "loss": 0.0206, "step": 230 }, { "epoch": 2.787878787878788, "eval_loss": 0.022967081516981125, "eval_runtime": 6.1905, "eval_samples_per_second": 8.077, "eval_steps_per_second": 2.1, "step": 230 }, { "epoch": 2.8, "grad_norm": 0.029804987832903862, "learning_rate": 4.765956018109607e-05, "loss": 0.0197, "step": 231 }, { "epoch": 2.812121212121212, "grad_norm": 0.029483767226338387, "learning_rate": 4.723442669200798e-05, "loss": 0.0213, "step": 232 }, { "epoch": 2.824242424242424, "grad_norm": 0.03142073005437851, "learning_rate": 4.680949366376858e-05, "loss": 0.0237, "step": 233 }, { "epoch": 2.8363636363636364, "grad_norm": 0.029062366113066673, "learning_rate": 4.638479189738224e-05, "loss": 0.0235, "step": 234 }, { "epoch": 2.8484848484848486, "grad_norm": 0.03055807389318943, "learning_rate": 4.5960352177090395e-05, "loss": 0.0217, "step": 235 }, { "epoch": 2.8484848484848486, "eval_loss": 0.022518714889883995, "eval_runtime": 6.1918, "eval_samples_per_second": 8.075, "eval_steps_per_second": 2.1, "step": 235 }, { "epoch": 2.8606060606060604, "grad_norm": 0.027205798774957657, "learning_rate": 4.5536205268140294e-05, "loss": 0.0189, "step": 236 }, { "epoch": 2.8727272727272726, "grad_norm": 0.025477448478341103, "learning_rate": 4.511238191455491e-05, "loss": 0.0166, "step": 237 }, { "epoch": 2.8848484848484848, "grad_norm": 0.025487707927823067, "learning_rate": 4.468891283690454e-05, "loss": 0.0183, "step": 238 }, { "epoch": 2.896969696969697, "grad_norm": 0.0332886204123497, "learning_rate": 4.4265828730079987e-05, "loss": 0.0221, "step": 239 }, { "epoch": 2.909090909090909, "grad_norm": 0.029150154441595078, "learning_rate": 4.3843160261067655e-05, "loss": 0.0186, "step": 240 }, { "epoch": 2.909090909090909, "eval_loss": 0.022352781146764755, "eval_runtime": 6.1961, "eval_samples_per_second": 8.07, "eval_steps_per_second": 2.098, "step": 240 }, { "epoch": 2.9212121212121214, "grad_norm": 0.029591498896479607, "learning_rate": 4.342093806672678e-05, "loss": 0.0181, "step": 241 }, { "epoch": 2.9333333333333336, "grad_norm": 0.03216252475976944, "learning_rate": 4.2999192751568564e-05, "loss": 0.0203, "step": 242 }, { "epoch": 2.9454545454545453, "grad_norm": 0.02891668863594532, "learning_rate": 4.2577954885537986e-05, "loss": 0.0181, "step": 243 }, { "epoch": 2.9575757575757575, "grad_norm": 0.028023086488246918, "learning_rate": 4.215725500179787e-05, "loss": 0.0191, "step": 244 }, { "epoch": 2.9696969696969697, "grad_norm": 0.03082926571369171, "learning_rate": 4.1737123594515756e-05, "loss": 0.0201, "step": 245 }, { "epoch": 2.9696969696969697, "eval_loss": 0.02198323793709278, "eval_runtime": 6.1948, "eval_samples_per_second": 8.071, "eval_steps_per_second": 2.099, "step": 245 }, { "epoch": 2.981818181818182, "grad_norm": 0.0321161188185215, "learning_rate": 4.131759111665349e-05, "loss": 0.0191, "step": 246 }, { "epoch": 2.993939393939394, "grad_norm": 0.026935642585158348, "learning_rate": 4.089868797775989e-05, "loss": 0.0185, "step": 247 }, { "epoch": 3.006060606060606, "grad_norm": 0.05411810800433159, "learning_rate": 4.0480444541766576e-05, "loss": 0.0301, "step": 248 }, { "epoch": 3.018181818181818, "grad_norm": 0.027184097096323967, "learning_rate": 4.0062891124787e-05, "loss": 0.0188, "step": 249 }, { "epoch": 3.0303030303030303, "grad_norm": 0.022542983293533325, "learning_rate": 3.964605799291897e-05, "loss": 0.0147, "step": 250 }, { "epoch": 3.0303030303030303, "eval_loss": 0.021969465538859367, "eval_runtime": 6.1917, "eval_samples_per_second": 8.075, "eval_steps_per_second": 2.1, "step": 250 }, { "epoch": 3.0424242424242425, "grad_norm": 0.03472661226987839, "learning_rate": 3.922997536005094e-05, "loss": 0.0178, "step": 251 }, { "epoch": 3.0545454545454547, "grad_norm": 0.028128741309046745, "learning_rate": 3.8814673385671894e-05, "loss": 0.0157, "step": 252 }, { "epoch": 3.066666666666667, "grad_norm": 0.03135592117905617, "learning_rate": 3.840018217268527e-05, "loss": 0.0161, "step": 253 }, { "epoch": 3.0787878787878786, "grad_norm": 0.03661385551095009, "learning_rate": 3.7986531765226964e-05, "loss": 0.0161, "step": 254 }, { "epoch": 3.090909090909091, "grad_norm": 0.03205974027514458, "learning_rate": 3.757375214648764e-05, "loss": 0.0142, "step": 255 }, { "epoch": 3.090909090909091, "eval_loss": 0.022621195763349533, "eval_runtime": 6.2467, "eval_samples_per_second": 8.004, "eval_steps_per_second": 2.081, "step": 255 }, { "epoch": 3.103030303030303, "grad_norm": 0.037527382373809814, "learning_rate": 3.716187323653939e-05, "loss": 0.0167, "step": 256 }, { "epoch": 3.1151515151515152, "grad_norm": 0.03540443629026413, "learning_rate": 3.675092489016693e-05, "loss": 0.0168, "step": 257 }, { "epoch": 3.1272727272727274, "grad_norm": 0.034389954060316086, "learning_rate": 3.634093689470371e-05, "loss": 0.017, "step": 258 }, { "epoch": 3.1393939393939396, "grad_norm": 0.033294420689344406, "learning_rate": 3.5931938967872766e-05, "loss": 0.016, "step": 259 }, { "epoch": 3.1515151515151514, "grad_norm": 0.028759747743606567, "learning_rate": 3.5523960755632574e-05, "loss": 0.0149, "step": 260 }, { "epoch": 3.1515151515151514, "eval_loss": 0.021824924275279045, "eval_runtime": 6.1966, "eval_samples_per_second": 8.069, "eval_steps_per_second": 2.098, "step": 260 }, { "epoch": 3.1636363636363636, "grad_norm": 0.02833370864391327, "learning_rate": 3.5117031830028274e-05, "loss": 0.0127, "step": 261 }, { "epoch": 3.175757575757576, "grad_norm": 0.0286524910479784, "learning_rate": 3.471118168704811e-05, "loss": 0.015, "step": 262 }, { "epoch": 3.187878787878788, "grad_norm": 0.02769540622830391, "learning_rate": 3.4306439744485454e-05, "loss": 0.0154, "step": 263 }, { "epoch": 3.2, "grad_norm": 0.03221355006098747, "learning_rate": 3.390283533980646e-05, "loss": 0.0167, "step": 264 }, { "epoch": 3.212121212121212, "grad_norm": 0.026392612606287003, "learning_rate": 3.350039772802354e-05, "loss": 0.0151, "step": 265 }, { "epoch": 3.212121212121212, "eval_loss": 0.02153392694890499, "eval_runtime": 6.1848, "eval_samples_per_second": 8.084, "eval_steps_per_second": 2.102, "step": 265 }, { "epoch": 3.224242424242424, "grad_norm": 0.03104759193956852, "learning_rate": 3.309915607957487e-05, "loss": 0.0171, "step": 266 }, { "epoch": 3.2363636363636363, "grad_norm": 0.028836429119110107, "learning_rate": 3.269913947820998e-05, "loss": 0.0158, "step": 267 }, { "epoch": 3.2484848484848485, "grad_norm": 0.032903432846069336, "learning_rate": 3.2300376918881624e-05, "loss": 0.0156, "step": 268 }, { "epoch": 3.2606060606060607, "grad_norm": 0.030551951378583908, "learning_rate": 3.1902897305644095e-05, "loss": 0.0134, "step": 269 }, { "epoch": 3.2727272727272725, "grad_norm": 0.030059080570936203, "learning_rate": 3.1506729449558184e-05, "loss": 0.0174, "step": 270 }, { "epoch": 3.2727272727272725, "eval_loss": 0.021679332479834557, "eval_runtime": 6.1964, "eval_samples_per_second": 8.069, "eval_steps_per_second": 2.098, "step": 270 }, { "epoch": 3.2848484848484847, "grad_norm": 0.030379703268408775, "learning_rate": 3.1111902066602724e-05, "loss": 0.018, "step": 271 }, { "epoch": 3.296969696969697, "grad_norm": 0.02761555276811123, "learning_rate": 3.071844377559323e-05, "loss": 0.016, "step": 272 }, { "epoch": 3.309090909090909, "grad_norm": 0.026775086298584938, "learning_rate": 3.0326383096107426e-05, "loss": 0.014, "step": 273 }, { "epoch": 3.3212121212121213, "grad_norm": 0.03328753635287285, "learning_rate": 2.9935748446418066e-05, "loss": 0.0169, "step": 274 }, { "epoch": 3.3333333333333335, "grad_norm": 0.03564237430691719, "learning_rate": 2.9546568141433006e-05, "loss": 0.0172, "step": 275 }, { "epoch": 3.3333333333333335, "eval_loss": 0.021324800327420235, "eval_runtime": 6.1942, "eval_samples_per_second": 8.072, "eval_steps_per_second": 2.099, "step": 275 }, { "epoch": 3.3454545454545457, "grad_norm": 0.028079047799110413, "learning_rate": 2.915887039064287e-05, "loss": 0.0141, "step": 276 }, { "epoch": 3.3575757575757574, "grad_norm": 0.02460673451423645, "learning_rate": 2.8772683296076196e-05, "loss": 0.0126, "step": 277 }, { "epoch": 3.3696969696969696, "grad_norm": 0.04345537722110748, "learning_rate": 2.8388034850262646e-05, "loss": 0.0376, "step": 278 }, { "epoch": 3.381818181818182, "grad_norm": 0.0306687094271183, "learning_rate": 2.8004952934203838e-05, "loss": 0.017, "step": 279 }, { "epoch": 3.393939393939394, "grad_norm": 0.033993735909461975, "learning_rate": 2.762346531535246e-05, "loss": 0.017, "step": 280 }, { "epoch": 3.393939393939394, "eval_loss": 0.02108747698366642, "eval_runtime": 6.2236, "eval_samples_per_second": 8.034, "eval_steps_per_second": 2.089, "step": 280 }, { "epoch": 3.4060606060606062, "grad_norm": 0.02444186620414257, "learning_rate": 2.7243599645599576e-05, "loss": 0.014, "step": 281 }, { "epoch": 3.418181818181818, "grad_norm": 0.028384167701005936, "learning_rate": 2.6865383459270265e-05, "loss": 0.0158, "step": 282 }, { "epoch": 3.43030303030303, "grad_norm": 0.03162846714258194, "learning_rate": 2.6488844171127903e-05, "loss": 0.017, "step": 283 }, { "epoch": 3.4424242424242424, "grad_norm": 0.027297567576169968, "learning_rate": 2.6114009074386846e-05, "loss": 0.0124, "step": 284 }, { "epoch": 3.4545454545454546, "grad_norm": 0.03545952960848808, "learning_rate": 2.574090533873431e-05, "loss": 0.0223, "step": 285 }, { "epoch": 3.4545454545454546, "eval_loss": 0.021236957982182503, "eval_runtime": 6.1899, "eval_samples_per_second": 8.078, "eval_steps_per_second": 2.1, "step": 285 }, { "epoch": 3.466666666666667, "grad_norm": 0.024587715044617653, "learning_rate": 2.5369560008360828e-05, "loss": 0.0132, "step": 286 }, { "epoch": 3.4787878787878785, "grad_norm": 0.025963526219129562, "learning_rate": 2.500000000000001e-05, "loss": 0.0132, "step": 287 }, { "epoch": 3.4909090909090907, "grad_norm": 0.03487967699766159, "learning_rate": 2.4632252100977566e-05, "loss": 0.0136, "step": 288 }, { "epoch": 3.503030303030303, "grad_norm": 0.030390363186597824, "learning_rate": 2.4266342967269552e-05, "loss": 0.0157, "step": 289 }, { "epoch": 3.515151515151515, "grad_norm": 0.030459538102149963, "learning_rate": 2.3902299121570333e-05, "loss": 0.0144, "step": 290 }, { "epoch": 3.515151515151515, "eval_loss": 0.02107882872223854, "eval_runtime": 6.2006, "eval_samples_per_second": 8.064, "eval_steps_per_second": 2.097, "step": 290 }, { "epoch": 3.5272727272727273, "grad_norm": 0.02951274998486042, "learning_rate": 2.354014695136997e-05, "loss": 0.0131, "step": 291 }, { "epoch": 3.5393939393939395, "grad_norm": 0.029193086549639702, "learning_rate": 2.317991270704167e-05, "loss": 0.0151, "step": 292 }, { "epoch": 3.5515151515151517, "grad_norm": 0.02726319245994091, "learning_rate": 2.282162249993895e-05, "loss": 0.0125, "step": 293 }, { "epoch": 3.5636363636363635, "grad_norm": 0.03212954103946686, "learning_rate": 2.246530230050301e-05, "loss": 0.0162, "step": 294 }, { "epoch": 3.5757575757575757, "grad_norm": 0.024175025522708893, "learning_rate": 2.211097793638029e-05, "loss": 0.0125, "step": 295 }, { "epoch": 3.5757575757575757, "eval_loss": 0.020796656608581543, "eval_runtime": 6.1933, "eval_samples_per_second": 8.073, "eval_steps_per_second": 2.099, "step": 295 }, { "epoch": 3.587878787878788, "grad_norm": 0.02679980918765068, "learning_rate": 2.175867509055033e-05, "loss": 0.0111, "step": 296 }, { "epoch": 3.6, "grad_norm": 0.031121132895350456, "learning_rate": 2.1408419299464245e-05, "loss": 0.0165, "step": 297 }, { "epoch": 3.6121212121212123, "grad_norm": 0.030764909461140633, "learning_rate": 2.106023595119358e-05, "loss": 0.0139, "step": 298 }, { "epoch": 3.624242424242424, "grad_norm": 0.0302122812718153, "learning_rate": 2.071415028359026e-05, "loss": 0.0148, "step": 299 }, { "epoch": 3.6363636363636362, "grad_norm": 0.029834948480129242, "learning_rate": 2.0370187382457068e-05, "loss": 0.0163, "step": 300 }, { "epoch": 3.6363636363636362, "eval_loss": 0.02069205790758133, "eval_runtime": 6.2054, "eval_samples_per_second": 8.057, "eval_steps_per_second": 2.095, "step": 300 }, { "epoch": 3.6484848484848484, "grad_norm": 0.03009135089814663, "learning_rate": 2.0028372179729403e-05, "loss": 0.0143, "step": 301 }, { "epoch": 3.6606060606060606, "grad_norm": 0.029639270156621933, "learning_rate": 1.9688729451668114e-05, "loss": 0.0168, "step": 302 }, { "epoch": 3.672727272727273, "grad_norm": 0.026824606582522392, "learning_rate": 1.935128381706355e-05, "loss": 0.0129, "step": 303 }, { "epoch": 3.6848484848484846, "grad_norm": 0.03427920117974281, "learning_rate": 1.901605973545116e-05, "loss": 0.0194, "step": 304 }, { "epoch": 3.6969696969696972, "grad_norm": 0.031160475686192513, "learning_rate": 1.868308150533847e-05, "loss": 0.015, "step": 305 }, { "epoch": 3.6969696969696972, "eval_loss": 0.020683376118540764, "eval_runtime": 6.1876, "eval_samples_per_second": 8.081, "eval_steps_per_second": 2.101, "step": 305 }, { "epoch": 3.709090909090909, "grad_norm": 0.033249229192733765, "learning_rate": 1.8352373262443916e-05, "loss": 0.0147, "step": 306 }, { "epoch": 3.721212121212121, "grad_norm": 0.029073260724544525, "learning_rate": 1.8023958977947304e-05, "loss": 0.0157, "step": 307 }, { "epoch": 3.7333333333333334, "grad_norm": 0.030520522966980934, "learning_rate": 1.7697862456752273e-05, "loss": 0.0152, "step": 308 }, { "epoch": 3.7454545454545456, "grad_norm": 0.029693983495235443, "learning_rate": 1.7374107335760936e-05, "loss": 0.0172, "step": 309 }, { "epoch": 3.757575757575758, "grad_norm": 0.03103681467473507, "learning_rate": 1.7052717082160346e-05, "loss": 0.0154, "step": 310 }, { "epoch": 3.757575757575758, "eval_loss": 0.02056981809437275, "eval_runtime": 6.1896, "eval_samples_per_second": 8.078, "eval_steps_per_second": 2.1, "step": 310 }, { "epoch": 3.7696969696969695, "grad_norm": 0.030047744512557983, "learning_rate": 1.673371499172174e-05, "loss": 0.015, "step": 311 }, { "epoch": 3.7818181818181817, "grad_norm": 0.03367823734879494, "learning_rate": 1.6417124187111775e-05, "loss": 0.017, "step": 312 }, { "epoch": 3.793939393939394, "grad_norm": 0.027037424966692924, "learning_rate": 1.610296761621662e-05, "loss": 0.0145, "step": 313 }, { "epoch": 3.806060606060606, "grad_norm": 0.030140092596411705, "learning_rate": 1.5791268050478486e-05, "loss": 0.0228, "step": 314 }, { "epoch": 3.8181818181818183, "grad_norm": 0.031016338616609573, "learning_rate": 1.5482048083245114e-05, "loss": 0.0186, "step": 315 }, { "epoch": 3.8181818181818183, "eval_loss": 0.02028246596455574, "eval_runtime": 6.1869, "eval_samples_per_second": 8.082, "eval_steps_per_second": 2.101, "step": 315 }, { "epoch": 3.83030303030303, "grad_norm": 0.02848219871520996, "learning_rate": 1.517533012813217e-05, "loss": 0.0159, "step": 316 }, { "epoch": 3.8424242424242423, "grad_norm": 0.024206412956118584, "learning_rate": 1.4871136417398406e-05, "loss": 0.0123, "step": 317 }, { "epoch": 3.8545454545454545, "grad_norm": 0.03030635416507721, "learning_rate": 1.4569489000334436e-05, "loss": 0.0137, "step": 318 }, { "epoch": 3.8666666666666667, "grad_norm": 0.03156241029500961, "learning_rate": 1.427040974166427e-05, "loss": 0.0159, "step": 319 }, { "epoch": 3.878787878787879, "grad_norm": 0.028942270204424858, "learning_rate": 1.3973920319960655e-05, "loss": 0.0135, "step": 320 }, { "epoch": 3.878787878787879, "eval_loss": 0.020162392407655716, "eval_runtime": 6.1914, "eval_samples_per_second": 8.076, "eval_steps_per_second": 2.1, "step": 320 }, { "epoch": 3.8909090909090907, "grad_norm": 0.03344618156552315, "learning_rate": 1.3680042226073552e-05, "loss": 0.0148, "step": 321 }, { "epoch": 3.9030303030303033, "grad_norm": 0.02961633913218975, "learning_rate": 1.3388796761572492e-05, "loss": 0.0141, "step": 322 }, { "epoch": 3.915151515151515, "grad_norm": 0.030708983540534973, "learning_rate": 1.310020503720254e-05, "loss": 0.0132, "step": 323 }, { "epoch": 3.9272727272727272, "grad_norm": 0.030072160065174103, "learning_rate": 1.2814287971354022e-05, "loss": 0.0161, "step": 324 }, { "epoch": 3.9393939393939394, "grad_norm": 0.03028644621372223, "learning_rate": 1.253106628854635e-05, "loss": 0.0159, "step": 325 }, { "epoch": 3.9393939393939394, "eval_loss": 0.020128030329942703, "eval_runtime": 6.1885, "eval_samples_per_second": 8.08, "eval_steps_per_second": 2.101, "step": 325 }, { "epoch": 3.9515151515151516, "grad_norm": 0.02972756326198578, "learning_rate": 1.2250560517925746e-05, "loss": 0.0142, "step": 326 }, { "epoch": 3.963636363636364, "grad_norm": 0.02873014286160469, "learning_rate": 1.1972790991777311e-05, "loss": 0.0155, "step": 327 }, { "epoch": 3.9757575757575756, "grad_norm": 0.028870223090052605, "learning_rate": 1.1697777844051105e-05, "loss": 0.0142, "step": 328 }, { "epoch": 3.987878787878788, "grad_norm": 0.02774449624121189, "learning_rate": 1.1425541008902851e-05, "loss": 0.0147, "step": 329 }, { "epoch": 4.0, "grad_norm": 0.04664117470383644, "learning_rate": 1.1156100219249022e-05, "loss": 0.0211, "step": 330 }, { "epoch": 4.0, "eval_loss": 0.01995450258255005, "eval_runtime": 6.191, "eval_samples_per_second": 8.076, "eval_steps_per_second": 2.1, "step": 330 }, { "epoch": 4.012121212121212, "grad_norm": 0.024339957162737846, "learning_rate": 1.0889475005336446e-05, "loss": 0.0133, "step": 331 }, { "epoch": 4.024242424242424, "grad_norm": 0.023921139538288116, "learning_rate": 1.0625684693326727e-05, "loss": 0.013, "step": 332 }, { "epoch": 4.036363636363636, "grad_norm": 0.02364080585539341, "learning_rate": 1.036474840389537e-05, "loss": 0.0132, "step": 333 }, { "epoch": 4.048484848484849, "grad_norm": 0.02356121875345707, "learning_rate": 1.0106685050845838e-05, "loss": 0.0121, "step": 334 }, { "epoch": 4.0606060606060606, "grad_norm": 0.025879928842186928, "learning_rate": 9.851513339738628e-06, "loss": 0.0134, "step": 335 }, { "epoch": 4.0606060606060606, "eval_loss": 0.0202109944075346, "eval_runtime": 6.1925, "eval_samples_per_second": 8.074, "eval_steps_per_second": 2.099, "step": 335 }, { "epoch": 4.072727272727272, "grad_norm": 0.02288251556456089, "learning_rate": 9.599251766535345e-06, "loss": 0.0121, "step": 336 }, { "epoch": 4.084848484848485, "grad_norm": 0.02703404612839222, "learning_rate": 9.349918616258114e-06, "loss": 0.0126, "step": 337 }, { "epoch": 4.096969696969697, "grad_norm": 0.025494417175650597, "learning_rate": 9.103531961664118e-06, "loss": 0.0122, "step": 338 }, { "epoch": 4.109090909090909, "grad_norm": 0.02807869389653206, "learning_rate": 8.860109661935674e-06, "loss": 0.0155, "step": 339 }, { "epoch": 4.121212121212121, "grad_norm": 0.027642810717225075, "learning_rate": 8.619669361385663e-06, "loss": 0.0113, "step": 340 }, { "epoch": 4.121212121212121, "eval_loss": 0.020561667159199715, "eval_runtime": 6.1965, "eval_samples_per_second": 8.069, "eval_steps_per_second": 2.098, "step": 340 }, { "epoch": 4.133333333333334, "grad_norm": 0.030346019193530083, "learning_rate": 8.38222848817864e-06, "loss": 0.0127, "step": 341 }, { "epoch": 4.1454545454545455, "grad_norm": 0.024746423587203026, "learning_rate": 8.14780425306758e-06, "loss": 0.0121, "step": 342 }, { "epoch": 4.157575757575757, "grad_norm": 0.026435259729623795, "learning_rate": 7.91641364814628e-06, "loss": 0.0109, "step": 343 }, { "epoch": 4.16969696969697, "grad_norm": 0.02962976135313511, "learning_rate": 7.688073445617799e-06, "loss": 0.0108, "step": 344 }, { "epoch": 4.181818181818182, "grad_norm": 0.02813326194882393, "learning_rate": 7.462800196578662e-06, "loss": 0.0117, "step": 345 }, { "epoch": 4.181818181818182, "eval_loss": 0.0208114180713892, "eval_runtime": 6.1943, "eval_samples_per_second": 8.072, "eval_steps_per_second": 2.099, "step": 345 }, { "epoch": 4.193939393939394, "grad_norm": 0.03237050771713257, "learning_rate": 7.240610229819195e-06, "loss": 0.013, "step": 346 }, { "epoch": 4.206060606060606, "grad_norm": 0.026286713778972626, "learning_rate": 7.0215196506399515e-06, "loss": 0.0108, "step": 347 }, { "epoch": 4.218181818181818, "grad_norm": 0.026608100160956383, "learning_rate": 6.8055443396842945e-06, "loss": 0.0103, "step": 348 }, { "epoch": 4.2303030303030305, "grad_norm": 0.03118029236793518, "learning_rate": 6.592699951787362e-06, "loss": 0.0138, "step": 349 }, { "epoch": 4.242424242424242, "grad_norm": 0.030633771792054176, "learning_rate": 6.3830019148412525e-06, "loss": 0.0108, "step": 350 }, { "epoch": 4.242424242424242, "eval_loss": 0.020906535908579826, "eval_runtime": 6.1873, "eval_samples_per_second": 8.081, "eval_steps_per_second": 2.101, "step": 350 }, { "epoch": 4.254545454545455, "grad_norm": 0.03377068042755127, "learning_rate": 6.17646542867682e-06, "loss": 0.0144, "step": 351 }, { "epoch": 4.266666666666667, "grad_norm": 0.027513034641742706, "learning_rate": 5.973105463961865e-06, "loss": 0.0107, "step": 352 }, { "epoch": 4.278787878787879, "grad_norm": 0.03077622503042221, "learning_rate": 5.772936761116027e-06, "loss": 0.0139, "step": 353 }, { "epoch": 4.290909090909091, "grad_norm": 0.026948513463139534, "learning_rate": 5.575973829242364e-06, "loss": 0.0126, "step": 354 }, { "epoch": 4.303030303030303, "grad_norm": 0.02911302261054516, "learning_rate": 5.382230945075556e-06, "loss": 0.012, "step": 355 }, { "epoch": 4.303030303030303, "eval_loss": 0.020746439695358276, "eval_runtime": 6.1873, "eval_samples_per_second": 8.081, "eval_steps_per_second": 2.101, "step": 355 }, { "epoch": 4.315151515151515, "grad_norm": 0.02364683710038662, "learning_rate": 5.191722151947226e-06, "loss": 0.0099, "step": 356 }, { "epoch": 4.327272727272727, "grad_norm": 0.031056983396410942, "learning_rate": 5.004461258767873e-06, "loss": 0.0143, "step": 357 }, { "epoch": 4.33939393939394, "grad_norm": 0.02985430881381035, "learning_rate": 4.820461839026047e-06, "loss": 0.0126, "step": 358 }, { "epoch": 4.351515151515152, "grad_norm": 0.02651560679078102, "learning_rate": 4.639737229804403e-06, "loss": 0.0115, "step": 359 }, { "epoch": 4.363636363636363, "grad_norm": 0.026479771360754967, "learning_rate": 4.462300530813024e-06, "loss": 0.0111, "step": 360 }, { "epoch": 4.363636363636363, "eval_loss": 0.020565090700984, "eval_runtime": 6.2012, "eval_samples_per_second": 8.063, "eval_steps_per_second": 2.096, "step": 360 }, { "epoch": 4.375757575757576, "grad_norm": 0.02071288600564003, "learning_rate": 4.2881646034398925e-06, "loss": 0.0114, "step": 361 }, { "epoch": 4.387878787878788, "grad_norm": 0.02533382549881935, "learning_rate": 4.117342069818603e-06, "loss": 0.0113, "step": 362 }, { "epoch": 4.4, "grad_norm": 0.021412553265690804, "learning_rate": 3.949845311913492e-06, "loss": 0.0115, "step": 363 }, { "epoch": 4.412121212121212, "grad_norm": 0.024555562064051628, "learning_rate": 3.7856864706221185e-06, "loss": 0.0098, "step": 364 }, { "epoch": 4.424242424242424, "grad_norm": 0.027632344514131546, "learning_rate": 3.6248774448952695e-06, "loss": 0.0118, "step": 365 }, { "epoch": 4.424242424242424, "eval_loss": 0.02050224132835865, "eval_runtime": 6.1859, "eval_samples_per_second": 8.083, "eval_steps_per_second": 2.102, "step": 365 }, { "epoch": 4.4363636363636365, "grad_norm": 0.028998758643865585, "learning_rate": 3.467429890874424e-06, "loss": 0.0111, "step": 366 }, { "epoch": 4.448484848484848, "grad_norm": 0.027949200943112373, "learning_rate": 3.3133552210468875e-06, "loss": 0.0122, "step": 367 }, { "epoch": 4.460606060606061, "grad_norm": 0.029877539724111557, "learning_rate": 3.162664603418608e-06, "loss": 0.0136, "step": 368 }, { "epoch": 4.472727272727273, "grad_norm": 0.02742207795381546, "learning_rate": 3.0153689607045845e-06, "loss": 0.0116, "step": 369 }, { "epoch": 4.484848484848484, "grad_norm": 0.02200300246477127, "learning_rate": 2.871478969537206e-06, "loss": 0.0099, "step": 370 }, { "epoch": 4.484848484848484, "eval_loss": 0.02058413252234459, "eval_runtime": 6.1924, "eval_samples_per_second": 8.074, "eval_steps_per_second": 2.099, "step": 370 }, { "epoch": 4.496969696969697, "grad_norm": 0.029120702296495438, "learning_rate": 2.731005059692332e-06, "loss": 0.0212, "step": 371 }, { "epoch": 4.509090909090909, "grad_norm": 0.030737141147255898, "learning_rate": 2.5939574133333312e-06, "loss": 0.0275, "step": 372 }, { "epoch": 4.5212121212121215, "grad_norm": 0.02619299292564392, "learning_rate": 2.4603459642729863e-06, "loss": 0.0104, "step": 373 }, { "epoch": 4.533333333333333, "grad_norm": 0.02171757072210312, "learning_rate": 2.330180397253473e-06, "loss": 0.0099, "step": 374 }, { "epoch": 4.545454545454545, "grad_norm": 0.022865932434797287, "learning_rate": 2.203470147244385e-06, "loss": 0.0118, "step": 375 }, { "epoch": 4.545454545454545, "eval_loss": 0.020584262907505035, "eval_runtime": 6.193, "eval_samples_per_second": 8.074, "eval_steps_per_second": 2.099, "step": 375 }, { "epoch": 4.557575757575758, "grad_norm": 0.02180948108434677, "learning_rate": 2.0802243987588066e-06, "loss": 0.0104, "step": 376 }, { "epoch": 4.569696969696969, "grad_norm": 0.027300819754600525, "learning_rate": 1.9604520851876198e-06, "loss": 0.0121, "step": 377 }, { "epoch": 4.581818181818182, "grad_norm": 0.027127033099532127, "learning_rate": 1.8441618881519184e-06, "loss": 0.0113, "step": 378 }, { "epoch": 4.593939393939394, "grad_norm": 0.026878971606492996, "learning_rate": 1.7313622368738014e-06, "loss": 0.0105, "step": 379 }, { "epoch": 4.606060606060606, "grad_norm": 0.02648119069635868, "learning_rate": 1.6220613075653202e-06, "loss": 0.0119, "step": 380 }, { "epoch": 4.606060606060606, "eval_loss": 0.02056843228638172, "eval_runtime": 6.1957, "eval_samples_per_second": 8.07, "eval_steps_per_second": 2.098, "step": 380 }, { "epoch": 4.618181818181818, "grad_norm": 0.026216818019747734, "learning_rate": 1.51626702283586e-06, "loss": 0.0098, "step": 381 }, { "epoch": 4.63030303030303, "grad_norm": 0.02351340465247631, "learning_rate": 1.4139870511178766e-06, "loss": 0.0116, "step": 382 }, { "epoch": 4.642424242424243, "grad_norm": 0.030724933370947838, "learning_rate": 1.3152288061110518e-06, "loss": 0.0101, "step": 383 }, { "epoch": 4.654545454545454, "grad_norm": 0.02688099816441536, "learning_rate": 1.2199994462448904e-06, "loss": 0.0113, "step": 384 }, { "epoch": 4.666666666666667, "grad_norm": 0.02722257934510708, "learning_rate": 1.128305874159896e-06, "loss": 0.0114, "step": 385 }, { "epoch": 4.666666666666667, "eval_loss": 0.02059413306415081, "eval_runtime": 6.2059, "eval_samples_per_second": 8.057, "eval_steps_per_second": 2.095, "step": 385 }, { "epoch": 4.678787878787879, "grad_norm": 0.02383565530180931, "learning_rate": 1.040154736207194e-06, "loss": 0.0106, "step": 386 }, { "epoch": 4.690909090909091, "grad_norm": 0.028037432581186295, "learning_rate": 9.555524219667989e-07, "loss": 0.0122, "step": 387 }, { "epoch": 4.703030303030303, "grad_norm": 0.021838784217834473, "learning_rate": 8.745050637844532e-07, "loss": 0.0097, "step": 388 }, { "epoch": 4.715151515151515, "grad_norm": 0.030912073329091072, "learning_rate": 7.970185363271431e-07, "loss": 0.0121, "step": 389 }, { "epoch": 4.7272727272727275, "grad_norm": 0.02470664493739605, "learning_rate": 7.230984561572729e-07, "loss": 0.0109, "step": 390 }, { "epoch": 4.7272727272727275, "eval_loss": 0.020591916516423225, "eval_runtime": 6.1901, "eval_samples_per_second": 8.077, "eval_steps_per_second": 2.1, "step": 390 }, { "epoch": 4.739393939393939, "grad_norm": 0.030810924246907234, "learning_rate": 6.527501813255344e-07, "loss": 0.0165, "step": 391 }, { "epoch": 4.751515151515152, "grad_norm": 0.025043383240699768, "learning_rate": 5.859788109825793e-07, "loss": 0.0096, "step": 392 }, { "epoch": 4.763636363636364, "grad_norm": 0.028706299141049385, "learning_rate": 5.227891850093314e-07, "loss": 0.0129, "step": 393 }, { "epoch": 4.775757575757575, "grad_norm": 0.0277020912617445, "learning_rate": 4.6318588366625616e-07, "loss": 0.0122, "step": 394 }, { "epoch": 4.787878787878788, "grad_norm": 0.02909735217690468, "learning_rate": 4.071732272613149e-07, "loss": 0.0124, "step": 395 }, { "epoch": 4.787878787878788, "eval_loss": 0.020531287416815758, "eval_runtime": 6.1957, "eval_samples_per_second": 8.07, "eval_steps_per_second": 2.098, "step": 395 }, { "epoch": 4.8, "grad_norm": 0.024658478796482086, "learning_rate": 3.5475527583681e-07, "loss": 0.0112, "step": 396 }, { "epoch": 4.8121212121212125, "grad_norm": 0.023081207647919655, "learning_rate": 3.059358288751202e-07, "loss": 0.0102, "step": 397 }, { "epoch": 4.824242424242424, "grad_norm": 0.03320831060409546, "learning_rate": 2.6071842502326527e-07, "loss": 0.0119, "step": 398 }, { "epoch": 4.836363636363636, "grad_norm": 0.025276964530348778, "learning_rate": 2.1910634183644474e-07, "loss": 0.0117, "step": 399 }, { "epoch": 4.848484848484849, "grad_norm": 0.02636777050793171, "learning_rate": 1.811025955404333e-07, "loss": 0.0111, "step": 400 }, { "epoch": 4.848484848484849, "eval_loss": 0.020571600645780563, "eval_runtime": 6.1951, "eval_samples_per_second": 8.071, "eval_steps_per_second": 2.098, "step": 400 }, { "epoch": 4.86060606060606, "grad_norm": 0.0283295139670372, "learning_rate": 1.4670994081297795e-07, "loss": 0.0141, "step": 401 }, { "epoch": 4.872727272727273, "grad_norm": 0.025880116969347, "learning_rate": 1.1593087058410779e-07, "loss": 0.0113, "step": 402 }, { "epoch": 4.884848484848485, "grad_norm": 0.02684679627418518, "learning_rate": 8.876761585545068e-08, "loss": 0.0104, "step": 403 }, { "epoch": 4.8969696969696965, "grad_norm": 0.027327047660946846, "learning_rate": 6.522214553850159e-08, "loss": 0.0122, "step": 404 }, { "epoch": 4.909090909090909, "grad_norm": 0.033394601196050644, "learning_rate": 4.529616631193112e-08, "loss": 0.012, "step": 405 }, { "epoch": 4.909090909090909, "eval_loss": 0.020557112991809845, "eval_runtime": 6.1866, "eval_samples_per_second": 8.082, "eval_steps_per_second": 2.101, "step": 405 }, { "epoch": 4.921212121212121, "grad_norm": 0.02917388454079628, "learning_rate": 2.899112249786229e-08, "loss": 0.0115, "step": 406 }, { "epoch": 4.933333333333334, "grad_norm": 0.032552849501371384, "learning_rate": 1.6308195957182027e-08, "loss": 0.0125, "step": 407 }, { "epoch": 4.945454545454545, "grad_norm": 0.026771927252411842, "learning_rate": 7.248306003865279e-09, "loss": 0.0123, "step": 408 }, { "epoch": 4.957575757575757, "grad_norm": 0.024449503049254417, "learning_rate": 1.8121093383671738e-09, "loss": 0.0117, "step": 409 }, { "epoch": 4.96969696969697, "grad_norm": 0.02603001333773136, "learning_rate": 0.0, "loss": 0.0104, "step": 410 }, { "epoch": 4.96969696969697, "eval_loss": 0.020535213872790337, "eval_runtime": 6.2062, "eval_samples_per_second": 8.056, "eval_steps_per_second": 2.095, "step": 410 } ], "logging_steps": 1, "max_steps": 410, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.27193584892674e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }