{ "best_metric": 0.44082728028297424, "best_model_checkpoint": "data/Llama-31-8B_task-3_120-samples_config-3/checkpoint-264", "epoch": 31.0, "eval_steps": 500, "global_step": 341, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09090909090909091, "grad_norm": 3.931244373321533, "learning_rate": 6.060606060606061e-08, "loss": 3.095, "step": 1 }, { "epoch": 0.18181818181818182, "grad_norm": 2.530280828475952, "learning_rate": 1.2121212121212122e-07, "loss": 2.5399, "step": 2 }, { "epoch": 0.36363636363636365, "grad_norm": 2.5952248573303223, "learning_rate": 2.4242424242424244e-07, "loss": 2.5861, "step": 4 }, { "epoch": 0.5454545454545454, "grad_norm": 2.0955796241760254, "learning_rate": 3.6363636363636366e-07, "loss": 2.4442, "step": 6 }, { "epoch": 0.7272727272727273, "grad_norm": 3.4685494899749756, "learning_rate": 4.848484848484849e-07, "loss": 2.4938, "step": 8 }, { "epoch": 0.9090909090909091, "grad_norm": 3.055572509765625, "learning_rate": 6.060606060606061e-07, "loss": 2.3729, "step": 10 }, { "epoch": 1.0, "eval_loss": 2.4972100257873535, "eval_runtime": 21.0088, "eval_samples_per_second": 1.142, "eval_steps_per_second": 1.142, "step": 11 }, { "epoch": 1.0909090909090908, "grad_norm": 2.5200321674346924, "learning_rate": 7.272727272727273e-07, "loss": 2.4004, "step": 12 }, { "epoch": 1.2727272727272727, "grad_norm": 2.5053160190582275, "learning_rate": 8.484848484848486e-07, "loss": 2.3639, "step": 14 }, { "epoch": 1.4545454545454546, "grad_norm": 2.5543558597564697, "learning_rate": 9.696969696969698e-07, "loss": 2.7736, "step": 16 }, { "epoch": 1.6363636363636362, "grad_norm": 2.6235134601593018, "learning_rate": 1.090909090909091e-06, "loss": 2.3351, "step": 18 }, { "epoch": 1.8181818181818183, "grad_norm": 2.2782557010650635, "learning_rate": 1.2121212121212122e-06, "loss": 2.4073, "step": 20 }, { "epoch": 2.0, "grad_norm": 3.364241123199463, "learning_rate": 1.3333333333333334e-06, "loss": 2.6938, "step": 22 }, { "epoch": 2.0, "eval_loss": 2.457120180130005, "eval_runtime": 20.9973, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 22 }, { "epoch": 2.1818181818181817, "grad_norm": 2.8939199447631836, "learning_rate": 1.4545454545454546e-06, "loss": 2.6378, "step": 24 }, { "epoch": 2.3636363636363638, "grad_norm": 1.724478006362915, "learning_rate": 1.5757575757575759e-06, "loss": 2.3463, "step": 26 }, { "epoch": 2.5454545454545454, "grad_norm": 3.102917194366455, "learning_rate": 1.6969696969696973e-06, "loss": 2.3407, "step": 28 }, { "epoch": 2.7272727272727275, "grad_norm": 2.207613945007324, "learning_rate": 1.8181818181818183e-06, "loss": 2.4054, "step": 30 }, { "epoch": 2.909090909090909, "grad_norm": 3.5173332691192627, "learning_rate": 1.9393939393939395e-06, "loss": 2.6474, "step": 32 }, { "epoch": 3.0, "eval_loss": 2.3881404399871826, "eval_runtime": 20.9969, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 33 }, { "epoch": 3.090909090909091, "grad_norm": 2.588070869445801, "learning_rate": 2.0606060606060607e-06, "loss": 2.2888, "step": 34 }, { "epoch": 3.2727272727272725, "grad_norm": 3.921119451522827, "learning_rate": 2.181818181818182e-06, "loss": 2.858, "step": 36 }, { "epoch": 3.4545454545454546, "grad_norm": 3.1361024379730225, "learning_rate": 2.303030303030303e-06, "loss": 2.292, "step": 38 }, { "epoch": 3.6363636363636362, "grad_norm": 2.6496403217315674, "learning_rate": 2.4242424242424244e-06, "loss": 2.2874, "step": 40 }, { "epoch": 3.8181818181818183, "grad_norm": 1.7044334411621094, "learning_rate": 2.5454545454545456e-06, "loss": 2.0497, "step": 42 }, { "epoch": 4.0, "grad_norm": 2.6162171363830566, "learning_rate": 2.666666666666667e-06, "loss": 2.2763, "step": 44 }, { "epoch": 4.0, "eval_loss": 2.264249801635742, "eval_runtime": 21.0004, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 44 }, { "epoch": 4.181818181818182, "grad_norm": 3.5348732471466064, "learning_rate": 2.7878787878787885e-06, "loss": 2.486, "step": 46 }, { "epoch": 4.363636363636363, "grad_norm": 1.8792097568511963, "learning_rate": 2.9090909090909093e-06, "loss": 2.0279, "step": 48 }, { "epoch": 4.545454545454545, "grad_norm": 3.1981241703033447, "learning_rate": 3.0303030303030305e-06, "loss": 2.279, "step": 50 }, { "epoch": 4.7272727272727275, "grad_norm": 2.5685250759124756, "learning_rate": 3.1515151515151517e-06, "loss": 2.1978, "step": 52 }, { "epoch": 4.909090909090909, "grad_norm": 2.161813497543335, "learning_rate": 3.272727272727273e-06, "loss": 2.0268, "step": 54 }, { "epoch": 5.0, "eval_loss": 2.069439649581909, "eval_runtime": 20.9965, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 55 }, { "epoch": 5.090909090909091, "grad_norm": 2.084135055541992, "learning_rate": 3.3939393939393946e-06, "loss": 1.9478, "step": 56 }, { "epoch": 5.2727272727272725, "grad_norm": 2.176788806915283, "learning_rate": 3.5151515151515154e-06, "loss": 1.9753, "step": 58 }, { "epoch": 5.454545454545454, "grad_norm": 4.001251220703125, "learning_rate": 3.6363636363636366e-06, "loss": 2.1318, "step": 60 }, { "epoch": 5.636363636363637, "grad_norm": 3.4586737155914307, "learning_rate": 3.757575757575758e-06, "loss": 1.9426, "step": 62 }, { "epoch": 5.818181818181818, "grad_norm": 3.11710786819458, "learning_rate": 3.878787878787879e-06, "loss": 1.9089, "step": 64 }, { "epoch": 6.0, "grad_norm": 2.0227549076080322, "learning_rate": 4.000000000000001e-06, "loss": 1.7309, "step": 66 }, { "epoch": 6.0, "eval_loss": 1.7871297597885132, "eval_runtime": 20.9996, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 66 }, { "epoch": 6.181818181818182, "grad_norm": 3.2912845611572266, "learning_rate": 4.1212121212121215e-06, "loss": 1.8532, "step": 68 }, { "epoch": 6.363636363636363, "grad_norm": 2.7779335975646973, "learning_rate": 4.242424242424243e-06, "loss": 1.7371, "step": 70 }, { "epoch": 6.545454545454545, "grad_norm": 1.3037850856781006, "learning_rate": 4.363636363636364e-06, "loss": 1.4541, "step": 72 }, { "epoch": 6.7272727272727275, "grad_norm": 2.257749557495117, "learning_rate": 4.4848484848484855e-06, "loss": 1.5641, "step": 74 }, { "epoch": 6.909090909090909, "grad_norm": 2.7067761421203613, "learning_rate": 4.606060606060606e-06, "loss": 1.4481, "step": 76 }, { "epoch": 7.0, "eval_loss": 1.4330077171325684, "eval_runtime": 21.0001, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 77 }, { "epoch": 7.090909090909091, "grad_norm": 1.7599331140518188, "learning_rate": 4.727272727272728e-06, "loss": 1.3604, "step": 78 }, { "epoch": 7.2727272727272725, "grad_norm": 1.4922908544540405, "learning_rate": 4.848484848484849e-06, "loss": 1.3767, "step": 80 }, { "epoch": 7.454545454545454, "grad_norm": 2.4537432193756104, "learning_rate": 4.9696969696969696e-06, "loss": 1.2013, "step": 82 }, { "epoch": 7.636363636363637, "grad_norm": 2.7686843872070312, "learning_rate": 5.090909090909091e-06, "loss": 1.1578, "step": 84 }, { "epoch": 7.818181818181818, "grad_norm": 2.3368959426879883, "learning_rate": 5.212121212121213e-06, "loss": 1.1423, "step": 86 }, { "epoch": 8.0, "grad_norm": 1.5859788656234741, "learning_rate": 5.333333333333334e-06, "loss": 1.0554, "step": 88 }, { "epoch": 8.0, "eval_loss": 1.0675058364868164, "eval_runtime": 21.0113, "eval_samples_per_second": 1.142, "eval_steps_per_second": 1.142, "step": 88 }, { "epoch": 8.181818181818182, "grad_norm": 1.8273411989212036, "learning_rate": 5.4545454545454545e-06, "loss": 0.9927, "step": 90 }, { "epoch": 8.363636363636363, "grad_norm": 1.7687430381774902, "learning_rate": 5.575757575757577e-06, "loss": 0.8886, "step": 92 }, { "epoch": 8.545454545454545, "grad_norm": 1.9781934022903442, "learning_rate": 5.696969696969698e-06, "loss": 0.7076, "step": 94 }, { "epoch": 8.727272727272727, "grad_norm": 1.8124470710754395, "learning_rate": 5.8181818181818185e-06, "loss": 0.8414, "step": 96 }, { "epoch": 8.909090909090908, "grad_norm": 1.2052216529846191, "learning_rate": 5.93939393939394e-06, "loss": 0.8392, "step": 98 }, { "epoch": 9.0, "eval_loss": 0.756293773651123, "eval_runtime": 21.002, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 99 }, { "epoch": 9.090909090909092, "grad_norm": 1.1055922508239746, "learning_rate": 6.060606060606061e-06, "loss": 0.7587, "step": 100 }, { "epoch": 9.272727272727273, "grad_norm": 1.4254761934280396, "learning_rate": 6.181818181818182e-06, "loss": 0.333, "step": 102 }, { "epoch": 9.454545454545455, "grad_norm": 1.636863350868225, "learning_rate": 6.303030303030303e-06, "loss": 1.0304, "step": 104 }, { "epoch": 9.636363636363637, "grad_norm": 1.3520318269729614, "learning_rate": 6.424242424242425e-06, "loss": 0.5626, "step": 106 }, { "epoch": 9.818181818181818, "grad_norm": 0.739101231098175, "learning_rate": 6.545454545454546e-06, "loss": 0.4562, "step": 108 }, { "epoch": 10.0, "grad_norm": 0.719274640083313, "learning_rate": 6.666666666666667e-06, "loss": 0.4685, "step": 110 }, { "epoch": 10.0, "eval_loss": 0.6436794400215149, "eval_runtime": 20.9986, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 110 }, { "epoch": 10.181818181818182, "grad_norm": 0.5967914462089539, "learning_rate": 6.787878787878789e-06, "loss": 0.4207, "step": 112 }, { "epoch": 10.363636363636363, "grad_norm": 0.7981317639350891, "learning_rate": 6.90909090909091e-06, "loss": 0.7027, "step": 114 }, { "epoch": 10.545454545454545, "grad_norm": 0.648346483707428, "learning_rate": 7.030303030303031e-06, "loss": 0.48, "step": 116 }, { "epoch": 10.727272727272727, "grad_norm": 1.0265989303588867, "learning_rate": 7.151515151515152e-06, "loss": 0.6033, "step": 118 }, { "epoch": 10.909090909090908, "grad_norm": 0.6349014639854431, "learning_rate": 7.272727272727273e-06, "loss": 0.3588, "step": 120 }, { "epoch": 11.0, "eval_loss": 0.5850974321365356, "eval_runtime": 21.004, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 121 }, { "epoch": 11.090909090909092, "grad_norm": 0.530517041683197, "learning_rate": 7.393939393939395e-06, "loss": 0.4937, "step": 122 }, { "epoch": 11.272727272727273, "grad_norm": 0.9633344411849976, "learning_rate": 7.515151515151516e-06, "loss": 0.542, "step": 124 }, { "epoch": 11.454545454545455, "grad_norm": 0.6681090593338013, "learning_rate": 7.636363636363638e-06, "loss": 0.4627, "step": 126 }, { "epoch": 11.636363636363637, "grad_norm": 0.4449699819087982, "learning_rate": 7.757575757575758e-06, "loss": 0.4319, "step": 128 }, { "epoch": 11.818181818181818, "grad_norm": 0.5469127297401428, "learning_rate": 7.87878787878788e-06, "loss": 0.3578, "step": 130 }, { "epoch": 12.0, "grad_norm": 0.9779024720191956, "learning_rate": 8.000000000000001e-06, "loss": 0.6319, "step": 132 }, { "epoch": 12.0, "eval_loss": 0.5406798720359802, "eval_runtime": 20.9992, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 132 }, { "epoch": 12.181818181818182, "grad_norm": 0.5859923958778381, "learning_rate": 8.121212121212121e-06, "loss": 0.3894, "step": 134 }, { "epoch": 12.363636363636363, "grad_norm": 0.1749761551618576, "learning_rate": 8.242424242424243e-06, "loss": 0.2537, "step": 136 }, { "epoch": 12.545454545454545, "grad_norm": 1.400254487991333, "learning_rate": 8.363636363636365e-06, "loss": 0.6108, "step": 138 }, { "epoch": 12.727272727272727, "grad_norm": 0.556860089302063, "learning_rate": 8.484848484848486e-06, "loss": 0.5365, "step": 140 }, { "epoch": 12.909090909090908, "grad_norm": 0.4626636207103729, "learning_rate": 8.606060606060606e-06, "loss": 0.4211, "step": 142 }, { "epoch": 13.0, "eval_loss": 0.5247967839241028, "eval_runtime": 21.0033, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 143 }, { "epoch": 13.090909090909092, "grad_norm": 0.42664870619773865, "learning_rate": 8.727272727272728e-06, "loss": 0.4939, "step": 144 }, { "epoch": 13.272727272727273, "grad_norm": 0.38396334648132324, "learning_rate": 8.84848484848485e-06, "loss": 0.3484, "step": 146 }, { "epoch": 13.454545454545455, "grad_norm": 0.43991151452064514, "learning_rate": 8.969696969696971e-06, "loss": 0.4156, "step": 148 }, { "epoch": 13.636363636363637, "grad_norm": 0.4237063229084015, "learning_rate": 9.090909090909091e-06, "loss": 0.3955, "step": 150 }, { "epoch": 13.818181818181818, "grad_norm": 0.34925854206085205, "learning_rate": 9.212121212121213e-06, "loss": 0.446, "step": 152 }, { "epoch": 14.0, "grad_norm": 0.8939254879951477, "learning_rate": 9.333333333333334e-06, "loss": 0.495, "step": 154 }, { "epoch": 14.0, "eval_loss": 0.5127188563346863, "eval_runtime": 20.9989, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 154 }, { "epoch": 14.181818181818182, "grad_norm": 0.4808824062347412, "learning_rate": 9.454545454545456e-06, "loss": 0.4217, "step": 156 }, { "epoch": 14.363636363636363, "grad_norm": 0.3635479807853699, "learning_rate": 9.575757575757576e-06, "loss": 0.4738, "step": 158 }, { "epoch": 14.545454545454545, "grad_norm": 0.2080135941505432, "learning_rate": 9.696969696969698e-06, "loss": 0.3693, "step": 160 }, { "epoch": 14.727272727272727, "grad_norm": 2.0220720767974854, "learning_rate": 9.81818181818182e-06, "loss": 0.3737, "step": 162 }, { "epoch": 14.909090909090908, "grad_norm": 0.3454424738883972, "learning_rate": 9.939393939393939e-06, "loss": 0.4232, "step": 164 }, { "epoch": 15.0, "eval_loss": 0.5019441246986389, "eval_runtime": 21.0036, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 165 }, { "epoch": 15.090909090909092, "grad_norm": 0.4212585389614105, "learning_rate": 9.999988811118232e-06, "loss": 0.4735, "step": 166 }, { "epoch": 15.272727272727273, "grad_norm": 0.22414664924144745, "learning_rate": 9.999899300364534e-06, "loss": 0.2503, "step": 168 }, { "epoch": 15.454545454545455, "grad_norm": 1.3229994773864746, "learning_rate": 9.999720280459576e-06, "loss": 0.4071, "step": 170 }, { "epoch": 15.636363636363637, "grad_norm": 0.29747679829597473, "learning_rate": 9.999451754608208e-06, "loss": 0.3427, "step": 172 }, { "epoch": 15.818181818181818, "grad_norm": 0.3459671437740326, "learning_rate": 9.99909372761763e-06, "loss": 0.4779, "step": 174 }, { "epoch": 16.0, "grad_norm": 0.31325361132621765, "learning_rate": 9.99864620589731e-06, "loss": 0.496, "step": 176 }, { "epoch": 16.0, "eval_loss": 0.5102820992469788, "eval_runtime": 21.0039, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 176 }, { "epoch": 16.181818181818183, "grad_norm": 0.35442739725112915, "learning_rate": 9.998109197458865e-06, "loss": 0.2946, "step": 178 }, { "epoch": 16.363636363636363, "grad_norm": 1.0050742626190186, "learning_rate": 9.997482711915926e-06, "loss": 0.5689, "step": 180 }, { "epoch": 16.545454545454547, "grad_norm": 0.3000654876232147, "learning_rate": 9.996766760483955e-06, "loss": 0.3062, "step": 182 }, { "epoch": 16.727272727272727, "grad_norm": 0.33660659193992615, "learning_rate": 9.995961355980052e-06, "loss": 0.3404, "step": 184 }, { "epoch": 16.90909090909091, "grad_norm": 0.32371142506599426, "learning_rate": 9.99506651282272e-06, "loss": 0.3903, "step": 186 }, { "epoch": 17.0, "eval_loss": 0.4814320504665375, "eval_runtime": 21.0016, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 187 }, { "epoch": 17.09090909090909, "grad_norm": 0.7857639193534851, "learning_rate": 9.994082247031613e-06, "loss": 0.5145, "step": 188 }, { "epoch": 17.272727272727273, "grad_norm": 0.007223070599138737, "learning_rate": 9.993008576227248e-06, "loss": 0.2366, "step": 190 }, { "epoch": 17.454545454545453, "grad_norm": 0.5182167887687683, "learning_rate": 9.991845519630679e-06, "loss": 0.4549, "step": 192 }, { "epoch": 17.636363636363637, "grad_norm": 0.2884611487388611, "learning_rate": 9.99059309806317e-06, "loss": 0.3861, "step": 194 }, { "epoch": 17.818181818181817, "grad_norm": 0.49307528138160706, "learning_rate": 9.989251333945813e-06, "loss": 0.4248, "step": 196 }, { "epoch": 18.0, "grad_norm": 0.48076504468917847, "learning_rate": 9.987820251299121e-06, "loss": 0.331, "step": 198 }, { "epoch": 18.0, "eval_loss": 0.49132904410362244, "eval_runtime": 21.0024, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 198 }, { "epoch": 18.181818181818183, "grad_norm": 0.5222908854484558, "learning_rate": 9.986299875742612e-06, "loss": 0.4817, "step": 200 }, { "epoch": 18.363636363636363, "grad_norm": 0.3346542418003082, "learning_rate": 9.984690234494338e-06, "loss": 0.2685, "step": 202 }, { "epoch": 18.545454545454547, "grad_norm": 0.48757171630859375, "learning_rate": 9.982991356370404e-06, "loss": 0.4127, "step": 204 }, { "epoch": 18.727272727272727, "grad_norm": 0.5337949991226196, "learning_rate": 9.98120327178445e-06, "loss": 0.4215, "step": 206 }, { "epoch": 18.90909090909091, "grad_norm": 0.26037395000457764, "learning_rate": 9.979326012747106e-06, "loss": 0.2403, "step": 208 }, { "epoch": 19.0, "eval_loss": 0.4869350492954254, "eval_runtime": 21.0006, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 209 }, { "epoch": 19.09090909090909, "grad_norm": 0.31072062253952026, "learning_rate": 9.977359612865424e-06, "loss": 0.2671, "step": 210 }, { "epoch": 19.272727272727273, "grad_norm": 0.5128066539764404, "learning_rate": 9.975304107342268e-06, "loss": 0.3695, "step": 212 }, { "epoch": 19.454545454545453, "grad_norm": 0.585717499256134, "learning_rate": 9.973159532975691e-06, "loss": 0.5722, "step": 214 }, { "epoch": 19.636363636363637, "grad_norm": 0.4827703833580017, "learning_rate": 9.970925928158275e-06, "loss": 0.3577, "step": 216 }, { "epoch": 19.818181818181817, "grad_norm": 0.3153725266456604, "learning_rate": 9.968603332876435e-06, "loss": 0.2262, "step": 218 }, { "epoch": 20.0, "grad_norm": 0.4635719954967499, "learning_rate": 9.966191788709716e-06, "loss": 0.3563, "step": 220 }, { "epoch": 20.0, "eval_loss": 0.471805602312088, "eval_runtime": 21.0011, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 220 }, { "epoch": 20.181818181818183, "grad_norm": 0.4292517304420471, "learning_rate": 9.963691338830045e-06, "loss": 0.2494, "step": 222 }, { "epoch": 20.363636363636363, "grad_norm": 0.2989480495452881, "learning_rate": 9.961102028000948e-06, "loss": 0.2761, "step": 224 }, { "epoch": 20.545454545454547, "grad_norm": 0.4395551383495331, "learning_rate": 9.958423902576764e-06, "loss": 0.5072, "step": 226 }, { "epoch": 20.727272727272727, "grad_norm": 0.47977691888809204, "learning_rate": 9.955657010501807e-06, "loss": 0.2494, "step": 228 }, { "epoch": 20.90909090909091, "grad_norm": 0.4911171793937683, "learning_rate": 9.952801401309504e-06, "loss": 0.4107, "step": 230 }, { "epoch": 21.0, "eval_loss": 0.4596048593521118, "eval_runtime": 20.9979, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 231 }, { "epoch": 21.09090909090909, "grad_norm": 0.5808133482933044, "learning_rate": 9.949857126121519e-06, "loss": 0.4717, "step": 232 }, { "epoch": 21.272727272727273, "grad_norm": 0.19762107729911804, "learning_rate": 9.946824237646823e-06, "loss": 0.165, "step": 234 }, { "epoch": 21.454545454545453, "grad_norm": 0.5478127598762512, "learning_rate": 9.94370279018077e-06, "loss": 0.383, "step": 236 }, { "epoch": 21.636363636363637, "grad_norm": 0.3956260085105896, "learning_rate": 9.940492839604103e-06, "loss": 0.3194, "step": 238 }, { "epoch": 21.818181818181817, "grad_norm": 0.608517050743103, "learning_rate": 9.937194443381972e-06, "loss": 0.3811, "step": 240 }, { "epoch": 22.0, "grad_norm": 0.35069647431373596, "learning_rate": 9.933807660562898e-06, "loss": 0.2631, "step": 242 }, { "epoch": 22.0, "eval_loss": 0.4477730691432953, "eval_runtime": 20.9988, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 242 }, { "epoch": 22.181818181818183, "grad_norm": 0.5358024835586548, "learning_rate": 9.930332551777709e-06, "loss": 0.24, "step": 244 }, { "epoch": 22.363636363636363, "grad_norm": 0.5837539434432983, "learning_rate": 9.926769179238467e-06, "loss": 0.312, "step": 246 }, { "epoch": 22.545454545454547, "grad_norm": 0.4202631711959839, "learning_rate": 9.923117606737347e-06, "loss": 0.1839, "step": 248 }, { "epoch": 22.727272727272727, "grad_norm": 0.7312754988670349, "learning_rate": 9.919377899645497e-06, "loss": 0.3982, "step": 250 }, { "epoch": 22.90909090909091, "grad_norm": 0.6649301648139954, "learning_rate": 9.915550124911866e-06, "loss": 0.4212, "step": 252 }, { "epoch": 23.0, "eval_loss": 0.4496145248413086, "eval_runtime": 21.002, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 253 }, { "epoch": 23.09090909090909, "grad_norm": 0.4475117027759552, "learning_rate": 9.91163435106201e-06, "loss": 0.2837, "step": 254 }, { "epoch": 23.272727272727273, "grad_norm": 0.7331680059432983, "learning_rate": 9.907630648196857e-06, "loss": 0.4256, "step": 256 }, { "epoch": 23.454545454545453, "grad_norm": 0.5951094627380371, "learning_rate": 9.903539087991462e-06, "loss": 0.2876, "step": 258 }, { "epoch": 23.636363636363637, "grad_norm": 0.5116230845451355, "learning_rate": 9.899359743693715e-06, "loss": 0.2226, "step": 260 }, { "epoch": 23.818181818181817, "grad_norm": 0.4971112310886383, "learning_rate": 9.895092690123036e-06, "loss": 0.2511, "step": 262 }, { "epoch": 24.0, "grad_norm": 0.0066973976790905, "learning_rate": 9.890738003669029e-06, "loss": 0.3304, "step": 264 }, { "epoch": 24.0, "eval_loss": 0.44082728028297424, "eval_runtime": 20.9935, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 264 }, { "epoch": 24.181818181818183, "grad_norm": 0.5074991583824158, "learning_rate": 9.886295762290125e-06, "loss": 0.1619, "step": 266 }, { "epoch": 24.363636363636363, "grad_norm": 0.803401529788971, "learning_rate": 9.881766045512176e-06, "loss": 0.3071, "step": 268 }, { "epoch": 24.545454545454547, "grad_norm": 0.7890399694442749, "learning_rate": 9.877148934427037e-06, "loss": 0.2939, "step": 270 }, { "epoch": 24.727272727272727, "grad_norm": 0.5034437775611877, "learning_rate": 9.872444511691108e-06, "loss": 0.2662, "step": 272 }, { "epoch": 24.90909090909091, "grad_norm": 0.7107832431793213, "learning_rate": 9.867652861523866e-06, "loss": 0.3296, "step": 274 }, { "epoch": 25.0, "eval_loss": 0.4437350928783417, "eval_runtime": 21.0123, "eval_samples_per_second": 1.142, "eval_steps_per_second": 1.142, "step": 275 }, { "epoch": 25.09090909090909, "grad_norm": 0.8815945386886597, "learning_rate": 9.862774069706346e-06, "loss": 0.3628, "step": 276 }, { "epoch": 25.272727272727273, "grad_norm": 0.6133561134338379, "learning_rate": 9.85780822357961e-06, "loss": 0.2528, "step": 278 }, { "epoch": 25.454545454545453, "grad_norm": 0.7392721772193909, "learning_rate": 9.85275541204318e-06, "loss": 0.166, "step": 280 }, { "epoch": 25.636363636363637, "grad_norm": 0.7326251268386841, "learning_rate": 9.847615725553457e-06, "loss": 0.1924, "step": 282 }, { "epoch": 25.818181818181817, "grad_norm": 0.9658777117729187, "learning_rate": 9.842389256122086e-06, "loss": 0.3393, "step": 284 }, { "epoch": 26.0, "grad_norm": 0.9744178056716919, "learning_rate": 9.83707609731432e-06, "loss": 0.3266, "step": 286 }, { "epoch": 26.0, "eval_loss": 0.44414618611335754, "eval_runtime": 20.9954, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 286 }, { "epoch": 26.181818181818183, "grad_norm": 1.009691834449768, "learning_rate": 9.831676344247343e-06, "loss": 0.2983, "step": 288 }, { "epoch": 26.363636363636363, "grad_norm": 0.9881241917610168, "learning_rate": 9.826190093588564e-06, "loss": 0.3114, "step": 290 }, { "epoch": 26.545454545454547, "grad_norm": 0.6508401036262512, "learning_rate": 9.820617443553889e-06, "loss": 0.1722, "step": 292 }, { "epoch": 26.727272727272727, "grad_norm": 1.0022698640823364, "learning_rate": 9.814958493905962e-06, "loss": 0.2775, "step": 294 }, { "epoch": 26.90909090909091, "grad_norm": 0.8220618963241577, "learning_rate": 9.80921334595238e-06, "loss": 0.1403, "step": 296 }, { "epoch": 27.0, "eval_loss": 0.4495992660522461, "eval_runtime": 21.0004, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 297 }, { "epoch": 27.09090909090909, "grad_norm": 0.6972556114196777, "learning_rate": 9.80338210254388e-06, "loss": 0.2361, "step": 298 }, { "epoch": 27.272727272727273, "grad_norm": 1.3660632371902466, "learning_rate": 9.797464868072489e-06, "loss": 0.1793, "step": 300 }, { "epoch": 27.454545454545453, "grad_norm": 0.9514533281326294, "learning_rate": 9.791461748469669e-06, "loss": 0.346, "step": 302 }, { "epoch": 27.636363636363637, "grad_norm": 1.0539543628692627, "learning_rate": 9.785372851204415e-06, "loss": 0.201, "step": 304 }, { "epoch": 27.818181818181817, "grad_norm": 1.111655592918396, "learning_rate": 9.779198285281326e-06, "loss": 0.2216, "step": 306 }, { "epoch": 28.0, "grad_norm": 0.7238084077835083, "learning_rate": 9.77293816123866e-06, "loss": 0.1732, "step": 308 }, { "epoch": 28.0, "eval_loss": 0.45740625262260437, "eval_runtime": 21.0057, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 308 }, { "epoch": 28.181818181818183, "grad_norm": 1.3565870523452759, "learning_rate": 9.766592591146353e-06, "loss": 0.3174, "step": 310 }, { "epoch": 28.363636363636363, "grad_norm": 1.3067699670791626, "learning_rate": 9.760161688604008e-06, "loss": 0.2076, "step": 312 }, { "epoch": 28.545454545454547, "grad_norm": 0.9948272705078125, "learning_rate": 9.753645568738872e-06, "loss": 0.1527, "step": 314 }, { "epoch": 28.727272727272727, "grad_norm": 1.3128433227539062, "learning_rate": 9.747044348203766e-06, "loss": 0.1789, "step": 316 }, { "epoch": 28.90909090909091, "grad_norm": 0.9066749215126038, "learning_rate": 9.740358145174999e-06, "loss": 0.1797, "step": 318 }, { "epoch": 29.0, "eval_loss": 0.4808818995952606, "eval_runtime": 20.9997, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 319 }, { "epoch": 29.09090909090909, "grad_norm": 1.0640830993652344, "learning_rate": 9.733587079350254e-06, "loss": 0.1481, "step": 320 }, { "epoch": 29.272727272727273, "grad_norm": 1.490294337272644, "learning_rate": 9.72673127194644e-06, "loss": 0.355, "step": 322 }, { "epoch": 29.454545454545453, "grad_norm": 0.6777256727218628, "learning_rate": 9.719790845697534e-06, "loss": 0.1048, "step": 324 }, { "epoch": 29.636363636363637, "grad_norm": 1.5173866748809814, "learning_rate": 9.71276592485237e-06, "loss": 0.1738, "step": 326 }, { "epoch": 29.818181818181817, "grad_norm": 1.211938500404358, "learning_rate": 9.705656635172418e-06, "loss": 0.0788, "step": 328 }, { "epoch": 30.0, "grad_norm": 1.4657930135726929, "learning_rate": 9.698463103929542e-06, "loss": 0.1355, "step": 330 }, { "epoch": 30.0, "eval_loss": 0.49904152750968933, "eval_runtime": 21.0013, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 330 }, { "epoch": 30.181818181818183, "grad_norm": 1.3539866209030151, "learning_rate": 9.69118545990371e-06, "loss": 0.1336, "step": 332 }, { "epoch": 30.363636363636363, "grad_norm": 1.1103911399841309, "learning_rate": 9.683823833380692e-06, "loss": 0.1738, "step": 334 }, { "epoch": 30.545454545454547, "grad_norm": 1.4360435009002686, "learning_rate": 9.676378356149733e-06, "loss": 0.0891, "step": 336 }, { "epoch": 30.727272727272727, "grad_norm": 1.7939261198043823, "learning_rate": 9.668849161501186e-06, "loss": 0.2107, "step": 338 }, { "epoch": 30.90909090909091, "grad_norm": 1.3665426969528198, "learning_rate": 9.66123638422413e-06, "loss": 0.1346, "step": 340 }, { "epoch": 31.0, "eval_loss": 0.5312966704368591, "eval_runtime": 21.0009, "eval_samples_per_second": 1.143, "eval_steps_per_second": 1.143, "step": 341 }, { "epoch": 31.0, "step": 341, "total_flos": 2.1292989192077312e+17, "train_loss": 0.8120344140050698, "train_runtime": 7851.2602, "train_samples_per_second": 1.681, "train_steps_per_second": 0.21 } ], "logging_steps": 2, "max_steps": 1650, "num_input_tokens_seen": 0, "num_train_epochs": 150, "save_steps": 25, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 7, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1292989192077312e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }