|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9999459527627146, |
|
"eval_steps": 100, |
|
"global_step": 3469, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0008647557965661989, |
|
"grad_norm": 10.80162525177002, |
|
"learning_rate": 1.7241379310344828e-07, |
|
"loss": 2.2563, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0017295115931323978, |
|
"grad_norm": 9.61453914642334, |
|
"learning_rate": 3.4482758620689656e-07, |
|
"loss": 2.0897, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0025942673896985967, |
|
"grad_norm": 9.470385551452637, |
|
"learning_rate": 5.172413793103449e-07, |
|
"loss": 1.9261, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0034590231862647956, |
|
"grad_norm": 9.873086929321289, |
|
"learning_rate": 6.896551724137931e-07, |
|
"loss": 2.0147, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.004323778982830994, |
|
"grad_norm": 10.191353797912598, |
|
"learning_rate": 8.620689655172415e-07, |
|
"loss": 1.858, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.005188534779397193, |
|
"grad_norm": 10.490758895874023, |
|
"learning_rate": 1.0344827586206898e-06, |
|
"loss": 2.2373, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.006053290575963392, |
|
"grad_norm": 10.7379789352417, |
|
"learning_rate": 1.2068965517241381e-06, |
|
"loss": 2.2257, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.006918046372529591, |
|
"grad_norm": 9.31098747253418, |
|
"learning_rate": 1.3793103448275862e-06, |
|
"loss": 1.9876, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.00778280216909579, |
|
"grad_norm": 9.79057502746582, |
|
"learning_rate": 1.5517241379310346e-06, |
|
"loss": 1.9015, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.008647557965661988, |
|
"grad_norm": 9.3207368850708, |
|
"learning_rate": 1.724137931034483e-06, |
|
"loss": 1.8092, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.009512313762228188, |
|
"grad_norm": 8.36523151397705, |
|
"learning_rate": 1.896551724137931e-06, |
|
"loss": 1.7048, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.010377069558794387, |
|
"grad_norm": 7.840041160583496, |
|
"learning_rate": 2.0689655172413796e-06, |
|
"loss": 1.4938, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.011241825355360585, |
|
"grad_norm": 8.646334648132324, |
|
"learning_rate": 2.241379310344828e-06, |
|
"loss": 1.4769, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.012106581151926784, |
|
"grad_norm": 8.217284202575684, |
|
"learning_rate": 2.4137931034482762e-06, |
|
"loss": 1.4442, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.012971336948492982, |
|
"grad_norm": 8.16273307800293, |
|
"learning_rate": 2.5862068965517246e-06, |
|
"loss": 1.6091, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.013836092745059182, |
|
"grad_norm": 7.606343746185303, |
|
"learning_rate": 2.7586206896551725e-06, |
|
"loss": 1.565, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.01470084854162538, |
|
"grad_norm": 5.831145763397217, |
|
"learning_rate": 2.931034482758621e-06, |
|
"loss": 1.4604, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.01556560433819158, |
|
"grad_norm": 5.626949310302734, |
|
"learning_rate": 3.103448275862069e-06, |
|
"loss": 1.2422, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.01643036013475778, |
|
"grad_norm": 8.196585655212402, |
|
"learning_rate": 3.2758620689655175e-06, |
|
"loss": 1.178, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.017295115931323976, |
|
"grad_norm": 3.9342713356018066, |
|
"learning_rate": 3.448275862068966e-06, |
|
"loss": 1.1149, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.018159871727890176, |
|
"grad_norm": 5.660026550292969, |
|
"learning_rate": 3.620689655172414e-06, |
|
"loss": 1.2666, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.019024627524456376, |
|
"grad_norm": 3.853914737701416, |
|
"learning_rate": 3.793103448275862e-06, |
|
"loss": 0.9817, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.019889383321022573, |
|
"grad_norm": 4.146341323852539, |
|
"learning_rate": 3.96551724137931e-06, |
|
"loss": 1.0925, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.020754139117588773, |
|
"grad_norm": 3.1577234268188477, |
|
"learning_rate": 4.137931034482759e-06, |
|
"loss": 0.8252, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.02161889491415497, |
|
"grad_norm": 3.2642972469329834, |
|
"learning_rate": 4.310344827586207e-06, |
|
"loss": 0.7533, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.02248365071072117, |
|
"grad_norm": 3.2303547859191895, |
|
"learning_rate": 4.482758620689656e-06, |
|
"loss": 0.7326, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.02334840650728737, |
|
"grad_norm": 2.59647536277771, |
|
"learning_rate": 4.655172413793104e-06, |
|
"loss": 0.7174, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.024213162303853567, |
|
"grad_norm": 2.8148062229156494, |
|
"learning_rate": 4.8275862068965525e-06, |
|
"loss": 0.7488, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.025077918100419767, |
|
"grad_norm": 1.758013129234314, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7332, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.025942673896985964, |
|
"grad_norm": 2.0942482948303223, |
|
"learning_rate": 5.172413793103449e-06, |
|
"loss": 0.8884, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.026807429693552164, |
|
"grad_norm": 0.4857475757598877, |
|
"learning_rate": 5.344827586206896e-06, |
|
"loss": 0.497, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.027672185490118364, |
|
"grad_norm": 1.5475839376449585, |
|
"learning_rate": 5.517241379310345e-06, |
|
"loss": 0.669, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.02853694128668456, |
|
"grad_norm": 1.0966241359710693, |
|
"learning_rate": 5.689655172413794e-06, |
|
"loss": 0.5887, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.028825193218873297, |
|
"eval_loss": 0.6418702006340027, |
|
"eval_mse": 0.6418701782226562, |
|
"eval_runtime": 6.7842, |
|
"eval_samples_per_second": 147.401, |
|
"eval_steps_per_second": 18.425, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02940169708325076, |
|
"grad_norm": 0.6968008875846863, |
|
"learning_rate": 5.862068965517242e-06, |
|
"loss": 0.7585, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.03026645287981696, |
|
"grad_norm": 1.3219984769821167, |
|
"learning_rate": 6.03448275862069e-06, |
|
"loss": 0.7782, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.03113120867638316, |
|
"grad_norm": 1.0195666551589966, |
|
"learning_rate": 6.206896551724138e-06, |
|
"loss": 0.6689, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.031995964472949355, |
|
"grad_norm": 1.4114892482757568, |
|
"learning_rate": 6.379310344827587e-06, |
|
"loss": 0.6057, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.03286072026951556, |
|
"grad_norm": 1.173322319984436, |
|
"learning_rate": 6.551724137931035e-06, |
|
"loss": 0.7234, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.033725476066081755, |
|
"grad_norm": 1.520411729812622, |
|
"learning_rate": 6.724137931034484e-06, |
|
"loss": 0.7145, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.03459023186264795, |
|
"grad_norm": 2.086540937423706, |
|
"learning_rate": 6.896551724137932e-06, |
|
"loss": 0.6265, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.035454987659214156, |
|
"grad_norm": 1.582171082496643, |
|
"learning_rate": 7.0689655172413796e-06, |
|
"loss": 0.6202, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.03631974345578035, |
|
"grad_norm": 1.1396665573120117, |
|
"learning_rate": 7.241379310344828e-06, |
|
"loss": 0.5784, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.03718449925234655, |
|
"grad_norm": 1.427815318107605, |
|
"learning_rate": 7.413793103448277e-06, |
|
"loss": 0.4815, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.03804925504891275, |
|
"grad_norm": 4.718282699584961, |
|
"learning_rate": 7.586206896551724e-06, |
|
"loss": 0.6151, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.03891401084547895, |
|
"grad_norm": 1.1084308624267578, |
|
"learning_rate": 7.758620689655173e-06, |
|
"loss": 0.548, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.039778766642045146, |
|
"grad_norm": 2.411567211151123, |
|
"learning_rate": 7.93103448275862e-06, |
|
"loss": 0.4664, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.04064352243861134, |
|
"grad_norm": 1.9377732276916504, |
|
"learning_rate": 8.103448275862069e-06, |
|
"loss": 0.4761, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.04150827823517755, |
|
"grad_norm": 1.6360706090927124, |
|
"learning_rate": 8.275862068965518e-06, |
|
"loss": 0.5101, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.04237303403174374, |
|
"grad_norm": 1.6306743621826172, |
|
"learning_rate": 8.448275862068966e-06, |
|
"loss": 0.4356, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.04323778982830994, |
|
"grad_norm": 2.038235664367676, |
|
"learning_rate": 8.620689655172414e-06, |
|
"loss": 0.4647, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.044102545624876144, |
|
"grad_norm": 2.968766450881958, |
|
"learning_rate": 8.793103448275862e-06, |
|
"loss": 0.5035, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.04496730142144234, |
|
"grad_norm": 1.407654047012329, |
|
"learning_rate": 8.965517241379312e-06, |
|
"loss": 0.4639, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.04583205721800854, |
|
"grad_norm": 2.141657590866089, |
|
"learning_rate": 9.13793103448276e-06, |
|
"loss": 0.4255, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.04669681301457474, |
|
"grad_norm": 2.2702648639678955, |
|
"learning_rate": 9.310344827586207e-06, |
|
"loss": 0.49, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.04756156881114094, |
|
"grad_norm": 1.1990652084350586, |
|
"learning_rate": 9.482758620689655e-06, |
|
"loss": 0.4145, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.048426324607707134, |
|
"grad_norm": 2.7780394554138184, |
|
"learning_rate": 9.655172413793105e-06, |
|
"loss": 0.4001, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.04929108040427334, |
|
"grad_norm": 1.6007318496704102, |
|
"learning_rate": 9.827586206896553e-06, |
|
"loss": 0.3719, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.050155836200839535, |
|
"grad_norm": 1.820497751235962, |
|
"learning_rate": 1e-05, |
|
"loss": 0.4183, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.05102059199740573, |
|
"grad_norm": 2.3770830631256104, |
|
"learning_rate": 9.990895295902884e-06, |
|
"loss": 0.4273, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.05188534779397193, |
|
"grad_norm": 1.9047311544418335, |
|
"learning_rate": 9.981790591805767e-06, |
|
"loss": 0.4622, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.05275010359053813, |
|
"grad_norm": 1.876373052597046, |
|
"learning_rate": 9.972685887708651e-06, |
|
"loss": 0.3132, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.05361485938710433, |
|
"grad_norm": 1.6469354629516602, |
|
"learning_rate": 9.963581183611534e-06, |
|
"loss": 0.3647, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.054479615183670525, |
|
"grad_norm": 1.3573085069656372, |
|
"learning_rate": 9.954476479514417e-06, |
|
"loss": 0.3267, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.05534437098023673, |
|
"grad_norm": 6.931013107299805, |
|
"learning_rate": 9.9453717754173e-06, |
|
"loss": 0.3981, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.056209126776802926, |
|
"grad_norm": 3.3258678913116455, |
|
"learning_rate": 9.936267071320182e-06, |
|
"loss": 0.3855, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.05707388257336912, |
|
"grad_norm": 4.0184550285339355, |
|
"learning_rate": 9.927162367223067e-06, |
|
"loss": 0.371, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.057650386437746594, |
|
"eval_loss": 0.34387460350990295, |
|
"eval_mse": 0.3438746197223663, |
|
"eval_runtime": 6.7538, |
|
"eval_samples_per_second": 148.065, |
|
"eval_steps_per_second": 18.508, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.057938638369935326, |
|
"grad_norm": 1.7570185661315918, |
|
"learning_rate": 9.91805766312595e-06, |
|
"loss": 0.4019, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.05880339416650152, |
|
"grad_norm": 3.925257444381714, |
|
"learning_rate": 9.908952959028833e-06, |
|
"loss": 0.3352, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.05966814996306772, |
|
"grad_norm": 1.5225563049316406, |
|
"learning_rate": 9.899848254931715e-06, |
|
"loss": 0.3081, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.06053290575963392, |
|
"grad_norm": 1.902543067932129, |
|
"learning_rate": 9.890743550834598e-06, |
|
"loss": 0.3908, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.06139766155620012, |
|
"grad_norm": 1.2444404363632202, |
|
"learning_rate": 9.881638846737481e-06, |
|
"loss": 0.31, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.06226241735276632, |
|
"grad_norm": 1.8852041959762573, |
|
"learning_rate": 9.872534142640366e-06, |
|
"loss": 0.297, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.06312717314933251, |
|
"grad_norm": 5.955360412597656, |
|
"learning_rate": 9.863429438543249e-06, |
|
"loss": 0.3639, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.06399192894589871, |
|
"grad_norm": 1.8612481355667114, |
|
"learning_rate": 9.854324734446131e-06, |
|
"loss": 0.3808, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.06485668474246492, |
|
"grad_norm": 1.4975637197494507, |
|
"learning_rate": 9.845220030349014e-06, |
|
"loss": 0.3694, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.06572144053903112, |
|
"grad_norm": 1.5896477699279785, |
|
"learning_rate": 9.836115326251897e-06, |
|
"loss": 0.3056, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.06658619633559731, |
|
"grad_norm": 3.6155972480773926, |
|
"learning_rate": 9.827010622154782e-06, |
|
"loss": 0.3084, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.06745095213216351, |
|
"grad_norm": 4.709763526916504, |
|
"learning_rate": 9.817905918057664e-06, |
|
"loss": 0.3949, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.06831570792872971, |
|
"grad_norm": 3.7941088676452637, |
|
"learning_rate": 9.808801213960547e-06, |
|
"loss": 0.3213, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.0691804637252959, |
|
"grad_norm": 2.548584461212158, |
|
"learning_rate": 9.79969650986343e-06, |
|
"loss": 0.3335, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.0700452195218621, |
|
"grad_norm": 1.5992470979690552, |
|
"learning_rate": 9.790591805766313e-06, |
|
"loss": 0.2688, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.07090997531842831, |
|
"grad_norm": 2.99310564994812, |
|
"learning_rate": 9.781487101669198e-06, |
|
"loss": 0.3488, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.07177473111499451, |
|
"grad_norm": 2.987074613571167, |
|
"learning_rate": 9.77238239757208e-06, |
|
"loss": 0.2703, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.0726394869115607, |
|
"grad_norm": 5.408164978027344, |
|
"learning_rate": 9.763277693474963e-06, |
|
"loss": 0.2854, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.0735042427081269, |
|
"grad_norm": 1.6425719261169434, |
|
"learning_rate": 9.754172989377846e-06, |
|
"loss": 0.2824, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.0743689985046931, |
|
"grad_norm": 1.8134018182754517, |
|
"learning_rate": 9.745068285280729e-06, |
|
"loss": 0.3286, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.0752337543012593, |
|
"grad_norm": 5.619387626647949, |
|
"learning_rate": 9.735963581183613e-06, |
|
"loss": 0.3226, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.0760985100978255, |
|
"grad_norm": 2.6264755725860596, |
|
"learning_rate": 9.726858877086496e-06, |
|
"loss": 0.3442, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.0769632658943917, |
|
"grad_norm": 3.537142515182495, |
|
"learning_rate": 9.717754172989379e-06, |
|
"loss": 0.2687, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.0778280216909579, |
|
"grad_norm": 2.1801705360412598, |
|
"learning_rate": 9.708649468892262e-06, |
|
"loss": 0.3705, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.0786927774875241, |
|
"grad_norm": 6.869683742523193, |
|
"learning_rate": 9.699544764795145e-06, |
|
"loss": 0.3777, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.07955753328409029, |
|
"grad_norm": 3.4050021171569824, |
|
"learning_rate": 9.690440060698028e-06, |
|
"loss": 0.3265, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.08042228908065649, |
|
"grad_norm": 2.8107333183288574, |
|
"learning_rate": 9.681335356600912e-06, |
|
"loss": 0.3286, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.08128704487722269, |
|
"grad_norm": 1.5242034196853638, |
|
"learning_rate": 9.672230652503795e-06, |
|
"loss": 0.2786, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.0821518006737889, |
|
"grad_norm": 22.8824462890625, |
|
"learning_rate": 9.663125948406678e-06, |
|
"loss": 0.3706, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.0830165564703551, |
|
"grad_norm": 4.904447555541992, |
|
"learning_rate": 9.65402124430956e-06, |
|
"loss": 0.298, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.08388131226692129, |
|
"grad_norm": 4.405270576477051, |
|
"learning_rate": 9.644916540212444e-06, |
|
"loss": 0.3625, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.08474606806348749, |
|
"grad_norm": 6.424873352050781, |
|
"learning_rate": 9.635811836115328e-06, |
|
"loss": 0.3246, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.08561082386005368, |
|
"grad_norm": 1.9719147682189941, |
|
"learning_rate": 9.626707132018211e-06, |
|
"loss": 0.3014, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.08647557965661988, |
|
"grad_norm": 5.9254021644592285, |
|
"learning_rate": 9.617602427921094e-06, |
|
"loss": 0.3607, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.08647557965661988, |
|
"eval_loss": 0.28438636660575867, |
|
"eval_mse": 0.2843863361030817, |
|
"eval_runtime": 6.6145, |
|
"eval_samples_per_second": 151.183, |
|
"eval_steps_per_second": 18.898, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.08734033545318609, |
|
"grad_norm": 3.4420008659362793, |
|
"learning_rate": 9.608497723823977e-06, |
|
"loss": 0.2631, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.08820509124975229, |
|
"grad_norm": 2.4832332134246826, |
|
"learning_rate": 9.59939301972686e-06, |
|
"loss": 0.2985, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.08906984704631848, |
|
"grad_norm": 3.3724935054779053, |
|
"learning_rate": 9.590288315629744e-06, |
|
"loss": 0.2703, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.08993460284288468, |
|
"grad_norm": 8.569412231445312, |
|
"learning_rate": 9.581183611532627e-06, |
|
"loss": 0.3068, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.09079935863945088, |
|
"grad_norm": 1.9817373752593994, |
|
"learning_rate": 9.57207890743551e-06, |
|
"loss": 0.3271, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.09166411443601707, |
|
"grad_norm": 2.6405210494995117, |
|
"learning_rate": 9.562974203338393e-06, |
|
"loss": 0.2454, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.09252887023258327, |
|
"grad_norm": 1.6005452871322632, |
|
"learning_rate": 9.553869499241275e-06, |
|
"loss": 0.2506, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.09339362602914948, |
|
"grad_norm": 1.67816162109375, |
|
"learning_rate": 9.54476479514416e-06, |
|
"loss": 0.2503, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.09425838182571568, |
|
"grad_norm": 1.4401655197143555, |
|
"learning_rate": 9.535660091047043e-06, |
|
"loss": 0.2322, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.09512313762228188, |
|
"grad_norm": 2.919785261154175, |
|
"learning_rate": 9.526555386949926e-06, |
|
"loss": 0.2746, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.09598789341884807, |
|
"grad_norm": 4.454317569732666, |
|
"learning_rate": 9.517450682852808e-06, |
|
"loss": 0.2907, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.09685264921541427, |
|
"grad_norm": 4.585294246673584, |
|
"learning_rate": 9.508345978755691e-06, |
|
"loss": 0.2873, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.09771740501198047, |
|
"grad_norm": 2.247422218322754, |
|
"learning_rate": 9.499241274658574e-06, |
|
"loss": 0.2798, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.09858216080854668, |
|
"grad_norm": 3.6044836044311523, |
|
"learning_rate": 9.490136570561459e-06, |
|
"loss": 0.3284, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.09944691660511287, |
|
"grad_norm": 4.609151363372803, |
|
"learning_rate": 9.481031866464341e-06, |
|
"loss": 0.2274, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.10031167240167907, |
|
"grad_norm": 4.793229103088379, |
|
"learning_rate": 9.471927162367224e-06, |
|
"loss": 0.2734, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.10117642819824527, |
|
"grad_norm": 4.507264614105225, |
|
"learning_rate": 9.462822458270107e-06, |
|
"loss": 0.3082, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.10204118399481146, |
|
"grad_norm": 6.129451274871826, |
|
"learning_rate": 9.45371775417299e-06, |
|
"loss": 0.2617, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.10290593979137766, |
|
"grad_norm": 2.631593704223633, |
|
"learning_rate": 9.444613050075875e-06, |
|
"loss": 0.2565, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.10377069558794386, |
|
"grad_norm": 4.379823684692383, |
|
"learning_rate": 9.435508345978757e-06, |
|
"loss": 0.3235, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.10463545138451007, |
|
"grad_norm": 2.077354907989502, |
|
"learning_rate": 9.42640364188164e-06, |
|
"loss": 0.2938, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.10550020718107626, |
|
"grad_norm": 5.108116626739502, |
|
"learning_rate": 9.417298937784523e-06, |
|
"loss": 0.2698, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.10636496297764246, |
|
"grad_norm": 7.464448928833008, |
|
"learning_rate": 9.408194233687406e-06, |
|
"loss": 0.2703, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.10722971877420866, |
|
"grad_norm": 1.855411410331726, |
|
"learning_rate": 9.399089529590289e-06, |
|
"loss": 0.2958, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.10809447457077485, |
|
"grad_norm": 5.284719944000244, |
|
"learning_rate": 9.389984825493173e-06, |
|
"loss": 0.2703, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.10895923036734105, |
|
"grad_norm": 2.494473457336426, |
|
"learning_rate": 9.380880121396056e-06, |
|
"loss": 0.2698, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.10982398616390726, |
|
"grad_norm": 2.0765345096588135, |
|
"learning_rate": 9.371775417298939e-06, |
|
"loss": 0.2726, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.11068874196047346, |
|
"grad_norm": 1.893574595451355, |
|
"learning_rate": 9.362670713201822e-06, |
|
"loss": 0.2739, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.11155349775703965, |
|
"grad_norm": 2.0016255378723145, |
|
"learning_rate": 9.353566009104705e-06, |
|
"loss": 0.2624, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.11241825355360585, |
|
"grad_norm": 2.99924898147583, |
|
"learning_rate": 9.344461305007587e-06, |
|
"loss": 0.3098, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.11328300935017205, |
|
"grad_norm": 1.666891098022461, |
|
"learning_rate": 9.335356600910472e-06, |
|
"loss": 0.2451, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.11414776514673824, |
|
"grad_norm": 2.8993024826049805, |
|
"learning_rate": 9.326251896813355e-06, |
|
"loss": 0.271, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.11501252094330444, |
|
"grad_norm": 5.040359973907471, |
|
"learning_rate": 9.317147192716238e-06, |
|
"loss": 0.2576, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.11530077287549319, |
|
"eval_loss": 0.2588765025138855, |
|
"eval_mse": 0.25887649209996744, |
|
"eval_runtime": 6.6586, |
|
"eval_samples_per_second": 150.181, |
|
"eval_steps_per_second": 18.773, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.11587727673987065, |
|
"grad_norm": 3.234560251235962, |
|
"learning_rate": 9.30804248861912e-06, |
|
"loss": 0.2702, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.11674203253643685, |
|
"grad_norm": 2.8497729301452637, |
|
"learning_rate": 9.298937784522003e-06, |
|
"loss": 0.2371, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.11760678833300305, |
|
"grad_norm": 1.3214294910430908, |
|
"learning_rate": 9.289833080424886e-06, |
|
"loss": 0.2262, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.11847154412956924, |
|
"grad_norm": 3.5736958980560303, |
|
"learning_rate": 9.28072837632777e-06, |
|
"loss": 0.2619, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.11933629992613544, |
|
"grad_norm": 3.478178024291992, |
|
"learning_rate": 9.271623672230654e-06, |
|
"loss": 0.3061, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.12020105572270164, |
|
"grad_norm": 2.523387908935547, |
|
"learning_rate": 9.262518968133536e-06, |
|
"loss": 0.3232, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.12106581151926785, |
|
"grad_norm": 2.1046786308288574, |
|
"learning_rate": 9.25341426403642e-06, |
|
"loss": 0.2815, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.12193056731583404, |
|
"grad_norm": 4.513411045074463, |
|
"learning_rate": 9.244309559939302e-06, |
|
"loss": 0.3053, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.12279532311240024, |
|
"grad_norm": 2.628030776977539, |
|
"learning_rate": 9.235204855842187e-06, |
|
"loss": 0.2714, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.12366007890896644, |
|
"grad_norm": 6.008927345275879, |
|
"learning_rate": 9.22610015174507e-06, |
|
"loss": 0.3243, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.12452483470553263, |
|
"grad_norm": 6.236274242401123, |
|
"learning_rate": 9.216995447647952e-06, |
|
"loss": 0.2916, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.12538959050209883, |
|
"grad_norm": 2.30412220954895, |
|
"learning_rate": 9.207890743550835e-06, |
|
"loss": 0.2731, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.12625434629866503, |
|
"grad_norm": 3.3161492347717285, |
|
"learning_rate": 9.198786039453718e-06, |
|
"loss": 0.2508, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.12711910209523122, |
|
"grad_norm": 4.023074626922607, |
|
"learning_rate": 9.189681335356601e-06, |
|
"loss": 0.2973, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.12798385789179742, |
|
"grad_norm": 2.486236333847046, |
|
"learning_rate": 9.180576631259485e-06, |
|
"loss": 0.235, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.12884861368836362, |
|
"grad_norm": 4.445496082305908, |
|
"learning_rate": 9.171471927162368e-06, |
|
"loss": 0.2519, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.12971336948492984, |
|
"grad_norm": 1.629809021949768, |
|
"learning_rate": 9.162367223065251e-06, |
|
"loss": 0.2664, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.13057812528149604, |
|
"grad_norm": 3.0351674556732178, |
|
"learning_rate": 9.153262518968134e-06, |
|
"loss": 0.2433, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.13144288107806223, |
|
"grad_norm": 1.9163283109664917, |
|
"learning_rate": 9.144157814871017e-06, |
|
"loss": 0.2997, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.13230763687462843, |
|
"grad_norm": 3.978429079055786, |
|
"learning_rate": 9.1350531107739e-06, |
|
"loss": 0.2969, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.13317239267119463, |
|
"grad_norm": 1.7428011894226074, |
|
"learning_rate": 9.125948406676784e-06, |
|
"loss": 0.2721, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.13403714846776082, |
|
"grad_norm": 1.462106704711914, |
|
"learning_rate": 9.116843702579667e-06, |
|
"loss": 0.3179, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.13490190426432702, |
|
"grad_norm": 4.226785659790039, |
|
"learning_rate": 9.10773899848255e-06, |
|
"loss": 0.262, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.13576666006089322, |
|
"grad_norm": 3.227842330932617, |
|
"learning_rate": 9.098634294385433e-06, |
|
"loss": 0.2602, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.13663141585745942, |
|
"grad_norm": 4.708644866943359, |
|
"learning_rate": 9.089529590288316e-06, |
|
"loss": 0.2502, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.1374961716540256, |
|
"grad_norm": 3.478773832321167, |
|
"learning_rate": 9.080424886191198e-06, |
|
"loss": 0.2799, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.1383609274505918, |
|
"grad_norm": 5.991547107696533, |
|
"learning_rate": 9.071320182094083e-06, |
|
"loss": 0.3103, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.139225683247158, |
|
"grad_norm": 5.446369647979736, |
|
"learning_rate": 9.062215477996966e-06, |
|
"loss": 0.2834, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.1400904390437242, |
|
"grad_norm": 4.723580360412598, |
|
"learning_rate": 9.053110773899849e-06, |
|
"loss": 0.244, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.14095519484029043, |
|
"grad_norm": 2.518148183822632, |
|
"learning_rate": 9.044006069802731e-06, |
|
"loss": 0.3102, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.14181995063685662, |
|
"grad_norm": 4.1146039962768555, |
|
"learning_rate": 9.034901365705614e-06, |
|
"loss": 0.2913, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.14268470643342282, |
|
"grad_norm": 2.6333212852478027, |
|
"learning_rate": 9.025796661608497e-06, |
|
"loss": 0.2195, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.14354946222998902, |
|
"grad_norm": 3.344228506088257, |
|
"learning_rate": 9.016691957511382e-06, |
|
"loss": 0.2822, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.14412596609436648, |
|
"eval_loss": 0.2707275450229645, |
|
"eval_mse": 0.27072753977278263, |
|
"eval_runtime": 6.638, |
|
"eval_samples_per_second": 150.647, |
|
"eval_steps_per_second": 18.831, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1444142180265552, |
|
"grad_norm": 2.839517593383789, |
|
"learning_rate": 9.007587253414265e-06, |
|
"loss": 0.2469, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.1452789738231214, |
|
"grad_norm": 3.71785569190979, |
|
"learning_rate": 8.998482549317147e-06, |
|
"loss": 0.2695, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.1461437296196876, |
|
"grad_norm": 2.9626924991607666, |
|
"learning_rate": 8.98937784522003e-06, |
|
"loss": 0.233, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.1470084854162538, |
|
"grad_norm": 2.0274481773376465, |
|
"learning_rate": 8.980273141122913e-06, |
|
"loss": 0.2854, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.14787324121282, |
|
"grad_norm": 2.5144805908203125, |
|
"learning_rate": 8.971168437025798e-06, |
|
"loss": 0.2545, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.1487379970093862, |
|
"grad_norm": 3.106039047241211, |
|
"learning_rate": 8.96206373292868e-06, |
|
"loss": 0.2305, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.1496027528059524, |
|
"grad_norm": 3.3986117839813232, |
|
"learning_rate": 8.952959028831563e-06, |
|
"loss": 0.2564, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.1504675086025186, |
|
"grad_norm": 2.877206325531006, |
|
"learning_rate": 8.943854324734446e-06, |
|
"loss": 0.2781, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.1513322643990848, |
|
"grad_norm": 2.561119556427002, |
|
"learning_rate": 8.934749620637329e-06, |
|
"loss": 0.2666, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.152197020195651, |
|
"grad_norm": 2.566633939743042, |
|
"learning_rate": 8.925644916540213e-06, |
|
"loss": 0.2791, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.1530617759922172, |
|
"grad_norm": 2.1075491905212402, |
|
"learning_rate": 8.916540212443096e-06, |
|
"loss": 0.2276, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.1539265317887834, |
|
"grad_norm": 3.239712715148926, |
|
"learning_rate": 8.90743550834598e-06, |
|
"loss": 0.2749, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.1547912875853496, |
|
"grad_norm": 6.005987167358398, |
|
"learning_rate": 8.898330804248862e-06, |
|
"loss": 0.2395, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.1556560433819158, |
|
"grad_norm": 1.6621935367584229, |
|
"learning_rate": 8.889226100151745e-06, |
|
"loss": 0.2616, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.156520799178482, |
|
"grad_norm": 3.1361093521118164, |
|
"learning_rate": 8.880121396054628e-06, |
|
"loss": 0.2712, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.1573855549750482, |
|
"grad_norm": 1.8823013305664062, |
|
"learning_rate": 8.871016691957512e-06, |
|
"loss": 0.2571, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.1582503107716144, |
|
"grad_norm": 3.6857988834381104, |
|
"learning_rate": 8.861911987860395e-06, |
|
"loss": 0.2552, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.15911506656818059, |
|
"grad_norm": 2.26597261428833, |
|
"learning_rate": 8.852807283763278e-06, |
|
"loss": 0.2239, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.15997982236474678, |
|
"grad_norm": 5.791572570800781, |
|
"learning_rate": 8.84370257966616e-06, |
|
"loss": 0.2859, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.16084457816131298, |
|
"grad_norm": 3.4344265460968018, |
|
"learning_rate": 8.834597875569044e-06, |
|
"loss": 0.3055, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.16170933395787918, |
|
"grad_norm": 3.98288631439209, |
|
"learning_rate": 8.825493171471928e-06, |
|
"loss": 0.2698, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.16257408975444537, |
|
"grad_norm": 7.429836273193359, |
|
"learning_rate": 8.816388467374811e-06, |
|
"loss": 0.3207, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.1634388455510116, |
|
"grad_norm": 4.021480560302734, |
|
"learning_rate": 8.807283763277694e-06, |
|
"loss": 0.2477, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.1643036013475778, |
|
"grad_norm": 2.619497537612915, |
|
"learning_rate": 8.798179059180577e-06, |
|
"loss": 0.2152, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.165168357144144, |
|
"grad_norm": 2.073925018310547, |
|
"learning_rate": 8.78907435508346e-06, |
|
"loss": 0.2557, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.1660331129407102, |
|
"grad_norm": 2.250293493270874, |
|
"learning_rate": 8.779969650986344e-06, |
|
"loss": 0.2294, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.16689786873727638, |
|
"grad_norm": 2.9747421741485596, |
|
"learning_rate": 8.770864946889227e-06, |
|
"loss": 0.2461, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.16776262453384258, |
|
"grad_norm": 1.8815991878509521, |
|
"learning_rate": 8.76176024279211e-06, |
|
"loss": 0.2698, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.16862738033040878, |
|
"grad_norm": 2.1905224323272705, |
|
"learning_rate": 8.752655538694993e-06, |
|
"loss": 0.2631, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.16949213612697497, |
|
"grad_norm": 3.6808903217315674, |
|
"learning_rate": 8.743550834597875e-06, |
|
"loss": 0.2341, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.17035689192354117, |
|
"grad_norm": 2.905963897705078, |
|
"learning_rate": 8.73444613050076e-06, |
|
"loss": 0.2295, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.17122164772010737, |
|
"grad_norm": 5.486540794372559, |
|
"learning_rate": 8.725341426403643e-06, |
|
"loss": 0.2816, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.17208640351667356, |
|
"grad_norm": 1.8525919914245605, |
|
"learning_rate": 8.716236722306526e-06, |
|
"loss": 0.2197, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.17295115931323976, |
|
"grad_norm": 5.908591270446777, |
|
"learning_rate": 8.707132018209408e-06, |
|
"loss": 0.2908, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.17295115931323976, |
|
"eval_loss": 0.23815499246120453, |
|
"eval_mse": 0.23815500601008535, |
|
"eval_runtime": 6.6897, |
|
"eval_samples_per_second": 149.484, |
|
"eval_steps_per_second": 18.685, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.17381591510980596, |
|
"grad_norm": 3.1334948539733887, |
|
"learning_rate": 8.698027314112291e-06, |
|
"loss": 0.253, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.17468067090637218, |
|
"grad_norm": 2.314412832260132, |
|
"learning_rate": 8.688922610015174e-06, |
|
"loss": 0.2631, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.17554542670293838, |
|
"grad_norm": 3.483959436416626, |
|
"learning_rate": 8.679817905918059e-06, |
|
"loss": 0.2452, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.17641018249950458, |
|
"grad_norm": 2.2747623920440674, |
|
"learning_rate": 8.670713201820942e-06, |
|
"loss": 0.2462, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.17727493829607077, |
|
"grad_norm": 5.517392635345459, |
|
"learning_rate": 8.661608497723824e-06, |
|
"loss": 0.2707, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.17813969409263697, |
|
"grad_norm": 2.7062125205993652, |
|
"learning_rate": 8.652503793626707e-06, |
|
"loss": 0.2324, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.17900444988920317, |
|
"grad_norm": 2.712933301925659, |
|
"learning_rate": 8.64339908952959e-06, |
|
"loss": 0.2914, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.17986920568576936, |
|
"grad_norm": 3.0349957942962646, |
|
"learning_rate": 8.634294385432475e-06, |
|
"loss": 0.245, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.18073396148233556, |
|
"grad_norm": 1.533530592918396, |
|
"learning_rate": 8.625189681335357e-06, |
|
"loss": 0.245, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.18159871727890176, |
|
"grad_norm": 1.5070098638534546, |
|
"learning_rate": 8.61608497723824e-06, |
|
"loss": 0.252, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.18246347307546795, |
|
"grad_norm": 1.803514003753662, |
|
"learning_rate": 8.606980273141123e-06, |
|
"loss": 0.2395, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 0.18332822887203415, |
|
"grad_norm": 1.8047112226486206, |
|
"learning_rate": 8.597875569044006e-06, |
|
"loss": 0.2731, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.18419298466860035, |
|
"grad_norm": 3.090371608734131, |
|
"learning_rate": 8.58877086494689e-06, |
|
"loss": 0.2314, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 0.18505774046516654, |
|
"grad_norm": 2.2020678520202637, |
|
"learning_rate": 8.579666160849773e-06, |
|
"loss": 0.2828, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.18592249626173277, |
|
"grad_norm": 1.8086636066436768, |
|
"learning_rate": 8.570561456752656e-06, |
|
"loss": 0.2736, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.18678725205829896, |
|
"grad_norm": 1.9154764413833618, |
|
"learning_rate": 8.561456752655539e-06, |
|
"loss": 0.2422, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.18765200785486516, |
|
"grad_norm": 2.6091620922088623, |
|
"learning_rate": 8.552352048558422e-06, |
|
"loss": 0.278, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.18851676365143136, |
|
"grad_norm": 4.057301998138428, |
|
"learning_rate": 8.543247344461306e-06, |
|
"loss": 0.2268, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.18938151944799755, |
|
"grad_norm": 1.6707180738449097, |
|
"learning_rate": 8.53414264036419e-06, |
|
"loss": 0.2657, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 0.19024627524456375, |
|
"grad_norm": 4.327409744262695, |
|
"learning_rate": 8.525037936267072e-06, |
|
"loss": 0.2235, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.19111103104112995, |
|
"grad_norm": 3.991241931915283, |
|
"learning_rate": 8.515933232169955e-06, |
|
"loss": 0.2429, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 0.19197578683769614, |
|
"grad_norm": 1.742564082145691, |
|
"learning_rate": 8.506828528072838e-06, |
|
"loss": 0.2245, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.19284054263426234, |
|
"grad_norm": 2.362626791000366, |
|
"learning_rate": 8.49772382397572e-06, |
|
"loss": 0.2689, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 0.19370529843082854, |
|
"grad_norm": 1.4896934032440186, |
|
"learning_rate": 8.488619119878605e-06, |
|
"loss": 0.2669, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.19457005422739473, |
|
"grad_norm": 1.8312315940856934, |
|
"learning_rate": 8.479514415781488e-06, |
|
"loss": 0.2255, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.19543481002396093, |
|
"grad_norm": 3.9799551963806152, |
|
"learning_rate": 8.470409711684371e-06, |
|
"loss": 0.2152, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.19629956582052713, |
|
"grad_norm": 1.9584782123565674, |
|
"learning_rate": 8.461305007587254e-06, |
|
"loss": 0.2721, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 0.19716432161709335, |
|
"grad_norm": 3.361952781677246, |
|
"learning_rate": 8.452200303490137e-06, |
|
"loss": 0.265, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.19802907741365955, |
|
"grad_norm": 2.125466823577881, |
|
"learning_rate": 8.443095599393021e-06, |
|
"loss": 0.2473, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 0.19889383321022575, |
|
"grad_norm": 4.599373817443848, |
|
"learning_rate": 8.433990895295904e-06, |
|
"loss": 0.2201, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.19975858900679194, |
|
"grad_norm": 2.139647960662842, |
|
"learning_rate": 8.424886191198787e-06, |
|
"loss": 0.2647, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 0.20062334480335814, |
|
"grad_norm": 1.7437653541564941, |
|
"learning_rate": 8.41578148710167e-06, |
|
"loss": 0.2242, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.20148810059992434, |
|
"grad_norm": 4.154083728790283, |
|
"learning_rate": 8.406676783004552e-06, |
|
"loss": 0.2258, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 0.20177635253211307, |
|
"eval_loss": 0.2404525727033615, |
|
"eval_mse": 0.24045259381830691, |
|
"eval_runtime": 6.5158, |
|
"eval_samples_per_second": 153.474, |
|
"eval_steps_per_second": 19.184, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.20235285639649053, |
|
"grad_norm": 1.7175828218460083, |
|
"learning_rate": 8.397572078907437e-06, |
|
"loss": 0.2734, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.20321761219305673, |
|
"grad_norm": 1.8725277185440063, |
|
"learning_rate": 8.38846737481032e-06, |
|
"loss": 0.2309, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.20408236798962293, |
|
"grad_norm": 1.7434577941894531, |
|
"learning_rate": 8.379362670713203e-06, |
|
"loss": 0.2395, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.20494712378618912, |
|
"grad_norm": 2.8038480281829834, |
|
"learning_rate": 8.370257966616086e-06, |
|
"loss": 0.2787, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 0.20581187958275532, |
|
"grad_norm": 1.701920509338379, |
|
"learning_rate": 8.361153262518968e-06, |
|
"loss": 0.2267, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.20667663537932152, |
|
"grad_norm": 4.903564453125, |
|
"learning_rate": 8.352048558421853e-06, |
|
"loss": 0.2716, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 0.2075413911758877, |
|
"grad_norm": 3.723651647567749, |
|
"learning_rate": 8.342943854324736e-06, |
|
"loss": 0.2674, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.20840614697245394, |
|
"grad_norm": 1.7158082723617554, |
|
"learning_rate": 8.333839150227619e-06, |
|
"loss": 0.2296, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 0.20927090276902013, |
|
"grad_norm": 2.1699960231781006, |
|
"learning_rate": 8.324734446130501e-06, |
|
"loss": 0.2843, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.21013565856558633, |
|
"grad_norm": 4.244576454162598, |
|
"learning_rate": 8.315629742033384e-06, |
|
"loss": 0.2678, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 0.21100041436215253, |
|
"grad_norm": 1.4597645998001099, |
|
"learning_rate": 8.306525037936269e-06, |
|
"loss": 0.248, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.21186517015871872, |
|
"grad_norm": 2.9361813068389893, |
|
"learning_rate": 8.297420333839152e-06, |
|
"loss": 0.241, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.21272992595528492, |
|
"grad_norm": 4.552188396453857, |
|
"learning_rate": 8.288315629742034e-06, |
|
"loss": 0.2935, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.21359468175185112, |
|
"grad_norm": 4.201780796051025, |
|
"learning_rate": 8.279210925644917e-06, |
|
"loss": 0.2935, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 0.21445943754841731, |
|
"grad_norm": 7.236446380615234, |
|
"learning_rate": 8.2701062215478e-06, |
|
"loss": 0.2584, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.2153241933449835, |
|
"grad_norm": 1.7462209463119507, |
|
"learning_rate": 8.261001517450683e-06, |
|
"loss": 0.2457, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 0.2161889491415497, |
|
"grad_norm": 3.9612677097320557, |
|
"learning_rate": 8.251896813353568e-06, |
|
"loss": 0.2472, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.2170537049381159, |
|
"grad_norm": 3.581313371658325, |
|
"learning_rate": 8.24279210925645e-06, |
|
"loss": 0.2833, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 0.2179184607346821, |
|
"grad_norm": 3.473001003265381, |
|
"learning_rate": 8.233687405159333e-06, |
|
"loss": 0.2474, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.2187832165312483, |
|
"grad_norm": 5.877528667449951, |
|
"learning_rate": 8.224582701062216e-06, |
|
"loss": 0.2469, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.21964797232781452, |
|
"grad_norm": 4.294084072113037, |
|
"learning_rate": 8.215477996965099e-06, |
|
"loss": 0.2569, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.22051272812438072, |
|
"grad_norm": 1.9812819957733154, |
|
"learning_rate": 8.206373292867983e-06, |
|
"loss": 0.2592, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.22137748392094692, |
|
"grad_norm": 1.6627389192581177, |
|
"learning_rate": 8.197268588770866e-06, |
|
"loss": 0.2597, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.2222422397175131, |
|
"grad_norm": 2.557081699371338, |
|
"learning_rate": 8.188163884673749e-06, |
|
"loss": 0.2624, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 0.2231069955140793, |
|
"grad_norm": 3.3301448822021484, |
|
"learning_rate": 8.179059180576632e-06, |
|
"loss": 0.2458, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.2239717513106455, |
|
"grad_norm": 3.717036247253418, |
|
"learning_rate": 8.169954476479515e-06, |
|
"loss": 0.2403, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.2248365071072117, |
|
"grad_norm": 1.9032166004180908, |
|
"learning_rate": 8.1608497723824e-06, |
|
"loss": 0.2194, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.2257012629037779, |
|
"grad_norm": 11.293305397033691, |
|
"learning_rate": 8.151745068285282e-06, |
|
"loss": 0.276, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.2265660187003441, |
|
"grad_norm": 2.2903361320495605, |
|
"learning_rate": 8.142640364188165e-06, |
|
"loss": 0.2089, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.2274307744969103, |
|
"grad_norm": 1.6450647115707397, |
|
"learning_rate": 8.133535660091048e-06, |
|
"loss": 0.239, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 0.2282955302934765, |
|
"grad_norm": 2.290724277496338, |
|
"learning_rate": 8.12443095599393e-06, |
|
"loss": 0.2509, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.2291602860900427, |
|
"grad_norm": 2.6252450942993164, |
|
"learning_rate": 8.115326251896815e-06, |
|
"loss": 0.2578, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.23002504188660888, |
|
"grad_norm": 4.217925071716309, |
|
"learning_rate": 8.106221547799698e-06, |
|
"loss": 0.2604, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.23060154575098638, |
|
"eval_loss": 0.23178604245185852, |
|
"eval_mse": 0.2317860303344205, |
|
"eval_runtime": 6.6317, |
|
"eval_samples_per_second": 150.79, |
|
"eval_steps_per_second": 18.849, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.2308897976831751, |
|
"grad_norm": 2.014478921890259, |
|
"learning_rate": 8.097116843702581e-06, |
|
"loss": 0.238, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 0.2317545534797413, |
|
"grad_norm": 3.1536102294921875, |
|
"learning_rate": 8.088012139605464e-06, |
|
"loss": 0.2236, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.2326193092763075, |
|
"grad_norm": 2.094320297241211, |
|
"learning_rate": 8.078907435508347e-06, |
|
"loss": 0.2536, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 0.2334840650728737, |
|
"grad_norm": 1.7041524648666382, |
|
"learning_rate": 8.06980273141123e-06, |
|
"loss": 0.2243, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.2343488208694399, |
|
"grad_norm": 4.586849689483643, |
|
"learning_rate": 8.060698027314114e-06, |
|
"loss": 0.2276, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 0.2352135766660061, |
|
"grad_norm": 1.8360718488693237, |
|
"learning_rate": 8.051593323216997e-06, |
|
"loss": 0.2299, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.2360783324625723, |
|
"grad_norm": 2.283907651901245, |
|
"learning_rate": 8.04248861911988e-06, |
|
"loss": 0.2996, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 0.23694308825913848, |
|
"grad_norm": 2.762160301208496, |
|
"learning_rate": 8.033383915022763e-06, |
|
"loss": 0.2474, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.23780784405570468, |
|
"grad_norm": 2.776780366897583, |
|
"learning_rate": 8.024279210925645e-06, |
|
"loss": 0.2394, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.23867259985227088, |
|
"grad_norm": 1.8678719997406006, |
|
"learning_rate": 8.01517450682853e-06, |
|
"loss": 0.2732, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.23953735564883707, |
|
"grad_norm": 1.6664822101593018, |
|
"learning_rate": 8.006069802731413e-06, |
|
"loss": 0.245, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 0.24040211144540327, |
|
"grad_norm": 2.244666814804077, |
|
"learning_rate": 7.996965098634296e-06, |
|
"loss": 0.2587, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.24126686724196947, |
|
"grad_norm": 1.8958755731582642, |
|
"learning_rate": 7.987860394537178e-06, |
|
"loss": 0.2632, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 0.2421316230385357, |
|
"grad_norm": 1.7408092021942139, |
|
"learning_rate": 7.978755690440061e-06, |
|
"loss": 0.2475, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.2429963788351019, |
|
"grad_norm": 4.280789375305176, |
|
"learning_rate": 7.969650986342944e-06, |
|
"loss": 0.2478, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 0.2438611346316681, |
|
"grad_norm": 3.1445536613464355, |
|
"learning_rate": 7.960546282245829e-06, |
|
"loss": 0.2412, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.24472589042823428, |
|
"grad_norm": 1.806154727935791, |
|
"learning_rate": 7.951441578148712e-06, |
|
"loss": 0.2772, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 0.24559064622480048, |
|
"grad_norm": 2.123932123184204, |
|
"learning_rate": 7.942336874051594e-06, |
|
"loss": 0.2535, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.24645540202136668, |
|
"grad_norm": 1.488558292388916, |
|
"learning_rate": 7.933232169954477e-06, |
|
"loss": 0.2342, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.24732015781793287, |
|
"grad_norm": 3.2414135932922363, |
|
"learning_rate": 7.92412746585736e-06, |
|
"loss": 0.2536, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.24818491361449907, |
|
"grad_norm": 1.952009916305542, |
|
"learning_rate": 7.915022761760245e-06, |
|
"loss": 0.2028, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 0.24904966941106527, |
|
"grad_norm": 1.7001383304595947, |
|
"learning_rate": 7.905918057663127e-06, |
|
"loss": 0.2791, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.24991442520763146, |
|
"grad_norm": 2.1715829372406006, |
|
"learning_rate": 7.89681335356601e-06, |
|
"loss": 0.2211, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 0.25077918100419766, |
|
"grad_norm": 2.4868202209472656, |
|
"learning_rate": 7.887708649468893e-06, |
|
"loss": 0.2599, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.25164393680076386, |
|
"grad_norm": 1.4740101099014282, |
|
"learning_rate": 7.878603945371776e-06, |
|
"loss": 0.2076, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 0.25250869259733005, |
|
"grad_norm": 2.241642951965332, |
|
"learning_rate": 7.869499241274659e-06, |
|
"loss": 0.2413, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.25337344839389625, |
|
"grad_norm": 4.3130784034729, |
|
"learning_rate": 7.860394537177543e-06, |
|
"loss": 0.2495, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 0.25423820419046245, |
|
"grad_norm": 6.674787998199463, |
|
"learning_rate": 7.851289833080426e-06, |
|
"loss": 0.2396, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.25510295998702864, |
|
"grad_norm": 2.463395118713379, |
|
"learning_rate": 7.842185128983309e-06, |
|
"loss": 0.2304, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.25596771578359484, |
|
"grad_norm": 4.519803047180176, |
|
"learning_rate": 7.833080424886192e-06, |
|
"loss": 0.2603, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.25683247158016104, |
|
"grad_norm": 1.3876014947891235, |
|
"learning_rate": 7.823975720789075e-06, |
|
"loss": 0.2282, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 0.25769722737672723, |
|
"grad_norm": 2.0575900077819824, |
|
"learning_rate": 7.814871016691958e-06, |
|
"loss": 0.2286, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.2585619831732935, |
|
"grad_norm": 4.538529872894287, |
|
"learning_rate": 7.805766312594842e-06, |
|
"loss": 0.2606, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 0.2594267389698597, |
|
"grad_norm": 3.036686658859253, |
|
"learning_rate": 7.796661608497725e-06, |
|
"loss": 0.2961, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2594267389698597, |
|
"eval_loss": 0.2185884416103363, |
|
"eval_mse": 0.21858844196051358, |
|
"eval_runtime": 6.596, |
|
"eval_samples_per_second": 151.607, |
|
"eval_steps_per_second": 18.951, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2602914947664259, |
|
"grad_norm": 2.319840669631958, |
|
"learning_rate": 7.787556904400608e-06, |
|
"loss": 0.2493, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 0.2611562505629921, |
|
"grad_norm": 4.723567485809326, |
|
"learning_rate": 7.77845220030349e-06, |
|
"loss": 0.2236, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 0.2620210063595583, |
|
"grad_norm": 2.231250286102295, |
|
"learning_rate": 7.769347496206373e-06, |
|
"loss": 0.2766, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 0.26288576215612447, |
|
"grad_norm": 2.3697762489318848, |
|
"learning_rate": 7.760242792109256e-06, |
|
"loss": 0.2319, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.26375051795269067, |
|
"grad_norm": 2.0771355628967285, |
|
"learning_rate": 7.75113808801214e-06, |
|
"loss": 0.246, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.26461527374925686, |
|
"grad_norm": 2.4788358211517334, |
|
"learning_rate": 7.742033383915024e-06, |
|
"loss": 0.2163, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 0.26548002954582306, |
|
"grad_norm": 1.5741719007492065, |
|
"learning_rate": 7.732928679817907e-06, |
|
"loss": 0.2308, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 0.26634478534238926, |
|
"grad_norm": 2.12677001953125, |
|
"learning_rate": 7.72382397572079e-06, |
|
"loss": 0.2304, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.26720954113895545, |
|
"grad_norm": 1.9022995233535767, |
|
"learning_rate": 7.714719271623672e-06, |
|
"loss": 0.2454, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 0.26807429693552165, |
|
"grad_norm": 3.3529253005981445, |
|
"learning_rate": 7.705614567526557e-06, |
|
"loss": 0.2137, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.26893905273208785, |
|
"grad_norm": 1.935281753540039, |
|
"learning_rate": 7.69650986342944e-06, |
|
"loss": 0.2581, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 0.26980380852865404, |
|
"grad_norm": 2.14315128326416, |
|
"learning_rate": 7.687405159332322e-06, |
|
"loss": 0.2317, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.27066856432522024, |
|
"grad_norm": 2.028090238571167, |
|
"learning_rate": 7.678300455235205e-06, |
|
"loss": 0.2379, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 0.27153332012178644, |
|
"grad_norm": 2.918959379196167, |
|
"learning_rate": 7.669195751138088e-06, |
|
"loss": 0.2708, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 0.27239807591835263, |
|
"grad_norm": 4.042644500732422, |
|
"learning_rate": 7.660091047040971e-06, |
|
"loss": 0.3112, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.27326283171491883, |
|
"grad_norm": 1.8345041275024414, |
|
"learning_rate": 7.650986342943855e-06, |
|
"loss": 0.267, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 0.274127587511485, |
|
"grad_norm": 5.901050567626953, |
|
"learning_rate": 7.641881638846738e-06, |
|
"loss": 0.272, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 0.2749923433080512, |
|
"grad_norm": 2.413675546646118, |
|
"learning_rate": 7.632776934749621e-06, |
|
"loss": 0.2906, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 0.2758570991046174, |
|
"grad_norm": 2.699126958847046, |
|
"learning_rate": 7.623672230652505e-06, |
|
"loss": 0.2201, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 0.2767218549011836, |
|
"grad_norm": 1.3186564445495605, |
|
"learning_rate": 7.614567526555388e-06, |
|
"loss": 0.227, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.2775866106977498, |
|
"grad_norm": 7.576478958129883, |
|
"learning_rate": 7.6054628224582705e-06, |
|
"loss": 0.2569, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 0.278451366494316, |
|
"grad_norm": 1.853145718574524, |
|
"learning_rate": 7.596358118361153e-06, |
|
"loss": 0.2586, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 0.2793161222908822, |
|
"grad_norm": 3.2667322158813477, |
|
"learning_rate": 7.587253414264037e-06, |
|
"loss": 0.2483, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 0.2801808780874484, |
|
"grad_norm": 2.7756762504577637, |
|
"learning_rate": 7.578148710166921e-06, |
|
"loss": 0.2724, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 0.28104563388401466, |
|
"grad_norm": 2.138936758041382, |
|
"learning_rate": 7.569044006069804e-06, |
|
"loss": 0.2517, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.28191038968058085, |
|
"grad_norm": 1.9090849161148071, |
|
"learning_rate": 7.5599393019726864e-06, |
|
"loss": 0.2608, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 0.28277514547714705, |
|
"grad_norm": 2.6184728145599365, |
|
"learning_rate": 7.550834597875569e-06, |
|
"loss": 0.2728, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 0.28363990127371325, |
|
"grad_norm": 1.5910395383834839, |
|
"learning_rate": 7.541729893778453e-06, |
|
"loss": 0.2144, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 0.28450465707027944, |
|
"grad_norm": 2.3524558544158936, |
|
"learning_rate": 7.532625189681337e-06, |
|
"loss": 0.23, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 0.28536941286684564, |
|
"grad_norm": 3.056361675262451, |
|
"learning_rate": 7.5235204855842195e-06, |
|
"loss": 0.2397, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.28623416866341184, |
|
"grad_norm": 3.4158847332000732, |
|
"learning_rate": 7.514415781487102e-06, |
|
"loss": 0.2712, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 0.28709892445997803, |
|
"grad_norm": 3.3620333671569824, |
|
"learning_rate": 7.505311077389985e-06, |
|
"loss": 0.2107, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 0.28796368025654423, |
|
"grad_norm": 3.508890390396118, |
|
"learning_rate": 7.496206373292868e-06, |
|
"loss": 0.2453, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 0.28825193218873296, |
|
"eval_loss": 0.21684841811656952, |
|
"eval_mse": 0.21684841979760677, |
|
"eval_runtime": 6.6234, |
|
"eval_samples_per_second": 150.98, |
|
"eval_steps_per_second": 18.872, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2888284360531104, |
|
"grad_norm": 2.658740997314453, |
|
"learning_rate": 7.487101669195752e-06, |
|
"loss": 0.2319, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 0.2896931918496766, |
|
"grad_norm": 1.370540976524353, |
|
"learning_rate": 7.477996965098635e-06, |
|
"loss": 0.2443, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.2905579476462428, |
|
"grad_norm": 8.996585845947266, |
|
"learning_rate": 7.468892261001518e-06, |
|
"loss": 0.2859, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 0.291422703442809, |
|
"grad_norm": 3.880053758621216, |
|
"learning_rate": 7.459787556904401e-06, |
|
"loss": 0.2913, |
|
"step": 1011 |
|
}, |
|
{ |
|
"epoch": 0.2922874592393752, |
|
"grad_norm": 4.735537052154541, |
|
"learning_rate": 7.450682852807284e-06, |
|
"loss": 0.258, |
|
"step": 1014 |
|
}, |
|
{ |
|
"epoch": 0.2931522150359414, |
|
"grad_norm": 1.7119396924972534, |
|
"learning_rate": 7.441578148710168e-06, |
|
"loss": 0.2397, |
|
"step": 1017 |
|
}, |
|
{ |
|
"epoch": 0.2940169708325076, |
|
"grad_norm": 3.4769861698150635, |
|
"learning_rate": 7.4324734446130505e-06, |
|
"loss": 0.209, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.2948817266290738, |
|
"grad_norm": 2.3741278648376465, |
|
"learning_rate": 7.423368740515934e-06, |
|
"loss": 0.2627, |
|
"step": 1023 |
|
}, |
|
{ |
|
"epoch": 0.29574648242564, |
|
"grad_norm": 1.5303798913955688, |
|
"learning_rate": 7.414264036418817e-06, |
|
"loss": 0.2352, |
|
"step": 1026 |
|
}, |
|
{ |
|
"epoch": 0.2966112382222062, |
|
"grad_norm": 1.63661789894104, |
|
"learning_rate": 7.4051593323217e-06, |
|
"loss": 0.2448, |
|
"step": 1029 |
|
}, |
|
{ |
|
"epoch": 0.2974759940187724, |
|
"grad_norm": 1.531538724899292, |
|
"learning_rate": 7.3960546282245835e-06, |
|
"loss": 0.23, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 0.2983407498153386, |
|
"grad_norm": 1.3936281204223633, |
|
"learning_rate": 7.386949924127466e-06, |
|
"loss": 0.2367, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.2992055056119048, |
|
"grad_norm": 4.795119762420654, |
|
"learning_rate": 7.377845220030349e-06, |
|
"loss": 0.2621, |
|
"step": 1038 |
|
}, |
|
{ |
|
"epoch": 0.300070261408471, |
|
"grad_norm": 3.725170135498047, |
|
"learning_rate": 7.368740515933233e-06, |
|
"loss": 0.2708, |
|
"step": 1041 |
|
}, |
|
{ |
|
"epoch": 0.3009350172050372, |
|
"grad_norm": 1.5772522687911987, |
|
"learning_rate": 7.359635811836116e-06, |
|
"loss": 0.1897, |
|
"step": 1044 |
|
}, |
|
{ |
|
"epoch": 0.3017997730016034, |
|
"grad_norm": 4.716231822967529, |
|
"learning_rate": 7.3505311077389994e-06, |
|
"loss": 0.2297, |
|
"step": 1047 |
|
}, |
|
{ |
|
"epoch": 0.3026645287981696, |
|
"grad_norm": 3.9254777431488037, |
|
"learning_rate": 7.341426403641882e-06, |
|
"loss": 0.2285, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.3035292845947358, |
|
"grad_norm": 2.7238216400146484, |
|
"learning_rate": 7.332321699544765e-06, |
|
"loss": 0.2488, |
|
"step": 1053 |
|
}, |
|
{ |
|
"epoch": 0.304394040391302, |
|
"grad_norm": 7.061728477478027, |
|
"learning_rate": 7.323216995447649e-06, |
|
"loss": 0.2329, |
|
"step": 1056 |
|
}, |
|
{ |
|
"epoch": 0.3052587961878682, |
|
"grad_norm": 2.2323367595672607, |
|
"learning_rate": 7.314112291350532e-06, |
|
"loss": 0.2511, |
|
"step": 1059 |
|
}, |
|
{ |
|
"epoch": 0.3061235519844344, |
|
"grad_norm": 3.432279586791992, |
|
"learning_rate": 7.305007587253415e-06, |
|
"loss": 0.251, |
|
"step": 1062 |
|
}, |
|
{ |
|
"epoch": 0.3069883077810006, |
|
"grad_norm": 4.372462272644043, |
|
"learning_rate": 7.295902883156298e-06, |
|
"loss": 0.2306, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.3078530635775668, |
|
"grad_norm": 3.428677558898926, |
|
"learning_rate": 7.286798179059181e-06, |
|
"loss": 0.2389, |
|
"step": 1068 |
|
}, |
|
{ |
|
"epoch": 0.308717819374133, |
|
"grad_norm": 1.7803095579147339, |
|
"learning_rate": 7.277693474962064e-06, |
|
"loss": 0.2374, |
|
"step": 1071 |
|
}, |
|
{ |
|
"epoch": 0.3095825751706992, |
|
"grad_norm": 1.7453327178955078, |
|
"learning_rate": 7.2685887708649476e-06, |
|
"loss": 0.2175, |
|
"step": 1074 |
|
}, |
|
{ |
|
"epoch": 0.3104473309672654, |
|
"grad_norm": 3.4764058589935303, |
|
"learning_rate": 7.25948406676783e-06, |
|
"loss": 0.2649, |
|
"step": 1077 |
|
}, |
|
{ |
|
"epoch": 0.3113120867638316, |
|
"grad_norm": 2.5065643787384033, |
|
"learning_rate": 7.250379362670714e-06, |
|
"loss": 0.2436, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.3121768425603978, |
|
"grad_norm": 2.4735498428344727, |
|
"learning_rate": 7.241274658573597e-06, |
|
"loss": 0.2565, |
|
"step": 1083 |
|
}, |
|
{ |
|
"epoch": 0.313041598356964, |
|
"grad_norm": 7.107683181762695, |
|
"learning_rate": 7.23216995447648e-06, |
|
"loss": 0.2691, |
|
"step": 1086 |
|
}, |
|
{ |
|
"epoch": 0.3139063541535302, |
|
"grad_norm": 1.9517550468444824, |
|
"learning_rate": 7.223065250379363e-06, |
|
"loss": 0.2869, |
|
"step": 1089 |
|
}, |
|
{ |
|
"epoch": 0.3147711099500964, |
|
"grad_norm": 3.1179702281951904, |
|
"learning_rate": 7.213960546282246e-06, |
|
"loss": 0.2434, |
|
"step": 1092 |
|
}, |
|
{ |
|
"epoch": 0.3156358657466626, |
|
"grad_norm": 4.619999885559082, |
|
"learning_rate": 7.20485584218513e-06, |
|
"loss": 0.2469, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.3165006215432288, |
|
"grad_norm": 6.7724528312683105, |
|
"learning_rate": 7.195751138088013e-06, |
|
"loss": 0.278, |
|
"step": 1098 |
|
}, |
|
{ |
|
"epoch": 0.31707712540760624, |
|
"eval_loss": 0.2247343808412552, |
|
"eval_mse": 0.22473438137583435, |
|
"eval_runtime": 6.5256, |
|
"eval_samples_per_second": 153.242, |
|
"eval_steps_per_second": 19.155, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.317365377339795, |
|
"grad_norm": 1.6179205179214478, |
|
"learning_rate": 7.186646433990896e-06, |
|
"loss": 0.252, |
|
"step": 1101 |
|
}, |
|
{ |
|
"epoch": 0.31823013313636117, |
|
"grad_norm": 1.8803197145462036, |
|
"learning_rate": 7.1775417298937785e-06, |
|
"loss": 0.2227, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 0.31909488893292737, |
|
"grad_norm": 2.2427573204040527, |
|
"learning_rate": 7.168437025796661e-06, |
|
"loss": 0.2399, |
|
"step": 1107 |
|
}, |
|
{ |
|
"epoch": 0.31995964472949356, |
|
"grad_norm": 1.907244086265564, |
|
"learning_rate": 7.159332321699546e-06, |
|
"loss": 0.2162, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.32082440052605976, |
|
"grad_norm": 3.7878000736236572, |
|
"learning_rate": 7.150227617602429e-06, |
|
"loss": 0.246, |
|
"step": 1113 |
|
}, |
|
{ |
|
"epoch": 0.32168915632262596, |
|
"grad_norm": 1.9053196907043457, |
|
"learning_rate": 7.141122913505312e-06, |
|
"loss": 0.2684, |
|
"step": 1116 |
|
}, |
|
{ |
|
"epoch": 0.32255391211919215, |
|
"grad_norm": 5.108983039855957, |
|
"learning_rate": 7.1320182094081944e-06, |
|
"loss": 0.2237, |
|
"step": 1119 |
|
}, |
|
{ |
|
"epoch": 0.32341866791575835, |
|
"grad_norm": 2.2469422817230225, |
|
"learning_rate": 7.122913505311077e-06, |
|
"loss": 0.2462, |
|
"step": 1122 |
|
}, |
|
{ |
|
"epoch": 0.32428342371232455, |
|
"grad_norm": 5.127351760864258, |
|
"learning_rate": 7.113808801213962e-06, |
|
"loss": 0.2543, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.32514817950889074, |
|
"grad_norm": 4.980170249938965, |
|
"learning_rate": 7.104704097116845e-06, |
|
"loss": 0.2775, |
|
"step": 1128 |
|
}, |
|
{ |
|
"epoch": 0.326012935305457, |
|
"grad_norm": 3.701903820037842, |
|
"learning_rate": 7.0955993930197275e-06, |
|
"loss": 0.2548, |
|
"step": 1131 |
|
}, |
|
{ |
|
"epoch": 0.3268776911020232, |
|
"grad_norm": 3.780144214630127, |
|
"learning_rate": 7.08649468892261e-06, |
|
"loss": 0.2687, |
|
"step": 1134 |
|
}, |
|
{ |
|
"epoch": 0.3277424468985894, |
|
"grad_norm": 2.2161099910736084, |
|
"learning_rate": 7.077389984825493e-06, |
|
"loss": 0.2351, |
|
"step": 1137 |
|
}, |
|
{ |
|
"epoch": 0.3286072026951556, |
|
"grad_norm": 4.7017998695373535, |
|
"learning_rate": 7.068285280728376e-06, |
|
"loss": 0.2431, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.3294719584917218, |
|
"grad_norm": 2.053750991821289, |
|
"learning_rate": 7.0591805766312606e-06, |
|
"loss": 0.254, |
|
"step": 1143 |
|
}, |
|
{ |
|
"epoch": 0.330336714288288, |
|
"grad_norm": 2.8078341484069824, |
|
"learning_rate": 7.050075872534143e-06, |
|
"loss": 0.2646, |
|
"step": 1146 |
|
}, |
|
{ |
|
"epoch": 0.3312014700848542, |
|
"grad_norm": 2.585087776184082, |
|
"learning_rate": 7.040971168437026e-06, |
|
"loss": 0.2536, |
|
"step": 1149 |
|
}, |
|
{ |
|
"epoch": 0.3320662258814204, |
|
"grad_norm": 4.2963104248046875, |
|
"learning_rate": 7.031866464339909e-06, |
|
"loss": 0.2276, |
|
"step": 1152 |
|
}, |
|
{ |
|
"epoch": 0.33293098167798657, |
|
"grad_norm": 4.205751419067383, |
|
"learning_rate": 7.022761760242792e-06, |
|
"loss": 0.2404, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.33379573747455277, |
|
"grad_norm": 3.460796356201172, |
|
"learning_rate": 7.0136570561456765e-06, |
|
"loss": 0.2473, |
|
"step": 1158 |
|
}, |
|
{ |
|
"epoch": 0.33466049327111896, |
|
"grad_norm": 3.529181957244873, |
|
"learning_rate": 7.004552352048559e-06, |
|
"loss": 0.24, |
|
"step": 1161 |
|
}, |
|
{ |
|
"epoch": 0.33552524906768516, |
|
"grad_norm": 3.203437328338623, |
|
"learning_rate": 6.995447647951442e-06, |
|
"loss": 0.2547, |
|
"step": 1164 |
|
}, |
|
{ |
|
"epoch": 0.33639000486425136, |
|
"grad_norm": 4.1535491943359375, |
|
"learning_rate": 6.986342943854325e-06, |
|
"loss": 0.2532, |
|
"step": 1167 |
|
}, |
|
{ |
|
"epoch": 0.33725476066081755, |
|
"grad_norm": 3.7933478355407715, |
|
"learning_rate": 6.977238239757208e-06, |
|
"loss": 0.2435, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.33811951645738375, |
|
"grad_norm": 2.6703147888183594, |
|
"learning_rate": 6.968133535660092e-06, |
|
"loss": 0.2293, |
|
"step": 1173 |
|
}, |
|
{ |
|
"epoch": 0.33898427225394995, |
|
"grad_norm": 2.8900182247161865, |
|
"learning_rate": 6.959028831562975e-06, |
|
"loss": 0.2163, |
|
"step": 1176 |
|
}, |
|
{ |
|
"epoch": 0.33984902805051614, |
|
"grad_norm": 3.9563350677490234, |
|
"learning_rate": 6.949924127465858e-06, |
|
"loss": 0.2682, |
|
"step": 1179 |
|
}, |
|
{ |
|
"epoch": 0.34071378384708234, |
|
"grad_norm": 1.8461941480636597, |
|
"learning_rate": 6.940819423368741e-06, |
|
"loss": 0.2293, |
|
"step": 1182 |
|
}, |
|
{ |
|
"epoch": 0.34157853964364854, |
|
"grad_norm": 3.313368082046509, |
|
"learning_rate": 6.931714719271624e-06, |
|
"loss": 0.2436, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.34244329544021473, |
|
"grad_norm": 1.7820873260498047, |
|
"learning_rate": 6.922610015174508e-06, |
|
"loss": 0.212, |
|
"step": 1188 |
|
}, |
|
{ |
|
"epoch": 0.34330805123678093, |
|
"grad_norm": 1.995291829109192, |
|
"learning_rate": 6.913505311077391e-06, |
|
"loss": 0.2243, |
|
"step": 1191 |
|
}, |
|
{ |
|
"epoch": 0.34417280703334713, |
|
"grad_norm": 2.928727626800537, |
|
"learning_rate": 6.904400606980274e-06, |
|
"loss": 0.2296, |
|
"step": 1194 |
|
}, |
|
{ |
|
"epoch": 0.3450375628299133, |
|
"grad_norm": 2.5598068237304688, |
|
"learning_rate": 6.895295902883157e-06, |
|
"loss": 0.2135, |
|
"step": 1197 |
|
}, |
|
{ |
|
"epoch": 0.3459023186264795, |
|
"grad_norm": 2.4700326919555664, |
|
"learning_rate": 6.88619119878604e-06, |
|
"loss": 0.2319, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3459023186264795, |
|
"eval_loss": 0.21415139734745026, |
|
"eval_mse": 0.2141513990436215, |
|
"eval_runtime": 6.5018, |
|
"eval_samples_per_second": 153.803, |
|
"eval_steps_per_second": 19.225, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3467670744230457, |
|
"grad_norm": 3.6825876235961914, |
|
"learning_rate": 6.8770864946889225e-06, |
|
"loss": 0.2409, |
|
"step": 1203 |
|
}, |
|
{ |
|
"epoch": 0.3476318302196119, |
|
"grad_norm": 2.8780245780944824, |
|
"learning_rate": 6.867981790591807e-06, |
|
"loss": 0.2219, |
|
"step": 1206 |
|
}, |
|
{ |
|
"epoch": 0.34849658601617817, |
|
"grad_norm": 2.641505479812622, |
|
"learning_rate": 6.85887708649469e-06, |
|
"loss": 0.2315, |
|
"step": 1209 |
|
}, |
|
{ |
|
"epoch": 0.34936134181274436, |
|
"grad_norm": 1.660909652709961, |
|
"learning_rate": 6.849772382397573e-06, |
|
"loss": 0.224, |
|
"step": 1212 |
|
}, |
|
{ |
|
"epoch": 0.35022609760931056, |
|
"grad_norm": 5.104085445404053, |
|
"learning_rate": 6.8406676783004556e-06, |
|
"loss": 0.2184, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.35109085340587676, |
|
"grad_norm": 1.870938777923584, |
|
"learning_rate": 6.831562974203338e-06, |
|
"loss": 0.234, |
|
"step": 1218 |
|
}, |
|
{ |
|
"epoch": 0.35195560920244295, |
|
"grad_norm": 4.542322635650635, |
|
"learning_rate": 6.822458270106223e-06, |
|
"loss": 0.2247, |
|
"step": 1221 |
|
}, |
|
{ |
|
"epoch": 0.35282036499900915, |
|
"grad_norm": 1.4236701726913452, |
|
"learning_rate": 6.813353566009106e-06, |
|
"loss": 0.2168, |
|
"step": 1224 |
|
}, |
|
{ |
|
"epoch": 0.35368512079557535, |
|
"grad_norm": 2.0418410301208496, |
|
"learning_rate": 6.804248861911989e-06, |
|
"loss": 0.2425, |
|
"step": 1227 |
|
}, |
|
{ |
|
"epoch": 0.35454987659214154, |
|
"grad_norm": 2.868399143218994, |
|
"learning_rate": 6.7951441578148715e-06, |
|
"loss": 0.2354, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.35541463238870774, |
|
"grad_norm": 2.453361749649048, |
|
"learning_rate": 6.786039453717754e-06, |
|
"loss": 0.2266, |
|
"step": 1233 |
|
}, |
|
{ |
|
"epoch": 0.35627938818527394, |
|
"grad_norm": 2.0826542377471924, |
|
"learning_rate": 6.776934749620638e-06, |
|
"loss": 0.2194, |
|
"step": 1236 |
|
}, |
|
{ |
|
"epoch": 0.35714414398184013, |
|
"grad_norm": 1.511440634727478, |
|
"learning_rate": 6.767830045523522e-06, |
|
"loss": 0.2264, |
|
"step": 1239 |
|
}, |
|
{ |
|
"epoch": 0.35800889977840633, |
|
"grad_norm": 2.456897020339966, |
|
"learning_rate": 6.7587253414264045e-06, |
|
"loss": 0.2864, |
|
"step": 1242 |
|
}, |
|
{ |
|
"epoch": 0.3588736555749725, |
|
"grad_norm": 2.873429298400879, |
|
"learning_rate": 6.749620637329287e-06, |
|
"loss": 0.2055, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.3597384113715387, |
|
"grad_norm": 1.7501113414764404, |
|
"learning_rate": 6.74051593323217e-06, |
|
"loss": 0.2205, |
|
"step": 1248 |
|
}, |
|
{ |
|
"epoch": 0.3606031671681049, |
|
"grad_norm": 1.615105390548706, |
|
"learning_rate": 6.731411229135054e-06, |
|
"loss": 0.2337, |
|
"step": 1251 |
|
}, |
|
{ |
|
"epoch": 0.3614679229646711, |
|
"grad_norm": 1.6872748136520386, |
|
"learning_rate": 6.722306525037937e-06, |
|
"loss": 0.2164, |
|
"step": 1254 |
|
}, |
|
{ |
|
"epoch": 0.3623326787612373, |
|
"grad_norm": 3.1001312732696533, |
|
"learning_rate": 6.7132018209408204e-06, |
|
"loss": 0.2154, |
|
"step": 1257 |
|
}, |
|
{ |
|
"epoch": 0.3631974345578035, |
|
"grad_norm": 2.1165292263031006, |
|
"learning_rate": 6.704097116843703e-06, |
|
"loss": 0.2512, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.3640621903543697, |
|
"grad_norm": 4.326318740844727, |
|
"learning_rate": 6.694992412746586e-06, |
|
"loss": 0.2471, |
|
"step": 1263 |
|
}, |
|
{ |
|
"epoch": 0.3649269461509359, |
|
"grad_norm": 5.773290634155273, |
|
"learning_rate": 6.685887708649469e-06, |
|
"loss": 0.2407, |
|
"step": 1266 |
|
}, |
|
{ |
|
"epoch": 0.3657917019475021, |
|
"grad_norm": 3.2723119258880615, |
|
"learning_rate": 6.676783004552353e-06, |
|
"loss": 0.3129, |
|
"step": 1269 |
|
}, |
|
{ |
|
"epoch": 0.3666564577440683, |
|
"grad_norm": 2.927086114883423, |
|
"learning_rate": 6.6676783004552355e-06, |
|
"loss": 0.231, |
|
"step": 1272 |
|
}, |
|
{ |
|
"epoch": 0.3675212135406345, |
|
"grad_norm": 1.7322252988815308, |
|
"learning_rate": 6.658573596358119e-06, |
|
"loss": 0.2504, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.3683859693372007, |
|
"grad_norm": 2.5904715061187744, |
|
"learning_rate": 6.649468892261002e-06, |
|
"loss": 0.2079, |
|
"step": 1278 |
|
}, |
|
{ |
|
"epoch": 0.3692507251337669, |
|
"grad_norm": 2.6561062335968018, |
|
"learning_rate": 6.640364188163885e-06, |
|
"loss": 0.2423, |
|
"step": 1281 |
|
}, |
|
{ |
|
"epoch": 0.3701154809303331, |
|
"grad_norm": 3.3299241065979004, |
|
"learning_rate": 6.6312594840667686e-06, |
|
"loss": 0.2386, |
|
"step": 1284 |
|
}, |
|
{ |
|
"epoch": 0.37098023672689934, |
|
"grad_norm": 1.731477975845337, |
|
"learning_rate": 6.622154779969651e-06, |
|
"loss": 0.2017, |
|
"step": 1287 |
|
}, |
|
{ |
|
"epoch": 0.37184499252346553, |
|
"grad_norm": 2.5077965259552, |
|
"learning_rate": 6.613050075872534e-06, |
|
"loss": 0.2446, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.37270974832003173, |
|
"grad_norm": 2.36556077003479, |
|
"learning_rate": 6.603945371775418e-06, |
|
"loss": 0.2544, |
|
"step": 1293 |
|
}, |
|
{ |
|
"epoch": 0.3735745041165979, |
|
"grad_norm": 1.9789232015609741, |
|
"learning_rate": 6.594840667678301e-06, |
|
"loss": 0.2247, |
|
"step": 1296 |
|
}, |
|
{ |
|
"epoch": 0.3744392599131641, |
|
"grad_norm": 1.7310489416122437, |
|
"learning_rate": 6.5857359635811845e-06, |
|
"loss": 0.1983, |
|
"step": 1299 |
|
}, |
|
{ |
|
"epoch": 0.37472751184535286, |
|
"eval_loss": 0.2175411880016327, |
|
"eval_mse": 0.21754117820138344, |
|
"eval_runtime": 7.1272, |
|
"eval_samples_per_second": 140.308, |
|
"eval_steps_per_second": 17.538, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.3753040157097303, |
|
"grad_norm": 4.152148723602295, |
|
"learning_rate": 6.576631259484067e-06, |
|
"loss": 0.2899, |
|
"step": 1302 |
|
}, |
|
{ |
|
"epoch": 0.3761687715062965, |
|
"grad_norm": 2.1581296920776367, |
|
"learning_rate": 6.56752655538695e-06, |
|
"loss": 0.2448, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.3770335273028627, |
|
"grad_norm": 2.8600330352783203, |
|
"learning_rate": 6.558421851289834e-06, |
|
"loss": 0.2084, |
|
"step": 1308 |
|
}, |
|
{ |
|
"epoch": 0.3778982830994289, |
|
"grad_norm": 1.5723686218261719, |
|
"learning_rate": 6.549317147192717e-06, |
|
"loss": 0.2441, |
|
"step": 1311 |
|
}, |
|
{ |
|
"epoch": 0.3787630388959951, |
|
"grad_norm": 3.4149153232574463, |
|
"learning_rate": 6.5402124430956e-06, |
|
"loss": 0.247, |
|
"step": 1314 |
|
}, |
|
{ |
|
"epoch": 0.3796277946925613, |
|
"grad_norm": 16.35414695739746, |
|
"learning_rate": 6.531107738998483e-06, |
|
"loss": 0.2478, |
|
"step": 1317 |
|
}, |
|
{ |
|
"epoch": 0.3804925504891275, |
|
"grad_norm": 6.227761268615723, |
|
"learning_rate": 6.522003034901366e-06, |
|
"loss": 0.2316, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.3813573062856937, |
|
"grad_norm": 2.2669637203216553, |
|
"learning_rate": 6.512898330804249e-06, |
|
"loss": 0.2836, |
|
"step": 1323 |
|
}, |
|
{ |
|
"epoch": 0.3822220620822599, |
|
"grad_norm": 1.4385027885437012, |
|
"learning_rate": 6.503793626707133e-06, |
|
"loss": 0.2122, |
|
"step": 1326 |
|
}, |
|
{ |
|
"epoch": 0.3830868178788261, |
|
"grad_norm": 2.3909130096435547, |
|
"learning_rate": 6.4946889226100154e-06, |
|
"loss": 0.2357, |
|
"step": 1329 |
|
}, |
|
{ |
|
"epoch": 0.3839515736753923, |
|
"grad_norm": 1.7610464096069336, |
|
"learning_rate": 6.485584218512899e-06, |
|
"loss": 0.2746, |
|
"step": 1332 |
|
}, |
|
{ |
|
"epoch": 0.3848163294719585, |
|
"grad_norm": 2.8983278274536133, |
|
"learning_rate": 6.476479514415782e-06, |
|
"loss": 0.2176, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.3856810852685247, |
|
"grad_norm": 1.7231597900390625, |
|
"learning_rate": 6.467374810318665e-06, |
|
"loss": 0.2065, |
|
"step": 1338 |
|
}, |
|
{ |
|
"epoch": 0.3865458410650909, |
|
"grad_norm": 1.6913188695907593, |
|
"learning_rate": 6.458270106221548e-06, |
|
"loss": 0.2541, |
|
"step": 1341 |
|
}, |
|
{ |
|
"epoch": 0.3874105968616571, |
|
"grad_norm": 2.1574645042419434, |
|
"learning_rate": 6.449165402124431e-06, |
|
"loss": 0.2393, |
|
"step": 1344 |
|
}, |
|
{ |
|
"epoch": 0.38827535265822327, |
|
"grad_norm": 3.8315231800079346, |
|
"learning_rate": 6.440060698027315e-06, |
|
"loss": 0.2353, |
|
"step": 1347 |
|
}, |
|
{ |
|
"epoch": 0.38914010845478947, |
|
"grad_norm": 2.655545711517334, |
|
"learning_rate": 6.430955993930198e-06, |
|
"loss": 0.2222, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.39000486425135567, |
|
"grad_norm": 2.0346477031707764, |
|
"learning_rate": 6.421851289833081e-06, |
|
"loss": 0.2289, |
|
"step": 1353 |
|
}, |
|
{ |
|
"epoch": 0.39086962004792186, |
|
"grad_norm": 4.726449489593506, |
|
"learning_rate": 6.4127465857359636e-06, |
|
"loss": 0.2748, |
|
"step": 1356 |
|
}, |
|
{ |
|
"epoch": 0.39173437584448806, |
|
"grad_norm": 3.0121731758117676, |
|
"learning_rate": 6.403641881638846e-06, |
|
"loss": 0.2244, |
|
"step": 1359 |
|
}, |
|
{ |
|
"epoch": 0.39259913164105426, |
|
"grad_norm": 6.562178611755371, |
|
"learning_rate": 6.394537177541731e-06, |
|
"loss": 0.264, |
|
"step": 1362 |
|
}, |
|
{ |
|
"epoch": 0.3934638874376205, |
|
"grad_norm": 3.1092755794525146, |
|
"learning_rate": 6.385432473444614e-06, |
|
"loss": 0.228, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.3943286432341867, |
|
"grad_norm": 4.431830406188965, |
|
"learning_rate": 6.376327769347497e-06, |
|
"loss": 0.2544, |
|
"step": 1368 |
|
}, |
|
{ |
|
"epoch": 0.3951933990307529, |
|
"grad_norm": 2.545694351196289, |
|
"learning_rate": 6.3672230652503795e-06, |
|
"loss": 0.2618, |
|
"step": 1371 |
|
}, |
|
{ |
|
"epoch": 0.3960581548273191, |
|
"grad_norm": 1.5483859777450562, |
|
"learning_rate": 6.358118361153262e-06, |
|
"loss": 0.282, |
|
"step": 1374 |
|
}, |
|
{ |
|
"epoch": 0.3969229106238853, |
|
"grad_norm": 1.750784993171692, |
|
"learning_rate": 6.349013657056147e-06, |
|
"loss": 0.2447, |
|
"step": 1377 |
|
}, |
|
{ |
|
"epoch": 0.3977876664204515, |
|
"grad_norm": 2.440020799636841, |
|
"learning_rate": 6.33990895295903e-06, |
|
"loss": 0.2472, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.3986524222170177, |
|
"grad_norm": 1.4584132432937622, |
|
"learning_rate": 6.3308042488619125e-06, |
|
"loss": 0.2469, |
|
"step": 1383 |
|
}, |
|
{ |
|
"epoch": 0.3995171780135839, |
|
"grad_norm": 1.3845562934875488, |
|
"learning_rate": 6.321699544764795e-06, |
|
"loss": 0.2111, |
|
"step": 1386 |
|
}, |
|
{ |
|
"epoch": 0.4003819338101501, |
|
"grad_norm": 1.736345648765564, |
|
"learning_rate": 6.312594840667678e-06, |
|
"loss": 0.2203, |
|
"step": 1389 |
|
}, |
|
{ |
|
"epoch": 0.4012466896067163, |
|
"grad_norm": 2.5879809856414795, |
|
"learning_rate": 6.303490136570563e-06, |
|
"loss": 0.2776, |
|
"step": 1392 |
|
}, |
|
{ |
|
"epoch": 0.4021114454032825, |
|
"grad_norm": 1.3224149942398071, |
|
"learning_rate": 6.294385432473446e-06, |
|
"loss": 0.2016, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.40297620119984867, |
|
"grad_norm": 2.7825825214385986, |
|
"learning_rate": 6.2852807283763284e-06, |
|
"loss": 0.2264, |
|
"step": 1398 |
|
}, |
|
{ |
|
"epoch": 0.40355270506422614, |
|
"eval_loss": 0.23064111173152924, |
|
"eval_mse": 0.2306411211611703, |
|
"eval_runtime": 6.653, |
|
"eval_samples_per_second": 150.308, |
|
"eval_steps_per_second": 18.789, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.40384095699641487, |
|
"grad_norm": 3.059859275817871, |
|
"learning_rate": 6.276176024279211e-06, |
|
"loss": 0.1952, |
|
"step": 1401 |
|
}, |
|
{ |
|
"epoch": 0.40470571279298106, |
|
"grad_norm": 2.1431009769439697, |
|
"learning_rate": 6.267071320182094e-06, |
|
"loss": 0.2681, |
|
"step": 1404 |
|
}, |
|
{ |
|
"epoch": 0.40557046858954726, |
|
"grad_norm": 1.6716617345809937, |
|
"learning_rate": 6.257966616084977e-06, |
|
"loss": 0.221, |
|
"step": 1407 |
|
}, |
|
{ |
|
"epoch": 0.40643522438611346, |
|
"grad_norm": 1.9646525382995605, |
|
"learning_rate": 6.2488619119878615e-06, |
|
"loss": 0.2218, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.40729998018267966, |
|
"grad_norm": 1.2912189960479736, |
|
"learning_rate": 6.239757207890744e-06, |
|
"loss": 0.2077, |
|
"step": 1413 |
|
}, |
|
{ |
|
"epoch": 0.40816473597924585, |
|
"grad_norm": 2.4723434448242188, |
|
"learning_rate": 6.230652503793627e-06, |
|
"loss": 0.2629, |
|
"step": 1416 |
|
}, |
|
{ |
|
"epoch": 0.40902949177581205, |
|
"grad_norm": 2.1053199768066406, |
|
"learning_rate": 6.22154779969651e-06, |
|
"loss": 0.2524, |
|
"step": 1419 |
|
}, |
|
{ |
|
"epoch": 0.40989424757237825, |
|
"grad_norm": 2.039580821990967, |
|
"learning_rate": 6.212443095599393e-06, |
|
"loss": 0.2035, |
|
"step": 1422 |
|
}, |
|
{ |
|
"epoch": 0.41075900336894444, |
|
"grad_norm": 1.499022364616394, |
|
"learning_rate": 6.203338391502277e-06, |
|
"loss": 0.2263, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.41162375916551064, |
|
"grad_norm": 2.090580701828003, |
|
"learning_rate": 6.19423368740516e-06, |
|
"loss": 0.2834, |
|
"step": 1428 |
|
}, |
|
{ |
|
"epoch": 0.41248851496207684, |
|
"grad_norm": 2.0547232627868652, |
|
"learning_rate": 6.185128983308043e-06, |
|
"loss": 0.2725, |
|
"step": 1431 |
|
}, |
|
{ |
|
"epoch": 0.41335327075864303, |
|
"grad_norm": 1.8254441022872925, |
|
"learning_rate": 6.176024279210926e-06, |
|
"loss": 0.2234, |
|
"step": 1434 |
|
}, |
|
{ |
|
"epoch": 0.41421802655520923, |
|
"grad_norm": 1.860533595085144, |
|
"learning_rate": 6.166919575113809e-06, |
|
"loss": 0.2554, |
|
"step": 1437 |
|
}, |
|
{ |
|
"epoch": 0.4150827823517754, |
|
"grad_norm": 3.225929021835327, |
|
"learning_rate": 6.157814871016693e-06, |
|
"loss": 0.2784, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.4159475381483417, |
|
"grad_norm": 2.2769436836242676, |
|
"learning_rate": 6.148710166919576e-06, |
|
"loss": 0.2261, |
|
"step": 1443 |
|
}, |
|
{ |
|
"epoch": 0.4168122939449079, |
|
"grad_norm": 1.6467565298080444, |
|
"learning_rate": 6.139605462822459e-06, |
|
"loss": 0.2702, |
|
"step": 1446 |
|
}, |
|
{ |
|
"epoch": 0.41767704974147407, |
|
"grad_norm": 3.0362329483032227, |
|
"learning_rate": 6.130500758725342e-06, |
|
"loss": 0.2348, |
|
"step": 1449 |
|
}, |
|
{ |
|
"epoch": 0.41854180553804027, |
|
"grad_norm": 1.8852200508117676, |
|
"learning_rate": 6.121396054628225e-06, |
|
"loss": 0.2222, |
|
"step": 1452 |
|
}, |
|
{ |
|
"epoch": 0.41940656133460646, |
|
"grad_norm": 2.119568109512329, |
|
"learning_rate": 6.112291350531108e-06, |
|
"loss": 0.226, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.42027131713117266, |
|
"grad_norm": 2.534950017929077, |
|
"learning_rate": 6.103186646433992e-06, |
|
"loss": 0.2353, |
|
"step": 1458 |
|
}, |
|
{ |
|
"epoch": 0.42113607292773886, |
|
"grad_norm": 3.6363894939422607, |
|
"learning_rate": 6.094081942336875e-06, |
|
"loss": 0.2724, |
|
"step": 1461 |
|
}, |
|
{ |
|
"epoch": 0.42200082872430505, |
|
"grad_norm": 1.8480486869812012, |
|
"learning_rate": 6.084977238239758e-06, |
|
"loss": 0.2417, |
|
"step": 1464 |
|
}, |
|
{ |
|
"epoch": 0.42286558452087125, |
|
"grad_norm": 4.110941410064697, |
|
"learning_rate": 6.075872534142641e-06, |
|
"loss": 0.2061, |
|
"step": 1467 |
|
}, |
|
{ |
|
"epoch": 0.42373034031743745, |
|
"grad_norm": 2.7998435497283936, |
|
"learning_rate": 6.0667678300455234e-06, |
|
"loss": 0.231, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.42459509611400365, |
|
"grad_norm": 1.7628705501556396, |
|
"learning_rate": 6.057663125948408e-06, |
|
"loss": 0.2193, |
|
"step": 1473 |
|
}, |
|
{ |
|
"epoch": 0.42545985191056984, |
|
"grad_norm": 2.7976937294006348, |
|
"learning_rate": 6.048558421851291e-06, |
|
"loss": 0.2361, |
|
"step": 1476 |
|
}, |
|
{ |
|
"epoch": 0.42632460770713604, |
|
"grad_norm": 2.4593019485473633, |
|
"learning_rate": 6.039453717754174e-06, |
|
"loss": 0.2494, |
|
"step": 1479 |
|
}, |
|
{ |
|
"epoch": 0.42718936350370224, |
|
"grad_norm": 2.5946741104125977, |
|
"learning_rate": 6.0303490136570565e-06, |
|
"loss": 0.2279, |
|
"step": 1482 |
|
}, |
|
{ |
|
"epoch": 0.42805411930026843, |
|
"grad_norm": 1.6827466487884521, |
|
"learning_rate": 6.021244309559939e-06, |
|
"loss": 0.2458, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.42891887509683463, |
|
"grad_norm": 4.625283241271973, |
|
"learning_rate": 6.012139605462823e-06, |
|
"loss": 0.2707, |
|
"step": 1488 |
|
}, |
|
{ |
|
"epoch": 0.4297836308934008, |
|
"grad_norm": 2.733687400817871, |
|
"learning_rate": 6.003034901365707e-06, |
|
"loss": 0.2294, |
|
"step": 1491 |
|
}, |
|
{ |
|
"epoch": 0.430648386689967, |
|
"grad_norm": 1.38575279712677, |
|
"learning_rate": 5.9939301972685896e-06, |
|
"loss": 0.1992, |
|
"step": 1494 |
|
}, |
|
{ |
|
"epoch": 0.4315131424865332, |
|
"grad_norm": 1.9684631824493408, |
|
"learning_rate": 5.984825493171472e-06, |
|
"loss": 0.2118, |
|
"step": 1497 |
|
}, |
|
{ |
|
"epoch": 0.4323778982830994, |
|
"grad_norm": 3.40984845161438, |
|
"learning_rate": 5.975720789074355e-06, |
|
"loss": 0.2175, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.4323778982830994, |
|
"eval_loss": 0.2375136762857437, |
|
"eval_mse": 0.23751368772797288, |
|
"eval_runtime": 6.7445, |
|
"eval_samples_per_second": 148.27, |
|
"eval_steps_per_second": 18.534, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.4332426540796656, |
|
"grad_norm": 2.3788678646087646, |
|
"learning_rate": 5.966616084977239e-06, |
|
"loss": 0.2311, |
|
"step": 1503 |
|
}, |
|
{ |
|
"epoch": 0.4341074098762318, |
|
"grad_norm": 4.027227401733398, |
|
"learning_rate": 5.957511380880122e-06, |
|
"loss": 0.2456, |
|
"step": 1506 |
|
}, |
|
{ |
|
"epoch": 0.434972165672798, |
|
"grad_norm": 5.0818586349487305, |
|
"learning_rate": 5.9484066767830055e-06, |
|
"loss": 0.2631, |
|
"step": 1509 |
|
}, |
|
{ |
|
"epoch": 0.4358369214693642, |
|
"grad_norm": 4.373122215270996, |
|
"learning_rate": 5.939301972685888e-06, |
|
"loss": 0.243, |
|
"step": 1512 |
|
}, |
|
{ |
|
"epoch": 0.4367016772659304, |
|
"grad_norm": 3.31792950630188, |
|
"learning_rate": 5.930197268588771e-06, |
|
"loss": 0.2278, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.4375664330624966, |
|
"grad_norm": 1.9427984952926636, |
|
"learning_rate": 5.921092564491655e-06, |
|
"loss": 0.2578, |
|
"step": 1518 |
|
}, |
|
{ |
|
"epoch": 0.43843118885906285, |
|
"grad_norm": 2.5355935096740723, |
|
"learning_rate": 5.911987860394538e-06, |
|
"loss": 0.2125, |
|
"step": 1521 |
|
}, |
|
{ |
|
"epoch": 0.43929594465562904, |
|
"grad_norm": 5.661628723144531, |
|
"learning_rate": 5.9028831562974205e-06, |
|
"loss": 0.2591, |
|
"step": 1524 |
|
}, |
|
{ |
|
"epoch": 0.44016070045219524, |
|
"grad_norm": 2.133945941925049, |
|
"learning_rate": 5.893778452200304e-06, |
|
"loss": 0.2577, |
|
"step": 1527 |
|
}, |
|
{ |
|
"epoch": 0.44102545624876144, |
|
"grad_norm": 2.8841874599456787, |
|
"learning_rate": 5.884673748103187e-06, |
|
"loss": 0.2344, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.44189021204532763, |
|
"grad_norm": 1.6562261581420898, |
|
"learning_rate": 5.87556904400607e-06, |
|
"loss": 0.2133, |
|
"step": 1533 |
|
}, |
|
{ |
|
"epoch": 0.44275496784189383, |
|
"grad_norm": 3.133864164352417, |
|
"learning_rate": 5.866464339908954e-06, |
|
"loss": 0.2154, |
|
"step": 1536 |
|
}, |
|
{ |
|
"epoch": 0.44361972363846003, |
|
"grad_norm": 1.9966986179351807, |
|
"learning_rate": 5.8573596358118364e-06, |
|
"loss": 0.1964, |
|
"step": 1539 |
|
}, |
|
{ |
|
"epoch": 0.4444844794350262, |
|
"grad_norm": 1.9703294038772583, |
|
"learning_rate": 5.848254931714719e-06, |
|
"loss": 0.2238, |
|
"step": 1542 |
|
}, |
|
{ |
|
"epoch": 0.4453492352315924, |
|
"grad_norm": 3.2984211444854736, |
|
"learning_rate": 5.839150227617603e-06, |
|
"loss": 0.2628, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.4462139910281586, |
|
"grad_norm": 3.6368560791015625, |
|
"learning_rate": 5.830045523520486e-06, |
|
"loss": 0.2567, |
|
"step": 1548 |
|
}, |
|
{ |
|
"epoch": 0.4470787468247248, |
|
"grad_norm": 2.366480827331543, |
|
"learning_rate": 5.8209408194233695e-06, |
|
"loss": 0.2493, |
|
"step": 1551 |
|
}, |
|
{ |
|
"epoch": 0.447943502621291, |
|
"grad_norm": 3.3239293098449707, |
|
"learning_rate": 5.811836115326252e-06, |
|
"loss": 0.2268, |
|
"step": 1554 |
|
}, |
|
{ |
|
"epoch": 0.4488082584178572, |
|
"grad_norm": 4.618416786193848, |
|
"learning_rate": 5.802731411229135e-06, |
|
"loss": 0.2456, |
|
"step": 1557 |
|
}, |
|
{ |
|
"epoch": 0.4496730142144234, |
|
"grad_norm": 2.826070785522461, |
|
"learning_rate": 5.793626707132019e-06, |
|
"loss": 0.2206, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.4505377700109896, |
|
"grad_norm": 4.012238025665283, |
|
"learning_rate": 5.784522003034902e-06, |
|
"loss": 0.2157, |
|
"step": 1563 |
|
}, |
|
{ |
|
"epoch": 0.4514025258075558, |
|
"grad_norm": 2.0067975521087646, |
|
"learning_rate": 5.775417298937785e-06, |
|
"loss": 0.2489, |
|
"step": 1566 |
|
}, |
|
{ |
|
"epoch": 0.452267281604122, |
|
"grad_norm": 1.500907301902771, |
|
"learning_rate": 5.766312594840668e-06, |
|
"loss": 0.2348, |
|
"step": 1569 |
|
}, |
|
{ |
|
"epoch": 0.4531320374006882, |
|
"grad_norm": 1.8496533632278442, |
|
"learning_rate": 5.757207890743551e-06, |
|
"loss": 0.2019, |
|
"step": 1572 |
|
}, |
|
{ |
|
"epoch": 0.4539967931972544, |
|
"grad_norm": 3.222740650177002, |
|
"learning_rate": 5.748103186646434e-06, |
|
"loss": 0.2587, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.4548615489938206, |
|
"grad_norm": 1.572925329208374, |
|
"learning_rate": 5.738998482549318e-06, |
|
"loss": 0.2265, |
|
"step": 1578 |
|
}, |
|
{ |
|
"epoch": 0.4557263047903868, |
|
"grad_norm": 1.603624701499939, |
|
"learning_rate": 5.729893778452201e-06, |
|
"loss": 0.2433, |
|
"step": 1581 |
|
}, |
|
{ |
|
"epoch": 0.456591060586953, |
|
"grad_norm": 3.0979552268981934, |
|
"learning_rate": 5.720789074355084e-06, |
|
"loss": 0.221, |
|
"step": 1584 |
|
}, |
|
{ |
|
"epoch": 0.4574558163835192, |
|
"grad_norm": 1.8691914081573486, |
|
"learning_rate": 5.711684370257967e-06, |
|
"loss": 0.2083, |
|
"step": 1587 |
|
}, |
|
{ |
|
"epoch": 0.4583205721800854, |
|
"grad_norm": 1.6643935441970825, |
|
"learning_rate": 5.70257966616085e-06, |
|
"loss": 0.2173, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.45918532797665157, |
|
"grad_norm": 3.626629590988159, |
|
"learning_rate": 5.693474962063733e-06, |
|
"loss": 0.2256, |
|
"step": 1593 |
|
}, |
|
{ |
|
"epoch": 0.46005008377321777, |
|
"grad_norm": 1.6160587072372437, |
|
"learning_rate": 5.684370257966616e-06, |
|
"loss": 0.2197, |
|
"step": 1596 |
|
}, |
|
{ |
|
"epoch": 0.460914839569784, |
|
"grad_norm": 3.206094264984131, |
|
"learning_rate": 5.6752655538695e-06, |
|
"loss": 0.2461, |
|
"step": 1599 |
|
}, |
|
{ |
|
"epoch": 0.46120309150197275, |
|
"eval_loss": 0.24926815927028656, |
|
"eval_mse": 0.24926817585621028, |
|
"eval_runtime": 6.5749, |
|
"eval_samples_per_second": 152.094, |
|
"eval_steps_per_second": 19.012, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.4617795953663502, |
|
"grad_norm": 4.698122024536133, |
|
"learning_rate": 5.666160849772383e-06, |
|
"loss": 0.2546, |
|
"step": 1602 |
|
}, |
|
{ |
|
"epoch": 0.4626443511629164, |
|
"grad_norm": 1.5181471109390259, |
|
"learning_rate": 5.657056145675266e-06, |
|
"loss": 0.2167, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.4635091069594826, |
|
"grad_norm": 7.337409496307373, |
|
"learning_rate": 5.647951441578149e-06, |
|
"loss": 0.2435, |
|
"step": 1608 |
|
}, |
|
{ |
|
"epoch": 0.4643738627560488, |
|
"grad_norm": 1.6400500535964966, |
|
"learning_rate": 5.6388467374810314e-06, |
|
"loss": 0.2336, |
|
"step": 1611 |
|
}, |
|
{ |
|
"epoch": 0.465238618552615, |
|
"grad_norm": 1.7364429235458374, |
|
"learning_rate": 5.629742033383916e-06, |
|
"loss": 0.2397, |
|
"step": 1614 |
|
}, |
|
{ |
|
"epoch": 0.4661033743491812, |
|
"grad_norm": 2.7070679664611816, |
|
"learning_rate": 5.620637329286799e-06, |
|
"loss": 0.2389, |
|
"step": 1617 |
|
}, |
|
{ |
|
"epoch": 0.4669681301457474, |
|
"grad_norm": 1.28640878200531, |
|
"learning_rate": 5.611532625189682e-06, |
|
"loss": 0.202, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.4678328859423136, |
|
"grad_norm": 2.6867973804473877, |
|
"learning_rate": 5.6024279210925645e-06, |
|
"loss": 0.2501, |
|
"step": 1623 |
|
}, |
|
{ |
|
"epoch": 0.4686976417388798, |
|
"grad_norm": 2.1441292762756348, |
|
"learning_rate": 5.593323216995447e-06, |
|
"loss": 0.2364, |
|
"step": 1626 |
|
}, |
|
{ |
|
"epoch": 0.469562397535446, |
|
"grad_norm": 1.6109544038772583, |
|
"learning_rate": 5.584218512898332e-06, |
|
"loss": 0.244, |
|
"step": 1629 |
|
}, |
|
{ |
|
"epoch": 0.4704271533320122, |
|
"grad_norm": 2.0842268466949463, |
|
"learning_rate": 5.575113808801215e-06, |
|
"loss": 0.2434, |
|
"step": 1632 |
|
}, |
|
{ |
|
"epoch": 0.4712919091285784, |
|
"grad_norm": 1.4527335166931152, |
|
"learning_rate": 5.5660091047040976e-06, |
|
"loss": 0.2113, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.4721566649251446, |
|
"grad_norm": 2.0434927940368652, |
|
"learning_rate": 5.55690440060698e-06, |
|
"loss": 0.2404, |
|
"step": 1638 |
|
}, |
|
{ |
|
"epoch": 0.4730214207217108, |
|
"grad_norm": 3.0256736278533936, |
|
"learning_rate": 5.547799696509863e-06, |
|
"loss": 0.2652, |
|
"step": 1641 |
|
}, |
|
{ |
|
"epoch": 0.47388617651827697, |
|
"grad_norm": 2.3856120109558105, |
|
"learning_rate": 5.538694992412748e-06, |
|
"loss": 0.2156, |
|
"step": 1644 |
|
}, |
|
{ |
|
"epoch": 0.47475093231484317, |
|
"grad_norm": 2.0948779582977295, |
|
"learning_rate": 5.529590288315631e-06, |
|
"loss": 0.2245, |
|
"step": 1647 |
|
}, |
|
{ |
|
"epoch": 0.47561568811140936, |
|
"grad_norm": 1.7975860834121704, |
|
"learning_rate": 5.5204855842185135e-06, |
|
"loss": 0.2542, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.47648044390797556, |
|
"grad_norm": 3.510812997817993, |
|
"learning_rate": 5.511380880121396e-06, |
|
"loss": 0.2288, |
|
"step": 1653 |
|
}, |
|
{ |
|
"epoch": 0.47734519970454176, |
|
"grad_norm": 3.4858286380767822, |
|
"learning_rate": 5.502276176024279e-06, |
|
"loss": 0.2497, |
|
"step": 1656 |
|
}, |
|
{ |
|
"epoch": 0.47820995550110795, |
|
"grad_norm": 2.605661630630493, |
|
"learning_rate": 5.493171471927162e-06, |
|
"loss": 0.2062, |
|
"step": 1659 |
|
}, |
|
{ |
|
"epoch": 0.47907471129767415, |
|
"grad_norm": 2.162203788757324, |
|
"learning_rate": 5.4840667678300465e-06, |
|
"loss": 0.2167, |
|
"step": 1662 |
|
}, |
|
{ |
|
"epoch": 0.47993946709424035, |
|
"grad_norm": 4.7574262619018555, |
|
"learning_rate": 5.474962063732929e-06, |
|
"loss": 0.2088, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.48080422289080654, |
|
"grad_norm": 2.1833505630493164, |
|
"learning_rate": 5.465857359635812e-06, |
|
"loss": 0.2622, |
|
"step": 1668 |
|
}, |
|
{ |
|
"epoch": 0.48166897868737274, |
|
"grad_norm": 1.645045280456543, |
|
"learning_rate": 5.456752655538695e-06, |
|
"loss": 0.2164, |
|
"step": 1671 |
|
}, |
|
{ |
|
"epoch": 0.48253373448393894, |
|
"grad_norm": 2.270944356918335, |
|
"learning_rate": 5.447647951441578e-06, |
|
"loss": 0.2441, |
|
"step": 1674 |
|
}, |
|
{ |
|
"epoch": 0.4833984902805052, |
|
"grad_norm": 2.0133309364318848, |
|
"learning_rate": 5.4385432473444624e-06, |
|
"loss": 0.2032, |
|
"step": 1677 |
|
}, |
|
{ |
|
"epoch": 0.4842632460770714, |
|
"grad_norm": 2.5585310459136963, |
|
"learning_rate": 5.429438543247345e-06, |
|
"loss": 0.2385, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.4851280018736376, |
|
"grad_norm": 1.7377713918685913, |
|
"learning_rate": 5.420333839150228e-06, |
|
"loss": 0.216, |
|
"step": 1683 |
|
}, |
|
{ |
|
"epoch": 0.4859927576702038, |
|
"grad_norm": 1.8322491645812988, |
|
"learning_rate": 5.411229135053111e-06, |
|
"loss": 0.2102, |
|
"step": 1686 |
|
}, |
|
{ |
|
"epoch": 0.48685751346677, |
|
"grad_norm": 1.8956815004348755, |
|
"learning_rate": 5.402124430955994e-06, |
|
"loss": 0.2561, |
|
"step": 1689 |
|
}, |
|
{ |
|
"epoch": 0.4877222692633362, |
|
"grad_norm": 1.5535213947296143, |
|
"learning_rate": 5.393019726858878e-06, |
|
"loss": 0.2099, |
|
"step": 1692 |
|
}, |
|
{ |
|
"epoch": 0.48858702505990237, |
|
"grad_norm": 2.924278974533081, |
|
"learning_rate": 5.383915022761761e-06, |
|
"loss": 0.2662, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.48945178085646857, |
|
"grad_norm": 2.1653637886047363, |
|
"learning_rate": 5.374810318664644e-06, |
|
"loss": 0.2419, |
|
"step": 1698 |
|
}, |
|
{ |
|
"epoch": 0.49002828472084603, |
|
"eval_loss": 0.22344937920570374, |
|
"eval_mse": 0.22344938813522458, |
|
"eval_runtime": 6.5992, |
|
"eval_samples_per_second": 151.533, |
|
"eval_steps_per_second": 18.942, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.49031653665303476, |
|
"grad_norm": 2.360328197479248, |
|
"learning_rate": 5.365705614567527e-06, |
|
"loss": 0.2661, |
|
"step": 1701 |
|
}, |
|
{ |
|
"epoch": 0.49118129244960096, |
|
"grad_norm": 2.328495502471924, |
|
"learning_rate": 5.35660091047041e-06, |
|
"loss": 0.2347, |
|
"step": 1704 |
|
}, |
|
{ |
|
"epoch": 0.49204604824616716, |
|
"grad_norm": 1.6670514345169067, |
|
"learning_rate": 5.347496206373293e-06, |
|
"loss": 0.2269, |
|
"step": 1707 |
|
}, |
|
{ |
|
"epoch": 0.49291080404273335, |
|
"grad_norm": 2.426805257797241, |
|
"learning_rate": 5.338391502276177e-06, |
|
"loss": 0.1946, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.49377555983929955, |
|
"grad_norm": 1.7583879232406616, |
|
"learning_rate": 5.32928679817906e-06, |
|
"loss": 0.2235, |
|
"step": 1713 |
|
}, |
|
{ |
|
"epoch": 0.49464031563586575, |
|
"grad_norm": 1.7235326766967773, |
|
"learning_rate": 5.320182094081943e-06, |
|
"loss": 0.2395, |
|
"step": 1716 |
|
}, |
|
{ |
|
"epoch": 0.49550507143243194, |
|
"grad_norm": 1.4216803312301636, |
|
"learning_rate": 5.311077389984826e-06, |
|
"loss": 0.2634, |
|
"step": 1719 |
|
}, |
|
{ |
|
"epoch": 0.49636982722899814, |
|
"grad_norm": 1.2892686128616333, |
|
"learning_rate": 5.301972685887709e-06, |
|
"loss": 0.2083, |
|
"step": 1722 |
|
}, |
|
{ |
|
"epoch": 0.49723458302556434, |
|
"grad_norm": 2.5210540294647217, |
|
"learning_rate": 5.292867981790593e-06, |
|
"loss": 0.2017, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.49809933882213053, |
|
"grad_norm": 5.790046691894531, |
|
"learning_rate": 5.283763277693476e-06, |
|
"loss": 0.2306, |
|
"step": 1728 |
|
}, |
|
{ |
|
"epoch": 0.49896409461869673, |
|
"grad_norm": 1.4158023595809937, |
|
"learning_rate": 5.274658573596359e-06, |
|
"loss": 0.266, |
|
"step": 1731 |
|
}, |
|
{ |
|
"epoch": 0.4998288504152629, |
|
"grad_norm": 3.0490024089813232, |
|
"learning_rate": 5.2655538694992415e-06, |
|
"loss": 0.2506, |
|
"step": 1734 |
|
}, |
|
{ |
|
"epoch": 0.5006936062118291, |
|
"grad_norm": 2.33208966255188, |
|
"learning_rate": 5.256449165402124e-06, |
|
"loss": 0.2088, |
|
"step": 1737 |
|
}, |
|
{ |
|
"epoch": 0.5015583620083953, |
|
"grad_norm": 1.8087997436523438, |
|
"learning_rate": 5.247344461305008e-06, |
|
"loss": 0.2095, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.5024231178049615, |
|
"grad_norm": 1.5517979860305786, |
|
"learning_rate": 5.238239757207892e-06, |
|
"loss": 0.2342, |
|
"step": 1743 |
|
}, |
|
{ |
|
"epoch": 0.5032878736015277, |
|
"grad_norm": 1.841036319732666, |
|
"learning_rate": 5.229135053110775e-06, |
|
"loss": 0.2385, |
|
"step": 1746 |
|
}, |
|
{ |
|
"epoch": 0.5041526293980939, |
|
"grad_norm": 1.8034095764160156, |
|
"learning_rate": 5.2200303490136574e-06, |
|
"loss": 0.2537, |
|
"step": 1749 |
|
}, |
|
{ |
|
"epoch": 0.5050173851946601, |
|
"grad_norm": 3.617159366607666, |
|
"learning_rate": 5.21092564491654e-06, |
|
"loss": 0.2378, |
|
"step": 1752 |
|
}, |
|
{ |
|
"epoch": 0.5058821409912263, |
|
"grad_norm": 2.903215169906616, |
|
"learning_rate": 5.201820940819424e-06, |
|
"loss": 0.2356, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.5067468967877925, |
|
"grad_norm": 2.084693193435669, |
|
"learning_rate": 5.192716236722307e-06, |
|
"loss": 0.2631, |
|
"step": 1758 |
|
}, |
|
{ |
|
"epoch": 0.5076116525843587, |
|
"grad_norm": 1.8994488716125488, |
|
"learning_rate": 5.1836115326251905e-06, |
|
"loss": 0.2742, |
|
"step": 1761 |
|
}, |
|
{ |
|
"epoch": 0.5084764083809249, |
|
"grad_norm": 2.651257276535034, |
|
"learning_rate": 5.174506828528073e-06, |
|
"loss": 0.2332, |
|
"step": 1764 |
|
}, |
|
{ |
|
"epoch": 0.5093411641774911, |
|
"grad_norm": 4.182311534881592, |
|
"learning_rate": 5.165402124430956e-06, |
|
"loss": 0.2467, |
|
"step": 1767 |
|
}, |
|
{ |
|
"epoch": 0.5102059199740573, |
|
"grad_norm": 27.990720748901367, |
|
"learning_rate": 5.15629742033384e-06, |
|
"loss": 0.2135, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.5110706757706235, |
|
"grad_norm": 1.942474126815796, |
|
"learning_rate": 5.147192716236723e-06, |
|
"loss": 0.2443, |
|
"step": 1773 |
|
}, |
|
{ |
|
"epoch": 0.5119354315671897, |
|
"grad_norm": 2.768105983734131, |
|
"learning_rate": 5.1380880121396055e-06, |
|
"loss": 0.2566, |
|
"step": 1776 |
|
}, |
|
{ |
|
"epoch": 0.5128001873637559, |
|
"grad_norm": 2.423797607421875, |
|
"learning_rate": 5.128983308042489e-06, |
|
"loss": 0.2667, |
|
"step": 1779 |
|
}, |
|
{ |
|
"epoch": 0.5136649431603221, |
|
"grad_norm": 2.395047426223755, |
|
"learning_rate": 5.119878603945372e-06, |
|
"loss": 0.2495, |
|
"step": 1782 |
|
}, |
|
{ |
|
"epoch": 0.5145296989568883, |
|
"grad_norm": 2.577787160873413, |
|
"learning_rate": 5.110773899848256e-06, |
|
"loss": 0.2454, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.5153944547534545, |
|
"grad_norm": 3.106776714324951, |
|
"learning_rate": 5.101669195751139e-06, |
|
"loss": 0.2162, |
|
"step": 1788 |
|
}, |
|
{ |
|
"epoch": 0.5162592105500207, |
|
"grad_norm": 1.5912446975708008, |
|
"learning_rate": 5.0925644916540215e-06, |
|
"loss": 0.2201, |
|
"step": 1791 |
|
}, |
|
{ |
|
"epoch": 0.517123966346587, |
|
"grad_norm": 2.427795171737671, |
|
"learning_rate": 5.083459787556905e-06, |
|
"loss": 0.236, |
|
"step": 1794 |
|
}, |
|
{ |
|
"epoch": 0.5179887221431532, |
|
"grad_norm": 2.5363399982452393, |
|
"learning_rate": 5.074355083459788e-06, |
|
"loss": 0.2551, |
|
"step": 1797 |
|
}, |
|
{ |
|
"epoch": 0.5188534779397194, |
|
"grad_norm": 1.8077950477600098, |
|
"learning_rate": 5.065250379362671e-06, |
|
"loss": 0.2411, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5188534779397194, |
|
"eval_loss": 0.21374772489070892, |
|
"eval_mse": 0.21374773593991994, |
|
"eval_runtime": 6.6116, |
|
"eval_samples_per_second": 151.249, |
|
"eval_steps_per_second": 18.906, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5197182337362856, |
|
"grad_norm": 5.243107795715332, |
|
"learning_rate": 5.0561456752655545e-06, |
|
"loss": 0.243, |
|
"step": 1803 |
|
}, |
|
{ |
|
"epoch": 0.5205829895328518, |
|
"grad_norm": 2.7711331844329834, |
|
"learning_rate": 5.047040971168437e-06, |
|
"loss": 0.2037, |
|
"step": 1806 |
|
}, |
|
{ |
|
"epoch": 0.521447745329418, |
|
"grad_norm": 2.8418238162994385, |
|
"learning_rate": 5.03793626707132e-06, |
|
"loss": 0.2331, |
|
"step": 1809 |
|
}, |
|
{ |
|
"epoch": 0.5223125011259842, |
|
"grad_norm": 3.0842299461364746, |
|
"learning_rate": 5.028831562974204e-06, |
|
"loss": 0.2422, |
|
"step": 1812 |
|
}, |
|
{ |
|
"epoch": 0.5231772569225503, |
|
"grad_norm": 2.5956835746765137, |
|
"learning_rate": 5.019726858877087e-06, |
|
"loss": 0.2057, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.5240420127191165, |
|
"grad_norm": 1.3121715784072876, |
|
"learning_rate": 5.0106221547799704e-06, |
|
"loss": 0.2387, |
|
"step": 1818 |
|
}, |
|
{ |
|
"epoch": 0.5249067685156827, |
|
"grad_norm": 2.2341432571411133, |
|
"learning_rate": 5.001517450682853e-06, |
|
"loss": 0.2611, |
|
"step": 1821 |
|
}, |
|
{ |
|
"epoch": 0.5257715243122489, |
|
"grad_norm": 2.0494587421417236, |
|
"learning_rate": 4.992412746585736e-06, |
|
"loss": 0.2238, |
|
"step": 1824 |
|
}, |
|
{ |
|
"epoch": 0.5266362801088151, |
|
"grad_norm": 2.2597897052764893, |
|
"learning_rate": 4.983308042488619e-06, |
|
"loss": 0.2322, |
|
"step": 1827 |
|
}, |
|
{ |
|
"epoch": 0.5275010359053813, |
|
"grad_norm": 3.5993051528930664, |
|
"learning_rate": 4.974203338391503e-06, |
|
"loss": 0.234, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.5283657917019475, |
|
"grad_norm": 3.769505262374878, |
|
"learning_rate": 4.9650986342943855e-06, |
|
"loss": 0.2454, |
|
"step": 1833 |
|
}, |
|
{ |
|
"epoch": 0.5292305474985137, |
|
"grad_norm": 1.4726693630218506, |
|
"learning_rate": 4.955993930197269e-06, |
|
"loss": 0.2261, |
|
"step": 1836 |
|
}, |
|
{ |
|
"epoch": 0.5300953032950799, |
|
"grad_norm": 2.2910659313201904, |
|
"learning_rate": 4.946889226100152e-06, |
|
"loss": 0.2551, |
|
"step": 1839 |
|
}, |
|
{ |
|
"epoch": 0.5309600590916461, |
|
"grad_norm": 1.8401825428009033, |
|
"learning_rate": 4.937784522003035e-06, |
|
"loss": 0.1891, |
|
"step": 1842 |
|
}, |
|
{ |
|
"epoch": 0.5318248148882123, |
|
"grad_norm": 2.0589776039123535, |
|
"learning_rate": 4.9286798179059185e-06, |
|
"loss": 0.2475, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.5326895706847785, |
|
"grad_norm": 1.8461434841156006, |
|
"learning_rate": 4.919575113808801e-06, |
|
"loss": 0.2516, |
|
"step": 1848 |
|
}, |
|
{ |
|
"epoch": 0.5335543264813447, |
|
"grad_norm": 1.8950605392456055, |
|
"learning_rate": 4.910470409711684e-06, |
|
"loss": 0.246, |
|
"step": 1851 |
|
}, |
|
{ |
|
"epoch": 0.5344190822779109, |
|
"grad_norm": 1.7544567584991455, |
|
"learning_rate": 4.901365705614568e-06, |
|
"loss": 0.2159, |
|
"step": 1854 |
|
}, |
|
{ |
|
"epoch": 0.5352838380744771, |
|
"grad_norm": 2.04953932762146, |
|
"learning_rate": 4.892261001517451e-06, |
|
"loss": 0.2745, |
|
"step": 1857 |
|
}, |
|
{ |
|
"epoch": 0.5361485938710433, |
|
"grad_norm": 2.173112154006958, |
|
"learning_rate": 4.8831562974203345e-06, |
|
"loss": 0.2222, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.5370133496676095, |
|
"grad_norm": 1.4713711738586426, |
|
"learning_rate": 4.874051593323217e-06, |
|
"loss": 0.2182, |
|
"step": 1863 |
|
}, |
|
{ |
|
"epoch": 0.5378781054641757, |
|
"grad_norm": 2.421405792236328, |
|
"learning_rate": 4.8649468892261e-06, |
|
"loss": 0.2472, |
|
"step": 1866 |
|
}, |
|
{ |
|
"epoch": 0.5387428612607419, |
|
"grad_norm": 2.6306304931640625, |
|
"learning_rate": 4.855842185128984e-06, |
|
"loss": 0.2148, |
|
"step": 1869 |
|
}, |
|
{ |
|
"epoch": 0.5396076170573081, |
|
"grad_norm": 3.4957375526428223, |
|
"learning_rate": 4.846737481031867e-06, |
|
"loss": 0.2321, |
|
"step": 1872 |
|
}, |
|
{ |
|
"epoch": 0.5404723728538743, |
|
"grad_norm": 4.008154392242432, |
|
"learning_rate": 4.8376327769347495e-06, |
|
"loss": 0.216, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.5413371286504405, |
|
"grad_norm": 2.417433977127075, |
|
"learning_rate": 4.828528072837633e-06, |
|
"loss": 0.2261, |
|
"step": 1878 |
|
}, |
|
{ |
|
"epoch": 0.5422018844470067, |
|
"grad_norm": 1.7908028364181519, |
|
"learning_rate": 4.819423368740516e-06, |
|
"loss": 0.2445, |
|
"step": 1881 |
|
}, |
|
{ |
|
"epoch": 0.5430666402435729, |
|
"grad_norm": 1.3756656646728516, |
|
"learning_rate": 4.8103186646434e-06, |
|
"loss": 0.201, |
|
"step": 1884 |
|
}, |
|
{ |
|
"epoch": 0.5439313960401391, |
|
"grad_norm": 1.686787724494934, |
|
"learning_rate": 4.801213960546283e-06, |
|
"loss": 0.217, |
|
"step": 1887 |
|
}, |
|
{ |
|
"epoch": 0.5447961518367053, |
|
"grad_norm": 3.6207942962646484, |
|
"learning_rate": 4.792109256449165e-06, |
|
"loss": 0.2746, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.5456609076332715, |
|
"grad_norm": 3.838956117630005, |
|
"learning_rate": 4.783004552352049e-06, |
|
"loss": 0.2313, |
|
"step": 1893 |
|
}, |
|
{ |
|
"epoch": 0.5465256634298377, |
|
"grad_norm": 2.059926748275757, |
|
"learning_rate": 4.773899848254932e-06, |
|
"loss": 0.2302, |
|
"step": 1896 |
|
}, |
|
{ |
|
"epoch": 0.5473904192264039, |
|
"grad_norm": 1.9524738788604736, |
|
"learning_rate": 4.764795144157816e-06, |
|
"loss": 0.2473, |
|
"step": 1899 |
|
}, |
|
{ |
|
"epoch": 0.5476786711585926, |
|
"eval_loss": 0.21403329074382782, |
|
"eval_mse": 0.2140333094932139, |
|
"eval_runtime": 6.5394, |
|
"eval_samples_per_second": 152.918, |
|
"eval_steps_per_second": 19.115, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.54825517502297, |
|
"grad_norm": 1.8144432306289673, |
|
"learning_rate": 4.7556904400606985e-06, |
|
"loss": 0.2082, |
|
"step": 1902 |
|
}, |
|
{ |
|
"epoch": 0.5491199308195363, |
|
"grad_norm": 2.4071717262268066, |
|
"learning_rate": 4.746585735963581e-06, |
|
"loss": 0.2364, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.5499846866161024, |
|
"grad_norm": 1.7162179946899414, |
|
"learning_rate": 4.737481031866465e-06, |
|
"loss": 0.2119, |
|
"step": 1908 |
|
}, |
|
{ |
|
"epoch": 0.5508494424126686, |
|
"grad_norm": 2.368528366088867, |
|
"learning_rate": 4.728376327769348e-06, |
|
"loss": 0.2314, |
|
"step": 1911 |
|
}, |
|
{ |
|
"epoch": 0.5517141982092348, |
|
"grad_norm": 3.422670602798462, |
|
"learning_rate": 4.719271623672231e-06, |
|
"loss": 0.2355, |
|
"step": 1914 |
|
}, |
|
{ |
|
"epoch": 0.552578954005801, |
|
"grad_norm": 2.324976682662964, |
|
"learning_rate": 4.710166919575114e-06, |
|
"loss": 0.2483, |
|
"step": 1917 |
|
}, |
|
{ |
|
"epoch": 0.5534437098023672, |
|
"grad_norm": 2.3686418533325195, |
|
"learning_rate": 4.701062215477997e-06, |
|
"loss": 0.204, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.5543084655989334, |
|
"grad_norm": 2.1361286640167236, |
|
"learning_rate": 4.691957511380881e-06, |
|
"loss": 0.195, |
|
"step": 1923 |
|
}, |
|
{ |
|
"epoch": 0.5551732213954996, |
|
"grad_norm": 1.5527316331863403, |
|
"learning_rate": 4.682852807283764e-06, |
|
"loss": 0.2232, |
|
"step": 1926 |
|
}, |
|
{ |
|
"epoch": 0.5560379771920658, |
|
"grad_norm": 3.792592763900757, |
|
"learning_rate": 4.673748103186647e-06, |
|
"loss": 0.2157, |
|
"step": 1929 |
|
}, |
|
{ |
|
"epoch": 0.556902732988632, |
|
"grad_norm": 1.8878562450408936, |
|
"learning_rate": 4.66464339908953e-06, |
|
"loss": 0.21, |
|
"step": 1932 |
|
}, |
|
{ |
|
"epoch": 0.5577674887851982, |
|
"grad_norm": 1.686164140701294, |
|
"learning_rate": 4.655538694992413e-06, |
|
"loss": 0.2177, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.5586322445817644, |
|
"grad_norm": 2.310054302215576, |
|
"learning_rate": 4.646433990895296e-06, |
|
"loss": 0.2039, |
|
"step": 1938 |
|
}, |
|
{ |
|
"epoch": 0.5594970003783306, |
|
"grad_norm": 1.8163293600082397, |
|
"learning_rate": 4.63732928679818e-06, |
|
"loss": 0.2117, |
|
"step": 1941 |
|
}, |
|
{ |
|
"epoch": 0.5603617561748968, |
|
"grad_norm": 1.608209490776062, |
|
"learning_rate": 4.6282245827010625e-06, |
|
"loss": 0.2361, |
|
"step": 1944 |
|
}, |
|
{ |
|
"epoch": 0.561226511971463, |
|
"grad_norm": 1.4159106016159058, |
|
"learning_rate": 4.619119878603946e-06, |
|
"loss": 0.2042, |
|
"step": 1947 |
|
}, |
|
{ |
|
"epoch": 0.5620912677680293, |
|
"grad_norm": 2.920888900756836, |
|
"learning_rate": 4.610015174506829e-06, |
|
"loss": 0.2715, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.5629560235645955, |
|
"grad_norm": 1.1805585622787476, |
|
"learning_rate": 4.600910470409712e-06, |
|
"loss": 0.2329, |
|
"step": 1953 |
|
}, |
|
{ |
|
"epoch": 0.5638207793611617, |
|
"grad_norm": 3.1957271099090576, |
|
"learning_rate": 4.591805766312596e-06, |
|
"loss": 0.253, |
|
"step": 1956 |
|
}, |
|
{ |
|
"epoch": 0.5646855351577279, |
|
"grad_norm": 2.3281495571136475, |
|
"learning_rate": 4.582701062215478e-06, |
|
"loss": 0.2508, |
|
"step": 1959 |
|
}, |
|
{ |
|
"epoch": 0.5655502909542941, |
|
"grad_norm": 1.9826480150222778, |
|
"learning_rate": 4.573596358118362e-06, |
|
"loss": 0.2107, |
|
"step": 1962 |
|
}, |
|
{ |
|
"epoch": 0.5664150467508603, |
|
"grad_norm": 5.98641300201416, |
|
"learning_rate": 4.564491654021245e-06, |
|
"loss": 0.2343, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.5672798025474265, |
|
"grad_norm": 4.082788944244385, |
|
"learning_rate": 4.555386949924128e-06, |
|
"loss": 0.2493, |
|
"step": 1968 |
|
}, |
|
{ |
|
"epoch": 0.5681445583439927, |
|
"grad_norm": 1.6683293581008911, |
|
"learning_rate": 4.5462822458270115e-06, |
|
"loss": 0.1892, |
|
"step": 1971 |
|
}, |
|
{ |
|
"epoch": 0.5690093141405589, |
|
"grad_norm": 4.348227024078369, |
|
"learning_rate": 4.537177541729894e-06, |
|
"loss": 0.2277, |
|
"step": 1974 |
|
}, |
|
{ |
|
"epoch": 0.5698740699371251, |
|
"grad_norm": 1.2774113416671753, |
|
"learning_rate": 4.528072837632777e-06, |
|
"loss": 0.2077, |
|
"step": 1977 |
|
}, |
|
{ |
|
"epoch": 0.5707388257336913, |
|
"grad_norm": 1.5915591716766357, |
|
"learning_rate": 4.518968133535661e-06, |
|
"loss": 0.2284, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.5716035815302575, |
|
"grad_norm": 1.3817569017410278, |
|
"learning_rate": 4.509863429438544e-06, |
|
"loss": 0.2159, |
|
"step": 1983 |
|
}, |
|
{ |
|
"epoch": 0.5724683373268237, |
|
"grad_norm": 1.3606081008911133, |
|
"learning_rate": 4.500758725341427e-06, |
|
"loss": 0.2055, |
|
"step": 1986 |
|
}, |
|
{ |
|
"epoch": 0.5733330931233899, |
|
"grad_norm": 2.6602344512939453, |
|
"learning_rate": 4.49165402124431e-06, |
|
"loss": 0.2611, |
|
"step": 1989 |
|
}, |
|
{ |
|
"epoch": 0.5741978489199561, |
|
"grad_norm": 1.796570897102356, |
|
"learning_rate": 4.482549317147193e-06, |
|
"loss": 0.1925, |
|
"step": 1992 |
|
}, |
|
{ |
|
"epoch": 0.5750626047165223, |
|
"grad_norm": 2.4218592643737793, |
|
"learning_rate": 4.473444613050077e-06, |
|
"loss": 0.1992, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.5759273605130885, |
|
"grad_norm": 1.9690345525741577, |
|
"learning_rate": 4.46433990895296e-06, |
|
"loss": 0.237, |
|
"step": 1998 |
|
}, |
|
{ |
|
"epoch": 0.5765038643774659, |
|
"eval_loss": 0.2176571637392044, |
|
"eval_mse": 0.2176571699755732, |
|
"eval_runtime": 6.5589, |
|
"eval_samples_per_second": 152.465, |
|
"eval_steps_per_second": 19.058, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5767921163096547, |
|
"grad_norm": 2.168555498123169, |
|
"learning_rate": 4.4552352048558425e-06, |
|
"loss": 0.2009, |
|
"step": 2001 |
|
}, |
|
{ |
|
"epoch": 0.5776568721062209, |
|
"grad_norm": 3.1508069038391113, |
|
"learning_rate": 4.446130500758726e-06, |
|
"loss": 0.2316, |
|
"step": 2004 |
|
}, |
|
{ |
|
"epoch": 0.578521627902787, |
|
"grad_norm": 1.686283826828003, |
|
"learning_rate": 4.437025796661609e-06, |
|
"loss": 0.2313, |
|
"step": 2007 |
|
}, |
|
{ |
|
"epoch": 0.5793863836993532, |
|
"grad_norm": 1.3212209939956665, |
|
"learning_rate": 4.427921092564492e-06, |
|
"loss": 0.2076, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.5802511394959194, |
|
"grad_norm": 3.2884185314178467, |
|
"learning_rate": 4.4188163884673755e-06, |
|
"loss": 0.2345, |
|
"step": 2013 |
|
}, |
|
{ |
|
"epoch": 0.5811158952924856, |
|
"grad_norm": 1.9892568588256836, |
|
"learning_rate": 4.409711684370258e-06, |
|
"loss": 0.2428, |
|
"step": 2016 |
|
}, |
|
{ |
|
"epoch": 0.5819806510890518, |
|
"grad_norm": 1.5294564962387085, |
|
"learning_rate": 4.400606980273141e-06, |
|
"loss": 0.2252, |
|
"step": 2019 |
|
}, |
|
{ |
|
"epoch": 0.582845406885618, |
|
"grad_norm": 2.2944376468658447, |
|
"learning_rate": 4.391502276176025e-06, |
|
"loss": 0.2429, |
|
"step": 2022 |
|
}, |
|
{ |
|
"epoch": 0.5837101626821842, |
|
"grad_norm": 1.584145426750183, |
|
"learning_rate": 4.382397572078908e-06, |
|
"loss": 0.2548, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.5845749184787504, |
|
"grad_norm": 1.604771375656128, |
|
"learning_rate": 4.3732928679817906e-06, |
|
"loss": 0.2211, |
|
"step": 2028 |
|
}, |
|
{ |
|
"epoch": 0.5854396742753166, |
|
"grad_norm": 1.7536587715148926, |
|
"learning_rate": 4.364188163884674e-06, |
|
"loss": 0.2165, |
|
"step": 2031 |
|
}, |
|
{ |
|
"epoch": 0.5863044300718828, |
|
"grad_norm": 1.6281161308288574, |
|
"learning_rate": 4.355083459787557e-06, |
|
"loss": 0.2609, |
|
"step": 2034 |
|
}, |
|
{ |
|
"epoch": 0.587169185868449, |
|
"grad_norm": 1.77180016040802, |
|
"learning_rate": 4.34597875569044e-06, |
|
"loss": 0.2333, |
|
"step": 2037 |
|
}, |
|
{ |
|
"epoch": 0.5880339416650152, |
|
"grad_norm": 3.2408559322357178, |
|
"learning_rate": 4.336874051593324e-06, |
|
"loss": 0.2454, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.5888986974615814, |
|
"grad_norm": 3.4444427490234375, |
|
"learning_rate": 4.3277693474962065e-06, |
|
"loss": 0.2549, |
|
"step": 2043 |
|
}, |
|
{ |
|
"epoch": 0.5897634532581476, |
|
"grad_norm": 4.399412155151367, |
|
"learning_rate": 4.31866464339909e-06, |
|
"loss": 0.2258, |
|
"step": 2046 |
|
}, |
|
{ |
|
"epoch": 0.5906282090547138, |
|
"grad_norm": 1.646681785583496, |
|
"learning_rate": 4.309559939301973e-06, |
|
"loss": 0.1858, |
|
"step": 2049 |
|
}, |
|
{ |
|
"epoch": 0.59149296485128, |
|
"grad_norm": 2.786576986312866, |
|
"learning_rate": 4.300455235204856e-06, |
|
"loss": 0.2085, |
|
"step": 2052 |
|
}, |
|
{ |
|
"epoch": 0.5923577206478462, |
|
"grad_norm": 3.0838379859924316, |
|
"learning_rate": 4.2913505311077395e-06, |
|
"loss": 0.2428, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.5932224764444124, |
|
"grad_norm": 1.518548846244812, |
|
"learning_rate": 4.282245827010622e-06, |
|
"loss": 0.2159, |
|
"step": 2058 |
|
}, |
|
{ |
|
"epoch": 0.5940872322409786, |
|
"grad_norm": 2.2088887691497803, |
|
"learning_rate": 4.273141122913505e-06, |
|
"loss": 0.2107, |
|
"step": 2061 |
|
}, |
|
{ |
|
"epoch": 0.5949519880375448, |
|
"grad_norm": 4.359911918640137, |
|
"learning_rate": 4.264036418816389e-06, |
|
"loss": 0.2461, |
|
"step": 2064 |
|
}, |
|
{ |
|
"epoch": 0.595816743834111, |
|
"grad_norm": 1.796669840812683, |
|
"learning_rate": 4.254931714719272e-06, |
|
"loss": 0.1836, |
|
"step": 2067 |
|
}, |
|
{ |
|
"epoch": 0.5966814996306772, |
|
"grad_norm": 2.0835959911346436, |
|
"learning_rate": 4.245827010622155e-06, |
|
"loss": 0.2192, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.5975462554272434, |
|
"grad_norm": 2.772815227508545, |
|
"learning_rate": 4.236722306525038e-06, |
|
"loss": 0.1974, |
|
"step": 2073 |
|
}, |
|
{ |
|
"epoch": 0.5984110112238096, |
|
"grad_norm": 1.8529541492462158, |
|
"learning_rate": 4.227617602427921e-06, |
|
"loss": 0.1946, |
|
"step": 2076 |
|
}, |
|
{ |
|
"epoch": 0.5992757670203758, |
|
"grad_norm": 2.0415849685668945, |
|
"learning_rate": 4.218512898330804e-06, |
|
"loss": 0.2182, |
|
"step": 2079 |
|
}, |
|
{ |
|
"epoch": 0.600140522816942, |
|
"grad_norm": 2.6295053958892822, |
|
"learning_rate": 4.209408194233688e-06, |
|
"loss": 0.2438, |
|
"step": 2082 |
|
}, |
|
{ |
|
"epoch": 0.6010052786135082, |
|
"grad_norm": 1.9082621335983276, |
|
"learning_rate": 4.2003034901365705e-06, |
|
"loss": 0.1834, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.6018700344100744, |
|
"grad_norm": 1.692436933517456, |
|
"learning_rate": 4.191198786039454e-06, |
|
"loss": 0.2104, |
|
"step": 2088 |
|
}, |
|
{ |
|
"epoch": 0.6027347902066406, |
|
"grad_norm": 2.0836997032165527, |
|
"learning_rate": 4.182094081942337e-06, |
|
"loss": 0.2323, |
|
"step": 2091 |
|
}, |
|
{ |
|
"epoch": 0.6035995460032068, |
|
"grad_norm": 1.354612946510315, |
|
"learning_rate": 4.17298937784522e-06, |
|
"loss": 0.1975, |
|
"step": 2094 |
|
}, |
|
{ |
|
"epoch": 0.604464301799773, |
|
"grad_norm": 3.683278799057007, |
|
"learning_rate": 4.1638846737481036e-06, |
|
"loss": 0.2256, |
|
"step": 2097 |
|
}, |
|
{ |
|
"epoch": 0.6053290575963391, |
|
"grad_norm": 2.232513189315796, |
|
"learning_rate": 4.154779969650986e-06, |
|
"loss": 0.1972, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.6053290575963391, |
|
"eval_loss": 0.21862919628620148, |
|
"eval_mse": 0.218629194105044, |
|
"eval_runtime": 6.7266, |
|
"eval_samples_per_second": 148.664, |
|
"eval_steps_per_second": 18.583, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.6061938133929053, |
|
"grad_norm": 1.5396499633789062, |
|
"learning_rate": 4.145675265553869e-06, |
|
"loss": 0.2151, |
|
"step": 2103 |
|
}, |
|
{ |
|
"epoch": 0.6070585691894717, |
|
"grad_norm": 1.482253909111023, |
|
"learning_rate": 4.136570561456753e-06, |
|
"loss": 0.2222, |
|
"step": 2106 |
|
}, |
|
{ |
|
"epoch": 0.6079233249860378, |
|
"grad_norm": 4.880987644195557, |
|
"learning_rate": 4.127465857359636e-06, |
|
"loss": 0.2535, |
|
"step": 2109 |
|
}, |
|
{ |
|
"epoch": 0.608788080782604, |
|
"grad_norm": 2.1557230949401855, |
|
"learning_rate": 4.1183611532625195e-06, |
|
"loss": 0.2213, |
|
"step": 2112 |
|
}, |
|
{ |
|
"epoch": 0.6096528365791702, |
|
"grad_norm": 2.417856454849243, |
|
"learning_rate": 4.109256449165402e-06, |
|
"loss": 0.2481, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.6105175923757364, |
|
"grad_norm": 2.211514949798584, |
|
"learning_rate": 4.100151745068285e-06, |
|
"loss": 0.2369, |
|
"step": 2118 |
|
}, |
|
{ |
|
"epoch": 0.6113823481723026, |
|
"grad_norm": 2.2844600677490234, |
|
"learning_rate": 4.091047040971169e-06, |
|
"loss": 0.2174, |
|
"step": 2121 |
|
}, |
|
{ |
|
"epoch": 0.6122471039688688, |
|
"grad_norm": 2.7534289360046387, |
|
"learning_rate": 4.081942336874052e-06, |
|
"loss": 0.22, |
|
"step": 2124 |
|
}, |
|
{ |
|
"epoch": 0.613111859765435, |
|
"grad_norm": 1.5547044277191162, |
|
"learning_rate": 4.072837632776935e-06, |
|
"loss": 0.227, |
|
"step": 2127 |
|
}, |
|
{ |
|
"epoch": 0.6139766155620012, |
|
"grad_norm": 1.6965092420578003, |
|
"learning_rate": 4.063732928679818e-06, |
|
"loss": 0.219, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.6148413713585674, |
|
"grad_norm": 1.8000000715255737, |
|
"learning_rate": 4.054628224582701e-06, |
|
"loss": 0.2303, |
|
"step": 2133 |
|
}, |
|
{ |
|
"epoch": 0.6157061271551336, |
|
"grad_norm": 1.5764316320419312, |
|
"learning_rate": 4.045523520485585e-06, |
|
"loss": 0.191, |
|
"step": 2136 |
|
}, |
|
{ |
|
"epoch": 0.6165708829516998, |
|
"grad_norm": 2.0041658878326416, |
|
"learning_rate": 4.036418816388468e-06, |
|
"loss": 0.2025, |
|
"step": 2139 |
|
}, |
|
{ |
|
"epoch": 0.617435638748266, |
|
"grad_norm": 1.9013463258743286, |
|
"learning_rate": 4.0273141122913504e-06, |
|
"loss": 0.225, |
|
"step": 2142 |
|
}, |
|
{ |
|
"epoch": 0.6183003945448322, |
|
"grad_norm": 2.1815786361694336, |
|
"learning_rate": 4.018209408194234e-06, |
|
"loss": 0.2409, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.6191651503413984, |
|
"grad_norm": 1.6740418672561646, |
|
"learning_rate": 4.009104704097117e-06, |
|
"loss": 0.2143, |
|
"step": 2148 |
|
}, |
|
{ |
|
"epoch": 0.6200299061379646, |
|
"grad_norm": 2.1056814193725586, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.2365, |
|
"step": 2151 |
|
}, |
|
{ |
|
"epoch": 0.6208946619345308, |
|
"grad_norm": 2.629563808441162, |
|
"learning_rate": 3.9908952959028835e-06, |
|
"loss": 0.1932, |
|
"step": 2154 |
|
}, |
|
{ |
|
"epoch": 0.621759417731097, |
|
"grad_norm": 1.7547650337219238, |
|
"learning_rate": 3.981790591805766e-06, |
|
"loss": 0.246, |
|
"step": 2157 |
|
}, |
|
{ |
|
"epoch": 0.6226241735276632, |
|
"grad_norm": 2.2451794147491455, |
|
"learning_rate": 3.97268588770865e-06, |
|
"loss": 0.2405, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.6234889293242294, |
|
"grad_norm": 2.8820624351501465, |
|
"learning_rate": 3.963581183611533e-06, |
|
"loss": 0.253, |
|
"step": 2163 |
|
}, |
|
{ |
|
"epoch": 0.6243536851207956, |
|
"grad_norm": 2.9832215309143066, |
|
"learning_rate": 3.9544764795144166e-06, |
|
"loss": 0.223, |
|
"step": 2166 |
|
}, |
|
{ |
|
"epoch": 0.6252184409173618, |
|
"grad_norm": 2.911879539489746, |
|
"learning_rate": 3.945371775417299e-06, |
|
"loss": 0.2177, |
|
"step": 2169 |
|
}, |
|
{ |
|
"epoch": 0.626083196713928, |
|
"grad_norm": 2.266767740249634, |
|
"learning_rate": 3.936267071320182e-06, |
|
"loss": 0.2288, |
|
"step": 2172 |
|
}, |
|
{ |
|
"epoch": 0.6269479525104942, |
|
"grad_norm": 1.401633858680725, |
|
"learning_rate": 3.927162367223066e-06, |
|
"loss": 0.2556, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.6278127083070604, |
|
"grad_norm": 2.7354822158813477, |
|
"learning_rate": 3.918057663125949e-06, |
|
"loss": 0.2439, |
|
"step": 2178 |
|
}, |
|
{ |
|
"epoch": 0.6286774641036266, |
|
"grad_norm": 1.6652506589889526, |
|
"learning_rate": 3.908952959028832e-06, |
|
"loss": 0.2041, |
|
"step": 2181 |
|
}, |
|
{ |
|
"epoch": 0.6295422199001928, |
|
"grad_norm": 3.3072733879089355, |
|
"learning_rate": 3.899848254931715e-06, |
|
"loss": 0.2207, |
|
"step": 2184 |
|
}, |
|
{ |
|
"epoch": 0.630406975696759, |
|
"grad_norm": 2.254608392715454, |
|
"learning_rate": 3.890743550834598e-06, |
|
"loss": 0.2352, |
|
"step": 2187 |
|
}, |
|
{ |
|
"epoch": 0.6312717314933252, |
|
"grad_norm": 1.765981674194336, |
|
"learning_rate": 3.881638846737482e-06, |
|
"loss": 0.2923, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.6321364872898914, |
|
"grad_norm": 1.7792342901229858, |
|
"learning_rate": 3.872534142640365e-06, |
|
"loss": 0.2428, |
|
"step": 2193 |
|
}, |
|
{ |
|
"epoch": 0.6330012430864576, |
|
"grad_norm": 5.084781646728516, |
|
"learning_rate": 3.8634294385432475e-06, |
|
"loss": 0.2616, |
|
"step": 2196 |
|
}, |
|
{ |
|
"epoch": 0.6338659988830238, |
|
"grad_norm": 2.0845305919647217, |
|
"learning_rate": 3.854324734446131e-06, |
|
"loss": 0.2556, |
|
"step": 2199 |
|
}, |
|
{ |
|
"epoch": 0.6341542508152125, |
|
"eval_loss": 0.2416454404592514, |
|
"eval_mse": 0.2416454482518602, |
|
"eval_runtime": 6.5442, |
|
"eval_samples_per_second": 152.808, |
|
"eval_steps_per_second": 19.101, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.63473075467959, |
|
"grad_norm": 3.8279173374176025, |
|
"learning_rate": 3.845220030349014e-06, |
|
"loss": 0.2245, |
|
"step": 2202 |
|
}, |
|
{ |
|
"epoch": 0.6355955104761561, |
|
"grad_norm": 3.564417839050293, |
|
"learning_rate": 3.836115326251897e-06, |
|
"loss": 0.2233, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.6364602662727223, |
|
"grad_norm": 2.4322614669799805, |
|
"learning_rate": 3.827010622154781e-06, |
|
"loss": 0.2227, |
|
"step": 2208 |
|
}, |
|
{ |
|
"epoch": 0.6373250220692885, |
|
"grad_norm": 1.6439640522003174, |
|
"learning_rate": 3.8179059180576634e-06, |
|
"loss": 0.2135, |
|
"step": 2211 |
|
}, |
|
{ |
|
"epoch": 0.6381897778658547, |
|
"grad_norm": 2.605598211288452, |
|
"learning_rate": 3.8088012139605467e-06, |
|
"loss": 0.2203, |
|
"step": 2214 |
|
}, |
|
{ |
|
"epoch": 0.6390545336624209, |
|
"grad_norm": 1.939488410949707, |
|
"learning_rate": 3.7996965098634296e-06, |
|
"loss": 0.2377, |
|
"step": 2217 |
|
}, |
|
{ |
|
"epoch": 0.6399192894589871, |
|
"grad_norm": 1.851778507232666, |
|
"learning_rate": 3.790591805766313e-06, |
|
"loss": 0.2199, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.6407840452555533, |
|
"grad_norm": 2.077923059463501, |
|
"learning_rate": 3.781487101669196e-06, |
|
"loss": 0.223, |
|
"step": 2223 |
|
}, |
|
{ |
|
"epoch": 0.6416488010521195, |
|
"grad_norm": 5.192010402679443, |
|
"learning_rate": 3.772382397572079e-06, |
|
"loss": 0.2412, |
|
"step": 2226 |
|
}, |
|
{ |
|
"epoch": 0.6425135568486857, |
|
"grad_norm": 1.6057194471359253, |
|
"learning_rate": 3.7632776934749626e-06, |
|
"loss": 0.2354, |
|
"step": 2229 |
|
}, |
|
{ |
|
"epoch": 0.6433783126452519, |
|
"grad_norm": 3.0130653381347656, |
|
"learning_rate": 3.7541729893778455e-06, |
|
"loss": 0.2307, |
|
"step": 2232 |
|
}, |
|
{ |
|
"epoch": 0.6442430684418181, |
|
"grad_norm": 2.835080623626709, |
|
"learning_rate": 3.7450682852807287e-06, |
|
"loss": 0.1929, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.6451078242383843, |
|
"grad_norm": 1.800140619277954, |
|
"learning_rate": 3.735963581183612e-06, |
|
"loss": 0.2158, |
|
"step": 2238 |
|
}, |
|
{ |
|
"epoch": 0.6459725800349505, |
|
"grad_norm": 1.8859021663665771, |
|
"learning_rate": 3.726858877086495e-06, |
|
"loss": 0.2469, |
|
"step": 2241 |
|
}, |
|
{ |
|
"epoch": 0.6468373358315167, |
|
"grad_norm": 1.8524531126022339, |
|
"learning_rate": 3.717754172989378e-06, |
|
"loss": 0.2257, |
|
"step": 2244 |
|
}, |
|
{ |
|
"epoch": 0.6477020916280829, |
|
"grad_norm": 2.759021520614624, |
|
"learning_rate": 3.7086494688922614e-06, |
|
"loss": 0.2187, |
|
"step": 2247 |
|
}, |
|
{ |
|
"epoch": 0.6485668474246491, |
|
"grad_norm": 1.885272741317749, |
|
"learning_rate": 3.699544764795144e-06, |
|
"loss": 0.2416, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.6494316032212153, |
|
"grad_norm": 2.244595527648926, |
|
"learning_rate": 3.690440060698028e-06, |
|
"loss": 0.2002, |
|
"step": 2253 |
|
}, |
|
{ |
|
"epoch": 0.6502963590177815, |
|
"grad_norm": 2.533815622329712, |
|
"learning_rate": 3.6813353566009107e-06, |
|
"loss": 0.2065, |
|
"step": 2256 |
|
}, |
|
{ |
|
"epoch": 0.6511611148143477, |
|
"grad_norm": 1.883478045463562, |
|
"learning_rate": 3.6722306525037936e-06, |
|
"loss": 0.2014, |
|
"step": 2259 |
|
}, |
|
{ |
|
"epoch": 0.652025870610914, |
|
"grad_norm": 4.37358283996582, |
|
"learning_rate": 3.6631259484066773e-06, |
|
"loss": 0.2347, |
|
"step": 2262 |
|
}, |
|
{ |
|
"epoch": 0.6528906264074802, |
|
"grad_norm": 1.9453434944152832, |
|
"learning_rate": 3.65402124430956e-06, |
|
"loss": 0.2414, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.6537553822040464, |
|
"grad_norm": 1.5430552959442139, |
|
"learning_rate": 3.644916540212443e-06, |
|
"loss": 0.2012, |
|
"step": 2268 |
|
}, |
|
{ |
|
"epoch": 0.6546201380006126, |
|
"grad_norm": 2.023857593536377, |
|
"learning_rate": 3.6358118361153266e-06, |
|
"loss": 0.216, |
|
"step": 2271 |
|
}, |
|
{ |
|
"epoch": 0.6554848937971788, |
|
"grad_norm": 2.0380475521087646, |
|
"learning_rate": 3.6267071320182095e-06, |
|
"loss": 0.1851, |
|
"step": 2274 |
|
}, |
|
{ |
|
"epoch": 0.656349649593745, |
|
"grad_norm": 1.9703435897827148, |
|
"learning_rate": 3.617602427921093e-06, |
|
"loss": 0.2152, |
|
"step": 2277 |
|
}, |
|
{ |
|
"epoch": 0.6572144053903112, |
|
"grad_norm": 2.0153567790985107, |
|
"learning_rate": 3.608497723823976e-06, |
|
"loss": 0.2248, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.6580791611868774, |
|
"grad_norm": 1.6611104011535645, |
|
"learning_rate": 3.599393019726859e-06, |
|
"loss": 0.2416, |
|
"step": 2283 |
|
}, |
|
{ |
|
"epoch": 0.6589439169834436, |
|
"grad_norm": 1.8866307735443115, |
|
"learning_rate": 3.5902883156297426e-06, |
|
"loss": 0.2319, |
|
"step": 2286 |
|
}, |
|
{ |
|
"epoch": 0.6598086727800098, |
|
"grad_norm": 2.9478352069854736, |
|
"learning_rate": 3.5811836115326254e-06, |
|
"loss": 0.2614, |
|
"step": 2289 |
|
}, |
|
{ |
|
"epoch": 0.660673428576576, |
|
"grad_norm": 1.578539252281189, |
|
"learning_rate": 3.572078907435509e-06, |
|
"loss": 0.2097, |
|
"step": 2292 |
|
}, |
|
{ |
|
"epoch": 0.6615381843731422, |
|
"grad_norm": 3.096663236618042, |
|
"learning_rate": 3.562974203338392e-06, |
|
"loss": 0.1977, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.6624029401697084, |
|
"grad_norm": 1.81285560131073, |
|
"learning_rate": 3.5538694992412748e-06, |
|
"loss": 0.2273, |
|
"step": 2298 |
|
}, |
|
{ |
|
"epoch": 0.6629794440340858, |
|
"eval_loss": 0.2196592092514038, |
|
"eval_mse": 0.2196592075770641, |
|
"eval_runtime": 6.5661, |
|
"eval_samples_per_second": 152.298, |
|
"eval_steps_per_second": 19.037, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.6632676959662746, |
|
"grad_norm": 1.6824992895126343, |
|
"learning_rate": 3.5447647951441585e-06, |
|
"loss": 0.1899, |
|
"step": 2301 |
|
}, |
|
{ |
|
"epoch": 0.6641324517628407, |
|
"grad_norm": 2.2244341373443604, |
|
"learning_rate": 3.5356600910470413e-06, |
|
"loss": 0.2113, |
|
"step": 2304 |
|
}, |
|
{ |
|
"epoch": 0.6649972075594069, |
|
"grad_norm": 2.0596795082092285, |
|
"learning_rate": 3.526555386949924e-06, |
|
"loss": 0.2264, |
|
"step": 2307 |
|
}, |
|
{ |
|
"epoch": 0.6658619633559731, |
|
"grad_norm": 2.2340962886810303, |
|
"learning_rate": 3.517450682852808e-06, |
|
"loss": 0.2168, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.6667267191525393, |
|
"grad_norm": 1.9558560848236084, |
|
"learning_rate": 3.5083459787556907e-06, |
|
"loss": 0.2088, |
|
"step": 2313 |
|
}, |
|
{ |
|
"epoch": 0.6675914749491055, |
|
"grad_norm": 1.9379972219467163, |
|
"learning_rate": 3.499241274658574e-06, |
|
"loss": 0.2284, |
|
"step": 2316 |
|
}, |
|
{ |
|
"epoch": 0.6684562307456717, |
|
"grad_norm": 2.3833818435668945, |
|
"learning_rate": 3.490136570561457e-06, |
|
"loss": 0.2526, |
|
"step": 2319 |
|
}, |
|
{ |
|
"epoch": 0.6693209865422379, |
|
"grad_norm": 2.1912760734558105, |
|
"learning_rate": 3.48103186646434e-06, |
|
"loss": 0.2579, |
|
"step": 2322 |
|
}, |
|
{ |
|
"epoch": 0.6701857423388041, |
|
"grad_norm": 1.4502041339874268, |
|
"learning_rate": 3.4719271623672233e-06, |
|
"loss": 0.2118, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.6710504981353703, |
|
"grad_norm": 1.6936084032058716, |
|
"learning_rate": 3.4628224582701066e-06, |
|
"loss": 0.2417, |
|
"step": 2328 |
|
}, |
|
{ |
|
"epoch": 0.6719152539319365, |
|
"grad_norm": 2.5630977153778076, |
|
"learning_rate": 3.45371775417299e-06, |
|
"loss": 0.255, |
|
"step": 2331 |
|
}, |
|
{ |
|
"epoch": 0.6727800097285027, |
|
"grad_norm": 1.7571030855178833, |
|
"learning_rate": 3.4446130500758727e-06, |
|
"loss": 0.2122, |
|
"step": 2334 |
|
}, |
|
{ |
|
"epoch": 0.6736447655250689, |
|
"grad_norm": 2.4220032691955566, |
|
"learning_rate": 3.435508345978756e-06, |
|
"loss": 0.2383, |
|
"step": 2337 |
|
}, |
|
{ |
|
"epoch": 0.6745095213216351, |
|
"grad_norm": 2.3608193397521973, |
|
"learning_rate": 3.4264036418816392e-06, |
|
"loss": 0.241, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.6753742771182013, |
|
"grad_norm": 2.0964229106903076, |
|
"learning_rate": 3.417298937784522e-06, |
|
"loss": 0.2178, |
|
"step": 2343 |
|
}, |
|
{ |
|
"epoch": 0.6762390329147675, |
|
"grad_norm": 3.038722038269043, |
|
"learning_rate": 3.4081942336874053e-06, |
|
"loss": 0.2276, |
|
"step": 2346 |
|
}, |
|
{ |
|
"epoch": 0.6771037887113337, |
|
"grad_norm": 3.0551295280456543, |
|
"learning_rate": 3.3990895295902886e-06, |
|
"loss": 0.1959, |
|
"step": 2349 |
|
}, |
|
{ |
|
"epoch": 0.6779685445078999, |
|
"grad_norm": 1.4905811548233032, |
|
"learning_rate": 3.3899848254931714e-06, |
|
"loss": 0.1863, |
|
"step": 2352 |
|
}, |
|
{ |
|
"epoch": 0.6788333003044661, |
|
"grad_norm": 3.0637025833129883, |
|
"learning_rate": 3.380880121396055e-06, |
|
"loss": 0.2341, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.6796980561010323, |
|
"grad_norm": 2.9851162433624268, |
|
"learning_rate": 3.371775417298938e-06, |
|
"loss": 0.2586, |
|
"step": 2358 |
|
}, |
|
{ |
|
"epoch": 0.6805628118975985, |
|
"grad_norm": 2.0650391578674316, |
|
"learning_rate": 3.3626707132018212e-06, |
|
"loss": 0.2181, |
|
"step": 2361 |
|
}, |
|
{ |
|
"epoch": 0.6814275676941647, |
|
"grad_norm": 3.1792430877685547, |
|
"learning_rate": 3.3535660091047045e-06, |
|
"loss": 0.234, |
|
"step": 2364 |
|
}, |
|
{ |
|
"epoch": 0.6822923234907309, |
|
"grad_norm": 2.171764612197876, |
|
"learning_rate": 3.3444613050075873e-06, |
|
"loss": 0.2232, |
|
"step": 2367 |
|
}, |
|
{ |
|
"epoch": 0.6831570792872971, |
|
"grad_norm": 1.8832968473434448, |
|
"learning_rate": 3.3353566009104706e-06, |
|
"loss": 0.2386, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.6840218350838633, |
|
"grad_norm": 2.217407703399658, |
|
"learning_rate": 3.326251896813354e-06, |
|
"loss": 0.2575, |
|
"step": 2373 |
|
}, |
|
{ |
|
"epoch": 0.6848865908804295, |
|
"grad_norm": 1.3866760730743408, |
|
"learning_rate": 3.3171471927162367e-06, |
|
"loss": 0.2321, |
|
"step": 2376 |
|
}, |
|
{ |
|
"epoch": 0.6857513466769957, |
|
"grad_norm": 2.836749315261841, |
|
"learning_rate": 3.3080424886191204e-06, |
|
"loss": 0.2082, |
|
"step": 2379 |
|
}, |
|
{ |
|
"epoch": 0.6866161024735619, |
|
"grad_norm": 4.798961162567139, |
|
"learning_rate": 3.2989377845220033e-06, |
|
"loss": 0.2397, |
|
"step": 2382 |
|
}, |
|
{ |
|
"epoch": 0.6874808582701281, |
|
"grad_norm": 1.8099883794784546, |
|
"learning_rate": 3.289833080424886e-06, |
|
"loss": 0.2468, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.6883456140666943, |
|
"grad_norm": 3.0821380615234375, |
|
"learning_rate": 3.2807283763277698e-06, |
|
"loss": 0.2013, |
|
"step": 2388 |
|
}, |
|
{ |
|
"epoch": 0.6892103698632605, |
|
"grad_norm": 1.6952015161514282, |
|
"learning_rate": 3.2716236722306526e-06, |
|
"loss": 0.2156, |
|
"step": 2391 |
|
}, |
|
{ |
|
"epoch": 0.6900751256598266, |
|
"grad_norm": 2.4413681030273438, |
|
"learning_rate": 3.2625189681335363e-06, |
|
"loss": 0.2307, |
|
"step": 2394 |
|
}, |
|
{ |
|
"epoch": 0.6909398814563928, |
|
"grad_norm": 1.9589879512786865, |
|
"learning_rate": 3.253414264036419e-06, |
|
"loss": 0.2948, |
|
"step": 2397 |
|
}, |
|
{ |
|
"epoch": 0.691804637252959, |
|
"grad_norm": 2.4465548992156982, |
|
"learning_rate": 3.244309559939302e-06, |
|
"loss": 0.223, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.691804637252959, |
|
"eval_loss": 0.22531487047672272, |
|
"eval_mse": 0.2253148703626357, |
|
"eval_runtime": 6.6334, |
|
"eval_samples_per_second": 150.751, |
|
"eval_steps_per_second": 18.844, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.6926693930495252, |
|
"grad_norm": 2.3037302494049072, |
|
"learning_rate": 3.2352048558421857e-06, |
|
"loss": 0.2199, |
|
"step": 2403 |
|
}, |
|
{ |
|
"epoch": 0.6935341488460914, |
|
"grad_norm": 1.6721099615097046, |
|
"learning_rate": 3.2261001517450685e-06, |
|
"loss": 0.2286, |
|
"step": 2406 |
|
}, |
|
{ |
|
"epoch": 0.6943989046426576, |
|
"grad_norm": 3.3806381225585938, |
|
"learning_rate": 3.2169954476479514e-06, |
|
"loss": 0.1988, |
|
"step": 2409 |
|
}, |
|
{ |
|
"epoch": 0.6952636604392238, |
|
"grad_norm": 2.1515021324157715, |
|
"learning_rate": 3.207890743550835e-06, |
|
"loss": 0.2412, |
|
"step": 2412 |
|
}, |
|
{ |
|
"epoch": 0.69612841623579, |
|
"grad_norm": 3.980482816696167, |
|
"learning_rate": 3.198786039453718e-06, |
|
"loss": 0.2088, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.6969931720323563, |
|
"grad_norm": 2.2418131828308105, |
|
"learning_rate": 3.1896813353566016e-06, |
|
"loss": 0.2107, |
|
"step": 2418 |
|
}, |
|
{ |
|
"epoch": 0.6978579278289225, |
|
"grad_norm": 1.819807767868042, |
|
"learning_rate": 3.1805766312594844e-06, |
|
"loss": 0.2435, |
|
"step": 2421 |
|
}, |
|
{ |
|
"epoch": 0.6987226836254887, |
|
"grad_norm": 3.8227691650390625, |
|
"learning_rate": 3.1714719271623673e-06, |
|
"loss": 0.2278, |
|
"step": 2424 |
|
}, |
|
{ |
|
"epoch": 0.6995874394220549, |
|
"grad_norm": 2.207240104675293, |
|
"learning_rate": 3.162367223065251e-06, |
|
"loss": 0.2362, |
|
"step": 2427 |
|
}, |
|
{ |
|
"epoch": 0.7004521952186211, |
|
"grad_norm": 1.796724796295166, |
|
"learning_rate": 3.153262518968134e-06, |
|
"loss": 0.2383, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.7013169510151873, |
|
"grad_norm": 2.7628397941589355, |
|
"learning_rate": 3.1441578148710167e-06, |
|
"loss": 0.2326, |
|
"step": 2433 |
|
}, |
|
{ |
|
"epoch": 0.7021817068117535, |
|
"grad_norm": 1.3642479181289673, |
|
"learning_rate": 3.1350531107739003e-06, |
|
"loss": 0.2059, |
|
"step": 2436 |
|
}, |
|
{ |
|
"epoch": 0.7030464626083197, |
|
"grad_norm": 1.5554901361465454, |
|
"learning_rate": 3.125948406676783e-06, |
|
"loss": 0.2162, |
|
"step": 2439 |
|
}, |
|
{ |
|
"epoch": 0.7039112184048859, |
|
"grad_norm": 2.5311179161071777, |
|
"learning_rate": 3.1168437025796665e-06, |
|
"loss": 0.237, |
|
"step": 2442 |
|
}, |
|
{ |
|
"epoch": 0.7047759742014521, |
|
"grad_norm": 1.805991768836975, |
|
"learning_rate": 3.1077389984825497e-06, |
|
"loss": 0.2426, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 0.7056407299980183, |
|
"grad_norm": 2.029891014099121, |
|
"learning_rate": 3.0986342943854326e-06, |
|
"loss": 0.2139, |
|
"step": 2448 |
|
}, |
|
{ |
|
"epoch": 0.7065054857945845, |
|
"grad_norm": 2.5495572090148926, |
|
"learning_rate": 3.089529590288316e-06, |
|
"loss": 0.2279, |
|
"step": 2451 |
|
}, |
|
{ |
|
"epoch": 0.7073702415911507, |
|
"grad_norm": 1.5610768795013428, |
|
"learning_rate": 3.080424886191199e-06, |
|
"loss": 0.2175, |
|
"step": 2454 |
|
}, |
|
{ |
|
"epoch": 0.7082349973877169, |
|
"grad_norm": 3.5896153450012207, |
|
"learning_rate": 3.0713201820940824e-06, |
|
"loss": 0.2459, |
|
"step": 2457 |
|
}, |
|
{ |
|
"epoch": 0.7090997531842831, |
|
"grad_norm": 2.1120688915252686, |
|
"learning_rate": 3.062215477996965e-06, |
|
"loss": 0.2565, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.7099645089808493, |
|
"grad_norm": 2.7099833488464355, |
|
"learning_rate": 3.0531107738998485e-06, |
|
"loss": 0.2088, |
|
"step": 2463 |
|
}, |
|
{ |
|
"epoch": 0.7108292647774155, |
|
"grad_norm": 1.7153905630111694, |
|
"learning_rate": 3.0440060698027317e-06, |
|
"loss": 0.2114, |
|
"step": 2466 |
|
}, |
|
{ |
|
"epoch": 0.7116940205739817, |
|
"grad_norm": 1.9465105533599854, |
|
"learning_rate": 3.0349013657056146e-06, |
|
"loss": 0.1932, |
|
"step": 2469 |
|
}, |
|
{ |
|
"epoch": 0.7125587763705479, |
|
"grad_norm": 1.6483453512191772, |
|
"learning_rate": 3.025796661608498e-06, |
|
"loss": 0.232, |
|
"step": 2472 |
|
}, |
|
{ |
|
"epoch": 0.7134235321671141, |
|
"grad_norm": 2.3854711055755615, |
|
"learning_rate": 3.016691957511381e-06, |
|
"loss": 0.2397, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.7142882879636803, |
|
"grad_norm": 1.565765380859375, |
|
"learning_rate": 3.0075872534142644e-06, |
|
"loss": 0.2055, |
|
"step": 2478 |
|
}, |
|
{ |
|
"epoch": 0.7151530437602465, |
|
"grad_norm": 1.5985909700393677, |
|
"learning_rate": 2.9984825493171476e-06, |
|
"loss": 0.2077, |
|
"step": 2481 |
|
}, |
|
{ |
|
"epoch": 0.7160177995568127, |
|
"grad_norm": 2.9907102584838867, |
|
"learning_rate": 2.9893778452200305e-06, |
|
"loss": 0.2125, |
|
"step": 2484 |
|
}, |
|
{ |
|
"epoch": 0.7168825553533789, |
|
"grad_norm": 3.0764994621276855, |
|
"learning_rate": 2.9802731411229137e-06, |
|
"loss": 0.2342, |
|
"step": 2487 |
|
}, |
|
{ |
|
"epoch": 0.717747311149945, |
|
"grad_norm": 2.954237461090088, |
|
"learning_rate": 2.971168437025797e-06, |
|
"loss": 0.232, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.7186120669465113, |
|
"grad_norm": 1.5421547889709473, |
|
"learning_rate": 2.96206373292868e-06, |
|
"loss": 0.2019, |
|
"step": 2493 |
|
}, |
|
{ |
|
"epoch": 0.7194768227430774, |
|
"grad_norm": 5.054042816162109, |
|
"learning_rate": 2.9529590288315635e-06, |
|
"loss": 0.2359, |
|
"step": 2496 |
|
}, |
|
{ |
|
"epoch": 0.7203415785396436, |
|
"grad_norm": 1.5477067232131958, |
|
"learning_rate": 2.9438543247344464e-06, |
|
"loss": 0.2028, |
|
"step": 2499 |
|
}, |
|
{ |
|
"epoch": 0.7206298304718324, |
|
"eval_loss": 0.22387926280498505, |
|
"eval_mse": 0.22387926151184365, |
|
"eval_runtime": 6.5037, |
|
"eval_samples_per_second": 153.759, |
|
"eval_steps_per_second": 19.22, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.7212063343362098, |
|
"grad_norm": 1.912636160850525, |
|
"learning_rate": 2.9347496206373292e-06, |
|
"loss": 0.2167, |
|
"step": 2502 |
|
}, |
|
{ |
|
"epoch": 0.722071090132776, |
|
"grad_norm": 1.644394040107727, |
|
"learning_rate": 2.925644916540213e-06, |
|
"loss": 0.2068, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 0.7229358459293422, |
|
"grad_norm": 3.4328315258026123, |
|
"learning_rate": 2.9165402124430958e-06, |
|
"loss": 0.2417, |
|
"step": 2508 |
|
}, |
|
{ |
|
"epoch": 0.7238006017259084, |
|
"grad_norm": 1.918043613433838, |
|
"learning_rate": 2.9074355083459786e-06, |
|
"loss": 0.2308, |
|
"step": 2511 |
|
}, |
|
{ |
|
"epoch": 0.7246653575224746, |
|
"grad_norm": 1.94221031665802, |
|
"learning_rate": 2.8983308042488623e-06, |
|
"loss": 0.2957, |
|
"step": 2514 |
|
}, |
|
{ |
|
"epoch": 0.7255301133190408, |
|
"grad_norm": 2.877037525177002, |
|
"learning_rate": 2.889226100151745e-06, |
|
"loss": 0.2133, |
|
"step": 2517 |
|
}, |
|
{ |
|
"epoch": 0.726394869115607, |
|
"grad_norm": 2.2768120765686035, |
|
"learning_rate": 2.880121396054629e-06, |
|
"loss": 0.2251, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.7272596249121732, |
|
"grad_norm": 2.9239742755889893, |
|
"learning_rate": 2.8710166919575117e-06, |
|
"loss": 0.215, |
|
"step": 2523 |
|
}, |
|
{ |
|
"epoch": 0.7281243807087394, |
|
"grad_norm": 1.5520339012145996, |
|
"learning_rate": 2.8619119878603945e-06, |
|
"loss": 0.2113, |
|
"step": 2526 |
|
}, |
|
{ |
|
"epoch": 0.7289891365053056, |
|
"grad_norm": 3.458822011947632, |
|
"learning_rate": 2.852807283763278e-06, |
|
"loss": 0.2465, |
|
"step": 2529 |
|
}, |
|
{ |
|
"epoch": 0.7298538923018718, |
|
"grad_norm": 1.606724500656128, |
|
"learning_rate": 2.843702579666161e-06, |
|
"loss": 0.188, |
|
"step": 2532 |
|
}, |
|
{ |
|
"epoch": 0.730718648098438, |
|
"grad_norm": 3.552236318588257, |
|
"learning_rate": 2.834597875569044e-06, |
|
"loss": 0.2178, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 0.7315834038950042, |
|
"grad_norm": 2.940363883972168, |
|
"learning_rate": 2.8254931714719276e-06, |
|
"loss": 0.2149, |
|
"step": 2538 |
|
}, |
|
{ |
|
"epoch": 0.7324481596915704, |
|
"grad_norm": 1.3787034749984741, |
|
"learning_rate": 2.8163884673748104e-06, |
|
"loss": 0.2155, |
|
"step": 2541 |
|
}, |
|
{ |
|
"epoch": 0.7333129154881366, |
|
"grad_norm": 1.4637516736984253, |
|
"learning_rate": 2.807283763277694e-06, |
|
"loss": 0.1944, |
|
"step": 2544 |
|
}, |
|
{ |
|
"epoch": 0.7341776712847028, |
|
"grad_norm": 1.5903189182281494, |
|
"learning_rate": 2.798179059180577e-06, |
|
"loss": 0.214, |
|
"step": 2547 |
|
}, |
|
{ |
|
"epoch": 0.735042427081269, |
|
"grad_norm": 1.999664306640625, |
|
"learning_rate": 2.78907435508346e-06, |
|
"loss": 0.2277, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.7359071828778352, |
|
"grad_norm": 2.250450849533081, |
|
"learning_rate": 2.7799696509863435e-06, |
|
"loss": 0.1958, |
|
"step": 2553 |
|
}, |
|
{ |
|
"epoch": 0.7367719386744014, |
|
"grad_norm": 1.8212629556655884, |
|
"learning_rate": 2.7708649468892263e-06, |
|
"loss": 0.2155, |
|
"step": 2556 |
|
}, |
|
{ |
|
"epoch": 0.7376366944709676, |
|
"grad_norm": 1.4670761823654175, |
|
"learning_rate": 2.7617602427921096e-06, |
|
"loss": 0.2251, |
|
"step": 2559 |
|
}, |
|
{ |
|
"epoch": 0.7385014502675338, |
|
"grad_norm": 2.9052860736846924, |
|
"learning_rate": 2.752655538694993e-06, |
|
"loss": 0.2279, |
|
"step": 2562 |
|
}, |
|
{ |
|
"epoch": 0.7393662060641, |
|
"grad_norm": 1.647455096244812, |
|
"learning_rate": 2.7435508345978757e-06, |
|
"loss": 0.2011, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 0.7402309618606662, |
|
"grad_norm": 2.5667457580566406, |
|
"learning_rate": 2.734446130500759e-06, |
|
"loss": 0.2113, |
|
"step": 2568 |
|
}, |
|
{ |
|
"epoch": 0.7410957176572324, |
|
"grad_norm": 2.021571159362793, |
|
"learning_rate": 2.7253414264036422e-06, |
|
"loss": 0.2085, |
|
"step": 2571 |
|
}, |
|
{ |
|
"epoch": 0.7419604734537987, |
|
"grad_norm": 3.436924457550049, |
|
"learning_rate": 2.716236722306525e-06, |
|
"loss": 0.2658, |
|
"step": 2574 |
|
}, |
|
{ |
|
"epoch": 0.7428252292503649, |
|
"grad_norm": 1.9480434656143188, |
|
"learning_rate": 2.7071320182094083e-06, |
|
"loss": 0.2595, |
|
"step": 2577 |
|
}, |
|
{ |
|
"epoch": 0.7436899850469311, |
|
"grad_norm": 1.8556321859359741, |
|
"learning_rate": 2.6980273141122916e-06, |
|
"loss": 0.2308, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.7445547408434973, |
|
"grad_norm": 3.174111843109131, |
|
"learning_rate": 2.688922610015175e-06, |
|
"loss": 0.2172, |
|
"step": 2583 |
|
}, |
|
{ |
|
"epoch": 0.7454194966400635, |
|
"grad_norm": 1.5629518032073975, |
|
"learning_rate": 2.6798179059180577e-06, |
|
"loss": 0.2417, |
|
"step": 2586 |
|
}, |
|
{ |
|
"epoch": 0.7462842524366297, |
|
"grad_norm": 1.8133536577224731, |
|
"learning_rate": 2.670713201820941e-06, |
|
"loss": 0.2188, |
|
"step": 2589 |
|
}, |
|
{ |
|
"epoch": 0.7471490082331959, |
|
"grad_norm": 1.5448634624481201, |
|
"learning_rate": 2.6616084977238242e-06, |
|
"loss": 0.2201, |
|
"step": 2592 |
|
}, |
|
{ |
|
"epoch": 0.748013764029762, |
|
"grad_norm": 2.376194953918457, |
|
"learning_rate": 2.652503793626707e-06, |
|
"loss": 0.2146, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 0.7488785198263282, |
|
"grad_norm": 1.8988285064697266, |
|
"learning_rate": 2.6433990895295904e-06, |
|
"loss": 0.2322, |
|
"step": 2598 |
|
}, |
|
{ |
|
"epoch": 0.7494550236907057, |
|
"eval_loss": 0.21797478199005127, |
|
"eval_mse": 0.21797478066571058, |
|
"eval_runtime": 6.533, |
|
"eval_samples_per_second": 153.07, |
|
"eval_steps_per_second": 19.134, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.7497432756228944, |
|
"grad_norm": 3.16768217086792, |
|
"learning_rate": 2.6342943854324736e-06, |
|
"loss": 0.2533, |
|
"step": 2601 |
|
}, |
|
{ |
|
"epoch": 0.7506080314194606, |
|
"grad_norm": 1.7228988409042358, |
|
"learning_rate": 2.625189681335357e-06, |
|
"loss": 0.2406, |
|
"step": 2604 |
|
}, |
|
{ |
|
"epoch": 0.7514727872160268, |
|
"grad_norm": 2.9629013538360596, |
|
"learning_rate": 2.61608497723824e-06, |
|
"loss": 0.2131, |
|
"step": 2607 |
|
}, |
|
{ |
|
"epoch": 0.752337543012593, |
|
"grad_norm": 3.4181559085845947, |
|
"learning_rate": 2.606980273141123e-06, |
|
"loss": 0.2126, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.7532022988091592, |
|
"grad_norm": 2.142685890197754, |
|
"learning_rate": 2.5978755690440063e-06, |
|
"loss": 0.2345, |
|
"step": 2613 |
|
}, |
|
{ |
|
"epoch": 0.7540670546057254, |
|
"grad_norm": 3.622145175933838, |
|
"learning_rate": 2.5887708649468895e-06, |
|
"loss": 0.2187, |
|
"step": 2616 |
|
}, |
|
{ |
|
"epoch": 0.7549318104022916, |
|
"grad_norm": 1.5243175029754639, |
|
"learning_rate": 2.5796661608497724e-06, |
|
"loss": 0.1996, |
|
"step": 2619 |
|
}, |
|
{ |
|
"epoch": 0.7557965661988578, |
|
"grad_norm": 2.7075355052948, |
|
"learning_rate": 2.570561456752656e-06, |
|
"loss": 0.2755, |
|
"step": 2622 |
|
}, |
|
{ |
|
"epoch": 0.756661321995424, |
|
"grad_norm": 2.6778082847595215, |
|
"learning_rate": 2.561456752655539e-06, |
|
"loss": 0.1959, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.7575260777919902, |
|
"grad_norm": 3.8043668270111084, |
|
"learning_rate": 2.5523520485584217e-06, |
|
"loss": 0.2054, |
|
"step": 2628 |
|
}, |
|
{ |
|
"epoch": 0.7583908335885564, |
|
"grad_norm": 3.628281354904175, |
|
"learning_rate": 2.5432473444613054e-06, |
|
"loss": 0.2095, |
|
"step": 2631 |
|
}, |
|
{ |
|
"epoch": 0.7592555893851226, |
|
"grad_norm": 3.2685089111328125, |
|
"learning_rate": 2.5341426403641883e-06, |
|
"loss": 0.2167, |
|
"step": 2634 |
|
}, |
|
{ |
|
"epoch": 0.7601203451816888, |
|
"grad_norm": 1.6048041582107544, |
|
"learning_rate": 2.525037936267071e-06, |
|
"loss": 0.2416, |
|
"step": 2637 |
|
}, |
|
{ |
|
"epoch": 0.760985100978255, |
|
"grad_norm": 1.5175867080688477, |
|
"learning_rate": 2.515933232169955e-06, |
|
"loss": 0.2326, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.7618498567748212, |
|
"grad_norm": 1.9787158966064453, |
|
"learning_rate": 2.5068285280728376e-06, |
|
"loss": 0.2266, |
|
"step": 2643 |
|
}, |
|
{ |
|
"epoch": 0.7627146125713874, |
|
"grad_norm": 4.297834873199463, |
|
"learning_rate": 2.497723823975721e-06, |
|
"loss": 0.2414, |
|
"step": 2646 |
|
}, |
|
{ |
|
"epoch": 0.7635793683679536, |
|
"grad_norm": 1.822587251663208, |
|
"learning_rate": 2.488619119878604e-06, |
|
"loss": 0.2386, |
|
"step": 2649 |
|
}, |
|
{ |
|
"epoch": 0.7644441241645198, |
|
"grad_norm": 4.966648101806641, |
|
"learning_rate": 2.4795144157814874e-06, |
|
"loss": 0.2268, |
|
"step": 2652 |
|
}, |
|
{ |
|
"epoch": 0.765308879961086, |
|
"grad_norm": 2.8137335777282715, |
|
"learning_rate": 2.4704097116843703e-06, |
|
"loss": 0.2349, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 0.7661736357576522, |
|
"grad_norm": 2.5401577949523926, |
|
"learning_rate": 2.4613050075872536e-06, |
|
"loss": 0.1941, |
|
"step": 2658 |
|
}, |
|
{ |
|
"epoch": 0.7670383915542184, |
|
"grad_norm": 1.550758957862854, |
|
"learning_rate": 2.452200303490137e-06, |
|
"loss": 0.2151, |
|
"step": 2661 |
|
}, |
|
{ |
|
"epoch": 0.7679031473507846, |
|
"grad_norm": 6.889467239379883, |
|
"learning_rate": 2.44309559939302e-06, |
|
"loss": 0.2287, |
|
"step": 2664 |
|
}, |
|
{ |
|
"epoch": 0.7687679031473508, |
|
"grad_norm": 1.4833314418792725, |
|
"learning_rate": 2.4339908952959034e-06, |
|
"loss": 0.2017, |
|
"step": 2667 |
|
}, |
|
{ |
|
"epoch": 0.769632658943917, |
|
"grad_norm": 1.607751727104187, |
|
"learning_rate": 2.424886191198786e-06, |
|
"loss": 0.2309, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.7704974147404832, |
|
"grad_norm": 1.7369478940963745, |
|
"learning_rate": 2.4157814871016695e-06, |
|
"loss": 0.217, |
|
"step": 2673 |
|
}, |
|
{ |
|
"epoch": 0.7713621705370494, |
|
"grad_norm": 1.7309290170669556, |
|
"learning_rate": 2.4066767830045527e-06, |
|
"loss": 0.2151, |
|
"step": 2676 |
|
}, |
|
{ |
|
"epoch": 0.7722269263336156, |
|
"grad_norm": 2.047727108001709, |
|
"learning_rate": 2.397572078907436e-06, |
|
"loss": 0.2078, |
|
"step": 2679 |
|
}, |
|
{ |
|
"epoch": 0.7730916821301818, |
|
"grad_norm": 2.2800142765045166, |
|
"learning_rate": 2.388467374810319e-06, |
|
"loss": 0.2372, |
|
"step": 2682 |
|
}, |
|
{ |
|
"epoch": 0.773956437926748, |
|
"grad_norm": 3.920849323272705, |
|
"learning_rate": 2.379362670713202e-06, |
|
"loss": 0.2304, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 0.7748211937233141, |
|
"grad_norm": 3.4216678142547607, |
|
"learning_rate": 2.3702579666160854e-06, |
|
"loss": 0.2382, |
|
"step": 2688 |
|
}, |
|
{ |
|
"epoch": 0.7756859495198803, |
|
"grad_norm": 1.5471861362457275, |
|
"learning_rate": 2.3611532625189686e-06, |
|
"loss": 0.2243, |
|
"step": 2691 |
|
}, |
|
{ |
|
"epoch": 0.7765507053164465, |
|
"grad_norm": 1.7489866018295288, |
|
"learning_rate": 2.3520485584218515e-06, |
|
"loss": 0.2239, |
|
"step": 2694 |
|
}, |
|
{ |
|
"epoch": 0.7774154611130127, |
|
"grad_norm": 4.3836822509765625, |
|
"learning_rate": 2.3429438543247347e-06, |
|
"loss": 0.205, |
|
"step": 2697 |
|
}, |
|
{ |
|
"epoch": 0.7782802169095789, |
|
"grad_norm": 1.8707342147827148, |
|
"learning_rate": 2.333839150227618e-06, |
|
"loss": 0.1933, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.7782802169095789, |
|
"eval_loss": 0.2158193737268448, |
|
"eval_mse": 0.21581936262454837, |
|
"eval_runtime": 6.5614, |
|
"eval_samples_per_second": 152.407, |
|
"eval_steps_per_second": 19.051, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.7791449727061451, |
|
"grad_norm": 1.6554896831512451, |
|
"learning_rate": 2.324734446130501e-06, |
|
"loss": 0.2178, |
|
"step": 2703 |
|
}, |
|
{ |
|
"epoch": 0.7800097285027113, |
|
"grad_norm": 3.131352424621582, |
|
"learning_rate": 2.315629742033384e-06, |
|
"loss": 0.2122, |
|
"step": 2706 |
|
}, |
|
{ |
|
"epoch": 0.7808744842992775, |
|
"grad_norm": 2.62422776222229, |
|
"learning_rate": 2.3065250379362674e-06, |
|
"loss": 0.2085, |
|
"step": 2709 |
|
}, |
|
{ |
|
"epoch": 0.7817392400958437, |
|
"grad_norm": 2.1258456707000732, |
|
"learning_rate": 2.2974203338391502e-06, |
|
"loss": 0.2203, |
|
"step": 2712 |
|
}, |
|
{ |
|
"epoch": 0.7826039958924099, |
|
"grad_norm": 3.144688606262207, |
|
"learning_rate": 2.2883156297420335e-06, |
|
"loss": 0.2428, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 0.7834687516889761, |
|
"grad_norm": 4.5740180015563965, |
|
"learning_rate": 2.2792109256449168e-06, |
|
"loss": 0.2196, |
|
"step": 2718 |
|
}, |
|
{ |
|
"epoch": 0.7843335074855423, |
|
"grad_norm": 1.7256083488464355, |
|
"learning_rate": 2.2701062215477996e-06, |
|
"loss": 0.2316, |
|
"step": 2721 |
|
}, |
|
{ |
|
"epoch": 0.7851982632821085, |
|
"grad_norm": 2.0723230838775635, |
|
"learning_rate": 2.261001517450683e-06, |
|
"loss": 0.1958, |
|
"step": 2724 |
|
}, |
|
{ |
|
"epoch": 0.7860630190786747, |
|
"grad_norm": 1.6268962621688843, |
|
"learning_rate": 2.251896813353566e-06, |
|
"loss": 0.2329, |
|
"step": 2727 |
|
}, |
|
{ |
|
"epoch": 0.786927774875241, |
|
"grad_norm": 4.054417610168457, |
|
"learning_rate": 2.2427921092564494e-06, |
|
"loss": 0.2091, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.7877925306718072, |
|
"grad_norm": 1.7409260272979736, |
|
"learning_rate": 2.2336874051593322e-06, |
|
"loss": 0.2144, |
|
"step": 2733 |
|
}, |
|
{ |
|
"epoch": 0.7886572864683734, |
|
"grad_norm": 2.77607798576355, |
|
"learning_rate": 2.2245827010622155e-06, |
|
"loss": 0.2199, |
|
"step": 2736 |
|
}, |
|
{ |
|
"epoch": 0.7895220422649396, |
|
"grad_norm": 2.215284585952759, |
|
"learning_rate": 2.2154779969650988e-06, |
|
"loss": 0.2388, |
|
"step": 2739 |
|
}, |
|
{ |
|
"epoch": 0.7903867980615058, |
|
"grad_norm": 1.7318382263183594, |
|
"learning_rate": 2.206373292867982e-06, |
|
"loss": 0.2115, |
|
"step": 2742 |
|
}, |
|
{ |
|
"epoch": 0.791251553858072, |
|
"grad_norm": 1.5627691745758057, |
|
"learning_rate": 2.197268588770865e-06, |
|
"loss": 0.2054, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 0.7921163096546382, |
|
"grad_norm": 1.810509443283081, |
|
"learning_rate": 2.188163884673748e-06, |
|
"loss": 0.2086, |
|
"step": 2748 |
|
}, |
|
{ |
|
"epoch": 0.7929810654512044, |
|
"grad_norm": 2.1531972885131836, |
|
"learning_rate": 2.1790591805766314e-06, |
|
"loss": 0.2252, |
|
"step": 2751 |
|
}, |
|
{ |
|
"epoch": 0.7938458212477706, |
|
"grad_norm": 2.0212440490722656, |
|
"learning_rate": 2.1699544764795147e-06, |
|
"loss": 0.2355, |
|
"step": 2754 |
|
}, |
|
{ |
|
"epoch": 0.7947105770443368, |
|
"grad_norm": 5.030855178833008, |
|
"learning_rate": 2.1608497723823975e-06, |
|
"loss": 0.2341, |
|
"step": 2757 |
|
}, |
|
{ |
|
"epoch": 0.795575332840903, |
|
"grad_norm": 2.213249921798706, |
|
"learning_rate": 2.1517450682852808e-06, |
|
"loss": 0.2133, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.7964400886374692, |
|
"grad_norm": 1.8025689125061035, |
|
"learning_rate": 2.142640364188164e-06, |
|
"loss": 0.2251, |
|
"step": 2763 |
|
}, |
|
{ |
|
"epoch": 0.7973048444340354, |
|
"grad_norm": 4.14149284362793, |
|
"learning_rate": 2.1335356600910473e-06, |
|
"loss": 0.253, |
|
"step": 2766 |
|
}, |
|
{ |
|
"epoch": 0.7981696002306016, |
|
"grad_norm": 2.2051069736480713, |
|
"learning_rate": 2.12443095599393e-06, |
|
"loss": 0.2238, |
|
"step": 2769 |
|
}, |
|
{ |
|
"epoch": 0.7990343560271678, |
|
"grad_norm": 2.249032497406006, |
|
"learning_rate": 2.1153262518968134e-06, |
|
"loss": 0.2282, |
|
"step": 2772 |
|
}, |
|
{ |
|
"epoch": 0.799899111823734, |
|
"grad_norm": 1.5087867975234985, |
|
"learning_rate": 2.1062215477996967e-06, |
|
"loss": 0.1948, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.8007638676203002, |
|
"grad_norm": 1.9934585094451904, |
|
"learning_rate": 2.09711684370258e-06, |
|
"loss": 0.2061, |
|
"step": 2778 |
|
}, |
|
{ |
|
"epoch": 0.8016286234168664, |
|
"grad_norm": 2.521526336669922, |
|
"learning_rate": 2.0880121396054632e-06, |
|
"loss": 0.2331, |
|
"step": 2781 |
|
}, |
|
{ |
|
"epoch": 0.8024933792134326, |
|
"grad_norm": 4.441010475158691, |
|
"learning_rate": 2.078907435508346e-06, |
|
"loss": 0.2337, |
|
"step": 2784 |
|
}, |
|
{ |
|
"epoch": 0.8033581350099988, |
|
"grad_norm": 1.9386543035507202, |
|
"learning_rate": 2.0698027314112293e-06, |
|
"loss": 0.2434, |
|
"step": 2787 |
|
}, |
|
{ |
|
"epoch": 0.804222890806565, |
|
"grad_norm": 1.6140722036361694, |
|
"learning_rate": 2.0606980273141126e-06, |
|
"loss": 0.223, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.8050876466031311, |
|
"grad_norm": 3.248769998550415, |
|
"learning_rate": 2.051593323216996e-06, |
|
"loss": 0.2352, |
|
"step": 2793 |
|
}, |
|
{ |
|
"epoch": 0.8059524023996973, |
|
"grad_norm": 2.259561061859131, |
|
"learning_rate": 2.0424886191198787e-06, |
|
"loss": 0.2428, |
|
"step": 2796 |
|
}, |
|
{ |
|
"epoch": 0.8068171581962635, |
|
"grad_norm": 2.289113998413086, |
|
"learning_rate": 2.033383915022762e-06, |
|
"loss": 0.2085, |
|
"step": 2799 |
|
}, |
|
{ |
|
"epoch": 0.8071054101284523, |
|
"eval_loss": 0.22976131737232208, |
|
"eval_mse": 0.22976131996285404, |
|
"eval_runtime": 6.5104, |
|
"eval_samples_per_second": 153.6, |
|
"eval_steps_per_second": 19.2, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.8076819139928297, |
|
"grad_norm": 1.4588847160339355, |
|
"learning_rate": 2.0242792109256452e-06, |
|
"loss": 0.2004, |
|
"step": 2802 |
|
}, |
|
{ |
|
"epoch": 0.8085466697893959, |
|
"grad_norm": 3.1680517196655273, |
|
"learning_rate": 2.0151745068285285e-06, |
|
"loss": 0.2038, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 0.8094114255859621, |
|
"grad_norm": 2.708411693572998, |
|
"learning_rate": 2.0060698027314113e-06, |
|
"loss": 0.2289, |
|
"step": 2808 |
|
}, |
|
{ |
|
"epoch": 0.8102761813825283, |
|
"grad_norm": 2.2479403018951416, |
|
"learning_rate": 1.9969650986342946e-06, |
|
"loss": 0.2465, |
|
"step": 2811 |
|
}, |
|
{ |
|
"epoch": 0.8111409371790945, |
|
"grad_norm": 3.2582664489746094, |
|
"learning_rate": 1.987860394537178e-06, |
|
"loss": 0.2187, |
|
"step": 2814 |
|
}, |
|
{ |
|
"epoch": 0.8120056929756607, |
|
"grad_norm": 2.5267367362976074, |
|
"learning_rate": 1.978755690440061e-06, |
|
"loss": 0.1955, |
|
"step": 2817 |
|
}, |
|
{ |
|
"epoch": 0.8128704487722269, |
|
"grad_norm": 2.42645525932312, |
|
"learning_rate": 1.969650986342944e-06, |
|
"loss": 0.233, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.8137352045687931, |
|
"grad_norm": 1.6183414459228516, |
|
"learning_rate": 1.9605462822458273e-06, |
|
"loss": 0.2187, |
|
"step": 2823 |
|
}, |
|
{ |
|
"epoch": 0.8145999603653593, |
|
"grad_norm": 2.7215240001678467, |
|
"learning_rate": 1.9514415781487105e-06, |
|
"loss": 0.2109, |
|
"step": 2826 |
|
}, |
|
{ |
|
"epoch": 0.8154647161619255, |
|
"grad_norm": 2.152639389038086, |
|
"learning_rate": 1.9423368740515934e-06, |
|
"loss": 0.2056, |
|
"step": 2829 |
|
}, |
|
{ |
|
"epoch": 0.8163294719584917, |
|
"grad_norm": 2.530045509338379, |
|
"learning_rate": 1.9332321699544766e-06, |
|
"loss": 0.2185, |
|
"step": 2832 |
|
}, |
|
{ |
|
"epoch": 0.8171942277550579, |
|
"grad_norm": 1.8189818859100342, |
|
"learning_rate": 1.92412746585736e-06, |
|
"loss": 0.2502, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 0.8180589835516241, |
|
"grad_norm": 1.5473167896270752, |
|
"learning_rate": 1.9150227617602427e-06, |
|
"loss": 0.1923, |
|
"step": 2838 |
|
}, |
|
{ |
|
"epoch": 0.8189237393481903, |
|
"grad_norm": 2.99245285987854, |
|
"learning_rate": 1.9059180576631262e-06, |
|
"loss": 0.209, |
|
"step": 2841 |
|
}, |
|
{ |
|
"epoch": 0.8197884951447565, |
|
"grad_norm": 1.509892225265503, |
|
"learning_rate": 1.8968133535660093e-06, |
|
"loss": 0.2293, |
|
"step": 2844 |
|
}, |
|
{ |
|
"epoch": 0.8206532509413227, |
|
"grad_norm": 3.5667645931243896, |
|
"learning_rate": 1.8877086494688923e-06, |
|
"loss": 0.1935, |
|
"step": 2847 |
|
}, |
|
{ |
|
"epoch": 0.8215180067378889, |
|
"grad_norm": 3.1929867267608643, |
|
"learning_rate": 1.8786039453717756e-06, |
|
"loss": 0.201, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.8223827625344551, |
|
"grad_norm": 1.5706427097320557, |
|
"learning_rate": 1.8694992412746589e-06, |
|
"loss": 0.2315, |
|
"step": 2853 |
|
}, |
|
{ |
|
"epoch": 0.8232475183310213, |
|
"grad_norm": 3.1730329990386963, |
|
"learning_rate": 1.860394537177542e-06, |
|
"loss": 0.2475, |
|
"step": 2856 |
|
}, |
|
{ |
|
"epoch": 0.8241122741275875, |
|
"grad_norm": 1.3369548320770264, |
|
"learning_rate": 1.851289833080425e-06, |
|
"loss": 0.2186, |
|
"step": 2859 |
|
}, |
|
{ |
|
"epoch": 0.8249770299241537, |
|
"grad_norm": 2.522751569747925, |
|
"learning_rate": 1.8421851289833082e-06, |
|
"loss": 0.2477, |
|
"step": 2862 |
|
}, |
|
{ |
|
"epoch": 0.8258417857207199, |
|
"grad_norm": 1.991076111793518, |
|
"learning_rate": 1.8330804248861913e-06, |
|
"loss": 0.2009, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 0.8267065415172861, |
|
"grad_norm": 2.707282781600952, |
|
"learning_rate": 1.8239757207890745e-06, |
|
"loss": 0.2302, |
|
"step": 2868 |
|
}, |
|
{ |
|
"epoch": 0.8275712973138523, |
|
"grad_norm": 1.578192949295044, |
|
"learning_rate": 1.8148710166919576e-06, |
|
"loss": 0.2021, |
|
"step": 2871 |
|
}, |
|
{ |
|
"epoch": 0.8284360531104185, |
|
"grad_norm": 3.6003148555755615, |
|
"learning_rate": 1.8057663125948407e-06, |
|
"loss": 0.2409, |
|
"step": 2874 |
|
}, |
|
{ |
|
"epoch": 0.8293008089069847, |
|
"grad_norm": 3.1442506313323975, |
|
"learning_rate": 1.796661608497724e-06, |
|
"loss": 0.2129, |
|
"step": 2877 |
|
}, |
|
{ |
|
"epoch": 0.8301655647035509, |
|
"grad_norm": 1.509333610534668, |
|
"learning_rate": 1.7875569044006072e-06, |
|
"loss": 0.2202, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.831030320500117, |
|
"grad_norm": 1.9379024505615234, |
|
"learning_rate": 1.7784522003034905e-06, |
|
"loss": 0.2238, |
|
"step": 2883 |
|
}, |
|
{ |
|
"epoch": 0.8318950762966834, |
|
"grad_norm": 1.3617918491363525, |
|
"learning_rate": 1.7693474962063733e-06, |
|
"loss": 0.1972, |
|
"step": 2886 |
|
}, |
|
{ |
|
"epoch": 0.8327598320932496, |
|
"grad_norm": 1.4775515794754028, |
|
"learning_rate": 1.7602427921092566e-06, |
|
"loss": 0.1985, |
|
"step": 2889 |
|
}, |
|
{ |
|
"epoch": 0.8336245878898157, |
|
"grad_norm": 1.4302202463150024, |
|
"learning_rate": 1.7511380880121398e-06, |
|
"loss": 0.2403, |
|
"step": 2892 |
|
}, |
|
{ |
|
"epoch": 0.834489343686382, |
|
"grad_norm": 2.128401041030884, |
|
"learning_rate": 1.742033383915023e-06, |
|
"loss": 0.2204, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 0.8353540994829481, |
|
"grad_norm": 1.9766216278076172, |
|
"learning_rate": 1.732928679817906e-06, |
|
"loss": 0.2038, |
|
"step": 2898 |
|
}, |
|
{ |
|
"epoch": 0.8359306033473256, |
|
"eval_loss": 0.21660968661308289, |
|
"eval_mse": 0.21660968138270925, |
|
"eval_runtime": 6.5731, |
|
"eval_samples_per_second": 152.135, |
|
"eval_steps_per_second": 19.017, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.8362188552795143, |
|
"grad_norm": 2.260625123977661, |
|
"learning_rate": 1.7238239757207892e-06, |
|
"loss": 0.2206, |
|
"step": 2901 |
|
}, |
|
{ |
|
"epoch": 0.8370836110760805, |
|
"grad_norm": 2.605224609375, |
|
"learning_rate": 1.7147192716236725e-06, |
|
"loss": 0.2186, |
|
"step": 2904 |
|
}, |
|
{ |
|
"epoch": 0.8379483668726467, |
|
"grad_norm": 2.7619729042053223, |
|
"learning_rate": 1.7056145675265557e-06, |
|
"loss": 0.2195, |
|
"step": 2907 |
|
}, |
|
{ |
|
"epoch": 0.8388131226692129, |
|
"grad_norm": 1.9395873546600342, |
|
"learning_rate": 1.6965098634294386e-06, |
|
"loss": 0.2119, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.8396778784657791, |
|
"grad_norm": 1.727263331413269, |
|
"learning_rate": 1.6874051593323218e-06, |
|
"loss": 0.2125, |
|
"step": 2913 |
|
}, |
|
{ |
|
"epoch": 0.8405426342623453, |
|
"grad_norm": 2.149775981903076, |
|
"learning_rate": 1.6783004552352051e-06, |
|
"loss": 0.2644, |
|
"step": 2916 |
|
}, |
|
{ |
|
"epoch": 0.8414073900589115, |
|
"grad_norm": 2.6743104457855225, |
|
"learning_rate": 1.6691957511380882e-06, |
|
"loss": 0.2217, |
|
"step": 2919 |
|
}, |
|
{ |
|
"epoch": 0.8422721458554777, |
|
"grad_norm": 2.795736074447632, |
|
"learning_rate": 1.6600910470409712e-06, |
|
"loss": 0.2115, |
|
"step": 2922 |
|
}, |
|
{ |
|
"epoch": 0.8431369016520439, |
|
"grad_norm": 1.8719727993011475, |
|
"learning_rate": 1.6509863429438545e-06, |
|
"loss": 0.1768, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.8440016574486101, |
|
"grad_norm": 3.4366025924682617, |
|
"learning_rate": 1.6418816388467375e-06, |
|
"loss": 0.2357, |
|
"step": 2928 |
|
}, |
|
{ |
|
"epoch": 0.8448664132451763, |
|
"grad_norm": 2.2458267211914062, |
|
"learning_rate": 1.6327769347496208e-06, |
|
"loss": 0.2147, |
|
"step": 2931 |
|
}, |
|
{ |
|
"epoch": 0.8457311690417425, |
|
"grad_norm": 1.958115577697754, |
|
"learning_rate": 1.6236722306525039e-06, |
|
"loss": 0.2339, |
|
"step": 2934 |
|
}, |
|
{ |
|
"epoch": 0.8465959248383087, |
|
"grad_norm": 1.6470586061477661, |
|
"learning_rate": 1.614567526555387e-06, |
|
"loss": 0.2259, |
|
"step": 2937 |
|
}, |
|
{ |
|
"epoch": 0.8474606806348749, |
|
"grad_norm": 1.2936792373657227, |
|
"learning_rate": 1.6054628224582702e-06, |
|
"loss": 0.1981, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.8483254364314411, |
|
"grad_norm": 4.565530300140381, |
|
"learning_rate": 1.5963581183611534e-06, |
|
"loss": 0.1993, |
|
"step": 2943 |
|
}, |
|
{ |
|
"epoch": 0.8491901922280073, |
|
"grad_norm": 2.8682401180267334, |
|
"learning_rate": 1.5872534142640367e-06, |
|
"loss": 0.2093, |
|
"step": 2946 |
|
}, |
|
{ |
|
"epoch": 0.8500549480245735, |
|
"grad_norm": 1.7801469564437866, |
|
"learning_rate": 1.5781487101669196e-06, |
|
"loss": 0.2136, |
|
"step": 2949 |
|
}, |
|
{ |
|
"epoch": 0.8509197038211397, |
|
"grad_norm": 2.372549057006836, |
|
"learning_rate": 1.5690440060698028e-06, |
|
"loss": 0.1845, |
|
"step": 2952 |
|
}, |
|
{ |
|
"epoch": 0.8517844596177059, |
|
"grad_norm": 2.190469741821289, |
|
"learning_rate": 1.559939301972686e-06, |
|
"loss": 0.2137, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 0.8526492154142721, |
|
"grad_norm": 1.6399952173233032, |
|
"learning_rate": 1.5508345978755694e-06, |
|
"loss": 0.2206, |
|
"step": 2958 |
|
}, |
|
{ |
|
"epoch": 0.8535139712108383, |
|
"grad_norm": 1.6555943489074707, |
|
"learning_rate": 1.5417298937784522e-06, |
|
"loss": 0.2403, |
|
"step": 2961 |
|
}, |
|
{ |
|
"epoch": 0.8543787270074045, |
|
"grad_norm": 2.6609280109405518, |
|
"learning_rate": 1.5326251896813355e-06, |
|
"loss": 0.2176, |
|
"step": 2964 |
|
}, |
|
{ |
|
"epoch": 0.8552434828039707, |
|
"grad_norm": 2.3398261070251465, |
|
"learning_rate": 1.5235204855842187e-06, |
|
"loss": 0.219, |
|
"step": 2967 |
|
}, |
|
{ |
|
"epoch": 0.8561082386005369, |
|
"grad_norm": 1.9740712642669678, |
|
"learning_rate": 1.514415781487102e-06, |
|
"loss": 0.2232, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.8569729943971031, |
|
"grad_norm": 1.6300252676010132, |
|
"learning_rate": 1.5053110773899848e-06, |
|
"loss": 0.2094, |
|
"step": 2973 |
|
}, |
|
{ |
|
"epoch": 0.8578377501936693, |
|
"grad_norm": 2.8211612701416016, |
|
"learning_rate": 1.496206373292868e-06, |
|
"loss": 0.1696, |
|
"step": 2976 |
|
}, |
|
{ |
|
"epoch": 0.8587025059902355, |
|
"grad_norm": 2.621321439743042, |
|
"learning_rate": 1.4871016691957514e-06, |
|
"loss": 0.2357, |
|
"step": 2979 |
|
}, |
|
{ |
|
"epoch": 0.8595672617868017, |
|
"grad_norm": 1.5020322799682617, |
|
"learning_rate": 1.4779969650986344e-06, |
|
"loss": 0.2456, |
|
"step": 2982 |
|
}, |
|
{ |
|
"epoch": 0.8604320175833678, |
|
"grad_norm": 1.474507212638855, |
|
"learning_rate": 1.4688922610015175e-06, |
|
"loss": 0.2074, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 0.861296773379934, |
|
"grad_norm": 2.9856317043304443, |
|
"learning_rate": 1.4597875569044007e-06, |
|
"loss": 0.2241, |
|
"step": 2988 |
|
}, |
|
{ |
|
"epoch": 0.8621615291765002, |
|
"grad_norm": 2.0011954307556152, |
|
"learning_rate": 1.4506828528072838e-06, |
|
"loss": 0.2026, |
|
"step": 2991 |
|
}, |
|
{ |
|
"epoch": 0.8630262849730664, |
|
"grad_norm": 1.6045671701431274, |
|
"learning_rate": 1.441578148710167e-06, |
|
"loss": 0.186, |
|
"step": 2994 |
|
}, |
|
{ |
|
"epoch": 0.8638910407696326, |
|
"grad_norm": 1.5708575248718262, |
|
"learning_rate": 1.4324734446130503e-06, |
|
"loss": 0.2048, |
|
"step": 2997 |
|
}, |
|
{ |
|
"epoch": 0.8647557965661988, |
|
"grad_norm": 2.8704543113708496, |
|
"learning_rate": 1.4233687405159332e-06, |
|
"loss": 0.2158, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.8647557965661988, |
|
"eval_loss": 0.2083793729543686, |
|
"eval_mse": 0.20837937731285638, |
|
"eval_runtime": 6.6796, |
|
"eval_samples_per_second": 149.709, |
|
"eval_steps_per_second": 18.714, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.865620552362765, |
|
"grad_norm": 3.2445404529571533, |
|
"learning_rate": 1.4142640364188164e-06, |
|
"loss": 0.2144, |
|
"step": 3003 |
|
}, |
|
{ |
|
"epoch": 0.8664853081593312, |
|
"grad_norm": 1.96418297290802, |
|
"learning_rate": 1.4051593323216997e-06, |
|
"loss": 0.2491, |
|
"step": 3006 |
|
}, |
|
{ |
|
"epoch": 0.8673500639558974, |
|
"grad_norm": 1.8195468187332153, |
|
"learning_rate": 1.396054628224583e-06, |
|
"loss": 0.2065, |
|
"step": 3009 |
|
}, |
|
{ |
|
"epoch": 0.8682148197524636, |
|
"grad_norm": 1.3888121843338013, |
|
"learning_rate": 1.3869499241274658e-06, |
|
"loss": 0.2049, |
|
"step": 3012 |
|
}, |
|
{ |
|
"epoch": 0.8690795755490298, |
|
"grad_norm": 3.738133668899536, |
|
"learning_rate": 1.377845220030349e-06, |
|
"loss": 0.2522, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 0.869944331345596, |
|
"grad_norm": 6.097411632537842, |
|
"learning_rate": 1.3687405159332323e-06, |
|
"loss": 0.2532, |
|
"step": 3018 |
|
}, |
|
{ |
|
"epoch": 0.8708090871421622, |
|
"grad_norm": 2.2873427867889404, |
|
"learning_rate": 1.3596358118361156e-06, |
|
"loss": 0.2382, |
|
"step": 3021 |
|
}, |
|
{ |
|
"epoch": 0.8716738429387284, |
|
"grad_norm": 1.540143370628357, |
|
"learning_rate": 1.3505311077389985e-06, |
|
"loss": 0.1979, |
|
"step": 3024 |
|
}, |
|
{ |
|
"epoch": 0.8725385987352946, |
|
"grad_norm": 1.6231845617294312, |
|
"learning_rate": 1.3414264036418817e-06, |
|
"loss": 0.195, |
|
"step": 3027 |
|
}, |
|
{ |
|
"epoch": 0.8734033545318608, |
|
"grad_norm": 2.2970290184020996, |
|
"learning_rate": 1.332321699544765e-06, |
|
"loss": 0.2154, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.874268110328427, |
|
"grad_norm": 2.0112340450286865, |
|
"learning_rate": 1.3232169954476482e-06, |
|
"loss": 0.2355, |
|
"step": 3033 |
|
}, |
|
{ |
|
"epoch": 0.8751328661249932, |
|
"grad_norm": 1.3887783288955688, |
|
"learning_rate": 1.314112291350531e-06, |
|
"loss": 0.201, |
|
"step": 3036 |
|
}, |
|
{ |
|
"epoch": 0.8759976219215594, |
|
"grad_norm": 2.187082529067993, |
|
"learning_rate": 1.3050075872534144e-06, |
|
"loss": 0.1965, |
|
"step": 3039 |
|
}, |
|
{ |
|
"epoch": 0.8768623777181257, |
|
"grad_norm": 1.300243616104126, |
|
"learning_rate": 1.2959028831562976e-06, |
|
"loss": 0.2097, |
|
"step": 3042 |
|
}, |
|
{ |
|
"epoch": 0.8777271335146919, |
|
"grad_norm": 2.1217234134674072, |
|
"learning_rate": 1.2867981790591807e-06, |
|
"loss": 0.214, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 0.8785918893112581, |
|
"grad_norm": 1.8281973600387573, |
|
"learning_rate": 1.277693474962064e-06, |
|
"loss": 0.2082, |
|
"step": 3048 |
|
}, |
|
{ |
|
"epoch": 0.8794566451078243, |
|
"grad_norm": 2.3602306842803955, |
|
"learning_rate": 1.268588770864947e-06, |
|
"loss": 0.2172, |
|
"step": 3051 |
|
}, |
|
{ |
|
"epoch": 0.8803214009043905, |
|
"grad_norm": 1.903954267501831, |
|
"learning_rate": 1.25948406676783e-06, |
|
"loss": 0.2323, |
|
"step": 3054 |
|
}, |
|
{ |
|
"epoch": 0.8811861567009567, |
|
"grad_norm": 3.514057159423828, |
|
"learning_rate": 1.2503793626707133e-06, |
|
"loss": 0.2323, |
|
"step": 3057 |
|
}, |
|
{ |
|
"epoch": 0.8820509124975229, |
|
"grad_norm": 3.3089487552642822, |
|
"learning_rate": 1.2412746585735964e-06, |
|
"loss": 0.2506, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.8829156682940891, |
|
"grad_norm": 2.317981004714966, |
|
"learning_rate": 1.2321699544764796e-06, |
|
"loss": 0.2274, |
|
"step": 3063 |
|
}, |
|
{ |
|
"epoch": 0.8837804240906553, |
|
"grad_norm": 2.7326478958129883, |
|
"learning_rate": 1.2230652503793627e-06, |
|
"loss": 0.2057, |
|
"step": 3066 |
|
}, |
|
{ |
|
"epoch": 0.8846451798872215, |
|
"grad_norm": 4.13656759262085, |
|
"learning_rate": 1.213960546282246e-06, |
|
"loss": 0.2209, |
|
"step": 3069 |
|
}, |
|
{ |
|
"epoch": 0.8855099356837877, |
|
"grad_norm": 2.0688633918762207, |
|
"learning_rate": 1.204855842185129e-06, |
|
"loss": 0.2608, |
|
"step": 3072 |
|
}, |
|
{ |
|
"epoch": 0.8863746914803539, |
|
"grad_norm": 1.9340734481811523, |
|
"learning_rate": 1.1957511380880123e-06, |
|
"loss": 0.2187, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.8872394472769201, |
|
"grad_norm": 2.4431509971618652, |
|
"learning_rate": 1.1866464339908953e-06, |
|
"loss": 0.2578, |
|
"step": 3078 |
|
}, |
|
{ |
|
"epoch": 0.8881042030734863, |
|
"grad_norm": 2.9879822731018066, |
|
"learning_rate": 1.1775417298937786e-06, |
|
"loss": 0.2614, |
|
"step": 3081 |
|
}, |
|
{ |
|
"epoch": 0.8889689588700525, |
|
"grad_norm": 1.412150502204895, |
|
"learning_rate": 1.1684370257966617e-06, |
|
"loss": 0.1879, |
|
"step": 3084 |
|
}, |
|
{ |
|
"epoch": 0.8898337146666186, |
|
"grad_norm": 2.1106693744659424, |
|
"learning_rate": 1.159332321699545e-06, |
|
"loss": 0.2436, |
|
"step": 3087 |
|
}, |
|
{ |
|
"epoch": 0.8906984704631848, |
|
"grad_norm": 1.6913747787475586, |
|
"learning_rate": 1.150227617602428e-06, |
|
"loss": 0.2234, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.891563226259751, |
|
"grad_norm": 1.6746766567230225, |
|
"learning_rate": 1.1411229135053112e-06, |
|
"loss": 0.2381, |
|
"step": 3093 |
|
}, |
|
{ |
|
"epoch": 0.8924279820563172, |
|
"grad_norm": 3.068824291229248, |
|
"learning_rate": 1.1320182094081943e-06, |
|
"loss": 0.223, |
|
"step": 3096 |
|
}, |
|
{ |
|
"epoch": 0.8932927378528834, |
|
"grad_norm": 2.9033825397491455, |
|
"learning_rate": 1.1229135053110776e-06, |
|
"loss": 0.2197, |
|
"step": 3099 |
|
}, |
|
{ |
|
"epoch": 0.8935809897850722, |
|
"eval_loss": 0.21448279917240143, |
|
"eval_mse": 0.21448280355427415, |
|
"eval_runtime": 6.6727, |
|
"eval_samples_per_second": 149.865, |
|
"eval_steps_per_second": 18.733, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.8941574936494496, |
|
"grad_norm": 1.8722037076950073, |
|
"learning_rate": 1.1138088012139606e-06, |
|
"loss": 0.1974, |
|
"step": 3102 |
|
}, |
|
{ |
|
"epoch": 0.8950222494460158, |
|
"grad_norm": 1.319684624671936, |
|
"learning_rate": 1.1047040971168439e-06, |
|
"loss": 0.2027, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 0.895887005242582, |
|
"grad_norm": 3.9506242275238037, |
|
"learning_rate": 1.095599393019727e-06, |
|
"loss": 0.2124, |
|
"step": 3108 |
|
}, |
|
{ |
|
"epoch": 0.8967517610391482, |
|
"grad_norm": 1.7725896835327148, |
|
"learning_rate": 1.08649468892261e-06, |
|
"loss": 0.2146, |
|
"step": 3111 |
|
}, |
|
{ |
|
"epoch": 0.8976165168357144, |
|
"grad_norm": 1.9070608615875244, |
|
"learning_rate": 1.0773899848254933e-06, |
|
"loss": 0.2311, |
|
"step": 3114 |
|
}, |
|
{ |
|
"epoch": 0.8984812726322806, |
|
"grad_norm": 2.3098270893096924, |
|
"learning_rate": 1.0682852807283763e-06, |
|
"loss": 0.1847, |
|
"step": 3117 |
|
}, |
|
{ |
|
"epoch": 0.8993460284288468, |
|
"grad_norm": 2.392598867416382, |
|
"learning_rate": 1.0591805766312596e-06, |
|
"loss": 0.252, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.900210784225413, |
|
"grad_norm": 1.592748761177063, |
|
"learning_rate": 1.0500758725341426e-06, |
|
"loss": 0.2226, |
|
"step": 3123 |
|
}, |
|
{ |
|
"epoch": 0.9010755400219792, |
|
"grad_norm": 2.1816020011901855, |
|
"learning_rate": 1.0409711684370259e-06, |
|
"loss": 0.2551, |
|
"step": 3126 |
|
}, |
|
{ |
|
"epoch": 0.9019402958185454, |
|
"grad_norm": 2.04571270942688, |
|
"learning_rate": 1.031866464339909e-06, |
|
"loss": 0.2506, |
|
"step": 3129 |
|
}, |
|
{ |
|
"epoch": 0.9028050516151116, |
|
"grad_norm": 3.148040771484375, |
|
"learning_rate": 1.0227617602427922e-06, |
|
"loss": 0.2056, |
|
"step": 3132 |
|
}, |
|
{ |
|
"epoch": 0.9036698074116778, |
|
"grad_norm": 4.721248626708984, |
|
"learning_rate": 1.0136570561456753e-06, |
|
"loss": 0.2243, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 0.904534563208244, |
|
"grad_norm": 1.5059829950332642, |
|
"learning_rate": 1.0045523520485585e-06, |
|
"loss": 0.1884, |
|
"step": 3138 |
|
}, |
|
{ |
|
"epoch": 0.9053993190048102, |
|
"grad_norm": 1.9442839622497559, |
|
"learning_rate": 9.954476479514416e-07, |
|
"loss": 0.1965, |
|
"step": 3141 |
|
}, |
|
{ |
|
"epoch": 0.9062640748013764, |
|
"grad_norm": 2.1955509185791016, |
|
"learning_rate": 9.863429438543249e-07, |
|
"loss": 0.204, |
|
"step": 3144 |
|
}, |
|
{ |
|
"epoch": 0.9071288305979426, |
|
"grad_norm": 1.5504348278045654, |
|
"learning_rate": 9.77238239757208e-07, |
|
"loss": 0.2248, |
|
"step": 3147 |
|
}, |
|
{ |
|
"epoch": 0.9079935863945088, |
|
"grad_norm": 1.6235114336013794, |
|
"learning_rate": 9.681335356600912e-07, |
|
"loss": 0.2276, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.908858342191075, |
|
"grad_norm": 2.8215818405151367, |
|
"learning_rate": 9.590288315629742e-07, |
|
"loss": 0.2292, |
|
"step": 3153 |
|
}, |
|
{ |
|
"epoch": 0.9097230979876412, |
|
"grad_norm": 1.7022403478622437, |
|
"learning_rate": 9.499241274658574e-07, |
|
"loss": 0.2383, |
|
"step": 3156 |
|
}, |
|
{ |
|
"epoch": 0.9105878537842074, |
|
"grad_norm": 2.5963640213012695, |
|
"learning_rate": 9.408194233687407e-07, |
|
"loss": 0.2311, |
|
"step": 3159 |
|
}, |
|
{ |
|
"epoch": 0.9114526095807736, |
|
"grad_norm": 1.830937147140503, |
|
"learning_rate": 9.317147192716237e-07, |
|
"loss": 0.2073, |
|
"step": 3162 |
|
}, |
|
{ |
|
"epoch": 0.9123173653773398, |
|
"grad_norm": 1.8354508876800537, |
|
"learning_rate": 9.22610015174507e-07, |
|
"loss": 0.2143, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 0.913182121173906, |
|
"grad_norm": 1.5569084882736206, |
|
"learning_rate": 9.1350531107739e-07, |
|
"loss": 0.2102, |
|
"step": 3168 |
|
}, |
|
{ |
|
"epoch": 0.9140468769704722, |
|
"grad_norm": 1.5762006044387817, |
|
"learning_rate": 9.044006069802733e-07, |
|
"loss": 0.2335, |
|
"step": 3171 |
|
}, |
|
{ |
|
"epoch": 0.9149116327670384, |
|
"grad_norm": 1.4155080318450928, |
|
"learning_rate": 8.952959028831563e-07, |
|
"loss": 0.1993, |
|
"step": 3174 |
|
}, |
|
{ |
|
"epoch": 0.9157763885636045, |
|
"grad_norm": 3.324310779571533, |
|
"learning_rate": 8.861911987860396e-07, |
|
"loss": 0.2209, |
|
"step": 3177 |
|
}, |
|
{ |
|
"epoch": 0.9166411443601707, |
|
"grad_norm": 2.4857962131500244, |
|
"learning_rate": 8.770864946889227e-07, |
|
"loss": 0.2384, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.9175059001567369, |
|
"grad_norm": 1.998383641242981, |
|
"learning_rate": 8.679817905918058e-07, |
|
"loss": 0.2216, |
|
"step": 3183 |
|
}, |
|
{ |
|
"epoch": 0.9183706559533031, |
|
"grad_norm": 2.2427866458892822, |
|
"learning_rate": 8.58877086494689e-07, |
|
"loss": 0.2328, |
|
"step": 3186 |
|
}, |
|
{ |
|
"epoch": 0.9192354117498693, |
|
"grad_norm": 2.5861330032348633, |
|
"learning_rate": 8.497723823975721e-07, |
|
"loss": 0.2228, |
|
"step": 3189 |
|
}, |
|
{ |
|
"epoch": 0.9201001675464355, |
|
"grad_norm": 1.680492639541626, |
|
"learning_rate": 8.406676783004553e-07, |
|
"loss": 0.2241, |
|
"step": 3192 |
|
}, |
|
{ |
|
"epoch": 0.9209649233430017, |
|
"grad_norm": 2.616267204284668, |
|
"learning_rate": 8.315629742033385e-07, |
|
"loss": 0.2524, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 0.921829679139568, |
|
"grad_norm": 1.922303318977356, |
|
"learning_rate": 8.224582701062215e-07, |
|
"loss": 0.2397, |
|
"step": 3198 |
|
}, |
|
{ |
|
"epoch": 0.9224061830039455, |
|
"eval_loss": 0.21633096039295197, |
|
"eval_mse": 0.2163309759118274, |
|
"eval_runtime": 6.5496, |
|
"eval_samples_per_second": 152.68, |
|
"eval_steps_per_second": 19.085, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.9226944349361342, |
|
"grad_norm": 4.39849853515625, |
|
"learning_rate": 8.133535660091048e-07, |
|
"loss": 0.2216, |
|
"step": 3201 |
|
}, |
|
{ |
|
"epoch": 0.9235591907327004, |
|
"grad_norm": 2.2769124507904053, |
|
"learning_rate": 8.042488619119878e-07, |
|
"loss": 0.2182, |
|
"step": 3204 |
|
}, |
|
{ |
|
"epoch": 0.9244239465292666, |
|
"grad_norm": 2.8028557300567627, |
|
"learning_rate": 7.951441578148711e-07, |
|
"loss": 0.229, |
|
"step": 3207 |
|
}, |
|
{ |
|
"epoch": 0.9252887023258328, |
|
"grad_norm": 2.1702733039855957, |
|
"learning_rate": 7.860394537177542e-07, |
|
"loss": 0.2261, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.926153458122399, |
|
"grad_norm": 2.6439995765686035, |
|
"learning_rate": 7.769347496206374e-07, |
|
"loss": 0.2246, |
|
"step": 3213 |
|
}, |
|
{ |
|
"epoch": 0.9270182139189652, |
|
"grad_norm": 1.576919436454773, |
|
"learning_rate": 7.678300455235206e-07, |
|
"loss": 0.2354, |
|
"step": 3216 |
|
}, |
|
{ |
|
"epoch": 0.9278829697155314, |
|
"grad_norm": 1.6755398511886597, |
|
"learning_rate": 7.587253414264036e-07, |
|
"loss": 0.2158, |
|
"step": 3219 |
|
}, |
|
{ |
|
"epoch": 0.9287477255120976, |
|
"grad_norm": 2.1890718936920166, |
|
"learning_rate": 7.496206373292869e-07, |
|
"loss": 0.2233, |
|
"step": 3222 |
|
}, |
|
{ |
|
"epoch": 0.9296124813086638, |
|
"grad_norm": 1.7316986322402954, |
|
"learning_rate": 7.4051593323217e-07, |
|
"loss": 0.2296, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 0.93047723710523, |
|
"grad_norm": 1.7639137506484985, |
|
"learning_rate": 7.314112291350532e-07, |
|
"loss": 0.2274, |
|
"step": 3228 |
|
}, |
|
{ |
|
"epoch": 0.9313419929017962, |
|
"grad_norm": 1.7912460565567017, |
|
"learning_rate": 7.223065250379363e-07, |
|
"loss": 0.1942, |
|
"step": 3231 |
|
}, |
|
{ |
|
"epoch": 0.9322067486983624, |
|
"grad_norm": 1.9391751289367676, |
|
"learning_rate": 7.132018209408196e-07, |
|
"loss": 0.2326, |
|
"step": 3234 |
|
}, |
|
{ |
|
"epoch": 0.9330715044949286, |
|
"grad_norm": 2.4329934120178223, |
|
"learning_rate": 7.040971168437026e-07, |
|
"loss": 0.2205, |
|
"step": 3237 |
|
}, |
|
{ |
|
"epoch": 0.9339362602914948, |
|
"grad_norm": 1.5994445085525513, |
|
"learning_rate": 6.949924127465859e-07, |
|
"loss": 0.2149, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.934801016088061, |
|
"grad_norm": 2.992966651916504, |
|
"learning_rate": 6.858877086494689e-07, |
|
"loss": 0.2194, |
|
"step": 3243 |
|
}, |
|
{ |
|
"epoch": 0.9356657718846272, |
|
"grad_norm": 1.8795028924942017, |
|
"learning_rate": 6.767830045523521e-07, |
|
"loss": 0.2158, |
|
"step": 3246 |
|
}, |
|
{ |
|
"epoch": 0.9365305276811934, |
|
"grad_norm": 3.5229902267456055, |
|
"learning_rate": 6.676783004552352e-07, |
|
"loss": 0.2396, |
|
"step": 3249 |
|
}, |
|
{ |
|
"epoch": 0.9373952834777596, |
|
"grad_norm": 1.6539433002471924, |
|
"learning_rate": 6.585735963581184e-07, |
|
"loss": 0.2077, |
|
"step": 3252 |
|
}, |
|
{ |
|
"epoch": 0.9382600392743258, |
|
"grad_norm": 2.448824405670166, |
|
"learning_rate": 6.494688922610016e-07, |
|
"loss": 0.2271, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 0.939124795070892, |
|
"grad_norm": 1.3006185293197632, |
|
"learning_rate": 6.403641881638847e-07, |
|
"loss": 0.2432, |
|
"step": 3258 |
|
}, |
|
{ |
|
"epoch": 0.9399895508674582, |
|
"grad_norm": 2.689985752105713, |
|
"learning_rate": 6.312594840667678e-07, |
|
"loss": 0.2152, |
|
"step": 3261 |
|
}, |
|
{ |
|
"epoch": 0.9408543066640244, |
|
"grad_norm": 1.5370323657989502, |
|
"learning_rate": 6.22154779969651e-07, |
|
"loss": 0.197, |
|
"step": 3264 |
|
}, |
|
{ |
|
"epoch": 0.9417190624605906, |
|
"grad_norm": 1.6089318990707397, |
|
"learning_rate": 6.130500758725342e-07, |
|
"loss": 0.2145, |
|
"step": 3267 |
|
}, |
|
{ |
|
"epoch": 0.9425838182571568, |
|
"grad_norm": 2.1321656703948975, |
|
"learning_rate": 6.039453717754174e-07, |
|
"loss": 0.1989, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.943448574053723, |
|
"grad_norm": 1.990070104598999, |
|
"learning_rate": 5.948406676783005e-07, |
|
"loss": 0.1982, |
|
"step": 3273 |
|
}, |
|
{ |
|
"epoch": 0.9443133298502892, |
|
"grad_norm": 3.087510585784912, |
|
"learning_rate": 5.857359635811837e-07, |
|
"loss": 0.206, |
|
"step": 3276 |
|
}, |
|
{ |
|
"epoch": 0.9451780856468553, |
|
"grad_norm": 2.3679909706115723, |
|
"learning_rate": 5.766312594840668e-07, |
|
"loss": 0.2292, |
|
"step": 3279 |
|
}, |
|
{ |
|
"epoch": 0.9460428414434215, |
|
"grad_norm": 1.546036958694458, |
|
"learning_rate": 5.675265553869499e-07, |
|
"loss": 0.2127, |
|
"step": 3282 |
|
}, |
|
{ |
|
"epoch": 0.9469075972399877, |
|
"grad_norm": 1.9500880241394043, |
|
"learning_rate": 5.584218512898331e-07, |
|
"loss": 0.2157, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 0.9477723530365539, |
|
"grad_norm": 1.9931670427322388, |
|
"learning_rate": 5.493171471927162e-07, |
|
"loss": 0.2151, |
|
"step": 3288 |
|
}, |
|
{ |
|
"epoch": 0.9486371088331201, |
|
"grad_norm": 1.7962517738342285, |
|
"learning_rate": 5.402124430955994e-07, |
|
"loss": 0.2035, |
|
"step": 3291 |
|
}, |
|
{ |
|
"epoch": 0.9495018646296863, |
|
"grad_norm": 1.3835334777832031, |
|
"learning_rate": 5.311077389984825e-07, |
|
"loss": 0.2059, |
|
"step": 3294 |
|
}, |
|
{ |
|
"epoch": 0.9503666204262525, |
|
"grad_norm": 3.157975912094116, |
|
"learning_rate": 5.220030349013658e-07, |
|
"loss": 0.2002, |
|
"step": 3297 |
|
}, |
|
{ |
|
"epoch": 0.9512313762228187, |
|
"grad_norm": 2.0881123542785645, |
|
"learning_rate": 5.12898330804249e-07, |
|
"loss": 0.2307, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.9512313762228187, |
|
"eval_loss": 0.21600358188152313, |
|
"eval_mse": 0.21600358275626785, |
|
"eval_runtime": 6.4833, |
|
"eval_samples_per_second": 154.243, |
|
"eval_steps_per_second": 19.28, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.9520961320193849, |
|
"grad_norm": 2.1528842449188232, |
|
"learning_rate": 5.037936267071321e-07, |
|
"loss": 0.2657, |
|
"step": 3303 |
|
}, |
|
{ |
|
"epoch": 0.9529608878159511, |
|
"grad_norm": 2.4048590660095215, |
|
"learning_rate": 4.946889226100153e-07, |
|
"loss": 0.2355, |
|
"step": 3306 |
|
}, |
|
{ |
|
"epoch": 0.9538256436125173, |
|
"grad_norm": 1.866373896598816, |
|
"learning_rate": 4.855842185128983e-07, |
|
"loss": 0.1932, |
|
"step": 3309 |
|
}, |
|
{ |
|
"epoch": 0.9546903994090835, |
|
"grad_norm": 1.543273687362671, |
|
"learning_rate": 4.7647951441578155e-07, |
|
"loss": 0.2088, |
|
"step": 3312 |
|
}, |
|
{ |
|
"epoch": 0.9555551552056497, |
|
"grad_norm": 1.585208535194397, |
|
"learning_rate": 4.673748103186647e-07, |
|
"loss": 0.1945, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 0.9564199110022159, |
|
"grad_norm": 1.9510430097579956, |
|
"learning_rate": 4.582701062215478e-07, |
|
"loss": 0.1719, |
|
"step": 3318 |
|
}, |
|
{ |
|
"epoch": 0.9572846667987821, |
|
"grad_norm": 1.36229407787323, |
|
"learning_rate": 4.49165402124431e-07, |
|
"loss": 0.1997, |
|
"step": 3321 |
|
}, |
|
{ |
|
"epoch": 0.9581494225953483, |
|
"grad_norm": 2.563950777053833, |
|
"learning_rate": 4.4006069802731414e-07, |
|
"loss": 0.2385, |
|
"step": 3324 |
|
}, |
|
{ |
|
"epoch": 0.9590141783919145, |
|
"grad_norm": 2.159186363220215, |
|
"learning_rate": 4.309559939301973e-07, |
|
"loss": 0.2319, |
|
"step": 3327 |
|
}, |
|
{ |
|
"epoch": 0.9598789341884807, |
|
"grad_norm": 2.4233345985412598, |
|
"learning_rate": 4.2185128983308046e-07, |
|
"loss": 0.2117, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.9607436899850469, |
|
"grad_norm": 1.915822982788086, |
|
"learning_rate": 4.127465857359636e-07, |
|
"loss": 0.2244, |
|
"step": 3333 |
|
}, |
|
{ |
|
"epoch": 0.9616084457816131, |
|
"grad_norm": 3.731882333755493, |
|
"learning_rate": 4.0364188163884673e-07, |
|
"loss": 0.2222, |
|
"step": 3336 |
|
}, |
|
{ |
|
"epoch": 0.9624732015781793, |
|
"grad_norm": 1.4122893810272217, |
|
"learning_rate": 3.945371775417299e-07, |
|
"loss": 0.2052, |
|
"step": 3339 |
|
}, |
|
{ |
|
"epoch": 0.9633379573747455, |
|
"grad_norm": 3.098508596420288, |
|
"learning_rate": 3.8543247344461305e-07, |
|
"loss": 0.212, |
|
"step": 3342 |
|
}, |
|
{ |
|
"epoch": 0.9642027131713117, |
|
"grad_norm": 1.4726747274398804, |
|
"learning_rate": 3.763277693474962e-07, |
|
"loss": 0.2374, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 0.9650674689678779, |
|
"grad_norm": 1.8609498739242554, |
|
"learning_rate": 3.6722306525037937e-07, |
|
"loss": 0.2173, |
|
"step": 3348 |
|
}, |
|
{ |
|
"epoch": 0.9659322247644441, |
|
"grad_norm": 1.8475359678268433, |
|
"learning_rate": 3.581183611532626e-07, |
|
"loss": 0.2032, |
|
"step": 3351 |
|
}, |
|
{ |
|
"epoch": 0.9667969805610104, |
|
"grad_norm": 1.7029978036880493, |
|
"learning_rate": 3.4901365705614574e-07, |
|
"loss": 0.1713, |
|
"step": 3354 |
|
}, |
|
{ |
|
"epoch": 0.9676617363575766, |
|
"grad_norm": 2.657390832901001, |
|
"learning_rate": 3.399089529590289e-07, |
|
"loss": 0.2216, |
|
"step": 3357 |
|
}, |
|
{ |
|
"epoch": 0.9685264921541428, |
|
"grad_norm": 3.23854398727417, |
|
"learning_rate": 3.3080424886191206e-07, |
|
"loss": 0.1776, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.969391247950709, |
|
"grad_norm": 2.09384822845459, |
|
"learning_rate": 3.2169954476479517e-07, |
|
"loss": 0.1866, |
|
"step": 3363 |
|
}, |
|
{ |
|
"epoch": 0.9702560037472752, |
|
"grad_norm": 1.8957816362380981, |
|
"learning_rate": 3.1259484066767833e-07, |
|
"loss": 0.2317, |
|
"step": 3366 |
|
}, |
|
{ |
|
"epoch": 0.9711207595438414, |
|
"grad_norm": 1.5353327989578247, |
|
"learning_rate": 3.034901365705615e-07, |
|
"loss": 0.209, |
|
"step": 3369 |
|
}, |
|
{ |
|
"epoch": 0.9719855153404076, |
|
"grad_norm": 1.776352882385254, |
|
"learning_rate": 2.9438543247344465e-07, |
|
"loss": 0.219, |
|
"step": 3372 |
|
}, |
|
{ |
|
"epoch": 0.9728502711369738, |
|
"grad_norm": 3.282552480697632, |
|
"learning_rate": 2.852807283763278e-07, |
|
"loss": 0.224, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 0.97371502693354, |
|
"grad_norm": 1.8530248403549194, |
|
"learning_rate": 2.7617602427921097e-07, |
|
"loss": 0.2147, |
|
"step": 3378 |
|
}, |
|
{ |
|
"epoch": 0.9745797827301061, |
|
"grad_norm": 2.0258686542510986, |
|
"learning_rate": 2.670713201820941e-07, |
|
"loss": 0.2081, |
|
"step": 3381 |
|
}, |
|
{ |
|
"epoch": 0.9754445385266723, |
|
"grad_norm": 1.830100655555725, |
|
"learning_rate": 2.5796661608497724e-07, |
|
"loss": 0.2248, |
|
"step": 3384 |
|
}, |
|
{ |
|
"epoch": 0.9763092943232385, |
|
"grad_norm": 3.846280813217163, |
|
"learning_rate": 2.488619119878604e-07, |
|
"loss": 0.2269, |
|
"step": 3387 |
|
}, |
|
{ |
|
"epoch": 0.9771740501198047, |
|
"grad_norm": 2.3556368350982666, |
|
"learning_rate": 2.3975720789074356e-07, |
|
"loss": 0.2195, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.9780388059163709, |
|
"grad_norm": 1.8791780471801758, |
|
"learning_rate": 2.3065250379362674e-07, |
|
"loss": 0.2297, |
|
"step": 3393 |
|
}, |
|
{ |
|
"epoch": 0.9789035617129371, |
|
"grad_norm": 1.6351841688156128, |
|
"learning_rate": 2.215477996965099e-07, |
|
"loss": 0.1925, |
|
"step": 3396 |
|
}, |
|
{ |
|
"epoch": 0.9797683175095033, |
|
"grad_norm": 2.4028263092041016, |
|
"learning_rate": 2.1244309559939304e-07, |
|
"loss": 0.2099, |
|
"step": 3399 |
|
}, |
|
{ |
|
"epoch": 0.9800565694416921, |
|
"eval_loss": 0.2100730687379837, |
|
"eval_mse": 0.21007308381050824, |
|
"eval_runtime": 6.746, |
|
"eval_samples_per_second": 148.235, |
|
"eval_steps_per_second": 18.529, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.9806330733060695, |
|
"grad_norm": 3.0006494522094727, |
|
"learning_rate": 2.033383915022762e-07, |
|
"loss": 0.2095, |
|
"step": 3402 |
|
}, |
|
{ |
|
"epoch": 0.9814978291026357, |
|
"grad_norm": 2.7023770809173584, |
|
"learning_rate": 1.9423368740515936e-07, |
|
"loss": 0.2064, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 0.9823625848992019, |
|
"grad_norm": 3.8468315601348877, |
|
"learning_rate": 1.851289833080425e-07, |
|
"loss": 0.2412, |
|
"step": 3408 |
|
}, |
|
{ |
|
"epoch": 0.9832273406957681, |
|
"grad_norm": 2.5841078758239746, |
|
"learning_rate": 1.7602427921092565e-07, |
|
"loss": 0.2056, |
|
"step": 3411 |
|
}, |
|
{ |
|
"epoch": 0.9840920964923343, |
|
"grad_norm": 2.557180881500244, |
|
"learning_rate": 1.669195751138088e-07, |
|
"loss": 0.2449, |
|
"step": 3414 |
|
}, |
|
{ |
|
"epoch": 0.9849568522889005, |
|
"grad_norm": 2.1071228981018066, |
|
"learning_rate": 1.5781487101669194e-07, |
|
"loss": 0.2664, |
|
"step": 3417 |
|
}, |
|
{ |
|
"epoch": 0.9858216080854667, |
|
"grad_norm": 2.5804591178894043, |
|
"learning_rate": 1.4871016691957513e-07, |
|
"loss": 0.2252, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.9866863638820329, |
|
"grad_norm": 1.557554006576538, |
|
"learning_rate": 1.3960546282245826e-07, |
|
"loss": 0.1979, |
|
"step": 3423 |
|
}, |
|
{ |
|
"epoch": 0.9875511196785991, |
|
"grad_norm": 2.0957911014556885, |
|
"learning_rate": 1.3050075872534145e-07, |
|
"loss": 0.21, |
|
"step": 3426 |
|
}, |
|
{ |
|
"epoch": 0.9884158754751653, |
|
"grad_norm": 2.239748954772949, |
|
"learning_rate": 1.2139605462822459e-07, |
|
"loss": 0.1995, |
|
"step": 3429 |
|
}, |
|
{ |
|
"epoch": 0.9892806312717315, |
|
"grad_norm": 2.46882963180542, |
|
"learning_rate": 1.1229135053110775e-07, |
|
"loss": 0.2319, |
|
"step": 3432 |
|
}, |
|
{ |
|
"epoch": 0.9901453870682977, |
|
"grad_norm": 2.6145310401916504, |
|
"learning_rate": 1.031866464339909e-07, |
|
"loss": 0.2245, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 0.9910101428648639, |
|
"grad_norm": 2.32053279876709, |
|
"learning_rate": 9.408194233687405e-08, |
|
"loss": 0.2338, |
|
"step": 3438 |
|
}, |
|
{ |
|
"epoch": 0.9918748986614301, |
|
"grad_norm": 1.4136054515838623, |
|
"learning_rate": 8.497723823975723e-08, |
|
"loss": 0.2173, |
|
"step": 3441 |
|
}, |
|
{ |
|
"epoch": 0.9927396544579963, |
|
"grad_norm": 1.3618991374969482, |
|
"learning_rate": 7.587253414264037e-08, |
|
"loss": 0.1794, |
|
"step": 3444 |
|
}, |
|
{ |
|
"epoch": 0.9936044102545625, |
|
"grad_norm": 2.8345561027526855, |
|
"learning_rate": 6.676783004552352e-08, |
|
"loss": 0.2182, |
|
"step": 3447 |
|
}, |
|
{ |
|
"epoch": 0.9944691660511287, |
|
"grad_norm": 1.9578803777694702, |
|
"learning_rate": 5.7663125948406686e-08, |
|
"loss": 0.2333, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.9953339218476949, |
|
"grad_norm": 1.6162538528442383, |
|
"learning_rate": 4.855842185128984e-08, |
|
"loss": 0.2005, |
|
"step": 3453 |
|
}, |
|
{ |
|
"epoch": 0.9961986776442611, |
|
"grad_norm": 3.075516939163208, |
|
"learning_rate": 3.9453717754172986e-08, |
|
"loss": 0.2249, |
|
"step": 3456 |
|
}, |
|
{ |
|
"epoch": 0.9970634334408273, |
|
"grad_norm": 2.544214963912964, |
|
"learning_rate": 3.0349013657056146e-08, |
|
"loss": 0.2147, |
|
"step": 3459 |
|
}, |
|
{ |
|
"epoch": 0.9979281892373935, |
|
"grad_norm": 1.9591963291168213, |
|
"learning_rate": 2.1244309559939306e-08, |
|
"loss": 0.2164, |
|
"step": 3462 |
|
}, |
|
{ |
|
"epoch": 0.9987929450339597, |
|
"grad_norm": 1.6480742692947388, |
|
"learning_rate": 1.213960546282246e-08, |
|
"loss": 0.1889, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 0.9996577008305259, |
|
"grad_norm": 1.6390196084976196, |
|
"learning_rate": 3.034901365705615e-09, |
|
"loss": 0.2205, |
|
"step": 3468 |
|
}, |
|
{ |
|
"epoch": 0.9999459527627146, |
|
"step": 3469, |
|
"total_flos": 1.1682867916662374e+17, |
|
"train_loss": 0.2771636627187094, |
|
"train_runtime": 4603.777, |
|
"train_samples_per_second": 96.454, |
|
"train_steps_per_second": 0.754 |
|
} |
|
], |
|
"logging_steps": 3, |
|
"max_steps": 3469, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1682867916662374e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|