{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999655896218299, "eval_steps": 200, "global_step": 7265, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00013764151268022434, "grad_norm": 1.0744437265855968, "learning_rate": 2.7510316368638242e-08, "loss": 0.8891, "step": 1 }, { "epoch": 0.0006882075634011218, "grad_norm": 1.0925226837125284, "learning_rate": 1.375515818431912e-07, "loss": 1.0453, "step": 5 }, { "epoch": 0.0013764151268022436, "grad_norm": 1.0429095053384205, "learning_rate": 2.751031636863824e-07, "loss": 1.1081, "step": 10 }, { "epoch": 0.0020646226902033653, "grad_norm": 1.0367184905756903, "learning_rate": 4.126547455295736e-07, "loss": 1.0806, "step": 15 }, { "epoch": 0.002752830253604487, "grad_norm": 0.8535228048657485, "learning_rate": 5.502063273727648e-07, "loss": 0.9934, "step": 20 }, { "epoch": 0.003441037817005609, "grad_norm": 0.7530736536407544, "learning_rate": 6.87757909215956e-07, "loss": 0.9748, "step": 25 }, { "epoch": 0.0041292453804067306, "grad_norm": 0.5363973383914493, "learning_rate": 8.253094910591472e-07, "loss": 0.9683, "step": 30 }, { "epoch": 0.004817452943807852, "grad_norm": 0.5147944085503978, "learning_rate": 9.628610729023384e-07, "loss": 0.9497, "step": 35 }, { "epoch": 0.005505660507208974, "grad_norm": 0.38494488930090076, "learning_rate": 1.1004126547455296e-06, "loss": 0.9571, "step": 40 }, { "epoch": 0.006193868070610096, "grad_norm": 0.34507001908846907, "learning_rate": 1.2379642365887208e-06, "loss": 0.9492, "step": 45 }, { "epoch": 0.006882075634011218, "grad_norm": 0.3226626197838648, "learning_rate": 1.375515818431912e-06, "loss": 0.9444, "step": 50 }, { "epoch": 0.00757028319741234, "grad_norm": 0.31747748325778746, "learning_rate": 1.5130674002751033e-06, "loss": 0.9361, "step": 55 }, { "epoch": 0.008258490760813461, "grad_norm": 0.34448468841946145, "learning_rate": 1.6506189821182945e-06, "loss": 0.8677, "step": 60 }, { "epoch": 0.008946698324214583, "grad_norm": 0.3092923585405677, "learning_rate": 1.7881705639614857e-06, "loss": 0.9459, "step": 65 }, { "epoch": 0.009634905887615704, "grad_norm": 0.2954071228548251, "learning_rate": 1.925722145804677e-06, "loss": 0.8243, "step": 70 }, { "epoch": 0.010323113451016826, "grad_norm": 0.2988292440034158, "learning_rate": 2.0632737276478683e-06, "loss": 0.9334, "step": 75 }, { "epoch": 0.011011321014417949, "grad_norm": 0.30646969514111183, "learning_rate": 2.2008253094910593e-06, "loss": 0.9376, "step": 80 }, { "epoch": 0.011699528577819071, "grad_norm": 0.2989939758101042, "learning_rate": 2.3383768913342507e-06, "loss": 0.8835, "step": 85 }, { "epoch": 0.012387736141220192, "grad_norm": 0.3353612857371804, "learning_rate": 2.4759284731774417e-06, "loss": 0.9515, "step": 90 }, { "epoch": 0.013075943704621314, "grad_norm": 0.2726869728919805, "learning_rate": 2.613480055020633e-06, "loss": 0.8335, "step": 95 }, { "epoch": 0.013764151268022436, "grad_norm": 0.3025237345927011, "learning_rate": 2.751031636863824e-06, "loss": 0.8728, "step": 100 }, { "epoch": 0.014452358831423557, "grad_norm": 0.2518432743455901, "learning_rate": 2.8885832187070155e-06, "loss": 0.8225, "step": 105 }, { "epoch": 0.01514056639482468, "grad_norm": 0.29872608191518396, "learning_rate": 3.0261348005502065e-06, "loss": 0.8233, "step": 110 }, { "epoch": 0.0158287739582258, "grad_norm": 0.2892819513373577, "learning_rate": 3.163686382393398e-06, "loss": 0.8676, "step": 115 }, { "epoch": 0.016516981521626922, "grad_norm": 0.3034282399338811, "learning_rate": 3.301237964236589e-06, "loss": 0.8163, "step": 120 }, { "epoch": 0.017205189085028043, "grad_norm": 0.2399748168471235, "learning_rate": 3.4387895460797803e-06, "loss": 0.8901, "step": 125 }, { "epoch": 0.017893396648429167, "grad_norm": 0.26820749676536576, "learning_rate": 3.5763411279229713e-06, "loss": 0.8608, "step": 130 }, { "epoch": 0.018581604211830288, "grad_norm": 0.2892396344590634, "learning_rate": 3.7138927097661627e-06, "loss": 0.7892, "step": 135 }, { "epoch": 0.019269811775231408, "grad_norm": 0.2803097465375326, "learning_rate": 3.851444291609354e-06, "loss": 0.8333, "step": 140 }, { "epoch": 0.019958019338632532, "grad_norm": 0.28184786877331375, "learning_rate": 3.988995873452545e-06, "loss": 0.8782, "step": 145 }, { "epoch": 0.020646226902033653, "grad_norm": 0.2656845205402233, "learning_rate": 4.1265474552957366e-06, "loss": 0.8329, "step": 150 }, { "epoch": 0.021334434465434773, "grad_norm": 0.25148251076174055, "learning_rate": 4.264099037138927e-06, "loss": 0.8831, "step": 155 }, { "epoch": 0.022022642028835897, "grad_norm": 0.25354261455581, "learning_rate": 4.4016506189821186e-06, "loss": 0.9237, "step": 160 }, { "epoch": 0.022710849592237018, "grad_norm": 0.26900972462282635, "learning_rate": 4.53920220082531e-06, "loss": 0.8101, "step": 165 }, { "epoch": 0.023399057155638142, "grad_norm": 0.2795048737406048, "learning_rate": 4.676753782668501e-06, "loss": 0.8541, "step": 170 }, { "epoch": 0.024087264719039263, "grad_norm": 0.2678194291016658, "learning_rate": 4.814305364511692e-06, "loss": 0.8175, "step": 175 }, { "epoch": 0.024775472282440383, "grad_norm": 0.2719134965425649, "learning_rate": 4.951856946354883e-06, "loss": 0.8499, "step": 180 }, { "epoch": 0.025463679845841507, "grad_norm": 0.25928274735305, "learning_rate": 5.089408528198074e-06, "loss": 0.8581, "step": 185 }, { "epoch": 0.026151887409242628, "grad_norm": 0.28582414278705426, "learning_rate": 5.226960110041266e-06, "loss": 0.8456, "step": 190 }, { "epoch": 0.02684009497264375, "grad_norm": 0.32816594865179044, "learning_rate": 5.364511691884458e-06, "loss": 0.7651, "step": 195 }, { "epoch": 0.027528302536044873, "grad_norm": 0.253109225413623, "learning_rate": 5.502063273727648e-06, "loss": 0.7972, "step": 200 }, { "epoch": 0.027528302536044873, "eval_loss": 0.8336466550827026, "eval_runtime": 53.0253, "eval_samples_per_second": 94.295, "eval_steps_per_second": 2.961, "step": 200 }, { "epoch": 0.028216510099445993, "grad_norm": 0.2631006965207915, "learning_rate": 5.63961485557084e-06, "loss": 0.7853, "step": 205 }, { "epoch": 0.028904717662847114, "grad_norm": 0.26141482784645287, "learning_rate": 5.777166437414031e-06, "loss": 0.8267, "step": 210 }, { "epoch": 0.029592925226248238, "grad_norm": 0.24150431707927228, "learning_rate": 5.914718019257222e-06, "loss": 0.8036, "step": 215 }, { "epoch": 0.03028113278964936, "grad_norm": 0.2944904995150858, "learning_rate": 6.052269601100413e-06, "loss": 0.7806, "step": 220 }, { "epoch": 0.03096934035305048, "grad_norm": 0.238858674637656, "learning_rate": 6.189821182943604e-06, "loss": 0.7703, "step": 225 }, { "epoch": 0.0316575479164516, "grad_norm": 0.25528256066626154, "learning_rate": 6.327372764786796e-06, "loss": 0.8215, "step": 230 }, { "epoch": 0.032345755479852724, "grad_norm": 0.25150468358801925, "learning_rate": 6.464924346629987e-06, "loss": 0.805, "step": 235 }, { "epoch": 0.033033963043253844, "grad_norm": 0.2567985154946394, "learning_rate": 6.602475928473178e-06, "loss": 0.8449, "step": 240 }, { "epoch": 0.033722170606654965, "grad_norm": 0.30128417458156487, "learning_rate": 6.740027510316369e-06, "loss": 0.8987, "step": 245 }, { "epoch": 0.034410378170056086, "grad_norm": 0.2468450749918583, "learning_rate": 6.877579092159561e-06, "loss": 0.7915, "step": 250 }, { "epoch": 0.03509858573345721, "grad_norm": 0.2652255969063236, "learning_rate": 7.015130674002751e-06, "loss": 0.819, "step": 255 }, { "epoch": 0.035786793296858334, "grad_norm": 0.27901024453469003, "learning_rate": 7.152682255845943e-06, "loss": 0.768, "step": 260 }, { "epoch": 0.036475000860259454, "grad_norm": 0.23650694837152136, "learning_rate": 7.290233837689133e-06, "loss": 0.805, "step": 265 }, { "epoch": 0.037163208423660575, "grad_norm": 0.28138836488849467, "learning_rate": 7.4277854195323255e-06, "loss": 0.8195, "step": 270 }, { "epoch": 0.037851415987061696, "grad_norm": 0.2776842086844666, "learning_rate": 7.565337001375517e-06, "loss": 0.8302, "step": 275 }, { "epoch": 0.038539623550462816, "grad_norm": 0.25249008491041186, "learning_rate": 7.702888583218707e-06, "loss": 0.7981, "step": 280 }, { "epoch": 0.039227831113863944, "grad_norm": 0.25031543045904664, "learning_rate": 7.840440165061898e-06, "loss": 0.8194, "step": 285 }, { "epoch": 0.039916038677265064, "grad_norm": 0.2615014227358955, "learning_rate": 7.97799174690509e-06, "loss": 0.82, "step": 290 }, { "epoch": 0.040604246240666185, "grad_norm": 0.25422856914873837, "learning_rate": 8.115543328748281e-06, "loss": 0.8166, "step": 295 }, { "epoch": 0.041292453804067306, "grad_norm": 0.26129491709057906, "learning_rate": 8.253094910591473e-06, "loss": 0.8163, "step": 300 }, { "epoch": 0.041980661367468426, "grad_norm": 0.2480673233015307, "learning_rate": 8.390646492434664e-06, "loss": 0.8041, "step": 305 }, { "epoch": 0.04266886893086955, "grad_norm": 0.27201228663416094, "learning_rate": 8.528198074277854e-06, "loss": 0.7592, "step": 310 }, { "epoch": 0.043357076494270674, "grad_norm": 0.27032789834190407, "learning_rate": 8.665749656121047e-06, "loss": 0.8466, "step": 315 }, { "epoch": 0.044045284057671795, "grad_norm": 0.27698487225021634, "learning_rate": 8.803301237964237e-06, "loss": 0.7959, "step": 320 }, { "epoch": 0.044733491621072916, "grad_norm": 0.26349258636640605, "learning_rate": 8.940852819807428e-06, "loss": 0.8271, "step": 325 }, { "epoch": 0.045421699184474036, "grad_norm": 0.2833776020234445, "learning_rate": 9.07840440165062e-06, "loss": 0.8423, "step": 330 }, { "epoch": 0.04610990674787516, "grad_norm": 0.26249090566762584, "learning_rate": 9.21595598349381e-06, "loss": 0.797, "step": 335 }, { "epoch": 0.046798114311276284, "grad_norm": 0.2703428149077038, "learning_rate": 9.353507565337003e-06, "loss": 0.8188, "step": 340 }, { "epoch": 0.047486321874677405, "grad_norm": 0.28483545085596673, "learning_rate": 9.491059147180193e-06, "loss": 0.8081, "step": 345 }, { "epoch": 0.048174529438078526, "grad_norm": 0.2711955261563685, "learning_rate": 9.628610729023384e-06, "loss": 0.778, "step": 350 }, { "epoch": 0.048862737001479646, "grad_norm": 0.2362883519248254, "learning_rate": 9.766162310866576e-06, "loss": 0.8029, "step": 355 }, { "epoch": 0.04955094456488077, "grad_norm": 0.24984233779690806, "learning_rate": 9.903713892709767e-06, "loss": 0.8352, "step": 360 }, { "epoch": 0.05023915212828189, "grad_norm": 0.2533895609475862, "learning_rate": 1.0041265474552959e-05, "loss": 0.812, "step": 365 }, { "epoch": 0.050927359691683015, "grad_norm": 0.27316397996549163, "learning_rate": 1.0178817056396148e-05, "loss": 0.7949, "step": 370 }, { "epoch": 0.051615567255084135, "grad_norm": 0.24997898374865665, "learning_rate": 1.031636863823934e-05, "loss": 0.8129, "step": 375 }, { "epoch": 0.052303774818485256, "grad_norm": 0.2509291295008787, "learning_rate": 1.0453920220082532e-05, "loss": 0.7678, "step": 380 }, { "epoch": 0.05299198238188638, "grad_norm": 0.24795148119296936, "learning_rate": 1.0591471801925723e-05, "loss": 0.7509, "step": 385 }, { "epoch": 0.0536801899452875, "grad_norm": 0.26138861430422916, "learning_rate": 1.0729023383768915e-05, "loss": 0.7794, "step": 390 }, { "epoch": 0.05436839750868862, "grad_norm": 0.26469072453130804, "learning_rate": 1.0866574965612104e-05, "loss": 0.872, "step": 395 }, { "epoch": 0.055056605072089745, "grad_norm": 0.27206028756311873, "learning_rate": 1.1004126547455296e-05, "loss": 0.7579, "step": 400 }, { "epoch": 0.055056605072089745, "eval_loss": 0.7995939254760742, "eval_runtime": 52.7788, "eval_samples_per_second": 94.735, "eval_steps_per_second": 2.975, "step": 400 }, { "epoch": 0.055744812635490866, "grad_norm": 0.25604118980945256, "learning_rate": 1.1141678129298487e-05, "loss": 0.8066, "step": 405 }, { "epoch": 0.05643302019889199, "grad_norm": 0.21589813345959452, "learning_rate": 1.127922971114168e-05, "loss": 0.7885, "step": 410 }, { "epoch": 0.05712122776229311, "grad_norm": 0.23173334125215145, "learning_rate": 1.1416781292984871e-05, "loss": 0.7553, "step": 415 }, { "epoch": 0.05780943532569423, "grad_norm": 0.2910531424043913, "learning_rate": 1.1554332874828062e-05, "loss": 0.8598, "step": 420 }, { "epoch": 0.05849764288909535, "grad_norm": 0.2541400073189138, "learning_rate": 1.1691884456671253e-05, "loss": 0.8246, "step": 425 }, { "epoch": 0.059185850452496476, "grad_norm": 0.2392029243140325, "learning_rate": 1.1829436038514443e-05, "loss": 0.7586, "step": 430 }, { "epoch": 0.0598740580158976, "grad_norm": 0.2553186944197809, "learning_rate": 1.1966987620357635e-05, "loss": 0.7822, "step": 435 }, { "epoch": 0.06056226557929872, "grad_norm": 0.25098006347202895, "learning_rate": 1.2104539202200826e-05, "loss": 0.7779, "step": 440 }, { "epoch": 0.06125047314269984, "grad_norm": 0.2686862571299319, "learning_rate": 1.2242090784044018e-05, "loss": 0.7329, "step": 445 }, { "epoch": 0.06193868070610096, "grad_norm": 0.27240298273640234, "learning_rate": 1.2379642365887207e-05, "loss": 0.8181, "step": 450 }, { "epoch": 0.06262688826950208, "grad_norm": 0.23993890948636432, "learning_rate": 1.25171939477304e-05, "loss": 0.7773, "step": 455 }, { "epoch": 0.0633150958329032, "grad_norm": 0.28356348954244076, "learning_rate": 1.2654745529573592e-05, "loss": 0.7789, "step": 460 }, { "epoch": 0.06400330339630432, "grad_norm": 0.2582698532531319, "learning_rate": 1.2792297111416782e-05, "loss": 0.7569, "step": 465 }, { "epoch": 0.06469151095970545, "grad_norm": 0.24946075763865486, "learning_rate": 1.2929848693259975e-05, "loss": 0.7918, "step": 470 }, { "epoch": 0.06537971852310658, "grad_norm": 0.2885880118724355, "learning_rate": 1.3067400275103163e-05, "loss": 0.8256, "step": 475 }, { "epoch": 0.06606792608650769, "grad_norm": 0.26556655622324205, "learning_rate": 1.3204951856946356e-05, "loss": 0.7618, "step": 480 }, { "epoch": 0.06675613364990882, "grad_norm": 0.24485781392219974, "learning_rate": 1.3342503438789546e-05, "loss": 0.8012, "step": 485 }, { "epoch": 0.06744434121330993, "grad_norm": 0.25655612782794157, "learning_rate": 1.3480055020632738e-05, "loss": 0.8461, "step": 490 }, { "epoch": 0.06813254877671106, "grad_norm": 0.24215321618147564, "learning_rate": 1.361760660247593e-05, "loss": 0.7952, "step": 495 }, { "epoch": 0.06882075634011217, "grad_norm": 0.2550254345291816, "learning_rate": 1.3755158184319121e-05, "loss": 0.7977, "step": 500 }, { "epoch": 0.0695089639035133, "grad_norm": 0.2351950781118825, "learning_rate": 1.3892709766162312e-05, "loss": 0.7397, "step": 505 }, { "epoch": 0.07019717146691443, "grad_norm": 0.25841976485403306, "learning_rate": 1.4030261348005502e-05, "loss": 0.8336, "step": 510 }, { "epoch": 0.07088537903031554, "grad_norm": 0.23271524382832345, "learning_rate": 1.4167812929848695e-05, "loss": 0.8959, "step": 515 }, { "epoch": 0.07157358659371667, "grad_norm": 0.24020488275466886, "learning_rate": 1.4305364511691885e-05, "loss": 0.7505, "step": 520 }, { "epoch": 0.07226179415711778, "grad_norm": 0.21773384020940342, "learning_rate": 1.4442916093535078e-05, "loss": 0.7261, "step": 525 }, { "epoch": 0.07295000172051891, "grad_norm": 0.24151374742276033, "learning_rate": 1.4580467675378266e-05, "loss": 0.7708, "step": 530 }, { "epoch": 0.07363820928392004, "grad_norm": 0.24926440943094774, "learning_rate": 1.4718019257221459e-05, "loss": 0.8332, "step": 535 }, { "epoch": 0.07432641684732115, "grad_norm": 0.23778999228251665, "learning_rate": 1.4855570839064651e-05, "loss": 0.737, "step": 540 }, { "epoch": 0.07501462441072228, "grad_norm": 0.24795131488863945, "learning_rate": 1.4993122420907842e-05, "loss": 0.7593, "step": 545 }, { "epoch": 0.07570283197412339, "grad_norm": 0.26408080272699436, "learning_rate": 1.5130674002751034e-05, "loss": 0.7548, "step": 550 }, { "epoch": 0.07639103953752452, "grad_norm": 0.2159724531175162, "learning_rate": 1.5268225584594224e-05, "loss": 0.6977, "step": 555 }, { "epoch": 0.07707924710092563, "grad_norm": 0.2385802666016439, "learning_rate": 1.5405777166437415e-05, "loss": 0.8057, "step": 560 }, { "epoch": 0.07776745466432676, "grad_norm": 0.2273885454076171, "learning_rate": 1.5543328748280606e-05, "loss": 0.7538, "step": 565 }, { "epoch": 0.07845566222772789, "grad_norm": 0.26795677211884505, "learning_rate": 1.5680880330123796e-05, "loss": 0.8225, "step": 570 }, { "epoch": 0.079143869791129, "grad_norm": 0.22376118744302545, "learning_rate": 1.581843191196699e-05, "loss": 0.7449, "step": 575 }, { "epoch": 0.07983207735453013, "grad_norm": 0.22413207876216854, "learning_rate": 1.595598349381018e-05, "loss": 0.6953, "step": 580 }, { "epoch": 0.08052028491793124, "grad_norm": 0.22871575826370327, "learning_rate": 1.609353507565337e-05, "loss": 0.7889, "step": 585 }, { "epoch": 0.08120849248133237, "grad_norm": 0.24284180799999136, "learning_rate": 1.6231086657496562e-05, "loss": 0.7883, "step": 590 }, { "epoch": 0.0818967000447335, "grad_norm": 0.23801824168710917, "learning_rate": 1.6368638239339752e-05, "loss": 0.8241, "step": 595 }, { "epoch": 0.08258490760813461, "grad_norm": 0.23752003411990083, "learning_rate": 1.6506189821182946e-05, "loss": 0.8037, "step": 600 }, { "epoch": 0.08258490760813461, "eval_loss": 0.7917500734329224, "eval_runtime": 52.7347, "eval_samples_per_second": 94.814, "eval_steps_per_second": 2.977, "step": 600 }, { "epoch": 0.08327311517153574, "grad_norm": 0.21498088395349046, "learning_rate": 1.6643741403026137e-05, "loss": 0.8051, "step": 605 }, { "epoch": 0.08396132273493685, "grad_norm": 0.2723853607428669, "learning_rate": 1.6781292984869327e-05, "loss": 0.8494, "step": 610 }, { "epoch": 0.08464953029833798, "grad_norm": 0.25087909139029063, "learning_rate": 1.6918844566712518e-05, "loss": 0.7954, "step": 615 }, { "epoch": 0.0853377378617391, "grad_norm": 0.26264853297170815, "learning_rate": 1.705639614855571e-05, "loss": 0.8179, "step": 620 }, { "epoch": 0.08602594542514022, "grad_norm": 0.25442767943563843, "learning_rate": 1.7193947730398902e-05, "loss": 0.7913, "step": 625 }, { "epoch": 0.08671415298854135, "grad_norm": 0.2485772674865671, "learning_rate": 1.7331499312242093e-05, "loss": 0.8462, "step": 630 }, { "epoch": 0.08740236055194246, "grad_norm": 0.2297827668923588, "learning_rate": 1.7469050894085284e-05, "loss": 0.8126, "step": 635 }, { "epoch": 0.08809056811534359, "grad_norm": 0.2580226174369326, "learning_rate": 1.7606602475928474e-05, "loss": 0.7875, "step": 640 }, { "epoch": 0.0887787756787447, "grad_norm": 0.25545084792637834, "learning_rate": 1.7744154057771665e-05, "loss": 0.7928, "step": 645 }, { "epoch": 0.08946698324214583, "grad_norm": 0.21744022535105786, "learning_rate": 1.7881705639614855e-05, "loss": 0.7524, "step": 650 }, { "epoch": 0.09015519080554696, "grad_norm": 0.22631874220082612, "learning_rate": 1.801925722145805e-05, "loss": 0.7999, "step": 655 }, { "epoch": 0.09084339836894807, "grad_norm": 0.2583501703584106, "learning_rate": 1.815680880330124e-05, "loss": 0.7105, "step": 660 }, { "epoch": 0.0915316059323492, "grad_norm": 0.21256186757978743, "learning_rate": 1.829436038514443e-05, "loss": 0.8142, "step": 665 }, { "epoch": 0.09221981349575031, "grad_norm": 0.2597351046945739, "learning_rate": 1.843191196698762e-05, "loss": 0.7873, "step": 670 }, { "epoch": 0.09290802105915144, "grad_norm": 0.27995140675392, "learning_rate": 1.856946354883081e-05, "loss": 0.8617, "step": 675 }, { "epoch": 0.09359622862255257, "grad_norm": 0.23207627757429503, "learning_rate": 1.8707015130674006e-05, "loss": 0.7981, "step": 680 }, { "epoch": 0.09428443618595368, "grad_norm": 0.2291794347272008, "learning_rate": 1.8844566712517196e-05, "loss": 0.8177, "step": 685 }, { "epoch": 0.09497264374935481, "grad_norm": 0.20702167244452424, "learning_rate": 1.8982118294360387e-05, "loss": 0.7888, "step": 690 }, { "epoch": 0.09566085131275592, "grad_norm": 0.24860651589382624, "learning_rate": 1.9119669876203577e-05, "loss": 0.8428, "step": 695 }, { "epoch": 0.09634905887615705, "grad_norm": 0.30628908920240605, "learning_rate": 1.9257221458046768e-05, "loss": 0.795, "step": 700 }, { "epoch": 0.09703726643955816, "grad_norm": 0.2296616012373046, "learning_rate": 1.9394773039889962e-05, "loss": 0.8296, "step": 705 }, { "epoch": 0.09772547400295929, "grad_norm": 0.24277081727446007, "learning_rate": 1.9532324621733152e-05, "loss": 0.7398, "step": 710 }, { "epoch": 0.09841368156636042, "grad_norm": 0.2413020719356136, "learning_rate": 1.9669876203576343e-05, "loss": 0.6853, "step": 715 }, { "epoch": 0.09910188912976153, "grad_norm": 0.24497594993981997, "learning_rate": 1.9807427785419533e-05, "loss": 0.7504, "step": 720 }, { "epoch": 0.09979009669316266, "grad_norm": 0.2391970342069721, "learning_rate": 1.9944979367262724e-05, "loss": 0.8073, "step": 725 }, { "epoch": 0.10047830425656377, "grad_norm": 0.24766426912726863, "learning_rate": 1.9999989609837225e-05, "loss": 0.8292, "step": 730 }, { "epoch": 0.1011665118199649, "grad_norm": 0.21306231491873381, "learning_rate": 1.999992611447622e-05, "loss": 0.7833, "step": 735 }, { "epoch": 0.10185471938336603, "grad_norm": 0.21425758797073008, "learning_rate": 1.999980489643294e-05, "loss": 0.7784, "step": 740 }, { "epoch": 0.10254292694676714, "grad_norm": 0.21481502838756258, "learning_rate": 1.9999625956407085e-05, "loss": 0.8067, "step": 745 }, { "epoch": 0.10323113451016827, "grad_norm": 0.27368148873647646, "learning_rate": 1.999938929543156e-05, "loss": 0.8193, "step": 750 }, { "epoch": 0.10391934207356938, "grad_norm": 0.24566975325911952, "learning_rate": 1.9999094914872443e-05, "loss": 0.7939, "step": 755 }, { "epoch": 0.10460754963697051, "grad_norm": 0.22177602497942456, "learning_rate": 1.999874281642899e-05, "loss": 0.7854, "step": 760 }, { "epoch": 0.10529575720037163, "grad_norm": 0.22326516638079882, "learning_rate": 1.999833300213362e-05, "loss": 0.764, "step": 765 }, { "epoch": 0.10598396476377275, "grad_norm": 0.2295427991285434, "learning_rate": 1.9997865474351913e-05, "loss": 0.8191, "step": 770 }, { "epoch": 0.10667217232717388, "grad_norm": 0.22259751550665133, "learning_rate": 1.9997340235782583e-05, "loss": 0.7646, "step": 775 }, { "epoch": 0.107360379890575, "grad_norm": 0.22058200750981372, "learning_rate": 1.999675728945747e-05, "loss": 0.7609, "step": 780 }, { "epoch": 0.10804858745397612, "grad_norm": 0.2457704583432352, "learning_rate": 1.999611663874152e-05, "loss": 0.838, "step": 785 }, { "epoch": 0.10873679501737724, "grad_norm": 0.21177882992130428, "learning_rate": 1.9995418287332767e-05, "loss": 0.7759, "step": 790 }, { "epoch": 0.10942500258077836, "grad_norm": 0.2503939565217841, "learning_rate": 1.9994662239262318e-05, "loss": 0.8032, "step": 795 }, { "epoch": 0.11011321014417949, "grad_norm": 0.21229753344233715, "learning_rate": 1.9993848498894315e-05, "loss": 0.7333, "step": 800 }, { "epoch": 0.11011321014417949, "eval_loss": 0.7879002690315247, "eval_runtime": 52.731, "eval_samples_per_second": 94.821, "eval_steps_per_second": 2.977, "step": 800 }, { "epoch": 0.1108014177075806, "grad_norm": 0.19448064261242895, "learning_rate": 1.999297707092592e-05, "loss": 0.7098, "step": 805 }, { "epoch": 0.11148962527098173, "grad_norm": 0.21869992031917437, "learning_rate": 1.9992047960387287e-05, "loss": 0.8123, "step": 810 }, { "epoch": 0.11217783283438285, "grad_norm": 0.22918629087160183, "learning_rate": 1.9991061172641526e-05, "loss": 0.7833, "step": 815 }, { "epoch": 0.11286604039778397, "grad_norm": 0.2277325576318785, "learning_rate": 1.9990016713384688e-05, "loss": 0.7605, "step": 820 }, { "epoch": 0.11355424796118509, "grad_norm": 0.20795171239526045, "learning_rate": 1.9988914588645715e-05, "loss": 0.7449, "step": 825 }, { "epoch": 0.11424245552458621, "grad_norm": 0.24557720615091322, "learning_rate": 1.9987754804786416e-05, "loss": 0.7298, "step": 830 }, { "epoch": 0.11493066308798734, "grad_norm": 0.2236305229478849, "learning_rate": 1.9986537368501416e-05, "loss": 0.7905, "step": 835 }, { "epoch": 0.11561887065138846, "grad_norm": 0.2426482676850689, "learning_rate": 1.9985262286818145e-05, "loss": 0.8111, "step": 840 }, { "epoch": 0.11630707821478958, "grad_norm": 0.2167140179416489, "learning_rate": 1.998392956709677e-05, "loss": 0.7324, "step": 845 }, { "epoch": 0.1169952857781907, "grad_norm": 0.2228312073867196, "learning_rate": 1.9982539217030157e-05, "loss": 0.7403, "step": 850 }, { "epoch": 0.11768349334159182, "grad_norm": 0.23718955370577552, "learning_rate": 1.9981091244643843e-05, "loss": 0.8587, "step": 855 }, { "epoch": 0.11837170090499295, "grad_norm": 0.2530234401055118, "learning_rate": 1.9979585658295975e-05, "loss": 0.7738, "step": 860 }, { "epoch": 0.11905990846839407, "grad_norm": 0.255916321887633, "learning_rate": 1.9978022466677265e-05, "loss": 0.7886, "step": 865 }, { "epoch": 0.1197481160317952, "grad_norm": 0.22333665389141752, "learning_rate": 1.9976401678810937e-05, "loss": 0.8244, "step": 870 }, { "epoch": 0.1204363235951963, "grad_norm": 0.2032086513478143, "learning_rate": 1.997472330405269e-05, "loss": 0.7632, "step": 875 }, { "epoch": 0.12112453115859743, "grad_norm": 0.23997697179830377, "learning_rate": 1.997298735209062e-05, "loss": 0.7947, "step": 880 }, { "epoch": 0.12181273872199855, "grad_norm": 0.21585387669572864, "learning_rate": 1.9971193832945184e-05, "loss": 0.7812, "step": 885 }, { "epoch": 0.12250094628539968, "grad_norm": 0.24514008600445167, "learning_rate": 1.996934275696913e-05, "loss": 0.8776, "step": 890 }, { "epoch": 0.1231891538488008, "grad_norm": 0.20512787821937278, "learning_rate": 1.9967434134847443e-05, "loss": 0.7161, "step": 895 }, { "epoch": 0.12387736141220192, "grad_norm": 0.2672063201074863, "learning_rate": 1.9965467977597286e-05, "loss": 0.7676, "step": 900 }, { "epoch": 0.12456556897560304, "grad_norm": 0.24178641785482682, "learning_rate": 1.9963444296567925e-05, "loss": 0.7796, "step": 905 }, { "epoch": 0.12525377653900416, "grad_norm": 0.24242812662458535, "learning_rate": 1.9961363103440684e-05, "loss": 0.7857, "step": 910 }, { "epoch": 0.12594198410240529, "grad_norm": 0.2202469722982878, "learning_rate": 1.995922441022885e-05, "loss": 0.8037, "step": 915 }, { "epoch": 0.1266301916658064, "grad_norm": 0.21976689253115256, "learning_rate": 1.9957028229277628e-05, "loss": 0.7248, "step": 920 }, { "epoch": 0.12731839922920754, "grad_norm": 0.2573512850429467, "learning_rate": 1.9954774573264058e-05, "loss": 0.804, "step": 925 }, { "epoch": 0.12800660679260864, "grad_norm": 0.21223484429852515, "learning_rate": 1.995246345519694e-05, "loss": 0.7094, "step": 930 }, { "epoch": 0.12869481435600977, "grad_norm": 0.25121838449886086, "learning_rate": 1.9950094888416766e-05, "loss": 0.8065, "step": 935 }, { "epoch": 0.1293830219194109, "grad_norm": 0.2201804036448773, "learning_rate": 1.9947668886595645e-05, "loss": 0.8205, "step": 940 }, { "epoch": 0.13007122948281202, "grad_norm": 0.2144702205181123, "learning_rate": 1.994518546373721e-05, "loss": 0.7267, "step": 945 }, { "epoch": 0.13075943704621315, "grad_norm": 0.2460840358935499, "learning_rate": 1.9942644634176547e-05, "loss": 0.8564, "step": 950 }, { "epoch": 0.13144764460961425, "grad_norm": 0.20957697318723228, "learning_rate": 1.994004641258012e-05, "loss": 0.7629, "step": 955 }, { "epoch": 0.13213585217301538, "grad_norm": 0.22355617581740012, "learning_rate": 1.993739081394567e-05, "loss": 0.7132, "step": 960 }, { "epoch": 0.1328240597364165, "grad_norm": 0.21348146904128287, "learning_rate": 1.9934677853602133e-05, "loss": 0.7974, "step": 965 }, { "epoch": 0.13351226729981763, "grad_norm": 0.25871554552299675, "learning_rate": 1.9931907547209563e-05, "loss": 0.7538, "step": 970 }, { "epoch": 0.13420047486321876, "grad_norm": 0.23412375521437684, "learning_rate": 1.9929079910759032e-05, "loss": 0.8231, "step": 975 }, { "epoch": 0.13488868242661986, "grad_norm": 0.199866328338374, "learning_rate": 1.9926194960572536e-05, "loss": 0.7381, "step": 980 }, { "epoch": 0.135576889990021, "grad_norm": 0.23197542329712204, "learning_rate": 1.99232527133029e-05, "loss": 0.7484, "step": 985 }, { "epoch": 0.13626509755342212, "grad_norm": 0.23854565742633904, "learning_rate": 1.9920253185933694e-05, "loss": 0.79, "step": 990 }, { "epoch": 0.13695330511682324, "grad_norm": 0.206979108968536, "learning_rate": 1.991719639577912e-05, "loss": 0.7912, "step": 995 }, { "epoch": 0.13764151268022434, "grad_norm": 0.2295553365165125, "learning_rate": 1.9914082360483924e-05, "loss": 0.7871, "step": 1000 }, { "epoch": 0.13764151268022434, "eval_loss": 0.7817676663398743, "eval_runtime": 52.7373, "eval_samples_per_second": 94.81, "eval_steps_per_second": 2.977, "step": 1000 }, { "epoch": 0.13832972024362547, "grad_norm": 0.22118391234341447, "learning_rate": 1.9910911098023287e-05, "loss": 0.7779, "step": 1005 }, { "epoch": 0.1390179278070266, "grad_norm": 0.20891029053638388, "learning_rate": 1.9907682626702717e-05, "loss": 0.7349, "step": 1010 }, { "epoch": 0.13970613537042773, "grad_norm": 0.19924067604798007, "learning_rate": 1.9904396965157954e-05, "loss": 0.7543, "step": 1015 }, { "epoch": 0.14039434293382885, "grad_norm": 0.19690073095679883, "learning_rate": 1.9901054132354857e-05, "loss": 0.7878, "step": 1020 }, { "epoch": 0.14108255049722995, "grad_norm": 0.22047664004944045, "learning_rate": 1.9897654147589306e-05, "loss": 0.7555, "step": 1025 }, { "epoch": 0.14177075806063108, "grad_norm": 0.22344040991323258, "learning_rate": 1.989419703048706e-05, "loss": 0.7122, "step": 1030 }, { "epoch": 0.1424589656240322, "grad_norm": 0.22272239318200754, "learning_rate": 1.9890682801003676e-05, "loss": 0.7732, "step": 1035 }, { "epoch": 0.14314717318743334, "grad_norm": 0.20794940962333522, "learning_rate": 1.988711147942438e-05, "loss": 0.7102, "step": 1040 }, { "epoch": 0.14383538075083446, "grad_norm": 0.2481794034240613, "learning_rate": 1.988348308636395e-05, "loss": 0.8535, "step": 1045 }, { "epoch": 0.14452358831423556, "grad_norm": 0.20237868727153344, "learning_rate": 1.98797976427666e-05, "loss": 0.8011, "step": 1050 }, { "epoch": 0.1452117958776367, "grad_norm": 0.256089774696112, "learning_rate": 1.9876055169905856e-05, "loss": 0.8312, "step": 1055 }, { "epoch": 0.14590000344103782, "grad_norm": 0.24060504283304612, "learning_rate": 1.9872255689384435e-05, "loss": 0.8131, "step": 1060 }, { "epoch": 0.14658821100443895, "grad_norm": 0.22835282616962924, "learning_rate": 1.9868399223134115e-05, "loss": 0.769, "step": 1065 }, { "epoch": 0.14727641856784007, "grad_norm": 0.22955935611469466, "learning_rate": 1.9864485793415624e-05, "loss": 0.7279, "step": 1070 }, { "epoch": 0.14796462613124117, "grad_norm": 0.2120555458338487, "learning_rate": 1.9860515422818493e-05, "loss": 0.8063, "step": 1075 }, { "epoch": 0.1486528336946423, "grad_norm": 0.23961672177859272, "learning_rate": 1.9856488134260935e-05, "loss": 0.8222, "step": 1080 }, { "epoch": 0.14934104125804343, "grad_norm": 0.23561821601200034, "learning_rate": 1.9852403950989712e-05, "loss": 0.7976, "step": 1085 }, { "epoch": 0.15002924882144456, "grad_norm": 0.22276962032658051, "learning_rate": 1.9848262896579994e-05, "loss": 0.7795, "step": 1090 }, { "epoch": 0.15071745638484568, "grad_norm": 0.19008185426951335, "learning_rate": 1.9844064994935247e-05, "loss": 0.7291, "step": 1095 }, { "epoch": 0.15140566394824678, "grad_norm": 0.2160509023778668, "learning_rate": 1.983981027028705e-05, "loss": 0.7341, "step": 1100 }, { "epoch": 0.1520938715116479, "grad_norm": 0.19876171963058964, "learning_rate": 1.983549874719501e-05, "loss": 0.7588, "step": 1105 }, { "epoch": 0.15278207907504904, "grad_norm": 0.22687067034664965, "learning_rate": 1.9831130450546568e-05, "loss": 0.775, "step": 1110 }, { "epoch": 0.15347028663845017, "grad_norm": 0.24658348762307966, "learning_rate": 1.9826705405556893e-05, "loss": 0.8034, "step": 1115 }, { "epoch": 0.15415849420185126, "grad_norm": 0.21936631969605264, "learning_rate": 1.9822223637768725e-05, "loss": 0.724, "step": 1120 }, { "epoch": 0.1548467017652524, "grad_norm": 0.20382451684839054, "learning_rate": 1.9817685173052217e-05, "loss": 0.7216, "step": 1125 }, { "epoch": 0.15553490932865352, "grad_norm": 0.22090433505564788, "learning_rate": 1.9813090037604798e-05, "loss": 0.7245, "step": 1130 }, { "epoch": 0.15622311689205465, "grad_norm": 0.21667819940597813, "learning_rate": 1.9808438257951018e-05, "loss": 0.79, "step": 1135 }, { "epoch": 0.15691132445545578, "grad_norm": 0.1931085332068848, "learning_rate": 1.9803729860942398e-05, "loss": 0.7189, "step": 1140 }, { "epoch": 0.15759953201885687, "grad_norm": 0.1856902787937216, "learning_rate": 1.9798964873757263e-05, "loss": 0.8128, "step": 1145 }, { "epoch": 0.158287739582258, "grad_norm": 0.23313448921968868, "learning_rate": 1.9794143323900613e-05, "loss": 0.7675, "step": 1150 }, { "epoch": 0.15897594714565913, "grad_norm": 0.20841804338231196, "learning_rate": 1.9789265239203915e-05, "loss": 0.7803, "step": 1155 }, { "epoch": 0.15966415470906026, "grad_norm": 0.21822110172204612, "learning_rate": 1.9784330647825006e-05, "loss": 0.7708, "step": 1160 }, { "epoch": 0.16035236227246139, "grad_norm": 0.18609736215239164, "learning_rate": 1.9779339578247877e-05, "loss": 0.6967, "step": 1165 }, { "epoch": 0.16104056983586248, "grad_norm": 0.21448922090977698, "learning_rate": 1.9774292059282528e-05, "loss": 0.8026, "step": 1170 }, { "epoch": 0.1617287773992636, "grad_norm": 0.21340359417402296, "learning_rate": 1.9769188120064814e-05, "loss": 0.8028, "step": 1175 }, { "epoch": 0.16241698496266474, "grad_norm": 0.21892787797905072, "learning_rate": 1.9764027790056253e-05, "loss": 0.7277, "step": 1180 }, { "epoch": 0.16310519252606587, "grad_norm": 0.19574842716638455, "learning_rate": 1.975881109904388e-05, "loss": 0.7643, "step": 1185 }, { "epoch": 0.163793400089467, "grad_norm": 0.220622646698398, "learning_rate": 1.975353807714005e-05, "loss": 0.7834, "step": 1190 }, { "epoch": 0.1644816076528681, "grad_norm": 0.24720515316109795, "learning_rate": 1.9748208754782295e-05, "loss": 0.8455, "step": 1195 }, { "epoch": 0.16516981521626922, "grad_norm": 0.2145580271830877, "learning_rate": 1.9742823162733108e-05, "loss": 0.8135, "step": 1200 }, { "epoch": 0.16516981521626922, "eval_loss": 0.7735677361488342, "eval_runtime": 52.7442, "eval_samples_per_second": 94.797, "eval_steps_per_second": 2.977, "step": 1200 }, { "epoch": 0.16585802277967035, "grad_norm": 0.20796867692490487, "learning_rate": 1.9737381332079812e-05, "loss": 0.7631, "step": 1205 }, { "epoch": 0.16654623034307148, "grad_norm": 0.22484091452942392, "learning_rate": 1.973188329423434e-05, "loss": 0.8729, "step": 1210 }, { "epoch": 0.1672344379064726, "grad_norm": 0.2255562834083872, "learning_rate": 1.9726329080933077e-05, "loss": 0.7624, "step": 1215 }, { "epoch": 0.1679226454698737, "grad_norm": 0.20602861639015677, "learning_rate": 1.9720718724236665e-05, "loss": 0.7435, "step": 1220 }, { "epoch": 0.16861085303327483, "grad_norm": 0.23647241585259732, "learning_rate": 1.9715052256529833e-05, "loss": 0.8136, "step": 1225 }, { "epoch": 0.16929906059667596, "grad_norm": 0.2068758757004929, "learning_rate": 1.9709329710521188e-05, "loss": 0.7536, "step": 1230 }, { "epoch": 0.1699872681600771, "grad_norm": 0.26534304335940007, "learning_rate": 1.9703551119243047e-05, "loss": 0.7175, "step": 1235 }, { "epoch": 0.1706754757234782, "grad_norm": 0.21442498125191325, "learning_rate": 1.969771651605124e-05, "loss": 0.7549, "step": 1240 }, { "epoch": 0.17136368328687931, "grad_norm": 0.2523883225805991, "learning_rate": 1.96918259346249e-05, "loss": 0.7445, "step": 1245 }, { "epoch": 0.17205189085028044, "grad_norm": 0.21804341495621404, "learning_rate": 1.9685879408966308e-05, "loss": 0.7741, "step": 1250 }, { "epoch": 0.17274009841368157, "grad_norm": 0.24040974170660406, "learning_rate": 1.9679876973400646e-05, "loss": 0.8408, "step": 1255 }, { "epoch": 0.1734283059770827, "grad_norm": 0.19169617204335768, "learning_rate": 1.9673818662575847e-05, "loss": 0.7132, "step": 1260 }, { "epoch": 0.1741165135404838, "grad_norm": 0.22042750095017302, "learning_rate": 1.966770451146236e-05, "loss": 0.7046, "step": 1265 }, { "epoch": 0.17480472110388492, "grad_norm": 0.21912132776328097, "learning_rate": 1.9661534555352965e-05, "loss": 0.8528, "step": 1270 }, { "epoch": 0.17549292866728605, "grad_norm": 0.2269499111175067, "learning_rate": 1.965530882986257e-05, "loss": 0.8536, "step": 1275 }, { "epoch": 0.17618113623068718, "grad_norm": 0.20895630955481365, "learning_rate": 1.9649027370927998e-05, "loss": 0.7473, "step": 1280 }, { "epoch": 0.1768693437940883, "grad_norm": 0.22974631530988204, "learning_rate": 1.964269021480778e-05, "loss": 0.7308, "step": 1285 }, { "epoch": 0.1775575513574894, "grad_norm": 0.20806371463201376, "learning_rate": 1.963629739808195e-05, "loss": 0.789, "step": 1290 }, { "epoch": 0.17824575892089053, "grad_norm": 0.21252030305308997, "learning_rate": 1.962984895765184e-05, "loss": 0.7034, "step": 1295 }, { "epoch": 0.17893396648429166, "grad_norm": 0.21934628364128902, "learning_rate": 1.9623344930739846e-05, "loss": 0.7825, "step": 1300 }, { "epoch": 0.1796221740476928, "grad_norm": 0.21049018105467296, "learning_rate": 1.9616785354889238e-05, "loss": 0.821, "step": 1305 }, { "epoch": 0.18031038161109392, "grad_norm": 0.22497439240985678, "learning_rate": 1.9610170267963923e-05, "loss": 0.7625, "step": 1310 }, { "epoch": 0.18099858917449502, "grad_norm": 0.20378166252946806, "learning_rate": 1.9603499708148245e-05, "loss": 0.7358, "step": 1315 }, { "epoch": 0.18168679673789614, "grad_norm": 0.2361812643406838, "learning_rate": 1.9596773713946746e-05, "loss": 0.8185, "step": 1320 }, { "epoch": 0.18237500430129727, "grad_norm": 0.20967484897646724, "learning_rate": 1.958999232418395e-05, "loss": 0.772, "step": 1325 }, { "epoch": 0.1830632118646984, "grad_norm": 0.22822746701758084, "learning_rate": 1.9583155578004156e-05, "loss": 0.7036, "step": 1330 }, { "epoch": 0.18375141942809953, "grad_norm": 0.20435160671647146, "learning_rate": 1.9576263514871182e-05, "loss": 0.756, "step": 1335 }, { "epoch": 0.18443962699150063, "grad_norm": 0.22168857814851095, "learning_rate": 1.956931617456816e-05, "loss": 0.6834, "step": 1340 }, { "epoch": 0.18512783455490175, "grad_norm": 0.1996375486008469, "learning_rate": 1.9562313597197303e-05, "loss": 0.7243, "step": 1345 }, { "epoch": 0.18581604211830288, "grad_norm": 0.20317075113535665, "learning_rate": 1.955525582317966e-05, "loss": 0.7199, "step": 1350 }, { "epoch": 0.186504249681704, "grad_norm": 0.2205678344920317, "learning_rate": 1.9548142893254893e-05, "loss": 0.7547, "step": 1355 }, { "epoch": 0.18719245724510514, "grad_norm": 0.23400801831851384, "learning_rate": 1.9540974848481052e-05, "loss": 0.8129, "step": 1360 }, { "epoch": 0.18788066480850624, "grad_norm": 0.23972146153324217, "learning_rate": 1.9533751730234316e-05, "loss": 0.7491, "step": 1365 }, { "epoch": 0.18856887237190736, "grad_norm": 0.2308188080570231, "learning_rate": 1.9526473580208773e-05, "loss": 0.8308, "step": 1370 }, { "epoch": 0.1892570799353085, "grad_norm": 0.22408184769962297, "learning_rate": 1.9519140440416164e-05, "loss": 0.7947, "step": 1375 }, { "epoch": 0.18994528749870962, "grad_norm": 0.2498795399022543, "learning_rate": 1.9511752353185648e-05, "loss": 0.7845, "step": 1380 }, { "epoch": 0.19063349506211072, "grad_norm": 0.20390795512572885, "learning_rate": 1.9504309361163566e-05, "loss": 0.6947, "step": 1385 }, { "epoch": 0.19132170262551185, "grad_norm": 0.21458945525719253, "learning_rate": 1.9496811507313185e-05, "loss": 0.8095, "step": 1390 }, { "epoch": 0.19200991018891297, "grad_norm": 0.19559709720468285, "learning_rate": 1.9489258834914442e-05, "loss": 0.7406, "step": 1395 }, { "epoch": 0.1926981177523141, "grad_norm": 0.2393501705613637, "learning_rate": 1.9481651387563712e-05, "loss": 0.7612, "step": 1400 }, { "epoch": 0.1926981177523141, "eval_loss": 0.7699134945869446, "eval_runtime": 52.747, "eval_samples_per_second": 94.792, "eval_steps_per_second": 2.976, "step": 1400 }, { "epoch": 0.19338632531571523, "grad_norm": 0.2256056687784738, "learning_rate": 1.947398920917355e-05, "loss": 0.7495, "step": 1405 }, { "epoch": 0.19407453287911633, "grad_norm": 0.21427883100026512, "learning_rate": 1.9466272343972426e-05, "loss": 0.768, "step": 1410 }, { "epoch": 0.19476274044251746, "grad_norm": 0.21251285833672356, "learning_rate": 1.9458500836504487e-05, "loss": 0.7565, "step": 1415 }, { "epoch": 0.19545094800591858, "grad_norm": 0.2396180707636783, "learning_rate": 1.945067473162929e-05, "loss": 0.7587, "step": 1420 }, { "epoch": 0.1961391555693197, "grad_norm": 0.21221585850601965, "learning_rate": 1.944279407452155e-05, "loss": 0.748, "step": 1425 }, { "epoch": 0.19682736313272084, "grad_norm": 0.19880613969057262, "learning_rate": 1.943485891067086e-05, "loss": 0.7741, "step": 1430 }, { "epoch": 0.19751557069612194, "grad_norm": 0.20673478227833617, "learning_rate": 1.9426869285881457e-05, "loss": 0.7693, "step": 1435 }, { "epoch": 0.19820377825952307, "grad_norm": 0.21189575142626188, "learning_rate": 1.9418825246271946e-05, "loss": 0.7569, "step": 1440 }, { "epoch": 0.1988919858229242, "grad_norm": 0.2325812669998682, "learning_rate": 1.9410726838275014e-05, "loss": 0.7315, "step": 1445 }, { "epoch": 0.19958019338632532, "grad_norm": 0.2323129771577128, "learning_rate": 1.9402574108637195e-05, "loss": 0.7796, "step": 1450 }, { "epoch": 0.20026840094972645, "grad_norm": 0.20234350301664983, "learning_rate": 1.9394367104418578e-05, "loss": 0.7276, "step": 1455 }, { "epoch": 0.20095660851312755, "grad_norm": 0.2240852640601608, "learning_rate": 1.9386105872992543e-05, "loss": 0.807, "step": 1460 }, { "epoch": 0.20164481607652868, "grad_norm": 0.225871645691355, "learning_rate": 1.9377790462045484e-05, "loss": 0.6777, "step": 1465 }, { "epoch": 0.2023330236399298, "grad_norm": 0.22644812795787322, "learning_rate": 1.936942091957654e-05, "loss": 0.8382, "step": 1470 }, { "epoch": 0.20302123120333093, "grad_norm": 0.23995851324600623, "learning_rate": 1.9360997293897314e-05, "loss": 0.8003, "step": 1475 }, { "epoch": 0.20370943876673206, "grad_norm": 0.20405053007479873, "learning_rate": 1.935251963363159e-05, "loss": 0.7151, "step": 1480 }, { "epoch": 0.20439764633013316, "grad_norm": 0.23146908211676637, "learning_rate": 1.9343987987715058e-05, "loss": 0.7644, "step": 1485 }, { "epoch": 0.2050858538935343, "grad_norm": 0.21989079217709911, "learning_rate": 1.933540240539503e-05, "loss": 0.777, "step": 1490 }, { "epoch": 0.20577406145693541, "grad_norm": 0.2282113670434263, "learning_rate": 1.9326762936230157e-05, "loss": 0.7499, "step": 1495 }, { "epoch": 0.20646226902033654, "grad_norm": 0.25452547820513716, "learning_rate": 1.931806963009014e-05, "loss": 0.7829, "step": 1500 }, { "epoch": 0.20715047658373764, "grad_norm": 0.20787137849467968, "learning_rate": 1.9309322537155446e-05, "loss": 0.7403, "step": 1505 }, { "epoch": 0.20783868414713877, "grad_norm": 0.1952438871371424, "learning_rate": 1.930052170791702e-05, "loss": 0.7467, "step": 1510 }, { "epoch": 0.2085268917105399, "grad_norm": 0.2193463414429094, "learning_rate": 1.929166719317597e-05, "loss": 0.7772, "step": 1515 }, { "epoch": 0.20921509927394102, "grad_norm": 0.20224589414069227, "learning_rate": 1.928275904404332e-05, "loss": 0.7833, "step": 1520 }, { "epoch": 0.20990330683734215, "grad_norm": 0.240523456771526, "learning_rate": 1.9273797311939673e-05, "loss": 0.7858, "step": 1525 }, { "epoch": 0.21059151440074325, "grad_norm": 0.2133446001107707, "learning_rate": 1.926478204859493e-05, "loss": 0.7064, "step": 1530 }, { "epoch": 0.21127972196414438, "grad_norm": 0.20972839518313188, "learning_rate": 1.9255713306047998e-05, "loss": 0.7522, "step": 1535 }, { "epoch": 0.2119679295275455, "grad_norm": 0.18609366967322333, "learning_rate": 1.9246591136646477e-05, "loss": 0.8138, "step": 1540 }, { "epoch": 0.21265613709094663, "grad_norm": 0.21775585284385154, "learning_rate": 1.923741559304636e-05, "loss": 0.7091, "step": 1545 }, { "epoch": 0.21334434465434776, "grad_norm": 0.3073629657548617, "learning_rate": 1.9228186728211743e-05, "loss": 0.7995, "step": 1550 }, { "epoch": 0.21403255221774886, "grad_norm": 0.20286717113961447, "learning_rate": 1.9218904595414503e-05, "loss": 0.6525, "step": 1555 }, { "epoch": 0.21472075978115, "grad_norm": 0.23807891065614986, "learning_rate": 1.920956924823399e-05, "loss": 0.7936, "step": 1560 }, { "epoch": 0.21540896734455112, "grad_norm": 0.20325290838283722, "learning_rate": 1.920018074055674e-05, "loss": 0.7063, "step": 1565 }, { "epoch": 0.21609717490795224, "grad_norm": 0.22425129526382953, "learning_rate": 1.9190739126576127e-05, "loss": 0.7573, "step": 1570 }, { "epoch": 0.21678538247135337, "grad_norm": 0.21970346833051707, "learning_rate": 1.918124446079209e-05, "loss": 0.7239, "step": 1575 }, { "epoch": 0.21747359003475447, "grad_norm": 0.20666685632054585, "learning_rate": 1.9171696798010784e-05, "loss": 0.7499, "step": 1580 }, { "epoch": 0.2181617975981556, "grad_norm": 0.19598667615166201, "learning_rate": 1.9162096193344293e-05, "loss": 0.7594, "step": 1585 }, { "epoch": 0.21885000516155673, "grad_norm": 0.24514631235882053, "learning_rate": 1.915244270221029e-05, "loss": 0.8164, "step": 1590 }, { "epoch": 0.21953821272495785, "grad_norm": 0.20520397803640633, "learning_rate": 1.914273638033173e-05, "loss": 0.6679, "step": 1595 }, { "epoch": 0.22022642028835898, "grad_norm": 0.21134981991985036, "learning_rate": 1.9132977283736513e-05, "loss": 0.7421, "step": 1600 }, { "epoch": 0.22022642028835898, "eval_loss": 0.7642711997032166, "eval_runtime": 52.7331, "eval_samples_per_second": 94.817, "eval_steps_per_second": 2.977, "step": 1600 }, { "epoch": 0.22091462785176008, "grad_norm": 0.2324092088277875, "learning_rate": 1.9123165468757182e-05, "loss": 0.7333, "step": 1605 }, { "epoch": 0.2216028354151612, "grad_norm": 0.2092630039644559, "learning_rate": 1.9113300992030587e-05, "loss": 0.6874, "step": 1610 }, { "epoch": 0.22229104297856234, "grad_norm": 0.2321108969617509, "learning_rate": 1.9103383910497552e-05, "loss": 0.7693, "step": 1615 }, { "epoch": 0.22297925054196346, "grad_norm": 0.20152482682123385, "learning_rate": 1.9093414281402554e-05, "loss": 0.7386, "step": 1620 }, { "epoch": 0.2236674581053646, "grad_norm": 0.21190783315389736, "learning_rate": 1.908339216229339e-05, "loss": 0.7518, "step": 1625 }, { "epoch": 0.2243556656687657, "grad_norm": 0.1973775323636198, "learning_rate": 1.9073317611020846e-05, "loss": 0.7653, "step": 1630 }, { "epoch": 0.22504387323216682, "grad_norm": 0.199006692349762, "learning_rate": 1.9063190685738372e-05, "loss": 0.7644, "step": 1635 }, { "epoch": 0.22573208079556795, "grad_norm": 0.19825621790068793, "learning_rate": 1.905301144490172e-05, "loss": 0.755, "step": 1640 }, { "epoch": 0.22642028835896907, "grad_norm": 0.2206849859936177, "learning_rate": 1.9042779947268633e-05, "loss": 0.7352, "step": 1645 }, { "epoch": 0.22710849592237017, "grad_norm": 0.23009681448516275, "learning_rate": 1.9032496251898505e-05, "loss": 0.8035, "step": 1650 }, { "epoch": 0.2277967034857713, "grad_norm": 0.22816633756295968, "learning_rate": 1.9022160418152014e-05, "loss": 0.7851, "step": 1655 }, { "epoch": 0.22848491104917243, "grad_norm": 0.2079311255227717, "learning_rate": 1.901177250569081e-05, "loss": 0.7265, "step": 1660 }, { "epoch": 0.22917311861257356, "grad_norm": 0.22556389039932231, "learning_rate": 1.9001332574477147e-05, "loss": 0.8056, "step": 1665 }, { "epoch": 0.22986132617597468, "grad_norm": 0.22163488173242515, "learning_rate": 1.899084068477356e-05, "loss": 0.7436, "step": 1670 }, { "epoch": 0.23054953373937578, "grad_norm": 0.22453101908829454, "learning_rate": 1.8980296897142496e-05, "loss": 0.8059, "step": 1675 }, { "epoch": 0.2312377413027769, "grad_norm": 0.20650459275871832, "learning_rate": 1.8969701272445977e-05, "loss": 0.8076, "step": 1680 }, { "epoch": 0.23192594886617804, "grad_norm": 0.23702284476249913, "learning_rate": 1.8959053871845237e-05, "loss": 0.7255, "step": 1685 }, { "epoch": 0.23261415642957917, "grad_norm": 0.19998413677936977, "learning_rate": 1.8948354756800388e-05, "loss": 0.766, "step": 1690 }, { "epoch": 0.2333023639929803, "grad_norm": 0.21042537308463502, "learning_rate": 1.8937603989070047e-05, "loss": 0.7728, "step": 1695 }, { "epoch": 0.2339905715563814, "grad_norm": 0.202766411050887, "learning_rate": 1.8926801630710984e-05, "loss": 0.7396, "step": 1700 }, { "epoch": 0.23467877911978252, "grad_norm": 0.21296629062232758, "learning_rate": 1.891594774407777e-05, "loss": 0.745, "step": 1705 }, { "epoch": 0.23536698668318365, "grad_norm": 0.198595670542233, "learning_rate": 1.8905042391822423e-05, "loss": 0.7146, "step": 1710 }, { "epoch": 0.23605519424658478, "grad_norm": 0.24235546391339205, "learning_rate": 1.8894085636894012e-05, "loss": 0.775, "step": 1715 }, { "epoch": 0.2367434018099859, "grad_norm": 0.21132236254570202, "learning_rate": 1.888307754253834e-05, "loss": 0.7584, "step": 1720 }, { "epoch": 0.237431609373387, "grad_norm": 0.22962489985954593, "learning_rate": 1.887201817229755e-05, "loss": 0.7378, "step": 1725 }, { "epoch": 0.23811981693678813, "grad_norm": 0.2100928476984632, "learning_rate": 1.8860907590009765e-05, "loss": 0.8592, "step": 1730 }, { "epoch": 0.23880802450018926, "grad_norm": 0.2040657893602979, "learning_rate": 1.8849745859808718e-05, "loss": 0.6999, "step": 1735 }, { "epoch": 0.2394962320635904, "grad_norm": 0.20748447965852776, "learning_rate": 1.883853304612339e-05, "loss": 0.6932, "step": 1740 }, { "epoch": 0.2401844396269915, "grad_norm": 0.19123478190881893, "learning_rate": 1.8827269213677617e-05, "loss": 0.7887, "step": 1745 }, { "epoch": 0.2408726471903926, "grad_norm": 0.21825323522552822, "learning_rate": 1.8815954427489754e-05, "loss": 0.7484, "step": 1750 }, { "epoch": 0.24156085475379374, "grad_norm": 0.2107945793953133, "learning_rate": 1.8804588752872255e-05, "loss": 0.7233, "step": 1755 }, { "epoch": 0.24224906231719487, "grad_norm": 0.2040974494704742, "learning_rate": 1.8793172255431335e-05, "loss": 0.7939, "step": 1760 }, { "epoch": 0.242937269880596, "grad_norm": 0.19109738371703827, "learning_rate": 1.878170500106656e-05, "loss": 0.7209, "step": 1765 }, { "epoch": 0.2436254774439971, "grad_norm": 0.19305640841556135, "learning_rate": 1.877018705597049e-05, "loss": 0.6902, "step": 1770 }, { "epoch": 0.24431368500739822, "grad_norm": 0.22120122018406746, "learning_rate": 1.8758618486628285e-05, "loss": 0.7849, "step": 1775 }, { "epoch": 0.24500189257079935, "grad_norm": 0.2029413994770908, "learning_rate": 1.8746999359817325e-05, "loss": 0.7454, "step": 1780 }, { "epoch": 0.24569010013420048, "grad_norm": 0.20493942634691786, "learning_rate": 1.8735329742606822e-05, "loss": 0.7544, "step": 1785 }, { "epoch": 0.2463783076976016, "grad_norm": 0.18890584964008378, "learning_rate": 1.8723609702357425e-05, "loss": 0.7249, "step": 1790 }, { "epoch": 0.2470665152610027, "grad_norm": 0.19717089069515184, "learning_rate": 1.871183930672086e-05, "loss": 0.7636, "step": 1795 }, { "epoch": 0.24775472282440383, "grad_norm": 0.19792229593503516, "learning_rate": 1.870001862363951e-05, "loss": 0.7451, "step": 1800 }, { "epoch": 0.24775472282440383, "eval_loss": 0.7594541907310486, "eval_runtime": 52.7317, "eval_samples_per_second": 94.82, "eval_steps_per_second": 2.977, "step": 1800 }, { "epoch": 0.24844293038780496, "grad_norm": 0.21414636682257654, "learning_rate": 1.868814772134603e-05, "loss": 0.7412, "step": 1805 }, { "epoch": 0.2491311379512061, "grad_norm": 0.20402115548652056, "learning_rate": 1.8676226668362955e-05, "loss": 0.8114, "step": 1810 }, { "epoch": 0.24981934551460722, "grad_norm": 0.19203606064827533, "learning_rate": 1.866425553350231e-05, "loss": 0.6816, "step": 1815 }, { "epoch": 0.2505075530780083, "grad_norm": 0.20781756629068618, "learning_rate": 1.8652234385865212e-05, "loss": 0.7266, "step": 1820 }, { "epoch": 0.25119576064140947, "grad_norm": 0.1994233276368824, "learning_rate": 1.8640163294841463e-05, "loss": 0.7208, "step": 1825 }, { "epoch": 0.25188396820481057, "grad_norm": 0.22396668125757926, "learning_rate": 1.8628042330109143e-05, "loss": 0.7377, "step": 1830 }, { "epoch": 0.25257217576821167, "grad_norm": 0.2105319138744887, "learning_rate": 1.861587156163424e-05, "loss": 0.7924, "step": 1835 }, { "epoch": 0.2532603833316128, "grad_norm": 0.22314059124862642, "learning_rate": 1.86036510596702e-05, "loss": 0.7885, "step": 1840 }, { "epoch": 0.2539485908950139, "grad_norm": 0.22471933606953126, "learning_rate": 1.8591380894757573e-05, "loss": 0.8471, "step": 1845 }, { "epoch": 0.2546367984584151, "grad_norm": 0.22685874053246408, "learning_rate": 1.857906113772356e-05, "loss": 0.7217, "step": 1850 }, { "epoch": 0.2553250060218162, "grad_norm": 0.2098478806656804, "learning_rate": 1.8566691859681624e-05, "loss": 0.8006, "step": 1855 }, { "epoch": 0.2560132135852173, "grad_norm": 0.18998733234593676, "learning_rate": 1.855427313203109e-05, "loss": 0.7409, "step": 1860 }, { "epoch": 0.25670142114861844, "grad_norm": 0.21712173854250844, "learning_rate": 1.854180502645671e-05, "loss": 0.7453, "step": 1865 }, { "epoch": 0.25738962871201954, "grad_norm": 0.20952866566439698, "learning_rate": 1.852928761492827e-05, "loss": 0.711, "step": 1870 }, { "epoch": 0.2580778362754207, "grad_norm": 0.20416883813212156, "learning_rate": 1.851672096970016e-05, "loss": 0.7011, "step": 1875 }, { "epoch": 0.2587660438388218, "grad_norm": 0.1800851461502608, "learning_rate": 1.8504105163310956e-05, "loss": 0.7006, "step": 1880 }, { "epoch": 0.2594542514022229, "grad_norm": 0.19404826951953036, "learning_rate": 1.8491440268583024e-05, "loss": 0.7747, "step": 1885 }, { "epoch": 0.26014245896562405, "grad_norm": 0.1958390480167634, "learning_rate": 1.847872635862207e-05, "loss": 0.7485, "step": 1890 }, { "epoch": 0.26083066652902515, "grad_norm": 0.18983271569816954, "learning_rate": 1.8465963506816727e-05, "loss": 0.7415, "step": 1895 }, { "epoch": 0.2615188740924263, "grad_norm": 0.18969728108759565, "learning_rate": 1.845315178683816e-05, "loss": 0.6805, "step": 1900 }, { "epoch": 0.2622070816558274, "grad_norm": 0.22101311744325242, "learning_rate": 1.8440291272639583e-05, "loss": 0.7912, "step": 1905 }, { "epoch": 0.2628952892192285, "grad_norm": 0.24692526060886472, "learning_rate": 1.842738203845589e-05, "loss": 0.7452, "step": 1910 }, { "epoch": 0.26358349678262966, "grad_norm": 0.1860762508510459, "learning_rate": 1.841442415880319e-05, "loss": 0.7252, "step": 1915 }, { "epoch": 0.26427170434603076, "grad_norm": 0.21042867195651055, "learning_rate": 1.8401417708478397e-05, "loss": 0.758, "step": 1920 }, { "epoch": 0.2649599119094319, "grad_norm": 0.22157930431445166, "learning_rate": 1.8388362762558786e-05, "loss": 0.7195, "step": 1925 }, { "epoch": 0.265648119472833, "grad_norm": 0.20302795024000786, "learning_rate": 1.837525939640156e-05, "loss": 0.7991, "step": 1930 }, { "epoch": 0.2663363270362341, "grad_norm": 0.24306040112374275, "learning_rate": 1.8362107685643424e-05, "loss": 0.7841, "step": 1935 }, { "epoch": 0.26702453459963527, "grad_norm": 0.19854192503907037, "learning_rate": 1.8348907706200142e-05, "loss": 0.8032, "step": 1940 }, { "epoch": 0.26771274216303637, "grad_norm": 0.19675981434016507, "learning_rate": 1.8335659534266095e-05, "loss": 0.7349, "step": 1945 }, { "epoch": 0.2684009497264375, "grad_norm": 0.20344315183340758, "learning_rate": 1.8322363246313852e-05, "loss": 0.6778, "step": 1950 }, { "epoch": 0.2690891572898386, "grad_norm": 0.21162243716721607, "learning_rate": 1.8309018919093723e-05, "loss": 0.7395, "step": 1955 }, { "epoch": 0.2697773648532397, "grad_norm": 0.1908079867227385, "learning_rate": 1.8295626629633308e-05, "loss": 0.7495, "step": 1960 }, { "epoch": 0.2704655724166409, "grad_norm": 0.20911307090491074, "learning_rate": 1.8282186455237072e-05, "loss": 0.6922, "step": 1965 }, { "epoch": 0.271153779980042, "grad_norm": 0.18515864053780087, "learning_rate": 1.8268698473485872e-05, "loss": 0.7154, "step": 1970 }, { "epoch": 0.2718419875434431, "grad_norm": 0.22541339746854136, "learning_rate": 1.8255162762236537e-05, "loss": 0.7103, "step": 1975 }, { "epoch": 0.27253019510684423, "grad_norm": 0.17809100125272448, "learning_rate": 1.82415793996214e-05, "loss": 0.6899, "step": 1980 }, { "epoch": 0.27321840267024533, "grad_norm": 0.22305819494976623, "learning_rate": 1.8227948464047854e-05, "loss": 0.7403, "step": 1985 }, { "epoch": 0.2739066102336465, "grad_norm": 0.1862885745625827, "learning_rate": 1.821427003419789e-05, "loss": 0.7533, "step": 1990 }, { "epoch": 0.2745948177970476, "grad_norm": 0.22155834638460667, "learning_rate": 1.8200544189027664e-05, "loss": 0.7322, "step": 1995 }, { "epoch": 0.2752830253604487, "grad_norm": 0.1969497556488121, "learning_rate": 1.818677100776702e-05, "loss": 0.7388, "step": 2000 }, { "epoch": 0.2752830253604487, "eval_loss": 0.7555726766586304, "eval_runtime": 52.7316, "eval_samples_per_second": 94.82, "eval_steps_per_second": 2.977, "step": 2000 }, { "epoch": 0.27597123292384984, "grad_norm": 0.2095180318324119, "learning_rate": 1.8172950569919038e-05, "loss": 0.8585, "step": 2005 }, { "epoch": 0.27665944048725094, "grad_norm": 0.20169627617694177, "learning_rate": 1.8159082955259588e-05, "loss": 0.6917, "step": 2010 }, { "epoch": 0.2773476480506521, "grad_norm": 0.19659661222703856, "learning_rate": 1.814516824383685e-05, "loss": 0.7913, "step": 2015 }, { "epoch": 0.2780358556140532, "grad_norm": 0.24683537415747775, "learning_rate": 1.8131206515970862e-05, "loss": 0.7757, "step": 2020 }, { "epoch": 0.2787240631774543, "grad_norm": 0.21600348610601483, "learning_rate": 1.811719785225306e-05, "loss": 0.8324, "step": 2025 }, { "epoch": 0.27941227074085545, "grad_norm": 0.21564558272420548, "learning_rate": 1.8103142333545807e-05, "loss": 0.7637, "step": 2030 }, { "epoch": 0.28010047830425655, "grad_norm": 0.21497359923092296, "learning_rate": 1.8089040040981926e-05, "loss": 0.7121, "step": 2035 }, { "epoch": 0.2807886858676577, "grad_norm": 0.25266058363702826, "learning_rate": 1.8074891055964233e-05, "loss": 0.7352, "step": 2040 }, { "epoch": 0.2814768934310588, "grad_norm": 0.25355406745358255, "learning_rate": 1.8060695460165063e-05, "loss": 0.7156, "step": 2045 }, { "epoch": 0.2821651009944599, "grad_norm": 0.21469665713291605, "learning_rate": 1.8046453335525816e-05, "loss": 0.7543, "step": 2050 }, { "epoch": 0.28285330855786106, "grad_norm": 0.20988710870642413, "learning_rate": 1.803216476425646e-05, "loss": 0.7439, "step": 2055 }, { "epoch": 0.28354151612126216, "grad_norm": 0.20113759650967541, "learning_rate": 1.801782982883507e-05, "loss": 0.7272, "step": 2060 }, { "epoch": 0.2842297236846633, "grad_norm": 0.2068343180012698, "learning_rate": 1.800344861200735e-05, "loss": 0.7831, "step": 2065 }, { "epoch": 0.2849179312480644, "grad_norm": 0.20333278073378902, "learning_rate": 1.7989021196786154e-05, "loss": 0.7798, "step": 2070 }, { "epoch": 0.2856061388114655, "grad_norm": 0.2228723414042137, "learning_rate": 1.797454766645101e-05, "loss": 0.7554, "step": 2075 }, { "epoch": 0.28629434637486667, "grad_norm": 0.19841019645089839, "learning_rate": 1.796002810454763e-05, "loss": 0.7796, "step": 2080 }, { "epoch": 0.28698255393826777, "grad_norm": 0.22065123290437863, "learning_rate": 1.7945462594887443e-05, "loss": 0.7391, "step": 2085 }, { "epoch": 0.2876707615016689, "grad_norm": 0.2100385154347628, "learning_rate": 1.7930851221547096e-05, "loss": 0.7644, "step": 2090 }, { "epoch": 0.28835896906507, "grad_norm": 0.20688682554236, "learning_rate": 1.7916194068867983e-05, "loss": 0.7215, "step": 2095 }, { "epoch": 0.2890471766284711, "grad_norm": 0.2084690604571315, "learning_rate": 1.790149122145574e-05, "loss": 0.776, "step": 2100 }, { "epoch": 0.2897353841918723, "grad_norm": 0.2197866074034849, "learning_rate": 1.788674276417978e-05, "loss": 0.7658, "step": 2105 }, { "epoch": 0.2904235917552734, "grad_norm": 0.20430199758381729, "learning_rate": 1.7871948782172774e-05, "loss": 0.6842, "step": 2110 }, { "epoch": 0.29111179931867454, "grad_norm": 0.21880861451419725, "learning_rate": 1.785710936083019e-05, "loss": 0.8003, "step": 2115 }, { "epoch": 0.29180000688207564, "grad_norm": 0.21420248675674025, "learning_rate": 1.7842224585809784e-05, "loss": 0.7783, "step": 2120 }, { "epoch": 0.29248821444547674, "grad_norm": 0.2198701056180671, "learning_rate": 1.7827294543031103e-05, "loss": 0.7421, "step": 2125 }, { "epoch": 0.2931764220088779, "grad_norm": 0.2252960919484524, "learning_rate": 1.7812319318674992e-05, "loss": 0.7517, "step": 2130 }, { "epoch": 0.293864629572279, "grad_norm": 0.25986304651334435, "learning_rate": 1.779729899918311e-05, "loss": 0.8107, "step": 2135 }, { "epoch": 0.29455283713568015, "grad_norm": 0.26430107201528963, "learning_rate": 1.77822336712574e-05, "loss": 0.705, "step": 2140 }, { "epoch": 0.29524104469908125, "grad_norm": 0.2258452731215674, "learning_rate": 1.7767123421859624e-05, "loss": 0.7335, "step": 2145 }, { "epoch": 0.29592925226248235, "grad_norm": 0.2355369329433477, "learning_rate": 1.7751968338210835e-05, "loss": 0.7599, "step": 2150 }, { "epoch": 0.2966174598258835, "grad_norm": 0.22697325824977216, "learning_rate": 1.773676850779089e-05, "loss": 0.7728, "step": 2155 }, { "epoch": 0.2973056673892846, "grad_norm": 0.18956339155990345, "learning_rate": 1.772152401833794e-05, "loss": 0.7348, "step": 2160 }, { "epoch": 0.29799387495268576, "grad_norm": 0.21276094941209778, "learning_rate": 1.770623495784791e-05, "loss": 0.7198, "step": 2165 }, { "epoch": 0.29868208251608686, "grad_norm": 0.19789785522372536, "learning_rate": 1.7690901414574014e-05, "loss": 0.7787, "step": 2170 }, { "epoch": 0.29937029007948796, "grad_norm": 0.1779340242442606, "learning_rate": 1.7675523477026225e-05, "loss": 0.7561, "step": 2175 }, { "epoch": 0.3000584976428891, "grad_norm": 0.20599637384444558, "learning_rate": 1.766010123397079e-05, "loss": 0.7254, "step": 2180 }, { "epoch": 0.3007467052062902, "grad_norm": 0.20648807792092014, "learning_rate": 1.764463477442968e-05, "loss": 0.7648, "step": 2185 }, { "epoch": 0.30143491276969137, "grad_norm": 0.19938404315682767, "learning_rate": 1.7629124187680115e-05, "loss": 0.7144, "step": 2190 }, { "epoch": 0.30212312033309247, "grad_norm": 0.19126054917579605, "learning_rate": 1.761356956325402e-05, "loss": 0.7427, "step": 2195 }, { "epoch": 0.30281132789649357, "grad_norm": 0.2006515448727996, "learning_rate": 1.7597970990937517e-05, "loss": 0.7707, "step": 2200 }, { "epoch": 0.30281132789649357, "eval_loss": 0.752252995967865, "eval_runtime": 52.7347, "eval_samples_per_second": 94.814, "eval_steps_per_second": 2.977, "step": 2200 }, { "epoch": 0.3034995354598947, "grad_norm": 0.2130563507348462, "learning_rate": 1.7582328560770427e-05, "loss": 0.7452, "step": 2205 }, { "epoch": 0.3041877430232958, "grad_norm": 0.1853586362446992, "learning_rate": 1.756664236304571e-05, "loss": 0.7783, "step": 2210 }, { "epoch": 0.304875950586697, "grad_norm": 0.2364741949452633, "learning_rate": 1.7550912488308983e-05, "loss": 0.8034, "step": 2215 }, { "epoch": 0.3055641581500981, "grad_norm": 0.20054297293439713, "learning_rate": 1.7535139027357966e-05, "loss": 0.7657, "step": 2220 }, { "epoch": 0.3062523657134992, "grad_norm": 0.20908295869701474, "learning_rate": 1.7519322071241983e-05, "loss": 0.7344, "step": 2225 }, { "epoch": 0.30694057327690033, "grad_norm": 0.2089383269754471, "learning_rate": 1.7503461711261422e-05, "loss": 0.7208, "step": 2230 }, { "epoch": 0.30762878084030143, "grad_norm": 0.20697572875036033, "learning_rate": 1.748755803896721e-05, "loss": 0.7371, "step": 2235 }, { "epoch": 0.30831698840370253, "grad_norm": 0.21492273221762667, "learning_rate": 1.747161114616029e-05, "loss": 0.7237, "step": 2240 }, { "epoch": 0.3090051959671037, "grad_norm": 0.20566358899432524, "learning_rate": 1.745562112489108e-05, "loss": 0.7197, "step": 2245 }, { "epoch": 0.3096934035305048, "grad_norm": 0.2089996215681862, "learning_rate": 1.743958806745896e-05, "loss": 0.7499, "step": 2250 }, { "epoch": 0.31038161109390594, "grad_norm": 0.18947819952391684, "learning_rate": 1.742351206641172e-05, "loss": 0.7352, "step": 2255 }, { "epoch": 0.31106981865730704, "grad_norm": 0.19218817255689666, "learning_rate": 1.7407393214545032e-05, "loss": 0.7227, "step": 2260 }, { "epoch": 0.31175802622070814, "grad_norm": 0.21779874008520173, "learning_rate": 1.739123160490192e-05, "loss": 0.702, "step": 2265 }, { "epoch": 0.3124462337841093, "grad_norm": 0.17071276760900786, "learning_rate": 1.7375027330772223e-05, "loss": 0.6974, "step": 2270 }, { "epoch": 0.3131344413475104, "grad_norm": 0.20934257841919793, "learning_rate": 1.735878048569205e-05, "loss": 0.7659, "step": 2275 }, { "epoch": 0.31382264891091155, "grad_norm": 0.192261307982098, "learning_rate": 1.7342491163443234e-05, "loss": 0.7193, "step": 2280 }, { "epoch": 0.31451085647431265, "grad_norm": 0.20257272297034706, "learning_rate": 1.7326159458052814e-05, "loss": 0.7239, "step": 2285 }, { "epoch": 0.31519906403771375, "grad_norm": 0.23198562327096983, "learning_rate": 1.730978546379247e-05, "loss": 0.7017, "step": 2290 }, { "epoch": 0.3158872716011149, "grad_norm": 0.1994661631469351, "learning_rate": 1.7293369275177984e-05, "loss": 0.7071, "step": 2295 }, { "epoch": 0.316575479164516, "grad_norm": 0.2055798520238018, "learning_rate": 1.72769109869687e-05, "loss": 0.792, "step": 2300 }, { "epoch": 0.31726368672791716, "grad_norm": 0.20142375855697997, "learning_rate": 1.726041069416698e-05, "loss": 0.7649, "step": 2305 }, { "epoch": 0.31795189429131826, "grad_norm": 0.2217129361983873, "learning_rate": 1.7243868492017636e-05, "loss": 0.7592, "step": 2310 }, { "epoch": 0.31864010185471936, "grad_norm": 0.21134384827862585, "learning_rate": 1.7227284476007407e-05, "loss": 0.7178, "step": 2315 }, { "epoch": 0.3193283094181205, "grad_norm": 0.22663361077597582, "learning_rate": 1.7210658741864384e-05, "loss": 0.8213, "step": 2320 }, { "epoch": 0.3200165169815216, "grad_norm": 0.19414120845972468, "learning_rate": 1.719399138555748e-05, "loss": 0.7282, "step": 2325 }, { "epoch": 0.32070472454492277, "grad_norm": 0.247893539173657, "learning_rate": 1.717728250329585e-05, "loss": 0.7747, "step": 2330 }, { "epoch": 0.32139293210832387, "grad_norm": 0.21421369422533218, "learning_rate": 1.7160532191528356e-05, "loss": 0.7384, "step": 2335 }, { "epoch": 0.32208113967172497, "grad_norm": 0.25617279932961995, "learning_rate": 1.7143740546943014e-05, "loss": 0.7703, "step": 2340 }, { "epoch": 0.3227693472351261, "grad_norm": 0.2006969856217768, "learning_rate": 1.7126907666466407e-05, "loss": 0.7258, "step": 2345 }, { "epoch": 0.3234575547985272, "grad_norm": 0.23017565062093528, "learning_rate": 1.711003364726316e-05, "loss": 0.7066, "step": 2350 }, { "epoch": 0.3241457623619284, "grad_norm": 0.20610961796937408, "learning_rate": 1.709311858673536e-05, "loss": 0.7052, "step": 2355 }, { "epoch": 0.3248339699253295, "grad_norm": 0.2146420235422493, "learning_rate": 1.7076162582521987e-05, "loss": 0.7138, "step": 2360 }, { "epoch": 0.3255221774887306, "grad_norm": 0.19066667274939134, "learning_rate": 1.7059165732498374e-05, "loss": 0.7142, "step": 2365 }, { "epoch": 0.32621038505213173, "grad_norm": 0.20578807031714857, "learning_rate": 1.704212813477562e-05, "loss": 0.8244, "step": 2370 }, { "epoch": 0.32689859261553283, "grad_norm": 0.22058132686896914, "learning_rate": 1.7025049887700037e-05, "loss": 0.7536, "step": 2375 }, { "epoch": 0.327586800178934, "grad_norm": 0.22016316721496565, "learning_rate": 1.7007931089852572e-05, "loss": 0.7123, "step": 2380 }, { "epoch": 0.3282750077423351, "grad_norm": 0.39746207387871074, "learning_rate": 1.699077184004826e-05, "loss": 0.7147, "step": 2385 }, { "epoch": 0.3289632153057362, "grad_norm": 0.20243205933956865, "learning_rate": 1.6973572237335613e-05, "loss": 0.731, "step": 2390 }, { "epoch": 0.32965142286913734, "grad_norm": 0.19672524248633075, "learning_rate": 1.6956332380996094e-05, "loss": 0.7415, "step": 2395 }, { "epoch": 0.33033963043253844, "grad_norm": 0.21083996022202736, "learning_rate": 1.6939052370543508e-05, "loss": 0.7063, "step": 2400 }, { "epoch": 0.33033963043253844, "eval_loss": 0.7481091022491455, "eval_runtime": 52.734, "eval_samples_per_second": 94.816, "eval_steps_per_second": 2.977, "step": 2400 }, { "epoch": 0.3310278379959396, "grad_norm": 0.2271944087173273, "learning_rate": 1.6921732305723446e-05, "loss": 0.7607, "step": 2405 }, { "epoch": 0.3317160455593407, "grad_norm": 0.20162153376323613, "learning_rate": 1.6904372286512706e-05, "loss": 0.6935, "step": 2410 }, { "epoch": 0.3324042531227418, "grad_norm": 0.22289813379488727, "learning_rate": 1.6886972413118724e-05, "loss": 0.7615, "step": 2415 }, { "epoch": 0.33309246068614295, "grad_norm": 0.20835805435560778, "learning_rate": 1.6869532785978974e-05, "loss": 0.7266, "step": 2420 }, { "epoch": 0.33378066824954405, "grad_norm": 0.19311384440336374, "learning_rate": 1.6852053505760397e-05, "loss": 0.7446, "step": 2425 }, { "epoch": 0.3344688758129452, "grad_norm": 0.21294665523870018, "learning_rate": 1.6834534673358846e-05, "loss": 0.7317, "step": 2430 }, { "epoch": 0.3351570833763463, "grad_norm": 0.21055068641548813, "learning_rate": 1.6816976389898457e-05, "loss": 0.7981, "step": 2435 }, { "epoch": 0.3358452909397474, "grad_norm": 0.21492983905497037, "learning_rate": 1.6799378756731107e-05, "loss": 0.7322, "step": 2440 }, { "epoch": 0.33653349850314856, "grad_norm": 0.18355149850562083, "learning_rate": 1.6781741875435806e-05, "loss": 0.6827, "step": 2445 }, { "epoch": 0.33722170606654966, "grad_norm": 0.21420264573660938, "learning_rate": 1.676406584781811e-05, "loss": 0.7618, "step": 2450 }, { "epoch": 0.3379099136299508, "grad_norm": 0.2161673745225748, "learning_rate": 1.6746350775909552e-05, "loss": 0.7245, "step": 2455 }, { "epoch": 0.3385981211933519, "grad_norm": 0.20689393595952102, "learning_rate": 1.6728596761967028e-05, "loss": 0.7327, "step": 2460 }, { "epoch": 0.339286328756753, "grad_norm": 0.19704379479027279, "learning_rate": 1.6710803908472225e-05, "loss": 0.7234, "step": 2465 }, { "epoch": 0.3399745363201542, "grad_norm": 0.1975894212015622, "learning_rate": 1.6692972318131034e-05, "loss": 0.7182, "step": 2470 }, { "epoch": 0.3406627438835553, "grad_norm": 0.21952261730053016, "learning_rate": 1.667510209387293e-05, "loss": 0.7286, "step": 2475 }, { "epoch": 0.3413509514469564, "grad_norm": 0.23092195819504874, "learning_rate": 1.665719333885041e-05, "loss": 0.8018, "step": 2480 }, { "epoch": 0.34203915901035753, "grad_norm": 0.19359751035881798, "learning_rate": 1.6639246156438363e-05, "loss": 0.7031, "step": 2485 }, { "epoch": 0.34272736657375863, "grad_norm": 0.18925791528918764, "learning_rate": 1.662126065023352e-05, "loss": 0.6832, "step": 2490 }, { "epoch": 0.3434155741371598, "grad_norm": 0.2287902168604567, "learning_rate": 1.66032369240538e-05, "loss": 0.7442, "step": 2495 }, { "epoch": 0.3441037817005609, "grad_norm": 0.2000063935843219, "learning_rate": 1.6585175081937763e-05, "loss": 0.795, "step": 2500 }, { "epoch": 0.344791989263962, "grad_norm": 0.2137641286928194, "learning_rate": 1.6567075228143975e-05, "loss": 0.7397, "step": 2505 }, { "epoch": 0.34548019682736314, "grad_norm": 0.20177629358398547, "learning_rate": 1.654893746715042e-05, "loss": 0.6912, "step": 2510 }, { "epoch": 0.34616840439076424, "grad_norm": 0.20053095772078194, "learning_rate": 1.653076190365389e-05, "loss": 0.772, "step": 2515 }, { "epoch": 0.3468566119541654, "grad_norm": 0.222129173475987, "learning_rate": 1.6512548642569388e-05, "loss": 0.7371, "step": 2520 }, { "epoch": 0.3475448195175665, "grad_norm": 0.1979624125485493, "learning_rate": 1.649429778902952e-05, "loss": 0.7337, "step": 2525 }, { "epoch": 0.3482330270809676, "grad_norm": 0.2250118633329388, "learning_rate": 1.6476009448383888e-05, "loss": 0.7425, "step": 2530 }, { "epoch": 0.34892123464436875, "grad_norm": 0.20568611920081328, "learning_rate": 1.645768372619848e-05, "loss": 0.7686, "step": 2535 }, { "epoch": 0.34960944220776985, "grad_norm": 0.17344624313416615, "learning_rate": 1.6439320728255057e-05, "loss": 0.7238, "step": 2540 }, { "epoch": 0.350297649771171, "grad_norm": 0.20376761934047347, "learning_rate": 1.6420920560550553e-05, "loss": 0.684, "step": 2545 }, { "epoch": 0.3509858573345721, "grad_norm": 0.21479243320728963, "learning_rate": 1.640248332929646e-05, "loss": 0.7475, "step": 2550 }, { "epoch": 0.3516740648979732, "grad_norm": 0.20962664569318049, "learning_rate": 1.6384009140918207e-05, "loss": 0.7333, "step": 2555 }, { "epoch": 0.35236227246137436, "grad_norm": 0.17635721662572573, "learning_rate": 1.636549810205455e-05, "loss": 0.6952, "step": 2560 }, { "epoch": 0.35305048002477546, "grad_norm": 0.20473661393396644, "learning_rate": 1.6346950319556963e-05, "loss": 0.7123, "step": 2565 }, { "epoch": 0.3537386875881766, "grad_norm": 0.21714271859260473, "learning_rate": 1.6328365900489012e-05, "loss": 0.7396, "step": 2570 }, { "epoch": 0.3544268951515777, "grad_norm": 0.24067819714194466, "learning_rate": 1.6309744952125736e-05, "loss": 0.7401, "step": 2575 }, { "epoch": 0.3551151027149788, "grad_norm": 0.20280779825157516, "learning_rate": 1.6291087581953046e-05, "loss": 0.7271, "step": 2580 }, { "epoch": 0.35580331027837997, "grad_norm": 0.19607724560087014, "learning_rate": 1.6272393897667077e-05, "loss": 0.737, "step": 2585 }, { "epoch": 0.35649151784178107, "grad_norm": 0.21103509023168734, "learning_rate": 1.625366400717359e-05, "loss": 0.7198, "step": 2590 }, { "epoch": 0.3571797254051822, "grad_norm": 0.20964076642756213, "learning_rate": 1.6234898018587336e-05, "loss": 0.7092, "step": 2595 }, { "epoch": 0.3578679329685833, "grad_norm": 0.20117135150827015, "learning_rate": 1.6216096040231434e-05, "loss": 0.8091, "step": 2600 }, { "epoch": 0.3578679329685833, "eval_loss": 0.7439802885055542, "eval_runtime": 52.7913, "eval_samples_per_second": 94.713, "eval_steps_per_second": 2.974, "step": 2600 }, { "epoch": 0.3585561405319844, "grad_norm": 0.20727759523405648, "learning_rate": 1.6197258180636742e-05, "loss": 0.7227, "step": 2605 }, { "epoch": 0.3592443480953856, "grad_norm": 0.18359425257363016, "learning_rate": 1.617838454854125e-05, "loss": 0.7378, "step": 2610 }, { "epoch": 0.3599325556587867, "grad_norm": 0.19893194992421948, "learning_rate": 1.6159475252889427e-05, "loss": 0.7684, "step": 2615 }, { "epoch": 0.36062076322218783, "grad_norm": 0.19966518175691764, "learning_rate": 1.6140530402831605e-05, "loss": 0.7513, "step": 2620 }, { "epoch": 0.36130897078558893, "grad_norm": 0.22371909857690997, "learning_rate": 1.6121550107723348e-05, "loss": 0.7378, "step": 2625 }, { "epoch": 0.36199717834899003, "grad_norm": 0.20885323797529146, "learning_rate": 1.610253447712482e-05, "loss": 0.7258, "step": 2630 }, { "epoch": 0.3626853859123912, "grad_norm": 0.19529959488462512, "learning_rate": 1.6083483620800154e-05, "loss": 0.7309, "step": 2635 }, { "epoch": 0.3633735934757923, "grad_norm": 0.22590913932532183, "learning_rate": 1.6064397648716815e-05, "loss": 0.749, "step": 2640 }, { "epoch": 0.36406180103919344, "grad_norm": 0.20853020192820743, "learning_rate": 1.6045276671044967e-05, "loss": 0.7011, "step": 2645 }, { "epoch": 0.36475000860259454, "grad_norm": 0.21785773238298042, "learning_rate": 1.6026120798156836e-05, "loss": 0.7917, "step": 2650 }, { "epoch": 0.36543821616599564, "grad_norm": 0.17182131686685842, "learning_rate": 1.6006930140626076e-05, "loss": 0.745, "step": 2655 }, { "epoch": 0.3661264237293968, "grad_norm": 0.18387358711208862, "learning_rate": 1.598770480922713e-05, "loss": 0.6805, "step": 2660 }, { "epoch": 0.3668146312927979, "grad_norm": 0.179924297035791, "learning_rate": 1.5968444914934586e-05, "loss": 0.7289, "step": 2665 }, { "epoch": 0.36750283885619905, "grad_norm": 0.19879419903800225, "learning_rate": 1.594915056892254e-05, "loss": 0.7986, "step": 2670 }, { "epoch": 0.36819104641960015, "grad_norm": 0.20768540338976835, "learning_rate": 1.5929821882563955e-05, "loss": 0.7277, "step": 2675 }, { "epoch": 0.36887925398300125, "grad_norm": 0.23160266661496431, "learning_rate": 1.5910458967430025e-05, "loss": 0.7143, "step": 2680 }, { "epoch": 0.3695674615464024, "grad_norm": 0.21971059664007217, "learning_rate": 1.5891061935289506e-05, "loss": 0.7827, "step": 2685 }, { "epoch": 0.3702556691098035, "grad_norm": 0.2014935110286079, "learning_rate": 1.58716308981081e-05, "loss": 0.7427, "step": 2690 }, { "epoch": 0.37094387667320466, "grad_norm": 0.2058874355314015, "learning_rate": 1.58521659680478e-05, "loss": 0.8216, "step": 2695 }, { "epoch": 0.37163208423660576, "grad_norm": 0.18386072284506022, "learning_rate": 1.5832667257466227e-05, "loss": 0.7695, "step": 2700 }, { "epoch": 0.37232029180000686, "grad_norm": 0.19498518596330836, "learning_rate": 1.5813134878916002e-05, "loss": 0.6921, "step": 2705 }, { "epoch": 0.373008499363408, "grad_norm": 0.19753152296139484, "learning_rate": 1.5793568945144086e-05, "loss": 0.7207, "step": 2710 }, { "epoch": 0.3736967069268091, "grad_norm": 0.20559841147196925, "learning_rate": 1.577396956909113e-05, "loss": 0.7922, "step": 2715 }, { "epoch": 0.3743849144902103, "grad_norm": 0.19670375043668217, "learning_rate": 1.5754336863890832e-05, "loss": 0.7299, "step": 2720 }, { "epoch": 0.3750731220536114, "grad_norm": 0.20609008432938294, "learning_rate": 1.5734670942869262e-05, "loss": 0.7181, "step": 2725 }, { "epoch": 0.3757613296170125, "grad_norm": 0.19751031029657254, "learning_rate": 1.5714971919544238e-05, "loss": 0.7089, "step": 2730 }, { "epoch": 0.37644953718041363, "grad_norm": 0.2101649631245876, "learning_rate": 1.569523990762464e-05, "loss": 0.7573, "step": 2735 }, { "epoch": 0.37713774474381473, "grad_norm": 0.19765149330289453, "learning_rate": 1.5675475021009772e-05, "loss": 0.8016, "step": 2740 }, { "epoch": 0.37782595230721583, "grad_norm": 0.2071939825528664, "learning_rate": 1.565567737378871e-05, "loss": 0.7543, "step": 2745 }, { "epoch": 0.378514159870617, "grad_norm": 0.195884041269756, "learning_rate": 1.563584708023963e-05, "loss": 0.7178, "step": 2750 }, { "epoch": 0.3792023674340181, "grad_norm": 0.21847428550997253, "learning_rate": 1.5615984254829148e-05, "loss": 0.7061, "step": 2755 }, { "epoch": 0.37989057499741924, "grad_norm": 0.18980925992148456, "learning_rate": 1.5596089012211665e-05, "loss": 0.7082, "step": 2760 }, { "epoch": 0.38057878256082034, "grad_norm": 0.1782600515555842, "learning_rate": 1.5576161467228712e-05, "loss": 0.6771, "step": 2765 }, { "epoch": 0.38126699012422144, "grad_norm": 0.19853969780623046, "learning_rate": 1.5556201734908267e-05, "loss": 0.6935, "step": 2770 }, { "epoch": 0.3819551976876226, "grad_norm": 0.2336072456214204, "learning_rate": 1.5536209930464114e-05, "loss": 0.7393, "step": 2775 }, { "epoch": 0.3826434052510237, "grad_norm": 0.22132120718236145, "learning_rate": 1.5516186169295165e-05, "loss": 0.8061, "step": 2780 }, { "epoch": 0.38333161281442485, "grad_norm": 0.2048747922862239, "learning_rate": 1.5496130566984794e-05, "loss": 0.7459, "step": 2785 }, { "epoch": 0.38401982037782595, "grad_norm": 0.20634682177457175, "learning_rate": 1.5476043239300163e-05, "loss": 0.648, "step": 2790 }, { "epoch": 0.38470802794122705, "grad_norm": 0.21894751377726399, "learning_rate": 1.5455924302191582e-05, "loss": 0.7238, "step": 2795 }, { "epoch": 0.3853962355046282, "grad_norm": 0.20175426509794794, "learning_rate": 1.5435773871791813e-05, "loss": 0.764, "step": 2800 }, { "epoch": 0.3853962355046282, "eval_loss": 0.7406504154205322, "eval_runtime": 52.732, "eval_samples_per_second": 94.819, "eval_steps_per_second": 2.977, "step": 2800 }, { "epoch": 0.3860844430680293, "grad_norm": 0.18943282741499323, "learning_rate": 1.5415592064415395e-05, "loss": 0.7222, "step": 2805 }, { "epoch": 0.38677265063143046, "grad_norm": 0.20927377783175394, "learning_rate": 1.5395378996557998e-05, "loss": 0.7312, "step": 2810 }, { "epoch": 0.38746085819483156, "grad_norm": 0.21217722818222637, "learning_rate": 1.537513478489574e-05, "loss": 0.7872, "step": 2815 }, { "epoch": 0.38814906575823266, "grad_norm": 0.1989014527389125, "learning_rate": 1.5354859546284493e-05, "loss": 0.7721, "step": 2820 }, { "epoch": 0.3888372733216338, "grad_norm": 0.2015367144561831, "learning_rate": 1.5334553397759244e-05, "loss": 0.6843, "step": 2825 }, { "epoch": 0.3895254808850349, "grad_norm": 0.19921589555334238, "learning_rate": 1.531421645653339e-05, "loss": 0.7267, "step": 2830 }, { "epoch": 0.39021368844843607, "grad_norm": 0.1917602636669685, "learning_rate": 1.5293848839998085e-05, "loss": 0.6966, "step": 2835 }, { "epoch": 0.39090189601183717, "grad_norm": 0.18631634243749887, "learning_rate": 1.5273450665721545e-05, "loss": 0.733, "step": 2840 }, { "epoch": 0.39159010357523827, "grad_norm": 0.21152164597654996, "learning_rate": 1.5253022051448362e-05, "loss": 0.7537, "step": 2845 }, { "epoch": 0.3922783111386394, "grad_norm": 0.1787044371486554, "learning_rate": 1.5232563115098858e-05, "loss": 0.6999, "step": 2850 }, { "epoch": 0.3929665187020405, "grad_norm": 0.18940726746843622, "learning_rate": 1.521207397476837e-05, "loss": 0.7804, "step": 2855 }, { "epoch": 0.3936547262654417, "grad_norm": 0.19449486033853836, "learning_rate": 1.5191554748726584e-05, "loss": 0.756, "step": 2860 }, { "epoch": 0.3943429338288428, "grad_norm": 0.18523711450501995, "learning_rate": 1.5171005555416857e-05, "loss": 0.7756, "step": 2865 }, { "epoch": 0.3950311413922439, "grad_norm": 0.19470684356486426, "learning_rate": 1.5150426513455519e-05, "loss": 0.7651, "step": 2870 }, { "epoch": 0.39571934895564503, "grad_norm": 0.19076115929732465, "learning_rate": 1.5129817741631195e-05, "loss": 0.7098, "step": 2875 }, { "epoch": 0.39640755651904613, "grad_norm": 0.2091201073229589, "learning_rate": 1.5109179358904123e-05, "loss": 0.7026, "step": 2880 }, { "epoch": 0.3970957640824473, "grad_norm": 0.22074501221244242, "learning_rate": 1.5088511484405466e-05, "loss": 0.7468, "step": 2885 }, { "epoch": 0.3977839716458484, "grad_norm": 0.21590230944332936, "learning_rate": 1.5067814237436622e-05, "loss": 0.7475, "step": 2890 }, { "epoch": 0.3984721792092495, "grad_norm": 0.19438946952337438, "learning_rate": 1.5047087737468534e-05, "loss": 0.7024, "step": 2895 }, { "epoch": 0.39916038677265064, "grad_norm": 0.20159635289001404, "learning_rate": 1.5026332104141e-05, "loss": 0.7318, "step": 2900 }, { "epoch": 0.39984859433605174, "grad_norm": 0.22014741853829994, "learning_rate": 1.500554745726199e-05, "loss": 0.7512, "step": 2905 }, { "epoch": 0.4005368018994529, "grad_norm": 0.18390705510644959, "learning_rate": 1.4984733916806948e-05, "loss": 0.7217, "step": 2910 }, { "epoch": 0.401225009462854, "grad_norm": 0.2070176549552474, "learning_rate": 1.4963891602918102e-05, "loss": 0.7384, "step": 2915 }, { "epoch": 0.4019132170262551, "grad_norm": 0.1989240876822762, "learning_rate": 1.4943020635903762e-05, "loss": 0.7041, "step": 2920 }, { "epoch": 0.40260142458965625, "grad_norm": 0.18330464231638063, "learning_rate": 1.4922121136237645e-05, "loss": 0.6755, "step": 2925 }, { "epoch": 0.40328963215305735, "grad_norm": 0.18399739059007378, "learning_rate": 1.490119322455815e-05, "loss": 0.6997, "step": 2930 }, { "epoch": 0.4039778397164585, "grad_norm": 0.2113893009982699, "learning_rate": 1.4880237021667699e-05, "loss": 0.7122, "step": 2935 }, { "epoch": 0.4046660472798596, "grad_norm": 0.18733027333847171, "learning_rate": 1.4859252648532003e-05, "loss": 0.6762, "step": 2940 }, { "epoch": 0.4053542548432607, "grad_norm": 0.1949838506267082, "learning_rate": 1.4838240226279393e-05, "loss": 0.7213, "step": 2945 }, { "epoch": 0.40604246240666186, "grad_norm": 0.2200056849420356, "learning_rate": 1.48171998762001e-05, "loss": 0.7907, "step": 2950 }, { "epoch": 0.40673066997006296, "grad_norm": 0.1772352976411828, "learning_rate": 1.4796131719745563e-05, "loss": 0.6892, "step": 2955 }, { "epoch": 0.4074188775334641, "grad_norm": 0.19682774329357552, "learning_rate": 1.4775035878527735e-05, "loss": 0.6974, "step": 2960 }, { "epoch": 0.4081070850968652, "grad_norm": 0.20781812035184166, "learning_rate": 1.4753912474318366e-05, "loss": 0.7211, "step": 2965 }, { "epoch": 0.4087952926602663, "grad_norm": 0.20000774888986145, "learning_rate": 1.4732761629048318e-05, "loss": 0.6554, "step": 2970 }, { "epoch": 0.4094835002236675, "grad_norm": 0.21108132954589012, "learning_rate": 1.4711583464806838e-05, "loss": 0.7067, "step": 2975 }, { "epoch": 0.4101717077870686, "grad_norm": 0.19391738099874137, "learning_rate": 1.4690378103840877e-05, "loss": 0.715, "step": 2980 }, { "epoch": 0.41085991535046973, "grad_norm": 0.20227350193713167, "learning_rate": 1.466914566855437e-05, "loss": 0.6911, "step": 2985 }, { "epoch": 0.41154812291387083, "grad_norm": 0.1887611488515709, "learning_rate": 1.4647886281507533e-05, "loss": 0.7065, "step": 2990 }, { "epoch": 0.41223633047727193, "grad_norm": 0.21328159674447736, "learning_rate": 1.4626600065416161e-05, "loss": 0.7492, "step": 2995 }, { "epoch": 0.4129245380406731, "grad_norm": 0.18372415384197258, "learning_rate": 1.4605287143150908e-05, "loss": 0.714, "step": 3000 }, { "epoch": 0.4129245380406731, "eval_loss": 0.7369969487190247, "eval_runtime": 52.7336, "eval_samples_per_second": 94.816, "eval_steps_per_second": 2.977, "step": 3000 }, { "epoch": 0.4136127456040742, "grad_norm": 0.1795227889752451, "learning_rate": 1.4583947637736586e-05, "loss": 0.6831, "step": 3005 }, { "epoch": 0.4143009531674753, "grad_norm": 0.19268679397342522, "learning_rate": 1.4562581672351458e-05, "loss": 0.7269, "step": 3010 }, { "epoch": 0.41498916073087644, "grad_norm": 0.1889657897118296, "learning_rate": 1.4541189370326513e-05, "loss": 0.7099, "step": 3015 }, { "epoch": 0.41567736829427754, "grad_norm": 0.20689626204207287, "learning_rate": 1.4519770855144774e-05, "loss": 0.7279, "step": 3020 }, { "epoch": 0.4163655758576787, "grad_norm": 0.20266145307684555, "learning_rate": 1.4498326250440567e-05, "loss": 0.7588, "step": 3025 }, { "epoch": 0.4170537834210798, "grad_norm": 0.21048290198040726, "learning_rate": 1.4476855679998818e-05, "loss": 0.7217, "step": 3030 }, { "epoch": 0.4177419909844809, "grad_norm": 0.17309038366729013, "learning_rate": 1.445535926775433e-05, "loss": 0.7211, "step": 3035 }, { "epoch": 0.41843019854788205, "grad_norm": 0.2193072945066012, "learning_rate": 1.443383713779108e-05, "loss": 0.7219, "step": 3040 }, { "epoch": 0.41911840611128315, "grad_norm": 0.19335766577766292, "learning_rate": 1.4412289414341497e-05, "loss": 0.6943, "step": 3045 }, { "epoch": 0.4198066136746843, "grad_norm": 0.18815905043377307, "learning_rate": 1.439071622178573e-05, "loss": 0.7779, "step": 3050 }, { "epoch": 0.4204948212380854, "grad_norm": 0.20002782099455188, "learning_rate": 1.4369117684650964e-05, "loss": 0.7315, "step": 3055 }, { "epoch": 0.4211830288014865, "grad_norm": 0.20877083773638963, "learning_rate": 1.434749392761066e-05, "loss": 0.7827, "step": 3060 }, { "epoch": 0.42187123636488766, "grad_norm": 0.17952983493256336, "learning_rate": 1.4325845075483874e-05, "loss": 0.7328, "step": 3065 }, { "epoch": 0.42255944392828876, "grad_norm": 0.21494585206735226, "learning_rate": 1.4304171253234504e-05, "loss": 0.7525, "step": 3070 }, { "epoch": 0.4232476514916899, "grad_norm": 0.1783895027769175, "learning_rate": 1.4282472585970596e-05, "loss": 0.7157, "step": 3075 }, { "epoch": 0.423935859055091, "grad_norm": 0.17601197529584237, "learning_rate": 1.4260749198943597e-05, "loss": 0.6665, "step": 3080 }, { "epoch": 0.4246240666184921, "grad_norm": 0.18380797415813444, "learning_rate": 1.4239001217547656e-05, "loss": 0.7352, "step": 3085 }, { "epoch": 0.42531227418189327, "grad_norm": 0.21481989140772542, "learning_rate": 1.421722876731888e-05, "loss": 0.6928, "step": 3090 }, { "epoch": 0.42600048174529437, "grad_norm": 0.17405560839455303, "learning_rate": 1.419543197393462e-05, "loss": 0.7523, "step": 3095 }, { "epoch": 0.4266886893086955, "grad_norm": 0.20574186057532176, "learning_rate": 1.417361096321274e-05, "loss": 0.8106, "step": 3100 }, { "epoch": 0.4273768968720966, "grad_norm": 0.16609670181299763, "learning_rate": 1.4151765861110902e-05, "loss": 0.6745, "step": 3105 }, { "epoch": 0.4280651044354977, "grad_norm": 0.19387641816788526, "learning_rate": 1.4129896793725827e-05, "loss": 0.6706, "step": 3110 }, { "epoch": 0.4287533119988989, "grad_norm": 0.18931720070179983, "learning_rate": 1.4108003887292572e-05, "loss": 0.7152, "step": 3115 }, { "epoch": 0.4294415195623, "grad_norm": 0.20521724200671632, "learning_rate": 1.4086087268183792e-05, "loss": 0.6661, "step": 3120 }, { "epoch": 0.43012972712570113, "grad_norm": 0.20245734000998633, "learning_rate": 1.406414706290903e-05, "loss": 0.7268, "step": 3125 }, { "epoch": 0.43081793468910223, "grad_norm": 0.17652421442825533, "learning_rate": 1.4042183398113973e-05, "loss": 0.7835, "step": 3130 }, { "epoch": 0.43150614225250333, "grad_norm": 0.22226204105367148, "learning_rate": 1.402019640057972e-05, "loss": 0.7882, "step": 3135 }, { "epoch": 0.4321943498159045, "grad_norm": 0.19398160341066903, "learning_rate": 1.399818619722206e-05, "loss": 0.694, "step": 3140 }, { "epoch": 0.4328825573793056, "grad_norm": 0.19452659826082636, "learning_rate": 1.3976152915090719e-05, "loss": 0.6984, "step": 3145 }, { "epoch": 0.43357076494270674, "grad_norm": 0.18110422519703132, "learning_rate": 1.3954096681368664e-05, "loss": 0.6882, "step": 3150 }, { "epoch": 0.43425897250610784, "grad_norm": 0.18514831198785398, "learning_rate": 1.3932017623371324e-05, "loss": 0.6372, "step": 3155 }, { "epoch": 0.43494718006950894, "grad_norm": 0.1907039848705332, "learning_rate": 1.3909915868545886e-05, "loss": 0.7035, "step": 3160 }, { "epoch": 0.4356353876329101, "grad_norm": 0.20043098572201315, "learning_rate": 1.3887791544470559e-05, "loss": 0.7818, "step": 3165 }, { "epoch": 0.4363235951963112, "grad_norm": 0.2075010097605494, "learning_rate": 1.3865644778853809e-05, "loss": 0.6939, "step": 3170 }, { "epoch": 0.43701180275971235, "grad_norm": 0.18319492503147103, "learning_rate": 1.3843475699533658e-05, "loss": 0.7037, "step": 3175 }, { "epoch": 0.43770001032311345, "grad_norm": 0.20269575765891254, "learning_rate": 1.3821284434476916e-05, "loss": 0.7131, "step": 3180 }, { "epoch": 0.43838821788651455, "grad_norm": 0.18942042471015913, "learning_rate": 1.3799071111778475e-05, "loss": 0.7319, "step": 3185 }, { "epoch": 0.4390764254499157, "grad_norm": 0.17403812023742662, "learning_rate": 1.3776835859660529e-05, "loss": 0.7587, "step": 3190 }, { "epoch": 0.4397646330133168, "grad_norm": 0.21846607689856837, "learning_rate": 1.3754578806471872e-05, "loss": 0.7633, "step": 3195 }, { "epoch": 0.44045284057671796, "grad_norm": 0.19451707282995306, "learning_rate": 1.3732300080687133e-05, "loss": 0.6745, "step": 3200 }, { "epoch": 0.44045284057671796, "eval_loss": 0.7338817119598389, "eval_runtime": 52.7413, "eval_samples_per_second": 94.802, "eval_steps_per_second": 2.977, "step": 3200 }, { "epoch": 0.44114104814011906, "grad_norm": 0.22448551388402496, "learning_rate": 1.3709999810906043e-05, "loss": 0.7611, "step": 3205 }, { "epoch": 0.44182925570352016, "grad_norm": 0.18718293485729295, "learning_rate": 1.368767812585269e-05, "loss": 0.7398, "step": 3210 }, { "epoch": 0.4425174632669213, "grad_norm": 0.20430212485442212, "learning_rate": 1.3665335154374779e-05, "loss": 0.6773, "step": 3215 }, { "epoch": 0.4432056708303224, "grad_norm": 0.21146133827414298, "learning_rate": 1.364297102544289e-05, "loss": 0.7439, "step": 3220 }, { "epoch": 0.4438938783937236, "grad_norm": 0.21856812497682784, "learning_rate": 1.362058586814973e-05, "loss": 0.7302, "step": 3225 }, { "epoch": 0.4445820859571247, "grad_norm": 0.2072882869509707, "learning_rate": 1.359817981170938e-05, "loss": 0.8, "step": 3230 }, { "epoch": 0.4452702935205258, "grad_norm": 0.18827317551311795, "learning_rate": 1.3575752985456566e-05, "loss": 0.6823, "step": 3235 }, { "epoch": 0.44595850108392693, "grad_norm": 0.19580677313279943, "learning_rate": 1.3553305518845898e-05, "loss": 0.7909, "step": 3240 }, { "epoch": 0.44664670864732803, "grad_norm": 0.1822855392891766, "learning_rate": 1.3530837541451138e-05, "loss": 0.6645, "step": 3245 }, { "epoch": 0.4473349162107292, "grad_norm": 0.18888245490423103, "learning_rate": 1.3508349182964432e-05, "loss": 0.7016, "step": 3250 }, { "epoch": 0.4480231237741303, "grad_norm": 0.19332401863625526, "learning_rate": 1.348584057319558e-05, "loss": 0.7282, "step": 3255 }, { "epoch": 0.4487113313375314, "grad_norm": 0.20411607460757145, "learning_rate": 1.346331184207127e-05, "loss": 0.7618, "step": 3260 }, { "epoch": 0.44939953890093254, "grad_norm": 0.1561055256270716, "learning_rate": 1.3440763119634341e-05, "loss": 0.7198, "step": 3265 }, { "epoch": 0.45008774646433364, "grad_norm": 0.21753310423752611, "learning_rate": 1.3418194536043037e-05, "loss": 0.7547, "step": 3270 }, { "epoch": 0.45077595402773474, "grad_norm": 0.19790471919914365, "learning_rate": 1.339560622157023e-05, "loss": 0.7769, "step": 3275 }, { "epoch": 0.4514641615911359, "grad_norm": 0.1694341563768366, "learning_rate": 1.3372998306602696e-05, "loss": 0.6702, "step": 3280 }, { "epoch": 0.452152369154537, "grad_norm": 0.2067240860252485, "learning_rate": 1.335037092164035e-05, "loss": 0.7474, "step": 3285 }, { "epoch": 0.45284057671793815, "grad_norm": 0.20116183709755686, "learning_rate": 1.332772419729549e-05, "loss": 0.7525, "step": 3290 }, { "epoch": 0.45352878428133925, "grad_norm": 0.20487698819831318, "learning_rate": 1.3305058264292055e-05, "loss": 0.7282, "step": 3295 }, { "epoch": 0.45421699184474035, "grad_norm": 0.18002430768171065, "learning_rate": 1.3282373253464848e-05, "loss": 0.7086, "step": 3300 }, { "epoch": 0.4549051994081415, "grad_norm": 0.18882658605531621, "learning_rate": 1.325966929575881e-05, "loss": 0.7134, "step": 3305 }, { "epoch": 0.4555934069715426, "grad_norm": 0.1982295375303005, "learning_rate": 1.3236946522228243e-05, "loss": 0.733, "step": 3310 }, { "epoch": 0.45628161453494376, "grad_norm": 0.2280465359878875, "learning_rate": 1.3214205064036055e-05, "loss": 0.7231, "step": 3315 }, { "epoch": 0.45696982209834486, "grad_norm": 0.1753932192358702, "learning_rate": 1.3191445052453014e-05, "loss": 0.7176, "step": 3320 }, { "epoch": 0.45765802966174596, "grad_norm": 0.19553836430863297, "learning_rate": 1.3168666618856984e-05, "loss": 0.7367, "step": 3325 }, { "epoch": 0.4583462372251471, "grad_norm": 0.1991065220473743, "learning_rate": 1.3145869894732169e-05, "loss": 0.7217, "step": 3330 }, { "epoch": 0.4590344447885482, "grad_norm": 0.182004320085633, "learning_rate": 1.312305501166834e-05, "loss": 0.7567, "step": 3335 }, { "epoch": 0.45972265235194937, "grad_norm": 0.19995353351401818, "learning_rate": 1.3100222101360098e-05, "loss": 0.7438, "step": 3340 }, { "epoch": 0.46041085991535047, "grad_norm": 0.18819227934094418, "learning_rate": 1.3077371295606098e-05, "loss": 0.7179, "step": 3345 }, { "epoch": 0.46109906747875157, "grad_norm": 0.1990780231046755, "learning_rate": 1.30545027263083e-05, "loss": 0.6861, "step": 3350 }, { "epoch": 0.4617872750421527, "grad_norm": 0.19768219215123314, "learning_rate": 1.303161652547119e-05, "loss": 0.657, "step": 3355 }, { "epoch": 0.4624754826055538, "grad_norm": 0.20937667173315025, "learning_rate": 1.3008712825201035e-05, "loss": 0.6631, "step": 3360 }, { "epoch": 0.463163690168955, "grad_norm": 0.19957070565544377, "learning_rate": 1.2985791757705112e-05, "loss": 0.6959, "step": 3365 }, { "epoch": 0.4638518977323561, "grad_norm": 0.20259984789880758, "learning_rate": 1.2962853455290954e-05, "loss": 0.7325, "step": 3370 }, { "epoch": 0.4645401052957572, "grad_norm": 0.20754956012413098, "learning_rate": 1.2939898050365564e-05, "loss": 0.7184, "step": 3375 }, { "epoch": 0.46522831285915833, "grad_norm": 0.19277507821672074, "learning_rate": 1.2916925675434685e-05, "loss": 0.6339, "step": 3380 }, { "epoch": 0.46591652042255943, "grad_norm": 0.19965565258463597, "learning_rate": 1.2893936463102002e-05, "loss": 0.7106, "step": 3385 }, { "epoch": 0.4666047279859606, "grad_norm": 0.18888810238847878, "learning_rate": 1.2870930546068397e-05, "loss": 0.7314, "step": 3390 }, { "epoch": 0.4672929355493617, "grad_norm": 0.1917015598884196, "learning_rate": 1.2847908057131173e-05, "loss": 0.7535, "step": 3395 }, { "epoch": 0.4679811431127628, "grad_norm": 0.21336527413113837, "learning_rate": 1.2824869129183292e-05, "loss": 0.6771, "step": 3400 }, { "epoch": 0.4679811431127628, "eval_loss": 0.7295014262199402, "eval_runtime": 52.7354, "eval_samples_per_second": 94.813, "eval_steps_per_second": 2.977, "step": 3400 }, { "epoch": 0.46866935067616394, "grad_norm": 0.18998706828435286, "learning_rate": 1.2801813895212614e-05, "loss": 0.7051, "step": 3405 }, { "epoch": 0.46935755823956504, "grad_norm": 0.2112610605863021, "learning_rate": 1.2778742488301111e-05, "loss": 0.7334, "step": 3410 }, { "epoch": 0.4700457658029662, "grad_norm": 0.19226608739058665, "learning_rate": 1.2755655041624121e-05, "loss": 0.7348, "step": 3415 }, { "epoch": 0.4707339733663673, "grad_norm": 0.1891384859847331, "learning_rate": 1.2732551688449562e-05, "loss": 0.6482, "step": 3420 }, { "epoch": 0.4714221809297684, "grad_norm": 0.19014309365022622, "learning_rate": 1.2709432562137168e-05, "loss": 0.7075, "step": 3425 }, { "epoch": 0.47211038849316955, "grad_norm": 0.1997453430647959, "learning_rate": 1.2686297796137725e-05, "loss": 0.7111, "step": 3430 }, { "epoch": 0.47279859605657065, "grad_norm": 0.1846489911200262, "learning_rate": 1.2663147523992294e-05, "loss": 0.705, "step": 3435 }, { "epoch": 0.4734868036199718, "grad_norm": 0.20168662931632128, "learning_rate": 1.2639981879331446e-05, "loss": 0.7865, "step": 3440 }, { "epoch": 0.4741750111833729, "grad_norm": 0.21359675683768095, "learning_rate": 1.261680099587448e-05, "loss": 0.7026, "step": 3445 }, { "epoch": 0.474863218746774, "grad_norm": 0.1753857782046223, "learning_rate": 1.2593605007428666e-05, "loss": 0.7118, "step": 3450 }, { "epoch": 0.47555142631017516, "grad_norm": 0.23891577735297356, "learning_rate": 1.2570394047888452e-05, "loss": 0.7431, "step": 3455 }, { "epoch": 0.47623963387357626, "grad_norm": 0.18663142323306373, "learning_rate": 1.2547168251234722e-05, "loss": 0.7326, "step": 3460 }, { "epoch": 0.4769278414369774, "grad_norm": 0.19933661807414477, "learning_rate": 1.2523927751533988e-05, "loss": 0.6671, "step": 3465 }, { "epoch": 0.4776160490003785, "grad_norm": 0.2057332447630491, "learning_rate": 1.250067268293764e-05, "loss": 0.69, "step": 3470 }, { "epoch": 0.4783042565637796, "grad_norm": 0.21891410712833587, "learning_rate": 1.2477403179681167e-05, "loss": 0.8272, "step": 3475 }, { "epoch": 0.4789924641271808, "grad_norm": 0.20345103212410637, "learning_rate": 1.2454119376083374e-05, "loss": 0.6507, "step": 3480 }, { "epoch": 0.4796806716905819, "grad_norm": 0.20102680218034322, "learning_rate": 1.243082140654561e-05, "loss": 0.7517, "step": 3485 }, { "epoch": 0.480368879253983, "grad_norm": 0.20229232949872755, "learning_rate": 1.2407509405551004e-05, "loss": 0.7699, "step": 3490 }, { "epoch": 0.4810570868173841, "grad_norm": 0.19025224591551693, "learning_rate": 1.2384183507663667e-05, "loss": 0.7259, "step": 3495 }, { "epoch": 0.4817452943807852, "grad_norm": 0.1840318678566497, "learning_rate": 1.2360843847527941e-05, "loss": 0.7132, "step": 3500 }, { "epoch": 0.4824335019441864, "grad_norm": 0.22604835224477485, "learning_rate": 1.2337490559867591e-05, "loss": 0.743, "step": 3505 }, { "epoch": 0.4831217095075875, "grad_norm": 0.20450286485631108, "learning_rate": 1.2314123779485059e-05, "loss": 0.6908, "step": 3510 }, { "epoch": 0.48380991707098864, "grad_norm": 0.19947427270474852, "learning_rate": 1.2290743641260665e-05, "loss": 0.6622, "step": 3515 }, { "epoch": 0.48449812463438974, "grad_norm": 0.2097371575777961, "learning_rate": 1.2267350280151833e-05, "loss": 0.6926, "step": 3520 }, { "epoch": 0.48518633219779084, "grad_norm": 0.22183150514548064, "learning_rate": 1.224394383119232e-05, "loss": 0.7021, "step": 3525 }, { "epoch": 0.485874539761192, "grad_norm": 0.1660981326275928, "learning_rate": 1.222052442949142e-05, "loss": 0.6767, "step": 3530 }, { "epoch": 0.4865627473245931, "grad_norm": 0.21459159605923867, "learning_rate": 1.2197092210233205e-05, "loss": 0.6876, "step": 3535 }, { "epoch": 0.4872509548879942, "grad_norm": 0.2139833661366623, "learning_rate": 1.2173647308675725e-05, "loss": 0.72, "step": 3540 }, { "epoch": 0.48793916245139535, "grad_norm": 0.1842810422193067, "learning_rate": 1.2150189860150246e-05, "loss": 0.6276, "step": 3545 }, { "epoch": 0.48862737001479645, "grad_norm": 0.20671163141189355, "learning_rate": 1.2126720000060448e-05, "loss": 0.7331, "step": 3550 }, { "epoch": 0.4893155775781976, "grad_norm": 0.21903293414761826, "learning_rate": 1.210323786388166e-05, "loss": 0.7442, "step": 3555 }, { "epoch": 0.4900037851415987, "grad_norm": 0.18907991843677824, "learning_rate": 1.2079743587160078e-05, "loss": 0.7065, "step": 3560 }, { "epoch": 0.4906919927049998, "grad_norm": 0.1974809354519325, "learning_rate": 1.205623730551196e-05, "loss": 0.7272, "step": 3565 }, { "epoch": 0.49138020026840096, "grad_norm": 0.17318773218091726, "learning_rate": 1.2032719154622884e-05, "loss": 0.7171, "step": 3570 }, { "epoch": 0.49206840783180206, "grad_norm": 0.17614594275013068, "learning_rate": 1.2009189270246919e-05, "loss": 0.7442, "step": 3575 }, { "epoch": 0.4927566153952032, "grad_norm": 0.18296770656849906, "learning_rate": 1.1985647788205877e-05, "loss": 0.7139, "step": 3580 }, { "epoch": 0.4934448229586043, "grad_norm": 0.1667045221563047, "learning_rate": 1.196209484438851e-05, "loss": 0.7507, "step": 3585 }, { "epoch": 0.4941330305220054, "grad_norm": 0.17899761680751788, "learning_rate": 1.1938530574749732e-05, "loss": 0.6592, "step": 3590 }, { "epoch": 0.49482123808540657, "grad_norm": 0.20748958074126803, "learning_rate": 1.1914955115309828e-05, "loss": 0.6668, "step": 3595 }, { "epoch": 0.49550944564880767, "grad_norm": 0.2048813834701349, "learning_rate": 1.1891368602153686e-05, "loss": 0.7419, "step": 3600 }, { "epoch": 0.49550944564880767, "eval_loss": 0.7257180213928223, "eval_runtime": 52.728, "eval_samples_per_second": 94.826, "eval_steps_per_second": 2.978, "step": 3600 }, { "epoch": 0.4961976532122088, "grad_norm": 0.20301789693446085, "learning_rate": 1.1867771171429991e-05, "loss": 0.6967, "step": 3605 }, { "epoch": 0.4968858607756099, "grad_norm": 0.21780005785765175, "learning_rate": 1.1844162959350445e-05, "loss": 0.7268, "step": 3610 }, { "epoch": 0.497574068339011, "grad_norm": 0.2158563755483925, "learning_rate": 1.1820544102188989e-05, "loss": 0.7765, "step": 3615 }, { "epoch": 0.4982622759024122, "grad_norm": 0.19964481329438474, "learning_rate": 1.1796914736281013e-05, "loss": 0.8053, "step": 3620 }, { "epoch": 0.4989504834658133, "grad_norm": 0.21132409465754357, "learning_rate": 1.1773274998022557e-05, "loss": 0.7294, "step": 3625 }, { "epoch": 0.49963869102921443, "grad_norm": 0.2018152914799053, "learning_rate": 1.1749625023869546e-05, "loss": 0.671, "step": 3630 }, { "epoch": 0.5003268985926156, "grad_norm": 0.17415960026909627, "learning_rate": 1.1725964950336976e-05, "loss": 0.6483, "step": 3635 }, { "epoch": 0.5010151061560166, "grad_norm": 0.19957766172198385, "learning_rate": 1.170229491399815e-05, "loss": 0.7583, "step": 3640 }, { "epoch": 0.5017033137194178, "grad_norm": 0.19116981958052348, "learning_rate": 1.1678615051483882e-05, "loss": 0.7169, "step": 3645 }, { "epoch": 0.5023915212828189, "grad_norm": 0.18050679156301855, "learning_rate": 1.1654925499481686e-05, "loss": 0.7454, "step": 3650 }, { "epoch": 0.50307972884622, "grad_norm": 0.19071009358318072, "learning_rate": 1.1631226394735034e-05, "loss": 0.7236, "step": 3655 }, { "epoch": 0.5037679364096211, "grad_norm": 0.20758190571790927, "learning_rate": 1.1607517874042519e-05, "loss": 0.7397, "step": 3660 }, { "epoch": 0.5044561439730223, "grad_norm": 0.18374359122605288, "learning_rate": 1.1583800074257097e-05, "loss": 0.7174, "step": 3665 }, { "epoch": 0.5051443515364233, "grad_norm": 0.1763429762634088, "learning_rate": 1.1560073132285276e-05, "loss": 0.7538, "step": 3670 }, { "epoch": 0.5058325590998245, "grad_norm": 0.18743473164928476, "learning_rate": 1.1536337185086348e-05, "loss": 0.6769, "step": 3675 }, { "epoch": 0.5065207666632257, "grad_norm": 0.22379879701847064, "learning_rate": 1.1512592369671574e-05, "loss": 0.6861, "step": 3680 }, { "epoch": 0.5072089742266267, "grad_norm": 0.18354051807569818, "learning_rate": 1.1488838823103412e-05, "loss": 0.6171, "step": 3685 }, { "epoch": 0.5078971817900279, "grad_norm": 0.19902767728140927, "learning_rate": 1.146507668249472e-05, "loss": 0.7197, "step": 3690 }, { "epoch": 0.508585389353429, "grad_norm": 0.23101182529622763, "learning_rate": 1.1441306085007957e-05, "loss": 0.7221, "step": 3695 }, { "epoch": 0.5092735969168302, "grad_norm": 0.18703656371802269, "learning_rate": 1.14175271678544e-05, "loss": 0.6784, "step": 3700 }, { "epoch": 0.5099618044802312, "grad_norm": 0.20638878146584352, "learning_rate": 1.1393740068293355e-05, "loss": 0.739, "step": 3705 }, { "epoch": 0.5106500120436324, "grad_norm": 0.17148254731370824, "learning_rate": 1.1369944923631356e-05, "loss": 0.6259, "step": 3710 }, { "epoch": 0.5113382196070335, "grad_norm": 0.19385695801152034, "learning_rate": 1.1346141871221374e-05, "loss": 0.6754, "step": 3715 }, { "epoch": 0.5120264271704346, "grad_norm": 0.21242745842369185, "learning_rate": 1.1322331048462026e-05, "loss": 0.7567, "step": 3720 }, { "epoch": 0.5127146347338357, "grad_norm": 0.17652181777803497, "learning_rate": 1.1298512592796784e-05, "loss": 0.6835, "step": 3725 }, { "epoch": 0.5134028422972369, "grad_norm": 0.20511137425578288, "learning_rate": 1.1274686641713178e-05, "loss": 0.7622, "step": 3730 }, { "epoch": 0.5140910498606379, "grad_norm": 0.19892255958765262, "learning_rate": 1.1250853332742005e-05, "loss": 0.7068, "step": 3735 }, { "epoch": 0.5147792574240391, "grad_norm": 0.22002947413962234, "learning_rate": 1.1227012803456537e-05, "loss": 0.7097, "step": 3740 }, { "epoch": 0.5154674649874402, "grad_norm": 0.20764114199131012, "learning_rate": 1.1203165191471714e-05, "loss": 0.7645, "step": 3745 }, { "epoch": 0.5161556725508414, "grad_norm": 0.21793841540209058, "learning_rate": 1.1179310634443372e-05, "loss": 0.7554, "step": 3750 }, { "epoch": 0.5168438801142424, "grad_norm": 0.21072905433622258, "learning_rate": 1.1155449270067427e-05, "loss": 0.6971, "step": 3755 }, { "epoch": 0.5175320876776436, "grad_norm": 0.21940318340987733, "learning_rate": 1.113158123607909e-05, "loss": 0.7333, "step": 3760 }, { "epoch": 0.5182202952410447, "grad_norm": 0.20339485302326396, "learning_rate": 1.1107706670252079e-05, "loss": 0.6548, "step": 3765 }, { "epoch": 0.5189085028044458, "grad_norm": 0.1828137989841553, "learning_rate": 1.1083825710397804e-05, "loss": 0.7072, "step": 3770 }, { "epoch": 0.5195967103678469, "grad_norm": 0.17399436008133762, "learning_rate": 1.1059938494364598e-05, "loss": 0.7242, "step": 3775 }, { "epoch": 0.5202849179312481, "grad_norm": 0.19942397134184592, "learning_rate": 1.1036045160036886e-05, "loss": 0.6963, "step": 3780 }, { "epoch": 0.5209731254946491, "grad_norm": 0.19238292205391158, "learning_rate": 1.101214584533443e-05, "loss": 0.6977, "step": 3785 }, { "epoch": 0.5216613330580503, "grad_norm": 0.2663461640731902, "learning_rate": 1.0988240688211502e-05, "loss": 0.6308, "step": 3790 }, { "epoch": 0.5223495406214514, "grad_norm": 0.20571720806499427, "learning_rate": 1.09643298266561e-05, "loss": 0.698, "step": 3795 }, { "epoch": 0.5230377481848526, "grad_norm": 0.19850319498735924, "learning_rate": 1.0940413398689154e-05, "loss": 0.71, "step": 3800 }, { "epoch": 0.5230377481848526, "eval_loss": 0.7223002910614014, "eval_runtime": 52.7513, "eval_samples_per_second": 94.784, "eval_steps_per_second": 2.976, "step": 3800 }, { "epoch": 0.5237259557482536, "grad_norm": 0.21710139249740856, "learning_rate": 1.0916491542363714e-05, "loss": 0.7602, "step": 3805 }, { "epoch": 0.5244141633116548, "grad_norm": 0.19188546122822303, "learning_rate": 1.0892564395764177e-05, "loss": 0.7101, "step": 3810 }, { "epoch": 0.525102370875056, "grad_norm": 0.19577535946258096, "learning_rate": 1.0868632097005468e-05, "loss": 0.7429, "step": 3815 }, { "epoch": 0.525790578438457, "grad_norm": 0.20086984604968092, "learning_rate": 1.0844694784232261e-05, "loss": 0.7342, "step": 3820 }, { "epoch": 0.5264787860018582, "grad_norm": 0.19290900890656384, "learning_rate": 1.082075259561816e-05, "loss": 0.6813, "step": 3825 }, { "epoch": 0.5271669935652593, "grad_norm": 0.19974310506286086, "learning_rate": 1.0796805669364925e-05, "loss": 0.691, "step": 3830 }, { "epoch": 0.5278552011286604, "grad_norm": 0.2151830867233287, "learning_rate": 1.0772854143701661e-05, "loss": 0.7206, "step": 3835 }, { "epoch": 0.5285434086920615, "grad_norm": 0.208790673050775, "learning_rate": 1.0748898156884011e-05, "loss": 0.6946, "step": 3840 }, { "epoch": 0.5292316162554627, "grad_norm": 0.20957712508589568, "learning_rate": 1.0724937847193391e-05, "loss": 0.7341, "step": 3845 }, { "epoch": 0.5299198238188638, "grad_norm": 0.21424893348529656, "learning_rate": 1.0700973352936146e-05, "loss": 0.698, "step": 3850 }, { "epoch": 0.5306080313822649, "grad_norm": 0.19585302281853811, "learning_rate": 1.0677004812442792e-05, "loss": 0.6893, "step": 3855 }, { "epoch": 0.531296238945666, "grad_norm": 0.2105344658466114, "learning_rate": 1.0653032364067197e-05, "loss": 0.7437, "step": 3860 }, { "epoch": 0.5319844465090672, "grad_norm": 0.18484455689004856, "learning_rate": 1.0629056146185784e-05, "loss": 0.6999, "step": 3865 }, { "epoch": 0.5326726540724682, "grad_norm": 0.20708065287970281, "learning_rate": 1.0605076297196735e-05, "loss": 0.7251, "step": 3870 }, { "epoch": 0.5333608616358694, "grad_norm": 0.18551938683172833, "learning_rate": 1.0581092955519195e-05, "loss": 0.6619, "step": 3875 }, { "epoch": 0.5340490691992705, "grad_norm": 0.22451853503852692, "learning_rate": 1.0557106259592468e-05, "loss": 0.718, "step": 3880 }, { "epoch": 0.5347372767626716, "grad_norm": 0.18870487306966632, "learning_rate": 1.0533116347875218e-05, "loss": 0.7137, "step": 3885 }, { "epoch": 0.5354254843260727, "grad_norm": 0.1870075102248364, "learning_rate": 1.0509123358844675e-05, "loss": 0.7241, "step": 3890 }, { "epoch": 0.5361136918894739, "grad_norm": 0.19315776674575108, "learning_rate": 1.048512743099583e-05, "loss": 0.7616, "step": 3895 }, { "epoch": 0.536801899452875, "grad_norm": 0.18235793985202783, "learning_rate": 1.0461128702840637e-05, "loss": 0.7011, "step": 3900 }, { "epoch": 0.5374901070162761, "grad_norm": 0.18791560763258985, "learning_rate": 1.0437127312907218e-05, "loss": 0.7095, "step": 3905 }, { "epoch": 0.5381783145796772, "grad_norm": 0.1992410147395156, "learning_rate": 1.0413123399739058e-05, "loss": 0.7482, "step": 3910 }, { "epoch": 0.5388665221430784, "grad_norm": 0.19454617873447524, "learning_rate": 1.0389117101894201e-05, "loss": 0.7185, "step": 3915 }, { "epoch": 0.5395547297064794, "grad_norm": 0.186967162302862, "learning_rate": 1.0365108557944469e-05, "loss": 0.7636, "step": 3920 }, { "epoch": 0.5402429372698806, "grad_norm": 0.1902395967709683, "learning_rate": 1.0341097906474628e-05, "loss": 0.6945, "step": 3925 }, { "epoch": 0.5409311448332818, "grad_norm": 0.2260490046059926, "learning_rate": 1.0317085286081641e-05, "loss": 0.7449, "step": 3930 }, { "epoch": 0.5416193523966828, "grad_norm": 0.193851094325017, "learning_rate": 1.0293070835373802e-05, "loss": 0.7126, "step": 3935 }, { "epoch": 0.542307559960084, "grad_norm": 0.20544055764539737, "learning_rate": 1.0269054692969996e-05, "loss": 0.7266, "step": 3940 }, { "epoch": 0.5429957675234851, "grad_norm": 0.18002691732755227, "learning_rate": 1.0245036997498858e-05, "loss": 0.7128, "step": 3945 }, { "epoch": 0.5436839750868862, "grad_norm": 0.22150227013391063, "learning_rate": 1.0221017887597991e-05, "loss": 0.7049, "step": 3950 }, { "epoch": 0.5443721826502873, "grad_norm": 0.19596692331518673, "learning_rate": 1.0196997501913174e-05, "loss": 0.6888, "step": 3955 }, { "epoch": 0.5450603902136885, "grad_norm": 0.18169354826353964, "learning_rate": 1.0172975979097527e-05, "loss": 0.7502, "step": 3960 }, { "epoch": 0.5457485977770896, "grad_norm": 0.17634975141047946, "learning_rate": 1.014895345781076e-05, "loss": 0.6573, "step": 3965 }, { "epoch": 0.5464368053404907, "grad_norm": 0.20852202335220715, "learning_rate": 1.0124930076718326e-05, "loss": 0.6802, "step": 3970 }, { "epoch": 0.5471250129038918, "grad_norm": 0.1910422872815004, "learning_rate": 1.010090597449065e-05, "loss": 0.723, "step": 3975 }, { "epoch": 0.547813220467293, "grad_norm": 0.1623599966191097, "learning_rate": 1.0076881289802321e-05, "loss": 0.6618, "step": 3980 }, { "epoch": 0.548501428030694, "grad_norm": 0.213513963534071, "learning_rate": 1.0052856161331285e-05, "loss": 0.692, "step": 3985 }, { "epoch": 0.5491896355940952, "grad_norm": 0.19369320437016782, "learning_rate": 1.0028830727758059e-05, "loss": 0.6615, "step": 3990 }, { "epoch": 0.5498778431574963, "grad_norm": 0.19992044452416324, "learning_rate": 1.0004805127764908e-05, "loss": 0.7016, "step": 3995 }, { "epoch": 0.5505660507208974, "grad_norm": 0.18598264917512458, "learning_rate": 9.980779500035062e-06, "loss": 0.6362, "step": 4000 }, { "epoch": 0.5505660507208974, "eval_loss": 0.7189182639122009, "eval_runtime": 52.741, "eval_samples_per_second": 94.803, "eval_steps_per_second": 2.977, "step": 4000 }, { "epoch": 0.5512542582842985, "grad_norm": 0.1870639091016689, "learning_rate": 9.95675398325192e-06, "loss": 0.6952, "step": 4005 }, { "epoch": 0.5519424658476997, "grad_norm": 0.2077676830803775, "learning_rate": 9.932728716098227e-06, "loss": 0.7294, "step": 4010 }, { "epoch": 0.5526306734111008, "grad_norm": 0.2064719778902339, "learning_rate": 9.908703837255304e-06, "loss": 0.7727, "step": 4015 }, { "epoch": 0.5533188809745019, "grad_norm": 0.18957837577428344, "learning_rate": 9.88467948540221e-06, "loss": 0.7317, "step": 4020 }, { "epoch": 0.554007088537903, "grad_norm": 0.20290712994202398, "learning_rate": 9.86065579921498e-06, "loss": 0.6922, "step": 4025 }, { "epoch": 0.5546952961013042, "grad_norm": 0.18656962986211925, "learning_rate": 9.836632917365792e-06, "loss": 0.6322, "step": 4030 }, { "epoch": 0.5553835036647052, "grad_norm": 0.19532781721618336, "learning_rate": 9.812610978522194e-06, "loss": 0.7527, "step": 4035 }, { "epoch": 0.5560717112281064, "grad_norm": 0.1939428400668435, "learning_rate": 9.788590121346285e-06, "loss": 0.6676, "step": 4040 }, { "epoch": 0.5567599187915075, "grad_norm": 0.20749116911089652, "learning_rate": 9.764570484493916e-06, "loss": 0.7414, "step": 4045 }, { "epoch": 0.5574481263549086, "grad_norm": 0.20083968529615154, "learning_rate": 9.740552206613896e-06, "loss": 0.7968, "step": 4050 }, { "epoch": 0.5581363339183097, "grad_norm": 0.20011435748233797, "learning_rate": 9.716535426347198e-06, "loss": 0.7417, "step": 4055 }, { "epoch": 0.5588245414817109, "grad_norm": 0.1803593074079462, "learning_rate": 9.692520282326146e-06, "loss": 0.7119, "step": 4060 }, { "epoch": 0.5595127490451121, "grad_norm": 0.20187365650638342, "learning_rate": 9.668506913173609e-06, "loss": 0.7138, "step": 4065 }, { "epoch": 0.5602009566085131, "grad_norm": 0.19668980092142224, "learning_rate": 9.644495457502222e-06, "loss": 0.7208, "step": 4070 }, { "epoch": 0.5608891641719143, "grad_norm": 0.19303276515844067, "learning_rate": 9.620486053913576e-06, "loss": 0.69, "step": 4075 }, { "epoch": 0.5615773717353154, "grad_norm": 0.18400504647499435, "learning_rate": 9.596478840997407e-06, "loss": 0.7322, "step": 4080 }, { "epoch": 0.5622655792987165, "grad_norm": 0.17568748242031135, "learning_rate": 9.572473957330814e-06, "loss": 0.6963, "step": 4085 }, { "epoch": 0.5629537868621176, "grad_norm": 0.1934902469589795, "learning_rate": 9.548471541477447e-06, "loss": 0.7442, "step": 4090 }, { "epoch": 0.5636419944255188, "grad_norm": 0.19516289639181503, "learning_rate": 9.52447173198671e-06, "loss": 0.7405, "step": 4095 }, { "epoch": 0.5643302019889198, "grad_norm": 0.17782591195753142, "learning_rate": 9.500474667392972e-06, "loss": 0.6793, "step": 4100 }, { "epoch": 0.565018409552321, "grad_norm": 0.18788022791144948, "learning_rate": 9.476480486214735e-06, "loss": 0.7325, "step": 4105 }, { "epoch": 0.5657066171157221, "grad_norm": 0.18632975839490698, "learning_rate": 9.45248932695389e-06, "loss": 0.7002, "step": 4110 }, { "epoch": 0.5663948246791233, "grad_norm": 0.21900311589925658, "learning_rate": 9.428501328094855e-06, "loss": 0.7305, "step": 4115 }, { "epoch": 0.5670830322425243, "grad_norm": 0.21427272938073105, "learning_rate": 9.404516628103824e-06, "loss": 0.709, "step": 4120 }, { "epoch": 0.5677712398059255, "grad_norm": 0.17199248152840274, "learning_rate": 9.380535365427933e-06, "loss": 0.7047, "step": 4125 }, { "epoch": 0.5684594473693266, "grad_norm": 0.19348521270336058, "learning_rate": 9.35655767849449e-06, "loss": 0.7554, "step": 4130 }, { "epoch": 0.5691476549327277, "grad_norm": 0.17922952609300785, "learning_rate": 9.332583705710163e-06, "loss": 0.6975, "step": 4135 }, { "epoch": 0.5698358624961288, "grad_norm": 0.20801864425372835, "learning_rate": 9.308613585460167e-06, "loss": 0.7171, "step": 4140 }, { "epoch": 0.57052407005953, "grad_norm": 0.19665632032344266, "learning_rate": 9.284647456107494e-06, "loss": 0.739, "step": 4145 }, { "epoch": 0.571212277622931, "grad_norm": 0.1790315879059953, "learning_rate": 9.26068545599209e-06, "loss": 0.6663, "step": 4150 }, { "epoch": 0.5719004851863322, "grad_norm": 0.21328223231318577, "learning_rate": 9.236727723430067e-06, "loss": 0.7369, "step": 4155 }, { "epoch": 0.5725886927497333, "grad_norm": 0.22230433536937821, "learning_rate": 9.212774396712909e-06, "loss": 0.6674, "step": 4160 }, { "epoch": 0.5732769003131345, "grad_norm": 0.17811301169605204, "learning_rate": 9.188825614106663e-06, "loss": 0.6765, "step": 4165 }, { "epoch": 0.5739651078765355, "grad_norm": 0.17887486012089068, "learning_rate": 9.164881513851152e-06, "loss": 0.7336, "step": 4170 }, { "epoch": 0.5746533154399367, "grad_norm": 0.1808756609761279, "learning_rate": 9.140942234159159e-06, "loss": 0.6865, "step": 4175 }, { "epoch": 0.5753415230033379, "grad_norm": 0.19389917568741993, "learning_rate": 9.117007913215655e-06, "loss": 0.7237, "step": 4180 }, { "epoch": 0.5760297305667389, "grad_norm": 0.20025219631250624, "learning_rate": 9.093078689176973e-06, "loss": 0.7007, "step": 4185 }, { "epoch": 0.57671793813014, "grad_norm": 0.2248966075132312, "learning_rate": 9.06915470017004e-06, "loss": 0.7403, "step": 4190 }, { "epoch": 0.5774061456935412, "grad_norm": 0.19541814595571855, "learning_rate": 9.045236084291557e-06, "loss": 0.6544, "step": 4195 }, { "epoch": 0.5780943532569423, "grad_norm": 0.20037795354637292, "learning_rate": 9.021322979607208e-06, "loss": 0.7616, "step": 4200 }, { "epoch": 0.5780943532569423, "eval_loss": 0.7159020304679871, "eval_runtime": 52.7513, "eval_samples_per_second": 94.784, "eval_steps_per_second": 2.976, "step": 4200 }, { "epoch": 0.5787825608203434, "grad_norm": 0.2109353858513512, "learning_rate": 8.997415524150868e-06, "loss": 0.6956, "step": 4205 }, { "epoch": 0.5794707683837446, "grad_norm": 0.18553203189563902, "learning_rate": 8.973513855923805e-06, "loss": 0.7114, "step": 4210 }, { "epoch": 0.5801589759471456, "grad_norm": 0.19681427530783205, "learning_rate": 8.949618112893873e-06, "loss": 0.6242, "step": 4215 }, { "epoch": 0.5808471835105468, "grad_norm": 0.19548269232792448, "learning_rate": 8.925728432994737e-06, "loss": 0.7065, "step": 4220 }, { "epoch": 0.5815353910739479, "grad_norm": 0.18473949631967448, "learning_rate": 8.901844954125048e-06, "loss": 0.6943, "step": 4225 }, { "epoch": 0.5822235986373491, "grad_norm": 0.18742993149994672, "learning_rate": 8.87796781414768e-06, "loss": 0.7297, "step": 4230 }, { "epoch": 0.5829118062007501, "grad_norm": 0.16773751875957418, "learning_rate": 8.854097150888898e-06, "loss": 0.7298, "step": 4235 }, { "epoch": 0.5836000137641513, "grad_norm": 0.19163726387685784, "learning_rate": 8.8302331021376e-06, "loss": 0.7007, "step": 4240 }, { "epoch": 0.5842882213275524, "grad_norm": 0.19410393777561452, "learning_rate": 8.806375805644487e-06, "loss": 0.6931, "step": 4245 }, { "epoch": 0.5849764288909535, "grad_norm": 0.1890281957288065, "learning_rate": 8.782525399121294e-06, "loss": 0.6909, "step": 4250 }, { "epoch": 0.5856646364543546, "grad_norm": 0.1749717804198359, "learning_rate": 8.758682020239983e-06, "loss": 0.7106, "step": 4255 }, { "epoch": 0.5863528440177558, "grad_norm": 0.22087239511463866, "learning_rate": 8.734845806631947e-06, "loss": 0.7086, "step": 4260 }, { "epoch": 0.5870410515811568, "grad_norm": 0.20679569967089562, "learning_rate": 8.711016895887223e-06, "loss": 0.6586, "step": 4265 }, { "epoch": 0.587729259144558, "grad_norm": 0.21358138454104006, "learning_rate": 8.687195425553688e-06, "loss": 0.7765, "step": 4270 }, { "epoch": 0.5884174667079591, "grad_norm": 0.1893669402062755, "learning_rate": 8.663381533136283e-06, "loss": 0.7402, "step": 4275 }, { "epoch": 0.5891056742713603, "grad_norm": 0.2069650296504844, "learning_rate": 8.63957535609619e-06, "loss": 0.6756, "step": 4280 }, { "epoch": 0.5897938818347613, "grad_norm": 0.18377309186495303, "learning_rate": 8.615777031850064e-06, "loss": 0.7253, "step": 4285 }, { "epoch": 0.5904820893981625, "grad_norm": 0.20310498832827714, "learning_rate": 8.591986697769237e-06, "loss": 0.7541, "step": 4290 }, { "epoch": 0.5911702969615636, "grad_norm": 0.19804727438428799, "learning_rate": 8.568204491178907e-06, "loss": 0.6593, "step": 4295 }, { "epoch": 0.5918585045249647, "grad_norm": 0.20341423825341978, "learning_rate": 8.544430549357368e-06, "loss": 0.7025, "step": 4300 }, { "epoch": 0.5925467120883658, "grad_norm": 0.19716437993961589, "learning_rate": 8.5206650095352e-06, "loss": 0.7174, "step": 4305 }, { "epoch": 0.593234919651767, "grad_norm": 0.1990007427057575, "learning_rate": 8.496908008894486e-06, "loss": 0.7105, "step": 4310 }, { "epoch": 0.593923127215168, "grad_norm": 0.17983827916890696, "learning_rate": 8.473159684568025e-06, "loss": 0.7096, "step": 4315 }, { "epoch": 0.5946113347785692, "grad_norm": 0.20399518215672602, "learning_rate": 8.44942017363852e-06, "loss": 0.6737, "step": 4320 }, { "epoch": 0.5952995423419704, "grad_norm": 0.2049946998918872, "learning_rate": 8.425689613137812e-06, "loss": 0.7381, "step": 4325 }, { "epoch": 0.5959877499053715, "grad_norm": 0.1809340220238709, "learning_rate": 8.401968140046075e-06, "loss": 0.6283, "step": 4330 }, { "epoch": 0.5966759574687726, "grad_norm": 0.19457017689450828, "learning_rate": 8.378255891291028e-06, "loss": 0.6998, "step": 4335 }, { "epoch": 0.5973641650321737, "grad_norm": 0.19063960477689923, "learning_rate": 8.354553003747136e-06, "loss": 0.6601, "step": 4340 }, { "epoch": 0.5980523725955749, "grad_norm": 0.1657140263783586, "learning_rate": 8.330859614234839e-06, "loss": 0.682, "step": 4345 }, { "epoch": 0.5987405801589759, "grad_norm": 0.1993138439519459, "learning_rate": 8.307175859519747e-06, "loss": 0.7636, "step": 4350 }, { "epoch": 0.5994287877223771, "grad_norm": 0.20748035215179747, "learning_rate": 8.283501876311851e-06, "loss": 0.8094, "step": 4355 }, { "epoch": 0.6001169952857782, "grad_norm": 0.17847205487134624, "learning_rate": 8.259837801264745e-06, "loss": 0.718, "step": 4360 }, { "epoch": 0.6008052028491793, "grad_norm": 0.18883104393696667, "learning_rate": 8.236183770974828e-06, "loss": 0.6996, "step": 4365 }, { "epoch": 0.6014934104125804, "grad_norm": 0.2166010092586326, "learning_rate": 8.212539921980514e-06, "loss": 0.7238, "step": 4370 }, { "epoch": 0.6021816179759816, "grad_norm": 0.17877956102105272, "learning_rate": 8.188906390761452e-06, "loss": 0.686, "step": 4375 }, { "epoch": 0.6028698255393827, "grad_norm": 0.18104829272562242, "learning_rate": 8.165283313737724e-06, "loss": 0.6681, "step": 4380 }, { "epoch": 0.6035580331027838, "grad_norm": 0.20056744936658638, "learning_rate": 8.141670827269083e-06, "loss": 0.6788, "step": 4385 }, { "epoch": 0.6042462406661849, "grad_norm": 0.18549948325237078, "learning_rate": 8.118069067654139e-06, "loss": 0.7413, "step": 4390 }, { "epoch": 0.6049344482295861, "grad_norm": 0.19832102137077345, "learning_rate": 8.094478171129588e-06, "loss": 0.6422, "step": 4395 }, { "epoch": 0.6056226557929871, "grad_norm": 0.17801328332326272, "learning_rate": 8.070898273869413e-06, "loss": 0.676, "step": 4400 }, { "epoch": 0.6056226557929871, "eval_loss": 0.7125820517539978, "eval_runtime": 52.7366, "eval_samples_per_second": 94.811, "eval_steps_per_second": 2.977, "step": 4400 }, { "epoch": 0.6063108633563883, "grad_norm": 0.18442026455371746, "learning_rate": 8.047329511984116e-06, "loss": 0.691, "step": 4405 }, { "epoch": 0.6069990709197894, "grad_norm": 0.19716515739085574, "learning_rate": 8.023772021519915e-06, "loss": 0.7268, "step": 4410 }, { "epoch": 0.6076872784831905, "grad_norm": 0.18782105358588455, "learning_rate": 8.00022593845797e-06, "loss": 0.6676, "step": 4415 }, { "epoch": 0.6083754860465916, "grad_norm": 0.2342789086582858, "learning_rate": 7.97669139871359e-06, "loss": 0.7009, "step": 4420 }, { "epoch": 0.6090636936099928, "grad_norm": 0.19716924549505815, "learning_rate": 7.95316853813546e-06, "loss": 0.7121, "step": 4425 }, { "epoch": 0.609751901173394, "grad_norm": 0.17721030403981378, "learning_rate": 7.929657492504838e-06, "loss": 0.6855, "step": 4430 }, { "epoch": 0.610440108736795, "grad_norm": 0.20996002453353285, "learning_rate": 7.906158397534789e-06, "loss": 0.7238, "step": 4435 }, { "epoch": 0.6111283163001962, "grad_norm": 0.18270043395097404, "learning_rate": 7.882671388869398e-06, "loss": 0.699, "step": 4440 }, { "epoch": 0.6118165238635973, "grad_norm": 0.21340317941501827, "learning_rate": 7.85919660208298e-06, "loss": 0.7113, "step": 4445 }, { "epoch": 0.6125047314269984, "grad_norm": 0.1913013833757653, "learning_rate": 7.835734172679301e-06, "loss": 0.6597, "step": 4450 }, { "epoch": 0.6131929389903995, "grad_norm": 0.1983479934100899, "learning_rate": 7.812284236090797e-06, "loss": 0.6589, "step": 4455 }, { "epoch": 0.6138811465538007, "grad_norm": 0.19535129890745362, "learning_rate": 7.788846927677794e-06, "loss": 0.749, "step": 4460 }, { "epoch": 0.6145693541172017, "grad_norm": 0.2243367183738196, "learning_rate": 7.76542238272772e-06, "loss": 0.7287, "step": 4465 }, { "epoch": 0.6152575616806029, "grad_norm": 0.20835175129331468, "learning_rate": 7.742010736454331e-06, "loss": 0.7041, "step": 4470 }, { "epoch": 0.615945769244004, "grad_norm": 0.20057897524750573, "learning_rate": 7.718612123996927e-06, "loss": 0.733, "step": 4475 }, { "epoch": 0.6166339768074051, "grad_norm": 0.21499905332341174, "learning_rate": 7.695226680419576e-06, "loss": 0.722, "step": 4480 }, { "epoch": 0.6173221843708062, "grad_norm": 0.21342711180918972, "learning_rate": 7.671854540710327e-06, "loss": 0.7024, "step": 4485 }, { "epoch": 0.6180103919342074, "grad_norm": 0.18705212215530592, "learning_rate": 7.64849583978043e-06, "loss": 0.6813, "step": 4490 }, { "epoch": 0.6186985994976085, "grad_norm": 0.1993400276182675, "learning_rate": 7.625150712463578e-06, "loss": 0.7665, "step": 4495 }, { "epoch": 0.6193868070610096, "grad_norm": 0.1900049161400563, "learning_rate": 7.6018192935151e-06, "loss": 0.7094, "step": 4500 }, { "epoch": 0.6200750146244107, "grad_norm": 0.19958844275111823, "learning_rate": 7.578501717611198e-06, "loss": 0.7328, "step": 4505 }, { "epoch": 0.6207632221878119, "grad_norm": 0.20080243822046528, "learning_rate": 7.55519811934817e-06, "loss": 0.7187, "step": 4510 }, { "epoch": 0.6214514297512129, "grad_norm": 0.20365647691131766, "learning_rate": 7.531908633241629e-06, "loss": 0.755, "step": 4515 }, { "epoch": 0.6221396373146141, "grad_norm": 0.1821586759802284, "learning_rate": 7.508633393725733e-06, "loss": 0.6967, "step": 4520 }, { "epoch": 0.6228278448780152, "grad_norm": 0.22526201477452987, "learning_rate": 7.4853725351523935e-06, "loss": 0.7135, "step": 4525 }, { "epoch": 0.6235160524414163, "grad_norm": 0.1666906950615596, "learning_rate": 7.462126191790522e-06, "loss": 0.7569, "step": 4530 }, { "epoch": 0.6242042600048174, "grad_norm": 0.19853549089007536, "learning_rate": 7.438894497825235e-06, "loss": 0.737, "step": 4535 }, { "epoch": 0.6248924675682186, "grad_norm": 0.19790048314749545, "learning_rate": 7.415677587357097e-06, "loss": 0.6724, "step": 4540 }, { "epoch": 0.6255806751316197, "grad_norm": 0.19373030317184212, "learning_rate": 7.3924755944013294e-06, "loss": 0.683, "step": 4545 }, { "epoch": 0.6262688826950208, "grad_norm": 0.217080679705782, "learning_rate": 7.3692886528870525e-06, "loss": 0.6961, "step": 4550 }, { "epoch": 0.626957090258422, "grad_norm": 0.22621398284588756, "learning_rate": 7.3461168966564964e-06, "loss": 0.7308, "step": 4555 }, { "epoch": 0.6276452978218231, "grad_norm": 0.19123422300411355, "learning_rate": 7.322960459464244e-06, "loss": 0.6967, "step": 4560 }, { "epoch": 0.6283335053852241, "grad_norm": 0.20478189389305237, "learning_rate": 7.299819474976453e-06, "loss": 0.7375, "step": 4565 }, { "epoch": 0.6290217129486253, "grad_norm": 0.17955787445402152, "learning_rate": 7.276694076770073e-06, "loss": 0.72, "step": 4570 }, { "epoch": 0.6297099205120265, "grad_norm": 0.19282588244552165, "learning_rate": 7.253584398332094e-06, "loss": 0.6819, "step": 4575 }, { "epoch": 0.6303981280754275, "grad_norm": 0.19173715612656989, "learning_rate": 7.230490573058767e-06, "loss": 0.6967, "step": 4580 }, { "epoch": 0.6310863356388287, "grad_norm": 0.1928138980185156, "learning_rate": 7.207412734254827e-06, "loss": 0.7148, "step": 4585 }, { "epoch": 0.6317745432022298, "grad_norm": 0.1858577522596576, "learning_rate": 7.184351015132738e-06, "loss": 0.7101, "step": 4590 }, { "epoch": 0.632462750765631, "grad_norm": 0.1956797996841857, "learning_rate": 7.161305548811908e-06, "loss": 0.6883, "step": 4595 }, { "epoch": 0.633150958329032, "grad_norm": 0.19607141329913197, "learning_rate": 7.138276468317933e-06, "loss": 0.6732, "step": 4600 }, { "epoch": 0.633150958329032, "eval_loss": 0.7093552947044373, "eval_runtime": 52.735, "eval_samples_per_second": 94.814, "eval_steps_per_second": 2.977, "step": 4600 }, { "epoch": 0.6338391658924332, "grad_norm": 0.19022020821267693, "learning_rate": 7.115263906581828e-06, "loss": 0.6804, "step": 4605 }, { "epoch": 0.6345273734558343, "grad_norm": 0.21244114591571706, "learning_rate": 7.0922679964392546e-06, "loss": 0.6429, "step": 4610 }, { "epoch": 0.6352155810192354, "grad_norm": 0.18450917727095142, "learning_rate": 7.069288870629749e-06, "loss": 0.7188, "step": 4615 }, { "epoch": 0.6359037885826365, "grad_norm": 0.18627063007531697, "learning_rate": 7.0463266617959745e-06, "loss": 0.6759, "step": 4620 }, { "epoch": 0.6365919961460377, "grad_norm": 0.18298301781696022, "learning_rate": 7.023381502482938e-06, "loss": 0.6859, "step": 4625 }, { "epoch": 0.6372802037094387, "grad_norm": 0.20442358521100906, "learning_rate": 7.0004535251372295e-06, "loss": 0.7436, "step": 4630 }, { "epoch": 0.6379684112728399, "grad_norm": 0.17746907625327848, "learning_rate": 6.977542862106262e-06, "loss": 0.6581, "step": 4635 }, { "epoch": 0.638656618836241, "grad_norm": 0.19387366364898667, "learning_rate": 6.95464964563751e-06, "loss": 0.7261, "step": 4640 }, { "epoch": 0.6393448263996422, "grad_norm": 0.20127278006452534, "learning_rate": 6.93177400787773e-06, "loss": 0.6688, "step": 4645 }, { "epoch": 0.6400330339630432, "grad_norm": 0.18062170947925887, "learning_rate": 6.9089160808722185e-06, "loss": 0.6931, "step": 4650 }, { "epoch": 0.6407212415264444, "grad_norm": 0.17972450698705236, "learning_rate": 6.88607599656403e-06, "loss": 0.7585, "step": 4655 }, { "epoch": 0.6414094490898455, "grad_norm": 0.2054243298226975, "learning_rate": 6.863253886793242e-06, "loss": 0.7134, "step": 4660 }, { "epoch": 0.6420976566532466, "grad_norm": 0.1949311518593927, "learning_rate": 6.840449883296159e-06, "loss": 0.7194, "step": 4665 }, { "epoch": 0.6427858642166477, "grad_norm": 0.1905621332377741, "learning_rate": 6.817664117704583e-06, "loss": 0.6523, "step": 4670 }, { "epoch": 0.6434740717800489, "grad_norm": 0.1974406684802213, "learning_rate": 6.794896721545032e-06, "loss": 0.6821, "step": 4675 }, { "epoch": 0.6441622793434499, "grad_norm": 0.18373148612277038, "learning_rate": 6.772147826237998e-06, "loss": 0.6912, "step": 4680 }, { "epoch": 0.6448504869068511, "grad_norm": 0.22418233658388936, "learning_rate": 6.749417563097176e-06, "loss": 0.706, "step": 4685 }, { "epoch": 0.6455386944702523, "grad_norm": 0.17272513035474268, "learning_rate": 6.72670606332871e-06, "loss": 0.6953, "step": 4690 }, { "epoch": 0.6462269020336534, "grad_norm": 0.1975385042424326, "learning_rate": 6.7040134580304364e-06, "loss": 0.7117, "step": 4695 }, { "epoch": 0.6469151095970544, "grad_norm": 0.23804232557192698, "learning_rate": 6.681339878191132e-06, "loss": 0.6952, "step": 4700 }, { "epoch": 0.6476033171604556, "grad_norm": 0.1697980897842037, "learning_rate": 6.658685454689744e-06, "loss": 0.6609, "step": 4705 }, { "epoch": 0.6482915247238568, "grad_norm": 0.15978284622354497, "learning_rate": 6.636050318294646e-06, "loss": 0.6451, "step": 4710 }, { "epoch": 0.6489797322872578, "grad_norm": 0.18539099201256026, "learning_rate": 6.613434599662887e-06, "loss": 0.6863, "step": 4715 }, { "epoch": 0.649667939850659, "grad_norm": 0.19686242608140375, "learning_rate": 6.590838429339426e-06, "loss": 0.6853, "step": 4720 }, { "epoch": 0.6503561474140601, "grad_norm": 0.21821104040719008, "learning_rate": 6.56826193775638e-06, "loss": 0.7344, "step": 4725 }, { "epoch": 0.6510443549774612, "grad_norm": 0.21790228388775143, "learning_rate": 6.545705255232275e-06, "loss": 0.7275, "step": 4730 }, { "epoch": 0.6517325625408623, "grad_norm": 0.15210385380648567, "learning_rate": 6.5231685119713e-06, "loss": 0.7039, "step": 4735 }, { "epoch": 0.6524207701042635, "grad_norm": 0.21671425553180715, "learning_rate": 6.500651838062538e-06, "loss": 0.7134, "step": 4740 }, { "epoch": 0.6531089776676645, "grad_norm": 0.1760976970162948, "learning_rate": 6.478155363479236e-06, "loss": 0.6648, "step": 4745 }, { "epoch": 0.6537971852310657, "grad_norm": 0.19133216339820874, "learning_rate": 6.455679218078033e-06, "loss": 0.7049, "step": 4750 }, { "epoch": 0.6544853927944668, "grad_norm": 0.17428668324023153, "learning_rate": 6.433223531598232e-06, "loss": 0.6894, "step": 4755 }, { "epoch": 0.655173600357868, "grad_norm": 0.19671522085948878, "learning_rate": 6.410788433661037e-06, "loss": 0.6786, "step": 4760 }, { "epoch": 0.655861807921269, "grad_norm": 0.19196357492864294, "learning_rate": 6.388374053768801e-06, "loss": 0.6957, "step": 4765 }, { "epoch": 0.6565500154846702, "grad_norm": 0.18356800244946278, "learning_rate": 6.3659805213043005e-06, "loss": 0.6859, "step": 4770 }, { "epoch": 0.6572382230480713, "grad_norm": 0.20972084277538136, "learning_rate": 6.343607965529963e-06, "loss": 0.6656, "step": 4775 }, { "epoch": 0.6579264306114724, "grad_norm": 0.20370059359746825, "learning_rate": 6.3212565155871385e-06, "loss": 0.6877, "step": 4780 }, { "epoch": 0.6586146381748735, "grad_norm": 0.19989506758190317, "learning_rate": 6.298926300495341e-06, "loss": 0.7286, "step": 4785 }, { "epoch": 0.6593028457382747, "grad_norm": 0.1886305479403928, "learning_rate": 6.276617449151514e-06, "loss": 0.6948, "step": 4790 }, { "epoch": 0.6599910533016757, "grad_norm": 0.2097030663828555, "learning_rate": 6.254330090329287e-06, "loss": 0.6582, "step": 4795 }, { "epoch": 0.6606792608650769, "grad_norm": 0.18189112474120137, "learning_rate": 6.232064352678219e-06, "loss": 0.7017, "step": 4800 }, { "epoch": 0.6606792608650769, "eval_loss": 0.7067230343818665, "eval_runtime": 52.7377, "eval_samples_per_second": 94.809, "eval_steps_per_second": 2.977, "step": 4800 }, { "epoch": 0.661367468428478, "grad_norm": 0.18075428532596752, "learning_rate": 6.209820364723074e-06, "loss": 0.7075, "step": 4805 }, { "epoch": 0.6620556759918792, "grad_norm": 0.18103139290558112, "learning_rate": 6.1875982548630585e-06, "loss": 0.6953, "step": 4810 }, { "epoch": 0.6627438835552802, "grad_norm": 0.18474351077350745, "learning_rate": 6.1653981513711055e-06, "loss": 0.7488, "step": 4815 }, { "epoch": 0.6634320911186814, "grad_norm": 0.18019758236248173, "learning_rate": 6.14322018239311e-06, "loss": 0.6634, "step": 4820 }, { "epoch": 0.6641202986820826, "grad_norm": 0.19076761954646188, "learning_rate": 6.1210644759472095e-06, "loss": 0.7175, "step": 4825 }, { "epoch": 0.6648085062454836, "grad_norm": 0.1909270324319627, "learning_rate": 6.098931159923023e-06, "loss": 0.6559, "step": 4830 }, { "epoch": 0.6654967138088848, "grad_norm": 0.1993608309574148, "learning_rate": 6.076820362080933e-06, "loss": 0.6847, "step": 4835 }, { "epoch": 0.6661849213722859, "grad_norm": 0.20283109403431887, "learning_rate": 6.054732210051342e-06, "loss": 0.6413, "step": 4840 }, { "epoch": 0.666873128935687, "grad_norm": 0.21286733739692149, "learning_rate": 6.0326668313339244e-06, "loss": 0.6652, "step": 4845 }, { "epoch": 0.6675613364990881, "grad_norm": 0.19367225322106704, "learning_rate": 6.010624353296908e-06, "loss": 0.7015, "step": 4850 }, { "epoch": 0.6682495440624893, "grad_norm": 0.19854233734822802, "learning_rate": 5.988604903176331e-06, "loss": 0.6672, "step": 4855 }, { "epoch": 0.6689377516258904, "grad_norm": 0.21381758730442654, "learning_rate": 5.966608608075304e-06, "loss": 0.6939, "step": 4860 }, { "epoch": 0.6696259591892915, "grad_norm": 0.19497500668692047, "learning_rate": 5.944635594963278e-06, "loss": 0.6979, "step": 4865 }, { "epoch": 0.6703141667526926, "grad_norm": 0.22050686819261497, "learning_rate": 5.922685990675317e-06, "loss": 0.7025, "step": 4870 }, { "epoch": 0.6710023743160938, "grad_norm": 0.22023099816860126, "learning_rate": 5.900759921911357e-06, "loss": 0.6614, "step": 4875 }, { "epoch": 0.6716905818794948, "grad_norm": 0.19081264773052006, "learning_rate": 5.8788575152354894e-06, "loss": 0.6602, "step": 4880 }, { "epoch": 0.672378789442896, "grad_norm": 0.22490468797834384, "learning_rate": 5.8569788970752114e-06, "loss": 0.6775, "step": 4885 }, { "epoch": 0.6730669970062971, "grad_norm": 0.19571197576715937, "learning_rate": 5.835124193720712e-06, "loss": 0.6507, "step": 4890 }, { "epoch": 0.6737552045696982, "grad_norm": 0.1833916321082257, "learning_rate": 5.813293531324127e-06, "loss": 0.6527, "step": 4895 }, { "epoch": 0.6744434121330993, "grad_norm": 0.1780439711565371, "learning_rate": 5.791487035898831e-06, "loss": 0.6564, "step": 4900 }, { "epoch": 0.6751316196965005, "grad_norm": 0.18197343093227838, "learning_rate": 5.769704833318695e-06, "loss": 0.6524, "step": 4905 }, { "epoch": 0.6758198272599016, "grad_norm": 0.2143933763742263, "learning_rate": 5.747947049317359e-06, "loss": 0.7344, "step": 4910 }, { "epoch": 0.6765080348233027, "grad_norm": 0.1788773205187315, "learning_rate": 5.72621380948752e-06, "loss": 0.6713, "step": 4915 }, { "epoch": 0.6771962423867038, "grad_norm": 0.18010093950906272, "learning_rate": 5.704505239280194e-06, "loss": 0.7133, "step": 4920 }, { "epoch": 0.677884449950105, "grad_norm": 0.18507666496772007, "learning_rate": 5.6828214640039954e-06, "loss": 0.7082, "step": 4925 }, { "epoch": 0.678572657513506, "grad_norm": 0.1921426772805136, "learning_rate": 5.66116260882442e-06, "loss": 0.698, "step": 4930 }, { "epoch": 0.6792608650769072, "grad_norm": 0.19508143138699072, "learning_rate": 5.639528798763109e-06, "loss": 0.7193, "step": 4935 }, { "epoch": 0.6799490726403083, "grad_norm": 0.18354159436414505, "learning_rate": 5.6179201586971425e-06, "loss": 0.6839, "step": 4940 }, { "epoch": 0.6806372802037094, "grad_norm": 0.1938596566589253, "learning_rate": 5.596336813358307e-06, "loss": 0.674, "step": 4945 }, { "epoch": 0.6813254877671105, "grad_norm": 0.20414841060199332, "learning_rate": 5.574778887332381e-06, "loss": 0.6867, "step": 4950 }, { "epoch": 0.6820136953305117, "grad_norm": 0.173343340143153, "learning_rate": 5.553246505058421e-06, "loss": 0.695, "step": 4955 }, { "epoch": 0.6827019028939127, "grad_norm": 0.18840195980997082, "learning_rate": 5.5317397908280235e-06, "loss": 0.6306, "step": 4960 }, { "epoch": 0.6833901104573139, "grad_norm": 0.18724233593012157, "learning_rate": 5.51025886878463e-06, "loss": 0.7325, "step": 4965 }, { "epoch": 0.6840783180207151, "grad_norm": 0.2054958098044752, "learning_rate": 5.488803862922805e-06, "loss": 0.7285, "step": 4970 }, { "epoch": 0.6847665255841162, "grad_norm": 0.1913624661448408, "learning_rate": 5.467374897087508e-06, "loss": 0.6724, "step": 4975 }, { "epoch": 0.6854547331475173, "grad_norm": 0.19738172427275874, "learning_rate": 5.4459720949734e-06, "loss": 0.6711, "step": 4980 }, { "epoch": 0.6861429407109184, "grad_norm": 0.18753424665874546, "learning_rate": 5.424595580124091e-06, "loss": 0.657, "step": 4985 }, { "epoch": 0.6868311482743196, "grad_norm": 0.22747625530462437, "learning_rate": 5.4032454759314865e-06, "loss": 0.7281, "step": 4990 }, { "epoch": 0.6875193558377206, "grad_norm": 0.19793230057643466, "learning_rate": 5.381921905635019e-06, "loss": 0.7418, "step": 4995 }, { "epoch": 0.6882075634011218, "grad_norm": 0.20219676745682627, "learning_rate": 5.3606249923209704e-06, "loss": 0.6796, "step": 5000 }, { "epoch": 0.6882075634011218, "eval_loss": 0.7038361430168152, "eval_runtime": 52.7475, "eval_samples_per_second": 94.791, "eval_steps_per_second": 2.976, "step": 5000 }, { "epoch": 0.6888957709645229, "grad_norm": 0.17639090389321407, "learning_rate": 5.339354858921737e-06, "loss": 0.6915, "step": 5005 }, { "epoch": 0.689583978527924, "grad_norm": 0.18759422083822191, "learning_rate": 5.318111628215143e-06, "loss": 0.7263, "step": 5010 }, { "epoch": 0.6902721860913251, "grad_norm": 0.16937928089624701, "learning_rate": 5.296895422823725e-06, "loss": 0.6494, "step": 5015 }, { "epoch": 0.6909603936547263, "grad_norm": 0.19597506121101066, "learning_rate": 5.275706365214011e-06, "loss": 0.6855, "step": 5020 }, { "epoch": 0.6916486012181274, "grad_norm": 0.20020952679463452, "learning_rate": 5.2545445776958304e-06, "loss": 0.6606, "step": 5025 }, { "epoch": 0.6923368087815285, "grad_norm": 0.2221602027879528, "learning_rate": 5.2334101824216035e-06, "loss": 0.7101, "step": 5030 }, { "epoch": 0.6930250163449296, "grad_norm": 0.17944624733873674, "learning_rate": 5.212303301385624e-06, "loss": 0.7159, "step": 5035 }, { "epoch": 0.6937132239083308, "grad_norm": 0.18694273194799418, "learning_rate": 5.191224056423367e-06, "loss": 0.6695, "step": 5040 }, { "epoch": 0.6944014314717318, "grad_norm": 0.19203473490414197, "learning_rate": 5.1701725692107985e-06, "loss": 0.768, "step": 5045 }, { "epoch": 0.695089639035133, "grad_norm": 0.18154293582051748, "learning_rate": 5.149148961263648e-06, "loss": 0.6768, "step": 5050 }, { "epoch": 0.6957778465985341, "grad_norm": 0.2089571494027484, "learning_rate": 5.128153353936705e-06, "loss": 0.7263, "step": 5055 }, { "epoch": 0.6964660541619352, "grad_norm": 0.16537604105919557, "learning_rate": 5.107185868423148e-06, "loss": 0.6886, "step": 5060 }, { "epoch": 0.6971542617253363, "grad_norm": 0.1968534852749588, "learning_rate": 5.086246625753821e-06, "loss": 0.767, "step": 5065 }, { "epoch": 0.6978424692887375, "grad_norm": 0.21446207239218934, "learning_rate": 5.0653357467965425e-06, "loss": 0.6579, "step": 5070 }, { "epoch": 0.6985306768521387, "grad_norm": 0.1914560113515333, "learning_rate": 5.044453352255409e-06, "loss": 0.6986, "step": 5075 }, { "epoch": 0.6992188844155397, "grad_norm": 0.20729055388433723, "learning_rate": 5.02359956267009e-06, "loss": 0.732, "step": 5080 }, { "epoch": 0.6999070919789409, "grad_norm": 0.18749123701711354, "learning_rate": 5.002774498415141e-06, "loss": 0.6117, "step": 5085 }, { "epoch": 0.700595299542342, "grad_norm": 0.20388305619421618, "learning_rate": 4.9819782796993125e-06, "loss": 0.6787, "step": 5090 }, { "epoch": 0.701283507105743, "grad_norm": 0.19962421801333632, "learning_rate": 4.961211026564837e-06, "loss": 0.717, "step": 5095 }, { "epoch": 0.7019717146691442, "grad_norm": 0.2065896786073066, "learning_rate": 4.940472858886761e-06, "loss": 0.6556, "step": 5100 }, { "epoch": 0.7026599222325454, "grad_norm": 0.16908280609530893, "learning_rate": 4.919763896372233e-06, "loss": 0.6475, "step": 5105 }, { "epoch": 0.7033481297959464, "grad_norm": 0.20470166992370625, "learning_rate": 4.899084258559823e-06, "loss": 0.6768, "step": 5110 }, { "epoch": 0.7040363373593476, "grad_norm": 0.19491303692347314, "learning_rate": 4.87843406481883e-06, "loss": 0.6537, "step": 5115 }, { "epoch": 0.7047245449227487, "grad_norm": 0.17868141071166566, "learning_rate": 4.857813434348592e-06, "loss": 0.668, "step": 5120 }, { "epoch": 0.7054127524861499, "grad_norm": 0.1706940972488979, "learning_rate": 4.8372224861778e-06, "loss": 0.6832, "step": 5125 }, { "epoch": 0.7061009600495509, "grad_norm": 0.20817563223417004, "learning_rate": 4.816661339163803e-06, "loss": 0.741, "step": 5130 }, { "epoch": 0.7067891676129521, "grad_norm": 0.2188817131669382, "learning_rate": 4.796130111991937e-06, "loss": 0.6798, "step": 5135 }, { "epoch": 0.7074773751763532, "grad_norm": 0.18315944855738367, "learning_rate": 4.775628923174826e-06, "loss": 0.6544, "step": 5140 }, { "epoch": 0.7081655827397543, "grad_norm": 0.20405916213384084, "learning_rate": 4.7551578910517084e-06, "loss": 0.7149, "step": 5145 }, { "epoch": 0.7088537903031554, "grad_norm": 0.19628060263522507, "learning_rate": 4.734717133787743e-06, "loss": 0.6595, "step": 5150 }, { "epoch": 0.7095419978665566, "grad_norm": 0.1847744171468134, "learning_rate": 4.714306769373337e-06, "loss": 0.7655, "step": 5155 }, { "epoch": 0.7102302054299576, "grad_norm": 0.1894317621505023, "learning_rate": 4.6939269156234565e-06, "loss": 0.6851, "step": 5160 }, { "epoch": 0.7109184129933588, "grad_norm": 0.19591365734428665, "learning_rate": 4.673577690176956e-06, "loss": 0.6814, "step": 5165 }, { "epoch": 0.7116066205567599, "grad_norm": 0.1756948931064504, "learning_rate": 4.653259210495893e-06, "loss": 0.7018, "step": 5170 }, { "epoch": 0.7122948281201611, "grad_norm": 0.18909657419180922, "learning_rate": 4.632971593864841e-06, "loss": 0.6971, "step": 5175 }, { "epoch": 0.7129830356835621, "grad_norm": 0.19724144393857387, "learning_rate": 4.612714957390234e-06, "loss": 0.7225, "step": 5180 }, { "epoch": 0.7136712432469633, "grad_norm": 0.1925318694316217, "learning_rate": 4.592489417999676e-06, "loss": 0.7279, "step": 5185 }, { "epoch": 0.7143594508103644, "grad_norm": 0.18576970357116196, "learning_rate": 4.572295092441267e-06, "loss": 0.6746, "step": 5190 }, { "epoch": 0.7150476583737655, "grad_norm": 0.1957529067610005, "learning_rate": 4.552132097282936e-06, "loss": 0.7194, "step": 5195 }, { "epoch": 0.7157358659371666, "grad_norm": 0.1792052648127033, "learning_rate": 4.532000548911754e-06, "loss": 0.7065, "step": 5200 }, { "epoch": 0.7157358659371666, "eval_loss": 0.7012067437171936, "eval_runtime": 52.748, "eval_samples_per_second": 94.79, "eval_steps_per_second": 2.976, "step": 5200 }, { "epoch": 0.7164240735005678, "grad_norm": 0.18119062479557804, "learning_rate": 4.511900563533272e-06, "loss": 0.6813, "step": 5205 }, { "epoch": 0.7171122810639688, "grad_norm": 0.22230076615019942, "learning_rate": 4.491832257170865e-06, "loss": 0.7615, "step": 5210 }, { "epoch": 0.71780048862737, "grad_norm": 0.19495716412631708, "learning_rate": 4.471795745665036e-06, "loss": 0.7148, "step": 5215 }, { "epoch": 0.7184886961907712, "grad_norm": 0.21253504079100252, "learning_rate": 4.45179114467275e-06, "loss": 0.6975, "step": 5220 }, { "epoch": 0.7191769037541722, "grad_norm": 0.22542661807172912, "learning_rate": 4.431818569666787e-06, "loss": 0.6908, "step": 5225 }, { "epoch": 0.7198651113175734, "grad_norm": 0.1957567227258181, "learning_rate": 4.411878135935058e-06, "loss": 0.7162, "step": 5230 }, { "epoch": 0.7205533188809745, "grad_norm": 0.19836669816905955, "learning_rate": 4.391969958579948e-06, "loss": 0.7128, "step": 5235 }, { "epoch": 0.7212415264443757, "grad_norm": 0.2150007165036599, "learning_rate": 4.372094152517643e-06, "loss": 0.7075, "step": 5240 }, { "epoch": 0.7219297340077767, "grad_norm": 0.1964009282700955, "learning_rate": 4.3522508324774794e-06, "loss": 0.7129, "step": 5245 }, { "epoch": 0.7226179415711779, "grad_norm": 0.2059068286649054, "learning_rate": 4.332440113001262e-06, "loss": 0.706, "step": 5250 }, { "epoch": 0.723306149134579, "grad_norm": 0.17968361165423405, "learning_rate": 4.312662108442624e-06, "loss": 0.68, "step": 5255 }, { "epoch": 0.7239943566979801, "grad_norm": 0.19539919917994641, "learning_rate": 4.292916932966357e-06, "loss": 0.6286, "step": 5260 }, { "epoch": 0.7246825642613812, "grad_norm": 0.19459582851299326, "learning_rate": 4.27320470054776e-06, "loss": 0.6666, "step": 5265 }, { "epoch": 0.7253707718247824, "grad_norm": 0.2113970382681966, "learning_rate": 4.253525524971954e-06, "loss": 0.6605, "step": 5270 }, { "epoch": 0.7260589793881834, "grad_norm": 0.1999076509256194, "learning_rate": 4.233879519833266e-06, "loss": 0.7174, "step": 5275 }, { "epoch": 0.7267471869515846, "grad_norm": 0.1898083580872569, "learning_rate": 4.214266798534542e-06, "loss": 0.7242, "step": 5280 }, { "epoch": 0.7274353945149857, "grad_norm": 0.18479245446309253, "learning_rate": 4.194687474286509e-06, "loss": 0.7212, "step": 5285 }, { "epoch": 0.7281236020783869, "grad_norm": 0.1909428721117587, "learning_rate": 4.175141660107114e-06, "loss": 0.6665, "step": 5290 }, { "epoch": 0.7288118096417879, "grad_norm": 0.20439176559254485, "learning_rate": 4.1556294688208675e-06, "loss": 0.7137, "step": 5295 }, { "epoch": 0.7295000172051891, "grad_norm": 0.17639936186999353, "learning_rate": 4.136151013058207e-06, "loss": 0.6706, "step": 5300 }, { "epoch": 0.7301882247685902, "grad_norm": 0.18492823768944952, "learning_rate": 4.116706405254834e-06, "loss": 0.6295, "step": 5305 }, { "epoch": 0.7308764323319913, "grad_norm": 0.19843879248001947, "learning_rate": 4.0972957576510705e-06, "loss": 0.7422, "step": 5310 }, { "epoch": 0.7315646398953924, "grad_norm": 0.18198612878798717, "learning_rate": 4.077919182291208e-06, "loss": 0.6701, "step": 5315 }, { "epoch": 0.7322528474587936, "grad_norm": 0.17673803965381463, "learning_rate": 4.058576791022864e-06, "loss": 0.6687, "step": 5320 }, { "epoch": 0.7329410550221946, "grad_norm": 0.19849804151365122, "learning_rate": 4.0392686954963326e-06, "loss": 0.6736, "step": 5325 }, { "epoch": 0.7336292625855958, "grad_norm": 0.15998416469500543, "learning_rate": 4.019995007163944e-06, "loss": 0.6904, "step": 5330 }, { "epoch": 0.734317470148997, "grad_norm": 0.2030638938849043, "learning_rate": 4.00075583727942e-06, "loss": 0.6155, "step": 5335 }, { "epoch": 0.7350056777123981, "grad_norm": 0.19927828353148516, "learning_rate": 3.981551296897234e-06, "loss": 0.7042, "step": 5340 }, { "epoch": 0.7356938852757992, "grad_norm": 0.20115233697954152, "learning_rate": 3.9623814968719545e-06, "loss": 0.7158, "step": 5345 }, { "epoch": 0.7363820928392003, "grad_norm": 0.20241824261773944, "learning_rate": 3.943246547857631e-06, "loss": 0.6925, "step": 5350 }, { "epoch": 0.7370703004026015, "grad_norm": 0.17694093002637665, "learning_rate": 3.924146560307137e-06, "loss": 0.6861, "step": 5355 }, { "epoch": 0.7377585079660025, "grad_norm": 0.18485862930756486, "learning_rate": 3.9050816444715416e-06, "loss": 0.6687, "step": 5360 }, { "epoch": 0.7384467155294037, "grad_norm": 0.17491950787875685, "learning_rate": 3.886051910399465e-06, "loss": 0.6589, "step": 5365 }, { "epoch": 0.7391349230928048, "grad_norm": 0.2134976073613459, "learning_rate": 3.867057467936441e-06, "loss": 0.7585, "step": 5370 }, { "epoch": 0.7398231306562059, "grad_norm": 0.19914714441825998, "learning_rate": 3.848098426724306e-06, "loss": 0.6366, "step": 5375 }, { "epoch": 0.740511338219607, "grad_norm": 0.1690800862710958, "learning_rate": 3.829174896200535e-06, "loss": 0.725, "step": 5380 }, { "epoch": 0.7411995457830082, "grad_norm": 0.1678247469890665, "learning_rate": 3.810286985597634e-06, "loss": 0.6993, "step": 5385 }, { "epoch": 0.7418877533464093, "grad_norm": 0.19478162532514748, "learning_rate": 3.7914348039424876e-06, "loss": 0.8171, "step": 5390 }, { "epoch": 0.7425759609098104, "grad_norm": 0.20089415533212981, "learning_rate": 3.7726184600557514e-06, "loss": 0.6258, "step": 5395 }, { "epoch": 0.7432641684732115, "grad_norm": 0.1662613379107829, "learning_rate": 3.753838062551214e-06, "loss": 0.6318, "step": 5400 }, { "epoch": 0.7432641684732115, "eval_loss": 0.6987390518188477, "eval_runtime": 52.7294, "eval_samples_per_second": 94.824, "eval_steps_per_second": 2.977, "step": 5400 }, { "epoch": 0.7439523760366127, "grad_norm": 0.1952072159154419, "learning_rate": 3.7350937198351654e-06, "loss": 0.7331, "step": 5405 }, { "epoch": 0.7446405836000137, "grad_norm": 0.20146722827293512, "learning_rate": 3.7163855401057857e-06, "loss": 0.708, "step": 5410 }, { "epoch": 0.7453287911634149, "grad_norm": 0.2039198020800785, "learning_rate": 3.6977136313524942e-06, "loss": 0.6782, "step": 5415 }, { "epoch": 0.746016998726816, "grad_norm": 0.1876312791817516, "learning_rate": 3.679078101355359e-06, "loss": 0.6333, "step": 5420 }, { "epoch": 0.7467052062902171, "grad_norm": 0.21532741208430087, "learning_rate": 3.660479057684447e-06, "loss": 0.7063, "step": 5425 }, { "epoch": 0.7473934138536182, "grad_norm": 0.17540092446618383, "learning_rate": 3.6419166076992316e-06, "loss": 0.6014, "step": 5430 }, { "epoch": 0.7480816214170194, "grad_norm": 0.22759182657845375, "learning_rate": 3.623390858547935e-06, "loss": 0.7322, "step": 5435 }, { "epoch": 0.7487698289804205, "grad_norm": 0.21293209260101892, "learning_rate": 3.604901917166943e-06, "loss": 0.6814, "step": 5440 }, { "epoch": 0.7494580365438216, "grad_norm": 0.19534766295472808, "learning_rate": 3.5864498902801727e-06, "loss": 0.6991, "step": 5445 }, { "epoch": 0.7501462441072227, "grad_norm": 0.2062495651644779, "learning_rate": 3.5680348843984603e-06, "loss": 0.6756, "step": 5450 }, { "epoch": 0.7508344516706239, "grad_norm": 0.18100695023598806, "learning_rate": 3.549657005818945e-06, "loss": 0.6538, "step": 5455 }, { "epoch": 0.751522659234025, "grad_norm": 0.18957398598188308, "learning_rate": 3.531316360624458e-06, "loss": 0.6737, "step": 5460 }, { "epoch": 0.7522108667974261, "grad_norm": 0.20471894835008617, "learning_rate": 3.513013054682899e-06, "loss": 0.73, "step": 5465 }, { "epoch": 0.7528990743608273, "grad_norm": 0.18834690464193307, "learning_rate": 3.4947471936466416e-06, "loss": 0.7005, "step": 5470 }, { "epoch": 0.7535872819242283, "grad_norm": 0.18689136249440028, "learning_rate": 3.4765188829519157e-06, "loss": 0.7284, "step": 5475 }, { "epoch": 0.7542754894876295, "grad_norm": 0.17997921002101389, "learning_rate": 3.4583282278181974e-06, "loss": 0.6543, "step": 5480 }, { "epoch": 0.7549636970510306, "grad_norm": 0.19227829891769094, "learning_rate": 3.4401753332476016e-06, "loss": 0.7069, "step": 5485 }, { "epoch": 0.7556519046144317, "grad_norm": 0.19790826850962326, "learning_rate": 3.4220603040242804e-06, "loss": 0.6395, "step": 5490 }, { "epoch": 0.7563401121778328, "grad_norm": 0.20598924418820477, "learning_rate": 3.4039832447138143e-06, "loss": 0.655, "step": 5495 }, { "epoch": 0.757028319741234, "grad_norm": 0.19072732612497695, "learning_rate": 3.385944259662609e-06, "loss": 0.6871, "step": 5500 }, { "epoch": 0.7577165273046351, "grad_norm": 0.18422778362815703, "learning_rate": 3.3679434529972975e-06, "loss": 0.7178, "step": 5505 }, { "epoch": 0.7584047348680362, "grad_norm": 0.20873770896049348, "learning_rate": 3.3499809286241248e-06, "loss": 0.7209, "step": 5510 }, { "epoch": 0.7590929424314373, "grad_norm": 0.1878481024881775, "learning_rate": 3.3320567902283694e-06, "loss": 0.6545, "step": 5515 }, { "epoch": 0.7597811499948385, "grad_norm": 0.19344169071289158, "learning_rate": 3.3141711412737308e-06, "loss": 0.662, "step": 5520 }, { "epoch": 0.7604693575582395, "grad_norm": 0.19392339460945293, "learning_rate": 3.296324085001734e-06, "loss": 0.6828, "step": 5525 }, { "epoch": 0.7611575651216407, "grad_norm": 0.1824917412819095, "learning_rate": 3.2785157244311392e-06, "loss": 0.635, "step": 5530 }, { "epoch": 0.7618457726850418, "grad_norm": 0.2207873233188592, "learning_rate": 3.260746162357331e-06, "loss": 0.6776, "step": 5535 }, { "epoch": 0.7625339802484429, "grad_norm": 0.19477972695506887, "learning_rate": 3.243015501351756e-06, "loss": 0.6819, "step": 5540 }, { "epoch": 0.763222187811844, "grad_norm": 0.198430547782452, "learning_rate": 3.2253238437612967e-06, "loss": 0.6333, "step": 5545 }, { "epoch": 0.7639103953752452, "grad_norm": 0.1854787918835233, "learning_rate": 3.2076712917077015e-06, "loss": 0.728, "step": 5550 }, { "epoch": 0.7645986029386463, "grad_norm": 0.18677037936471028, "learning_rate": 3.1900579470869917e-06, "loss": 0.7595, "step": 5555 }, { "epoch": 0.7652868105020474, "grad_norm": 0.1829405121449866, "learning_rate": 3.172483911568862e-06, "loss": 0.7129, "step": 5560 }, { "epoch": 0.7659750180654485, "grad_norm": 0.17293406576243453, "learning_rate": 3.1549492865961116e-06, "loss": 0.6839, "step": 5565 }, { "epoch": 0.7666632256288497, "grad_norm": 0.20066220447881328, "learning_rate": 3.137454173384048e-06, "loss": 0.6856, "step": 5570 }, { "epoch": 0.7673514331922507, "grad_norm": 0.19119704035560345, "learning_rate": 3.1199986729199015e-06, "loss": 0.7684, "step": 5575 }, { "epoch": 0.7680396407556519, "grad_norm": 0.18240884028885423, "learning_rate": 3.102582885962252e-06, "loss": 0.66, "step": 5580 }, { "epoch": 0.768727848319053, "grad_norm": 0.20262910652982047, "learning_rate": 3.085206913040428e-06, "loss": 0.6403, "step": 5585 }, { "epoch": 0.7694160558824541, "grad_norm": 0.17827576973666137, "learning_rate": 3.0678708544539493e-06, "loss": 0.6462, "step": 5590 }, { "epoch": 0.7701042634458553, "grad_norm": 0.1846601562967418, "learning_rate": 3.050574810271941e-06, "loss": 0.692, "step": 5595 }, { "epoch": 0.7707924710092564, "grad_norm": 0.18225044153474118, "learning_rate": 3.0333188803325465e-06, "loss": 0.639, "step": 5600 }, { "epoch": 0.7707924710092564, "eval_loss": 0.6965476870536804, "eval_runtime": 52.7361, "eval_samples_per_second": 94.812, "eval_steps_per_second": 2.977, "step": 5600 }, { "epoch": 0.7714806785726576, "grad_norm": 0.19714473316943049, "learning_rate": 3.0161031642423512e-06, "loss": 0.7176, "step": 5605 }, { "epoch": 0.7721688861360586, "grad_norm": 0.21070750730382332, "learning_rate": 2.998927761375824e-06, "loss": 0.7298, "step": 5610 }, { "epoch": 0.7728570936994598, "grad_norm": 0.18587806793346764, "learning_rate": 2.9817927708747295e-06, "loss": 0.7157, "step": 5615 }, { "epoch": 0.7735453012628609, "grad_norm": 0.20451769063167233, "learning_rate": 2.964698291647561e-06, "loss": 0.7141, "step": 5620 }, { "epoch": 0.774233508826262, "grad_norm": 0.18540490634047263, "learning_rate": 2.9476444223689683e-06, "loss": 0.6553, "step": 5625 }, { "epoch": 0.7749217163896631, "grad_norm": 0.17553333455746364, "learning_rate": 2.930631261479181e-06, "loss": 0.7156, "step": 5630 }, { "epoch": 0.7756099239530643, "grad_norm": 0.22611031851634142, "learning_rate": 2.9136589071834553e-06, "loss": 0.6569, "step": 5635 }, { "epoch": 0.7762981315164653, "grad_norm": 0.18888162089902075, "learning_rate": 2.896727457451498e-06, "loss": 0.6584, "step": 5640 }, { "epoch": 0.7769863390798665, "grad_norm": 0.1869555880263662, "learning_rate": 2.879837010016896e-06, "loss": 0.686, "step": 5645 }, { "epoch": 0.7776745466432676, "grad_norm": 0.19344474526421232, "learning_rate": 2.862987662376573e-06, "loss": 0.6853, "step": 5650 }, { "epoch": 0.7783627542066688, "grad_norm": 0.18460938356794904, "learning_rate": 2.84617951179019e-06, "loss": 0.6822, "step": 5655 }, { "epoch": 0.7790509617700698, "grad_norm": 0.20488994543378347, "learning_rate": 2.829412655279624e-06, "loss": 0.688, "step": 5660 }, { "epoch": 0.779739169333471, "grad_norm": 0.1710863653314867, "learning_rate": 2.8126871896283826e-06, "loss": 0.691, "step": 5665 }, { "epoch": 0.7804273768968721, "grad_norm": 0.18406464711535972, "learning_rate": 2.796003211381051e-06, "loss": 0.6481, "step": 5670 }, { "epoch": 0.7811155844602732, "grad_norm": 0.19210990997388466, "learning_rate": 2.779360816842742e-06, "loss": 0.7219, "step": 5675 }, { "epoch": 0.7818037920236743, "grad_norm": 0.2040341102319677, "learning_rate": 2.7627601020785254e-06, "loss": 0.6789, "step": 5680 }, { "epoch": 0.7824919995870755, "grad_norm": 0.19589129180249465, "learning_rate": 2.7462011629128904e-06, "loss": 0.6774, "step": 5685 }, { "epoch": 0.7831802071504765, "grad_norm": 0.19487289452716833, "learning_rate": 2.7296840949291824e-06, "loss": 0.6858, "step": 5690 }, { "epoch": 0.7838684147138777, "grad_norm": 0.2042156163330246, "learning_rate": 2.713208993469051e-06, "loss": 0.6663, "step": 5695 }, { "epoch": 0.7845566222772788, "grad_norm": 0.18525421551507126, "learning_rate": 2.696775953631904e-06, "loss": 0.6656, "step": 5700 }, { "epoch": 0.78524482984068, "grad_norm": 0.1811246634552583, "learning_rate": 2.680385070274356e-06, "loss": 0.7208, "step": 5705 }, { "epoch": 0.785933037404081, "grad_norm": 0.19961693750329815, "learning_rate": 2.6640364380096817e-06, "loss": 0.703, "step": 5710 }, { "epoch": 0.7866212449674822, "grad_norm": 0.19496244367728713, "learning_rate": 2.6477301512072697e-06, "loss": 0.6615, "step": 5715 }, { "epoch": 0.7873094525308834, "grad_norm": 0.1845931889513949, "learning_rate": 2.6314663039920776e-06, "loss": 0.643, "step": 5720 }, { "epoch": 0.7879976600942844, "grad_norm": 0.18802679571504138, "learning_rate": 2.6152449902440826e-06, "loss": 0.662, "step": 5725 }, { "epoch": 0.7886858676576856, "grad_norm": 0.20800911768968158, "learning_rate": 2.5990663035977538e-06, "loss": 0.7307, "step": 5730 }, { "epoch": 0.7893740752210867, "grad_norm": 0.17413490164428394, "learning_rate": 2.5829303374415004e-06, "loss": 0.6669, "step": 5735 }, { "epoch": 0.7900622827844878, "grad_norm": 0.18992108078024184, "learning_rate": 2.5668371849171346e-06, "loss": 0.682, "step": 5740 }, { "epoch": 0.7907504903478889, "grad_norm": 0.1874169422451266, "learning_rate": 2.550786938919336e-06, "loss": 0.7084, "step": 5745 }, { "epoch": 0.7914386979112901, "grad_norm": 0.18234010561145236, "learning_rate": 2.5347796920951174e-06, "loss": 0.6967, "step": 5750 }, { "epoch": 0.7921269054746911, "grad_norm": 0.1826075091019724, "learning_rate": 2.518815536843274e-06, "loss": 0.741, "step": 5755 }, { "epoch": 0.7928151130380923, "grad_norm": 0.19262477936097155, "learning_rate": 2.5028945653138823e-06, "loss": 0.7039, "step": 5760 }, { "epoch": 0.7935033206014934, "grad_norm": 0.16505783623934733, "learning_rate": 2.4870168694077377e-06, "loss": 0.6656, "step": 5765 }, { "epoch": 0.7941915281648946, "grad_norm": 0.20932953621500655, "learning_rate": 2.4711825407758372e-06, "loss": 0.7299, "step": 5770 }, { "epoch": 0.7948797357282956, "grad_norm": 0.15902936018108288, "learning_rate": 2.4553916708188453e-06, "loss": 0.692, "step": 5775 }, { "epoch": 0.7955679432916968, "grad_norm": 0.17790445217361386, "learning_rate": 2.439644350686573e-06, "loss": 0.7445, "step": 5780 }, { "epoch": 0.7962561508550979, "grad_norm": 0.18763632593468577, "learning_rate": 2.4239406712774482e-06, "loss": 0.722, "step": 5785 }, { "epoch": 0.796944358418499, "grad_norm": 0.19117839385336058, "learning_rate": 2.40828072323799e-06, "loss": 0.6357, "step": 5790 }, { "epoch": 0.7976325659819001, "grad_norm": 0.20568782978683467, "learning_rate": 2.3926645969622907e-06, "loss": 0.707, "step": 5795 }, { "epoch": 0.7983207735453013, "grad_norm": 0.22513844074641307, "learning_rate": 2.3770923825914803e-06, "loss": 0.7078, "step": 5800 }, { "epoch": 0.7983207735453013, "eval_loss": 0.6949214935302734, "eval_runtime": 52.7397, "eval_samples_per_second": 94.805, "eval_steps_per_second": 2.977, "step": 5800 }, { "epoch": 0.7990089811087023, "grad_norm": 0.19174641046285376, "learning_rate": 2.361564170013223e-06, "loss": 0.6223, "step": 5805 }, { "epoch": 0.7996971886721035, "grad_norm": 0.21938580435018606, "learning_rate": 2.3460800488611868e-06, "loss": 0.7414, "step": 5810 }, { "epoch": 0.8003853962355046, "grad_norm": 0.20794801346301475, "learning_rate": 2.330640108514541e-06, "loss": 0.6417, "step": 5815 }, { "epoch": 0.8010736037989058, "grad_norm": 0.21038225645782224, "learning_rate": 2.3152444380974125e-06, "loss": 0.6253, "step": 5820 }, { "epoch": 0.8017618113623068, "grad_norm": 0.18679275958736286, "learning_rate": 2.299893126478401e-06, "loss": 0.805, "step": 5825 }, { "epoch": 0.802450018925708, "grad_norm": 0.19883531787250555, "learning_rate": 2.284586262270049e-06, "loss": 0.7446, "step": 5830 }, { "epoch": 0.8031382264891092, "grad_norm": 0.16779395751910986, "learning_rate": 2.2693239338283333e-06, "loss": 0.6644, "step": 5835 }, { "epoch": 0.8038264340525102, "grad_norm": 0.1916406129310036, "learning_rate": 2.2541062292521644e-06, "loss": 0.645, "step": 5840 }, { "epoch": 0.8045146416159114, "grad_norm": 0.19429810534117156, "learning_rate": 2.2389332363828565e-06, "loss": 0.6665, "step": 5845 }, { "epoch": 0.8052028491793125, "grad_norm": 0.21096896708120694, "learning_rate": 2.2238050428036438e-06, "loss": 0.7064, "step": 5850 }, { "epoch": 0.8058910567427136, "grad_norm": 0.18629095487510014, "learning_rate": 2.2087217358391643e-06, "loss": 0.6984, "step": 5855 }, { "epoch": 0.8065792643061147, "grad_norm": 0.17575230674928513, "learning_rate": 2.1936834025549535e-06, "loss": 0.6905, "step": 5860 }, { "epoch": 0.8072674718695159, "grad_norm": 0.17805337183590111, "learning_rate": 2.1786901297569473e-06, "loss": 0.6092, "step": 5865 }, { "epoch": 0.807955679432917, "grad_norm": 0.18386311399818622, "learning_rate": 2.163742003990976e-06, "loss": 0.6958, "step": 5870 }, { "epoch": 0.8086438869963181, "grad_norm": 0.19156118792525031, "learning_rate": 2.148839111542269e-06, "loss": 0.6363, "step": 5875 }, { "epoch": 0.8093320945597192, "grad_norm": 0.1802931698977988, "learning_rate": 2.1339815384349548e-06, "loss": 0.6325, "step": 5880 }, { "epoch": 0.8100203021231204, "grad_norm": 0.2111370452622072, "learning_rate": 2.119169370431564e-06, "loss": 0.7075, "step": 5885 }, { "epoch": 0.8107085096865214, "grad_norm": 0.22360761007428157, "learning_rate": 2.104402693032539e-06, "loss": 0.6447, "step": 5890 }, { "epoch": 0.8113967172499226, "grad_norm": 0.1961130162714615, "learning_rate": 2.089681591475725e-06, "loss": 0.7037, "step": 5895 }, { "epoch": 0.8120849248133237, "grad_norm": 0.193516943987407, "learning_rate": 2.0750061507359e-06, "loss": 0.6428, "step": 5900 }, { "epoch": 0.8127731323767248, "grad_norm": 0.1941102766905568, "learning_rate": 2.060376455524271e-06, "loss": 0.6951, "step": 5905 }, { "epoch": 0.8134613399401259, "grad_norm": 0.2230021870255672, "learning_rate": 2.045792590287986e-06, "loss": 0.7136, "step": 5910 }, { "epoch": 0.8141495475035271, "grad_norm": 0.17986119797200004, "learning_rate": 2.031254639209651e-06, "loss": 0.6436, "step": 5915 }, { "epoch": 0.8148377550669282, "grad_norm": 0.17731897460123694, "learning_rate": 2.0167626862068334e-06, "loss": 0.6255, "step": 5920 }, { "epoch": 0.8155259626303293, "grad_norm": 0.18483216524664386, "learning_rate": 2.002316814931596e-06, "loss": 0.6476, "step": 5925 }, { "epoch": 0.8162141701937304, "grad_norm": 0.2011603909559792, "learning_rate": 1.987917108769999e-06, "loss": 0.731, "step": 5930 }, { "epoch": 0.8169023777571316, "grad_norm": 0.20372237809598387, "learning_rate": 1.9735636508416243e-06, "loss": 0.6958, "step": 5935 }, { "epoch": 0.8175905853205326, "grad_norm": 0.19276870687041622, "learning_rate": 1.959256523999088e-06, "loss": 0.5814, "step": 5940 }, { "epoch": 0.8182787928839338, "grad_norm": 0.19356657768898541, "learning_rate": 1.9449958108275756e-06, "loss": 0.7013, "step": 5945 }, { "epoch": 0.818967000447335, "grad_norm": 0.19329254677616256, "learning_rate": 1.9307815936443585e-06, "loss": 0.699, "step": 5950 }, { "epoch": 0.819655208010736, "grad_norm": 0.19060952032802758, "learning_rate": 1.9166139544983163e-06, "loss": 0.738, "step": 5955 }, { "epoch": 0.8203434155741371, "grad_norm": 0.1998574716239888, "learning_rate": 1.9024929751694665e-06, "loss": 0.6756, "step": 5960 }, { "epoch": 0.8210316231375383, "grad_norm": 0.191025814440007, "learning_rate": 1.8884187371684936e-06, "loss": 0.6659, "step": 5965 }, { "epoch": 0.8217198307009395, "grad_norm": 0.21396495578837582, "learning_rate": 1.874391321736272e-06, "loss": 0.7268, "step": 5970 }, { "epoch": 0.8224080382643405, "grad_norm": 0.1765869930936367, "learning_rate": 1.8604108098434014e-06, "loss": 0.6516, "step": 5975 }, { "epoch": 0.8230962458277417, "grad_norm": 0.18614406662090502, "learning_rate": 1.8464772821897504e-06, "loss": 0.6528, "step": 5980 }, { "epoch": 0.8237844533911428, "grad_norm": 0.20104023977939134, "learning_rate": 1.8325908192039699e-06, "loss": 0.6895, "step": 5985 }, { "epoch": 0.8244726609545439, "grad_norm": 0.17592763786084412, "learning_rate": 1.818751501043038e-06, "loss": 0.7037, "step": 5990 }, { "epoch": 0.825160868517945, "grad_norm": 0.16537094609199737, "learning_rate": 1.804959407591802e-06, "loss": 0.6509, "step": 5995 }, { "epoch": 0.8258490760813462, "grad_norm": 0.1825470731293043, "learning_rate": 1.7912146184625158e-06, "loss": 0.7029, "step": 6000 }, { "epoch": 0.8258490760813462, "eval_loss": 0.6932981610298157, "eval_runtime": 52.7321, "eval_samples_per_second": 94.819, "eval_steps_per_second": 2.977, "step": 6000 }, { "epoch": 0.8265372836447472, "grad_norm": 0.17934491088764612, "learning_rate": 1.7775172129943719e-06, "loss": 0.6736, "step": 6005 }, { "epoch": 0.8272254912081484, "grad_norm": 0.1939478908874411, "learning_rate": 1.763867270253058e-06, "loss": 0.7128, "step": 6010 }, { "epoch": 0.8279136987715495, "grad_norm": 0.1599539109722343, "learning_rate": 1.7502648690302816e-06, "loss": 0.6426, "step": 6015 }, { "epoch": 0.8286019063349506, "grad_norm": 0.21387019896379106, "learning_rate": 1.736710087843332e-06, "loss": 0.723, "step": 6020 }, { "epoch": 0.8292901138983517, "grad_norm": 0.20098376474801669, "learning_rate": 1.7232030049346216e-06, "loss": 0.7147, "step": 6025 }, { "epoch": 0.8299783214617529, "grad_norm": 0.2090789804826148, "learning_rate": 1.7097436982712312e-06, "loss": 0.6987, "step": 6030 }, { "epoch": 0.830666529025154, "grad_norm": 0.20292647384617366, "learning_rate": 1.6963322455444642e-06, "loss": 0.6979, "step": 6035 }, { "epoch": 0.8313547365885551, "grad_norm": 0.2003419085519311, "learning_rate": 1.6829687241693937e-06, "loss": 0.6837, "step": 6040 }, { "epoch": 0.8320429441519562, "grad_norm": 0.19564821302303095, "learning_rate": 1.669653211284421e-06, "loss": 0.6862, "step": 6045 }, { "epoch": 0.8327311517153574, "grad_norm": 0.211991897645666, "learning_rate": 1.6563857837508245e-06, "loss": 0.6899, "step": 6050 }, { "epoch": 0.8334193592787584, "grad_norm": 0.19061860193226596, "learning_rate": 1.6431665181523237e-06, "loss": 0.6542, "step": 6055 }, { "epoch": 0.8341075668421596, "grad_norm": 0.2158979905972158, "learning_rate": 1.6299954907946247e-06, "loss": 0.6049, "step": 6060 }, { "epoch": 0.8347957744055607, "grad_norm": 0.2050352076033631, "learning_rate": 1.6168727777049931e-06, "loss": 0.6541, "step": 6065 }, { "epoch": 0.8354839819689618, "grad_norm": 0.17097839381448277, "learning_rate": 1.6037984546318086e-06, "loss": 0.6633, "step": 6070 }, { "epoch": 0.8361721895323629, "grad_norm": 0.20625976713571156, "learning_rate": 1.59077259704413e-06, "loss": 0.6381, "step": 6075 }, { "epoch": 0.8368603970957641, "grad_norm": 0.18802501867247656, "learning_rate": 1.5777952801312558e-06, "loss": 0.7009, "step": 6080 }, { "epoch": 0.8375486046591653, "grad_norm": 0.19091194281009652, "learning_rate": 1.5648665788022932e-06, "loss": 0.6837, "step": 6085 }, { "epoch": 0.8382368122225663, "grad_norm": 0.19985564628883268, "learning_rate": 1.5519865676857248e-06, "loss": 0.6423, "step": 6090 }, { "epoch": 0.8389250197859675, "grad_norm": 0.20818483962341972, "learning_rate": 1.5391553211289811e-06, "loss": 0.6752, "step": 6095 }, { "epoch": 0.8396132273493686, "grad_norm": 0.2205381052542353, "learning_rate": 1.5263729131980054e-06, "loss": 0.7138, "step": 6100 }, { "epoch": 0.8403014349127697, "grad_norm": 0.18789529768688978, "learning_rate": 1.513639417676831e-06, "loss": 0.7035, "step": 6105 }, { "epoch": 0.8409896424761708, "grad_norm": 0.1744335359659172, "learning_rate": 1.5009549080671481e-06, "loss": 0.7364, "step": 6110 }, { "epoch": 0.841677850039572, "grad_norm": 0.1945915933824967, "learning_rate": 1.4883194575878946e-06, "loss": 0.6304, "step": 6115 }, { "epoch": 0.842366057602973, "grad_norm": 0.19929508769233706, "learning_rate": 1.4757331391748153e-06, "loss": 0.7182, "step": 6120 }, { "epoch": 0.8430542651663742, "grad_norm": 0.1866962016028064, "learning_rate": 1.4631960254800548e-06, "loss": 0.6566, "step": 6125 }, { "epoch": 0.8437424727297753, "grad_norm": 0.22382669841429953, "learning_rate": 1.4507081888717322e-06, "loss": 0.7306, "step": 6130 }, { "epoch": 0.8444306802931765, "grad_norm": 0.19070409927487098, "learning_rate": 1.438269701433519e-06, "loss": 0.6405, "step": 6135 }, { "epoch": 0.8451188878565775, "grad_norm": 0.18510404570737995, "learning_rate": 1.4258806349642295e-06, "loss": 0.6943, "step": 6140 }, { "epoch": 0.8458070954199787, "grad_norm": 0.20583964616311395, "learning_rate": 1.4135410609774148e-06, "loss": 0.6983, "step": 6145 }, { "epoch": 0.8464953029833798, "grad_norm": 0.19170244584935608, "learning_rate": 1.4012510507009292e-06, "loss": 0.6925, "step": 6150 }, { "epoch": 0.8471835105467809, "grad_norm": 0.18252985236829147, "learning_rate": 1.3890106750765286e-06, "loss": 0.6888, "step": 6155 }, { "epoch": 0.847871718110182, "grad_norm": 0.1861913981021295, "learning_rate": 1.3768200047594693e-06, "loss": 0.6527, "step": 6160 }, { "epoch": 0.8485599256735832, "grad_norm": 0.18072859258727786, "learning_rate": 1.36467911011809e-06, "loss": 0.7324, "step": 6165 }, { "epoch": 0.8492481332369842, "grad_norm": 0.1646893595617525, "learning_rate": 1.3525880612334096e-06, "loss": 0.6657, "step": 6170 }, { "epoch": 0.8499363408003854, "grad_norm": 0.19447657999769247, "learning_rate": 1.3405469278987193e-06, "loss": 0.7074, "step": 6175 }, { "epoch": 0.8506245483637865, "grad_norm": 0.19645347940397737, "learning_rate": 1.3285557796191884e-06, "loss": 0.6628, "step": 6180 }, { "epoch": 0.8513127559271877, "grad_norm": 0.20482849273751774, "learning_rate": 1.3166146856114458e-06, "loss": 0.695, "step": 6185 }, { "epoch": 0.8520009634905887, "grad_norm": 0.20938284854533107, "learning_rate": 1.3047237148032022e-06, "loss": 0.6857, "step": 6190 }, { "epoch": 0.8526891710539899, "grad_norm": 0.19449104633323444, "learning_rate": 1.2928829358328354e-06, "loss": 0.6979, "step": 6195 }, { "epoch": 0.853377378617391, "grad_norm": 0.20878698679388993, "learning_rate": 1.281092417049008e-06, "loss": 0.6977, "step": 6200 }, { "epoch": 0.853377378617391, "eval_loss": 0.6920512914657593, "eval_runtime": 52.7397, "eval_samples_per_second": 94.805, "eval_steps_per_second": 2.977, "step": 6200 }, { "epoch": 0.8540655861807921, "grad_norm": 0.19736744396625097, "learning_rate": 1.2693522265102543e-06, "loss": 0.666, "step": 6205 }, { "epoch": 0.8547537937441932, "grad_norm": 0.19020854267635723, "learning_rate": 1.2576624319846064e-06, "loss": 0.6828, "step": 6210 }, { "epoch": 0.8554420013075944, "grad_norm": 0.19285406133223637, "learning_rate": 1.2460231009491908e-06, "loss": 0.6946, "step": 6215 }, { "epoch": 0.8561302088709954, "grad_norm": 0.18365962251758106, "learning_rate": 1.2344343005898463e-06, "loss": 0.7181, "step": 6220 }, { "epoch": 0.8568184164343966, "grad_norm": 0.162094985688185, "learning_rate": 1.2228960978007332e-06, "loss": 0.6182, "step": 6225 }, { "epoch": 0.8575066239977978, "grad_norm": 0.15288921494935956, "learning_rate": 1.2114085591839397e-06, "loss": 0.6366, "step": 6230 }, { "epoch": 0.8581948315611989, "grad_norm": 0.18982870841461344, "learning_rate": 1.1999717510491127e-06, "loss": 0.7524, "step": 6235 }, { "epoch": 0.8588830391246, "grad_norm": 0.20583041197106247, "learning_rate": 1.188585739413064e-06, "loss": 0.6649, "step": 6240 }, { "epoch": 0.8595712466880011, "grad_norm": 0.19843612495824275, "learning_rate": 1.1772505899993925e-06, "loss": 0.7065, "step": 6245 }, { "epoch": 0.8602594542514023, "grad_norm": 0.20158218667621194, "learning_rate": 1.165966368238105e-06, "loss": 0.7035, "step": 6250 }, { "epoch": 0.8609476618148033, "grad_norm": 0.1909710500766969, "learning_rate": 1.154733139265236e-06, "loss": 0.7486, "step": 6255 }, { "epoch": 0.8616358693782045, "grad_norm": 0.20476523868239638, "learning_rate": 1.1435509679224776e-06, "loss": 0.7214, "step": 6260 }, { "epoch": 0.8623240769416056, "grad_norm": 0.1860272202174298, "learning_rate": 1.1324199187567964e-06, "loss": 0.654, "step": 6265 }, { "epoch": 0.8630122845050067, "grad_norm": 0.2041057820938216, "learning_rate": 1.1213400560200728e-06, "loss": 0.6691, "step": 6270 }, { "epoch": 0.8637004920684078, "grad_norm": 0.1863544558125701, "learning_rate": 1.1103114436687135e-06, "loss": 0.6234, "step": 6275 }, { "epoch": 0.864388699631809, "grad_norm": 0.2107834963082916, "learning_rate": 1.0993341453633e-06, "loss": 0.7096, "step": 6280 }, { "epoch": 0.86507690719521, "grad_norm": 0.20679301657000468, "learning_rate": 1.0884082244682114e-06, "loss": 0.6822, "step": 6285 }, { "epoch": 0.8657651147586112, "grad_norm": 0.19368671507159313, "learning_rate": 1.077533744051259e-06, "loss": 0.6981, "step": 6290 }, { "epoch": 0.8664533223220123, "grad_norm": 0.18151674745654697, "learning_rate": 1.0667107668833243e-06, "loss": 0.6323, "step": 6295 }, { "epoch": 0.8671415298854135, "grad_norm": 0.16978903333779766, "learning_rate": 1.0559393554379983e-06, "loss": 0.6777, "step": 6300 }, { "epoch": 0.8678297374488145, "grad_norm": 0.16951239541226334, "learning_rate": 1.0452195718912105e-06, "loss": 0.655, "step": 6305 }, { "epoch": 0.8685179450122157, "grad_norm": 0.18882103386757843, "learning_rate": 1.0345514781208898e-06, "loss": 0.7187, "step": 6310 }, { "epoch": 0.8692061525756168, "grad_norm": 0.19035840082949337, "learning_rate": 1.0239351357065885e-06, "loss": 0.6409, "step": 6315 }, { "epoch": 0.8698943601390179, "grad_norm": 0.20125402996443767, "learning_rate": 1.013370605929137e-06, "loss": 0.6427, "step": 6320 }, { "epoch": 0.870582567702419, "grad_norm": 0.20400960940090926, "learning_rate": 1.0028579497702818e-06, "loss": 0.6938, "step": 6325 }, { "epoch": 0.8712707752658202, "grad_norm": 0.19907941931796957, "learning_rate": 9.923972279123428e-07, "loss": 0.6857, "step": 6330 }, { "epoch": 0.8719589828292212, "grad_norm": 0.18413039953650398, "learning_rate": 9.819885007378615e-07, "loss": 0.6362, "step": 6335 }, { "epoch": 0.8726471903926224, "grad_norm": 0.16258462645287747, "learning_rate": 9.716318283292447e-07, "loss": 0.636, "step": 6340 }, { "epoch": 0.8733353979560236, "grad_norm": 0.1786953710544283, "learning_rate": 9.613272704684273e-07, "loss": 0.7077, "step": 6345 }, { "epoch": 0.8740236055194247, "grad_norm": 0.18012875740599288, "learning_rate": 9.510748866365183e-07, "loss": 0.7114, "step": 6350 }, { "epoch": 0.8747118130828258, "grad_norm": 0.22049669700740293, "learning_rate": 9.408747360134651e-07, "loss": 0.6872, "step": 6355 }, { "epoch": 0.8754000206462269, "grad_norm": 0.19749734769136157, "learning_rate": 9.307268774777056e-07, "loss": 0.6818, "step": 6360 }, { "epoch": 0.8760882282096281, "grad_norm": 0.17760509789940265, "learning_rate": 9.20631369605841e-07, "loss": 0.6447, "step": 6365 }, { "epoch": 0.8767764357730291, "grad_norm": 0.1991207688959761, "learning_rate": 9.105882706722758e-07, "loss": 0.6912, "step": 6370 }, { "epoch": 0.8774646433364303, "grad_norm": 0.21097294713534187, "learning_rate": 9.00597638648899e-07, "loss": 0.7105, "step": 6375 }, { "epoch": 0.8781528508998314, "grad_norm": 0.19655256859708284, "learning_rate": 8.906595312047472e-07, "loss": 0.6662, "step": 6380 }, { "epoch": 0.8788410584632325, "grad_norm": 0.2026459458747689, "learning_rate": 8.807740057056635e-07, "loss": 0.6694, "step": 6385 }, { "epoch": 0.8795292660266336, "grad_norm": 0.21403689149263802, "learning_rate": 8.709411192139761e-07, "loss": 0.6492, "step": 6390 }, { "epoch": 0.8802174735900348, "grad_norm": 0.19175768516542946, "learning_rate": 8.611609284881639e-07, "loss": 0.688, "step": 6395 }, { "epoch": 0.8809056811534359, "grad_norm": 0.19919807171004464, "learning_rate": 8.514334899825249e-07, "loss": 0.6803, "step": 6400 }, { "epoch": 0.8809056811534359, "eval_loss": 0.6911298632621765, "eval_runtime": 52.7445, "eval_samples_per_second": 94.797, "eval_steps_per_second": 2.977, "step": 6400 }, { "epoch": 0.881593888716837, "grad_norm": 0.21643870973799728, "learning_rate": 8.417588598468596e-07, "loss": 0.626, "step": 6405 }, { "epoch": 0.8822820962802381, "grad_norm": 0.17701022731305052, "learning_rate": 8.321370939261409e-07, "loss": 0.6871, "step": 6410 }, { "epoch": 0.8829703038436393, "grad_norm": 0.18188616853651765, "learning_rate": 8.225682477601915e-07, "loss": 0.6923, "step": 6415 }, { "epoch": 0.8836585114070403, "grad_norm": 0.18907905658960644, "learning_rate": 8.130523765833687e-07, "loss": 0.7047, "step": 6420 }, { "epoch": 0.8843467189704415, "grad_norm": 0.1950727575852637, "learning_rate": 8.035895353242373e-07, "loss": 0.6748, "step": 6425 }, { "epoch": 0.8850349265338426, "grad_norm": 0.20230498327343824, "learning_rate": 7.941797786052607e-07, "loss": 0.702, "step": 6430 }, { "epoch": 0.8857231340972437, "grad_norm": 0.20696629408486375, "learning_rate": 7.848231607424783e-07, "loss": 0.6371, "step": 6435 }, { "epoch": 0.8864113416606448, "grad_norm": 0.22710417910591632, "learning_rate": 7.755197357451982e-07, "loss": 0.6656, "step": 6440 }, { "epoch": 0.887099549224046, "grad_norm": 0.19978432846550875, "learning_rate": 7.662695573156809e-07, "loss": 0.7537, "step": 6445 }, { "epoch": 0.8877877567874471, "grad_norm": 0.1697109855259628, "learning_rate": 7.570726788488314e-07, "loss": 0.6393, "step": 6450 }, { "epoch": 0.8884759643508482, "grad_norm": 0.18052334545234516, "learning_rate": 7.479291534318922e-07, "loss": 0.659, "step": 6455 }, { "epoch": 0.8891641719142493, "grad_norm": 0.20042489452307366, "learning_rate": 7.388390338441342e-07, "loss": 0.6693, "step": 6460 }, { "epoch": 0.8898523794776505, "grad_norm": 0.19628749047358537, "learning_rate": 7.298023725565561e-07, "loss": 0.6807, "step": 6465 }, { "epoch": 0.8905405870410515, "grad_norm": 0.2005825890371119, "learning_rate": 7.208192217315702e-07, "loss": 0.6813, "step": 6470 }, { "epoch": 0.8912287946044527, "grad_norm": 0.1872098030236323, "learning_rate": 7.11889633222721e-07, "loss": 0.7146, "step": 6475 }, { "epoch": 0.8919170021678539, "grad_norm": 0.16785271394173304, "learning_rate": 7.030136585743674e-07, "loss": 0.6143, "step": 6480 }, { "epoch": 0.8926052097312549, "grad_norm": 0.1794374046192592, "learning_rate": 6.941913490213959e-07, "loss": 0.6515, "step": 6485 }, { "epoch": 0.8932934172946561, "grad_norm": 0.1785915158513817, "learning_rate": 6.854227554889193e-07, "loss": 0.6934, "step": 6490 }, { "epoch": 0.8939816248580572, "grad_norm": 0.17809671340570252, "learning_rate": 6.76707928591983e-07, "loss": 0.6414, "step": 6495 }, { "epoch": 0.8946698324214584, "grad_norm": 0.21905226255033425, "learning_rate": 6.6804691863528e-07, "loss": 0.7303, "step": 6500 }, { "epoch": 0.8953580399848594, "grad_norm": 0.16677700737565374, "learning_rate": 6.594397756128501e-07, "loss": 0.6551, "step": 6505 }, { "epoch": 0.8960462475482606, "grad_norm": 0.1970460179046081, "learning_rate": 6.508865492078009e-07, "loss": 0.6484, "step": 6510 }, { "epoch": 0.8967344551116617, "grad_norm": 0.19823634456314093, "learning_rate": 6.423872887920135e-07, "loss": 0.7088, "step": 6515 }, { "epoch": 0.8974226626750628, "grad_norm": 0.19267509960616974, "learning_rate": 6.339420434258603e-07, "loss": 0.6361, "step": 6520 }, { "epoch": 0.8981108702384639, "grad_norm": 0.22726394438336617, "learning_rate": 6.25550861857921e-07, "loss": 0.6375, "step": 6525 }, { "epoch": 0.8987990778018651, "grad_norm": 0.19330414195598447, "learning_rate": 6.17213792524708e-07, "loss": 0.6574, "step": 6530 }, { "epoch": 0.8994872853652661, "grad_norm": 0.19755770664510916, "learning_rate": 6.089308835503782e-07, "loss": 0.6802, "step": 6535 }, { "epoch": 0.9001754929286673, "grad_norm": 0.19460484699314204, "learning_rate": 6.007021827464532e-07, "loss": 0.6546, "step": 6540 }, { "epoch": 0.9008637004920684, "grad_norm": 0.1819407843803622, "learning_rate": 5.925277376115534e-07, "loss": 0.66, "step": 6545 }, { "epoch": 0.9015519080554695, "grad_norm": 0.21057590092865455, "learning_rate": 5.84407595331119e-07, "loss": 0.632, "step": 6550 }, { "epoch": 0.9022401156188706, "grad_norm": 0.1826156547255721, "learning_rate": 5.763418027771339e-07, "loss": 0.6531, "step": 6555 }, { "epoch": 0.9029283231822718, "grad_norm": 0.18698887503948985, "learning_rate": 5.683304065078621e-07, "loss": 0.6709, "step": 6560 }, { "epoch": 0.9036165307456729, "grad_norm": 0.17931482869154056, "learning_rate": 5.603734527675697e-07, "loss": 0.6862, "step": 6565 }, { "epoch": 0.904304738309074, "grad_norm": 0.18936441375844643, "learning_rate": 5.52470987486271e-07, "loss": 0.6701, "step": 6570 }, { "epoch": 0.9049929458724751, "grad_norm": 0.18651568277176628, "learning_rate": 5.446230562794508e-07, "loss": 0.7286, "step": 6575 }, { "epoch": 0.9056811534358763, "grad_norm": 0.19949968033129167, "learning_rate": 5.368297044478076e-07, "loss": 0.6131, "step": 6580 }, { "epoch": 0.9063693609992773, "grad_norm": 0.2496827718958403, "learning_rate": 5.290909769769914e-07, "loss": 0.6952, "step": 6585 }, { "epoch": 0.9070575685626785, "grad_norm": 0.19189482408115774, "learning_rate": 5.214069185373427e-07, "loss": 0.6603, "step": 6590 }, { "epoch": 0.9077457761260797, "grad_norm": 0.19420868915508518, "learning_rate": 5.137775734836359e-07, "loss": 0.6614, "step": 6595 }, { "epoch": 0.9084339836894807, "grad_norm": 0.1815424218356865, "learning_rate": 5.062029858548223e-07, "loss": 0.703, "step": 6600 }, { "epoch": 0.9084339836894807, "eval_loss": 0.6905099749565125, "eval_runtime": 52.749, "eval_samples_per_second": 94.788, "eval_steps_per_second": 2.976, "step": 6600 }, { "epoch": 0.9091221912528819, "grad_norm": 0.17101733543452816, "learning_rate": 4.986831993737762e-07, "loss": 0.6138, "step": 6605 }, { "epoch": 0.909810398816283, "grad_norm": 0.19925363933024076, "learning_rate": 4.912182574470459e-07, "loss": 0.731, "step": 6610 }, { "epoch": 0.9104986063796842, "grad_norm": 0.1865909484763774, "learning_rate": 4.838082031645907e-07, "loss": 0.6268, "step": 6615 }, { "epoch": 0.9111868139430852, "grad_norm": 0.2113879867747988, "learning_rate": 4.764530792995525e-07, "loss": 0.6984, "step": 6620 }, { "epoch": 0.9118750215064864, "grad_norm": 0.20594059346411567, "learning_rate": 4.69152928307991e-07, "loss": 0.7813, "step": 6625 }, { "epoch": 0.9125632290698875, "grad_norm": 0.18306193340262086, "learning_rate": 4.619077923286475e-07, "loss": 0.6712, "step": 6630 }, { "epoch": 0.9132514366332886, "grad_norm": 0.1745697056598823, "learning_rate": 4.5471771318269855e-07, "loss": 0.7136, "step": 6635 }, { "epoch": 0.9139396441966897, "grad_norm": 0.20712851761406995, "learning_rate": 4.475827323735171e-07, "loss": 0.7147, "step": 6640 }, { "epoch": 0.9146278517600909, "grad_norm": 0.18362212868310707, "learning_rate": 4.405028910864295e-07, "loss": 0.6623, "step": 6645 }, { "epoch": 0.9153160593234919, "grad_norm": 0.19132472308778523, "learning_rate": 4.334782301884821e-07, "loss": 0.6751, "step": 6650 }, { "epoch": 0.9160042668868931, "grad_norm": 0.19652744213699724, "learning_rate": 4.265087902282028e-07, "loss": 0.6657, "step": 6655 }, { "epoch": 0.9166924744502942, "grad_norm": 0.21365771611324716, "learning_rate": 4.195946114353611e-07, "loss": 0.6578, "step": 6660 }, { "epoch": 0.9173806820136954, "grad_norm": 0.18479577847246767, "learning_rate": 4.127357337207483e-07, "loss": 0.6718, "step": 6665 }, { "epoch": 0.9180688895770964, "grad_norm": 0.2083525090133303, "learning_rate": 4.0593219667594e-07, "loss": 0.6767, "step": 6670 }, { "epoch": 0.9187570971404976, "grad_norm": 0.20910176522055363, "learning_rate": 3.991840395730673e-07, "loss": 0.6643, "step": 6675 }, { "epoch": 0.9194453047038987, "grad_norm": 0.22251660130913503, "learning_rate": 3.9249130136459035e-07, "loss": 0.7031, "step": 6680 }, { "epoch": 0.9201335122672998, "grad_norm": 0.18704860148654195, "learning_rate": 3.85854020683073e-07, "loss": 0.6539, "step": 6685 }, { "epoch": 0.9208217198307009, "grad_norm": 0.19739992875859028, "learning_rate": 3.7927223584096283e-07, "loss": 0.6507, "step": 6690 }, { "epoch": 0.9215099273941021, "grad_norm": 0.1735761889177779, "learning_rate": 3.727459848303716e-07, "loss": 0.6233, "step": 6695 }, { "epoch": 0.9221981349575031, "grad_norm": 0.18569425105118043, "learning_rate": 3.662753053228463e-07, "loss": 0.7093, "step": 6700 }, { "epoch": 0.9228863425209043, "grad_norm": 0.20364562171705752, "learning_rate": 3.5986023466916153e-07, "loss": 0.7127, "step": 6705 }, { "epoch": 0.9235745500843054, "grad_norm": 0.17768079314695193, "learning_rate": 3.5350080989909774e-07, "loss": 0.6237, "step": 6710 }, { "epoch": 0.9242627576477066, "grad_norm": 0.1911830618531974, "learning_rate": 3.471970677212344e-07, "loss": 0.7143, "step": 6715 }, { "epoch": 0.9249509652111076, "grad_norm": 0.17442866634356657, "learning_rate": 3.4094904452272814e-07, "loss": 0.7098, "step": 6720 }, { "epoch": 0.9256391727745088, "grad_norm": 0.21474292824253677, "learning_rate": 3.3475677636911284e-07, "loss": 0.6698, "step": 6725 }, { "epoch": 0.92632738033791, "grad_norm": 0.20564317915563196, "learning_rate": 3.286202990040865e-07, "loss": 0.66, "step": 6730 }, { "epoch": 0.927015587901311, "grad_norm": 0.19320724772827405, "learning_rate": 3.225396478493015e-07, "loss": 0.6349, "step": 6735 }, { "epoch": 0.9277037954647122, "grad_norm": 0.1946135654990813, "learning_rate": 3.165148580041688e-07, "loss": 0.6825, "step": 6740 }, { "epoch": 0.9283920030281133, "grad_norm": 0.20447432035066396, "learning_rate": 3.1054596424564544e-07, "loss": 0.6665, "step": 6745 }, { "epoch": 0.9290802105915144, "grad_norm": 0.2157462186386394, "learning_rate": 3.046330010280463e-07, "loss": 0.6548, "step": 6750 }, { "epoch": 0.9297684181549155, "grad_norm": 0.20965616696713615, "learning_rate": 2.98776002482829e-07, "loss": 0.6944, "step": 6755 }, { "epoch": 0.9304566257183167, "grad_norm": 0.19003138729116303, "learning_rate": 2.9297500241841304e-07, "loss": 0.6547, "step": 6760 }, { "epoch": 0.9311448332817178, "grad_norm": 0.2173501983150109, "learning_rate": 2.8723003431997186e-07, "loss": 0.6876, "step": 6765 }, { "epoch": 0.9318330408451189, "grad_norm": 0.19269808218592221, "learning_rate": 2.815411313492489e-07, "loss": 0.6388, "step": 6770 }, { "epoch": 0.93252124840852, "grad_norm": 0.17933033480414845, "learning_rate": 2.759083263443607e-07, "loss": 0.6676, "step": 6775 }, { "epoch": 0.9332094559719212, "grad_norm": 0.21259770206985748, "learning_rate": 2.703316518196075e-07, "loss": 0.6876, "step": 6780 }, { "epoch": 0.9338976635353222, "grad_norm": 0.20520334522376352, "learning_rate": 2.648111399652886e-07, "loss": 0.6865, "step": 6785 }, { "epoch": 0.9345858710987234, "grad_norm": 0.19367706984263186, "learning_rate": 2.59346822647516e-07, "loss": 0.6668, "step": 6790 }, { "epoch": 0.9352740786621245, "grad_norm": 0.1683009312669201, "learning_rate": 2.539387314080266e-07, "loss": 0.6203, "step": 6795 }, { "epoch": 0.9359622862255256, "grad_norm": 0.2131385076640653, "learning_rate": 2.4858689746400486e-07, "loss": 0.6819, "step": 6800 }, { "epoch": 0.9359622862255256, "eval_loss": 0.6900854706764221, "eval_runtime": 52.7524, "eval_samples_per_second": 94.782, "eval_steps_per_second": 2.976, "step": 6800 }, { "epoch": 0.9366504937889267, "grad_norm": 0.17754659855601151, "learning_rate": 2.432913517078994e-07, "loss": 0.7109, "step": 6805 }, { "epoch": 0.9373387013523279, "grad_norm": 0.19288247407024475, "learning_rate": 2.380521247072476e-07, "loss": 0.6246, "step": 6810 }, { "epoch": 0.9380269089157289, "grad_norm": 0.19468570482909206, "learning_rate": 2.3286924670449574e-07, "loss": 0.7023, "step": 6815 }, { "epoch": 0.9387151164791301, "grad_norm": 0.189448016964437, "learning_rate": 2.2774274761682812e-07, "loss": 0.619, "step": 6820 }, { "epoch": 0.9394033240425312, "grad_norm": 0.19851015804951902, "learning_rate": 2.226726570359894e-07, "loss": 0.6459, "step": 6825 }, { "epoch": 0.9400915316059324, "grad_norm": 0.1719839965929755, "learning_rate": 2.1765900422812015e-07, "loss": 0.6718, "step": 6830 }, { "epoch": 0.9407797391693334, "grad_norm": 0.19855118421955434, "learning_rate": 2.127018181335816e-07, "loss": 0.7278, "step": 6835 }, { "epoch": 0.9414679467327346, "grad_norm": 0.1701142207483371, "learning_rate": 2.0780112736679346e-07, "loss": 0.7314, "step": 6840 }, { "epoch": 0.9421561542961358, "grad_norm": 0.183630965826083, "learning_rate": 2.0295696021606637e-07, "loss": 0.6569, "step": 6845 }, { "epoch": 0.9428443618595368, "grad_norm": 0.1951289001602098, "learning_rate": 1.9816934464344073e-07, "loss": 0.692, "step": 6850 }, { "epoch": 0.943532569422938, "grad_norm": 0.2019302772631312, "learning_rate": 1.9343830828451815e-07, "loss": 0.6234, "step": 6855 }, { "epoch": 0.9442207769863391, "grad_norm": 0.211013104974058, "learning_rate": 1.8876387844831478e-07, "loss": 0.6768, "step": 6860 }, { "epoch": 0.9449089845497401, "grad_norm": 0.21109940935157245, "learning_rate": 1.841460821170926e-07, "loss": 0.6973, "step": 6865 }, { "epoch": 0.9455971921131413, "grad_norm": 0.1945081194318705, "learning_rate": 1.7958494594620623e-07, "loss": 0.6892, "step": 6870 }, { "epoch": 0.9462853996765425, "grad_norm": 0.1932171430056548, "learning_rate": 1.7508049626395074e-07, "loss": 0.6267, "step": 6875 }, { "epoch": 0.9469736072399436, "grad_norm": 0.21983214606780344, "learning_rate": 1.7063275907141185e-07, "loss": 0.7088, "step": 6880 }, { "epoch": 0.9476618148033447, "grad_norm": 0.16707301863515253, "learning_rate": 1.6624176004231053e-07, "loss": 0.6519, "step": 6885 }, { "epoch": 0.9483500223667458, "grad_norm": 0.17763470116963997, "learning_rate": 1.6190752452285964e-07, "loss": 0.689, "step": 6890 }, { "epoch": 0.949038229930147, "grad_norm": 0.18490997791891867, "learning_rate": 1.5763007753161196e-07, "loss": 0.6402, "step": 6895 }, { "epoch": 0.949726437493548, "grad_norm": 0.1780927945355048, "learning_rate": 1.5340944375932254e-07, "loss": 0.6955, "step": 6900 }, { "epoch": 0.9504146450569492, "grad_norm": 0.1992464781496288, "learning_rate": 1.492456475688009e-07, "loss": 0.6797, "step": 6905 }, { "epoch": 0.9511028526203503, "grad_norm": 0.1756378952839318, "learning_rate": 1.4513871299477233e-07, "loss": 0.702, "step": 6910 }, { "epoch": 0.9517910601837514, "grad_norm": 0.19713697789455303, "learning_rate": 1.410886637437414e-07, "loss": 0.715, "step": 6915 }, { "epoch": 0.9524792677471525, "grad_norm": 0.16926031485229537, "learning_rate": 1.37095523193852e-07, "loss": 0.6343, "step": 6920 }, { "epoch": 0.9531674753105537, "grad_norm": 0.19068058948060032, "learning_rate": 1.3315931439475183e-07, "loss": 0.706, "step": 6925 }, { "epoch": 0.9538556828739548, "grad_norm": 0.17164109313592418, "learning_rate": 1.2928006006746153e-07, "loss": 0.6583, "step": 6930 }, { "epoch": 0.9545438904373559, "grad_norm": 0.19841252779671575, "learning_rate": 1.2545778260424245e-07, "loss": 0.6334, "step": 6935 }, { "epoch": 0.955232098000757, "grad_norm": 0.20770310379343845, "learning_rate": 1.2169250406847023e-07, "loss": 0.6396, "step": 6940 }, { "epoch": 0.9559203055641582, "grad_norm": 0.19076306656540537, "learning_rate": 1.1798424619450244e-07, "loss": 0.6838, "step": 6945 }, { "epoch": 0.9566085131275592, "grad_norm": 0.18865907141313035, "learning_rate": 1.1433303038755561e-07, "loss": 0.7372, "step": 6950 }, { "epoch": 0.9572967206909604, "grad_norm": 0.19403743680491373, "learning_rate": 1.1073887772358183e-07, "loss": 0.6803, "step": 6955 }, { "epoch": 0.9579849282543615, "grad_norm": 0.18717828198442052, "learning_rate": 1.0720180894914888e-07, "loss": 0.669, "step": 6960 }, { "epoch": 0.9586731358177626, "grad_norm": 0.19191467557662562, "learning_rate": 1.0372184448131706e-07, "loss": 0.67, "step": 6965 }, { "epoch": 0.9593613433811637, "grad_norm": 0.2010238062573217, "learning_rate": 1.0029900440752361e-07, "loss": 0.6937, "step": 6970 }, { "epoch": 0.9600495509445649, "grad_norm": 0.1680493780922707, "learning_rate": 9.693330848546512e-08, "loss": 0.6387, "step": 6975 }, { "epoch": 0.960737758507966, "grad_norm": 0.18581469117207006, "learning_rate": 9.362477614298426e-08, "loss": 0.6607, "step": 6980 }, { "epoch": 0.9614259660713671, "grad_norm": 0.1973931447761628, "learning_rate": 9.037342647795877e-08, "loss": 0.7274, "step": 6985 }, { "epoch": 0.9621141736347683, "grad_norm": 0.17678271236644427, "learning_rate": 8.717927825818928e-08, "loss": 0.6251, "step": 6990 }, { "epoch": 0.9628023811981694, "grad_norm": 0.18383474785092063, "learning_rate": 8.404234992129279e-08, "loss": 0.7032, "step": 6995 }, { "epoch": 0.9634905887615705, "grad_norm": 0.18369664157259255, "learning_rate": 8.096265957459493e-08, "loss": 0.6327, "step": 7000 }, { "epoch": 0.9634905887615705, "eval_loss": 0.6899093985557556, "eval_runtime": 52.7425, "eval_samples_per_second": 94.8, "eval_steps_per_second": 2.977, "step": 7000 }, { "epoch": 0.9641787963249716, "grad_norm": 0.18191350194659797, "learning_rate": 7.794022499502563e-08, "loss": 0.6432, "step": 7005 }, { "epoch": 0.9648670038883728, "grad_norm": 0.18229234936294433, "learning_rate": 7.497506362901585e-08, "loss": 0.7294, "step": 7010 }, { "epoch": 0.9655552114517738, "grad_norm": 0.20954215606079155, "learning_rate": 7.206719259240103e-08, "loss": 0.6883, "step": 7015 }, { "epoch": 0.966243419015175, "grad_norm": 0.1924993668448291, "learning_rate": 6.921662867031443e-08, "loss": 0.6898, "step": 7020 }, { "epoch": 0.9669316265785761, "grad_norm": 0.15859297501125322, "learning_rate": 6.642338831709838e-08, "loss": 0.69, "step": 7025 }, { "epoch": 0.9676198341419773, "grad_norm": 0.1871948387257139, "learning_rate": 6.368748765620436e-08, "loss": 0.6566, "step": 7030 }, { "epoch": 0.9683080417053783, "grad_norm": 0.18218137504098394, "learning_rate": 6.100894248010192e-08, "loss": 0.6708, "step": 7035 }, { "epoch": 0.9689962492687795, "grad_norm": 0.1974957390473882, "learning_rate": 5.838776825018655e-08, "loss": 0.6309, "step": 7040 }, { "epoch": 0.9696844568321806, "grad_norm": 0.1719590216131498, "learning_rate": 5.582398009668977e-08, "loss": 0.6822, "step": 7045 }, { "epoch": 0.9703726643955817, "grad_norm": 0.1772195220819184, "learning_rate": 5.331759281859361e-08, "loss": 0.7284, "step": 7050 }, { "epoch": 0.9710608719589828, "grad_norm": 0.20302507492542882, "learning_rate": 5.086862088354627e-08, "loss": 0.6511, "step": 7055 }, { "epoch": 0.971749079522384, "grad_norm": 0.1752463060462393, "learning_rate": 4.8477078427774385e-08, "loss": 0.642, "step": 7060 }, { "epoch": 0.972437287085785, "grad_norm": 0.19159616562708437, "learning_rate": 4.6142979256004196e-08, "loss": 0.6604, "step": 7065 }, { "epoch": 0.9731254946491862, "grad_norm": 0.1823934026913009, "learning_rate": 4.386633684138164e-08, "loss": 0.687, "step": 7070 }, { "epoch": 0.9738137022125873, "grad_norm": 0.17640973716281508, "learning_rate": 4.164716432539462e-08, "loss": 0.6702, "step": 7075 }, { "epoch": 0.9745019097759884, "grad_norm": 0.18376261012864936, "learning_rate": 3.948547451779749e-08, "loss": 0.6398, "step": 7080 }, { "epoch": 0.9751901173393895, "grad_norm": 0.21941249009506197, "learning_rate": 3.738127989653562e-08, "loss": 0.7264, "step": 7085 }, { "epoch": 0.9758783249027907, "grad_norm": 0.18534716563656556, "learning_rate": 3.533459260767314e-08, "loss": 0.6601, "step": 7090 }, { "epoch": 0.9765665324661919, "grad_norm": 0.20545828772642993, "learning_rate": 3.3345424465326405e-08, "loss": 0.6586, "step": 7095 }, { "epoch": 0.9772547400295929, "grad_norm": 0.1977325801206686, "learning_rate": 3.1413786951592915e-08, "loss": 0.6894, "step": 7100 }, { "epoch": 0.977942947592994, "grad_norm": 0.2043738857897438, "learning_rate": 2.953969121648359e-08, "loss": 0.7363, "step": 7105 }, { "epoch": 0.9786311551563952, "grad_norm": 0.19046606729466464, "learning_rate": 2.7723148077862806e-08, "loss": 0.6598, "step": 7110 }, { "epoch": 0.9793193627197962, "grad_norm": 0.21151630559997697, "learning_rate": 2.5964168021381796e-08, "loss": 0.684, "step": 7115 }, { "epoch": 0.9800075702831974, "grad_norm": 0.19591926839600343, "learning_rate": 2.4262761200419814e-08, "loss": 0.7376, "step": 7120 }, { "epoch": 0.9806957778465986, "grad_norm": 0.18426320682509564, "learning_rate": 2.2618937436026388e-08, "loss": 0.735, "step": 7125 }, { "epoch": 0.9813839854099996, "grad_norm": 0.23245603425571917, "learning_rate": 2.1032706216863595e-08, "loss": 0.7066, "step": 7130 }, { "epoch": 0.9820721929734008, "grad_norm": 0.1966410657504021, "learning_rate": 1.9504076699150555e-08, "loss": 0.7208, "step": 7135 }, { "epoch": 0.9827604005368019, "grad_norm": 0.22528131378286567, "learning_rate": 1.8033057706612343e-08, "loss": 0.7174, "step": 7140 }, { "epoch": 0.9834486081002031, "grad_norm": 0.19405011626017643, "learning_rate": 1.661965773042895e-08, "loss": 0.7454, "step": 7145 }, { "epoch": 0.9841368156636041, "grad_norm": 0.2240157200792808, "learning_rate": 1.526388492918196e-08, "loss": 0.695, "step": 7150 }, { "epoch": 0.9848250232270053, "grad_norm": 0.20111810729611992, "learning_rate": 1.396574712881571e-08, "loss": 0.7192, "step": 7155 }, { "epoch": 0.9855132307904064, "grad_norm": 0.19526832015872803, "learning_rate": 1.2725251822583995e-08, "loss": 0.6525, "step": 7160 }, { "epoch": 0.9862014383538075, "grad_norm": 0.18255652028436334, "learning_rate": 1.1542406171012321e-08, "loss": 0.7702, "step": 7165 }, { "epoch": 0.9868896459172086, "grad_norm": 0.19391051530711373, "learning_rate": 1.0417217001852386e-08, "loss": 0.7028, "step": 7170 }, { "epoch": 0.9875778534806098, "grad_norm": 0.17167920024663555, "learning_rate": 9.349690810046552e-09, "loss": 0.6775, "step": 7175 }, { "epoch": 0.9882660610440108, "grad_norm": 0.20021224983436853, "learning_rate": 8.339833757686766e-09, "loss": 0.6791, "step": 7180 }, { "epoch": 0.988954268607412, "grad_norm": 0.19065130159524532, "learning_rate": 7.387651673983476e-09, "loss": 0.6548, "step": 7185 }, { "epoch": 0.9896424761708131, "grad_norm": 0.22283254908062175, "learning_rate": 6.493150055227881e-09, "loss": 0.696, "step": 7190 }, { "epoch": 0.9903306837342143, "grad_norm": 0.18654171096727787, "learning_rate": 5.656334064759739e-09, "loss": 0.69, "step": 7195 }, { "epoch": 0.9910188912976153, "grad_norm": 0.17831896622624438, "learning_rate": 4.877208532941824e-09, "loss": 0.6685, "step": 7200 }, { "epoch": 0.9910188912976153, "eval_loss": 0.6898561120033264, "eval_runtime": 52.7317, "eval_samples_per_second": 94.82, "eval_steps_per_second": 2.977, "step": 7200 }, { "epoch": 0.9917070988610165, "grad_norm": 0.18862677062584648, "learning_rate": 4.155777957128848e-09, "loss": 0.6511, "step": 7205 }, { "epoch": 0.9923953064244176, "grad_norm": 0.18385842528342736, "learning_rate": 3.4920465016419215e-09, "loss": 0.6793, "step": 7210 }, { "epoch": 0.9930835139878187, "grad_norm": 0.20067343979688615, "learning_rate": 2.886017997747459e-09, "loss": 0.6537, "step": 7215 }, { "epoch": 0.9937717215512198, "grad_norm": 0.1785363610886247, "learning_rate": 2.3376959436283153e-09, "loss": 0.7049, "step": 7220 }, { "epoch": 0.994459929114621, "grad_norm": 0.18415606203784593, "learning_rate": 1.8470835043693513e-09, "loss": 0.6706, "step": 7225 }, { "epoch": 0.995148136678022, "grad_norm": 0.1984355345218752, "learning_rate": 1.4141835119396708e-09, "loss": 0.708, "step": 7230 }, { "epoch": 0.9958363442414232, "grad_norm": 0.19461528899232708, "learning_rate": 1.0389984651726358e-09, "loss": 0.6695, "step": 7235 }, { "epoch": 0.9965245518048244, "grad_norm": 0.2019558029320888, "learning_rate": 7.215305297525455e-10, "loss": 0.6881, "step": 7240 }, { "epoch": 0.9972127593682255, "grad_norm": 0.19208860577440173, "learning_rate": 4.6178153820131134e-10, "loss": 0.734, "step": 7245 }, { "epoch": 0.9979009669316266, "grad_norm": 0.15991250745315563, "learning_rate": 2.597529898740181e-10, "loss": 0.6272, "step": 7250 }, { "epoch": 0.9985891744950277, "grad_norm": 0.1972094757548368, "learning_rate": 1.1544605094004901e-10, "loss": 0.6948, "step": 7255 }, { "epoch": 0.9992773820584289, "grad_norm": 0.18914701174560125, "learning_rate": 2.8861554384196356e-11, "loss": 0.6865, "step": 7260 }, { "epoch": 0.9999655896218299, "grad_norm": 0.17008842784129619, "learning_rate": 0.0, "loss": 0.7312, "step": 7265 }, { "epoch": 0.9999655896218299, "step": 7265, "total_flos": 1437402890797056.0, "train_loss": 0.7255030241491708, "train_runtime": 38613.5919, "train_samples_per_second": 24.083, "train_steps_per_second": 0.188 } ], "logging_steps": 5, "max_steps": 7265, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1437402890797056.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }