{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0005646527385659, "eval_steps": 222, "global_step": 443, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002258610954263128, "grad_norm": 0.734693706035614, "learning_rate": 3.3333333333333333e-06, "loss": 2.1698, "step": 1 }, { "epoch": 0.002258610954263128, "eval_loss": 2.083223819732666, "eval_runtime": 98.3201, "eval_samples_per_second": 7.587, "eval_steps_per_second": 0.956, "step": 1 }, { "epoch": 0.004517221908526256, "grad_norm": 0.7944502830505371, "learning_rate": 6.666666666666667e-06, "loss": 2.2383, "step": 2 }, { "epoch": 0.006775832862789385, "grad_norm": 0.9303544759750366, "learning_rate": 1e-05, "loss": 1.952, "step": 3 }, { "epoch": 0.009034443817052512, "grad_norm": 0.9056912660598755, "learning_rate": 1.3333333333333333e-05, "loss": 2.306, "step": 4 }, { "epoch": 0.01129305477131564, "grad_norm": 1.3058993816375732, "learning_rate": 1.6666666666666667e-05, "loss": 2.3384, "step": 5 }, { "epoch": 0.01355166572557877, "grad_norm": 1.2961996793746948, "learning_rate": 2e-05, "loss": 2.3395, "step": 6 }, { "epoch": 0.015810276679841896, "grad_norm": 0.8225818276405334, "learning_rate": 2.3333333333333336e-05, "loss": 1.6852, "step": 7 }, { "epoch": 0.018068887634105024, "grad_norm": 0.7992662787437439, "learning_rate": 2.6666666666666667e-05, "loss": 1.7517, "step": 8 }, { "epoch": 0.020327498588368152, "grad_norm": 1.9861479997634888, "learning_rate": 3e-05, "loss": 2.6289, "step": 9 }, { "epoch": 0.02258610954263128, "grad_norm": 0.720796525478363, "learning_rate": 3.3333333333333335e-05, "loss": 2.3053, "step": 10 }, { "epoch": 0.024844720496894408, "grad_norm": 0.8933535218238831, "learning_rate": 3.6666666666666666e-05, "loss": 2.0577, "step": 11 }, { "epoch": 0.02710333145115754, "grad_norm": 1.1678780317306519, "learning_rate": 4e-05, "loss": 2.1599, "step": 12 }, { "epoch": 0.029361942405420668, "grad_norm": 1.06686270236969, "learning_rate": 4.3333333333333334e-05, "loss": 1.9538, "step": 13 }, { "epoch": 0.03162055335968379, "grad_norm": 1.0956751108169556, "learning_rate": 4.666666666666667e-05, "loss": 1.8666, "step": 14 }, { "epoch": 0.03387916431394692, "grad_norm": 1.3107072114944458, "learning_rate": 5e-05, "loss": 2.0312, "step": 15 }, { "epoch": 0.03613777526821005, "grad_norm": 1.6173886060714722, "learning_rate": 5.333333333333333e-05, "loss": 1.9908, "step": 16 }, { "epoch": 0.038396386222473176, "grad_norm": 1.1099895238876343, "learning_rate": 5.666666666666667e-05, "loss": 1.6063, "step": 17 }, { "epoch": 0.040654997176736304, "grad_norm": 1.4625017642974854, "learning_rate": 6e-05, "loss": 1.6362, "step": 18 }, { "epoch": 0.04291360813099943, "grad_norm": 1.05028235912323, "learning_rate": 6.333333333333333e-05, "loss": 1.7419, "step": 19 }, { "epoch": 0.04517221908526256, "grad_norm": 0.9315014481544495, "learning_rate": 6.666666666666667e-05, "loss": 1.6991, "step": 20 }, { "epoch": 0.04743083003952569, "grad_norm": 1.2721033096313477, "learning_rate": 7e-05, "loss": 1.5631, "step": 21 }, { "epoch": 0.049689440993788817, "grad_norm": 1.578466534614563, "learning_rate": 7.333333333333333e-05, "loss": 1.6748, "step": 22 }, { "epoch": 0.05194805194805195, "grad_norm": 1.692025065422058, "learning_rate": 7.666666666666667e-05, "loss": 1.607, "step": 23 }, { "epoch": 0.05420666290231508, "grad_norm": 1.251129388809204, "learning_rate": 8e-05, "loss": 1.8119, "step": 24 }, { "epoch": 0.05646527385657821, "grad_norm": 1.340306043624878, "learning_rate": 8.333333333333334e-05, "loss": 1.6167, "step": 25 }, { "epoch": 0.058723884810841336, "grad_norm": 1.1570920944213867, "learning_rate": 8.666666666666667e-05, "loss": 1.7172, "step": 26 }, { "epoch": 0.060982495765104464, "grad_norm": 1.0314528942108154, "learning_rate": 9e-05, "loss": 1.8385, "step": 27 }, { "epoch": 0.06324110671936758, "grad_norm": 1.032861351966858, "learning_rate": 9.333333333333334e-05, "loss": 1.3839, "step": 28 }, { "epoch": 0.06549971767363072, "grad_norm": 0.9600235819816589, "learning_rate": 9.666666666666667e-05, "loss": 1.4608, "step": 29 }, { "epoch": 0.06775832862789384, "grad_norm": 0.9947760105133057, "learning_rate": 0.0001, "loss": 1.4875, "step": 30 }, { "epoch": 0.07001693958215698, "grad_norm": 1.2269097566604614, "learning_rate": 9.999855343632036e-05, "loss": 1.6437, "step": 31 }, { "epoch": 0.0722755505364201, "grad_norm": 0.865011990070343, "learning_rate": 9.999421382898329e-05, "loss": 1.6969, "step": 32 }, { "epoch": 0.07453416149068323, "grad_norm": 0.6729279160499573, "learning_rate": 9.998698142908953e-05, "loss": 1.2089, "step": 33 }, { "epoch": 0.07679277244494635, "grad_norm": 1.4308388233184814, "learning_rate": 9.997685665512418e-05, "loss": 1.3744, "step": 34 }, { "epoch": 0.07905138339920949, "grad_norm": 0.582622230052948, "learning_rate": 9.99638400929324e-05, "loss": 1.2607, "step": 35 }, { "epoch": 0.08130999435347261, "grad_norm": 0.5864912867546082, "learning_rate": 9.994793249568569e-05, "loss": 1.3634, "step": 36 }, { "epoch": 0.08356860530773574, "grad_norm": 0.5478243827819824, "learning_rate": 9.99291347838381e-05, "loss": 1.5985, "step": 37 }, { "epoch": 0.08582721626199886, "grad_norm": 0.7038874626159668, "learning_rate": 9.990744804507315e-05, "loss": 1.3606, "step": 38 }, { "epoch": 0.088085827216262, "grad_norm": 0.5414575934410095, "learning_rate": 9.988287353424077e-05, "loss": 1.7192, "step": 39 }, { "epoch": 0.09034443817052512, "grad_norm": 0.5917218327522278, "learning_rate": 9.985541267328477e-05, "loss": 1.5298, "step": 40 }, { "epoch": 0.09260304912478826, "grad_norm": 0.5787907838821411, "learning_rate": 9.98250670511605e-05, "loss": 1.9064, "step": 41 }, { "epoch": 0.09486166007905138, "grad_norm": 0.8571286201477051, "learning_rate": 9.979183842374293e-05, "loss": 1.3743, "step": 42 }, { "epoch": 0.09712027103331451, "grad_norm": 0.7187583446502686, "learning_rate": 9.975572871372513e-05, "loss": 1.3655, "step": 43 }, { "epoch": 0.09937888198757763, "grad_norm": 1.4509934186935425, "learning_rate": 9.971674001050686e-05, "loss": 1.4908, "step": 44 }, { "epoch": 0.10163749294184077, "grad_norm": 0.7309430837631226, "learning_rate": 9.967487457007381e-05, "loss": 1.0838, "step": 45 }, { "epoch": 0.1038961038961039, "grad_norm": 0.7827832102775574, "learning_rate": 9.963013481486703e-05, "loss": 1.5432, "step": 46 }, { "epoch": 0.10615471485036702, "grad_norm": 0.6648978590965271, "learning_rate": 9.958252333364267e-05, "loss": 1.1893, "step": 47 }, { "epoch": 0.10841332580463016, "grad_norm": 1.085577130317688, "learning_rate": 9.953204288132234e-05, "loss": 1.3416, "step": 48 }, { "epoch": 0.11067193675889328, "grad_norm": 1.0411715507507324, "learning_rate": 9.947869637883358e-05, "loss": 1.4301, "step": 49 }, { "epoch": 0.11293054771315642, "grad_norm": 0.6772856116294861, "learning_rate": 9.942248691294093e-05, "loss": 1.1494, "step": 50 }, { "epoch": 0.11518915866741954, "grad_norm": 1.099043846130371, "learning_rate": 9.936341773606723e-05, "loss": 1.4349, "step": 51 }, { "epoch": 0.11744776962168267, "grad_norm": 0.7465451955795288, "learning_rate": 9.930149226610554e-05, "loss": 1.4149, "step": 52 }, { "epoch": 0.11970638057594579, "grad_norm": 0.6757813096046448, "learning_rate": 9.923671408622129e-05, "loss": 1.3782, "step": 53 }, { "epoch": 0.12196499153020893, "grad_norm": 0.6110934019088745, "learning_rate": 9.916908694464492e-05, "loss": 1.5505, "step": 54 }, { "epoch": 0.12422360248447205, "grad_norm": 0.8363707065582275, "learning_rate": 9.909861475445517e-05, "loss": 1.6348, "step": 55 }, { "epoch": 0.12648221343873517, "grad_norm": 0.5607156753540039, "learning_rate": 9.902530159335243e-05, "loss": 1.6425, "step": 56 }, { "epoch": 0.1287408243929983, "grad_norm": 0.6306595802307129, "learning_rate": 9.894915170342295e-05, "loss": 1.2802, "step": 57 }, { "epoch": 0.13099943534726144, "grad_norm": 0.8412261009216309, "learning_rate": 9.887016949089333e-05, "loss": 1.5162, "step": 58 }, { "epoch": 0.13325804630152457, "grad_norm": 1.7483497858047485, "learning_rate": 9.878835952587559e-05, "loss": 1.4394, "step": 59 }, { "epoch": 0.13551665725578768, "grad_norm": 0.6815210580825806, "learning_rate": 9.870372654210265e-05, "loss": 1.4947, "step": 60 }, { "epoch": 0.13777526821005082, "grad_norm": 0.819712221622467, "learning_rate": 9.861627543665456e-05, "loss": 1.8876, "step": 61 }, { "epoch": 0.14003387916431395, "grad_norm": 0.6188887357711792, "learning_rate": 9.852601126967502e-05, "loss": 1.4265, "step": 62 }, { "epoch": 0.1422924901185771, "grad_norm": 0.5934920310974121, "learning_rate": 9.843293926407866e-05, "loss": 1.4272, "step": 63 }, { "epoch": 0.1445511010728402, "grad_norm": 0.554636538028717, "learning_rate": 9.833706480524878e-05, "loss": 1.6095, "step": 64 }, { "epoch": 0.14680971202710333, "grad_norm": 0.545122504234314, "learning_rate": 9.82383934407258e-05, "loss": 1.2429, "step": 65 }, { "epoch": 0.14906832298136646, "grad_norm": 0.674778401851654, "learning_rate": 9.81369308798862e-05, "loss": 1.3112, "step": 66 }, { "epoch": 0.1513269339356296, "grad_norm": 0.4785314202308655, "learning_rate": 9.803268299361217e-05, "loss": 1.2792, "step": 67 }, { "epoch": 0.1535855448898927, "grad_norm": 0.7849199175834656, "learning_rate": 9.7925655813952e-05, "loss": 1.557, "step": 68 }, { "epoch": 0.15584415584415584, "grad_norm": 0.7094368934631348, "learning_rate": 9.781585553377085e-05, "loss": 1.3404, "step": 69 }, { "epoch": 0.15810276679841898, "grad_norm": 0.5156518220901489, "learning_rate": 9.770328850639268e-05, "loss": 1.2939, "step": 70 }, { "epoch": 0.1603613777526821, "grad_norm": 0.6321349143981934, "learning_rate": 9.758796124523239e-05, "loss": 1.684, "step": 71 }, { "epoch": 0.16261998870694522, "grad_norm": 0.8690780401229858, "learning_rate": 9.746988042341906e-05, "loss": 1.2346, "step": 72 }, { "epoch": 0.16487859966120835, "grad_norm": 0.5986551642417908, "learning_rate": 9.734905287340985e-05, "loss": 1.4559, "step": 73 }, { "epoch": 0.1671372106154715, "grad_norm": 0.5338417887687683, "learning_rate": 9.722548558659457e-05, "loss": 1.6537, "step": 74 }, { "epoch": 0.16939582156973462, "grad_norm": 0.7366315126419067, "learning_rate": 9.709918571289114e-05, "loss": 1.3941, "step": 75 }, { "epoch": 0.17165443252399773, "grad_norm": 0.7297102212905884, "learning_rate": 9.697016056033201e-05, "loss": 1.5824, "step": 76 }, { "epoch": 0.17391304347826086, "grad_norm": 0.5532839894294739, "learning_rate": 9.683841759464113e-05, "loss": 1.4119, "step": 77 }, { "epoch": 0.176171654432524, "grad_norm": 0.5008260011672974, "learning_rate": 9.670396443880208e-05, "loss": 1.5714, "step": 78 }, { "epoch": 0.17843026538678713, "grad_norm": 0.7788993716239929, "learning_rate": 9.656680887261693e-05, "loss": 1.4765, "step": 79 }, { "epoch": 0.18068887634105024, "grad_norm": 0.7315067648887634, "learning_rate": 9.64269588322561e-05, "loss": 1.6218, "step": 80 }, { "epoch": 0.18294748729531338, "grad_norm": 0.7663245797157288, "learning_rate": 9.628442240979916e-05, "loss": 1.381, "step": 81 }, { "epoch": 0.1852060982495765, "grad_norm": 0.5851924419403076, "learning_rate": 9.613920785276656e-05, "loss": 1.6193, "step": 82 }, { "epoch": 0.18746470920383965, "grad_norm": 0.8395928144454956, "learning_rate": 9.599132356364247e-05, "loss": 1.3371, "step": 83 }, { "epoch": 0.18972332015810275, "grad_norm": 0.9145395159721375, "learning_rate": 9.584077809938855e-05, "loss": 1.3984, "step": 84 }, { "epoch": 0.1919819311123659, "grad_norm": 0.6220190525054932, "learning_rate": 9.568758017094883e-05, "loss": 1.2248, "step": 85 }, { "epoch": 0.19424054206662902, "grad_norm": 0.5492339730262756, "learning_rate": 9.553173864274567e-05, "loss": 1.4326, "step": 86 }, { "epoch": 0.19649915302089216, "grad_norm": 0.47741296887397766, "learning_rate": 9.537326253216685e-05, "loss": 1.4092, "step": 87 }, { "epoch": 0.19875776397515527, "grad_norm": 0.5220446586608887, "learning_rate": 9.521216100904378e-05, "loss": 1.5837, "step": 88 }, { "epoch": 0.2010163749294184, "grad_norm": 0.5066771507263184, "learning_rate": 9.504844339512095e-05, "loss": 1.4179, "step": 89 }, { "epoch": 0.20327498588368154, "grad_norm": 0.6398409605026245, "learning_rate": 9.488211916351656e-05, "loss": 1.3896, "step": 90 }, { "epoch": 0.20553359683794467, "grad_norm": 0.7091066241264343, "learning_rate": 9.471319793817426e-05, "loss": 1.6861, "step": 91 }, { "epoch": 0.2077922077922078, "grad_norm": 1.067958116531372, "learning_rate": 9.454168949330645e-05, "loss": 1.417, "step": 92 }, { "epoch": 0.2100508187464709, "grad_norm": 0.5348410606384277, "learning_rate": 9.436760375282859e-05, "loss": 1.4303, "step": 93 }, { "epoch": 0.21230942970073405, "grad_norm": 0.5271281599998474, "learning_rate": 9.419095078978506e-05, "loss": 1.2863, "step": 94 }, { "epoch": 0.21456804065499718, "grad_norm": 0.6372822523117065, "learning_rate": 9.40117408257663e-05, "loss": 1.4036, "step": 95 }, { "epoch": 0.21682665160926032, "grad_norm": 0.5258705615997314, "learning_rate": 9.382998423031727e-05, "loss": 1.7694, "step": 96 }, { "epoch": 0.21908526256352343, "grad_norm": 0.5894024968147278, "learning_rate": 9.364569152033756e-05, "loss": 1.7329, "step": 97 }, { "epoch": 0.22134387351778656, "grad_norm": 0.7126629948616028, "learning_rate": 9.345887335947281e-05, "loss": 1.6303, "step": 98 }, { "epoch": 0.2236024844720497, "grad_norm": 0.5408257842063904, "learning_rate": 9.326954055749767e-05, "loss": 1.6237, "step": 99 }, { "epoch": 0.22586109542631283, "grad_norm": 0.673343300819397, "learning_rate": 9.30777040696903e-05, "loss": 1.1694, "step": 100 }, { "epoch": 0.22811970638057594, "grad_norm": 0.6288356781005859, "learning_rate": 9.288337499619857e-05, "loss": 1.3256, "step": 101 }, { "epoch": 0.23037831733483907, "grad_norm": 0.5580915212631226, "learning_rate": 9.268656458139762e-05, "loss": 1.4626, "step": 102 }, { "epoch": 0.2326369282891022, "grad_norm": 0.5487126708030701, "learning_rate": 9.248728421323941e-05, "loss": 1.5227, "step": 103 }, { "epoch": 0.23489553924336534, "grad_norm": 0.712181031703949, "learning_rate": 9.22855454225936e-05, "loss": 1.082, "step": 104 }, { "epoch": 0.23715415019762845, "grad_norm": 0.7502368092536926, "learning_rate": 9.208135988258051e-05, "loss": 1.4804, "step": 105 }, { "epoch": 0.23941276115189158, "grad_norm": 0.9380141496658325, "learning_rate": 9.187473940789557e-05, "loss": 1.4093, "step": 106 }, { "epoch": 0.24167137210615472, "grad_norm": 0.7033969759941101, "learning_rate": 9.166569595412575e-05, "loss": 1.3279, "step": 107 }, { "epoch": 0.24392998306041785, "grad_norm": 0.9007672667503357, "learning_rate": 9.145424161705776e-05, "loss": 1.3571, "step": 108 }, { "epoch": 0.24618859401468096, "grad_norm": 0.555966854095459, "learning_rate": 9.124038863197818e-05, "loss": 1.475, "step": 109 }, { "epoch": 0.2484472049689441, "grad_norm": 0.5234211683273315, "learning_rate": 9.10241493729654e-05, "loss": 1.5519, "step": 110 }, { "epoch": 0.25070581592320723, "grad_norm": 0.5980601906776428, "learning_rate": 9.08055363521738e-05, "loss": 1.5623, "step": 111 }, { "epoch": 0.25296442687747034, "grad_norm": 0.6727840900421143, "learning_rate": 9.058456221910956e-05, "loss": 1.4898, "step": 112 }, { "epoch": 0.2552230378317335, "grad_norm": 0.6955089569091797, "learning_rate": 9.036123975989892e-05, "loss": 1.4752, "step": 113 }, { "epoch": 0.2574816487859966, "grad_norm": 0.6179929971694946, "learning_rate": 9.013558189654819e-05, "loss": 1.4823, "step": 114 }, { "epoch": 0.2597402597402597, "grad_norm": 0.6038258671760559, "learning_rate": 8.990760168619615e-05, "loss": 1.2696, "step": 115 }, { "epoch": 0.2619988706945229, "grad_norm": 0.6415106058120728, "learning_rate": 8.967731232035847e-05, "loss": 1.8538, "step": 116 }, { "epoch": 0.264257481648786, "grad_norm": 0.4856817424297333, "learning_rate": 8.944472712416447e-05, "loss": 1.4395, "step": 117 }, { "epoch": 0.26651609260304915, "grad_norm": 0.5568715929985046, "learning_rate": 8.9209859555586e-05, "loss": 1.2157, "step": 118 }, { "epoch": 0.26877470355731226, "grad_norm": 0.5347578525543213, "learning_rate": 8.897272320465887e-05, "loss": 1.4817, "step": 119 }, { "epoch": 0.27103331451157536, "grad_norm": 0.5750213861465454, "learning_rate": 8.873333179269635e-05, "loss": 1.477, "step": 120 }, { "epoch": 0.2732919254658385, "grad_norm": 0.588683009147644, "learning_rate": 8.849169917149531e-05, "loss": 1.6019, "step": 121 }, { "epoch": 0.27555053642010163, "grad_norm": 0.511512279510498, "learning_rate": 8.82478393225347e-05, "loss": 1.6251, "step": 122 }, { "epoch": 0.27780914737436474, "grad_norm": 0.6532254815101624, "learning_rate": 8.800176635616657e-05, "loss": 1.7525, "step": 123 }, { "epoch": 0.2800677583286279, "grad_norm": 0.6906523108482361, "learning_rate": 8.775349451079948e-05, "loss": 1.3276, "step": 124 }, { "epoch": 0.282326369282891, "grad_norm": 0.7139832973480225, "learning_rate": 8.750303815207486e-05, "loss": 1.6493, "step": 125 }, { "epoch": 0.2845849802371542, "grad_norm": 0.6658667325973511, "learning_rate": 8.725041177203554e-05, "loss": 1.2206, "step": 126 }, { "epoch": 0.2868435911914173, "grad_norm": 0.6301794648170471, "learning_rate": 8.699562998828738e-05, "loss": 1.4198, "step": 127 }, { "epoch": 0.2891022021456804, "grad_norm": 0.5198056697845459, "learning_rate": 8.673870754315336e-05, "loss": 1.3772, "step": 128 }, { "epoch": 0.29136081309994355, "grad_norm": 0.6238622069358826, "learning_rate": 8.647965930282059e-05, "loss": 1.4059, "step": 129 }, { "epoch": 0.29361942405420666, "grad_norm": 0.7875000834465027, "learning_rate": 8.621850025648009e-05, "loss": 1.2813, "step": 130 }, { "epoch": 0.29587803500846976, "grad_norm": 0.8984452486038208, "learning_rate": 8.59552455154595e-05, "loss": 1.7444, "step": 131 }, { "epoch": 0.2981366459627329, "grad_norm": 0.520828902721405, "learning_rate": 8.56899103123487e-05, "loss": 1.3697, "step": 132 }, { "epoch": 0.30039525691699603, "grad_norm": 0.5849810242652893, "learning_rate": 8.54225100001184e-05, "loss": 1.2721, "step": 133 }, { "epoch": 0.3026538678712592, "grad_norm": 0.5514599680900574, "learning_rate": 8.51530600512318e-05, "loss": 1.4966, "step": 134 }, { "epoch": 0.3049124788255223, "grad_norm": 1.624732255935669, "learning_rate": 8.488157605674925e-05, "loss": 1.2903, "step": 135 }, { "epoch": 0.3071710897797854, "grad_norm": 0.5830861926078796, "learning_rate": 8.460807372542618e-05, "loss": 1.2969, "step": 136 }, { "epoch": 0.3094297007340486, "grad_norm": 0.8161193132400513, "learning_rate": 8.43325688828042e-05, "loss": 1.1874, "step": 137 }, { "epoch": 0.3116883116883117, "grad_norm": 0.47100022435188293, "learning_rate": 8.405507747029523e-05, "loss": 1.3229, "step": 138 }, { "epoch": 0.31394692264257484, "grad_norm": 0.4437452554702759, "learning_rate": 8.377561554425922e-05, "loss": 1.1945, "step": 139 }, { "epoch": 0.31620553359683795, "grad_norm": 0.566750705242157, "learning_rate": 8.349419927507505e-05, "loss": 1.4707, "step": 140 }, { "epoch": 0.31846414455110106, "grad_norm": 0.5701163411140442, "learning_rate": 8.321084494620488e-05, "loss": 1.5681, "step": 141 }, { "epoch": 0.3207227555053642, "grad_norm": 0.5442900657653809, "learning_rate": 8.292556895325194e-05, "loss": 1.2291, "step": 142 }, { "epoch": 0.32298136645962733, "grad_norm": 0.5684903860092163, "learning_rate": 8.263838780301182e-05, "loss": 1.3748, "step": 143 }, { "epoch": 0.32523997741389044, "grad_norm": 0.7141373157501221, "learning_rate": 8.234931811251739e-05, "loss": 1.1836, "step": 144 }, { "epoch": 0.3274985883681536, "grad_norm": 0.7630109190940857, "learning_rate": 8.205837660807725e-05, "loss": 1.602, "step": 145 }, { "epoch": 0.3297571993224167, "grad_norm": 0.6189699172973633, "learning_rate": 8.176558012430791e-05, "loss": 1.273, "step": 146 }, { "epoch": 0.33201581027667987, "grad_norm": 0.7228075861930847, "learning_rate": 8.147094560315977e-05, "loss": 1.1563, "step": 147 }, { "epoch": 0.334274421230943, "grad_norm": 0.5667940378189087, "learning_rate": 8.117449009293668e-05, "loss": 1.3525, "step": 148 }, { "epoch": 0.3365330321852061, "grad_norm": 0.4601518213748932, "learning_rate": 8.08762307473096e-05, "loss": 1.2889, "step": 149 }, { "epoch": 0.33879164313946925, "grad_norm": 0.5403670072555542, "learning_rate": 8.057618482432399e-05, "loss": 1.4927, "step": 150 }, { "epoch": 0.34105025409373235, "grad_norm": 0.5279664993286133, "learning_rate": 8.027436968540123e-05, "loss": 1.4293, "step": 151 }, { "epoch": 0.34330886504799546, "grad_norm": 0.517234206199646, "learning_rate": 7.997080279433402e-05, "loss": 1.2646, "step": 152 }, { "epoch": 0.3455674760022586, "grad_norm": 0.6740924119949341, "learning_rate": 7.966550171627592e-05, "loss": 1.3801, "step": 153 }, { "epoch": 0.34782608695652173, "grad_norm": 0.47068697214126587, "learning_rate": 7.9358484116725e-05, "loss": 1.3094, "step": 154 }, { "epoch": 0.3500846979107849, "grad_norm": 0.5425541996955872, "learning_rate": 7.904976776050156e-05, "loss": 1.5867, "step": 155 }, { "epoch": 0.352343308865048, "grad_norm": 0.41730475425720215, "learning_rate": 7.873937051072035e-05, "loss": 1.1469, "step": 156 }, { "epoch": 0.3546019198193111, "grad_norm": 0.632966160774231, "learning_rate": 7.842731032775687e-05, "loss": 1.4293, "step": 157 }, { "epoch": 0.35686053077357427, "grad_norm": 0.6313096880912781, "learning_rate": 7.81136052682082e-05, "loss": 1.2995, "step": 158 }, { "epoch": 0.3591191417278374, "grad_norm": 1.015628457069397, "learning_rate": 7.779827348384813e-05, "loss": 1.6397, "step": 159 }, { "epoch": 0.3613777526821005, "grad_norm": 0.6771583557128906, "learning_rate": 7.748133322057693e-05, "loss": 1.5359, "step": 160 }, { "epoch": 0.36363636363636365, "grad_norm": 0.6180778741836548, "learning_rate": 7.716280281736551e-05, "loss": 1.6158, "step": 161 }, { "epoch": 0.36589497459062675, "grad_norm": 0.46877238154411316, "learning_rate": 7.68427007051944e-05, "loss": 1.3506, "step": 162 }, { "epoch": 0.3681535855448899, "grad_norm": 0.5611956119537354, "learning_rate": 7.652104540598712e-05, "loss": 1.2563, "step": 163 }, { "epoch": 0.370412196499153, "grad_norm": 0.5148677229881287, "learning_rate": 7.619785553153864e-05, "loss": 1.4294, "step": 164 }, { "epoch": 0.37267080745341613, "grad_norm": 0.49939846992492676, "learning_rate": 7.58731497824383e-05, "loss": 1.2108, "step": 165 }, { "epoch": 0.3749294184076793, "grad_norm": 0.6522055864334106, "learning_rate": 7.554694694698784e-05, "loss": 1.3305, "step": 166 }, { "epoch": 0.3771880293619424, "grad_norm": 0.5979933738708496, "learning_rate": 7.521926590011418e-05, "loss": 1.4971, "step": 167 }, { "epoch": 0.3794466403162055, "grad_norm": 2.253812551498413, "learning_rate": 7.489012560227742e-05, "loss": 1.1929, "step": 168 }, { "epoch": 0.38170525127046867, "grad_norm": 0.5582684874534607, "learning_rate": 7.455954509837352e-05, "loss": 1.4669, "step": 169 }, { "epoch": 0.3839638622247318, "grad_norm": 0.5619791150093079, "learning_rate": 7.422754351663252e-05, "loss": 1.4651, "step": 170 }, { "epoch": 0.38622247317899494, "grad_norm": 0.4569830000400543, "learning_rate": 7.389414006751158e-05, "loss": 1.4677, "step": 171 }, { "epoch": 0.38848108413325805, "grad_norm": 1.0484251976013184, "learning_rate": 7.355935404258354e-05, "loss": 1.4736, "step": 172 }, { "epoch": 0.39073969508752115, "grad_norm": 0.5541074872016907, "learning_rate": 7.322320481342054e-05, "loss": 1.4604, "step": 173 }, { "epoch": 0.3929983060417843, "grad_norm": 0.6257642507553101, "learning_rate": 7.288571183047322e-05, "loss": 1.1331, "step": 174 }, { "epoch": 0.3952569169960474, "grad_norm": 0.5899243354797363, "learning_rate": 7.254689462194522e-05, "loss": 1.1387, "step": 175 }, { "epoch": 0.39751552795031053, "grad_norm": 0.5541165471076965, "learning_rate": 7.220677279266327e-05, "loss": 1.408, "step": 176 }, { "epoch": 0.3997741389045737, "grad_norm": 0.5860551595687866, "learning_rate": 7.186536602294278e-05, "loss": 1.512, "step": 177 }, { "epoch": 0.4020327498588368, "grad_norm": 0.5203275084495544, "learning_rate": 7.152269406744903e-05, "loss": 1.8094, "step": 178 }, { "epoch": 0.40429136081309996, "grad_norm": 0.6692151427268982, "learning_rate": 7.117877675405427e-05, "loss": 1.4363, "step": 179 }, { "epoch": 0.40654997176736307, "grad_norm": 0.624856173992157, "learning_rate": 7.083363398269022e-05, "loss": 1.4632, "step": 180 }, { "epoch": 0.4088085827216262, "grad_norm": 0.5100191235542297, "learning_rate": 7.04872857241968e-05, "loss": 1.3967, "step": 181 }, { "epoch": 0.41106719367588934, "grad_norm": 0.5183005332946777, "learning_rate": 7.013975201916648e-05, "loss": 1.7088, "step": 182 }, { "epoch": 0.41332580463015245, "grad_norm": 0.7876030802726746, "learning_rate": 6.979105297678462e-05, "loss": 1.4677, "step": 183 }, { "epoch": 0.4155844155844156, "grad_norm": 0.5176962614059448, "learning_rate": 6.944120877366604e-05, "loss": 1.5604, "step": 184 }, { "epoch": 0.4178430265386787, "grad_norm": 0.6162248849868774, "learning_rate": 6.909023965268746e-05, "loss": 1.307, "step": 185 }, { "epoch": 0.4201016374929418, "grad_norm": 0.8558477759361267, "learning_rate": 6.873816592181617e-05, "loss": 1.1848, "step": 186 }, { "epoch": 0.422360248447205, "grad_norm": 0.5814364552497864, "learning_rate": 6.838500795293505e-05, "loss": 1.3717, "step": 187 }, { "epoch": 0.4246188594014681, "grad_norm": 0.5346269011497498, "learning_rate": 6.803078618066378e-05, "loss": 1.444, "step": 188 }, { "epoch": 0.4268774703557312, "grad_norm": 0.5764583945274353, "learning_rate": 6.767552110117631e-05, "loss": 1.4341, "step": 189 }, { "epoch": 0.42913608130999437, "grad_norm": 0.4884260892868042, "learning_rate": 6.73192332710151e-05, "loss": 1.3498, "step": 190 }, { "epoch": 0.4313946922642575, "grad_norm": 0.6064755320549011, "learning_rate": 6.696194330590151e-05, "loss": 1.6995, "step": 191 }, { "epoch": 0.43365330321852064, "grad_norm": 0.5664028525352478, "learning_rate": 6.660367187954304e-05, "loss": 1.418, "step": 192 }, { "epoch": 0.43591191417278374, "grad_norm": 0.6558746695518494, "learning_rate": 6.624443972243698e-05, "loss": 1.2759, "step": 193 }, { "epoch": 0.43817052512704685, "grad_norm": 1.469589352607727, "learning_rate": 6.5884267620671e-05, "loss": 1.2259, "step": 194 }, { "epoch": 0.44042913608131, "grad_norm": 0.9882861375808716, "learning_rate": 6.552317641472026e-05, "loss": 1.4997, "step": 195 }, { "epoch": 0.4426877470355731, "grad_norm": 0.4861055910587311, "learning_rate": 6.516118699824178e-05, "loss": 1.2735, "step": 196 }, { "epoch": 0.4449463579898362, "grad_norm": 0.5669353008270264, "learning_rate": 6.479832031686521e-05, "loss": 1.4849, "step": 197 }, { "epoch": 0.4472049689440994, "grad_norm": 0.5639188885688782, "learning_rate": 6.443459736698105e-05, "loss": 1.6053, "step": 198 }, { "epoch": 0.4494635798983625, "grad_norm": 0.5520173907279968, "learning_rate": 6.407003919452564e-05, "loss": 1.2882, "step": 199 }, { "epoch": 0.45172219085262566, "grad_norm": 0.7025582790374756, "learning_rate": 6.370466689376342e-05, "loss": 1.4892, "step": 200 }, { "epoch": 0.45398080180688877, "grad_norm": 0.5788022875785828, "learning_rate": 6.33385016060664e-05, "loss": 1.7469, "step": 201 }, { "epoch": 0.4562394127611519, "grad_norm": 0.5949933528900146, "learning_rate": 6.297156451869082e-05, "loss": 1.405, "step": 202 }, { "epoch": 0.45849802371541504, "grad_norm": 0.5209388732910156, "learning_rate": 6.260387686355121e-05, "loss": 1.3265, "step": 203 }, { "epoch": 0.46075663466967814, "grad_norm": 0.4528113603591919, "learning_rate": 6.223545991599184e-05, "loss": 1.4738, "step": 204 }, { "epoch": 0.46301524562394125, "grad_norm": 0.5056377053260803, "learning_rate": 6.186633499355576e-05, "loss": 1.6497, "step": 205 }, { "epoch": 0.4652738565782044, "grad_norm": 0.5312528014183044, "learning_rate": 6.149652345475118e-05, "loss": 1.2971, "step": 206 }, { "epoch": 0.4675324675324675, "grad_norm": 0.8774858117103577, "learning_rate": 6.112604669781572e-05, "loss": 1.4465, "step": 207 }, { "epoch": 0.4697910784867307, "grad_norm": 0.6204832196235657, "learning_rate": 6.075492615947823e-05, "loss": 1.3416, "step": 208 }, { "epoch": 0.4720496894409938, "grad_norm": 0.4414108097553253, "learning_rate": 6.038318331371836e-05, "loss": 1.5334, "step": 209 }, { "epoch": 0.4743083003952569, "grad_norm": 0.6953609585762024, "learning_rate": 6.001083967052408e-05, "loss": 1.6932, "step": 210 }, { "epoch": 0.47656691134952006, "grad_norm": 0.47844645380973816, "learning_rate": 5.963791677464696e-05, "loss": 1.7326, "step": 211 }, { "epoch": 0.47882552230378317, "grad_norm": 0.5514799952507019, "learning_rate": 5.9264436204355724e-05, "loss": 1.2659, "step": 212 }, { "epoch": 0.4810841332580463, "grad_norm": 0.7047640681266785, "learning_rate": 5.889041957018745e-05, "loss": 1.2528, "step": 213 }, { "epoch": 0.48334274421230944, "grad_norm": 0.5648500919342041, "learning_rate": 5.85158885136973e-05, "loss": 1.1812, "step": 214 }, { "epoch": 0.48560135516657255, "grad_norm": 0.6360241174697876, "learning_rate": 5.81408647062062e-05, "loss": 1.464, "step": 215 }, { "epoch": 0.4878599661208357, "grad_norm": 0.693928599357605, "learning_rate": 5.7765369847546916e-05, "loss": 1.5254, "step": 216 }, { "epoch": 0.4901185770750988, "grad_norm": 0.8636585474014282, "learning_rate": 5.7389425664808396e-05, "loss": 1.7108, "step": 217 }, { "epoch": 0.4923771880293619, "grad_norm": 0.5646620392799377, "learning_rate": 5.7013053911078677e-05, "loss": 1.4662, "step": 218 }, { "epoch": 0.4946357989836251, "grad_norm": 0.48793742060661316, "learning_rate": 5.6636276364186105e-05, "loss": 1.475, "step": 219 }, { "epoch": 0.4968944099378882, "grad_norm": 0.4991232454776764, "learning_rate": 5.6259114825439275e-05, "loss": 1.5324, "step": 220 }, { "epoch": 0.4991530208921513, "grad_norm": 0.6191245317459106, "learning_rate": 5.588159111836553e-05, "loss": 1.3583, "step": 221 }, { "epoch": 0.5014116318464145, "grad_norm": 0.5580266714096069, "learning_rate": 5.550372708744815e-05, "loss": 1.4188, "step": 222 }, { "epoch": 0.5014116318464145, "eval_loss": 1.4097435474395752, "eval_runtime": 98.3925, "eval_samples_per_second": 7.582, "eval_steps_per_second": 0.955, "step": 222 }, { "epoch": 0.5036702428006776, "grad_norm": 0.4826272130012512, "learning_rate": 5.51255445968625e-05, "loss": 1.4089, "step": 223 }, { "epoch": 0.5059288537549407, "grad_norm": 0.7236170768737793, "learning_rate": 5.4747065529210736e-05, "loss": 1.3453, "step": 224 }, { "epoch": 0.5081874647092038, "grad_norm": 0.5844079256057739, "learning_rate": 5.436831178425582e-05, "loss": 1.3397, "step": 225 }, { "epoch": 0.510446075663467, "grad_norm": 0.8065000176429749, "learning_rate": 5.3989305277654156e-05, "loss": 1.4753, "step": 226 }, { "epoch": 0.51270468661773, "grad_norm": 0.48151111602783203, "learning_rate": 5.361006793968764e-05, "loss": 1.2529, "step": 227 }, { "epoch": 0.5149632975719932, "grad_norm": 0.6532959342002869, "learning_rate": 5.32306217139946e-05, "loss": 1.4809, "step": 228 }, { "epoch": 0.5172219085262564, "grad_norm": 0.6014647483825684, "learning_rate": 5.28509885563002e-05, "loss": 1.2814, "step": 229 }, { "epoch": 0.5194805194805194, "grad_norm": 0.6728442311286926, "learning_rate": 5.247119043314592e-05, "loss": 1.4694, "step": 230 }, { "epoch": 0.5217391304347826, "grad_norm": 0.5741045475006104, "learning_rate": 5.209124932061862e-05, "loss": 1.6012, "step": 231 }, { "epoch": 0.5239977413890458, "grad_norm": 0.49296098947525024, "learning_rate": 5.1711187203078824e-05, "loss": 1.4489, "step": 232 }, { "epoch": 0.5262563523433089, "grad_norm": 0.6149709820747375, "learning_rate": 5.133102607188874e-05, "loss": 1.4421, "step": 233 }, { "epoch": 0.528514963297572, "grad_norm": 0.6720355749130249, "learning_rate": 5.0950787924139764e-05, "loss": 1.2675, "step": 234 }, { "epoch": 0.5307735742518351, "grad_norm": 0.5325198769569397, "learning_rate": 5.057049476137967e-05, "loss": 1.1803, "step": 235 }, { "epoch": 0.5330321852060983, "grad_norm": 0.5668638944625854, "learning_rate": 5.0190168588339536e-05, "loss": 1.2712, "step": 236 }, { "epoch": 0.5352907961603613, "grad_norm": 0.4216601848602295, "learning_rate": 4.9809831411660476e-05, "loss": 1.322, "step": 237 }, { "epoch": 0.5375494071146245, "grad_norm": 0.608249306678772, "learning_rate": 4.942950523862033e-05, "loss": 1.3996, "step": 238 }, { "epoch": 0.5398080180688877, "grad_norm": 0.9624855518341064, "learning_rate": 4.904921207586024e-05, "loss": 1.3592, "step": 239 }, { "epoch": 0.5420666290231507, "grad_norm": 0.5141565203666687, "learning_rate": 4.866897392811126e-05, "loss": 1.1777, "step": 240 }, { "epoch": 0.5443252399774139, "grad_norm": 0.6783860325813293, "learning_rate": 4.828881279692119e-05, "loss": 1.6956, "step": 241 }, { "epoch": 0.546583850931677, "grad_norm": 0.48693007230758667, "learning_rate": 4.7908750679381384e-05, "loss": 1.6161, "step": 242 }, { "epoch": 0.5488424618859401, "grad_norm": 0.5655871033668518, "learning_rate": 4.752880956685407e-05, "loss": 1.0778, "step": 243 }, { "epoch": 0.5511010728402033, "grad_norm": 0.5297839045524597, "learning_rate": 4.7149011443699814e-05, "loss": 1.485, "step": 244 }, { "epoch": 0.5533596837944664, "grad_norm": 0.7380653023719788, "learning_rate": 4.676937828600542e-05, "loss": 1.4226, "step": 245 }, { "epoch": 0.5556182947487295, "grad_norm": 0.693130373954773, "learning_rate": 4.638993206031237e-05, "loss": 1.4093, "step": 246 }, { "epoch": 0.5578769057029926, "grad_norm": 0.6071158647537231, "learning_rate": 4.601069472234584e-05, "loss": 1.0761, "step": 247 }, { "epoch": 0.5601355166572558, "grad_norm": 0.9863161444664001, "learning_rate": 4.56316882157442e-05, "loss": 1.3143, "step": 248 }, { "epoch": 0.562394127611519, "grad_norm": 0.5352652668952942, "learning_rate": 4.525293447078927e-05, "loss": 1.7061, "step": 249 }, { "epoch": 0.564652738565782, "grad_norm": 0.558076798915863, "learning_rate": 4.4874455403137514e-05, "loss": 1.7395, "step": 250 }, { "epoch": 0.5669113495200452, "grad_norm": 1.1520057916641235, "learning_rate": 4.449627291255184e-05, "loss": 1.6384, "step": 251 }, { "epoch": 0.5691699604743083, "grad_norm": 0.531182587146759, "learning_rate": 4.411840888163449e-05, "loss": 1.3033, "step": 252 }, { "epoch": 0.5714285714285714, "grad_norm": 0.4696061313152313, "learning_rate": 4.3740885174560736e-05, "loss": 1.4949, "step": 253 }, { "epoch": 0.5736871823828346, "grad_norm": 0.6336054801940918, "learning_rate": 4.336372363581391e-05, "loss": 1.4374, "step": 254 }, { "epoch": 0.5759457933370977, "grad_norm": 0.6364861726760864, "learning_rate": 4.298694608892134e-05, "loss": 1.7703, "step": 255 }, { "epoch": 0.5782044042913608, "grad_norm": 1.0777122974395752, "learning_rate": 4.2610574335191615e-05, "loss": 1.4255, "step": 256 }, { "epoch": 0.5804630152456239, "grad_norm": 0.8023219704627991, "learning_rate": 4.2234630152453116e-05, "loss": 1.384, "step": 257 }, { "epoch": 0.5827216261998871, "grad_norm": 0.5222808718681335, "learning_rate": 4.185913529379381e-05, "loss": 1.1682, "step": 258 }, { "epoch": 0.5849802371541502, "grad_norm": 0.7348831295967102, "learning_rate": 4.1484111486302704e-05, "loss": 1.2724, "step": 259 }, { "epoch": 0.5872388481084133, "grad_norm": 0.5531060695648193, "learning_rate": 4.110958042981255e-05, "loss": 1.4645, "step": 260 }, { "epoch": 0.5894974590626765, "grad_norm": 0.6494237780570984, "learning_rate": 4.0735563795644294e-05, "loss": 1.5516, "step": 261 }, { "epoch": 0.5917560700169395, "grad_norm": 0.5829164981842041, "learning_rate": 4.0362083225353046e-05, "loss": 1.4303, "step": 262 }, { "epoch": 0.5940146809712027, "grad_norm": 0.4754260182380676, "learning_rate": 3.998916032947594e-05, "loss": 1.3245, "step": 263 }, { "epoch": 0.5962732919254659, "grad_norm": 0.7233774065971375, "learning_rate": 3.961681668628164e-05, "loss": 1.272, "step": 264 }, { "epoch": 0.598531902879729, "grad_norm": 0.55941241979599, "learning_rate": 3.9245073840521765e-05, "loss": 1.4211, "step": 265 }, { "epoch": 0.6007905138339921, "grad_norm": 0.5475345253944397, "learning_rate": 3.887395330218429e-05, "loss": 1.3842, "step": 266 }, { "epoch": 0.6030491247882552, "grad_norm": 0.7882475256919861, "learning_rate": 3.850347654524883e-05, "loss": 1.4087, "step": 267 }, { "epoch": 0.6053077357425184, "grad_norm": 0.6707928776741028, "learning_rate": 3.8133665006444255e-05, "loss": 1.2446, "step": 268 }, { "epoch": 0.6075663466967814, "grad_norm": 0.6627002358436584, "learning_rate": 3.776454008400816e-05, "loss": 1.7243, "step": 269 }, { "epoch": 0.6098249576510446, "grad_norm": 0.476368248462677, "learning_rate": 3.7396123136448824e-05, "loss": 1.5209, "step": 270 }, { "epoch": 0.6120835686053078, "grad_norm": 0.6583078503608704, "learning_rate": 3.70284354813092e-05, "loss": 1.3496, "step": 271 }, { "epoch": 0.6143421795595708, "grad_norm": 0.6484099626541138, "learning_rate": 3.666149839393361e-05, "loss": 1.6792, "step": 272 }, { "epoch": 0.616600790513834, "grad_norm": 0.42976608872413635, "learning_rate": 3.629533310623658e-05, "loss": 1.0712, "step": 273 }, { "epoch": 0.6188594014680971, "grad_norm": 0.7299443483352661, "learning_rate": 3.592996080547438e-05, "loss": 1.7172, "step": 274 }, { "epoch": 0.6211180124223602, "grad_norm": 0.6105283498764038, "learning_rate": 3.556540263301896e-05, "loss": 1.5218, "step": 275 }, { "epoch": 0.6233766233766234, "grad_norm": 0.6349045038223267, "learning_rate": 3.520167968313479e-05, "loss": 1.4029, "step": 276 }, { "epoch": 0.6256352343308865, "grad_norm": 0.441521555185318, "learning_rate": 3.483881300175823e-05, "loss": 1.5902, "step": 277 }, { "epoch": 0.6278938452851497, "grad_norm": 0.6537282466888428, "learning_rate": 3.447682358527974e-05, "loss": 1.2511, "step": 278 }, { "epoch": 0.6301524562394127, "grad_norm": 0.64874267578125, "learning_rate": 3.411573237932904e-05, "loss": 1.4145, "step": 279 }, { "epoch": 0.6324110671936759, "grad_norm": 0.6560806035995483, "learning_rate": 3.3755560277563023e-05, "loss": 1.3575, "step": 280 }, { "epoch": 0.6346696781479391, "grad_norm": 0.4486541152000427, "learning_rate": 3.339632812045696e-05, "loss": 1.4707, "step": 281 }, { "epoch": 0.6369282891022021, "grad_norm": 0.5472518801689148, "learning_rate": 3.303805669409848e-05, "loss": 1.2591, "step": 282 }, { "epoch": 0.6391869000564653, "grad_norm": 0.8374768495559692, "learning_rate": 3.268076672898492e-05, "loss": 1.2883, "step": 283 }, { "epoch": 0.6414455110107284, "grad_norm": 0.4761299788951874, "learning_rate": 3.2324478898823705e-05, "loss": 1.46, "step": 284 }, { "epoch": 0.6437041219649915, "grad_norm": 0.58656907081604, "learning_rate": 3.196921381933624e-05, "loss": 1.366, "step": 285 }, { "epoch": 0.6459627329192547, "grad_norm": 0.6311814785003662, "learning_rate": 3.1614992047064945e-05, "loss": 1.2755, "step": 286 }, { "epoch": 0.6482213438735178, "grad_norm": 0.5806014537811279, "learning_rate": 3.126183407818384e-05, "loss": 1.3936, "step": 287 }, { "epoch": 0.6504799548277809, "grad_norm": 0.7558007836341858, "learning_rate": 3.090976034731257e-05, "loss": 1.3133, "step": 288 }, { "epoch": 0.652738565782044, "grad_norm": 0.6248241662979126, "learning_rate": 3.055879122633397e-05, "loss": 1.3684, "step": 289 }, { "epoch": 0.6549971767363072, "grad_norm": 0.6883110404014587, "learning_rate": 3.020894702321539e-05, "loss": 1.5355, "step": 290 }, { "epoch": 0.6572557876905702, "grad_norm": 0.47955361008644104, "learning_rate": 2.9860247980833532e-05, "loss": 1.2752, "step": 291 }, { "epoch": 0.6595143986448334, "grad_norm": 0.6376725435256958, "learning_rate": 2.951271427580321e-05, "loss": 1.3576, "step": 292 }, { "epoch": 0.6617730095990966, "grad_norm": 0.6355816125869751, "learning_rate": 2.91663660173098e-05, "loss": 1.3411, "step": 293 }, { "epoch": 0.6640316205533597, "grad_norm": 0.6872756481170654, "learning_rate": 2.882122324594575e-05, "loss": 1.5319, "step": 294 }, { "epoch": 0.6662902315076228, "grad_norm": 1.0146925449371338, "learning_rate": 2.847730593255097e-05, "loss": 1.4574, "step": 295 }, { "epoch": 0.668548842461886, "grad_norm": 0.46255797147750854, "learning_rate": 2.8134633977057235e-05, "loss": 1.6074, "step": 296 }, { "epoch": 0.6708074534161491, "grad_norm": 0.5414189100265503, "learning_rate": 2.779322720733673e-05, "loss": 1.6655, "step": 297 }, { "epoch": 0.6730660643704122, "grad_norm": 0.4797675609588623, "learning_rate": 2.745310537805479e-05, "loss": 1.3457, "step": 298 }, { "epoch": 0.6753246753246753, "grad_norm": 0.5770863890647888, "learning_rate": 2.7114288169526793e-05, "loss": 1.5853, "step": 299 }, { "epoch": 0.6775832862789385, "grad_norm": 0.4539095163345337, "learning_rate": 2.6776795186579468e-05, "loss": 1.2473, "step": 300 }, { "epoch": 0.6798418972332015, "grad_norm": 0.5590886473655701, "learning_rate": 2.6440645957416484e-05, "loss": 1.7489, "step": 301 }, { "epoch": 0.6821005081874647, "grad_norm": 0.5184858441352844, "learning_rate": 2.610585993248843e-05, "loss": 1.4334, "step": 302 }, { "epoch": 0.6843591191417279, "grad_norm": 0.5701838135719299, "learning_rate": 2.5772456483367497e-05, "loss": 1.2721, "step": 303 }, { "epoch": 0.6866177300959909, "grad_norm": 0.5292041301727295, "learning_rate": 2.5440454901626486e-05, "loss": 1.4493, "step": 304 }, { "epoch": 0.6888763410502541, "grad_norm": 0.6524962782859802, "learning_rate": 2.510987439772261e-05, "loss": 1.1314, "step": 305 }, { "epoch": 0.6911349520045172, "grad_norm": 0.5966671109199524, "learning_rate": 2.4780734099885833e-05, "loss": 1.9383, "step": 306 }, { "epoch": 0.6933935629587803, "grad_norm": 0.506033718585968, "learning_rate": 2.4453053053012187e-05, "loss": 1.4983, "step": 307 }, { "epoch": 0.6956521739130435, "grad_norm": 0.687870979309082, "learning_rate": 2.4126850217561698e-05, "loss": 1.5229, "step": 308 }, { "epoch": 0.6979107848673066, "grad_norm": 0.4640161991119385, "learning_rate": 2.3802144468461367e-05, "loss": 1.5815, "step": 309 }, { "epoch": 0.7001693958215698, "grad_norm": 0.9109538793563843, "learning_rate": 2.347895459401288e-05, "loss": 1.419, "step": 310 }, { "epoch": 0.7024280067758328, "grad_norm": 0.5041788220405579, "learning_rate": 2.3157299294805613e-05, "loss": 1.3753, "step": 311 }, { "epoch": 0.704686617730096, "grad_norm": 0.6865159869194031, "learning_rate": 2.2837197182634483e-05, "loss": 1.6305, "step": 312 }, { "epoch": 0.7069452286843592, "grad_norm": 0.7781485319137573, "learning_rate": 2.2518666779423074e-05, "loss": 1.4607, "step": 313 }, { "epoch": 0.7092038396386222, "grad_norm": 0.5647329092025757, "learning_rate": 2.2201726516151882e-05, "loss": 1.4964, "step": 314 }, { "epoch": 0.7114624505928854, "grad_norm": 0.577785074710846, "learning_rate": 2.1886394731791816e-05, "loss": 1.5494, "step": 315 }, { "epoch": 0.7137210615471485, "grad_norm": 0.5110263228416443, "learning_rate": 2.157268967224314e-05, "loss": 1.4089, "step": 316 }, { "epoch": 0.7159796725014116, "grad_norm": 0.5485273003578186, "learning_rate": 2.126062948927966e-05, "loss": 1.5507, "step": 317 }, { "epoch": 0.7182382834556748, "grad_norm": 0.6336625814437866, "learning_rate": 2.0950232239498446e-05, "loss": 1.1833, "step": 318 }, { "epoch": 0.7204968944099379, "grad_norm": 0.5420766472816467, "learning_rate": 2.064151588327501e-05, "loss": 1.5497, "step": 319 }, { "epoch": 0.722755505364201, "grad_norm": 0.43920066952705383, "learning_rate": 2.0334498283724078e-05, "loss": 1.7342, "step": 320 }, { "epoch": 0.7250141163184641, "grad_norm": 0.4995698928833008, "learning_rate": 2.002919720566599e-05, "loss": 1.2779, "step": 321 }, { "epoch": 0.7272727272727273, "grad_norm": 1.1329647302627563, "learning_rate": 1.9725630314598782e-05, "loss": 1.3353, "step": 322 }, { "epoch": 0.7295313382269905, "grad_norm": 0.6960721015930176, "learning_rate": 1.9423815175676025e-05, "loss": 1.4193, "step": 323 }, { "epoch": 0.7317899491812535, "grad_norm": 0.5718068480491638, "learning_rate": 1.912376925269041e-05, "loss": 1.2262, "step": 324 }, { "epoch": 0.7340485601355167, "grad_norm": 0.633811891078949, "learning_rate": 1.8825509907063327e-05, "loss": 1.5999, "step": 325 }, { "epoch": 0.7363071710897798, "grad_norm": 0.5763436555862427, "learning_rate": 1.8529054396840234e-05, "loss": 1.5864, "step": 326 }, { "epoch": 0.7385657820440429, "grad_norm": 0.49580734968185425, "learning_rate": 1.8234419875692105e-05, "loss": 1.9146, "step": 327 }, { "epoch": 0.740824392998306, "grad_norm": 0.597285270690918, "learning_rate": 1.7941623391922772e-05, "loss": 1.2687, "step": 328 }, { "epoch": 0.7430830039525692, "grad_norm": 0.6545310020446777, "learning_rate": 1.7650681887482628e-05, "loss": 1.5924, "step": 329 }, { "epoch": 0.7453416149068323, "grad_norm": 0.760567307472229, "learning_rate": 1.7361612196988174e-05, "loss": 1.4892, "step": 330 }, { "epoch": 0.7476002258610954, "grad_norm": 0.5573106408119202, "learning_rate": 1.7074431046748075e-05, "loss": 1.3689, "step": 331 }, { "epoch": 0.7498588368153586, "grad_norm": 0.5589758157730103, "learning_rate": 1.678915505379513e-05, "loss": 1.2265, "step": 332 }, { "epoch": 0.7521174477696216, "grad_norm": 0.5898151397705078, "learning_rate": 1.650580072492496e-05, "loss": 1.3948, "step": 333 }, { "epoch": 0.7543760587238848, "grad_norm": 0.6013538241386414, "learning_rate": 1.6224384455740788e-05, "loss": 1.5207, "step": 334 }, { "epoch": 0.756634669678148, "grad_norm": 0.5616132020950317, "learning_rate": 1.5944922529704777e-05, "loss": 1.3745, "step": 335 }, { "epoch": 0.758893280632411, "grad_norm": 0.6084350943565369, "learning_rate": 1.5667431117195814e-05, "loss": 1.4857, "step": 336 }, { "epoch": 0.7611518915866742, "grad_norm": 0.6251053810119629, "learning_rate": 1.539192627457382e-05, "loss": 1.2732, "step": 337 }, { "epoch": 0.7634105025409373, "grad_norm": 0.8703694343566895, "learning_rate": 1.5118423943250771e-05, "loss": 1.5002, "step": 338 }, { "epoch": 0.7656691134952005, "grad_norm": 0.5518003702163696, "learning_rate": 1.4846939948768218e-05, "loss": 1.8132, "step": 339 }, { "epoch": 0.7679277244494636, "grad_norm": 0.5927785038948059, "learning_rate": 1.45774899998816e-05, "loss": 1.1446, "step": 340 }, { "epoch": 0.7701863354037267, "grad_norm": 0.5158970952033997, "learning_rate": 1.4310089687651301e-05, "loss": 1.2931, "step": 341 }, { "epoch": 0.7724449463579899, "grad_norm": 0.7069851160049438, "learning_rate": 1.40447544845405e-05, "loss": 1.4927, "step": 342 }, { "epoch": 0.7747035573122529, "grad_norm": 0.5784515142440796, "learning_rate": 1.378149974351991e-05, "loss": 1.3116, "step": 343 }, { "epoch": 0.7769621682665161, "grad_norm": 0.49697092175483704, "learning_rate": 1.3520340697179406e-05, "loss": 1.5359, "step": 344 }, { "epoch": 0.7792207792207793, "grad_norm": 0.6225888729095459, "learning_rate": 1.3261292456846647e-05, "loss": 1.4725, "step": 345 }, { "epoch": 0.7814793901750423, "grad_norm": 0.9478232860565186, "learning_rate": 1.3004370011712624e-05, "loss": 1.3835, "step": 346 }, { "epoch": 0.7837380011293055, "grad_norm": 0.5357046127319336, "learning_rate": 1.2749588227964465e-05, "loss": 1.2114, "step": 347 }, { "epoch": 0.7859966120835686, "grad_norm": 0.6515849828720093, "learning_rate": 1.2496961847925153e-05, "loss": 1.4449, "step": 348 }, { "epoch": 0.7882552230378317, "grad_norm": 0.7444521188735962, "learning_rate": 1.2246505489200532e-05, "loss": 1.2324, "step": 349 }, { "epoch": 0.7905138339920948, "grad_norm": 0.6521069407463074, "learning_rate": 1.1998233643833457e-05, "loss": 1.3745, "step": 350 }, { "epoch": 0.792772444946358, "grad_norm": 0.5900723934173584, "learning_rate": 1.1752160677465286e-05, "loss": 1.3835, "step": 351 }, { "epoch": 0.7950310559006211, "grad_norm": 0.5521007180213928, "learning_rate": 1.150830082850468e-05, "loss": 1.4689, "step": 352 }, { "epoch": 0.7972896668548842, "grad_norm": 0.43919599056243896, "learning_rate": 1.126666820730366e-05, "loss": 1.379, "step": 353 }, { "epoch": 0.7995482778091474, "grad_norm": 0.4976402223110199, "learning_rate": 1.1027276795341135e-05, "loss": 1.4056, "step": 354 }, { "epoch": 0.8018068887634106, "grad_norm": 0.7465052008628845, "learning_rate": 1.0790140444414e-05, "loss": 1.1132, "step": 355 }, { "epoch": 0.8040654997176736, "grad_norm": 1.412811279296875, "learning_rate": 1.0555272875835537e-05, "loss": 1.3229, "step": 356 }, { "epoch": 0.8063241106719368, "grad_norm": 0.38720905780792236, "learning_rate": 1.0322687679641523e-05, "loss": 1.3627, "step": 357 }, { "epoch": 0.8085827216261999, "grad_norm": 0.6548861265182495, "learning_rate": 1.0092398313803863e-05, "loss": 1.6172, "step": 358 }, { "epoch": 0.810841332580463, "grad_norm": 0.5794533491134644, "learning_rate": 9.864418103451828e-06, "loss": 1.5726, "step": 359 }, { "epoch": 0.8130999435347261, "grad_norm": 0.5520102977752686, "learning_rate": 9.638760240101102e-06, "loss": 1.294, "step": 360 }, { "epoch": 0.8153585544889893, "grad_norm": 0.5520592927932739, "learning_rate": 9.415437780890451e-06, "loss": 1.3229, "step": 361 }, { "epoch": 0.8176171654432524, "grad_norm": 0.644109845161438, "learning_rate": 9.194463647826223e-06, "loss": 1.6856, "step": 362 }, { "epoch": 0.8198757763975155, "grad_norm": 0.6648945808410645, "learning_rate": 8.975850627034604e-06, "loss": 1.4002, "step": 363 }, { "epoch": 0.8221343873517787, "grad_norm": 0.6234399676322937, "learning_rate": 8.759611368021831e-06, "loss": 1.33, "step": 364 }, { "epoch": 0.8243929983060417, "grad_norm": 0.6238119602203369, "learning_rate": 8.545758382942232e-06, "loss": 1.2347, "step": 365 }, { "epoch": 0.8266516092603049, "grad_norm": 0.5199403166770935, "learning_rate": 8.334304045874247e-06, "loss": 1.5012, "step": 366 }, { "epoch": 0.8289102202145681, "grad_norm": 0.43542924523353577, "learning_rate": 8.125260592104445e-06, "loss": 1.4619, "step": 367 }, { "epoch": 0.8311688311688312, "grad_norm": 0.6356791853904724, "learning_rate": 7.918640117419507e-06, "loss": 1.2857, "step": 368 }, { "epoch": 0.8334274421230943, "grad_norm": 0.8213852643966675, "learning_rate": 7.71445457740641e-06, "loss": 1.3405, "step": 369 }, { "epoch": 0.8356860530773574, "grad_norm": 0.6396834254264832, "learning_rate": 7.512715786760605e-06, "loss": 1.675, "step": 370 }, { "epoch": 0.8379446640316206, "grad_norm": 0.5670115947723389, "learning_rate": 7.313435418602388e-06, "loss": 1.4417, "step": 371 }, { "epoch": 0.8402032749858837, "grad_norm": 0.8237860798835754, "learning_rate": 7.116625003801436e-06, "loss": 1.3525, "step": 372 }, { "epoch": 0.8424618859401468, "grad_norm": 0.6882337927818298, "learning_rate": 6.922295930309691e-06, "loss": 1.5534, "step": 373 }, { "epoch": 0.84472049689441, "grad_norm": 0.750078022480011, "learning_rate": 6.730459442502329e-06, "loss": 1.5733, "step": 374 }, { "epoch": 0.846979107848673, "grad_norm": 0.5471389889717102, "learning_rate": 6.541126640527195e-06, "loss": 1.6169, "step": 375 }, { "epoch": 0.8492377188029362, "grad_norm": 0.4878935217857361, "learning_rate": 6.354308479662446e-06, "loss": 1.5414, "step": 376 }, { "epoch": 0.8514963297571994, "grad_norm": 0.5643681883811951, "learning_rate": 6.170015769682741e-06, "loss": 1.6472, "step": 377 }, { "epoch": 0.8537549407114624, "grad_norm": 0.5661868453025818, "learning_rate": 5.988259174233713e-06, "loss": 1.3295, "step": 378 }, { "epoch": 0.8560135516657256, "grad_norm": 0.8035285472869873, "learning_rate": 5.80904921021494e-06, "loss": 1.4848, "step": 379 }, { "epoch": 0.8582721626199887, "grad_norm": 0.48411625623703003, "learning_rate": 5.6323962471714286e-06, "loss": 1.3458, "step": 380 }, { "epoch": 0.8605307735742518, "grad_norm": 0.5900686979293823, "learning_rate": 5.458310506693571e-06, "loss": 1.5151, "step": 381 }, { "epoch": 0.862789384528515, "grad_norm": 0.5503493547439575, "learning_rate": 5.286802061825752e-06, "loss": 1.4399, "step": 382 }, { "epoch": 0.8650479954827781, "grad_norm": 0.4653548002243042, "learning_rate": 5.117880836483452e-06, "loss": 1.5298, "step": 383 }, { "epoch": 0.8673066064370413, "grad_norm": 0.5434414148330688, "learning_rate": 4.951556604879048e-06, "loss": 1.3049, "step": 384 }, { "epoch": 0.8695652173913043, "grad_norm": 0.8023334741592407, "learning_rate": 4.7878389909562285e-06, "loss": 1.4366, "step": 385 }, { "epoch": 0.8718238283455675, "grad_norm": 0.5557988286018372, "learning_rate": 4.62673746783317e-06, "loss": 1.2165, "step": 386 }, { "epoch": 0.8740824392998306, "grad_norm": 0.4488828778266907, "learning_rate": 4.468261357254339e-06, "loss": 1.1717, "step": 387 }, { "epoch": 0.8763410502540937, "grad_norm": 0.5039349794387817, "learning_rate": 4.312419829051173e-06, "loss": 1.5923, "step": 388 }, { "epoch": 0.8785996612083569, "grad_norm": 0.5717617273330688, "learning_rate": 4.15922190061146e-06, "loss": 1.3688, "step": 389 }, { "epoch": 0.88085827216262, "grad_norm": 0.6537705063819885, "learning_rate": 4.008676436357539e-06, "loss": 1.4746, "step": 390 }, { "epoch": 0.8831168831168831, "grad_norm": 0.7428813576698303, "learning_rate": 3.86079214723345e-06, "loss": 1.6453, "step": 391 }, { "epoch": 0.8853754940711462, "grad_norm": 0.6995297074317932, "learning_rate": 3.7155775902008526e-06, "loss": 1.2413, "step": 392 }, { "epoch": 0.8876341050254094, "grad_norm": 0.5746084451675415, "learning_rate": 3.5730411677439125e-06, "loss": 1.5308, "step": 393 }, { "epoch": 0.8898927159796725, "grad_norm": 0.7818319797515869, "learning_rate": 3.4331911273830784e-06, "loss": 1.2739, "step": 394 }, { "epoch": 0.8921513269339356, "grad_norm": 0.4729510545730591, "learning_rate": 3.2960355611979245e-06, "loss": 1.2642, "step": 395 }, { "epoch": 0.8944099378881988, "grad_norm": 0.5390079021453857, "learning_rate": 3.161582405358876e-06, "loss": 1.4629, "step": 396 }, { "epoch": 0.8966685488424618, "grad_norm": 0.5690498352050781, "learning_rate": 3.029839439668003e-06, "loss": 1.576, "step": 397 }, { "epoch": 0.898927159796725, "grad_norm": 0.6022957563400269, "learning_rate": 2.9008142871088663e-06, "loss": 1.4105, "step": 398 }, { "epoch": 0.9011857707509882, "grad_norm": 0.4936705231666565, "learning_rate": 2.7745144134054433e-06, "loss": 1.3807, "step": 399 }, { "epoch": 0.9034443817052513, "grad_norm": 0.5045138001441956, "learning_rate": 2.6509471265901477e-06, "loss": 1.427, "step": 400 }, { "epoch": 0.9057029926595144, "grad_norm": 0.5444932579994202, "learning_rate": 2.530119576580936e-06, "loss": 1.3433, "step": 401 }, { "epoch": 0.9079616036137775, "grad_norm": 0.48583537340164185, "learning_rate": 2.412038754767626e-06, "loss": 1.2593, "step": 402 }, { "epoch": 0.9102202145680407, "grad_norm": 0.7965908050537109, "learning_rate": 2.296711493607334e-06, "loss": 1.304, "step": 403 }, { "epoch": 0.9124788255223037, "grad_norm": 0.6211166381835938, "learning_rate": 2.1841444662291543e-06, "loss": 1.4832, "step": 404 }, { "epoch": 0.9147374364765669, "grad_norm": 0.5326806902885437, "learning_rate": 2.074344186048022e-06, "loss": 1.239, "step": 405 }, { "epoch": 0.9169960474308301, "grad_norm": 0.46467143297195435, "learning_rate": 1.967317006387831e-06, "loss": 1.5442, "step": 406 }, { "epoch": 0.9192546583850931, "grad_norm": 0.6954487562179565, "learning_rate": 1.863069120113814e-06, "loss": 1.0449, "step": 407 }, { "epoch": 0.9215132693393563, "grad_norm": 0.5407376885414124, "learning_rate": 1.7616065592742038e-06, "loss": 1.368, "step": 408 }, { "epoch": 0.9237718802936195, "grad_norm": 0.5914408564567566, "learning_rate": 1.6629351947512195e-06, "loss": 1.4579, "step": 409 }, { "epoch": 0.9260304912478825, "grad_norm": 0.5790956020355225, "learning_rate": 1.567060735921344e-06, "loss": 1.5103, "step": 410 }, { "epoch": 0.9282891022021457, "grad_norm": 0.7009652853012085, "learning_rate": 1.4739887303249877e-06, "loss": 1.4829, "step": 411 }, { "epoch": 0.9305477131564088, "grad_norm": 0.6565191149711609, "learning_rate": 1.383724563345451e-06, "loss": 1.6771, "step": 412 }, { "epoch": 0.932806324110672, "grad_norm": 0.6377475261688232, "learning_rate": 1.2962734578973568e-06, "loss": 1.4032, "step": 413 }, { "epoch": 0.935064935064935, "grad_norm": 0.6361109018325806, "learning_rate": 1.2116404741244203e-06, "loss": 1.4081, "step": 414 }, { "epoch": 0.9373235460191982, "grad_norm": 0.517658531665802, "learning_rate": 1.1298305091066664e-06, "loss": 1.3421, "step": 415 }, { "epoch": 0.9395821569734614, "grad_norm": 0.7061721086502075, "learning_rate": 1.0508482965770505e-06, "loss": 1.5959, "step": 416 }, { "epoch": 0.9418407679277244, "grad_norm": 0.5791245698928833, "learning_rate": 9.746984066475729e-07, "loss": 1.3128, "step": 417 }, { "epoch": 0.9440993788819876, "grad_norm": 0.7557624578475952, "learning_rate": 9.013852455448335e-07, "loss": 1.2763, "step": 418 }, { "epoch": 0.9463579898362507, "grad_norm": 0.9951817989349365, "learning_rate": 8.309130553550815e-07, "loss": 1.4941, "step": 419 }, { "epoch": 0.9486166007905138, "grad_norm": 0.3905503749847412, "learning_rate": 7.63285913778733e-07, "loss": 1.3464, "step": 420 }, { "epoch": 0.950875211744777, "grad_norm": 0.5072855949401855, "learning_rate": 6.985077338944657e-07, "loss": 1.0404, "step": 421 }, { "epoch": 0.9531338226990401, "grad_norm": 0.5355942845344543, "learning_rate": 6.365822639327723e-07, "loss": 1.2642, "step": 422 }, { "epoch": 0.9553924336533032, "grad_norm": 0.5786038041114807, "learning_rate": 5.775130870590783e-07, "loss": 1.3897, "step": 423 }, { "epoch": 0.9576510446075663, "grad_norm": 0.564463198184967, "learning_rate": 5.213036211664191e-07, "loss": 1.4783, "step": 424 }, { "epoch": 0.9599096555618295, "grad_norm": 0.5604745745658875, "learning_rate": 4.6795711867766436e-07, "loss": 1.6895, "step": 425 }, { "epoch": 0.9621682665160926, "grad_norm": 0.7266672849655151, "learning_rate": 4.1747666635733597e-07, "loss": 1.3143, "step": 426 }, { "epoch": 0.9644268774703557, "grad_norm": 0.7507563829421997, "learning_rate": 3.698651851329837e-07, "loss": 1.4809, "step": 427 }, { "epoch": 0.9666854884246189, "grad_norm": 0.5136380195617676, "learning_rate": 3.251254299261874e-07, "loss": 1.6922, "step": 428 }, { "epoch": 0.968944099378882, "grad_norm": 0.7942318916320801, "learning_rate": 2.8325998949314536e-07, "loss": 1.5874, "step": 429 }, { "epoch": 0.9712027103331451, "grad_norm": 0.46300673484802246, "learning_rate": 2.442712862748775e-07, "loss": 1.7898, "step": 430 }, { "epoch": 0.9734613212874083, "grad_norm": 0.6884695291519165, "learning_rate": 2.0816157625706545e-07, "loss": 1.33, "step": 431 }, { "epoch": 0.9757199322416714, "grad_norm": 0.5976503491401672, "learning_rate": 1.749329488395124e-07, "loss": 1.6437, "step": 432 }, { "epoch": 0.9779785431959345, "grad_norm": 0.42137616872787476, "learning_rate": 1.4458732671523977e-07, "loss": 1.2883, "step": 433 }, { "epoch": 0.9802371541501976, "grad_norm": 0.6945174932479858, "learning_rate": 1.1712646575922637e-07, "loss": 1.3545, "step": 434 }, { "epoch": 0.9824957651044608, "grad_norm": 0.5032137036323547, "learning_rate": 9.255195492685609e-08, "loss": 1.4777, "step": 435 }, { "epoch": 0.9847543760587238, "grad_norm": 0.6473844647407532, "learning_rate": 7.086521616190279e-08, "loss": 1.4482, "step": 436 }, { "epoch": 0.987012987012987, "grad_norm": 1.0584694147109985, "learning_rate": 5.2067504314323723e-08, "loss": 1.2144, "step": 437 }, { "epoch": 0.9892715979672502, "grad_norm": 0.4894103407859802, "learning_rate": 3.6159907067601085e-08, "loss": 1.3691, "step": 438 }, { "epoch": 0.9915302089215132, "grad_norm": 0.6497173309326172, "learning_rate": 2.3143344875831142e-08, "loss": 1.3208, "step": 439 }, { "epoch": 0.9937888198757764, "grad_norm": 0.5225093364715576, "learning_rate": 1.3018570910466877e-08, "loss": 1.1954, "step": 440 }, { "epoch": 0.9960474308300395, "grad_norm": 0.574373185634613, "learning_rate": 5.786171016708419e-09, "loss": 1.598, "step": 441 }, { "epoch": 0.9983060417843026, "grad_norm": 0.6945062875747681, "learning_rate": 1.446563679641244e-09, "loss": 1.4279, "step": 442 }, { "epoch": 1.0005646527385659, "grad_norm": 0.6715332865715027, "learning_rate": 0.0, "loss": 1.5682, "step": 443 } ], "logging_steps": 1, "max_steps": 443, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.630732306700042e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }