{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0005646527385659,
  "eval_steps": 222,
  "global_step": 443,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002258610954263128,
      "grad_norm": 0.734693706035614,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 2.1698,
      "step": 1
    },
    {
      "epoch": 0.002258610954263128,
      "eval_loss": 2.083223819732666,
      "eval_runtime": 98.3201,
      "eval_samples_per_second": 7.587,
      "eval_steps_per_second": 0.956,
      "step": 1
    },
    {
      "epoch": 0.004517221908526256,
      "grad_norm": 0.7944502830505371,
      "learning_rate": 6.666666666666667e-06,
      "loss": 2.2383,
      "step": 2
    },
    {
      "epoch": 0.006775832862789385,
      "grad_norm": 0.9303544759750366,
      "learning_rate": 1e-05,
      "loss": 1.952,
      "step": 3
    },
    {
      "epoch": 0.009034443817052512,
      "grad_norm": 0.9056912660598755,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 2.306,
      "step": 4
    },
    {
      "epoch": 0.01129305477131564,
      "grad_norm": 1.3058993816375732,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 2.3384,
      "step": 5
    },
    {
      "epoch": 0.01355166572557877,
      "grad_norm": 1.2961996793746948,
      "learning_rate": 2e-05,
      "loss": 2.3395,
      "step": 6
    },
    {
      "epoch": 0.015810276679841896,
      "grad_norm": 0.8225818276405334,
      "learning_rate": 2.3333333333333336e-05,
      "loss": 1.6852,
      "step": 7
    },
    {
      "epoch": 0.018068887634105024,
      "grad_norm": 0.7992662787437439,
      "learning_rate": 2.6666666666666667e-05,
      "loss": 1.7517,
      "step": 8
    },
    {
      "epoch": 0.020327498588368152,
      "grad_norm": 1.9861479997634888,
      "learning_rate": 3e-05,
      "loss": 2.6289,
      "step": 9
    },
    {
      "epoch": 0.02258610954263128,
      "grad_norm": 0.720796525478363,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 2.3053,
      "step": 10
    },
    {
      "epoch": 0.024844720496894408,
      "grad_norm": 0.8933535218238831,
      "learning_rate": 3.6666666666666666e-05,
      "loss": 2.0577,
      "step": 11
    },
    {
      "epoch": 0.02710333145115754,
      "grad_norm": 1.1678780317306519,
      "learning_rate": 4e-05,
      "loss": 2.1599,
      "step": 12
    },
    {
      "epoch": 0.029361942405420668,
      "grad_norm": 1.06686270236969,
      "learning_rate": 4.3333333333333334e-05,
      "loss": 1.9538,
      "step": 13
    },
    {
      "epoch": 0.03162055335968379,
      "grad_norm": 1.0956751108169556,
      "learning_rate": 4.666666666666667e-05,
      "loss": 1.8666,
      "step": 14
    },
    {
      "epoch": 0.03387916431394692,
      "grad_norm": 1.3107072114944458,
      "learning_rate": 5e-05,
      "loss": 2.0312,
      "step": 15
    },
    {
      "epoch": 0.03613777526821005,
      "grad_norm": 1.6173886060714722,
      "learning_rate": 5.333333333333333e-05,
      "loss": 1.9908,
      "step": 16
    },
    {
      "epoch": 0.038396386222473176,
      "grad_norm": 1.1099895238876343,
      "learning_rate": 5.666666666666667e-05,
      "loss": 1.6063,
      "step": 17
    },
    {
      "epoch": 0.040654997176736304,
      "grad_norm": 1.4625017642974854,
      "learning_rate": 6e-05,
      "loss": 1.6362,
      "step": 18
    },
    {
      "epoch": 0.04291360813099943,
      "grad_norm": 1.05028235912323,
      "learning_rate": 6.333333333333333e-05,
      "loss": 1.7419,
      "step": 19
    },
    {
      "epoch": 0.04517221908526256,
      "grad_norm": 0.9315014481544495,
      "learning_rate": 6.666666666666667e-05,
      "loss": 1.6991,
      "step": 20
    },
    {
      "epoch": 0.04743083003952569,
      "grad_norm": 1.2721033096313477,
      "learning_rate": 7e-05,
      "loss": 1.5631,
      "step": 21
    },
    {
      "epoch": 0.049689440993788817,
      "grad_norm": 1.578466534614563,
      "learning_rate": 7.333333333333333e-05,
      "loss": 1.6748,
      "step": 22
    },
    {
      "epoch": 0.05194805194805195,
      "grad_norm": 1.692025065422058,
      "learning_rate": 7.666666666666667e-05,
      "loss": 1.607,
      "step": 23
    },
    {
      "epoch": 0.05420666290231508,
      "grad_norm": 1.251129388809204,
      "learning_rate": 8e-05,
      "loss": 1.8119,
      "step": 24
    },
    {
      "epoch": 0.05646527385657821,
      "grad_norm": 1.340306043624878,
      "learning_rate": 8.333333333333334e-05,
      "loss": 1.6167,
      "step": 25
    },
    {
      "epoch": 0.058723884810841336,
      "grad_norm": 1.1570920944213867,
      "learning_rate": 8.666666666666667e-05,
      "loss": 1.7172,
      "step": 26
    },
    {
      "epoch": 0.060982495765104464,
      "grad_norm": 1.0314528942108154,
      "learning_rate": 9e-05,
      "loss": 1.8385,
      "step": 27
    },
    {
      "epoch": 0.06324110671936758,
      "grad_norm": 1.032861351966858,
      "learning_rate": 9.333333333333334e-05,
      "loss": 1.3839,
      "step": 28
    },
    {
      "epoch": 0.06549971767363072,
      "grad_norm": 0.9600235819816589,
      "learning_rate": 9.666666666666667e-05,
      "loss": 1.4608,
      "step": 29
    },
    {
      "epoch": 0.06775832862789384,
      "grad_norm": 0.9947760105133057,
      "learning_rate": 0.0001,
      "loss": 1.4875,
      "step": 30
    },
    {
      "epoch": 0.07001693958215698,
      "grad_norm": 1.2269097566604614,
      "learning_rate": 9.999855343632036e-05,
      "loss": 1.6437,
      "step": 31
    },
    {
      "epoch": 0.0722755505364201,
      "grad_norm": 0.865011990070343,
      "learning_rate": 9.999421382898329e-05,
      "loss": 1.6969,
      "step": 32
    },
    {
      "epoch": 0.07453416149068323,
      "grad_norm": 0.6729279160499573,
      "learning_rate": 9.998698142908953e-05,
      "loss": 1.2089,
      "step": 33
    },
    {
      "epoch": 0.07679277244494635,
      "grad_norm": 1.4308388233184814,
      "learning_rate": 9.997685665512418e-05,
      "loss": 1.3744,
      "step": 34
    },
    {
      "epoch": 0.07905138339920949,
      "grad_norm": 0.582622230052948,
      "learning_rate": 9.99638400929324e-05,
      "loss": 1.2607,
      "step": 35
    },
    {
      "epoch": 0.08130999435347261,
      "grad_norm": 0.5864912867546082,
      "learning_rate": 9.994793249568569e-05,
      "loss": 1.3634,
      "step": 36
    },
    {
      "epoch": 0.08356860530773574,
      "grad_norm": 0.5478243827819824,
      "learning_rate": 9.99291347838381e-05,
      "loss": 1.5985,
      "step": 37
    },
    {
      "epoch": 0.08582721626199886,
      "grad_norm": 0.7038874626159668,
      "learning_rate": 9.990744804507315e-05,
      "loss": 1.3606,
      "step": 38
    },
    {
      "epoch": 0.088085827216262,
      "grad_norm": 0.5414575934410095,
      "learning_rate": 9.988287353424077e-05,
      "loss": 1.7192,
      "step": 39
    },
    {
      "epoch": 0.09034443817052512,
      "grad_norm": 0.5917218327522278,
      "learning_rate": 9.985541267328477e-05,
      "loss": 1.5298,
      "step": 40
    },
    {
      "epoch": 0.09260304912478826,
      "grad_norm": 0.5787907838821411,
      "learning_rate": 9.98250670511605e-05,
      "loss": 1.9064,
      "step": 41
    },
    {
      "epoch": 0.09486166007905138,
      "grad_norm": 0.8571286201477051,
      "learning_rate": 9.979183842374293e-05,
      "loss": 1.3743,
      "step": 42
    },
    {
      "epoch": 0.09712027103331451,
      "grad_norm": 0.7187583446502686,
      "learning_rate": 9.975572871372513e-05,
      "loss": 1.3655,
      "step": 43
    },
    {
      "epoch": 0.09937888198757763,
      "grad_norm": 1.4509934186935425,
      "learning_rate": 9.971674001050686e-05,
      "loss": 1.4908,
      "step": 44
    },
    {
      "epoch": 0.10163749294184077,
      "grad_norm": 0.7309430837631226,
      "learning_rate": 9.967487457007381e-05,
      "loss": 1.0838,
      "step": 45
    },
    {
      "epoch": 0.1038961038961039,
      "grad_norm": 0.7827832102775574,
      "learning_rate": 9.963013481486703e-05,
      "loss": 1.5432,
      "step": 46
    },
    {
      "epoch": 0.10615471485036702,
      "grad_norm": 0.6648978590965271,
      "learning_rate": 9.958252333364267e-05,
      "loss": 1.1893,
      "step": 47
    },
    {
      "epoch": 0.10841332580463016,
      "grad_norm": 1.085577130317688,
      "learning_rate": 9.953204288132234e-05,
      "loss": 1.3416,
      "step": 48
    },
    {
      "epoch": 0.11067193675889328,
      "grad_norm": 1.0411715507507324,
      "learning_rate": 9.947869637883358e-05,
      "loss": 1.4301,
      "step": 49
    },
    {
      "epoch": 0.11293054771315642,
      "grad_norm": 0.6772856116294861,
      "learning_rate": 9.942248691294093e-05,
      "loss": 1.1494,
      "step": 50
    },
    {
      "epoch": 0.11518915866741954,
      "grad_norm": 1.099043846130371,
      "learning_rate": 9.936341773606723e-05,
      "loss": 1.4349,
      "step": 51
    },
    {
      "epoch": 0.11744776962168267,
      "grad_norm": 0.7465451955795288,
      "learning_rate": 9.930149226610554e-05,
      "loss": 1.4149,
      "step": 52
    },
    {
      "epoch": 0.11970638057594579,
      "grad_norm": 0.6757813096046448,
      "learning_rate": 9.923671408622129e-05,
      "loss": 1.3782,
      "step": 53
    },
    {
      "epoch": 0.12196499153020893,
      "grad_norm": 0.6110934019088745,
      "learning_rate": 9.916908694464492e-05,
      "loss": 1.5505,
      "step": 54
    },
    {
      "epoch": 0.12422360248447205,
      "grad_norm": 0.8363707065582275,
      "learning_rate": 9.909861475445517e-05,
      "loss": 1.6348,
      "step": 55
    },
    {
      "epoch": 0.12648221343873517,
      "grad_norm": 0.5607156753540039,
      "learning_rate": 9.902530159335243e-05,
      "loss": 1.6425,
      "step": 56
    },
    {
      "epoch": 0.1287408243929983,
      "grad_norm": 0.6306595802307129,
      "learning_rate": 9.894915170342295e-05,
      "loss": 1.2802,
      "step": 57
    },
    {
      "epoch": 0.13099943534726144,
      "grad_norm": 0.8412261009216309,
      "learning_rate": 9.887016949089333e-05,
      "loss": 1.5162,
      "step": 58
    },
    {
      "epoch": 0.13325804630152457,
      "grad_norm": 1.7483497858047485,
      "learning_rate": 9.878835952587559e-05,
      "loss": 1.4394,
      "step": 59
    },
    {
      "epoch": 0.13551665725578768,
      "grad_norm": 0.6815210580825806,
      "learning_rate": 9.870372654210265e-05,
      "loss": 1.4947,
      "step": 60
    },
    {
      "epoch": 0.13777526821005082,
      "grad_norm": 0.819712221622467,
      "learning_rate": 9.861627543665456e-05,
      "loss": 1.8876,
      "step": 61
    },
    {
      "epoch": 0.14003387916431395,
      "grad_norm": 0.6188887357711792,
      "learning_rate": 9.852601126967502e-05,
      "loss": 1.4265,
      "step": 62
    },
    {
      "epoch": 0.1422924901185771,
      "grad_norm": 0.5934920310974121,
      "learning_rate": 9.843293926407866e-05,
      "loss": 1.4272,
      "step": 63
    },
    {
      "epoch": 0.1445511010728402,
      "grad_norm": 0.554636538028717,
      "learning_rate": 9.833706480524878e-05,
      "loss": 1.6095,
      "step": 64
    },
    {
      "epoch": 0.14680971202710333,
      "grad_norm": 0.545122504234314,
      "learning_rate": 9.82383934407258e-05,
      "loss": 1.2429,
      "step": 65
    },
    {
      "epoch": 0.14906832298136646,
      "grad_norm": 0.674778401851654,
      "learning_rate": 9.81369308798862e-05,
      "loss": 1.3112,
      "step": 66
    },
    {
      "epoch": 0.1513269339356296,
      "grad_norm": 0.4785314202308655,
      "learning_rate": 9.803268299361217e-05,
      "loss": 1.2792,
      "step": 67
    },
    {
      "epoch": 0.1535855448898927,
      "grad_norm": 0.7849199175834656,
      "learning_rate": 9.7925655813952e-05,
      "loss": 1.557,
      "step": 68
    },
    {
      "epoch": 0.15584415584415584,
      "grad_norm": 0.7094368934631348,
      "learning_rate": 9.781585553377085e-05,
      "loss": 1.3404,
      "step": 69
    },
    {
      "epoch": 0.15810276679841898,
      "grad_norm": 0.5156518220901489,
      "learning_rate": 9.770328850639268e-05,
      "loss": 1.2939,
      "step": 70
    },
    {
      "epoch": 0.1603613777526821,
      "grad_norm": 0.6321349143981934,
      "learning_rate": 9.758796124523239e-05,
      "loss": 1.684,
      "step": 71
    },
    {
      "epoch": 0.16261998870694522,
      "grad_norm": 0.8690780401229858,
      "learning_rate": 9.746988042341906e-05,
      "loss": 1.2346,
      "step": 72
    },
    {
      "epoch": 0.16487859966120835,
      "grad_norm": 0.5986551642417908,
      "learning_rate": 9.734905287340985e-05,
      "loss": 1.4559,
      "step": 73
    },
    {
      "epoch": 0.1671372106154715,
      "grad_norm": 0.5338417887687683,
      "learning_rate": 9.722548558659457e-05,
      "loss": 1.6537,
      "step": 74
    },
    {
      "epoch": 0.16939582156973462,
      "grad_norm": 0.7366315126419067,
      "learning_rate": 9.709918571289114e-05,
      "loss": 1.3941,
      "step": 75
    },
    {
      "epoch": 0.17165443252399773,
      "grad_norm": 0.7297102212905884,
      "learning_rate": 9.697016056033201e-05,
      "loss": 1.5824,
      "step": 76
    },
    {
      "epoch": 0.17391304347826086,
      "grad_norm": 0.5532839894294739,
      "learning_rate": 9.683841759464113e-05,
      "loss": 1.4119,
      "step": 77
    },
    {
      "epoch": 0.176171654432524,
      "grad_norm": 0.5008260011672974,
      "learning_rate": 9.670396443880208e-05,
      "loss": 1.5714,
      "step": 78
    },
    {
      "epoch": 0.17843026538678713,
      "grad_norm": 0.7788993716239929,
      "learning_rate": 9.656680887261693e-05,
      "loss": 1.4765,
      "step": 79
    },
    {
      "epoch": 0.18068887634105024,
      "grad_norm": 0.7315067648887634,
      "learning_rate": 9.64269588322561e-05,
      "loss": 1.6218,
      "step": 80
    },
    {
      "epoch": 0.18294748729531338,
      "grad_norm": 0.7663245797157288,
      "learning_rate": 9.628442240979916e-05,
      "loss": 1.381,
      "step": 81
    },
    {
      "epoch": 0.1852060982495765,
      "grad_norm": 0.5851924419403076,
      "learning_rate": 9.613920785276656e-05,
      "loss": 1.6193,
      "step": 82
    },
    {
      "epoch": 0.18746470920383965,
      "grad_norm": 0.8395928144454956,
      "learning_rate": 9.599132356364247e-05,
      "loss": 1.3371,
      "step": 83
    },
    {
      "epoch": 0.18972332015810275,
      "grad_norm": 0.9145395159721375,
      "learning_rate": 9.584077809938855e-05,
      "loss": 1.3984,
      "step": 84
    },
    {
      "epoch": 0.1919819311123659,
      "grad_norm": 0.6220190525054932,
      "learning_rate": 9.568758017094883e-05,
      "loss": 1.2248,
      "step": 85
    },
    {
      "epoch": 0.19424054206662902,
      "grad_norm": 0.5492339730262756,
      "learning_rate": 9.553173864274567e-05,
      "loss": 1.4326,
      "step": 86
    },
    {
      "epoch": 0.19649915302089216,
      "grad_norm": 0.47741296887397766,
      "learning_rate": 9.537326253216685e-05,
      "loss": 1.4092,
      "step": 87
    },
    {
      "epoch": 0.19875776397515527,
      "grad_norm": 0.5220446586608887,
      "learning_rate": 9.521216100904378e-05,
      "loss": 1.5837,
      "step": 88
    },
    {
      "epoch": 0.2010163749294184,
      "grad_norm": 0.5066771507263184,
      "learning_rate": 9.504844339512095e-05,
      "loss": 1.4179,
      "step": 89
    },
    {
      "epoch": 0.20327498588368154,
      "grad_norm": 0.6398409605026245,
      "learning_rate": 9.488211916351656e-05,
      "loss": 1.3896,
      "step": 90
    },
    {
      "epoch": 0.20553359683794467,
      "grad_norm": 0.7091066241264343,
      "learning_rate": 9.471319793817426e-05,
      "loss": 1.6861,
      "step": 91
    },
    {
      "epoch": 0.2077922077922078,
      "grad_norm": 1.067958116531372,
      "learning_rate": 9.454168949330645e-05,
      "loss": 1.417,
      "step": 92
    },
    {
      "epoch": 0.2100508187464709,
      "grad_norm": 0.5348410606384277,
      "learning_rate": 9.436760375282859e-05,
      "loss": 1.4303,
      "step": 93
    },
    {
      "epoch": 0.21230942970073405,
      "grad_norm": 0.5271281599998474,
      "learning_rate": 9.419095078978506e-05,
      "loss": 1.2863,
      "step": 94
    },
    {
      "epoch": 0.21456804065499718,
      "grad_norm": 0.6372822523117065,
      "learning_rate": 9.40117408257663e-05,
      "loss": 1.4036,
      "step": 95
    },
    {
      "epoch": 0.21682665160926032,
      "grad_norm": 0.5258705615997314,
      "learning_rate": 9.382998423031727e-05,
      "loss": 1.7694,
      "step": 96
    },
    {
      "epoch": 0.21908526256352343,
      "grad_norm": 0.5894024968147278,
      "learning_rate": 9.364569152033756e-05,
      "loss": 1.7329,
      "step": 97
    },
    {
      "epoch": 0.22134387351778656,
      "grad_norm": 0.7126629948616028,
      "learning_rate": 9.345887335947281e-05,
      "loss": 1.6303,
      "step": 98
    },
    {
      "epoch": 0.2236024844720497,
      "grad_norm": 0.5408257842063904,
      "learning_rate": 9.326954055749767e-05,
      "loss": 1.6237,
      "step": 99
    },
    {
      "epoch": 0.22586109542631283,
      "grad_norm": 0.673343300819397,
      "learning_rate": 9.30777040696903e-05,
      "loss": 1.1694,
      "step": 100
    },
    {
      "epoch": 0.22811970638057594,
      "grad_norm": 0.6288356781005859,
      "learning_rate": 9.288337499619857e-05,
      "loss": 1.3256,
      "step": 101
    },
    {
      "epoch": 0.23037831733483907,
      "grad_norm": 0.5580915212631226,
      "learning_rate": 9.268656458139762e-05,
      "loss": 1.4626,
      "step": 102
    },
    {
      "epoch": 0.2326369282891022,
      "grad_norm": 0.5487126708030701,
      "learning_rate": 9.248728421323941e-05,
      "loss": 1.5227,
      "step": 103
    },
    {
      "epoch": 0.23489553924336534,
      "grad_norm": 0.712181031703949,
      "learning_rate": 9.22855454225936e-05,
      "loss": 1.082,
      "step": 104
    },
    {
      "epoch": 0.23715415019762845,
      "grad_norm": 0.7502368092536926,
      "learning_rate": 9.208135988258051e-05,
      "loss": 1.4804,
      "step": 105
    },
    {
      "epoch": 0.23941276115189158,
      "grad_norm": 0.9380141496658325,
      "learning_rate": 9.187473940789557e-05,
      "loss": 1.4093,
      "step": 106
    },
    {
      "epoch": 0.24167137210615472,
      "grad_norm": 0.7033969759941101,
      "learning_rate": 9.166569595412575e-05,
      "loss": 1.3279,
      "step": 107
    },
    {
      "epoch": 0.24392998306041785,
      "grad_norm": 0.9007672667503357,
      "learning_rate": 9.145424161705776e-05,
      "loss": 1.3571,
      "step": 108
    },
    {
      "epoch": 0.24618859401468096,
      "grad_norm": 0.555966854095459,
      "learning_rate": 9.124038863197818e-05,
      "loss": 1.475,
      "step": 109
    },
    {
      "epoch": 0.2484472049689441,
      "grad_norm": 0.5234211683273315,
      "learning_rate": 9.10241493729654e-05,
      "loss": 1.5519,
      "step": 110
    },
    {
      "epoch": 0.25070581592320723,
      "grad_norm": 0.5980601906776428,
      "learning_rate": 9.08055363521738e-05,
      "loss": 1.5623,
      "step": 111
    },
    {
      "epoch": 0.25296442687747034,
      "grad_norm": 0.6727840900421143,
      "learning_rate": 9.058456221910956e-05,
      "loss": 1.4898,
      "step": 112
    },
    {
      "epoch": 0.2552230378317335,
      "grad_norm": 0.6955089569091797,
      "learning_rate": 9.036123975989892e-05,
      "loss": 1.4752,
      "step": 113
    },
    {
      "epoch": 0.2574816487859966,
      "grad_norm": 0.6179929971694946,
      "learning_rate": 9.013558189654819e-05,
      "loss": 1.4823,
      "step": 114
    },
    {
      "epoch": 0.2597402597402597,
      "grad_norm": 0.6038258671760559,
      "learning_rate": 8.990760168619615e-05,
      "loss": 1.2696,
      "step": 115
    },
    {
      "epoch": 0.2619988706945229,
      "grad_norm": 0.6415106058120728,
      "learning_rate": 8.967731232035847e-05,
      "loss": 1.8538,
      "step": 116
    },
    {
      "epoch": 0.264257481648786,
      "grad_norm": 0.4856817424297333,
      "learning_rate": 8.944472712416447e-05,
      "loss": 1.4395,
      "step": 117
    },
    {
      "epoch": 0.26651609260304915,
      "grad_norm": 0.5568715929985046,
      "learning_rate": 8.9209859555586e-05,
      "loss": 1.2157,
      "step": 118
    },
    {
      "epoch": 0.26877470355731226,
      "grad_norm": 0.5347578525543213,
      "learning_rate": 8.897272320465887e-05,
      "loss": 1.4817,
      "step": 119
    },
    {
      "epoch": 0.27103331451157536,
      "grad_norm": 0.5750213861465454,
      "learning_rate": 8.873333179269635e-05,
      "loss": 1.477,
      "step": 120
    },
    {
      "epoch": 0.2732919254658385,
      "grad_norm": 0.588683009147644,
      "learning_rate": 8.849169917149531e-05,
      "loss": 1.6019,
      "step": 121
    },
    {
      "epoch": 0.27555053642010163,
      "grad_norm": 0.511512279510498,
      "learning_rate": 8.82478393225347e-05,
      "loss": 1.6251,
      "step": 122
    },
    {
      "epoch": 0.27780914737436474,
      "grad_norm": 0.6532254815101624,
      "learning_rate": 8.800176635616657e-05,
      "loss": 1.7525,
      "step": 123
    },
    {
      "epoch": 0.2800677583286279,
      "grad_norm": 0.6906523108482361,
      "learning_rate": 8.775349451079948e-05,
      "loss": 1.3276,
      "step": 124
    },
    {
      "epoch": 0.282326369282891,
      "grad_norm": 0.7139832973480225,
      "learning_rate": 8.750303815207486e-05,
      "loss": 1.6493,
      "step": 125
    },
    {
      "epoch": 0.2845849802371542,
      "grad_norm": 0.6658667325973511,
      "learning_rate": 8.725041177203554e-05,
      "loss": 1.2206,
      "step": 126
    },
    {
      "epoch": 0.2868435911914173,
      "grad_norm": 0.6301794648170471,
      "learning_rate": 8.699562998828738e-05,
      "loss": 1.4198,
      "step": 127
    },
    {
      "epoch": 0.2891022021456804,
      "grad_norm": 0.5198056697845459,
      "learning_rate": 8.673870754315336e-05,
      "loss": 1.3772,
      "step": 128
    },
    {
      "epoch": 0.29136081309994355,
      "grad_norm": 0.6238622069358826,
      "learning_rate": 8.647965930282059e-05,
      "loss": 1.4059,
      "step": 129
    },
    {
      "epoch": 0.29361942405420666,
      "grad_norm": 0.7875000834465027,
      "learning_rate": 8.621850025648009e-05,
      "loss": 1.2813,
      "step": 130
    },
    {
      "epoch": 0.29587803500846976,
      "grad_norm": 0.8984452486038208,
      "learning_rate": 8.59552455154595e-05,
      "loss": 1.7444,
      "step": 131
    },
    {
      "epoch": 0.2981366459627329,
      "grad_norm": 0.520828902721405,
      "learning_rate": 8.56899103123487e-05,
      "loss": 1.3697,
      "step": 132
    },
    {
      "epoch": 0.30039525691699603,
      "grad_norm": 0.5849810242652893,
      "learning_rate": 8.54225100001184e-05,
      "loss": 1.2721,
      "step": 133
    },
    {
      "epoch": 0.3026538678712592,
      "grad_norm": 0.5514599680900574,
      "learning_rate": 8.51530600512318e-05,
      "loss": 1.4966,
      "step": 134
    },
    {
      "epoch": 0.3049124788255223,
      "grad_norm": 1.624732255935669,
      "learning_rate": 8.488157605674925e-05,
      "loss": 1.2903,
      "step": 135
    },
    {
      "epoch": 0.3071710897797854,
      "grad_norm": 0.5830861926078796,
      "learning_rate": 8.460807372542618e-05,
      "loss": 1.2969,
      "step": 136
    },
    {
      "epoch": 0.3094297007340486,
      "grad_norm": 0.8161193132400513,
      "learning_rate": 8.43325688828042e-05,
      "loss": 1.1874,
      "step": 137
    },
    {
      "epoch": 0.3116883116883117,
      "grad_norm": 0.47100022435188293,
      "learning_rate": 8.405507747029523e-05,
      "loss": 1.3229,
      "step": 138
    },
    {
      "epoch": 0.31394692264257484,
      "grad_norm": 0.4437452554702759,
      "learning_rate": 8.377561554425922e-05,
      "loss": 1.1945,
      "step": 139
    },
    {
      "epoch": 0.31620553359683795,
      "grad_norm": 0.566750705242157,
      "learning_rate": 8.349419927507505e-05,
      "loss": 1.4707,
      "step": 140
    },
    {
      "epoch": 0.31846414455110106,
      "grad_norm": 0.5701163411140442,
      "learning_rate": 8.321084494620488e-05,
      "loss": 1.5681,
      "step": 141
    },
    {
      "epoch": 0.3207227555053642,
      "grad_norm": 0.5442900657653809,
      "learning_rate": 8.292556895325194e-05,
      "loss": 1.2291,
      "step": 142
    },
    {
      "epoch": 0.32298136645962733,
      "grad_norm": 0.5684903860092163,
      "learning_rate": 8.263838780301182e-05,
      "loss": 1.3748,
      "step": 143
    },
    {
      "epoch": 0.32523997741389044,
      "grad_norm": 0.7141373157501221,
      "learning_rate": 8.234931811251739e-05,
      "loss": 1.1836,
      "step": 144
    },
    {
      "epoch": 0.3274985883681536,
      "grad_norm": 0.7630109190940857,
      "learning_rate": 8.205837660807725e-05,
      "loss": 1.602,
      "step": 145
    },
    {
      "epoch": 0.3297571993224167,
      "grad_norm": 0.6189699172973633,
      "learning_rate": 8.176558012430791e-05,
      "loss": 1.273,
      "step": 146
    },
    {
      "epoch": 0.33201581027667987,
      "grad_norm": 0.7228075861930847,
      "learning_rate": 8.147094560315977e-05,
      "loss": 1.1563,
      "step": 147
    },
    {
      "epoch": 0.334274421230943,
      "grad_norm": 0.5667940378189087,
      "learning_rate": 8.117449009293668e-05,
      "loss": 1.3525,
      "step": 148
    },
    {
      "epoch": 0.3365330321852061,
      "grad_norm": 0.4601518213748932,
      "learning_rate": 8.08762307473096e-05,
      "loss": 1.2889,
      "step": 149
    },
    {
      "epoch": 0.33879164313946925,
      "grad_norm": 0.5403670072555542,
      "learning_rate": 8.057618482432399e-05,
      "loss": 1.4927,
      "step": 150
    },
    {
      "epoch": 0.34105025409373235,
      "grad_norm": 0.5279664993286133,
      "learning_rate": 8.027436968540123e-05,
      "loss": 1.4293,
      "step": 151
    },
    {
      "epoch": 0.34330886504799546,
      "grad_norm": 0.517234206199646,
      "learning_rate": 7.997080279433402e-05,
      "loss": 1.2646,
      "step": 152
    },
    {
      "epoch": 0.3455674760022586,
      "grad_norm": 0.6740924119949341,
      "learning_rate": 7.966550171627592e-05,
      "loss": 1.3801,
      "step": 153
    },
    {
      "epoch": 0.34782608695652173,
      "grad_norm": 0.47068697214126587,
      "learning_rate": 7.9358484116725e-05,
      "loss": 1.3094,
      "step": 154
    },
    {
      "epoch": 0.3500846979107849,
      "grad_norm": 0.5425541996955872,
      "learning_rate": 7.904976776050156e-05,
      "loss": 1.5867,
      "step": 155
    },
    {
      "epoch": 0.352343308865048,
      "grad_norm": 0.41730475425720215,
      "learning_rate": 7.873937051072035e-05,
      "loss": 1.1469,
      "step": 156
    },
    {
      "epoch": 0.3546019198193111,
      "grad_norm": 0.632966160774231,
      "learning_rate": 7.842731032775687e-05,
      "loss": 1.4293,
      "step": 157
    },
    {
      "epoch": 0.35686053077357427,
      "grad_norm": 0.6313096880912781,
      "learning_rate": 7.81136052682082e-05,
      "loss": 1.2995,
      "step": 158
    },
    {
      "epoch": 0.3591191417278374,
      "grad_norm": 1.015628457069397,
      "learning_rate": 7.779827348384813e-05,
      "loss": 1.6397,
      "step": 159
    },
    {
      "epoch": 0.3613777526821005,
      "grad_norm": 0.6771583557128906,
      "learning_rate": 7.748133322057693e-05,
      "loss": 1.5359,
      "step": 160
    },
    {
      "epoch": 0.36363636363636365,
      "grad_norm": 0.6180778741836548,
      "learning_rate": 7.716280281736551e-05,
      "loss": 1.6158,
      "step": 161
    },
    {
      "epoch": 0.36589497459062675,
      "grad_norm": 0.46877238154411316,
      "learning_rate": 7.68427007051944e-05,
      "loss": 1.3506,
      "step": 162
    },
    {
      "epoch": 0.3681535855448899,
      "grad_norm": 0.5611956119537354,
      "learning_rate": 7.652104540598712e-05,
      "loss": 1.2563,
      "step": 163
    },
    {
      "epoch": 0.370412196499153,
      "grad_norm": 0.5148677229881287,
      "learning_rate": 7.619785553153864e-05,
      "loss": 1.4294,
      "step": 164
    },
    {
      "epoch": 0.37267080745341613,
      "grad_norm": 0.49939846992492676,
      "learning_rate": 7.58731497824383e-05,
      "loss": 1.2108,
      "step": 165
    },
    {
      "epoch": 0.3749294184076793,
      "grad_norm": 0.6522055864334106,
      "learning_rate": 7.554694694698784e-05,
      "loss": 1.3305,
      "step": 166
    },
    {
      "epoch": 0.3771880293619424,
      "grad_norm": 0.5979933738708496,
      "learning_rate": 7.521926590011418e-05,
      "loss": 1.4971,
      "step": 167
    },
    {
      "epoch": 0.3794466403162055,
      "grad_norm": 2.253812551498413,
      "learning_rate": 7.489012560227742e-05,
      "loss": 1.1929,
      "step": 168
    },
    {
      "epoch": 0.38170525127046867,
      "grad_norm": 0.5582684874534607,
      "learning_rate": 7.455954509837352e-05,
      "loss": 1.4669,
      "step": 169
    },
    {
      "epoch": 0.3839638622247318,
      "grad_norm": 0.5619791150093079,
      "learning_rate": 7.422754351663252e-05,
      "loss": 1.4651,
      "step": 170
    },
    {
      "epoch": 0.38622247317899494,
      "grad_norm": 0.4569830000400543,
      "learning_rate": 7.389414006751158e-05,
      "loss": 1.4677,
      "step": 171
    },
    {
      "epoch": 0.38848108413325805,
      "grad_norm": 1.0484251976013184,
      "learning_rate": 7.355935404258354e-05,
      "loss": 1.4736,
      "step": 172
    },
    {
      "epoch": 0.39073969508752115,
      "grad_norm": 0.5541074872016907,
      "learning_rate": 7.322320481342054e-05,
      "loss": 1.4604,
      "step": 173
    },
    {
      "epoch": 0.3929983060417843,
      "grad_norm": 0.6257642507553101,
      "learning_rate": 7.288571183047322e-05,
      "loss": 1.1331,
      "step": 174
    },
    {
      "epoch": 0.3952569169960474,
      "grad_norm": 0.5899243354797363,
      "learning_rate": 7.254689462194522e-05,
      "loss": 1.1387,
      "step": 175
    },
    {
      "epoch": 0.39751552795031053,
      "grad_norm": 0.5541165471076965,
      "learning_rate": 7.220677279266327e-05,
      "loss": 1.408,
      "step": 176
    },
    {
      "epoch": 0.3997741389045737,
      "grad_norm": 0.5860551595687866,
      "learning_rate": 7.186536602294278e-05,
      "loss": 1.512,
      "step": 177
    },
    {
      "epoch": 0.4020327498588368,
      "grad_norm": 0.5203275084495544,
      "learning_rate": 7.152269406744903e-05,
      "loss": 1.8094,
      "step": 178
    },
    {
      "epoch": 0.40429136081309996,
      "grad_norm": 0.6692151427268982,
      "learning_rate": 7.117877675405427e-05,
      "loss": 1.4363,
      "step": 179
    },
    {
      "epoch": 0.40654997176736307,
      "grad_norm": 0.624856173992157,
      "learning_rate": 7.083363398269022e-05,
      "loss": 1.4632,
      "step": 180
    },
    {
      "epoch": 0.4088085827216262,
      "grad_norm": 0.5100191235542297,
      "learning_rate": 7.04872857241968e-05,
      "loss": 1.3967,
      "step": 181
    },
    {
      "epoch": 0.41106719367588934,
      "grad_norm": 0.5183005332946777,
      "learning_rate": 7.013975201916648e-05,
      "loss": 1.7088,
      "step": 182
    },
    {
      "epoch": 0.41332580463015245,
      "grad_norm": 0.7876030802726746,
      "learning_rate": 6.979105297678462e-05,
      "loss": 1.4677,
      "step": 183
    },
    {
      "epoch": 0.4155844155844156,
      "grad_norm": 0.5176962614059448,
      "learning_rate": 6.944120877366604e-05,
      "loss": 1.5604,
      "step": 184
    },
    {
      "epoch": 0.4178430265386787,
      "grad_norm": 0.6162248849868774,
      "learning_rate": 6.909023965268746e-05,
      "loss": 1.307,
      "step": 185
    },
    {
      "epoch": 0.4201016374929418,
      "grad_norm": 0.8558477759361267,
      "learning_rate": 6.873816592181617e-05,
      "loss": 1.1848,
      "step": 186
    },
    {
      "epoch": 0.422360248447205,
      "grad_norm": 0.5814364552497864,
      "learning_rate": 6.838500795293505e-05,
      "loss": 1.3717,
      "step": 187
    },
    {
      "epoch": 0.4246188594014681,
      "grad_norm": 0.5346269011497498,
      "learning_rate": 6.803078618066378e-05,
      "loss": 1.444,
      "step": 188
    },
    {
      "epoch": 0.4268774703557312,
      "grad_norm": 0.5764583945274353,
      "learning_rate": 6.767552110117631e-05,
      "loss": 1.4341,
      "step": 189
    },
    {
      "epoch": 0.42913608130999437,
      "grad_norm": 0.4884260892868042,
      "learning_rate": 6.73192332710151e-05,
      "loss": 1.3498,
      "step": 190
    },
    {
      "epoch": 0.4313946922642575,
      "grad_norm": 0.6064755320549011,
      "learning_rate": 6.696194330590151e-05,
      "loss": 1.6995,
      "step": 191
    },
    {
      "epoch": 0.43365330321852064,
      "grad_norm": 0.5664028525352478,
      "learning_rate": 6.660367187954304e-05,
      "loss": 1.418,
      "step": 192
    },
    {
      "epoch": 0.43591191417278374,
      "grad_norm": 0.6558746695518494,
      "learning_rate": 6.624443972243698e-05,
      "loss": 1.2759,
      "step": 193
    },
    {
      "epoch": 0.43817052512704685,
      "grad_norm": 1.469589352607727,
      "learning_rate": 6.5884267620671e-05,
      "loss": 1.2259,
      "step": 194
    },
    {
      "epoch": 0.44042913608131,
      "grad_norm": 0.9882861375808716,
      "learning_rate": 6.552317641472026e-05,
      "loss": 1.4997,
      "step": 195
    },
    {
      "epoch": 0.4426877470355731,
      "grad_norm": 0.4861055910587311,
      "learning_rate": 6.516118699824178e-05,
      "loss": 1.2735,
      "step": 196
    },
    {
      "epoch": 0.4449463579898362,
      "grad_norm": 0.5669353008270264,
      "learning_rate": 6.479832031686521e-05,
      "loss": 1.4849,
      "step": 197
    },
    {
      "epoch": 0.4472049689440994,
      "grad_norm": 0.5639188885688782,
      "learning_rate": 6.443459736698105e-05,
      "loss": 1.6053,
      "step": 198
    },
    {
      "epoch": 0.4494635798983625,
      "grad_norm": 0.5520173907279968,
      "learning_rate": 6.407003919452564e-05,
      "loss": 1.2882,
      "step": 199
    },
    {
      "epoch": 0.45172219085262566,
      "grad_norm": 0.7025582790374756,
      "learning_rate": 6.370466689376342e-05,
      "loss": 1.4892,
      "step": 200
    },
    {
      "epoch": 0.45398080180688877,
      "grad_norm": 0.5788022875785828,
      "learning_rate": 6.33385016060664e-05,
      "loss": 1.7469,
      "step": 201
    },
    {
      "epoch": 0.4562394127611519,
      "grad_norm": 0.5949933528900146,
      "learning_rate": 6.297156451869082e-05,
      "loss": 1.405,
      "step": 202
    },
    {
      "epoch": 0.45849802371541504,
      "grad_norm": 0.5209388732910156,
      "learning_rate": 6.260387686355121e-05,
      "loss": 1.3265,
      "step": 203
    },
    {
      "epoch": 0.46075663466967814,
      "grad_norm": 0.4528113603591919,
      "learning_rate": 6.223545991599184e-05,
      "loss": 1.4738,
      "step": 204
    },
    {
      "epoch": 0.46301524562394125,
      "grad_norm": 0.5056377053260803,
      "learning_rate": 6.186633499355576e-05,
      "loss": 1.6497,
      "step": 205
    },
    {
      "epoch": 0.4652738565782044,
      "grad_norm": 0.5312528014183044,
      "learning_rate": 6.149652345475118e-05,
      "loss": 1.2971,
      "step": 206
    },
    {
      "epoch": 0.4675324675324675,
      "grad_norm": 0.8774858117103577,
      "learning_rate": 6.112604669781572e-05,
      "loss": 1.4465,
      "step": 207
    },
    {
      "epoch": 0.4697910784867307,
      "grad_norm": 0.6204832196235657,
      "learning_rate": 6.075492615947823e-05,
      "loss": 1.3416,
      "step": 208
    },
    {
      "epoch": 0.4720496894409938,
      "grad_norm": 0.4414108097553253,
      "learning_rate": 6.038318331371836e-05,
      "loss": 1.5334,
      "step": 209
    },
    {
      "epoch": 0.4743083003952569,
      "grad_norm": 0.6953609585762024,
      "learning_rate": 6.001083967052408e-05,
      "loss": 1.6932,
      "step": 210
    },
    {
      "epoch": 0.47656691134952006,
      "grad_norm": 0.47844645380973816,
      "learning_rate": 5.963791677464696e-05,
      "loss": 1.7326,
      "step": 211
    },
    {
      "epoch": 0.47882552230378317,
      "grad_norm": 0.5514799952507019,
      "learning_rate": 5.9264436204355724e-05,
      "loss": 1.2659,
      "step": 212
    },
    {
      "epoch": 0.4810841332580463,
      "grad_norm": 0.7047640681266785,
      "learning_rate": 5.889041957018745e-05,
      "loss": 1.2528,
      "step": 213
    },
    {
      "epoch": 0.48334274421230944,
      "grad_norm": 0.5648500919342041,
      "learning_rate": 5.85158885136973e-05,
      "loss": 1.1812,
      "step": 214
    },
    {
      "epoch": 0.48560135516657255,
      "grad_norm": 0.6360241174697876,
      "learning_rate": 5.81408647062062e-05,
      "loss": 1.464,
      "step": 215
    },
    {
      "epoch": 0.4878599661208357,
      "grad_norm": 0.693928599357605,
      "learning_rate": 5.7765369847546916e-05,
      "loss": 1.5254,
      "step": 216
    },
    {
      "epoch": 0.4901185770750988,
      "grad_norm": 0.8636585474014282,
      "learning_rate": 5.7389425664808396e-05,
      "loss": 1.7108,
      "step": 217
    },
    {
      "epoch": 0.4923771880293619,
      "grad_norm": 0.5646620392799377,
      "learning_rate": 5.7013053911078677e-05,
      "loss": 1.4662,
      "step": 218
    },
    {
      "epoch": 0.4946357989836251,
      "grad_norm": 0.48793742060661316,
      "learning_rate": 5.6636276364186105e-05,
      "loss": 1.475,
      "step": 219
    },
    {
      "epoch": 0.4968944099378882,
      "grad_norm": 0.4991232454776764,
      "learning_rate": 5.6259114825439275e-05,
      "loss": 1.5324,
      "step": 220
    },
    {
      "epoch": 0.4991530208921513,
      "grad_norm": 0.6191245317459106,
      "learning_rate": 5.588159111836553e-05,
      "loss": 1.3583,
      "step": 221
    },
    {
      "epoch": 0.5014116318464145,
      "grad_norm": 0.5580266714096069,
      "learning_rate": 5.550372708744815e-05,
      "loss": 1.4188,
      "step": 222
    },
    {
      "epoch": 0.5014116318464145,
      "eval_loss": 1.4097435474395752,
      "eval_runtime": 98.3925,
      "eval_samples_per_second": 7.582,
      "eval_steps_per_second": 0.955,
      "step": 222
    },
    {
      "epoch": 0.5036702428006776,
      "grad_norm": 0.4826272130012512,
      "learning_rate": 5.51255445968625e-05,
      "loss": 1.4089,
      "step": 223
    },
    {
      "epoch": 0.5059288537549407,
      "grad_norm": 0.7236170768737793,
      "learning_rate": 5.4747065529210736e-05,
      "loss": 1.3453,
      "step": 224
    },
    {
      "epoch": 0.5081874647092038,
      "grad_norm": 0.5844079256057739,
      "learning_rate": 5.436831178425582e-05,
      "loss": 1.3397,
      "step": 225
    },
    {
      "epoch": 0.510446075663467,
      "grad_norm": 0.8065000176429749,
      "learning_rate": 5.3989305277654156e-05,
      "loss": 1.4753,
      "step": 226
    },
    {
      "epoch": 0.51270468661773,
      "grad_norm": 0.48151111602783203,
      "learning_rate": 5.361006793968764e-05,
      "loss": 1.2529,
      "step": 227
    },
    {
      "epoch": 0.5149632975719932,
      "grad_norm": 0.6532959342002869,
      "learning_rate": 5.32306217139946e-05,
      "loss": 1.4809,
      "step": 228
    },
    {
      "epoch": 0.5172219085262564,
      "grad_norm": 0.6014647483825684,
      "learning_rate": 5.28509885563002e-05,
      "loss": 1.2814,
      "step": 229
    },
    {
      "epoch": 0.5194805194805194,
      "grad_norm": 0.6728442311286926,
      "learning_rate": 5.247119043314592e-05,
      "loss": 1.4694,
      "step": 230
    },
    {
      "epoch": 0.5217391304347826,
      "grad_norm": 0.5741045475006104,
      "learning_rate": 5.209124932061862e-05,
      "loss": 1.6012,
      "step": 231
    },
    {
      "epoch": 0.5239977413890458,
      "grad_norm": 0.49296098947525024,
      "learning_rate": 5.1711187203078824e-05,
      "loss": 1.4489,
      "step": 232
    },
    {
      "epoch": 0.5262563523433089,
      "grad_norm": 0.6149709820747375,
      "learning_rate": 5.133102607188874e-05,
      "loss": 1.4421,
      "step": 233
    },
    {
      "epoch": 0.528514963297572,
      "grad_norm": 0.6720355749130249,
      "learning_rate": 5.0950787924139764e-05,
      "loss": 1.2675,
      "step": 234
    },
    {
      "epoch": 0.5307735742518351,
      "grad_norm": 0.5325198769569397,
      "learning_rate": 5.057049476137967e-05,
      "loss": 1.1803,
      "step": 235
    },
    {
      "epoch": 0.5330321852060983,
      "grad_norm": 0.5668638944625854,
      "learning_rate": 5.0190168588339536e-05,
      "loss": 1.2712,
      "step": 236
    },
    {
      "epoch": 0.5352907961603613,
      "grad_norm": 0.4216601848602295,
      "learning_rate": 4.9809831411660476e-05,
      "loss": 1.322,
      "step": 237
    },
    {
      "epoch": 0.5375494071146245,
      "grad_norm": 0.608249306678772,
      "learning_rate": 4.942950523862033e-05,
      "loss": 1.3996,
      "step": 238
    },
    {
      "epoch": 0.5398080180688877,
      "grad_norm": 0.9624855518341064,
      "learning_rate": 4.904921207586024e-05,
      "loss": 1.3592,
      "step": 239
    },
    {
      "epoch": 0.5420666290231507,
      "grad_norm": 0.5141565203666687,
      "learning_rate": 4.866897392811126e-05,
      "loss": 1.1777,
      "step": 240
    },
    {
      "epoch": 0.5443252399774139,
      "grad_norm": 0.6783860325813293,
      "learning_rate": 4.828881279692119e-05,
      "loss": 1.6956,
      "step": 241
    },
    {
      "epoch": 0.546583850931677,
      "grad_norm": 0.48693007230758667,
      "learning_rate": 4.7908750679381384e-05,
      "loss": 1.6161,
      "step": 242
    },
    {
      "epoch": 0.5488424618859401,
      "grad_norm": 0.5655871033668518,
      "learning_rate": 4.752880956685407e-05,
      "loss": 1.0778,
      "step": 243
    },
    {
      "epoch": 0.5511010728402033,
      "grad_norm": 0.5297839045524597,
      "learning_rate": 4.7149011443699814e-05,
      "loss": 1.485,
      "step": 244
    },
    {
      "epoch": 0.5533596837944664,
      "grad_norm": 0.7380653023719788,
      "learning_rate": 4.676937828600542e-05,
      "loss": 1.4226,
      "step": 245
    },
    {
      "epoch": 0.5556182947487295,
      "grad_norm": 0.693130373954773,
      "learning_rate": 4.638993206031237e-05,
      "loss": 1.4093,
      "step": 246
    },
    {
      "epoch": 0.5578769057029926,
      "grad_norm": 0.6071158647537231,
      "learning_rate": 4.601069472234584e-05,
      "loss": 1.0761,
      "step": 247
    },
    {
      "epoch": 0.5601355166572558,
      "grad_norm": 0.9863161444664001,
      "learning_rate": 4.56316882157442e-05,
      "loss": 1.3143,
      "step": 248
    },
    {
      "epoch": 0.562394127611519,
      "grad_norm": 0.5352652668952942,
      "learning_rate": 4.525293447078927e-05,
      "loss": 1.7061,
      "step": 249
    },
    {
      "epoch": 0.564652738565782,
      "grad_norm": 0.558076798915863,
      "learning_rate": 4.4874455403137514e-05,
      "loss": 1.7395,
      "step": 250
    },
    {
      "epoch": 0.5669113495200452,
      "grad_norm": 1.1520057916641235,
      "learning_rate": 4.449627291255184e-05,
      "loss": 1.6384,
      "step": 251
    },
    {
      "epoch": 0.5691699604743083,
      "grad_norm": 0.531182587146759,
      "learning_rate": 4.411840888163449e-05,
      "loss": 1.3033,
      "step": 252
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.4696061313152313,
      "learning_rate": 4.3740885174560736e-05,
      "loss": 1.4949,
      "step": 253
    },
    {
      "epoch": 0.5736871823828346,
      "grad_norm": 0.6336054801940918,
      "learning_rate": 4.336372363581391e-05,
      "loss": 1.4374,
      "step": 254
    },
    {
      "epoch": 0.5759457933370977,
      "grad_norm": 0.6364861726760864,
      "learning_rate": 4.298694608892134e-05,
      "loss": 1.7703,
      "step": 255
    },
    {
      "epoch": 0.5782044042913608,
      "grad_norm": 1.0777122974395752,
      "learning_rate": 4.2610574335191615e-05,
      "loss": 1.4255,
      "step": 256
    },
    {
      "epoch": 0.5804630152456239,
      "grad_norm": 0.8023219704627991,
      "learning_rate": 4.2234630152453116e-05,
      "loss": 1.384,
      "step": 257
    },
    {
      "epoch": 0.5827216261998871,
      "grad_norm": 0.5222808718681335,
      "learning_rate": 4.185913529379381e-05,
      "loss": 1.1682,
      "step": 258
    },
    {
      "epoch": 0.5849802371541502,
      "grad_norm": 0.7348831295967102,
      "learning_rate": 4.1484111486302704e-05,
      "loss": 1.2724,
      "step": 259
    },
    {
      "epoch": 0.5872388481084133,
      "grad_norm": 0.5531060695648193,
      "learning_rate": 4.110958042981255e-05,
      "loss": 1.4645,
      "step": 260
    },
    {
      "epoch": 0.5894974590626765,
      "grad_norm": 0.6494237780570984,
      "learning_rate": 4.0735563795644294e-05,
      "loss": 1.5516,
      "step": 261
    },
    {
      "epoch": 0.5917560700169395,
      "grad_norm": 0.5829164981842041,
      "learning_rate": 4.0362083225353046e-05,
      "loss": 1.4303,
      "step": 262
    },
    {
      "epoch": 0.5940146809712027,
      "grad_norm": 0.4754260182380676,
      "learning_rate": 3.998916032947594e-05,
      "loss": 1.3245,
      "step": 263
    },
    {
      "epoch": 0.5962732919254659,
      "grad_norm": 0.7233774065971375,
      "learning_rate": 3.961681668628164e-05,
      "loss": 1.272,
      "step": 264
    },
    {
      "epoch": 0.598531902879729,
      "grad_norm": 0.55941241979599,
      "learning_rate": 3.9245073840521765e-05,
      "loss": 1.4211,
      "step": 265
    },
    {
      "epoch": 0.6007905138339921,
      "grad_norm": 0.5475345253944397,
      "learning_rate": 3.887395330218429e-05,
      "loss": 1.3842,
      "step": 266
    },
    {
      "epoch": 0.6030491247882552,
      "grad_norm": 0.7882475256919861,
      "learning_rate": 3.850347654524883e-05,
      "loss": 1.4087,
      "step": 267
    },
    {
      "epoch": 0.6053077357425184,
      "grad_norm": 0.6707928776741028,
      "learning_rate": 3.8133665006444255e-05,
      "loss": 1.2446,
      "step": 268
    },
    {
      "epoch": 0.6075663466967814,
      "grad_norm": 0.6627002358436584,
      "learning_rate": 3.776454008400816e-05,
      "loss": 1.7243,
      "step": 269
    },
    {
      "epoch": 0.6098249576510446,
      "grad_norm": 0.476368248462677,
      "learning_rate": 3.7396123136448824e-05,
      "loss": 1.5209,
      "step": 270
    },
    {
      "epoch": 0.6120835686053078,
      "grad_norm": 0.6583078503608704,
      "learning_rate": 3.70284354813092e-05,
      "loss": 1.3496,
      "step": 271
    },
    {
      "epoch": 0.6143421795595708,
      "grad_norm": 0.6484099626541138,
      "learning_rate": 3.666149839393361e-05,
      "loss": 1.6792,
      "step": 272
    },
    {
      "epoch": 0.616600790513834,
      "grad_norm": 0.42976608872413635,
      "learning_rate": 3.629533310623658e-05,
      "loss": 1.0712,
      "step": 273
    },
    {
      "epoch": 0.6188594014680971,
      "grad_norm": 0.7299443483352661,
      "learning_rate": 3.592996080547438e-05,
      "loss": 1.7172,
      "step": 274
    },
    {
      "epoch": 0.6211180124223602,
      "grad_norm": 0.6105283498764038,
      "learning_rate": 3.556540263301896e-05,
      "loss": 1.5218,
      "step": 275
    },
    {
      "epoch": 0.6233766233766234,
      "grad_norm": 0.6349045038223267,
      "learning_rate": 3.520167968313479e-05,
      "loss": 1.4029,
      "step": 276
    },
    {
      "epoch": 0.6256352343308865,
      "grad_norm": 0.441521555185318,
      "learning_rate": 3.483881300175823e-05,
      "loss": 1.5902,
      "step": 277
    },
    {
      "epoch": 0.6278938452851497,
      "grad_norm": 0.6537282466888428,
      "learning_rate": 3.447682358527974e-05,
      "loss": 1.2511,
      "step": 278
    },
    {
      "epoch": 0.6301524562394127,
      "grad_norm": 0.64874267578125,
      "learning_rate": 3.411573237932904e-05,
      "loss": 1.4145,
      "step": 279
    },
    {
      "epoch": 0.6324110671936759,
      "grad_norm": 0.6560806035995483,
      "learning_rate": 3.3755560277563023e-05,
      "loss": 1.3575,
      "step": 280
    },
    {
      "epoch": 0.6346696781479391,
      "grad_norm": 0.4486541152000427,
      "learning_rate": 3.339632812045696e-05,
      "loss": 1.4707,
      "step": 281
    },
    {
      "epoch": 0.6369282891022021,
      "grad_norm": 0.5472518801689148,
      "learning_rate": 3.303805669409848e-05,
      "loss": 1.2591,
      "step": 282
    },
    {
      "epoch": 0.6391869000564653,
      "grad_norm": 0.8374768495559692,
      "learning_rate": 3.268076672898492e-05,
      "loss": 1.2883,
      "step": 283
    },
    {
      "epoch": 0.6414455110107284,
      "grad_norm": 0.4761299788951874,
      "learning_rate": 3.2324478898823705e-05,
      "loss": 1.46,
      "step": 284
    },
    {
      "epoch": 0.6437041219649915,
      "grad_norm": 0.58656907081604,
      "learning_rate": 3.196921381933624e-05,
      "loss": 1.366,
      "step": 285
    },
    {
      "epoch": 0.6459627329192547,
      "grad_norm": 0.6311814785003662,
      "learning_rate": 3.1614992047064945e-05,
      "loss": 1.2755,
      "step": 286
    },
    {
      "epoch": 0.6482213438735178,
      "grad_norm": 0.5806014537811279,
      "learning_rate": 3.126183407818384e-05,
      "loss": 1.3936,
      "step": 287
    },
    {
      "epoch": 0.6504799548277809,
      "grad_norm": 0.7558007836341858,
      "learning_rate": 3.090976034731257e-05,
      "loss": 1.3133,
      "step": 288
    },
    {
      "epoch": 0.652738565782044,
      "grad_norm": 0.6248241662979126,
      "learning_rate": 3.055879122633397e-05,
      "loss": 1.3684,
      "step": 289
    },
    {
      "epoch": 0.6549971767363072,
      "grad_norm": 0.6883110404014587,
      "learning_rate": 3.020894702321539e-05,
      "loss": 1.5355,
      "step": 290
    },
    {
      "epoch": 0.6572557876905702,
      "grad_norm": 0.47955361008644104,
      "learning_rate": 2.9860247980833532e-05,
      "loss": 1.2752,
      "step": 291
    },
    {
      "epoch": 0.6595143986448334,
      "grad_norm": 0.6376725435256958,
      "learning_rate": 2.951271427580321e-05,
      "loss": 1.3576,
      "step": 292
    },
    {
      "epoch": 0.6617730095990966,
      "grad_norm": 0.6355816125869751,
      "learning_rate": 2.91663660173098e-05,
      "loss": 1.3411,
      "step": 293
    },
    {
      "epoch": 0.6640316205533597,
      "grad_norm": 0.6872756481170654,
      "learning_rate": 2.882122324594575e-05,
      "loss": 1.5319,
      "step": 294
    },
    {
      "epoch": 0.6662902315076228,
      "grad_norm": 1.0146925449371338,
      "learning_rate": 2.847730593255097e-05,
      "loss": 1.4574,
      "step": 295
    },
    {
      "epoch": 0.668548842461886,
      "grad_norm": 0.46255797147750854,
      "learning_rate": 2.8134633977057235e-05,
      "loss": 1.6074,
      "step": 296
    },
    {
      "epoch": 0.6708074534161491,
      "grad_norm": 0.5414189100265503,
      "learning_rate": 2.779322720733673e-05,
      "loss": 1.6655,
      "step": 297
    },
    {
      "epoch": 0.6730660643704122,
      "grad_norm": 0.4797675609588623,
      "learning_rate": 2.745310537805479e-05,
      "loss": 1.3457,
      "step": 298
    },
    {
      "epoch": 0.6753246753246753,
      "grad_norm": 0.5770863890647888,
      "learning_rate": 2.7114288169526793e-05,
      "loss": 1.5853,
      "step": 299
    },
    {
      "epoch": 0.6775832862789385,
      "grad_norm": 0.4539095163345337,
      "learning_rate": 2.6776795186579468e-05,
      "loss": 1.2473,
      "step": 300
    },
    {
      "epoch": 0.6798418972332015,
      "grad_norm": 0.5590886473655701,
      "learning_rate": 2.6440645957416484e-05,
      "loss": 1.7489,
      "step": 301
    },
    {
      "epoch": 0.6821005081874647,
      "grad_norm": 0.5184858441352844,
      "learning_rate": 2.610585993248843e-05,
      "loss": 1.4334,
      "step": 302
    },
    {
      "epoch": 0.6843591191417279,
      "grad_norm": 0.5701838135719299,
      "learning_rate": 2.5772456483367497e-05,
      "loss": 1.2721,
      "step": 303
    },
    {
      "epoch": 0.6866177300959909,
      "grad_norm": 0.5292041301727295,
      "learning_rate": 2.5440454901626486e-05,
      "loss": 1.4493,
      "step": 304
    },
    {
      "epoch": 0.6888763410502541,
      "grad_norm": 0.6524962782859802,
      "learning_rate": 2.510987439772261e-05,
      "loss": 1.1314,
      "step": 305
    },
    {
      "epoch": 0.6911349520045172,
      "grad_norm": 0.5966671109199524,
      "learning_rate": 2.4780734099885833e-05,
      "loss": 1.9383,
      "step": 306
    },
    {
      "epoch": 0.6933935629587803,
      "grad_norm": 0.506033718585968,
      "learning_rate": 2.4453053053012187e-05,
      "loss": 1.4983,
      "step": 307
    },
    {
      "epoch": 0.6956521739130435,
      "grad_norm": 0.687870979309082,
      "learning_rate": 2.4126850217561698e-05,
      "loss": 1.5229,
      "step": 308
    },
    {
      "epoch": 0.6979107848673066,
      "grad_norm": 0.4640161991119385,
      "learning_rate": 2.3802144468461367e-05,
      "loss": 1.5815,
      "step": 309
    },
    {
      "epoch": 0.7001693958215698,
      "grad_norm": 0.9109538793563843,
      "learning_rate": 2.347895459401288e-05,
      "loss": 1.419,
      "step": 310
    },
    {
      "epoch": 0.7024280067758328,
      "grad_norm": 0.5041788220405579,
      "learning_rate": 2.3157299294805613e-05,
      "loss": 1.3753,
      "step": 311
    },
    {
      "epoch": 0.704686617730096,
      "grad_norm": 0.6865159869194031,
      "learning_rate": 2.2837197182634483e-05,
      "loss": 1.6305,
      "step": 312
    },
    {
      "epoch": 0.7069452286843592,
      "grad_norm": 0.7781485319137573,
      "learning_rate": 2.2518666779423074e-05,
      "loss": 1.4607,
      "step": 313
    },
    {
      "epoch": 0.7092038396386222,
      "grad_norm": 0.5647329092025757,
      "learning_rate": 2.2201726516151882e-05,
      "loss": 1.4964,
      "step": 314
    },
    {
      "epoch": 0.7114624505928854,
      "grad_norm": 0.577785074710846,
      "learning_rate": 2.1886394731791816e-05,
      "loss": 1.5494,
      "step": 315
    },
    {
      "epoch": 0.7137210615471485,
      "grad_norm": 0.5110263228416443,
      "learning_rate": 2.157268967224314e-05,
      "loss": 1.4089,
      "step": 316
    },
    {
      "epoch": 0.7159796725014116,
      "grad_norm": 0.5485273003578186,
      "learning_rate": 2.126062948927966e-05,
      "loss": 1.5507,
      "step": 317
    },
    {
      "epoch": 0.7182382834556748,
      "grad_norm": 0.6336625814437866,
      "learning_rate": 2.0950232239498446e-05,
      "loss": 1.1833,
      "step": 318
    },
    {
      "epoch": 0.7204968944099379,
      "grad_norm": 0.5420766472816467,
      "learning_rate": 2.064151588327501e-05,
      "loss": 1.5497,
      "step": 319
    },
    {
      "epoch": 0.722755505364201,
      "grad_norm": 0.43920066952705383,
      "learning_rate": 2.0334498283724078e-05,
      "loss": 1.7342,
      "step": 320
    },
    {
      "epoch": 0.7250141163184641,
      "grad_norm": 0.4995698928833008,
      "learning_rate": 2.002919720566599e-05,
      "loss": 1.2779,
      "step": 321
    },
    {
      "epoch": 0.7272727272727273,
      "grad_norm": 1.1329647302627563,
      "learning_rate": 1.9725630314598782e-05,
      "loss": 1.3353,
      "step": 322
    },
    {
      "epoch": 0.7295313382269905,
      "grad_norm": 0.6960721015930176,
      "learning_rate": 1.9423815175676025e-05,
      "loss": 1.4193,
      "step": 323
    },
    {
      "epoch": 0.7317899491812535,
      "grad_norm": 0.5718068480491638,
      "learning_rate": 1.912376925269041e-05,
      "loss": 1.2262,
      "step": 324
    },
    {
      "epoch": 0.7340485601355167,
      "grad_norm": 0.633811891078949,
      "learning_rate": 1.8825509907063327e-05,
      "loss": 1.5999,
      "step": 325
    },
    {
      "epoch": 0.7363071710897798,
      "grad_norm": 0.5763436555862427,
      "learning_rate": 1.8529054396840234e-05,
      "loss": 1.5864,
      "step": 326
    },
    {
      "epoch": 0.7385657820440429,
      "grad_norm": 0.49580734968185425,
      "learning_rate": 1.8234419875692105e-05,
      "loss": 1.9146,
      "step": 327
    },
    {
      "epoch": 0.740824392998306,
      "grad_norm": 0.597285270690918,
      "learning_rate": 1.7941623391922772e-05,
      "loss": 1.2687,
      "step": 328
    },
    {
      "epoch": 0.7430830039525692,
      "grad_norm": 0.6545310020446777,
      "learning_rate": 1.7650681887482628e-05,
      "loss": 1.5924,
      "step": 329
    },
    {
      "epoch": 0.7453416149068323,
      "grad_norm": 0.760567307472229,
      "learning_rate": 1.7361612196988174e-05,
      "loss": 1.4892,
      "step": 330
    },
    {
      "epoch": 0.7476002258610954,
      "grad_norm": 0.5573106408119202,
      "learning_rate": 1.7074431046748075e-05,
      "loss": 1.3689,
      "step": 331
    },
    {
      "epoch": 0.7498588368153586,
      "grad_norm": 0.5589758157730103,
      "learning_rate": 1.678915505379513e-05,
      "loss": 1.2265,
      "step": 332
    },
    {
      "epoch": 0.7521174477696216,
      "grad_norm": 0.5898151397705078,
      "learning_rate": 1.650580072492496e-05,
      "loss": 1.3948,
      "step": 333
    },
    {
      "epoch": 0.7543760587238848,
      "grad_norm": 0.6013538241386414,
      "learning_rate": 1.6224384455740788e-05,
      "loss": 1.5207,
      "step": 334
    },
    {
      "epoch": 0.756634669678148,
      "grad_norm": 0.5616132020950317,
      "learning_rate": 1.5944922529704777e-05,
      "loss": 1.3745,
      "step": 335
    },
    {
      "epoch": 0.758893280632411,
      "grad_norm": 0.6084350943565369,
      "learning_rate": 1.5667431117195814e-05,
      "loss": 1.4857,
      "step": 336
    },
    {
      "epoch": 0.7611518915866742,
      "grad_norm": 0.6251053810119629,
      "learning_rate": 1.539192627457382e-05,
      "loss": 1.2732,
      "step": 337
    },
    {
      "epoch": 0.7634105025409373,
      "grad_norm": 0.8703694343566895,
      "learning_rate": 1.5118423943250771e-05,
      "loss": 1.5002,
      "step": 338
    },
    {
      "epoch": 0.7656691134952005,
      "grad_norm": 0.5518003702163696,
      "learning_rate": 1.4846939948768218e-05,
      "loss": 1.8132,
      "step": 339
    },
    {
      "epoch": 0.7679277244494636,
      "grad_norm": 0.5927785038948059,
      "learning_rate": 1.45774899998816e-05,
      "loss": 1.1446,
      "step": 340
    },
    {
      "epoch": 0.7701863354037267,
      "grad_norm": 0.5158970952033997,
      "learning_rate": 1.4310089687651301e-05,
      "loss": 1.2931,
      "step": 341
    },
    {
      "epoch": 0.7724449463579899,
      "grad_norm": 0.7069851160049438,
      "learning_rate": 1.40447544845405e-05,
      "loss": 1.4927,
      "step": 342
    },
    {
      "epoch": 0.7747035573122529,
      "grad_norm": 0.5784515142440796,
      "learning_rate": 1.378149974351991e-05,
      "loss": 1.3116,
      "step": 343
    },
    {
      "epoch": 0.7769621682665161,
      "grad_norm": 0.49697092175483704,
      "learning_rate": 1.3520340697179406e-05,
      "loss": 1.5359,
      "step": 344
    },
    {
      "epoch": 0.7792207792207793,
      "grad_norm": 0.6225888729095459,
      "learning_rate": 1.3261292456846647e-05,
      "loss": 1.4725,
      "step": 345
    },
    {
      "epoch": 0.7814793901750423,
      "grad_norm": 0.9478232860565186,
      "learning_rate": 1.3004370011712624e-05,
      "loss": 1.3835,
      "step": 346
    },
    {
      "epoch": 0.7837380011293055,
      "grad_norm": 0.5357046127319336,
      "learning_rate": 1.2749588227964465e-05,
      "loss": 1.2114,
      "step": 347
    },
    {
      "epoch": 0.7859966120835686,
      "grad_norm": 0.6515849828720093,
      "learning_rate": 1.2496961847925153e-05,
      "loss": 1.4449,
      "step": 348
    },
    {
      "epoch": 0.7882552230378317,
      "grad_norm": 0.7444521188735962,
      "learning_rate": 1.2246505489200532e-05,
      "loss": 1.2324,
      "step": 349
    },
    {
      "epoch": 0.7905138339920948,
      "grad_norm": 0.6521069407463074,
      "learning_rate": 1.1998233643833457e-05,
      "loss": 1.3745,
      "step": 350
    },
    {
      "epoch": 0.792772444946358,
      "grad_norm": 0.5900723934173584,
      "learning_rate": 1.1752160677465286e-05,
      "loss": 1.3835,
      "step": 351
    },
    {
      "epoch": 0.7950310559006211,
      "grad_norm": 0.5521007180213928,
      "learning_rate": 1.150830082850468e-05,
      "loss": 1.4689,
      "step": 352
    },
    {
      "epoch": 0.7972896668548842,
      "grad_norm": 0.43919599056243896,
      "learning_rate": 1.126666820730366e-05,
      "loss": 1.379,
      "step": 353
    },
    {
      "epoch": 0.7995482778091474,
      "grad_norm": 0.4976402223110199,
      "learning_rate": 1.1027276795341135e-05,
      "loss": 1.4056,
      "step": 354
    },
    {
      "epoch": 0.8018068887634106,
      "grad_norm": 0.7465052008628845,
      "learning_rate": 1.0790140444414e-05,
      "loss": 1.1132,
      "step": 355
    },
    {
      "epoch": 0.8040654997176736,
      "grad_norm": 1.412811279296875,
      "learning_rate": 1.0555272875835537e-05,
      "loss": 1.3229,
      "step": 356
    },
    {
      "epoch": 0.8063241106719368,
      "grad_norm": 0.38720905780792236,
      "learning_rate": 1.0322687679641523e-05,
      "loss": 1.3627,
      "step": 357
    },
    {
      "epoch": 0.8085827216261999,
      "grad_norm": 0.6548861265182495,
      "learning_rate": 1.0092398313803863e-05,
      "loss": 1.6172,
      "step": 358
    },
    {
      "epoch": 0.810841332580463,
      "grad_norm": 0.5794533491134644,
      "learning_rate": 9.864418103451828e-06,
      "loss": 1.5726,
      "step": 359
    },
    {
      "epoch": 0.8130999435347261,
      "grad_norm": 0.5520102977752686,
      "learning_rate": 9.638760240101102e-06,
      "loss": 1.294,
      "step": 360
    },
    {
      "epoch": 0.8153585544889893,
      "grad_norm": 0.5520592927932739,
      "learning_rate": 9.415437780890451e-06,
      "loss": 1.3229,
      "step": 361
    },
    {
      "epoch": 0.8176171654432524,
      "grad_norm": 0.644109845161438,
      "learning_rate": 9.194463647826223e-06,
      "loss": 1.6856,
      "step": 362
    },
    {
      "epoch": 0.8198757763975155,
      "grad_norm": 0.6648945808410645,
      "learning_rate": 8.975850627034604e-06,
      "loss": 1.4002,
      "step": 363
    },
    {
      "epoch": 0.8221343873517787,
      "grad_norm": 0.6234399676322937,
      "learning_rate": 8.759611368021831e-06,
      "loss": 1.33,
      "step": 364
    },
    {
      "epoch": 0.8243929983060417,
      "grad_norm": 0.6238119602203369,
      "learning_rate": 8.545758382942232e-06,
      "loss": 1.2347,
      "step": 365
    },
    {
      "epoch": 0.8266516092603049,
      "grad_norm": 0.5199403166770935,
      "learning_rate": 8.334304045874247e-06,
      "loss": 1.5012,
      "step": 366
    },
    {
      "epoch": 0.8289102202145681,
      "grad_norm": 0.43542924523353577,
      "learning_rate": 8.125260592104445e-06,
      "loss": 1.4619,
      "step": 367
    },
    {
      "epoch": 0.8311688311688312,
      "grad_norm": 0.6356791853904724,
      "learning_rate": 7.918640117419507e-06,
      "loss": 1.2857,
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.8334274421230943, |
|
"grad_norm": 0.8213852643966675, |
|
"learning_rate": 7.71445457740641e-06, |
|
"loss": 1.3405, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.8356860530773574, |
|
"grad_norm": 0.6396834254264832, |
|
"learning_rate": 7.512715786760605e-06, |
|
"loss": 1.675, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.8379446640316206, |
|
"grad_norm": 0.5670115947723389, |
|
"learning_rate": 7.313435418602388e-06, |
|
"loss": 1.4417, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.8402032749858837, |
|
"grad_norm": 0.8237860798835754, |
|
"learning_rate": 7.116625003801436e-06, |
|
"loss": 1.3525, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.8424618859401468, |
|
"grad_norm": 0.6882337927818298, |
|
"learning_rate": 6.922295930309691e-06, |
|
"loss": 1.5534, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.84472049689441, |
|
"grad_norm": 0.750078022480011, |
|
"learning_rate": 6.730459442502329e-06, |
|
"loss": 1.5733, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.846979107848673, |
|
"grad_norm": 0.5471389889717102, |
|
"learning_rate": 6.541126640527195e-06, |
|
"loss": 1.6169, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.8492377188029362, |
|
"grad_norm": 0.4878935217857361, |
|
"learning_rate": 6.354308479662446e-06, |
|
"loss": 1.5414, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.8514963297571994, |
|
"grad_norm": 0.5643681883811951, |
|
"learning_rate": 6.170015769682741e-06, |
|
"loss": 1.6472, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.8537549407114624, |
|
"grad_norm": 0.5661868453025818, |
|
"learning_rate": 5.988259174233713e-06, |
|
"loss": 1.3295, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.8560135516657256, |
|
"grad_norm": 0.8035285472869873, |
|
"learning_rate": 5.80904921021494e-06, |
|
"loss": 1.4848, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.8582721626199887, |
|
"grad_norm": 0.48411625623703003, |
|
"learning_rate": 5.6323962471714286e-06, |
|
"loss": 1.3458, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.8605307735742518, |
|
"grad_norm": 0.5900686979293823, |
|
"learning_rate": 5.458310506693571e-06, |
|
"loss": 1.5151, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.862789384528515, |
|
"grad_norm": 0.5503493547439575, |
|
"learning_rate": 5.286802061825752e-06, |
|
"loss": 1.4399, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.8650479954827781, |
|
"grad_norm": 0.4653548002243042, |
|
"learning_rate": 5.117880836483452e-06, |
|
"loss": 1.5298, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.8673066064370413, |
|
"grad_norm": 0.5434414148330688, |
|
"learning_rate": 4.951556604879048e-06, |
|
"loss": 1.3049, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.8695652173913043, |
|
"grad_norm": 0.8023334741592407, |
|
"learning_rate": 4.7878389909562285e-06, |
|
"loss": 1.4366, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.8718238283455675, |
|
"grad_norm": 0.5557988286018372, |
|
"learning_rate": 4.62673746783317e-06, |
|
"loss": 1.2165, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.8740824392998306, |
|
"grad_norm": 0.4488828778266907, |
|
"learning_rate": 4.468261357254339e-06, |
|
"loss": 1.1717, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.8763410502540937, |
|
"grad_norm": 0.5039349794387817, |
|
"learning_rate": 4.312419829051173e-06, |
|
"loss": 1.5923, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.8785996612083569, |
|
"grad_norm": 0.5717617273330688, |
|
"learning_rate": 4.15922190061146e-06, |
|
"loss": 1.3688, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.88085827216262, |
|
"grad_norm": 0.6537705063819885, |
|
"learning_rate": 4.008676436357539e-06, |
|
"loss": 1.4746, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.8831168831168831, |
|
"grad_norm": 0.7428813576698303, |
|
"learning_rate": 3.86079214723345e-06, |
|
"loss": 1.6453, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.8853754940711462, |
|
"grad_norm": 0.6995297074317932, |
|
"learning_rate": 3.7155775902008526e-06, |
|
"loss": 1.2413, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.8876341050254094, |
|
"grad_norm": 0.5746084451675415, |
|
"learning_rate": 3.5730411677439125e-06, |
|
"loss": 1.5308, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.8898927159796725, |
|
"grad_norm": 0.7818319797515869, |
|
"learning_rate": 3.4331911273830784e-06, |
|
"loss": 1.2739, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.8921513269339356, |
|
"grad_norm": 0.4729510545730591, |
|
"learning_rate": 3.2960355611979245e-06, |
|
"loss": 1.2642, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.8944099378881988, |
|
"grad_norm": 0.5390079021453857, |
|
"learning_rate": 3.161582405358876e-06, |
|
"loss": 1.4629, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.8966685488424618, |
|
"grad_norm": 0.5690498352050781, |
|
"learning_rate": 3.029839439668003e-06, |
|
"loss": 1.576, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.898927159796725, |
|
"grad_norm": 0.6022957563400269, |
|
"learning_rate": 2.9008142871088663e-06, |
|
"loss": 1.4105, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.9011857707509882, |
|
"grad_norm": 0.4936705231666565, |
|
"learning_rate": 2.7745144134054433e-06, |
|
"loss": 1.3807, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.9034443817052513, |
|
"grad_norm": 0.5045138001441956, |
|
"learning_rate": 2.6509471265901477e-06, |
|
"loss": 1.427, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9057029926595144, |
|
"grad_norm": 0.5444932579994202, |
|
"learning_rate": 2.530119576580936e-06, |
|
"loss": 1.3433, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.9079616036137775, |
|
"grad_norm": 0.48583537340164185, |
|
"learning_rate": 2.412038754767626e-06, |
|
"loss": 1.2593, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.9102202145680407, |
|
"grad_norm": 0.7965908050537109, |
|
"learning_rate": 2.296711493607334e-06, |
|
"loss": 1.304, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.9124788255223037, |
|
"grad_norm": 0.6211166381835938, |
|
"learning_rate": 2.1841444662291543e-06, |
|
"loss": 1.4832, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.9147374364765669, |
|
"grad_norm": 0.5326806902885437, |
|
"learning_rate": 2.074344186048022e-06, |
|
"loss": 1.239, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.9169960474308301, |
|
"grad_norm": 0.46467143297195435, |
|
"learning_rate": 1.967317006387831e-06, |
|
"loss": 1.5442, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.9192546583850931, |
|
"grad_norm": 0.6954487562179565, |
|
"learning_rate": 1.863069120113814e-06, |
|
"loss": 1.0449, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.9215132693393563, |
|
"grad_norm": 0.5407376885414124, |
|
"learning_rate": 1.7616065592742038e-06, |
|
"loss": 1.368, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.9237718802936195, |
|
"grad_norm": 0.5914408564567566, |
|
"learning_rate": 1.6629351947512195e-06, |
|
"loss": 1.4579, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.9260304912478825, |
|
"grad_norm": 0.5790956020355225, |
|
"learning_rate": 1.567060735921344e-06, |
|
"loss": 1.5103, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.9282891022021457, |
|
"grad_norm": 0.7009652853012085, |
|
"learning_rate": 1.4739887303249877e-06, |
|
"loss": 1.4829, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.9305477131564088, |
|
"grad_norm": 0.6565191149711609, |
|
"learning_rate": 1.383724563345451e-06, |
|
"loss": 1.6771, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.932806324110672, |
|
"grad_norm": 0.6377475261688232, |
|
"learning_rate": 1.2962734578973568e-06, |
|
"loss": 1.4032, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.935064935064935, |
|
"grad_norm": 0.6361109018325806, |
|
"learning_rate": 1.2116404741244203e-06, |
|
"loss": 1.4081, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.9373235460191982, |
|
"grad_norm": 0.517658531665802, |
|
"learning_rate": 1.1298305091066664e-06, |
|
"loss": 1.3421, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.9395821569734614, |
|
"grad_norm": 0.7061721086502075, |
|
"learning_rate": 1.0508482965770505e-06, |
|
"loss": 1.5959, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.9418407679277244, |
|
"grad_norm": 0.5791245698928833, |
|
"learning_rate": 9.746984066475729e-07, |
|
"loss": 1.3128, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.9440993788819876, |
|
"grad_norm": 0.7557624578475952, |
|
"learning_rate": 9.013852455448335e-07, |
|
"loss": 1.2763, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.9463579898362507, |
|
"grad_norm": 0.9951817989349365, |
|
"learning_rate": 8.309130553550815e-07, |
|
"loss": 1.4941, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.9486166007905138, |
|
"grad_norm": 0.3905503749847412, |
|
"learning_rate": 7.63285913778733e-07, |
|
"loss": 1.3464, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.950875211744777, |
|
"grad_norm": 0.5072855949401855, |
|
"learning_rate": 6.985077338944657e-07, |
|
"loss": 1.0404, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.9531338226990401, |
|
"grad_norm": 0.5355942845344543, |
|
"learning_rate": 6.365822639327723e-07, |
|
"loss": 1.2642, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.9553924336533032, |
|
"grad_norm": 0.5786038041114807, |
|
"learning_rate": 5.775130870590783e-07, |
|
"loss": 1.3897, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.9576510446075663, |
|
"grad_norm": 0.564463198184967, |
|
"learning_rate": 5.213036211664191e-07, |
|
"loss": 1.4783, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.9599096555618295, |
|
"grad_norm": 0.5604745745658875, |
|
"learning_rate": 4.6795711867766436e-07, |
|
"loss": 1.6895, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.9621682665160926, |
|
"grad_norm": 0.7266672849655151, |
|
"learning_rate": 4.1747666635733597e-07, |
|
"loss": 1.3143, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.9644268774703557, |
|
"grad_norm": 0.7507563829421997, |
|
"learning_rate": 3.698651851329837e-07, |
|
"loss": 1.4809, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.9666854884246189, |
|
"grad_norm": 0.5136380195617676, |
|
"learning_rate": 3.251254299261874e-07, |
|
"loss": 1.6922, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.968944099378882, |
|
"grad_norm": 0.7942318916320801, |
|
"learning_rate": 2.8325998949314536e-07, |
|
"loss": 1.5874, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.9712027103331451, |
|
"grad_norm": 0.46300673484802246, |
|
"learning_rate": 2.442712862748775e-07, |
|
"loss": 1.7898, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.9734613212874083, |
|
"grad_norm": 0.6884695291519165, |
|
"learning_rate": 2.0816157625706545e-07, |
|
"loss": 1.33, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.9757199322416714, |
|
"grad_norm": 0.5976503491401672, |
|
"learning_rate": 1.749329488395124e-07, |
|
"loss": 1.6437, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.9779785431959345, |
|
"grad_norm": 0.42137616872787476, |
|
"learning_rate": 1.4458732671523977e-07, |
|
"loss": 1.2883, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.9802371541501976, |
|
"grad_norm": 0.6945174932479858, |
|
"learning_rate": 1.1712646575922637e-07, |
|
"loss": 1.3545, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.9824957651044608, |
|
"grad_norm": 0.5032137036323547, |
|
"learning_rate": 9.255195492685609e-08, |
|
"loss": 1.4777, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.9847543760587238, |
|
"grad_norm": 0.6473844647407532, |
|
"learning_rate": 7.086521616190279e-08, |
|
"loss": 1.4482, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.987012987012987, |
|
"grad_norm": 1.0584694147109985, |
|
"learning_rate": 5.2067504314323723e-08, |
|
"loss": 1.2144, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.9892715979672502, |
|
"grad_norm": 0.4894103407859802, |
|
"learning_rate": 3.6159907067601085e-08, |
|
"loss": 1.3691, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.9915302089215132, |
|
"grad_norm": 0.6497173309326172, |
|
"learning_rate": 2.3143344875831142e-08, |
|
"loss": 1.3208, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.9937888198757764, |
|
"grad_norm": 0.5225093364715576, |
|
"learning_rate": 1.3018570910466877e-08, |
|
"loss": 1.1954, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.9960474308300395, |
|
"grad_norm": 0.574373185634613, |
|
"learning_rate": 5.786171016708419e-09, |
|
"loss": 1.598, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.9983060417843026, |
|
"grad_norm": 0.6945062875747681, |
|
"learning_rate": 1.446563679641244e-09, |
|
"loss": 1.4279, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.0005646527385659, |
|
"grad_norm": 0.6715332865715027, |
|
"learning_rate": 0.0, |
|
"loss": 1.5682, |
|
"step": 443 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 443, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 10, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.630732306700042e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |