|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.09620780886715305, |
|
"eval_steps": 500, |
|
"global_step": 300, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0003206926962238435, |
|
"grad_norm": 2.7483954429626465, |
|
"learning_rate": 1e-05, |
|
"loss": 1.9483, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.000641385392447687, |
|
"grad_norm": 2.690746307373047, |
|
"learning_rate": 2e-05, |
|
"loss": 1.7416, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0009620780886715305, |
|
"grad_norm": 2.7132160663604736, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8029, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.001282770784895374, |
|
"grad_norm": 2.7139906883239746, |
|
"learning_rate": 4e-05, |
|
"loss": 1.9292, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0016034634811192174, |
|
"grad_norm": 2.8549463748931885, |
|
"learning_rate": 5e-05, |
|
"loss": 1.8055, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.001924156177343061, |
|
"grad_norm": 2.5766773223876953, |
|
"learning_rate": 6e-05, |
|
"loss": 2.042, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0022448488735669044, |
|
"grad_norm": 2.1945223808288574, |
|
"learning_rate": 7e-05, |
|
"loss": 1.8585, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.002565541569790748, |
|
"grad_norm": 2.143160581588745, |
|
"learning_rate": 8e-05, |
|
"loss": 1.7026, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0028862342660145915, |
|
"grad_norm": 2.1353299617767334, |
|
"learning_rate": 9e-05, |
|
"loss": 1.7981, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.003206926962238435, |
|
"grad_norm": 2.080528497695923, |
|
"learning_rate": 0.0001, |
|
"loss": 1.7712, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0035276196584622787, |
|
"grad_norm": 2.391787052154541, |
|
"learning_rate": 9.996782496782497e-05, |
|
"loss": 1.7666, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.003848312354686122, |
|
"grad_norm": 2.2097723484039307, |
|
"learning_rate": 9.993564993564995e-05, |
|
"loss": 1.8056, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.004169005050909965, |
|
"grad_norm": 1.883785367012024, |
|
"learning_rate": 9.99034749034749e-05, |
|
"loss": 1.5309, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.004489697747133809, |
|
"grad_norm": 1.9296449422836304, |
|
"learning_rate": 9.987129987129988e-05, |
|
"loss": 1.5142, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.004810390443357652, |
|
"grad_norm": 2.2633562088012695, |
|
"learning_rate": 9.983912483912484e-05, |
|
"loss": 1.6335, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.005131083139581496, |
|
"grad_norm": 2.1033310890197754, |
|
"learning_rate": 9.98069498069498e-05, |
|
"loss": 1.7974, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.00545177583580534, |
|
"grad_norm": 2.003077268600464, |
|
"learning_rate": 9.977477477477478e-05, |
|
"loss": 1.5864, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.005772468532029183, |
|
"grad_norm": 2.1461055278778076, |
|
"learning_rate": 9.974259974259975e-05, |
|
"loss": 1.5678, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0060931612282530264, |
|
"grad_norm": 1.9527181386947632, |
|
"learning_rate": 9.971042471042471e-05, |
|
"loss": 1.8429, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.00641385392447687, |
|
"grad_norm": 2.050208330154419, |
|
"learning_rate": 9.967824967824968e-05, |
|
"loss": 1.6551, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.006734546620700713, |
|
"grad_norm": 1.841902256011963, |
|
"learning_rate": 9.964607464607466e-05, |
|
"loss": 1.8851, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.007055239316924557, |
|
"grad_norm": 1.8649909496307373, |
|
"learning_rate": 9.961389961389962e-05, |
|
"loss": 1.7153, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.007375932013148401, |
|
"grad_norm": 1.9829213619232178, |
|
"learning_rate": 9.958172458172458e-05, |
|
"loss": 1.7295, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.007696624709372244, |
|
"grad_norm": 1.945168137550354, |
|
"learning_rate": 9.954954954954956e-05, |
|
"loss": 1.6252, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.008017317405596087, |
|
"grad_norm": 1.7710901498794556, |
|
"learning_rate": 9.951737451737451e-05, |
|
"loss": 1.8558, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.00833801010181993, |
|
"grad_norm": 1.8060033321380615, |
|
"learning_rate": 9.948519948519949e-05, |
|
"loss": 1.6409, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.008658702798043774, |
|
"grad_norm": 1.737451195716858, |
|
"learning_rate": 9.945302445302446e-05, |
|
"loss": 1.6147, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.008979395494267618, |
|
"grad_norm": 1.7312341928482056, |
|
"learning_rate": 9.942084942084942e-05, |
|
"loss": 1.6207, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.009300088190491461, |
|
"grad_norm": 1.850573182106018, |
|
"learning_rate": 9.93886743886744e-05, |
|
"loss": 1.6429, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.009620780886715304, |
|
"grad_norm": 1.892348051071167, |
|
"learning_rate": 9.935649935649936e-05, |
|
"loss": 1.7195, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.00994147358293915, |
|
"grad_norm": 1.9182771444320679, |
|
"learning_rate": 9.932432432432433e-05, |
|
"loss": 1.6534, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.010262166279162993, |
|
"grad_norm": 1.7795934677124023, |
|
"learning_rate": 9.929214929214929e-05, |
|
"loss": 1.5351, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.010582858975386836, |
|
"grad_norm": 1.794394850730896, |
|
"learning_rate": 9.925997425997427e-05, |
|
"loss": 1.6685, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.01090355167161068, |
|
"grad_norm": 1.784448504447937, |
|
"learning_rate": 9.922779922779923e-05, |
|
"loss": 1.6303, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.011224244367834523, |
|
"grad_norm": 1.6641632318496704, |
|
"learning_rate": 9.91956241956242e-05, |
|
"loss": 1.4606, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.011544937064058366, |
|
"grad_norm": 1.7565549612045288, |
|
"learning_rate": 9.916344916344918e-05, |
|
"loss": 1.7965, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.01186562976028221, |
|
"grad_norm": 1.863232135772705, |
|
"learning_rate": 9.913127413127413e-05, |
|
"loss": 1.6193, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.012186322456506053, |
|
"grad_norm": 1.9475373029708862, |
|
"learning_rate": 9.90990990990991e-05, |
|
"loss": 1.6691, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.012507015152729896, |
|
"grad_norm": 1.7293490171432495, |
|
"learning_rate": 9.906692406692407e-05, |
|
"loss": 1.6587, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.01282770784895374, |
|
"grad_norm": 1.7146735191345215, |
|
"learning_rate": 9.903474903474904e-05, |
|
"loss": 1.7512, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.013148400545177583, |
|
"grad_norm": 1.6908844709396362, |
|
"learning_rate": 9.900257400257401e-05, |
|
"loss": 1.5004, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.013469093241401426, |
|
"grad_norm": 2.0432796478271484, |
|
"learning_rate": 9.897039897039896e-05, |
|
"loss": 1.5675, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.013789785937625271, |
|
"grad_norm": 1.9518095254898071, |
|
"learning_rate": 9.893822393822394e-05, |
|
"loss": 1.5885, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.014110478633849115, |
|
"grad_norm": 2.044656753540039, |
|
"learning_rate": 9.89060489060489e-05, |
|
"loss": 1.914, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.014431171330072958, |
|
"grad_norm": 1.8500516414642334, |
|
"learning_rate": 9.887387387387388e-05, |
|
"loss": 1.6865, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.014751864026296801, |
|
"grad_norm": 1.7090437412261963, |
|
"learning_rate": 9.884169884169885e-05, |
|
"loss": 1.6582, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.015072556722520645, |
|
"grad_norm": 1.6555441617965698, |
|
"learning_rate": 9.880952380952381e-05, |
|
"loss": 1.4794, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.015393249418744488, |
|
"grad_norm": 1.6648783683776855, |
|
"learning_rate": 9.877734877734878e-05, |
|
"loss": 1.2951, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.01571394211496833, |
|
"grad_norm": 1.7010596990585327, |
|
"learning_rate": 9.874517374517374e-05, |
|
"loss": 1.5657, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.016034634811192175, |
|
"grad_norm": 1.7453527450561523, |
|
"learning_rate": 9.871299871299872e-05, |
|
"loss": 1.4699, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.01635532750741602, |
|
"grad_norm": 1.7735224962234497, |
|
"learning_rate": 9.868082368082369e-05, |
|
"loss": 1.724, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.01667602020363986, |
|
"grad_norm": 1.5949498414993286, |
|
"learning_rate": 9.864864864864865e-05, |
|
"loss": 1.6525, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.016996712899863705, |
|
"grad_norm": 1.8592952489852905, |
|
"learning_rate": 9.861647361647363e-05, |
|
"loss": 1.6122, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.01731740559608755, |
|
"grad_norm": 1.7868975400924683, |
|
"learning_rate": 9.858429858429858e-05, |
|
"loss": 1.5783, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.01763809829231139, |
|
"grad_norm": 1.6725190877914429, |
|
"learning_rate": 9.855212355212356e-05, |
|
"loss": 1.6519, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.017958790988535235, |
|
"grad_norm": 2.030893564224243, |
|
"learning_rate": 9.851994851994852e-05, |
|
"loss": 1.881, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.01827948368475908, |
|
"grad_norm": 1.79916512966156, |
|
"learning_rate": 9.84877734877735e-05, |
|
"loss": 1.6116, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.018600176380982922, |
|
"grad_norm": 1.7612411975860596, |
|
"learning_rate": 9.845559845559846e-05, |
|
"loss": 1.5393, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.018920869077206765, |
|
"grad_norm": 1.8098998069763184, |
|
"learning_rate": 9.842342342342343e-05, |
|
"loss": 1.693, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.01924156177343061, |
|
"grad_norm": 1.8984328508377075, |
|
"learning_rate": 9.839124839124839e-05, |
|
"loss": 1.6939, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.019562254469654452, |
|
"grad_norm": 1.751279592514038, |
|
"learning_rate": 9.835907335907336e-05, |
|
"loss": 1.6109, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.0198829471658783, |
|
"grad_norm": 1.8087176084518433, |
|
"learning_rate": 9.832689832689834e-05, |
|
"loss": 1.5514, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.020203639862102142, |
|
"grad_norm": 1.7087894678115845, |
|
"learning_rate": 9.82947232947233e-05, |
|
"loss": 1.6691, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.020524332558325985, |
|
"grad_norm": 1.963853120803833, |
|
"learning_rate": 9.826254826254826e-05, |
|
"loss": 1.7157, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.02084502525454983, |
|
"grad_norm": 1.9152112007141113, |
|
"learning_rate": 9.823037323037324e-05, |
|
"loss": 1.5779, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.021165717950773672, |
|
"grad_norm": 1.856195092201233, |
|
"learning_rate": 9.81981981981982e-05, |
|
"loss": 1.6279, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.021486410646997516, |
|
"grad_norm": 1.8876991271972656, |
|
"learning_rate": 9.816602316602317e-05, |
|
"loss": 1.7734, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.02180710334322136, |
|
"grad_norm": 1.8838751316070557, |
|
"learning_rate": 9.813384813384814e-05, |
|
"loss": 1.6114, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.022127796039445202, |
|
"grad_norm": 1.553562045097351, |
|
"learning_rate": 9.810167310167311e-05, |
|
"loss": 1.4989, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.022448488735669046, |
|
"grad_norm": 1.9303158521652222, |
|
"learning_rate": 9.806949806949808e-05, |
|
"loss": 1.7525, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.02276918143189289, |
|
"grad_norm": 1.655938982963562, |
|
"learning_rate": 9.803732303732304e-05, |
|
"loss": 1.6089, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.023089874128116732, |
|
"grad_norm": 2.0096657276153564, |
|
"learning_rate": 9.800514800514801e-05, |
|
"loss": 1.6957, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.023410566824340576, |
|
"grad_norm": 1.8276057243347168, |
|
"learning_rate": 9.797297297297297e-05, |
|
"loss": 1.684, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.02373125952056442, |
|
"grad_norm": 1.6198551654815674, |
|
"learning_rate": 9.794079794079795e-05, |
|
"loss": 1.5419, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.024051952216788262, |
|
"grad_norm": 1.850451111793518, |
|
"learning_rate": 9.790862290862292e-05, |
|
"loss": 1.5769, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.024372644913012106, |
|
"grad_norm": 1.8147132396697998, |
|
"learning_rate": 9.787644787644788e-05, |
|
"loss": 1.4616, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.02469333760923595, |
|
"grad_norm": 1.6501047611236572, |
|
"learning_rate": 9.784427284427284e-05, |
|
"loss": 1.3938, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.025014030305459792, |
|
"grad_norm": 1.7198457717895508, |
|
"learning_rate": 9.781209781209781e-05, |
|
"loss": 1.4913, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.025334723001683636, |
|
"grad_norm": 1.8030346632003784, |
|
"learning_rate": 9.777992277992279e-05, |
|
"loss": 1.5601, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.02565541569790748, |
|
"grad_norm": 1.6368941068649292, |
|
"learning_rate": 9.774774774774775e-05, |
|
"loss": 1.5337, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.025976108394131323, |
|
"grad_norm": 1.6790492534637451, |
|
"learning_rate": 9.771557271557273e-05, |
|
"loss": 1.5556, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.026296801090355166, |
|
"grad_norm": 1.7556215524673462, |
|
"learning_rate": 9.76833976833977e-05, |
|
"loss": 1.5403, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.02661749378657901, |
|
"grad_norm": 1.6657801866531372, |
|
"learning_rate": 9.765122265122264e-05, |
|
"loss": 1.5023, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.026938186482802853, |
|
"grad_norm": 1.8705981969833374, |
|
"learning_rate": 9.761904761904762e-05, |
|
"loss": 1.5976, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.027258879179026696, |
|
"grad_norm": 1.8940606117248535, |
|
"learning_rate": 9.758687258687259e-05, |
|
"loss": 1.7891, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.027579571875250543, |
|
"grad_norm": 1.5879243612289429, |
|
"learning_rate": 9.755469755469757e-05, |
|
"loss": 1.4251, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.027900264571474386, |
|
"grad_norm": 1.5877267122268677, |
|
"learning_rate": 9.752252252252253e-05, |
|
"loss": 1.6625, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.02822095726769823, |
|
"grad_norm": 1.6735244989395142, |
|
"learning_rate": 9.74903474903475e-05, |
|
"loss": 1.5938, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.028541649963922073, |
|
"grad_norm": 1.6752715110778809, |
|
"learning_rate": 9.745817245817246e-05, |
|
"loss": 1.4759, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.028862342660145916, |
|
"grad_norm": 1.8093688488006592, |
|
"learning_rate": 9.742599742599742e-05, |
|
"loss": 1.8628, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.02918303535636976, |
|
"grad_norm": 1.822588324546814, |
|
"learning_rate": 9.73938223938224e-05, |
|
"loss": 1.6622, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.029503728052593603, |
|
"grad_norm": 1.7710521221160889, |
|
"learning_rate": 9.736164736164737e-05, |
|
"loss": 1.4719, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.029824420748817446, |
|
"grad_norm": 1.4404919147491455, |
|
"learning_rate": 9.732947232947234e-05, |
|
"loss": 1.1332, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.03014511344504129, |
|
"grad_norm": 1.6102395057678223, |
|
"learning_rate": 9.729729729729731e-05, |
|
"loss": 1.4751, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.030465806141265133, |
|
"grad_norm": 1.706247091293335, |
|
"learning_rate": 9.726512226512226e-05, |
|
"loss": 1.5941, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.030786498837488976, |
|
"grad_norm": 1.6406103372573853, |
|
"learning_rate": 9.723294723294724e-05, |
|
"loss": 1.5951, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.03110719153371282, |
|
"grad_norm": 1.6317566633224487, |
|
"learning_rate": 9.72007722007722e-05, |
|
"loss": 1.4737, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.03142788422993666, |
|
"grad_norm": 1.8702404499053955, |
|
"learning_rate": 9.716859716859718e-05, |
|
"loss": 1.5181, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.03174857692616051, |
|
"grad_norm": 1.725326657295227, |
|
"learning_rate": 9.713642213642214e-05, |
|
"loss": 1.6582, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.03206926962238435, |
|
"grad_norm": 1.7484664916992188, |
|
"learning_rate": 9.710424710424711e-05, |
|
"loss": 1.6073, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0323899623186082, |
|
"grad_norm": 1.8844904899597168, |
|
"learning_rate": 9.707207207207207e-05, |
|
"loss": 1.6232, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.03271065501483204, |
|
"grad_norm": 1.8485490083694458, |
|
"learning_rate": 9.703989703989704e-05, |
|
"loss": 1.5857, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.03303134771105588, |
|
"grad_norm": 1.7108585834503174, |
|
"learning_rate": 9.700772200772202e-05, |
|
"loss": 1.7378, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.03335204040727972, |
|
"grad_norm": 1.869952917098999, |
|
"learning_rate": 9.697554697554698e-05, |
|
"loss": 1.7696, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.03367273310350357, |
|
"grad_norm": 1.6918983459472656, |
|
"learning_rate": 9.694337194337196e-05, |
|
"loss": 1.4724, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.03399342579972741, |
|
"grad_norm": 1.903573989868164, |
|
"learning_rate": 9.691119691119691e-05, |
|
"loss": 1.7818, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.03431411849595126, |
|
"grad_norm": 1.7081507444381714, |
|
"learning_rate": 9.687902187902187e-05, |
|
"loss": 1.5507, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.0346348111921751, |
|
"grad_norm": 1.8181805610656738, |
|
"learning_rate": 9.684684684684685e-05, |
|
"loss": 1.6716, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.034955503888398944, |
|
"grad_norm": 1.7593623399734497, |
|
"learning_rate": 9.681467181467182e-05, |
|
"loss": 1.6878, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.03527619658462278, |
|
"grad_norm": 1.6997793912887573, |
|
"learning_rate": 9.67824967824968e-05, |
|
"loss": 1.5886, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.03559688928084663, |
|
"grad_norm": 1.5424351692199707, |
|
"learning_rate": 9.675032175032176e-05, |
|
"loss": 1.6448, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.03591758197707047, |
|
"grad_norm": 1.8513628244400024, |
|
"learning_rate": 9.671814671814672e-05, |
|
"loss": 1.4854, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.03623827467329432, |
|
"grad_norm": 1.7762032747268677, |
|
"learning_rate": 9.668597168597169e-05, |
|
"loss": 1.5484, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.03655896736951816, |
|
"grad_norm": 1.8025050163269043, |
|
"learning_rate": 9.665379665379665e-05, |
|
"loss": 1.7242, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.036879660065742004, |
|
"grad_norm": 1.6463018655776978, |
|
"learning_rate": 9.662162162162163e-05, |
|
"loss": 1.5908, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.037200352761965844, |
|
"grad_norm": 1.640679121017456, |
|
"learning_rate": 9.65894465894466e-05, |
|
"loss": 1.4735, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.03752104545818969, |
|
"grad_norm": 1.7171475887298584, |
|
"learning_rate": 9.655727155727157e-05, |
|
"loss": 1.5754, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.03784173815441353, |
|
"grad_norm": 1.6603971719741821, |
|
"learning_rate": 9.652509652509652e-05, |
|
"loss": 1.6748, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.03816243085063738, |
|
"grad_norm": 1.7294347286224365, |
|
"learning_rate": 9.649292149292149e-05, |
|
"loss": 1.6821, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.03848312354686122, |
|
"grad_norm": 1.8348171710968018, |
|
"learning_rate": 9.646074646074647e-05, |
|
"loss": 1.6028, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.038803816243085064, |
|
"grad_norm": 1.729529857635498, |
|
"learning_rate": 9.642857142857143e-05, |
|
"loss": 1.5516, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.039124508939308904, |
|
"grad_norm": 1.7815181016921997, |
|
"learning_rate": 9.639639639639641e-05, |
|
"loss": 1.6472, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.03944520163553275, |
|
"grad_norm": 1.8458462953567505, |
|
"learning_rate": 9.636422136422137e-05, |
|
"loss": 1.5905, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.0397658943317566, |
|
"grad_norm": 1.6666098833084106, |
|
"learning_rate": 9.633204633204634e-05, |
|
"loss": 1.4131, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.04008658702798044, |
|
"grad_norm": 1.9331727027893066, |
|
"learning_rate": 9.62998712998713e-05, |
|
"loss": 1.5866, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.040407279724204284, |
|
"grad_norm": 1.8131427764892578, |
|
"learning_rate": 9.626769626769627e-05, |
|
"loss": 1.6006, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.040727972420428124, |
|
"grad_norm": 1.6893302202224731, |
|
"learning_rate": 9.623552123552125e-05, |
|
"loss": 1.7013, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.04104866511665197, |
|
"grad_norm": 1.8650181293487549, |
|
"learning_rate": 9.620334620334621e-05, |
|
"loss": 1.7522, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.04136935781287581, |
|
"grad_norm": 1.8159857988357544, |
|
"learning_rate": 9.617117117117117e-05, |
|
"loss": 1.4935, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.04169005050909966, |
|
"grad_norm": 1.919482707977295, |
|
"learning_rate": 9.613899613899614e-05, |
|
"loss": 1.6084, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.0420107432053235, |
|
"grad_norm": 1.721970558166504, |
|
"learning_rate": 9.61068211068211e-05, |
|
"loss": 1.5999, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.042331435901547344, |
|
"grad_norm": 1.6646255254745483, |
|
"learning_rate": 9.607464607464608e-05, |
|
"loss": 1.5181, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.042652128597771184, |
|
"grad_norm": 1.7984895706176758, |
|
"learning_rate": 9.604247104247105e-05, |
|
"loss": 1.6474, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.04297282129399503, |
|
"grad_norm": 1.83111572265625, |
|
"learning_rate": 9.601029601029602e-05, |
|
"loss": 1.671, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.04329351399021887, |
|
"grad_norm": 1.7685164213180542, |
|
"learning_rate": 9.597812097812098e-05, |
|
"loss": 1.6629, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.04361420668644272, |
|
"grad_norm": 1.7368009090423584, |
|
"learning_rate": 9.594594594594595e-05, |
|
"loss": 1.5108, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.04393489938266656, |
|
"grad_norm": 1.9609910249710083, |
|
"learning_rate": 9.591377091377092e-05, |
|
"loss": 1.6246, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.044255592078890404, |
|
"grad_norm": 2.0718841552734375, |
|
"learning_rate": 9.588159588159588e-05, |
|
"loss": 1.6929, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.044576284775114244, |
|
"grad_norm": 1.6211580038070679, |
|
"learning_rate": 9.584942084942086e-05, |
|
"loss": 1.4574, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.04489697747133809, |
|
"grad_norm": 1.6323758363723755, |
|
"learning_rate": 9.581724581724583e-05, |
|
"loss": 1.5247, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.04521767016756193, |
|
"grad_norm": 1.8777216672897339, |
|
"learning_rate": 9.578507078507079e-05, |
|
"loss": 1.7277, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.04553836286378578, |
|
"grad_norm": 1.6735459566116333, |
|
"learning_rate": 9.575289575289575e-05, |
|
"loss": 1.5469, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.04585905556000962, |
|
"grad_norm": 1.5979048013687134, |
|
"learning_rate": 9.572072072072072e-05, |
|
"loss": 1.524, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.046179748256233465, |
|
"grad_norm": 1.7182780504226685, |
|
"learning_rate": 9.56885456885457e-05, |
|
"loss": 1.6218, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.046500440952457305, |
|
"grad_norm": 1.7319525480270386, |
|
"learning_rate": 9.565637065637066e-05, |
|
"loss": 1.684, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.04682113364868115, |
|
"grad_norm": 1.7818727493286133, |
|
"learning_rate": 9.562419562419564e-05, |
|
"loss": 1.5768, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.047141826344905, |
|
"grad_norm": 1.6169071197509766, |
|
"learning_rate": 9.559202059202059e-05, |
|
"loss": 1.5124, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.04746251904112884, |
|
"grad_norm": 1.8243296146392822, |
|
"learning_rate": 9.555984555984557e-05, |
|
"loss": 1.7473, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.047783211737352685, |
|
"grad_norm": 1.7316741943359375, |
|
"learning_rate": 9.552767052767053e-05, |
|
"loss": 1.6526, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.048103904433576525, |
|
"grad_norm": 1.712912678718567, |
|
"learning_rate": 9.54954954954955e-05, |
|
"loss": 1.7089, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.04842459712980037, |
|
"grad_norm": 1.6389209032058716, |
|
"learning_rate": 9.546332046332048e-05, |
|
"loss": 1.4369, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.04874528982602421, |
|
"grad_norm": 1.9212989807128906, |
|
"learning_rate": 9.543114543114544e-05, |
|
"loss": 1.8143, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.04906598252224806, |
|
"grad_norm": 1.804646611213684, |
|
"learning_rate": 9.53989703989704e-05, |
|
"loss": 1.5463, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.0493866752184719, |
|
"grad_norm": 1.7606157064437866, |
|
"learning_rate": 9.536679536679537e-05, |
|
"loss": 1.6029, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.049707367914695745, |
|
"grad_norm": 1.673525094985962, |
|
"learning_rate": 9.533462033462033e-05, |
|
"loss": 1.4489, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.050028060610919585, |
|
"grad_norm": 1.677157998085022, |
|
"learning_rate": 9.530244530244531e-05, |
|
"loss": 1.5199, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.05034875330714343, |
|
"grad_norm": 1.804135799407959, |
|
"learning_rate": 9.527027027027028e-05, |
|
"loss": 1.5992, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.05066944600336727, |
|
"grad_norm": 1.651515007019043, |
|
"learning_rate": 9.523809523809524e-05, |
|
"loss": 1.5613, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.05099013869959112, |
|
"grad_norm": 1.7162939310073853, |
|
"learning_rate": 9.52059202059202e-05, |
|
"loss": 1.6594, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.05131083139581496, |
|
"grad_norm": 1.8460568189620972, |
|
"learning_rate": 9.517374517374518e-05, |
|
"loss": 1.7585, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.051631524092038805, |
|
"grad_norm": 1.9340801239013672, |
|
"learning_rate": 9.514157014157015e-05, |
|
"loss": 1.656, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.051952216788262645, |
|
"grad_norm": 1.7593783140182495, |
|
"learning_rate": 9.510939510939511e-05, |
|
"loss": 1.4838, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.05227290948448649, |
|
"grad_norm": 1.7070125341415405, |
|
"learning_rate": 9.507722007722009e-05, |
|
"loss": 1.4695, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.05259360218071033, |
|
"grad_norm": 1.6149088144302368, |
|
"learning_rate": 9.504504504504504e-05, |
|
"loss": 1.5294, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.05291429487693418, |
|
"grad_norm": 1.7407358884811401, |
|
"learning_rate": 9.501287001287002e-05, |
|
"loss": 1.5296, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.05323498757315802, |
|
"grad_norm": 1.8265562057495117, |
|
"learning_rate": 9.498069498069498e-05, |
|
"loss": 1.7319, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.053555680269381865, |
|
"grad_norm": 1.7947865724563599, |
|
"learning_rate": 9.494851994851995e-05, |
|
"loss": 1.5892, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.053876372965605705, |
|
"grad_norm": 1.9553256034851074, |
|
"learning_rate": 9.491634491634493e-05, |
|
"loss": 1.8113, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.05419706566182955, |
|
"grad_norm": 1.8548988103866577, |
|
"learning_rate": 9.488416988416989e-05, |
|
"loss": 1.6531, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.05451775835805339, |
|
"grad_norm": 1.9492192268371582, |
|
"learning_rate": 9.485199485199486e-05, |
|
"loss": 1.6863, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.05483845105427724, |
|
"grad_norm": 1.9482530355453491, |
|
"learning_rate": 9.481981981981982e-05, |
|
"loss": 1.7185, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.055159143750501086, |
|
"grad_norm": 1.7255852222442627, |
|
"learning_rate": 9.47876447876448e-05, |
|
"loss": 1.5779, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.055479836446724926, |
|
"grad_norm": 1.706764578819275, |
|
"learning_rate": 9.475546975546976e-05, |
|
"loss": 1.664, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.05580052914294877, |
|
"grad_norm": 1.8826390504837036, |
|
"learning_rate": 9.472329472329473e-05, |
|
"loss": 1.7556, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.05612122183917261, |
|
"grad_norm": 1.7620404958724976, |
|
"learning_rate": 9.46911196911197e-05, |
|
"loss": 1.4872, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.05644191453539646, |
|
"grad_norm": 1.6721909046173096, |
|
"learning_rate": 9.465894465894466e-05, |
|
"loss": 1.4319, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.0567626072316203, |
|
"grad_norm": 1.5903970003128052, |
|
"learning_rate": 9.462676962676963e-05, |
|
"loss": 1.3132, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.057083299927844146, |
|
"grad_norm": 1.77998685836792, |
|
"learning_rate": 9.45945945945946e-05, |
|
"loss": 1.6568, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.057403992624067986, |
|
"grad_norm": 1.7730937004089355, |
|
"learning_rate": 9.456241956241956e-05, |
|
"loss": 1.7011, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.05772468532029183, |
|
"grad_norm": 1.7409788370132446, |
|
"learning_rate": 9.453024453024454e-05, |
|
"loss": 1.4079, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.05804537801651567, |
|
"grad_norm": 1.6435108184814453, |
|
"learning_rate": 9.44980694980695e-05, |
|
"loss": 1.3067, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.05836607071273952, |
|
"grad_norm": 1.6248952150344849, |
|
"learning_rate": 9.446589446589447e-05, |
|
"loss": 1.4392, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.05868676340896336, |
|
"grad_norm": 1.7086244821548462, |
|
"learning_rate": 9.443371943371943e-05, |
|
"loss": 1.5775, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.059007456105187206, |
|
"grad_norm": 1.8702465295791626, |
|
"learning_rate": 9.44015444015444e-05, |
|
"loss": 1.5322, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.059328148801411046, |
|
"grad_norm": 1.7721097469329834, |
|
"learning_rate": 9.436936936936938e-05, |
|
"loss": 1.4723, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.05964884149763489, |
|
"grad_norm": 1.7834652662277222, |
|
"learning_rate": 9.433719433719434e-05, |
|
"loss": 1.5833, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.05996953419385873, |
|
"grad_norm": 1.840971827507019, |
|
"learning_rate": 9.43050193050193e-05, |
|
"loss": 1.6228, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.06029022689008258, |
|
"grad_norm": 1.7249364852905273, |
|
"learning_rate": 9.427284427284427e-05, |
|
"loss": 1.4276, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.06061091958630642, |
|
"grad_norm": 1.8125596046447754, |
|
"learning_rate": 9.424066924066925e-05, |
|
"loss": 1.4192, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.060931612282530266, |
|
"grad_norm": 1.6701613664627075, |
|
"learning_rate": 9.420849420849421e-05, |
|
"loss": 1.7908, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.061252304978754106, |
|
"grad_norm": 1.7857857942581177, |
|
"learning_rate": 9.417631917631918e-05, |
|
"loss": 1.7204, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.06157299767497795, |
|
"grad_norm": 1.917222499847412, |
|
"learning_rate": 9.414414414414416e-05, |
|
"loss": 1.7897, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.06189369037120179, |
|
"grad_norm": 1.814931035041809, |
|
"learning_rate": 9.411196911196911e-05, |
|
"loss": 1.6911, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.06221438306742564, |
|
"grad_norm": 1.600833535194397, |
|
"learning_rate": 9.407979407979409e-05, |
|
"loss": 1.4422, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.06253507576364949, |
|
"grad_norm": 1.750791072845459, |
|
"learning_rate": 9.404761904761905e-05, |
|
"loss": 1.602, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.06285576845987333, |
|
"grad_norm": 1.8130338191986084, |
|
"learning_rate": 9.401544401544401e-05, |
|
"loss": 1.7479, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.06317646115609717, |
|
"grad_norm": 1.8416740894317627, |
|
"learning_rate": 9.398326898326899e-05, |
|
"loss": 1.7607, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.06349715385232102, |
|
"grad_norm": 1.7460497617721558, |
|
"learning_rate": 9.395109395109396e-05, |
|
"loss": 1.796, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.06381784654854486, |
|
"grad_norm": 1.8751194477081299, |
|
"learning_rate": 9.391891891891892e-05, |
|
"loss": 1.7973, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.0641385392447687, |
|
"grad_norm": 1.6479636430740356, |
|
"learning_rate": 9.388674388674389e-05, |
|
"loss": 1.4562, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.06445923194099254, |
|
"grad_norm": 1.813751220703125, |
|
"learning_rate": 9.385456885456886e-05, |
|
"loss": 1.4338, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.0647799246372164, |
|
"grad_norm": 1.991184115409851, |
|
"learning_rate": 9.382239382239383e-05, |
|
"loss": 1.6387, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.06510061733344023, |
|
"grad_norm": 1.622376561164856, |
|
"learning_rate": 9.379021879021879e-05, |
|
"loss": 1.5389, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.06542131002966407, |
|
"grad_norm": 1.4437798261642456, |
|
"learning_rate": 9.375804375804377e-05, |
|
"loss": 1.1904, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.06574200272588791, |
|
"grad_norm": 1.7311660051345825, |
|
"learning_rate": 9.372586872586872e-05, |
|
"loss": 1.692, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.06606269542211177, |
|
"grad_norm": 1.5912631750106812, |
|
"learning_rate": 9.36936936936937e-05, |
|
"loss": 1.3408, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.0663833881183356, |
|
"grad_norm": 1.7208282947540283, |
|
"learning_rate": 9.366151866151866e-05, |
|
"loss": 1.5267, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.06670408081455945, |
|
"grad_norm": 1.7791844606399536, |
|
"learning_rate": 9.362934362934363e-05, |
|
"loss": 1.6352, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.06702477351078329, |
|
"grad_norm": 1.7080252170562744, |
|
"learning_rate": 9.359716859716861e-05, |
|
"loss": 1.6333, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.06734546620700714, |
|
"grad_norm": 1.6710137128829956, |
|
"learning_rate": 9.356499356499357e-05, |
|
"loss": 1.6988, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.06766615890323098, |
|
"grad_norm": 1.8755836486816406, |
|
"learning_rate": 9.353281853281854e-05, |
|
"loss": 1.7126, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.06798685159945482, |
|
"grad_norm": 1.7504806518554688, |
|
"learning_rate": 9.35006435006435e-05, |
|
"loss": 1.561, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.06830754429567866, |
|
"grad_norm": 1.6312227249145508, |
|
"learning_rate": 9.346846846846848e-05, |
|
"loss": 1.5506, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.06862823699190251, |
|
"grad_norm": 1.7328168153762817, |
|
"learning_rate": 9.343629343629344e-05, |
|
"loss": 1.4784, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.06894892968812635, |
|
"grad_norm": 1.6513800621032715, |
|
"learning_rate": 9.340411840411841e-05, |
|
"loss": 1.5849, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.0692696223843502, |
|
"grad_norm": 1.6930090188980103, |
|
"learning_rate": 9.337194337194337e-05, |
|
"loss": 1.6313, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.06959031508057403, |
|
"grad_norm": 1.7721333503723145, |
|
"learning_rate": 9.333976833976834e-05, |
|
"loss": 1.5716, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.06991100777679789, |
|
"grad_norm": 1.7768151760101318, |
|
"learning_rate": 9.330759330759331e-05, |
|
"loss": 1.5215, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.07023170047302173, |
|
"grad_norm": 1.6654014587402344, |
|
"learning_rate": 9.327541827541828e-05, |
|
"loss": 1.6394, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.07055239316924557, |
|
"grad_norm": 1.8215490579605103, |
|
"learning_rate": 9.324324324324324e-05, |
|
"loss": 1.552, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.07087308586546942, |
|
"grad_norm": 1.7659235000610352, |
|
"learning_rate": 9.321106821106822e-05, |
|
"loss": 1.7689, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.07119377856169326, |
|
"grad_norm": 1.8315811157226562, |
|
"learning_rate": 9.317889317889317e-05, |
|
"loss": 1.518, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.0715144712579171, |
|
"grad_norm": 1.6116633415222168, |
|
"learning_rate": 9.314671814671815e-05, |
|
"loss": 1.5717, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.07183516395414094, |
|
"grad_norm": 1.6100307703018188, |
|
"learning_rate": 9.311454311454312e-05, |
|
"loss": 1.6901, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.0721558566503648, |
|
"grad_norm": 1.7291269302368164, |
|
"learning_rate": 9.30823680823681e-05, |
|
"loss": 1.6414, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.07247654934658863, |
|
"grad_norm": 1.8281351327896118, |
|
"learning_rate": 9.305019305019306e-05, |
|
"loss": 1.8009, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.07279724204281247, |
|
"grad_norm": 1.675513744354248, |
|
"learning_rate": 9.301801801801802e-05, |
|
"loss": 1.5188, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.07311793473903631, |
|
"grad_norm": 1.684611201286316, |
|
"learning_rate": 9.298584298584299e-05, |
|
"loss": 1.6918, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.07343862743526017, |
|
"grad_norm": 1.9136089086532593, |
|
"learning_rate": 9.295366795366795e-05, |
|
"loss": 1.6034, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.07375932013148401, |
|
"grad_norm": 1.8346682786941528, |
|
"learning_rate": 9.292149292149293e-05, |
|
"loss": 1.618, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.07408001282770785, |
|
"grad_norm": 1.842098355293274, |
|
"learning_rate": 9.28893178893179e-05, |
|
"loss": 1.4825, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.07440070552393169, |
|
"grad_norm": 1.723360538482666, |
|
"learning_rate": 9.285714285714286e-05, |
|
"loss": 1.3566, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.07472139822015554, |
|
"grad_norm": 1.6950525045394897, |
|
"learning_rate": 9.282496782496784e-05, |
|
"loss": 1.6979, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.07504209091637938, |
|
"grad_norm": 1.7014894485473633, |
|
"learning_rate": 9.279279279279279e-05, |
|
"loss": 1.5959, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.07536278361260322, |
|
"grad_norm": 1.7120134830474854, |
|
"learning_rate": 9.276061776061777e-05, |
|
"loss": 1.6729, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.07568347630882706, |
|
"grad_norm": 1.7131479978561401, |
|
"learning_rate": 9.272844272844273e-05, |
|
"loss": 1.6015, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.07600416900505091, |
|
"grad_norm": 1.7372465133666992, |
|
"learning_rate": 9.269626769626771e-05, |
|
"loss": 1.675, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.07632486170127475, |
|
"grad_norm": 1.7408344745635986, |
|
"learning_rate": 9.266409266409267e-05, |
|
"loss": 1.5983, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.0766455543974986, |
|
"grad_norm": 1.8079317808151245, |
|
"learning_rate": 9.263191763191764e-05, |
|
"loss": 1.5834, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.07696624709372243, |
|
"grad_norm": 1.7480193376541138, |
|
"learning_rate": 9.25997425997426e-05, |
|
"loss": 1.6799, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.07728693978994629, |
|
"grad_norm": 1.6279219388961792, |
|
"learning_rate": 9.256756756756757e-05, |
|
"loss": 1.6193, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.07760763248617013, |
|
"grad_norm": 1.877221941947937, |
|
"learning_rate": 9.253539253539254e-05, |
|
"loss": 1.5518, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.07792832518239397, |
|
"grad_norm": 1.7862673997879028, |
|
"learning_rate": 9.250321750321751e-05, |
|
"loss": 1.6225, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.07824901787861781, |
|
"grad_norm": 1.7293739318847656, |
|
"learning_rate": 9.247104247104247e-05, |
|
"loss": 1.6664, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.07856971057484166, |
|
"grad_norm": 1.6670184135437012, |
|
"learning_rate": 9.243886743886744e-05, |
|
"loss": 1.4997, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.0788904032710655, |
|
"grad_norm": 1.6994450092315674, |
|
"learning_rate": 9.24066924066924e-05, |
|
"loss": 1.4323, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.07921109596728934, |
|
"grad_norm": 1.8251434564590454, |
|
"learning_rate": 9.237451737451738e-05, |
|
"loss": 1.7074, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.0795317886635132, |
|
"grad_norm": 1.6368730068206787, |
|
"learning_rate": 9.234234234234235e-05, |
|
"loss": 1.5412, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.07985248135973703, |
|
"grad_norm": 1.4857044219970703, |
|
"learning_rate": 9.231016731016732e-05, |
|
"loss": 1.3139, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.08017317405596087, |
|
"grad_norm": 1.7797576189041138, |
|
"learning_rate": 9.227799227799229e-05, |
|
"loss": 1.6985, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.08049386675218471, |
|
"grad_norm": 1.8514691591262817, |
|
"learning_rate": 9.224581724581724e-05, |
|
"loss": 1.7173, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.08081455944840857, |
|
"grad_norm": 1.7061352729797363, |
|
"learning_rate": 9.221364221364222e-05, |
|
"loss": 1.4749, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.08113525214463241, |
|
"grad_norm": 1.8911951780319214, |
|
"learning_rate": 9.218146718146718e-05, |
|
"loss": 1.612, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.08145594484085625, |
|
"grad_norm": 1.8928183317184448, |
|
"learning_rate": 9.214929214929216e-05, |
|
"loss": 1.7248, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.08177663753708009, |
|
"grad_norm": 1.69159996509552, |
|
"learning_rate": 9.211711711711712e-05, |
|
"loss": 1.5939, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.08209733023330394, |
|
"grad_norm": 1.7285486459732056, |
|
"learning_rate": 9.208494208494209e-05, |
|
"loss": 1.4037, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.08241802292952778, |
|
"grad_norm": 1.5664756298065186, |
|
"learning_rate": 9.205276705276705e-05, |
|
"loss": 1.544, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.08273871562575162, |
|
"grad_norm": 1.8683611154556274, |
|
"learning_rate": 9.202059202059202e-05, |
|
"loss": 1.4889, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.08305940832197546, |
|
"grad_norm": 2.022183656692505, |
|
"learning_rate": 9.1988416988417e-05, |
|
"loss": 1.6101, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.08338010101819932, |
|
"grad_norm": 1.7333014011383057, |
|
"learning_rate": 9.195624195624196e-05, |
|
"loss": 1.4097, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.08370079371442316, |
|
"grad_norm": 1.693903923034668, |
|
"learning_rate": 9.192406692406694e-05, |
|
"loss": 1.6963, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.084021486410647, |
|
"grad_norm": 1.8906500339508057, |
|
"learning_rate": 9.18918918918919e-05, |
|
"loss": 1.7168, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.08434217910687083, |
|
"grad_norm": 1.8799378871917725, |
|
"learning_rate": 9.185971685971685e-05, |
|
"loss": 1.6934, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.08466287180309469, |
|
"grad_norm": 1.8366758823394775, |
|
"learning_rate": 9.182754182754183e-05, |
|
"loss": 1.7188, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.08498356449931853, |
|
"grad_norm": 1.7556151151657104, |
|
"learning_rate": 9.17953667953668e-05, |
|
"loss": 1.6177, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.08530425719554237, |
|
"grad_norm": 1.7707295417785645, |
|
"learning_rate": 9.176319176319177e-05, |
|
"loss": 1.5682, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.08562494989176621, |
|
"grad_norm": 1.636765480041504, |
|
"learning_rate": 9.173101673101674e-05, |
|
"loss": 1.4409, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.08594564258799006, |
|
"grad_norm": 1.7112761735916138, |
|
"learning_rate": 9.16988416988417e-05, |
|
"loss": 1.6157, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.0862663352842139, |
|
"grad_norm": 1.9961086511611938, |
|
"learning_rate": 9.166666666666667e-05, |
|
"loss": 1.5517, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.08658702798043774, |
|
"grad_norm": 1.5996118783950806, |
|
"learning_rate": 9.163449163449163e-05, |
|
"loss": 1.4812, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.0869077206766616, |
|
"grad_norm": 1.658401608467102, |
|
"learning_rate": 9.160231660231661e-05, |
|
"loss": 1.4842, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.08722841337288544, |
|
"grad_norm": 1.7346595525741577, |
|
"learning_rate": 9.157014157014157e-05, |
|
"loss": 1.55, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.08754910606910928, |
|
"grad_norm": 1.7823814153671265, |
|
"learning_rate": 9.153796653796655e-05, |
|
"loss": 1.6065, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.08786979876533312, |
|
"grad_norm": 1.7370473146438599, |
|
"learning_rate": 9.15057915057915e-05, |
|
"loss": 1.5657, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.08819049146155697, |
|
"grad_norm": 1.5054192543029785, |
|
"learning_rate": 9.147361647361647e-05, |
|
"loss": 1.5523, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.08851118415778081, |
|
"grad_norm": 1.7042500972747803, |
|
"learning_rate": 9.144144144144145e-05, |
|
"loss": 1.6344, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.08883187685400465, |
|
"grad_norm": 1.6680580377578735, |
|
"learning_rate": 9.140926640926641e-05, |
|
"loss": 1.4047, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.08915256955022849, |
|
"grad_norm": 1.7721407413482666, |
|
"learning_rate": 9.137709137709139e-05, |
|
"loss": 1.5141, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.08947326224645234, |
|
"grad_norm": 1.6130516529083252, |
|
"learning_rate": 9.134491634491635e-05, |
|
"loss": 1.3017, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.08979395494267618, |
|
"grad_norm": 1.6346606016159058, |
|
"learning_rate": 9.131274131274132e-05, |
|
"loss": 1.3774, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.09011464763890002, |
|
"grad_norm": 1.720962643623352, |
|
"learning_rate": 9.128056628056628e-05, |
|
"loss": 1.4936, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.09043534033512386, |
|
"grad_norm": 1.7229537963867188, |
|
"learning_rate": 9.124839124839125e-05, |
|
"loss": 1.6381, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.09075603303134772, |
|
"grad_norm": 1.9374829530715942, |
|
"learning_rate": 9.121621621621623e-05, |
|
"loss": 1.5829, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.09107672572757156, |
|
"grad_norm": 1.6721988916397095, |
|
"learning_rate": 9.118404118404119e-05, |
|
"loss": 1.5449, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.0913974184237954, |
|
"grad_norm": 1.5761111974716187, |
|
"learning_rate": 9.115186615186617e-05, |
|
"loss": 1.388, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.09171811112001924, |
|
"grad_norm": 1.8343489170074463, |
|
"learning_rate": 9.111969111969112e-05, |
|
"loss": 1.5724, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.09203880381624309, |
|
"grad_norm": 1.564759373664856, |
|
"learning_rate": 9.108751608751608e-05, |
|
"loss": 1.4356, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.09235949651246693, |
|
"grad_norm": 1.6440503597259521, |
|
"learning_rate": 9.105534105534106e-05, |
|
"loss": 1.4281, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.09268018920869077, |
|
"grad_norm": 1.7821102142333984, |
|
"learning_rate": 9.102316602316603e-05, |
|
"loss": 1.5695, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.09300088190491461, |
|
"grad_norm": 1.7555899620056152, |
|
"learning_rate": 9.0990990990991e-05, |
|
"loss": 1.7338, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.09332157460113846, |
|
"grad_norm": 1.800379753112793, |
|
"learning_rate": 9.095881595881597e-05, |
|
"loss": 1.5684, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.0936422672973623, |
|
"grad_norm": 1.6488431692123413, |
|
"learning_rate": 9.092664092664093e-05, |
|
"loss": 1.4512, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.09396295999358614, |
|
"grad_norm": 1.72849440574646, |
|
"learning_rate": 9.08944658944659e-05, |
|
"loss": 1.6229, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.09428365268981, |
|
"grad_norm": 1.7736647129058838, |
|
"learning_rate": 9.086229086229086e-05, |
|
"loss": 1.4737, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.09460434538603384, |
|
"grad_norm": 1.7936447858810425, |
|
"learning_rate": 9.083011583011584e-05, |
|
"loss": 1.3228, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.09492503808225768, |
|
"grad_norm": 1.8301600217819214, |
|
"learning_rate": 9.07979407979408e-05, |
|
"loss": 1.6702, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.09524573077848152, |
|
"grad_norm": 1.5292707681655884, |
|
"learning_rate": 9.076576576576577e-05, |
|
"loss": 1.5324, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.09556642347470537, |
|
"grad_norm": 1.7899250984191895, |
|
"learning_rate": 9.073359073359073e-05, |
|
"loss": 1.6492, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.09588711617092921, |
|
"grad_norm": 1.6854242086410522, |
|
"learning_rate": 9.07014157014157e-05, |
|
"loss": 1.5048, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.09620780886715305, |
|
"grad_norm": 1.819797396659851, |
|
"learning_rate": 9.066924066924068e-05, |
|
"loss": 1.7865, |
|
"step": 300 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 3118, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.366563833469788e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|