{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9741379310344827,
"eval_steps": 58,
"global_step": 464,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004310344827586207,
"grad_norm": 0.38478580117225647,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.6158,
"step": 1
},
{
"epoch": 0.004310344827586207,
"eval_loss": 1.6722962856292725,
"eval_runtime": 38.7564,
"eval_samples_per_second": 11.43,
"eval_steps_per_second": 1.445,
"step": 1
},
{
"epoch": 0.008620689655172414,
"grad_norm": 0.3902539312839508,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.5696,
"step": 2
},
{
"epoch": 0.01293103448275862,
"grad_norm": 0.42075541615486145,
"learning_rate": 3e-06,
"loss": 1.6475,
"step": 3
},
{
"epoch": 0.017241379310344827,
"grad_norm": 0.4003991186618805,
"learning_rate": 4.000000000000001e-06,
"loss": 1.6201,
"step": 4
},
{
"epoch": 0.021551724137931036,
"grad_norm": 0.3858628571033478,
"learning_rate": 5e-06,
"loss": 1.5931,
"step": 5
},
{
"epoch": 0.02586206896551724,
"grad_norm": 0.4155072569847107,
"learning_rate": 6e-06,
"loss": 1.6432,
"step": 6
},
{
"epoch": 0.03017241379310345,
"grad_norm": 0.3816058039665222,
"learning_rate": 7e-06,
"loss": 1.6651,
"step": 7
},
{
"epoch": 0.034482758620689655,
"grad_norm": 0.38627564907073975,
"learning_rate": 8.000000000000001e-06,
"loss": 1.5938,
"step": 8
},
{
"epoch": 0.03879310344827586,
"grad_norm": 0.3964974284172058,
"learning_rate": 9e-06,
"loss": 1.6462,
"step": 9
},
{
"epoch": 0.04310344827586207,
"grad_norm": 0.374857634305954,
"learning_rate": 1e-05,
"loss": 1.6076,
"step": 10
},
{
"epoch": 0.04741379310344827,
"grad_norm": 0.41657668352127075,
"learning_rate": 9.999880291134381e-06,
"loss": 1.6585,
"step": 11
},
{
"epoch": 0.05172413793103448,
"grad_norm": 0.4166984260082245,
"learning_rate": 9.99952117026961e-06,
"loss": 1.6464,
"step": 12
},
{
"epoch": 0.05603448275862069,
"grad_norm": 0.41143307089805603,
"learning_rate": 9.998922654601666e-06,
"loss": 1.6041,
"step": 13
},
{
"epoch": 0.0603448275862069,
"grad_norm": 0.424064576625824,
"learning_rate": 9.998084772789603e-06,
"loss": 1.6668,
"step": 14
},
{
"epoch": 0.06465517241379311,
"grad_norm": 0.4521613121032715,
"learning_rate": 9.997007564954173e-06,
"loss": 1.6147,
"step": 15
},
{
"epoch": 0.06896551724137931,
"grad_norm": 0.4430346190929413,
"learning_rate": 9.995691082675908e-06,
"loss": 1.6345,
"step": 16
},
{
"epoch": 0.07327586206896551,
"grad_norm": 0.4326779544353485,
"learning_rate": 9.994135388992646e-06,
"loss": 1.6169,
"step": 17
},
{
"epoch": 0.07758620689655173,
"grad_norm": 0.4598339796066284,
"learning_rate": 9.99234055839652e-06,
"loss": 1.6345,
"step": 18
},
{
"epoch": 0.08189655172413793,
"grad_norm": 0.4327365756034851,
"learning_rate": 9.990306676830382e-06,
"loss": 1.563,
"step": 19
},
{
"epoch": 0.08620689655172414,
"grad_norm": 0.4325244128704071,
"learning_rate": 9.988033841683694e-06,
"loss": 1.632,
"step": 20
},
{
"epoch": 0.09051724137931035,
"grad_norm": 0.42453956604003906,
"learning_rate": 9.985522161787863e-06,
"loss": 1.5589,
"step": 21
},
{
"epoch": 0.09482758620689655,
"grad_norm": 0.44241735339164734,
"learning_rate": 9.982771757411032e-06,
"loss": 1.5412,
"step": 22
},
{
"epoch": 0.09913793103448276,
"grad_norm": 0.4151982367038727,
"learning_rate": 9.979782760252312e-06,
"loss": 1.6149,
"step": 23
},
{
"epoch": 0.10344827586206896,
"grad_norm": 0.43418580293655396,
"learning_rate": 9.97655531343549e-06,
"loss": 1.6115,
"step": 24
},
{
"epoch": 0.10775862068965517,
"grad_norm": 0.407058984041214,
"learning_rate": 9.973089571502163e-06,
"loss": 1.5269,
"step": 25
},
{
"epoch": 0.11206896551724138,
"grad_norm": 0.40243563055992126,
"learning_rate": 9.969385700404346e-06,
"loss": 1.6206,
"step": 26
},
{
"epoch": 0.11637931034482758,
"grad_norm": 0.40526947379112244,
"learning_rate": 9.965443877496522e-06,
"loss": 1.5815,
"step": 27
},
{
"epoch": 0.1206896551724138,
"grad_norm": 0.3972105085849762,
"learning_rate": 9.96126429152715e-06,
"loss": 1.5605,
"step": 28
},
{
"epoch": 0.125,
"grad_norm": 0.38185665011405945,
"learning_rate": 9.95684714262963e-06,
"loss": 1.5652,
"step": 29
},
{
"epoch": 0.12931034482758622,
"grad_norm": 0.4035002887248993,
"learning_rate": 9.952192642312713e-06,
"loss": 1.4807,
"step": 30
},
{
"epoch": 0.1336206896551724,
"grad_norm": 0.3860902488231659,
"learning_rate": 9.94730101345038e-06,
"loss": 1.4752,
"step": 31
},
{
"epoch": 0.13793103448275862,
"grad_norm": 0.40458953380584717,
"learning_rate": 9.942172490271169e-06,
"loss": 1.5132,
"step": 32
},
{
"epoch": 0.14224137931034483,
"grad_norm": 0.3780922293663025,
"learning_rate": 9.936807318346959e-06,
"loss": 1.4436,
"step": 33
},
{
"epoch": 0.14655172413793102,
"grad_norm": 0.3768951892852783,
"learning_rate": 9.931205754581203e-06,
"loss": 1.461,
"step": 34
},
{
"epoch": 0.15086206896551724,
"grad_norm": 0.40286195278167725,
"learning_rate": 9.925368067196644e-06,
"loss": 1.4718,
"step": 35
},
{
"epoch": 0.15517241379310345,
"grad_norm": 0.3668968081474304,
"learning_rate": 9.919294535722452e-06,
"loss": 1.4031,
"step": 36
},
{
"epoch": 0.15948275862068967,
"grad_norm": 0.3690381944179535,
"learning_rate": 9.912985450980853e-06,
"loss": 1.5063,
"step": 37
},
{
"epoch": 0.16379310344827586,
"grad_norm": 0.3745856285095215,
"learning_rate": 9.9064411150732e-06,
"loss": 1.4788,
"step": 38
},
{
"epoch": 0.16810344827586207,
"grad_norm": 0.3808038532733917,
"learning_rate": 9.899661841365502e-06,
"loss": 1.4621,
"step": 39
},
{
"epoch": 0.1724137931034483,
"grad_norm": 0.3296118378639221,
"learning_rate": 9.892647954473425e-06,
"loss": 1.3765,
"step": 40
},
{
"epoch": 0.17672413793103448,
"grad_norm": 0.3598046898841858,
"learning_rate": 9.885399790246746e-06,
"loss": 1.3972,
"step": 41
},
{
"epoch": 0.1810344827586207,
"grad_norm": 0.3617996871471405,
"learning_rate": 9.877917695753275e-06,
"loss": 1.3881,
"step": 42
},
{
"epoch": 0.1853448275862069,
"grad_norm": 0.35454022884368896,
"learning_rate": 9.870202029262228e-06,
"loss": 1.3877,
"step": 43
},
{
"epoch": 0.1896551724137931,
"grad_norm": 0.35556507110595703,
"learning_rate": 9.862253160227077e-06,
"loss": 1.3745,
"step": 44
},
{
"epoch": 0.1939655172413793,
"grad_norm": 0.33433303236961365,
"learning_rate": 9.85407146926786e-06,
"loss": 1.4469,
"step": 45
},
{
"epoch": 0.19827586206896552,
"grad_norm": 0.36582285165786743,
"learning_rate": 9.845657348152958e-06,
"loss": 1.4056,
"step": 46
},
{
"epoch": 0.2025862068965517,
"grad_norm": 0.3496723175048828,
"learning_rate": 9.837011199780325e-06,
"loss": 1.3943,
"step": 47
},
{
"epoch": 0.20689655172413793,
"grad_norm": 0.36036989092826843,
"learning_rate": 9.828133438158206e-06,
"loss": 1.4165,
"step": 48
},
{
"epoch": 0.21120689655172414,
"grad_norm": 0.34691792726516724,
"learning_rate": 9.819024488385314e-06,
"loss": 1.3675,
"step": 49
},
{
"epoch": 0.21551724137931033,
"grad_norm": 0.331584095954895,
"learning_rate": 9.809684786630462e-06,
"loss": 1.4028,
"step": 50
},
{
"epoch": 0.21982758620689655,
"grad_norm": 0.32271715998649597,
"learning_rate": 9.800114780111694e-06,
"loss": 1.3344,
"step": 51
},
{
"epoch": 0.22413793103448276,
"grad_norm": 0.3648192584514618,
"learning_rate": 9.79031492707486e-06,
"loss": 1.3383,
"step": 52
},
{
"epoch": 0.22844827586206898,
"grad_norm": 0.3334115743637085,
"learning_rate": 9.780285696771675e-06,
"loss": 1.3185,
"step": 53
},
{
"epoch": 0.23275862068965517,
"grad_norm": 0.3398106098175049,
"learning_rate": 9.770027569437252e-06,
"loss": 1.3564,
"step": 54
},
{
"epoch": 0.23706896551724138,
"grad_norm": 0.3277662694454193,
"learning_rate": 9.759541036267106e-06,
"loss": 1.4009,
"step": 55
},
{
"epoch": 0.2413793103448276,
"grad_norm": 0.30069175362586975,
"learning_rate": 9.748826599393632e-06,
"loss": 1.3393,
"step": 56
},
{
"epoch": 0.24568965517241378,
"grad_norm": 0.2870045006275177,
"learning_rate": 9.737884771862065e-06,
"loss": 1.3647,
"step": 57
},
{
"epoch": 0.25,
"grad_norm": 0.299041211605072,
"learning_rate": 9.72671607760591e-06,
"loss": 1.2942,
"step": 58
},
{
"epoch": 0.25,
"eval_loss": 1.3614733219146729,
"eval_runtime": 38.8235,
"eval_samples_per_second": 11.411,
"eval_steps_per_second": 1.442,
"step": 58
},
{
"epoch": 0.2543103448275862,
"grad_norm": 0.3013390302658081,
"learning_rate": 9.715321051421853e-06,
"loss": 1.3405,
"step": 59
},
{
"epoch": 0.25862068965517243,
"grad_norm": 0.28875967860221863,
"learning_rate": 9.703700238944157e-06,
"loss": 1.2537,
"step": 60
},
{
"epoch": 0.2629310344827586,
"grad_norm": 0.3158300817012787,
"learning_rate": 9.691854196618538e-06,
"loss": 1.3112,
"step": 61
},
{
"epoch": 0.2672413793103448,
"grad_norm": 0.284329891204834,
"learning_rate": 9.679783491675507e-06,
"loss": 1.2891,
"step": 62
},
{
"epoch": 0.27155172413793105,
"grad_norm": 0.2802349925041199,
"learning_rate": 9.667488702103222e-06,
"loss": 1.2899,
"step": 63
},
{
"epoch": 0.27586206896551724,
"grad_norm": 0.28650611639022827,
"learning_rate": 9.654970416619814e-06,
"loss": 1.307,
"step": 64
},
{
"epoch": 0.2801724137931034,
"grad_norm": 0.2946489751338959,
"learning_rate": 9.642229234645177e-06,
"loss": 1.313,
"step": 65
},
{
"epoch": 0.28448275862068967,
"grad_norm": 0.280954509973526,
"learning_rate": 9.629265766272293e-06,
"loss": 1.3074,
"step": 66
},
{
"epoch": 0.28879310344827586,
"grad_norm": 0.27372896671295166,
"learning_rate": 9.616080632237999e-06,
"loss": 1.2904,
"step": 67
},
{
"epoch": 0.29310344827586204,
"grad_norm": 0.2814270853996277,
"learning_rate": 9.602674463893266e-06,
"loss": 1.2482,
"step": 68
},
{
"epoch": 0.2974137931034483,
"grad_norm": 0.2719290256500244,
"learning_rate": 9.589047903172981e-06,
"loss": 1.2528,
"step": 69
},
{
"epoch": 0.3017241379310345,
"grad_norm": 0.2652990520000458,
"learning_rate": 9.575201602565192e-06,
"loss": 1.2582,
"step": 70
},
{
"epoch": 0.30603448275862066,
"grad_norm": 0.28665879368782043,
"learning_rate": 9.561136225079874e-06,
"loss": 1.2454,
"step": 71
},
{
"epoch": 0.3103448275862069,
"grad_norm": 0.2868441045284271,
"learning_rate": 9.54685244421718e-06,
"loss": 1.2629,
"step": 72
},
{
"epoch": 0.3146551724137931,
"grad_norm": 0.2746320366859436,
"learning_rate": 9.532350943935194e-06,
"loss": 1.2539,
"step": 73
},
{
"epoch": 0.31896551724137934,
"grad_norm": 0.2876338064670563,
"learning_rate": 9.517632418617173e-06,
"loss": 1.2566,
"step": 74
},
{
"epoch": 0.3232758620689655,
"grad_norm": 0.2622097134590149,
"learning_rate": 9.502697573038309e-06,
"loss": 1.2199,
"step": 75
},
{
"epoch": 0.3275862068965517,
"grad_norm": 0.2769118845462799,
"learning_rate": 9.487547122331965e-06,
"loss": 1.2564,
"step": 76
},
{
"epoch": 0.33189655172413796,
"grad_norm": 0.2650173008441925,
"learning_rate": 9.47218179195545e-06,
"loss": 1.2385,
"step": 77
},
{
"epoch": 0.33620689655172414,
"grad_norm": 0.2626594603061676,
"learning_rate": 9.456602317655274e-06,
"loss": 1.1074,
"step": 78
},
{
"epoch": 0.34051724137931033,
"grad_norm": 0.2777866721153259,
"learning_rate": 9.440809445431914e-06,
"loss": 1.2219,
"step": 79
},
{
"epoch": 0.3448275862068966,
"grad_norm": 0.27093663811683655,
"learning_rate": 9.424803931504095e-06,
"loss": 1.2315,
"step": 80
},
{
"epoch": 0.34913793103448276,
"grad_norm": 0.23614566028118134,
"learning_rate": 9.408586542272588e-06,
"loss": 1.1969,
"step": 81
},
{
"epoch": 0.35344827586206895,
"grad_norm": 0.25952041149139404,
"learning_rate": 9.392158054283497e-06,
"loss": 1.2439,
"step": 82
},
{
"epoch": 0.3577586206896552,
"grad_norm": 0.295060396194458,
"learning_rate": 9.375519254191088e-06,
"loss": 1.2369,
"step": 83
},
{
"epoch": 0.3620689655172414,
"grad_norm": 0.2887714207172394,
"learning_rate": 9.358670938720114e-06,
"loss": 1.2503,
"step": 84
},
{
"epoch": 0.36637931034482757,
"grad_norm": 0.26783010363578796,
"learning_rate": 9.341613914627667e-06,
"loss": 1.2057,
"step": 85
},
{
"epoch": 0.3706896551724138,
"grad_norm": 0.26204803586006165,
"learning_rate": 9.32434899866455e-06,
"loss": 1.1715,
"step": 86
},
{
"epoch": 0.375,
"grad_norm": 0.2795032262802124,
"learning_rate": 9.306877017536165e-06,
"loss": 1.241,
"step": 87
},
{
"epoch": 0.3793103448275862,
"grad_norm": 0.26389801502227783,
"learning_rate": 9.289198807862929e-06,
"loss": 1.1859,
"step": 88
},
{
"epoch": 0.38362068965517243,
"grad_norm": 0.25804105401039124,
"learning_rate": 9.27131521614021e-06,
"loss": 1.1952,
"step": 89
},
{
"epoch": 0.3879310344827586,
"grad_norm": 0.2644469439983368,
"learning_rate": 9.253227098697804e-06,
"loss": 1.1428,
"step": 90
},
{
"epoch": 0.3922413793103448,
"grad_norm": 0.2521233558654785,
"learning_rate": 9.234935321658916e-06,
"loss": 1.1685,
"step": 91
},
{
"epoch": 0.39655172413793105,
"grad_norm": 0.23881429433822632,
"learning_rate": 9.216440760898695e-06,
"loss": 1.1668,
"step": 92
},
{
"epoch": 0.40086206896551724,
"grad_norm": 0.2397918403148651,
"learning_rate": 9.197744302002301e-06,
"loss": 1.2153,
"step": 93
},
{
"epoch": 0.4051724137931034,
"grad_norm": 0.25894659757614136,
"learning_rate": 9.178846840222489e-06,
"loss": 1.196,
"step": 94
},
{
"epoch": 0.40948275862068967,
"grad_norm": 0.24914440512657166,
"learning_rate": 9.159749280436738e-06,
"loss": 1.2565,
"step": 95
},
{
"epoch": 0.41379310344827586,
"grad_norm": 0.25300922989845276,
"learning_rate": 9.140452537103943e-06,
"loss": 1.2124,
"step": 96
},
{
"epoch": 0.41810344827586204,
"grad_norm": 0.2863939106464386,
"learning_rate": 9.120957534220599e-06,
"loss": 1.1849,
"step": 97
},
{
"epoch": 0.4224137931034483,
"grad_norm": 0.26353657245635986,
"learning_rate": 9.101265205276581e-06,
"loss": 1.1749,
"step": 98
},
{
"epoch": 0.4267241379310345,
"grad_norm": 0.2726733684539795,
"learning_rate": 9.081376493210434e-06,
"loss": 1.1452,
"step": 99
},
{
"epoch": 0.43103448275862066,
"grad_norm": 0.2476412057876587,
"learning_rate": 9.061292350364222e-06,
"loss": 1.1881,
"step": 100
},
{
"epoch": 0.4353448275862069,
"grad_norm": 0.25747182965278625,
"learning_rate": 9.041013738437924e-06,
"loss": 1.2438,
"step": 101
},
{
"epoch": 0.4396551724137931,
"grad_norm": 0.2478281408548355,
"learning_rate": 9.020541628443395e-06,
"loss": 1.136,
"step": 102
},
{
"epoch": 0.44396551724137934,
"grad_norm": 0.28689947724342346,
"learning_rate": 8.99987700065786e-06,
"loss": 1.1827,
"step": 103
},
{
"epoch": 0.4482758620689655,
"grad_norm": 0.2534964084625244,
"learning_rate": 8.979020844576982e-06,
"loss": 1.2091,
"step": 104
},
{
"epoch": 0.4525862068965517,
"grad_norm": 0.23414653539657593,
"learning_rate": 8.95797415886747e-06,
"loss": 1.205,
"step": 105
},
{
"epoch": 0.45689655172413796,
"grad_norm": 0.26888808608055115,
"learning_rate": 8.936737951319276e-06,
"loss": 1.1838,
"step": 106
},
{
"epoch": 0.46120689655172414,
"grad_norm": 0.2613222897052765,
"learning_rate": 8.915313238797327e-06,
"loss": 1.2293,
"step": 107
},
{
"epoch": 0.46551724137931033,
"grad_norm": 0.2542130649089813,
"learning_rate": 8.893701047192832e-06,
"loss": 1.2118,
"step": 108
},
{
"epoch": 0.4698275862068966,
"grad_norm": 0.2766372561454773,
"learning_rate": 8.871902411374173e-06,
"loss": 1.1278,
"step": 109
},
{
"epoch": 0.47413793103448276,
"grad_norm": 0.2918456494808197,
"learning_rate": 8.84991837513733e-06,
"loss": 1.1893,
"step": 110
},
{
"epoch": 0.47844827586206895,
"grad_norm": 0.2817043364048004,
"learning_rate": 8.827749991155924e-06,
"loss": 1.199,
"step": 111
},
{
"epoch": 0.4827586206896552,
"grad_norm": 0.27356210350990295,
"learning_rate": 8.805398320930792e-06,
"loss": 1.146,
"step": 112
},
{
"epoch": 0.4870689655172414,
"grad_norm": 0.26965269446372986,
"learning_rate": 8.782864434739169e-06,
"loss": 1.1373,
"step": 113
},
{
"epoch": 0.49137931034482757,
"grad_norm": 0.2662865221500397,
"learning_rate": 8.760149411583436e-06,
"loss": 1.1677,
"step": 114
},
{
"epoch": 0.4956896551724138,
"grad_norm": 0.25307050347328186,
"learning_rate": 8.737254339139457e-06,
"loss": 1.1244,
"step": 115
},
{
"epoch": 0.5,
"grad_norm": 0.28164368867874146,
"learning_rate": 8.71418031370449e-06,
"loss": 1.1657,
"step": 116
},
{
"epoch": 0.5,
"eval_loss": 1.22263503074646,
"eval_runtime": 38.8381,
"eval_samples_per_second": 11.406,
"eval_steps_per_second": 1.442,
"step": 116
},
{
"epoch": 0.5043103448275862,
"grad_norm": 0.2570449709892273,
"learning_rate": 8.690928440144701e-06,
"loss": 1.1423,
"step": 117
},
{
"epoch": 0.5086206896551724,
"grad_norm": 0.27080610394477844,
"learning_rate": 8.667499831842252e-06,
"loss": 1.1379,
"step": 118
},
{
"epoch": 0.5129310344827587,
"grad_norm": 0.23152929544448853,
"learning_rate": 8.643895610641993e-06,
"loss": 1.1484,
"step": 119
},
{
"epoch": 0.5172413793103449,
"grad_norm": 0.25903040170669556,
"learning_rate": 8.62011690679774e-06,
"loss": 1.124,
"step": 120
},
{
"epoch": 0.521551724137931,
"grad_norm": 0.2430315613746643,
"learning_rate": 8.596164858918158e-06,
"loss": 1.1558,
"step": 121
},
{
"epoch": 0.5258620689655172,
"grad_norm": 0.2497812658548355,
"learning_rate": 8.572040613912241e-06,
"loss": 1.1659,
"step": 122
},
{
"epoch": 0.5301724137931034,
"grad_norm": 0.27267956733703613,
"learning_rate": 8.54774532693439e-06,
"loss": 1.1539,
"step": 123
},
{
"epoch": 0.5344827586206896,
"grad_norm": 0.2700651288032532,
"learning_rate": 8.5232801613291e-06,
"loss": 1.1786,
"step": 124
},
{
"epoch": 0.5387931034482759,
"grad_norm": 0.26759159564971924,
"learning_rate": 8.498646288575265e-06,
"loss": 1.133,
"step": 125
},
{
"epoch": 0.5431034482758621,
"grad_norm": 0.25706982612609863,
"learning_rate": 8.473844888230065e-06,
"loss": 1.1783,
"step": 126
},
{
"epoch": 0.5474137931034483,
"grad_norm": 0.2711213231086731,
"learning_rate": 8.448877147872505e-06,
"loss": 1.1887,
"step": 127
},
{
"epoch": 0.5517241379310345,
"grad_norm": 0.26545941829681396,
"learning_rate": 8.42374426304653e-06,
"loss": 1.1341,
"step": 128
},
{
"epoch": 0.5560344827586207,
"grad_norm": 0.30440106987953186,
"learning_rate": 8.398447437203799e-06,
"loss": 1.1609,
"step": 129
},
{
"epoch": 0.5603448275862069,
"grad_norm": 0.2716272473335266,
"learning_rate": 8.372987881646036e-06,
"loss": 1.1327,
"step": 130
},
{
"epoch": 0.5646551724137931,
"grad_norm": 0.27410513162612915,
"learning_rate": 8.347366815467051e-06,
"loss": 1.1144,
"step": 131
},
{
"epoch": 0.5689655172413793,
"grad_norm": 0.25738298892974854,
"learning_rate": 8.32158546549435e-06,
"loss": 1.1528,
"step": 132
},
{
"epoch": 0.5732758620689655,
"grad_norm": 0.28856974840164185,
"learning_rate": 8.295645066230396e-06,
"loss": 1.1565,
"step": 133
},
{
"epoch": 0.5775862068965517,
"grad_norm": 0.2661442160606384,
"learning_rate": 8.269546859793499e-06,
"loss": 1.162,
"step": 134
},
{
"epoch": 0.5818965517241379,
"grad_norm": 0.26686185598373413,
"learning_rate": 8.24329209585833e-06,
"loss": 1.1464,
"step": 135
},
{
"epoch": 0.5862068965517241,
"grad_norm": 0.2854245901107788,
"learning_rate": 8.216882031596098e-06,
"loss": 1.1143,
"step": 136
},
{
"epoch": 0.5905172413793104,
"grad_norm": 0.24192146956920624,
"learning_rate": 8.190317931614332e-06,
"loss": 1.1308,
"step": 137
},
{
"epoch": 0.5948275862068966,
"grad_norm": 0.23887085914611816,
"learning_rate": 8.163601067896344e-06,
"loss": 1.1545,
"step": 138
},
{
"epoch": 0.5991379310344828,
"grad_norm": 0.26864567399024963,
"learning_rate": 8.13673271974031e-06,
"loss": 1.1171,
"step": 139
},
{
"epoch": 0.603448275862069,
"grad_norm": 0.26497048139572144,
"learning_rate": 8.109714173698027e-06,
"loss": 1.1368,
"step": 140
},
{
"epoch": 0.6077586206896551,
"grad_norm": 0.24038733541965485,
"learning_rate": 8.082546723513289e-06,
"loss": 1.1512,
"step": 141
},
{
"epoch": 0.6120689655172413,
"grad_norm": 0.2721477150917053,
"learning_rate": 8.055231670059958e-06,
"loss": 1.0611,
"step": 142
},
{
"epoch": 0.6163793103448276,
"grad_norm": 0.2796621322631836,
"learning_rate": 8.027770321279654e-06,
"loss": 1.1109,
"step": 143
},
{
"epoch": 0.6206896551724138,
"grad_norm": 0.29094719886779785,
"learning_rate": 8.000163992119146e-06,
"loss": 1.1781,
"step": 144
},
{
"epoch": 0.625,
"grad_norm": 0.26753732562065125,
"learning_rate": 7.97241400446737e-06,
"loss": 1.1694,
"step": 145
},
{
"epoch": 0.6293103448275862,
"grad_norm": 0.2574915587902069,
"learning_rate": 7.944521687092143e-06,
"loss": 1.0821,
"step": 146
},
{
"epoch": 0.6336206896551724,
"grad_norm": 0.2696407735347748,
"learning_rate": 7.916488375576538e-06,
"loss": 1.1272,
"step": 147
},
{
"epoch": 0.6379310344827587,
"grad_norm": 0.29391637444496155,
"learning_rate": 7.888315412254921e-06,
"loss": 1.1787,
"step": 148
},
{
"epoch": 0.6422413793103449,
"grad_norm": 0.3649088740348816,
"learning_rate": 7.860004146148683e-06,
"loss": 1.1116,
"step": 149
},
{
"epoch": 0.646551724137931,
"grad_norm": 0.30176007747650146,
"learning_rate": 7.831555932901642e-06,
"loss": 1.1539,
"step": 150
},
{
"epoch": 0.6508620689655172,
"grad_norm": 0.25577977299690247,
"learning_rate": 7.802972134715138e-06,
"loss": 1.1014,
"step": 151
},
{
"epoch": 0.6551724137931034,
"grad_norm": 0.25991085171699524,
"learning_rate": 7.774254120282792e-06,
"loss": 1.0917,
"step": 152
},
{
"epoch": 0.6594827586206896,
"grad_norm": 0.2913319170475006,
"learning_rate": 7.745403264724973e-06,
"loss": 1.1445,
"step": 153
},
{
"epoch": 0.6637931034482759,
"grad_norm": 0.2821566164493561,
"learning_rate": 7.71642094952296e-06,
"loss": 1.1306,
"step": 154
},
{
"epoch": 0.6681034482758621,
"grad_norm": 0.28270649909973145,
"learning_rate": 7.687308562452783e-06,
"loss": 1.1326,
"step": 155
},
{
"epoch": 0.6724137931034483,
"grad_norm": 0.26017746329307556,
"learning_rate": 7.658067497518773e-06,
"loss": 1.1314,
"step": 156
},
{
"epoch": 0.6767241379310345,
"grad_norm": 0.333617240190506,
"learning_rate": 7.628699154886817e-06,
"loss": 1.1631,
"step": 157
},
{
"epoch": 0.6810344827586207,
"grad_norm": 0.26498985290527344,
"learning_rate": 7.599204940817309e-06,
"loss": 1.1347,
"step": 158
},
{
"epoch": 0.6853448275862069,
"grad_norm": 0.2794323265552521,
"learning_rate": 7.5695862675978085e-06,
"loss": 1.1213,
"step": 159
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.29901158809661865,
"learning_rate": 7.539844553475427e-06,
"loss": 1.1851,
"step": 160
},
{
"epoch": 0.6939655172413793,
"grad_norm": 0.2878389060497284,
"learning_rate": 7.509981222588909e-06,
"loss": 1.1201,
"step": 161
},
{
"epoch": 0.6982758620689655,
"grad_norm": 0.3090667426586151,
"learning_rate": 7.479997704900437e-06,
"loss": 1.1283,
"step": 162
},
{
"epoch": 0.7025862068965517,
"grad_norm": 0.3226082921028137,
"learning_rate": 7.449895436127169e-06,
"loss": 1.1655,
"step": 163
},
{
"epoch": 0.7068965517241379,
"grad_norm": 0.2934945225715637,
"learning_rate": 7.4196758576724835e-06,
"loss": 1.1251,
"step": 164
},
{
"epoch": 0.7112068965517241,
"grad_norm": 0.3038388788700104,
"learning_rate": 7.389340416556964e-06,
"loss": 1.1198,
"step": 165
},
{
"epoch": 0.7155172413793104,
"grad_norm": 0.2832164466381073,
"learning_rate": 7.358890565349106e-06,
"loss": 1.1653,
"step": 166
},
{
"epoch": 0.7198275862068966,
"grad_norm": 0.3079414367675781,
"learning_rate": 7.328327762095769e-06,
"loss": 1.0996,
"step": 167
},
{
"epoch": 0.7241379310344828,
"grad_norm": 0.29658767580986023,
"learning_rate": 7.297653470252359e-06,
"loss": 1.1167,
"step": 168
},
{
"epoch": 0.728448275862069,
"grad_norm": 0.29325637221336365,
"learning_rate": 7.266869158612743e-06,
"loss": 1.1,
"step": 169
},
{
"epoch": 0.7327586206896551,
"grad_norm": 0.2616899907588959,
"learning_rate": 7.235976301238933e-06,
"loss": 1.0893,
"step": 170
},
{
"epoch": 0.7370689655172413,
"grad_norm": 0.3306714594364166,
"learning_rate": 7.2049763773904955e-06,
"loss": 1.1118,
"step": 171
},
{
"epoch": 0.7413793103448276,
"grad_norm": 0.29951155185699463,
"learning_rate": 7.1738708714537165e-06,
"loss": 1.1483,
"step": 172
},
{
"epoch": 0.7456896551724138,
"grad_norm": 0.2952785789966583,
"learning_rate": 7.142661272870527e-06,
"loss": 1.1043,
"step": 173
},
{
"epoch": 0.75,
"grad_norm": 0.26570263504981995,
"learning_rate": 7.111349076067186e-06,
"loss": 1.1209,
"step": 174
},
{
"epoch": 0.75,
"eval_loss": 1.1805130243301392,
"eval_runtime": 38.8981,
"eval_samples_per_second": 11.389,
"eval_steps_per_second": 1.44,
"step": 174
},
{
"epoch": 0.7543103448275862,
"grad_norm": 0.32131871581077576,
"learning_rate": 7.079935780382716e-06,
"loss": 1.1453,
"step": 175
},
{
"epoch": 0.7586206896551724,
"grad_norm": 0.3396710455417633,
"learning_rate": 7.048422889997115e-06,
"loss": 1.1384,
"step": 176
},
{
"epoch": 0.7629310344827587,
"grad_norm": 0.3742856979370117,
"learning_rate": 7.016811913859325e-06,
"loss": 1.1325,
"step": 177
},
{
"epoch": 0.7672413793103449,
"grad_norm": 0.2917105555534363,
"learning_rate": 6.985104365614987e-06,
"loss": 1.0813,
"step": 178
},
{
"epoch": 0.771551724137931,
"grad_norm": 0.301831990480423,
"learning_rate": 6.953301763533951e-06,
"loss": 1.1406,
"step": 179
},
{
"epoch": 0.7758620689655172,
"grad_norm": 0.3077235817909241,
"learning_rate": 6.921405630437585e-06,
"loss": 1.0814,
"step": 180
},
{
"epoch": 0.7801724137931034,
"grad_norm": 0.271220862865448,
"learning_rate": 6.889417493625854e-06,
"loss": 1.0846,
"step": 181
},
{
"epoch": 0.7844827586206896,
"grad_norm": 0.3000980615615845,
"learning_rate": 6.857338884804185e-06,
"loss": 1.1146,
"step": 182
},
{
"epoch": 0.7887931034482759,
"grad_norm": 0.3144089877605438,
"learning_rate": 6.82517134001013e-06,
"loss": 1.097,
"step": 183
},
{
"epoch": 0.7931034482758621,
"grad_norm": 0.3334217071533203,
"learning_rate": 6.792916399539805e-06,
"loss": 1.087,
"step": 184
},
{
"epoch": 0.7974137931034483,
"grad_norm": 0.29255741834640503,
"learning_rate": 6.760575607874145e-06,
"loss": 1.093,
"step": 185
},
{
"epoch": 0.8017241379310345,
"grad_norm": 0.3156846761703491,
"learning_rate": 6.728150513604942e-06,
"loss": 1.1933,
"step": 186
},
{
"epoch": 0.8060344827586207,
"grad_norm": 0.2973058223724365,
"learning_rate": 6.6956426693607e-06,
"loss": 1.2163,
"step": 187
},
{
"epoch": 0.8103448275862069,
"grad_norm": 0.3086062967777252,
"learning_rate": 6.663053631732279e-06,
"loss": 1.0922,
"step": 188
},
{
"epoch": 0.8146551724137931,
"grad_norm": 0.3231525421142578,
"learning_rate": 6.630384961198371e-06,
"loss": 1.0915,
"step": 189
},
{
"epoch": 0.8189655172413793,
"grad_norm": 0.31745991110801697,
"learning_rate": 6.597638222050773e-06,
"loss": 1.153,
"step": 190
},
{
"epoch": 0.8232758620689655,
"grad_norm": 0.3454311788082123,
"learning_rate": 6.564814982319481e-06,
"loss": 1.1638,
"step": 191
},
{
"epoch": 0.8275862068965517,
"grad_norm": 0.3196294903755188,
"learning_rate": 6.5319168136976155e-06,
"loss": 1.0483,
"step": 192
},
{
"epoch": 0.8318965517241379,
"grad_norm": 0.30666542053222656,
"learning_rate": 6.4989452914661525e-06,
"loss": 1.0661,
"step": 193
},
{
"epoch": 0.8362068965517241,
"grad_norm": 0.31098195910453796,
"learning_rate": 6.465901994418505e-06,
"loss": 1.1715,
"step": 194
},
{
"epoch": 0.8405172413793104,
"grad_norm": 0.33826136589050293,
"learning_rate": 6.432788504784913e-06,
"loss": 1.0956,
"step": 195
},
{
"epoch": 0.8448275862068966,
"grad_norm": 0.32232627272605896,
"learning_rate": 6.399606408156688e-06,
"loss": 1.1055,
"step": 196
},
{
"epoch": 0.8491379310344828,
"grad_norm": 0.29653576016426086,
"learning_rate": 6.3663572934102915e-06,
"loss": 1.0766,
"step": 197
},
{
"epoch": 0.853448275862069,
"grad_norm": 0.2951408624649048,
"learning_rate": 6.333042752631243e-06,
"loss": 1.1221,
"step": 198
},
{
"epoch": 0.8577586206896551,
"grad_norm": 0.31561148166656494,
"learning_rate": 6.2996643810379e-06,
"loss": 1.0901,
"step": 199
},
{
"epoch": 0.8620689655172413,
"grad_norm": 0.28782910108566284,
"learning_rate": 6.266223776905062e-06,
"loss": 1.1135,
"step": 200
},
{
"epoch": 0.8663793103448276,
"grad_norm": 0.28432968258857727,
"learning_rate": 6.232722541487443e-06,
"loss": 1.1482,
"step": 201
},
{
"epoch": 0.8706896551724138,
"grad_norm": 0.30610454082489014,
"learning_rate": 6.199162278942997e-06,
"loss": 1.1433,
"step": 202
},
{
"epoch": 0.875,
"grad_norm": 0.31622040271759033,
"learning_rate": 6.165544596256109e-06,
"loss": 1.0992,
"step": 203
},
{
"epoch": 0.8793103448275862,
"grad_norm": 0.3249478042125702,
"learning_rate": 6.131871103160644e-06,
"loss": 1.0708,
"step": 204
},
{
"epoch": 0.8836206896551724,
"grad_norm": 0.28761979937553406,
"learning_rate": 6.098143412062864e-06,
"loss": 1.1509,
"step": 205
},
{
"epoch": 0.8879310344827587,
"grad_norm": 0.31169748306274414,
"learning_rate": 6.064363137964225e-06,
"loss": 1.1497,
"step": 206
},
{
"epoch": 0.8922413793103449,
"grad_norm": 0.2758077383041382,
"learning_rate": 6.030531898384045e-06,
"loss": 1.1262,
"step": 207
},
{
"epoch": 0.896551724137931,
"grad_norm": 0.3364175856113434,
"learning_rate": 5.996651313282051e-06,
"loss": 1.1345,
"step": 208
},
{
"epoch": 0.9008620689655172,
"grad_norm": 0.287631630897522,
"learning_rate": 5.962723004980804e-06,
"loss": 1.133,
"step": 209
},
{
"epoch": 0.9051724137931034,
"grad_norm": 0.29732412099838257,
"learning_rate": 5.9287485980880245e-06,
"loss": 1.124,
"step": 210
},
{
"epoch": 0.9094827586206896,
"grad_norm": 0.2940175533294678,
"learning_rate": 5.894729719418795e-06,
"loss": 1.0874,
"step": 211
},
{
"epoch": 0.9137931034482759,
"grad_norm": 0.31645113229751587,
"learning_rate": 5.860667997917668e-06,
"loss": 1.0723,
"step": 212
},
{
"epoch": 0.9181034482758621,
"grad_norm": 0.2804202437400818,
"learning_rate": 5.826565064580659e-06,
"loss": 1.0918,
"step": 213
},
{
"epoch": 0.9224137931034483,
"grad_norm": 0.34620022773742676,
"learning_rate": 5.792422552377153e-06,
"loss": 1.0861,
"step": 214
},
{
"epoch": 0.9267241379310345,
"grad_norm": 0.2878914773464203,
"learning_rate": 5.758242096171713e-06,
"loss": 1.1481,
"step": 215
},
{
"epoch": 0.9310344827586207,
"grad_norm": 0.2986612915992737,
"learning_rate": 5.724025332645794e-06,
"loss": 1.1088,
"step": 216
},
{
"epoch": 0.9353448275862069,
"grad_norm": 0.3098682761192322,
"learning_rate": 5.689773900219374e-06,
"loss": 1.131,
"step": 217
},
{
"epoch": 0.9396551724137931,
"grad_norm": 0.29040470719337463,
"learning_rate": 5.655489438972503e-06,
"loss": 1.0993,
"step": 218
},
{
"epoch": 0.9439655172413793,
"grad_norm": 0.3034652769565582,
"learning_rate": 5.6211735905667665e-06,
"loss": 1.1125,
"step": 219
},
{
"epoch": 0.9482758620689655,
"grad_norm": 0.34945690631866455,
"learning_rate": 5.586827998166678e-06,
"loss": 1.0605,
"step": 220
},
{
"epoch": 0.9525862068965517,
"grad_norm": 0.3174704313278198,
"learning_rate": 5.552454306361e-06,
"loss": 1.1404,
"step": 221
},
{
"epoch": 0.9568965517241379,
"grad_norm": 0.3093855082988739,
"learning_rate": 5.518054161083994e-06,
"loss": 1.1072,
"step": 222
},
{
"epoch": 0.9612068965517241,
"grad_norm": 0.33938005566596985,
"learning_rate": 5.483629209536609e-06,
"loss": 1.0694,
"step": 223
},
{
"epoch": 0.9655172413793104,
"grad_norm": 0.32714080810546875,
"learning_rate": 5.449181100107599e-06,
"loss": 1.0651,
"step": 224
},
{
"epoch": 0.9698275862068966,
"grad_norm": 0.30905458331108093,
"learning_rate": 5.41471148229461e-06,
"loss": 1.0764,
"step": 225
},
{
"epoch": 0.9741379310344828,
"grad_norm": 0.2898302674293518,
"learning_rate": 5.38022200662518e-06,
"loss": 1.1079,
"step": 226
},
{
"epoch": 0.978448275862069,
"grad_norm": 0.2906856834888458,
"learning_rate": 5.34571432457771e-06,
"loss": 1.0981,
"step": 227
},
{
"epoch": 0.9827586206896551,
"grad_norm": 0.2875739336013794,
"learning_rate": 5.31119008850239e-06,
"loss": 1.109,
"step": 228
},
{
"epoch": 0.9870689655172413,
"grad_norm": 0.31278446316719055,
"learning_rate": 5.2766509515420785e-06,
"loss": 1.0993,
"step": 229
},
{
"epoch": 0.9913793103448276,
"grad_norm": 0.317538857460022,
"learning_rate": 5.242098567553133e-06,
"loss": 1.1088,
"step": 230
},
{
"epoch": 0.9956896551724138,
"grad_norm": 0.2955145239830017,
"learning_rate": 5.2075345910262296e-06,
"loss": 1.0803,
"step": 231
},
{
"epoch": 1.0,
"grad_norm": 0.33417847752571106,
"learning_rate": 5.1729606770071395e-06,
"loss": 1.0785,
"step": 232
},
{
"epoch": 1.0,
"eval_loss": 1.1590964794158936,
"eval_runtime": 38.8432,
"eval_samples_per_second": 11.405,
"eval_steps_per_second": 1.442,
"step": 232
},
{
"epoch": 1.0043103448275863,
"grad_norm": 0.29942837357521057,
"learning_rate": 5.138378481017475e-06,
"loss": 1.1076,
"step": 233
},
{
"epoch": 1.0086206896551724,
"grad_norm": 0.30415764451026917,
"learning_rate": 5.103789658975413e-06,
"loss": 1.0919,
"step": 234
},
{
"epoch": 1.0129310344827587,
"grad_norm": 0.32707205414772034,
"learning_rate": 5.069195867116416e-06,
"loss": 1.0154,
"step": 235
},
{
"epoch": 1.0172413793103448,
"grad_norm": 0.3695222735404968,
"learning_rate": 5.034598761913917e-06,
"loss": 1.0878,
"step": 236
},
{
"epoch": 1.021551724137931,
"grad_norm": 0.32564374804496765,
"learning_rate": 5e-06,
"loss": 1.1174,
"step": 237
},
{
"epoch": 1.0258620689655173,
"grad_norm": 0.3132264316082001,
"learning_rate": 4.965401238086084e-06,
"loss": 1.1475,
"step": 238
},
{
"epoch": 1.0043103448275863,
"grad_norm": 0.33977454900741577,
"learning_rate": 4.930804132883584e-06,
"loss": 1.0749,
"step": 239
},
{
"epoch": 1.0086206896551724,
"grad_norm": 0.34106722474098206,
"learning_rate": 4.896210341024587e-06,
"loss": 1.0821,
"step": 240
},
{
"epoch": 1.0129310344827587,
"grad_norm": 0.3282875716686249,
"learning_rate": 4.861621518982527e-06,
"loss": 1.0379,
"step": 241
},
{
"epoch": 1.0172413793103448,
"grad_norm": 0.38841262459754944,
"learning_rate": 4.827039322992861e-06,
"loss": 1.1243,
"step": 242
},
{
"epoch": 1.021551724137931,
"grad_norm": 0.28226733207702637,
"learning_rate": 4.792465408973772e-06,
"loss": 1.11,
"step": 243
},
{
"epoch": 1.0258620689655173,
"grad_norm": 0.32415810227394104,
"learning_rate": 4.75790143244687e-06,
"loss": 1.106,
"step": 244
},
{
"epoch": 1.0301724137931034,
"grad_norm": 0.30683305859565735,
"learning_rate": 4.723349048457924e-06,
"loss": 1.0732,
"step": 245
},
{
"epoch": 1.0344827586206897,
"grad_norm": 0.352728933095932,
"learning_rate": 4.68880991149761e-06,
"loss": 1.1466,
"step": 246
},
{
"epoch": 1.0387931034482758,
"grad_norm": 0.35975515842437744,
"learning_rate": 4.654285675422293e-06,
"loss": 1.0218,
"step": 247
},
{
"epoch": 1.043103448275862,
"grad_norm": 0.27443352341651917,
"learning_rate": 4.6197779933748226e-06,
"loss": 1.0574,
"step": 248
},
{
"epoch": 1.0474137931034482,
"grad_norm": 0.37412866950035095,
"learning_rate": 4.585288517705392e-06,
"loss": 1.0348,
"step": 249
},
{
"epoch": 1.0517241379310345,
"grad_norm": 0.3572046756744385,
"learning_rate": 4.550818899892402e-06,
"loss": 1.041,
"step": 250
},
{
"epoch": 1.0560344827586208,
"grad_norm": 0.3478338420391083,
"learning_rate": 4.516370790463394e-06,
"loss": 1.0869,
"step": 251
},
{
"epoch": 1.0603448275862069,
"grad_norm": 0.344951868057251,
"learning_rate": 4.481945838916006e-06,
"loss": 1.0014,
"step": 252
},
{
"epoch": 1.0646551724137931,
"grad_norm": 0.34624016284942627,
"learning_rate": 4.447545693639e-06,
"loss": 1.0854,
"step": 253
},
{
"epoch": 1.0689655172413792,
"grad_norm": 0.3237141966819763,
"learning_rate": 4.413172001833324e-06,
"loss": 1.0955,
"step": 254
},
{
"epoch": 1.0732758620689655,
"grad_norm": 0.3042526841163635,
"learning_rate": 4.378826409433235e-06,
"loss": 1.0767,
"step": 255
},
{
"epoch": 1.0775862068965518,
"grad_norm": 0.34745466709136963,
"learning_rate": 4.344510561027498e-06,
"loss": 1.0641,
"step": 256
},
{
"epoch": 1.081896551724138,
"grad_norm": 0.3786261975765228,
"learning_rate": 4.310226099780627e-06,
"loss": 1.0999,
"step": 257
},
{
"epoch": 1.0862068965517242,
"grad_norm": 0.3591586947441101,
"learning_rate": 4.275974667354208e-06,
"loss": 1.1285,
"step": 258
},
{
"epoch": 1.0905172413793103,
"grad_norm": 0.32906004786491394,
"learning_rate": 4.241757903828288e-06,
"loss": 1.0845,
"step": 259
},
{
"epoch": 1.0948275862068966,
"grad_norm": 0.31499743461608887,
"learning_rate": 4.207577447622849e-06,
"loss": 1.0354,
"step": 260
},
{
"epoch": 1.0991379310344827,
"grad_norm": 0.34880805015563965,
"learning_rate": 4.173434935419342e-06,
"loss": 1.0957,
"step": 261
},
{
"epoch": 1.103448275862069,
"grad_norm": 0.3316938877105713,
"learning_rate": 4.139332002082333e-06,
"loss": 1.0089,
"step": 262
},
{
"epoch": 1.1077586206896552,
"grad_norm": 0.3652949631214142,
"learning_rate": 4.105270280581206e-06,
"loss": 1.1545,
"step": 263
},
{
"epoch": 1.1120689655172413,
"grad_norm": 0.33063212037086487,
"learning_rate": 4.071251401911977e-06,
"loss": 1.1157,
"step": 264
},
{
"epoch": 1.1163793103448276,
"grad_norm": 0.3272898197174072,
"learning_rate": 4.037276995019198e-06,
"loss": 1.1023,
"step": 265
},
{
"epoch": 1.1206896551724137,
"grad_norm": 0.34276431798934937,
"learning_rate": 4.00334868671795e-06,
"loss": 1.1385,
"step": 266
},
{
"epoch": 1.125,
"grad_norm": 0.33519965410232544,
"learning_rate": 3.969468101615956e-06,
"loss": 1.0816,
"step": 267
},
{
"epoch": 1.1293103448275863,
"grad_norm": 0.29600027203559875,
"learning_rate": 3.935636862035776e-06,
"loss": 1.0973,
"step": 268
},
{
"epoch": 1.1336206896551724,
"grad_norm": 0.3042546510696411,
"learning_rate": 3.901856587937138e-06,
"loss": 1.1236,
"step": 269
},
{
"epoch": 1.1379310344827587,
"grad_norm": 0.3240540027618408,
"learning_rate": 3.868128896839357e-06,
"loss": 1.047,
"step": 270
},
{
"epoch": 1.1422413793103448,
"grad_norm": 0.3279811143875122,
"learning_rate": 3.834455403743892e-06,
"loss": 1.0486,
"step": 271
},
{
"epoch": 1.146551724137931,
"grad_norm": 0.38778162002563477,
"learning_rate": 3.8008377210570045e-06,
"loss": 1.0099,
"step": 272
},
{
"epoch": 1.1508620689655173,
"grad_norm": 0.3080158233642578,
"learning_rate": 3.76727745851256e-06,
"loss": 1.0861,
"step": 273
},
{
"epoch": 1.1551724137931034,
"grad_norm": 0.3064788579940796,
"learning_rate": 3.7337762230949397e-06,
"loss": 1.0795,
"step": 274
},
{
"epoch": 1.1594827586206897,
"grad_norm": 0.3312165141105652,
"learning_rate": 3.700335618962101e-06,
"loss": 1.0781,
"step": 275
},
{
"epoch": 1.1637931034482758,
"grad_norm": 0.3970606327056885,
"learning_rate": 3.6669572473687577e-06,
"loss": 1.0798,
"step": 276
},
{
"epoch": 1.168103448275862,
"grad_norm": 0.35045385360717773,
"learning_rate": 3.6336427065897106e-06,
"loss": 1.1505,
"step": 277
},
{
"epoch": 1.1724137931034484,
"grad_norm": 0.3263322114944458,
"learning_rate": 3.6003935918433124e-06,
"loss": 1.0427,
"step": 278
},
{
"epoch": 1.1767241379310345,
"grad_norm": 0.3410392999649048,
"learning_rate": 3.567211495215088e-06,
"loss": 1.0789,
"step": 279
},
{
"epoch": 1.1810344827586208,
"grad_norm": 0.33315184712409973,
"learning_rate": 3.534098005581497e-06,
"loss": 1.0666,
"step": 280
},
{
"epoch": 1.1853448275862069,
"grad_norm": 0.34163913130760193,
"learning_rate": 3.5010547085338487e-06,
"loss": 1.0769,
"step": 281
},
{
"epoch": 1.1896551724137931,
"grad_norm": 0.3381548225879669,
"learning_rate": 3.4680831863023866e-06,
"loss": 1.074,
"step": 282
},
{
"epoch": 1.1939655172413792,
"grad_norm": 0.3141576647758484,
"learning_rate": 3.43518501768052e-06,
"loss": 1.0957,
"step": 283
},
{
"epoch": 1.1982758620689655,
"grad_norm": 0.2855520248413086,
"learning_rate": 3.402361777949229e-06,
"loss": 1.1114,
"step": 284
},
{
"epoch": 1.2025862068965516,
"grad_norm": 0.3210565745830536,
"learning_rate": 3.3696150388016295e-06,
"loss": 1.1209,
"step": 285
},
{
"epoch": 1.206896551724138,
"grad_norm": 0.3479798138141632,
"learning_rate": 3.336946368267724e-06,
"loss": 1.033,
"step": 286
},
{
"epoch": 1.2112068965517242,
"grad_norm": 0.3301966190338135,
"learning_rate": 3.304357330639303e-06,
"loss": 1.1279,
"step": 287
},
{
"epoch": 1.2155172413793103,
"grad_norm": 0.3064906895160675,
"learning_rate": 3.271849486395059e-06,
"loss": 1.1006,
"step": 288
},
{
"epoch": 1.2198275862068966,
"grad_norm": 0.30745407938957214,
"learning_rate": 3.2394243921258566e-06,
"loss": 1.0949,
"step": 289
},
{
"epoch": 1.2241379310344827,
"grad_norm": 0.31708669662475586,
"learning_rate": 3.207083600460196e-06,
"loss": 1.0999,
"step": 290
},
{
"epoch": 1.2241379310344827,
"eval_loss": 1.1474053859710693,
"eval_runtime": 38.8873,
"eval_samples_per_second": 11.392,
"eval_steps_per_second": 1.44,
"step": 290
},
{
"epoch": 1.228448275862069,
"grad_norm": 0.35755884647369385,
"learning_rate": 3.174828659989871e-06,
"loss": 1.0781,
"step": 291
},
{
"epoch": 1.2327586206896552,
"grad_norm": 0.3275066316127777,
"learning_rate": 3.1426611151958146e-06,
"loss": 1.1165,
"step": 292
},
{
"epoch": 1.2370689655172413,
"grad_norm": 0.3440989851951599,
"learning_rate": 3.1105825063741486e-06,
"loss": 1.1444,
"step": 293
},
{
"epoch": 1.2413793103448276,
"grad_norm": 0.3393403887748718,
"learning_rate": 3.078594369562417e-06,
"loss": 1.0563,
"step": 294
},
{
"epoch": 1.2456896551724137,
"grad_norm": 0.3146829307079315,
"learning_rate": 3.0466982364660514e-06,
"loss": 1.0335,
"step": 295
},
{
"epoch": 1.25,
"grad_norm": 0.30042600631713867,
"learning_rate": 3.0148956343850143e-06,
"loss": 1.0717,
"step": 296
},
{
"epoch": 1.2543103448275863,
"grad_norm": 0.3386390209197998,
"learning_rate": 2.9831880861406747e-06,
"loss": 1.1232,
"step": 297
},
{
"epoch": 1.2586206896551724,
"grad_norm": 0.3249225914478302,
"learning_rate": 2.9515771100028854e-06,
"loss": 1.1093,
"step": 298
},
{
"epoch": 1.2629310344827587,
"grad_norm": 0.3327488899230957,
"learning_rate": 2.9200642196172855e-06,
"loss": 1.0943,
"step": 299
},
{
"epoch": 1.2672413793103448,
"grad_norm": 0.32572102546691895,
"learning_rate": 2.888650923932815e-06,
"loss": 1.0842,
"step": 300
},
{
"epoch": 1.271551724137931,
"grad_norm": 0.3248356580734253,
"learning_rate": 2.8573387271294734e-06,
"loss": 1.1394,
"step": 301
},
{
"epoch": 1.2758620689655173,
"grad_norm": 0.33868080377578735,
"learning_rate": 2.8261291285462843e-06,
"loss": 1.059,
"step": 302
},
{
"epoch": 1.2801724137931034,
"grad_norm": 0.34686562418937683,
"learning_rate": 2.7950236226095044e-06,
"loss": 1.0545,
"step": 303
},
{
"epoch": 1.2844827586206897,
"grad_norm": 0.3430519700050354,
"learning_rate": 2.7640236987610662e-06,
"loss": 1.0411,
"step": 304
},
{
"epoch": 1.2887931034482758,
"grad_norm": 0.36565178632736206,
"learning_rate": 2.7331308413872593e-06,
"loss": 1.1237,
"step": 305
},
{
"epoch": 1.293103448275862,
"grad_norm": 0.31981322169303894,
"learning_rate": 2.7023465297476426e-06,
"loss": 1.0985,
"step": 306
},
{
"epoch": 1.2974137931034484,
"grad_norm": 0.33923840522766113,
"learning_rate": 2.6716722379042303e-06,
"loss": 1.0989,
"step": 307
},
{
"epoch": 1.3017241379310345,
"grad_norm": 0.311847984790802,
"learning_rate": 2.641109434650894e-06,
"loss": 1.0775,
"step": 308
},
{
"epoch": 1.3060344827586206,
"grad_norm": 0.34916892647743225,
"learning_rate": 2.6106595834430366e-06,
"loss": 1.0737,
"step": 309
},
{
"epoch": 1.3103448275862069,
"grad_norm": 0.35223567485809326,
"learning_rate": 2.580324142327516e-06,
"loss": 0.9688,
"step": 310
},
{
"epoch": 1.3146551724137931,
"grad_norm": 0.2887548804283142,
"learning_rate": 2.5501045638728307e-06,
"loss": 1.0723,
"step": 311
},
{
"epoch": 1.3189655172413794,
"grad_norm": 0.3438979685306549,
"learning_rate": 2.520002295099564e-06,
"loss": 1.0939,
"step": 312
},
{
"epoch": 1.3232758620689655,
"grad_norm": 0.3115052282810211,
"learning_rate": 2.4900187774110923e-06,
"loss": 1.0921,
"step": 313
},
{
"epoch": 1.3275862068965516,
"grad_norm": 0.3779308795928955,
"learning_rate": 2.460155446524573e-06,
"loss": 1.1631,
"step": 314
},
{
"epoch": 1.331896551724138,
"grad_norm": 0.305646151304245,
"learning_rate": 2.4304137324021915e-06,
"loss": 1.0457,
"step": 315
},
{
"epoch": 1.3362068965517242,
"grad_norm": 0.356973260641098,
"learning_rate": 2.400795059182692e-06,
"loss": 1.0929,
"step": 316
},
{
"epoch": 1.3405172413793103,
"grad_norm": 0.36015981435775757,
"learning_rate": 2.371300845113182e-06,
"loss": 1.0143,
"step": 317
},
{
"epoch": 1.3448275862068966,
"grad_norm": 0.38384366035461426,
"learning_rate": 2.341932502481226e-06,
"loss": 1.0644,
"step": 318
},
{
"epoch": 1.3491379310344827,
"grad_norm": 0.3105123043060303,
"learning_rate": 2.3126914375472185e-06,
"loss": 1.0738,
"step": 319
},
{
"epoch": 1.353448275862069,
"grad_norm": 0.32959380745887756,
"learning_rate": 2.283579050477042e-06,
"loss": 1.0785,
"step": 320
},
{
"epoch": 1.3577586206896552,
"grad_norm": 0.3154717683792114,
"learning_rate": 2.254596735275028e-06,
"loss": 1.0562,
"step": 321
},
{
"epoch": 1.3620689655172413,
"grad_norm": 0.32698148488998413,
"learning_rate": 2.2257458797172093e-06,
"loss": 1.0658,
"step": 322
},
{
"epoch": 1.3663793103448276,
"grad_norm": 0.3278767764568329,
"learning_rate": 2.1970278652848615e-06,
"loss": 1.0972,
"step": 323
},
{
"epoch": 1.3706896551724137,
"grad_norm": 0.3235674500465393,
"learning_rate": 2.1684440670983568e-06,
"loss": 1.0611,
"step": 324
},
{
"epoch": 1.375,
"grad_norm": 0.398926705121994,
"learning_rate": 2.1399958538513197e-06,
"loss": 1.0892,
"step": 325
},
{
"epoch": 1.3793103448275863,
"grad_norm": 0.35341110825538635,
"learning_rate": 2.111684587745081e-06,
"loss": 1.0991,
"step": 326
},
{
"epoch": 1.3836206896551724,
"grad_norm": 0.36299997568130493,
"learning_rate": 2.0835116244234625e-06,
"loss": 1.0954,
"step": 327
},
{
"epoch": 1.3879310344827587,
"grad_norm": 0.3728746175765991,
"learning_rate": 2.0554783129078564e-06,
"loss": 1.0784,
"step": 328
},
{
"epoch": 1.3922413793103448,
"grad_norm": 0.3647480905056,
"learning_rate": 2.027585995532631e-06,
"loss": 1.0874,
"step": 329
},
{
"epoch": 1.396551724137931,
"grad_norm": 0.3591470718383789,
"learning_rate": 1.9998360078808547e-06,
"loss": 1.0999,
"step": 330
},
{
"epoch": 1.4008620689655173,
"grad_norm": 0.3867989480495453,
"learning_rate": 1.972229678720346e-06,
"loss": 1.0829,
"step": 331
},
{
"epoch": 1.4051724137931034,
"grad_norm": 0.3600481450557709,
"learning_rate": 1.944768329940045e-06,
"loss": 1.0679,
"step": 332
},
{
"epoch": 1.4094827586206897,
"grad_norm": 0.35227489471435547,
"learning_rate": 1.917453276486712e-06,
"loss": 1.1076,
"step": 333
},
{
"epoch": 1.4137931034482758,
"grad_norm": 0.34222954511642456,
"learning_rate": 1.8902858263019746e-06,
"loss": 1.0673,
"step": 334
},
{
"epoch": 1.418103448275862,
"grad_norm": 0.3176562488079071,
"learning_rate": 1.8632672802596907e-06,
"loss": 1.0518,
"step": 335
},
{
"epoch": 1.4224137931034484,
"grad_norm": 0.3231894075870514,
"learning_rate": 1.836398932103658e-06,
"loss": 1.032,
"step": 336
},
{
"epoch": 1.4267241379310345,
"grad_norm": 0.3324163258075714,
"learning_rate": 1.8096820683856687e-06,
"loss": 1.0684,
"step": 337
},
{
"epoch": 1.4310344827586206,
"grad_norm": 0.32313287258148193,
"learning_rate": 1.7831179684039041e-06,
"loss": 1.0558,
"step": 338
},
{
"epoch": 1.4353448275862069,
"grad_norm": 0.2836117148399353,
"learning_rate": 1.7567079041416706e-06,
"loss": 1.097,
"step": 339
},
{
"epoch": 1.4396551724137931,
"grad_norm": 0.3591717779636383,
"learning_rate": 1.7304531402065033e-06,
"loss": 1.0722,
"step": 340
},
{
"epoch": 1.4439655172413794,
"grad_norm": 0.3194042146205902,
"learning_rate": 1.7043549337696053e-06,
"loss": 1.0532,
"step": 341
},
{
"epoch": 1.4482758620689655,
"grad_norm": 0.3645802438259125,
"learning_rate": 1.6784145345056519e-06,
"loss": 1.0791,
"step": 342
},
{
"epoch": 1.4525862068965516,
"grad_norm": 0.3621852397918701,
"learning_rate": 1.6526331845329508e-06,
"loss": 1.074,
"step": 343
},
{
"epoch": 1.456896551724138,
"grad_norm": 0.3067796528339386,
"learning_rate": 1.627012118353965e-06,
"loss": 1.0218,
"step": 344
},
{
"epoch": 1.4612068965517242,
"grad_norm": 0.3126223385334015,
"learning_rate": 1.6015525627962041e-06,
"loss": 1.0597,
"step": 345
},
{
"epoch": 1.4655172413793103,
"grad_norm": 0.34759384393692017,
"learning_rate": 1.5762557369534709e-06,
"loss": 1.0818,
"step": 346
},
{
"epoch": 1.4698275862068966,
"grad_norm": 0.31952786445617676,
"learning_rate": 1.5511228521274973e-06,
"loss": 1.065,
"step": 347
},
{
"epoch": 1.4741379310344827,
"grad_norm": 0.31419798731803894,
"learning_rate": 1.5261551117699358e-06,
"loss": 1.1012,
"step": 348
},
{
"epoch": 1.4741379310344827,
"eval_loss": 1.1407095193862915,
"eval_runtime": 38.8848,
"eval_samples_per_second": 11.393,
"eval_steps_per_second": 1.44,
"step": 348
},
{
"epoch": 1.478448275862069,
"grad_norm": 0.37249940633773804,
"learning_rate": 1.5013537114247362e-06,
"loss": 1.0744,
"step": 349
},
{
"epoch": 1.4827586206896552,
"grad_norm": 0.32196739315986633,
"learning_rate": 1.4767198386708998e-06,
"loss": 1.0961,
"step": 350
},
{
"epoch": 1.4870689655172413,
"grad_norm": 0.35418596863746643,
"learning_rate": 1.452254673065613e-06,
"loss": 1.1137,
"step": 351
},
{
"epoch": 1.4913793103448276,
"grad_norm": 0.3167632520198822,
"learning_rate": 1.427959386087761e-06,
"loss": 1.1096,
"step": 352
},
{
"epoch": 1.4956896551724137,
"grad_norm": 0.33386534452438354,
"learning_rate": 1.4038351410818434e-06,
"loss": 1.0794,
"step": 353
},
{
"epoch": 1.5,
"grad_norm": 0.3341921865940094,
"learning_rate": 1.3798830932022616e-06,
"loss": 1.1006,
"step": 354
},
{
"epoch": 1.5043103448275863,
"grad_norm": 0.3526374101638794,
"learning_rate": 1.3561043893580084e-06,
"loss": 1.0801,
"step": 355
},
{
"epoch": 1.5086206896551724,
"grad_norm": 0.31550395488739014,
"learning_rate": 1.3325001681577482e-06,
"loss": 1.0564,
"step": 356
},
{
"epoch": 1.5129310344827587,
"grad_norm": 0.35600364208221436,
"learning_rate": 1.3090715598553e-06,
"loss": 1.1088,
"step": 357
},
{
"epoch": 1.5172413793103448,
"grad_norm": 0.36724480986595154,
"learning_rate": 1.2858196862955108e-06,
"loss": 1.0891,
"step": 358
},
{
"epoch": 1.521551724137931,
"grad_norm": 0.35326990485191345,
"learning_rate": 1.2627456608605442e-06,
"loss": 1.0839,
"step": 359
},
{
"epoch": 1.5258620689655173,
"grad_norm": 0.3355065882205963,
"learning_rate": 1.2398505884165652e-06,
"loss": 1.1194,
"step": 360
},
{
"epoch": 1.5301724137931034,
"grad_norm": 0.3283027708530426,
"learning_rate": 1.217135565260833e-06,
"loss": 1.0854,
"step": 361
},
{
"epoch": 1.5344827586206895,
"grad_norm": 0.31447839736938477,
"learning_rate": 1.1946016790692094e-06,
"loss": 1.0984,
"step": 362
},
{
"epoch": 1.5387931034482758,
"grad_norm": 0.32135719060897827,
"learning_rate": 1.172250008844077e-06,
"loss": 1.1079,
"step": 363
},
{
"epoch": 1.543103448275862,
"grad_norm": 0.33316370844841003,
"learning_rate": 1.1500816248626711e-06,
"loss": 1.0484,
"step": 364
},
{
"epoch": 1.5474137931034484,
"grad_norm": 0.3206438720226288,
"learning_rate": 1.1280975886258294e-06,
"loss": 1.0532,
"step": 365
},
{
"epoch": 1.5517241379310345,
"grad_norm": 0.3373902440071106,
"learning_rate": 1.1062989528071683e-06,
"loss": 1.0786,
"step": 366
},
{
"epoch": 1.5560344827586206,
"grad_norm": 0.33597832918167114,
"learning_rate": 1.0846867612026746e-06,
"loss": 1.153,
"step": 367
},
{
"epoch": 1.5603448275862069,
"grad_norm": 0.34479406476020813,
"learning_rate": 1.0632620486807244e-06,
"loss": 1.0782,
"step": 368
},
{
"epoch": 1.5646551724137931,
"grad_norm": 0.37828493118286133,
"learning_rate": 1.0420258411325308e-06,
"loss": 1.0208,
"step": 369
},
{
"epoch": 1.5689655172413794,
"grad_norm": 0.32772085070610046,
"learning_rate": 1.0209791554230209e-06,
"loss": 1.1206,
"step": 370
},
{
"epoch": 1.5732758620689655,
"grad_norm": 0.3135930299758911,
"learning_rate": 1.0001229993421412e-06,
"loss": 1.0934,
"step": 371
},
{
"epoch": 1.5775862068965516,
"grad_norm": 0.29522621631622314,
"learning_rate": 9.79458371556607e-07,
"loss": 1.1179,
"step": 372
},
{
"epoch": 1.581896551724138,
"grad_norm": 0.3688337802886963,
"learning_rate": 9.589862615620782e-07,
"loss": 1.0647,
"step": 373
},
{
"epoch": 1.5862068965517242,
"grad_norm": 0.30966484546661377,
"learning_rate": 9.387076496357805e-07,
"loss": 1.0978,
"step": 374
},
{
"epoch": 1.5905172413793105,
"grad_norm": 0.30171751976013184,
"learning_rate": 9.186235067895672e-07,
"loss": 1.0483,
"step": 375
},
{
"epoch": 1.5948275862068966,
"grad_norm": 0.370378315448761,
"learning_rate": 8.987347947234193e-07,
"loss": 1.068,
"step": 376
},
{
"epoch": 1.5991379310344827,
"grad_norm": 0.3673414885997772,
"learning_rate": 8.790424657794034e-07,
"loss": 1.0745,
"step": 377
},
{
"epoch": 1.603448275862069,
"grad_norm": 0.3022395968437195,
"learning_rate": 8.595474628960598e-07,
"loss": 1.0693,
"step": 378
},
{
"epoch": 1.6077586206896552,
"grad_norm": 0.3871013820171356,
"learning_rate": 8.402507195632625e-07,
"loss": 1.0281,
"step": 379
},
{
"epoch": 1.6120689655172413,
"grad_norm": 0.3459872901439667,
"learning_rate": 8.211531597775136e-07,
"loss": 1.0435,
"step": 380
},
{
"epoch": 1.6163793103448276,
"grad_norm": 0.3100816309452057,
"learning_rate": 8.022556979976992e-07,
"loss": 1.1093,
"step": 381
},
{
"epoch": 1.6206896551724137,
"grad_norm": 0.32473233342170715,
"learning_rate": 7.835592391013053e-07,
"loss": 1.0207,
"step": 382
},
{
"epoch": 1.625,
"grad_norm": 0.3091509938240051,
"learning_rate": 7.650646783410875e-07,
"loss": 1.1253,
"step": 383
},
{
"epoch": 1.6293103448275863,
"grad_norm": 0.3101778030395508,
"learning_rate": 7.467729013021979e-07,
"loss": 1.0307,
"step": 384
},
{
"epoch": 1.6336206896551724,
"grad_norm": 0.31698641180992126,
"learning_rate": 7.286847838597905e-07,
"loss": 1.064,
"step": 385
},
{
"epoch": 1.6379310344827587,
"grad_norm": 0.34377050399780273,
"learning_rate": 7.108011921370728e-07,
"loss": 1.0792,
"step": 386
},
{
"epoch": 1.6422413793103448,
"grad_norm": 0.3562955856323242,
"learning_rate": 6.931229824638358e-07,
"loss": 1.1161,
"step": 387
},
{
"epoch": 1.646551724137931,
"grad_norm": 0.3532198965549469,
"learning_rate": 6.756510013354512e-07,
"loss": 1.0326,
"step": 388
},
{
"epoch": 1.6508620689655173,
"grad_norm": 0.3240654468536377,
"learning_rate": 6.583860853723339e-07,
"loss": 1.0902,
"step": 389
},
{
"epoch": 1.6551724137931034,
"grad_norm": 0.3335769474506378,
"learning_rate": 6.413290612798883e-07,
"loss": 1.0795,
"step": 390
},
{
"epoch": 1.6594827586206895,
"grad_norm": 0.3507966995239258,
"learning_rate": 6.24480745808913e-07,
"loss": 1.0312,
"step": 391
},
{
"epoch": 1.6637931034482758,
"grad_norm": 0.3086683750152588,
"learning_rate": 6.078419457165036e-07,
"loss": 1.046,
"step": 392
},
{
"epoch": 1.668103448275862,
"grad_norm": 0.33030492067337036,
"learning_rate": 5.914134577274122e-07,
"loss": 1.0605,
"step": 393
},
{
"epoch": 1.6724137931034484,
"grad_norm": 0.3030514717102051,
"learning_rate": 5.751960684959046e-07,
"loss": 1.0724,
"step": 394
},
{
"epoch": 1.6767241379310345,
"grad_norm": 0.33980438113212585,
"learning_rate": 5.59190554568087e-07,
"loss": 1.0998,
"step": 395
},
{
"epoch": 1.6810344827586206,
"grad_norm": 0.3229700028896332,
"learning_rate": 5.433976823447262e-07,
"loss": 1.0422,
"step": 396
},
{
"epoch": 1.6853448275862069,
"grad_norm": 0.3324540853500366,
"learning_rate": 5.27818208044551e-07,
"loss": 1.0356,
"step": 397
},
{
"epoch": 1.6896551724137931,
"grad_norm": 0.3236718773841858,
"learning_rate": 5.124528776680371e-07,
"loss": 1.05,
"step": 398
},
{
"epoch": 1.6939655172413794,
"grad_norm": 0.3516765832901001,
"learning_rate": 4.973024269616933e-07,
"loss": 1.0468,
"step": 399
},
{
"epoch": 1.6982758620689655,
"grad_norm": 0.3242420554161072,
"learning_rate": 4.823675813828271e-07,
"loss": 1.0682,
"step": 400
},
{
"epoch": 1.7025862068965516,
"grad_norm": 0.2817647159099579,
"learning_rate": 4.676490560648067e-07,
"loss": 1.0515,
"step": 401
},
{
"epoch": 1.706896551724138,
"grad_norm": 0.3226347267627716,
"learning_rate": 4.531475557828202e-07,
"loss": 1.0865,
"step": 402
},
{
"epoch": 1.7112068965517242,
"grad_norm": 0.3294438123703003,
"learning_rate": 4.388637749201274e-07,
"loss": 1.0515,
"step": 403
},
{
"epoch": 1.7155172413793105,
"grad_norm": 0.3420010209083557,
"learning_rate": 4.2479839743480965e-07,
"loss": 1.1431,
"step": 404
},
{
"epoch": 1.7198275862068966,
"grad_norm": 0.34435805678367615,
"learning_rate": 4.1095209682701977e-07,
"loss": 1.1311,
"step": 405
},
{
"epoch": 1.7241379310344827,
"grad_norm": 0.337528258562088,
"learning_rate": 3.9732553610673465e-07,
"loss": 1.0779,
"step": 406
},
{
"epoch": 1.7241379310344827,
"eval_loss": 1.1384811401367188,
"eval_runtime": 38.9249,
"eval_samples_per_second": 11.381,
"eval_steps_per_second": 1.439,
"step": 406
},
{
"epoch": 1.728448275862069,
"grad_norm": 0.3463694453239441,
"learning_rate": 3.839193677620029e-07,
"loss": 1.0895,
"step": 407
},
{
"epoch": 1.7327586206896552,
"grad_norm": 0.311787486076355,
"learning_rate": 3.7073423372770754e-07,
"loss": 1.0449,
"step": 408
},
{
"epoch": 1.7370689655172413,
"grad_norm": 0.3618296980857849,
"learning_rate": 3.577707653548229e-07,
"loss": 1.1474,
"step": 409
},
{
"epoch": 1.7413793103448276,
"grad_norm": 0.35903745889663696,
"learning_rate": 3.4502958338018754e-07,
"loss": 1.0572,
"step": 410
},
{
"epoch": 1.7456896551724137,
"grad_norm": 0.33760443329811096,
"learning_rate": 3.325112978967776e-07,
"loss": 1.0408,
"step": 411
},
{
"epoch": 1.75,
"grad_norm": 0.3630852997303009,
"learning_rate": 3.20216508324494e-07,
"loss": 1.0523,
"step": 412
},
{
"epoch": 1.7543103448275863,
"grad_norm": 0.3185005784034729,
"learning_rate": 3.081458033814627e-07,
"loss": 1.1005,
"step": 413
},
{
"epoch": 1.7586206896551724,
"grad_norm": 0.35955214500427246,
"learning_rate": 2.9629976105584266e-07,
"loss": 1.1021,
"step": 414
},
{
"epoch": 1.7629310344827587,
"grad_norm": 0.2948373854160309,
"learning_rate": 2.8467894857814814e-07,
"loss": 1.0647,
"step": 415
},
{
"epoch": 1.7672413793103448,
"grad_norm": 0.3243519961833954,
"learning_rate": 2.732839223940914e-07,
"loss": 1.0736,
"step": 416
},
{
"epoch": 1.771551724137931,
"grad_norm": 0.33327266573905945,
"learning_rate": 2.621152281379352e-07,
"loss": 1.1213,
"step": 417
},
{
"epoch": 1.7758620689655173,
"grad_norm": 0.32360267639160156,
"learning_rate": 2.5117340060636817e-07,
"loss": 1.045,
"step": 418
},
{
"epoch": 1.7801724137931034,
"grad_norm": 0.34389981627464294,
"learning_rate": 2.404589637328947e-07,
"loss": 1.1223,
"step": 419
},
{
"epoch": 1.7844827586206895,
"grad_norm": 0.372776061296463,
"learning_rate": 2.2997243056274822e-07,
"loss": 1.0633,
"step": 420
},
{
"epoch": 1.7887931034482758,
"grad_norm": 0.3406997621059418,
"learning_rate": 2.1971430322832553e-07,
"loss": 1.113,
"step": 421
},
{
"epoch": 1.793103448275862,
"grad_norm": 0.3557218015193939,
"learning_rate": 2.096850729251404e-07,
"loss": 1.0903,
"step": 422
},
{
"epoch": 1.7974137931034484,
"grad_norm": 0.3280501067638397,
"learning_rate": 1.998852198883061e-07,
"loss": 1.086,
"step": 423
},
{
"epoch": 1.8017241379310345,
"grad_norm": 0.3395581543445587,
"learning_rate": 1.903152133695385e-07,
"loss": 1.1108,
"step": 424
},
{
"epoch": 1.8060344827586206,
"grad_norm": 0.33143341541290283,
"learning_rate": 1.8097551161468773e-07,
"loss": 1.0399,
"step": 425
},
{
"epoch": 1.8103448275862069,
"grad_norm": 0.30996373295783997,
"learning_rate": 1.7186656184179475e-07,
"loss": 1.0828,
"step": 426
},
{
"epoch": 1.8146551724137931,
"grad_norm": 0.3189036250114441,
"learning_rate": 1.6298880021967667e-07,
"loss": 1.0703,
"step": 427
},
{
"epoch": 1.8189655172413794,
"grad_norm": 0.3535940945148468,
"learning_rate": 1.543426518470431e-07,
"loss": 0.996,
"step": 428
},
{
"epoch": 1.8232758620689655,
"grad_norm": 0.3753534257411957,
"learning_rate": 1.4592853073214007e-07,
"loss": 1.0818,
"step": 429
},
{
"epoch": 1.8275862068965516,
"grad_norm": 0.3219975531101227,
"learning_rate": 1.3774683977292426e-07,
"loss": 1.1153,
"step": 430
},
{
"epoch": 1.831896551724138,
"grad_norm": 0.3368397355079651,
"learning_rate": 1.2979797073777333e-07,
"loss": 1.0769,
"step": 431
},
{
"epoch": 1.8362068965517242,
"grad_norm": 0.36413589119911194,
"learning_rate": 1.2208230424672562e-07,
"loss": 1.0734,
"step": 432
},
{
"epoch": 1.8405172413793105,
"grad_norm": 0.3497381806373596,
"learning_rate": 1.1460020975325392e-07,
"loss": 1.1006,
"step": 433
},
{
"epoch": 1.8448275862068966,
"grad_norm": 0.3364497125148773,
"learning_rate": 1.0735204552657641e-07,
"loss": 1.0738,
"step": 434
},
{
"epoch": 1.8491379310344827,
"grad_norm": 0.34419310092926025,
"learning_rate": 1.003381586344998e-07,
"loss": 1.0152,
"step": 435
},
{
"epoch": 1.853448275862069,
"grad_norm": 0.325958251953125,
"learning_rate": 9.355888492680155e-08,
"loss": 1.0799,
"step": 436
},
{
"epoch": 1.8577586206896552,
"grad_norm": 0.3611970543861389,
"learning_rate": 8.701454901914764e-08,
"loss": 1.1206,
"step": 437
},
{
"epoch": 1.8620689655172413,
"grad_norm": 0.3344848155975342,
"learning_rate": 8.070546427754899e-08,
"loss": 1.0546,
"step": 438
},
{
"epoch": 1.8663793103448276,
"grad_norm": 0.3656860589981079,
"learning_rate": 7.463193280335679e-08,
"loss": 1.0763,
"step": 439
},
{
"epoch": 1.8706896551724137,
"grad_norm": 0.3223496675491333,
"learning_rate": 6.879424541879676e-08,
"loss": 1.0854,
"step": 440
},
{
"epoch": 1.875,
"grad_norm": 0.3349456191062927,
"learning_rate": 6.319268165304204e-08,
"loss": 1.1127,
"step": 441
},
{
"epoch": 1.8793103448275863,
"grad_norm": 0.32300493121147156,
"learning_rate": 5.782750972883111e-08,
"loss": 1.0453,
"step": 442
},
{
"epoch": 1.8836206896551724,
"grad_norm": 0.34070324897766113,
"learning_rate": 5.26989865496208e-08,
"loss": 1.113,
"step": 443
},
{
"epoch": 1.8879310344827587,
"grad_norm": 0.3265235722064972,
"learning_rate": 4.780735768728895e-08,
"loss": 1.0802,
"step": 444
},
{
"epoch": 1.8922413793103448,
"grad_norm": 0.33940497040748596,
"learning_rate": 4.315285737037156e-08,
"loss": 1.0514,
"step": 445
},
{
"epoch": 1.896551724137931,
"grad_norm": 0.32045885920524597,
"learning_rate": 3.873570847285013e-08,
"loss": 1.0475,
"step": 446
},
{
"epoch": 1.9008620689655173,
"grad_norm": 0.3269757926464081,
"learning_rate": 3.455612250347851e-08,
"loss": 1.0889,
"step": 447
},
{
"epoch": 1.9051724137931034,
"grad_norm": 0.40040749311447144,
"learning_rate": 3.0614299595654875e-08,
"loss": 1.0693,
"step": 448
},
{
"epoch": 1.9094827586206895,
"grad_norm": 0.3067898750305176,
"learning_rate": 2.691042849783776e-08,
"loss": 1.0847,
"step": 449
},
{
"epoch": 1.9137931034482758,
"grad_norm": 0.3403901159763336,
"learning_rate": 2.3444686564511042e-08,
"loss": 1.0298,
"step": 450
},
{
"epoch": 1.918103448275862,
"grad_norm": 0.3210541009902954,
"learning_rate": 2.0217239747689077e-08,
"loss": 1.0751,
"step": 451
},
{
"epoch": 1.9224137931034484,
"grad_norm": 0.3297666907310486,
"learning_rate": 1.7228242588969714e-08,
"loss": 1.0646,
"step": 452
},
{
"epoch": 1.9267241379310345,
"grad_norm": 0.35611140727996826,
"learning_rate": 1.447783821213744e-08,
"loss": 1.0604,
"step": 453
},
{
"epoch": 1.9310344827586206,
"grad_norm": 0.3229893743991852,
"learning_rate": 1.1966158316307208e-08,
"loss": 1.1214,
"step": 454
},
{
"epoch": 1.9353448275862069,
"grad_norm": 0.3192938268184662,
"learning_rate": 9.693323169619463e-09,
"loss": 1.0673,
"step": 455
},
{
"epoch": 1.9396551724137931,
"grad_norm": 0.3141050338745117,
"learning_rate": 7.65944160348142e-09,
"loss": 1.0865,
"step": 456
},
{
"epoch": 1.9439655172413794,
"grad_norm": 0.3222709000110626,
"learning_rate": 5.864611007354581e-09,
"loss": 1.0105,
"step": 457
},
{
"epoch": 1.9482758620689655,
"grad_norm": 0.3701346516609192,
"learning_rate": 4.308917324092887e-09,
"loss": 1.0566,
"step": 458
},
{
"epoch": 1.9525862068965516,
"grad_norm": 0.34899988770484924,
"learning_rate": 2.9924350458271357e-09,
"loss": 1.0494,
"step": 459
},
{
"epoch": 1.956896551724138,
"grad_norm": 0.34436851739883423,
"learning_rate": 1.9152272103972746e-09,
"loss": 1.1125,
"step": 460
},
{
"epoch": 1.9612068965517242,
"grad_norm": 0.3521319329738617,
"learning_rate": 1.077345398334262e-09,
"loss": 1.0679,
"step": 461
},
{
"epoch": 1.9655172413793105,
"grad_norm": 0.3282637894153595,
"learning_rate": 4.788297303903732e-10,
"loss": 1.045,
"step": 462
},
{
"epoch": 1.9698275862068966,
"grad_norm": 0.31886130571365356,
"learning_rate": 1.1970886561907257e-10,
"loss": 1.0774,
"step": 463
},
{
"epoch": 1.9741379310344827,
"grad_norm": 0.32838234305381775,
"learning_rate": 0.0,
"loss": 1.0681,
"step": 464
},
{
"epoch": 1.9741379310344827,
"eval_loss": 1.1381434202194214,
"eval_runtime": 38.7863,
"eval_samples_per_second": 11.422,
"eval_steps_per_second": 1.444,
"step": 464
}
],
"logging_steps": 1,
"max_steps": 464,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 116,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.566697049072337e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}