|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9741379310344827,
  "eval_steps": 58,
  "global_step": 464,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004310344827586207,
      "grad_norm": 0.38478580117225647,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 1.6158,
      "step": 1
    },
    {
      "epoch": 0.004310344827586207,
      "eval_loss": 1.6722962856292725,
      "eval_runtime": 38.7564,
      "eval_samples_per_second": 11.43,
      "eval_steps_per_second": 1.445,
      "step": 1
    },
    {
      "epoch": 0.008620689655172414,
      "grad_norm": 0.3902539312839508,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.5696,
      "step": 2
    },
    {
      "epoch": 0.01293103448275862,
      "grad_norm": 0.42075541615486145,
      "learning_rate": 3e-06,
      "loss": 1.6475,
      "step": 3
    },
    {
      "epoch": 0.017241379310344827,
      "grad_norm": 0.4003991186618805,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.6201,
      "step": 4
    },
    {
      "epoch": 0.021551724137931036,
      "grad_norm": 0.3858628571033478,
      "learning_rate": 5e-06,
      "loss": 1.5931,
      "step": 5
    },
    {
      "epoch": 0.02586206896551724,
      "grad_norm": 0.4155072569847107,
      "learning_rate": 6e-06,
      "loss": 1.6432,
      "step": 6
    },
    {
      "epoch": 0.03017241379310345,
      "grad_norm": 0.3816058039665222,
      "learning_rate": 7e-06,
      "loss": 1.6651,
      "step": 7
    },
    {
      "epoch": 0.034482758620689655,
      "grad_norm": 0.38627564907073975,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.5938,
      "step": 8
    },
    {
      "epoch": 0.03879310344827586,
      "grad_norm": 0.3964974284172058,
      "learning_rate": 9e-06,
      "loss": 1.6462,
      "step": 9
    },
    {
      "epoch": 0.04310344827586207,
      "grad_norm": 0.374857634305954,
      "learning_rate": 1e-05,
      "loss": 1.6076,
      "step": 10
    },
    {
      "epoch": 0.04741379310344827,
      "grad_norm": 0.41657668352127075,
      "learning_rate": 9.999880291134381e-06,
      "loss": 1.6585,
      "step": 11
    },
    {
      "epoch": 0.05172413793103448,
      "grad_norm": 0.4166984260082245,
      "learning_rate": 9.99952117026961e-06,
      "loss": 1.6464,
      "step": 12
    },
    {
      "epoch": 0.05603448275862069,
      "grad_norm": 0.41143307089805603,
      "learning_rate": 9.998922654601666e-06,
      "loss": 1.6041,
      "step": 13
    },
    {
      "epoch": 0.0603448275862069,
      "grad_norm": 0.424064576625824,
      "learning_rate": 9.998084772789603e-06,
      "loss": 1.6668,
      "step": 14
    },
    {
      "epoch": 0.06465517241379311,
      "grad_norm": 0.4521613121032715,
      "learning_rate": 9.997007564954173e-06,
      "loss": 1.6147,
      "step": 15
    },
    {
      "epoch": 0.06896551724137931,
      "grad_norm": 0.4430346190929413,
      "learning_rate": 9.995691082675908e-06,
      "loss": 1.6345,
      "step": 16
    },
    {
      "epoch": 0.07327586206896551,
      "grad_norm": 0.4326779544353485,
      "learning_rate": 9.994135388992646e-06,
      "loss": 1.6169,
      "step": 17
    },
    {
      "epoch": 0.07758620689655173,
      "grad_norm": 0.4598339796066284,
      "learning_rate": 9.99234055839652e-06,
      "loss": 1.6345,
      "step": 18
    },
    {
      "epoch": 0.08189655172413793,
      "grad_norm": 0.4327365756034851,
      "learning_rate": 9.990306676830382e-06,
      "loss": 1.563,
      "step": 19
    },
    {
      "epoch": 0.08620689655172414,
      "grad_norm": 0.4325244128704071,
      "learning_rate": 9.988033841683694e-06,
      "loss": 1.632,
      "step": 20
    },
    {
      "epoch": 0.09051724137931035,
      "grad_norm": 0.42453956604003906,
      "learning_rate": 9.985522161787863e-06,
      "loss": 1.5589,
      "step": 21
    },
    {
      "epoch": 0.09482758620689655,
      "grad_norm": 0.44241735339164734,
      "learning_rate": 9.982771757411032e-06,
      "loss": 1.5412,
      "step": 22
    },
    {
      "epoch": 0.09913793103448276,
      "grad_norm": 0.4151982367038727,
      "learning_rate": 9.979782760252312e-06,
      "loss": 1.6149,
      "step": 23
    },
    {
      "epoch": 0.10344827586206896,
      "grad_norm": 0.43418580293655396,
      "learning_rate": 9.97655531343549e-06,
      "loss": 1.6115,
      "step": 24
    },
    {
      "epoch": 0.10775862068965517,
      "grad_norm": 0.407058984041214,
      "learning_rate": 9.973089571502163e-06,
      "loss": 1.5269,
      "step": 25
    },
    {
      "epoch": 0.11206896551724138,
      "grad_norm": 0.40243563055992126,
      "learning_rate": 9.969385700404346e-06,
      "loss": 1.6206,
      "step": 26
    },
    {
      "epoch": 0.11637931034482758,
      "grad_norm": 0.40526947379112244,
      "learning_rate": 9.965443877496522e-06,
      "loss": 1.5815,
      "step": 27
    },
    {
      "epoch": 0.1206896551724138,
      "grad_norm": 0.3972105085849762,
      "learning_rate": 9.96126429152715e-06,
      "loss": 1.5605,
      "step": 28
    },
    {
      "epoch": 0.125,
      "grad_norm": 0.38185665011405945,
      "learning_rate": 9.95684714262963e-06,
      "loss": 1.5652,
      "step": 29
    },
    {
      "epoch": 0.12931034482758622,
      "grad_norm": 0.4035002887248993,
      "learning_rate": 9.952192642312713e-06,
      "loss": 1.4807,
      "step": 30
    },
    {
      "epoch": 0.1336206896551724,
      "grad_norm": 0.3860902488231659,
      "learning_rate": 9.94730101345038e-06,
      "loss": 1.4752,
      "step": 31
    },
    {
      "epoch": 0.13793103448275862,
      "grad_norm": 0.40458953380584717,
      "learning_rate": 9.942172490271169e-06,
      "loss": 1.5132,
      "step": 32
    },
    {
      "epoch": 0.14224137931034483,
      "grad_norm": 0.3780922293663025,
      "learning_rate": 9.936807318346959e-06,
      "loss": 1.4436,
      "step": 33
    },
    {
      "epoch": 0.14655172413793102,
      "grad_norm": 0.3768951892852783,
      "learning_rate": 9.931205754581203e-06,
      "loss": 1.461,
      "step": 34
    },
    {
      "epoch": 0.15086206896551724,
      "grad_norm": 0.40286195278167725,
      "learning_rate": 9.925368067196644e-06,
      "loss": 1.4718,
      "step": 35
    },
    {
      "epoch": 0.15517241379310345,
      "grad_norm": 0.3668968081474304,
      "learning_rate": 9.919294535722452e-06,
      "loss": 1.4031,
      "step": 36
    },
    {
      "epoch": 0.15948275862068967,
      "grad_norm": 0.3690381944179535,
      "learning_rate": 9.912985450980853e-06,
      "loss": 1.5063,
      "step": 37
    },
    {
      "epoch": 0.16379310344827586,
      "grad_norm": 0.3745856285095215,
      "learning_rate": 9.9064411150732e-06,
      "loss": 1.4788,
      "step": 38
    },
    {
      "epoch": 0.16810344827586207,
      "grad_norm": 0.3808038532733917,
      "learning_rate": 9.899661841365502e-06,
      "loss": 1.4621,
      "step": 39
    },
    {
      "epoch": 0.1724137931034483,
      "grad_norm": 0.3296118378639221,
      "learning_rate": 9.892647954473425e-06,
      "loss": 1.3765,
      "step": 40
    },
    {
      "epoch": 0.17672413793103448,
      "grad_norm": 0.3598046898841858,
      "learning_rate": 9.885399790246746e-06,
      "loss": 1.3972,
      "step": 41
    },
    {
      "epoch": 0.1810344827586207,
      "grad_norm": 0.3617996871471405,
      "learning_rate": 9.877917695753275e-06,
      "loss": 1.3881,
      "step": 42
    },
    {
      "epoch": 0.1853448275862069,
      "grad_norm": 0.35454022884368896,
      "learning_rate": 9.870202029262228e-06,
      "loss": 1.3877,
      "step": 43
    },
    {
      "epoch": 0.1896551724137931,
      "grad_norm": 0.35556507110595703,
      "learning_rate": 9.862253160227077e-06,
      "loss": 1.3745,
      "step": 44
    },
    {
      "epoch": 0.1939655172413793,
      "grad_norm": 0.33433303236961365,
      "learning_rate": 9.85407146926786e-06,
      "loss": 1.4469,
      "step": 45
    },
    {
      "epoch": 0.19827586206896552,
      "grad_norm": 0.36582285165786743,
      "learning_rate": 9.845657348152958e-06,
      "loss": 1.4056,
      "step": 46
    },
    {
      "epoch": 0.2025862068965517,
      "grad_norm": 0.3496723175048828,
      "learning_rate": 9.837011199780325e-06,
      "loss": 1.3943,
      "step": 47
    },
    {
      "epoch": 0.20689655172413793,
      "grad_norm": 0.36036989092826843,
      "learning_rate": 9.828133438158206e-06,
      "loss": 1.4165,
      "step": 48
    },
    {
      "epoch": 0.21120689655172414,
      "grad_norm": 0.34691792726516724,
      "learning_rate": 9.819024488385314e-06,
      "loss": 1.3675,
      "step": 49
    },
    {
      "epoch": 0.21551724137931033,
      "grad_norm": 0.331584095954895,
      "learning_rate": 9.809684786630462e-06,
      "loss": 1.4028,
      "step": 50
    },
    {
      "epoch": 0.21982758620689655,
      "grad_norm": 0.32271715998649597,
      "learning_rate": 9.800114780111694e-06,
      "loss": 1.3344,
      "step": 51
    },
    {
      "epoch": 0.22413793103448276,
      "grad_norm": 0.3648192584514618,
      "learning_rate": 9.79031492707486e-06,
      "loss": 1.3383,
      "step": 52
    },
    {
      "epoch": 0.22844827586206898,
      "grad_norm": 0.3334115743637085,
      "learning_rate": 9.780285696771675e-06,
      "loss": 1.3185,
      "step": 53
    },
    {
      "epoch": 0.23275862068965517,
      "grad_norm": 0.3398106098175049,
      "learning_rate": 9.770027569437252e-06,
      "loss": 1.3564,
      "step": 54
    },
    {
      "epoch": 0.23706896551724138,
      "grad_norm": 0.3277662694454193,
      "learning_rate": 9.759541036267106e-06,
      "loss": 1.4009,
      "step": 55
    },
    {
      "epoch": 0.2413793103448276,
      "grad_norm": 0.30069175362586975,
      "learning_rate": 9.748826599393632e-06,
      "loss": 1.3393,
      "step": 56
    },
    {
      "epoch": 0.24568965517241378,
      "grad_norm": 0.2870045006275177,
      "learning_rate": 9.737884771862065e-06,
      "loss": 1.3647,
      "step": 57
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.299041211605072,
      "learning_rate": 9.72671607760591e-06,
      "loss": 1.2942,
      "step": 58
    },
    {
      "epoch": 0.25,
      "eval_loss": 1.3614733219146729,
      "eval_runtime": 38.8235,
      "eval_samples_per_second": 11.411,
      "eval_steps_per_second": 1.442,
      "step": 58
    },
    {
      "epoch": 0.2543103448275862,
      "grad_norm": 0.3013390302658081,
      "learning_rate": 9.715321051421853e-06,
      "loss": 1.3405,
      "step": 59
    },
    {
      "epoch": 0.25862068965517243,
      "grad_norm": 0.28875967860221863,
      "learning_rate": 9.703700238944157e-06,
      "loss": 1.2537,
      "step": 60
    },
    {
      "epoch": 0.2629310344827586,
      "grad_norm": 0.3158300817012787,
      "learning_rate": 9.691854196618538e-06,
      "loss": 1.3112,
      "step": 61
    },
    {
      "epoch": 0.2672413793103448,
      "grad_norm": 0.284329891204834,
      "learning_rate": 9.679783491675507e-06,
      "loss": 1.2891,
      "step": 62
    },
    {
      "epoch": 0.27155172413793105,
      "grad_norm": 0.2802349925041199,
      "learning_rate": 9.667488702103222e-06,
      "loss": 1.2899,
      "step": 63
    },
    {
      "epoch": 0.27586206896551724,
      "grad_norm": 0.28650611639022827,
      "learning_rate": 9.654970416619814e-06,
      "loss": 1.307,
      "step": 64
    },
    {
      "epoch": 0.2801724137931034,
      "grad_norm": 0.2946489751338959,
      "learning_rate": 9.642229234645177e-06,
      "loss": 1.313,
      "step": 65
    },
    {
      "epoch": 0.28448275862068967,
      "grad_norm": 0.280954509973526,
      "learning_rate": 9.629265766272293e-06,
      "loss": 1.3074,
      "step": 66
    },
    {
      "epoch": 0.28879310344827586,
      "grad_norm": 0.27372896671295166,
      "learning_rate": 9.616080632237999e-06,
      "loss": 1.2904,
      "step": 67
    },
    {
      "epoch": 0.29310344827586204,
      "grad_norm": 0.2814270853996277,
      "learning_rate": 9.602674463893266e-06,
      "loss": 1.2482,
      "step": 68
    },
    {
      "epoch": 0.2974137931034483,
      "grad_norm": 0.2719290256500244,
      "learning_rate": 9.589047903172981e-06,
      "loss": 1.2528,
      "step": 69
    },
    {
      "epoch": 0.3017241379310345,
      "grad_norm": 0.2652990520000458,
      "learning_rate": 9.575201602565192e-06,
      "loss": 1.2582,
      "step": 70
    },
    {
      "epoch": 0.30603448275862066,
      "grad_norm": 0.28665879368782043,
      "learning_rate": 9.561136225079874e-06,
      "loss": 1.2454,
      "step": 71
    },
    {
      "epoch": 0.3103448275862069,
      "grad_norm": 0.2868441045284271,
      "learning_rate": 9.54685244421718e-06,
      "loss": 1.2629,
      "step": 72
    },
    {
      "epoch": 0.3146551724137931,
      "grad_norm": 0.2746320366859436,
      "learning_rate": 9.532350943935194e-06,
      "loss": 1.2539,
      "step": 73
    },
    {
      "epoch": 0.31896551724137934,
      "grad_norm": 0.2876338064670563,
      "learning_rate": 9.517632418617173e-06,
      "loss": 1.2566,
      "step": 74
    },
    {
      "epoch": 0.3232758620689655,
      "grad_norm": 0.2622097134590149,
      "learning_rate": 9.502697573038309e-06,
      "loss": 1.2199,
      "step": 75
    },
    {
      "epoch": 0.3275862068965517,
      "grad_norm": 0.2769118845462799,
      "learning_rate": 9.487547122331965e-06,
      "loss": 1.2564,
      "step": 76
    },
    {
      "epoch": 0.33189655172413796,
      "grad_norm": 0.2650173008441925,
      "learning_rate": 9.47218179195545e-06,
      "loss": 1.2385,
      "step": 77
    },
    {
      "epoch": 0.33620689655172414,
      "grad_norm": 0.2626594603061676,
      "learning_rate": 9.456602317655274e-06,
      "loss": 1.1074,
      "step": 78
    },
    {
      "epoch": 0.34051724137931033,
      "grad_norm": 0.2777866721153259,
      "learning_rate": 9.440809445431914e-06,
      "loss": 1.2219,
      "step": 79
    },
    {
      "epoch": 0.3448275862068966,
      "grad_norm": 0.27093663811683655,
      "learning_rate": 9.424803931504095e-06,
      "loss": 1.2315,
      "step": 80
    },
    {
      "epoch": 0.34913793103448276,
      "grad_norm": 0.23614566028118134,
      "learning_rate": 9.408586542272588e-06,
      "loss": 1.1969,
      "step": 81
    },
    {
      "epoch": 0.35344827586206895,
      "grad_norm": 0.25952041149139404,
      "learning_rate": 9.392158054283497e-06,
      "loss": 1.2439,
      "step": 82
    },
    {
      "epoch": 0.3577586206896552,
      "grad_norm": 0.295060396194458,
      "learning_rate": 9.375519254191088e-06,
      "loss": 1.2369,
      "step": 83
    },
    {
      "epoch": 0.3620689655172414,
      "grad_norm": 0.2887714207172394,
      "learning_rate": 9.358670938720114e-06,
      "loss": 1.2503,
      "step": 84
    },
    {
      "epoch": 0.36637931034482757,
      "grad_norm": 0.26783010363578796,
      "learning_rate": 9.341613914627667e-06,
      "loss": 1.2057,
      "step": 85
    },
    {
      "epoch": 0.3706896551724138,
      "grad_norm": 0.26204803586006165,
      "learning_rate": 9.32434899866455e-06,
      "loss": 1.1715,
      "step": 86
    },
    {
      "epoch": 0.375,
      "grad_norm": 0.2795032262802124,
      "learning_rate": 9.306877017536165e-06,
      "loss": 1.241,
      "step": 87
    },
    {
      "epoch": 0.3793103448275862,
      "grad_norm": 0.26389801502227783,
      "learning_rate": 9.289198807862929e-06,
      "loss": 1.1859,
      "step": 88
    },
    {
      "epoch": 0.38362068965517243,
      "grad_norm": 0.25804105401039124,
      "learning_rate": 9.27131521614021e-06,
      "loss": 1.1952,
      "step": 89
    },
    {
      "epoch": 0.3879310344827586,
      "grad_norm": 0.2644469439983368,
      "learning_rate": 9.253227098697804e-06,
      "loss": 1.1428,
      "step": 90
    },
    {
      "epoch": 0.3922413793103448,
      "grad_norm": 0.2521233558654785,
      "learning_rate": 9.234935321658916e-06,
      "loss": 1.1685,
      "step": 91
    },
    {
      "epoch": 0.39655172413793105,
      "grad_norm": 0.23881429433822632,
      "learning_rate": 9.216440760898695e-06,
      "loss": 1.1668,
      "step": 92
    },
    {
      "epoch": 0.40086206896551724,
      "grad_norm": 0.2397918403148651,
      "learning_rate": 9.197744302002301e-06,
      "loss": 1.2153,
      "step": 93
    },
    {
      "epoch": 0.4051724137931034,
      "grad_norm": 0.25894659757614136,
      "learning_rate": 9.178846840222489e-06,
      "loss": 1.196,
      "step": 94
    },
    {
      "epoch": 0.40948275862068967,
      "grad_norm": 0.24914440512657166,
      "learning_rate": 9.159749280436738e-06,
      "loss": 1.2565,
      "step": 95
    },
    {
      "epoch": 0.41379310344827586,
      "grad_norm": 0.25300922989845276,
      "learning_rate": 9.140452537103943e-06,
      "loss": 1.2124,
      "step": 96
    },
    {
      "epoch": 0.41810344827586204,
      "grad_norm": 0.2863939106464386,
      "learning_rate": 9.120957534220599e-06,
      "loss": 1.1849,
      "step": 97
    },
    {
      "epoch": 0.4224137931034483,
      "grad_norm": 0.26353657245635986,
      "learning_rate": 9.101265205276581e-06,
      "loss": 1.1749,
      "step": 98
    },
    {
      "epoch": 0.4267241379310345,
      "grad_norm": 0.2726733684539795,
      "learning_rate": 9.081376493210434e-06,
      "loss": 1.1452,
      "step": 99
    },
    {
      "epoch": 0.43103448275862066,
      "grad_norm": 0.2476412057876587,
      "learning_rate": 9.061292350364222e-06,
      "loss": 1.1881,
      "step": 100
    },
    {
      "epoch": 0.4353448275862069,
      "grad_norm": 0.25747182965278625,
      "learning_rate": 9.041013738437924e-06,
      "loss": 1.2438,
      "step": 101
    },
    {
      "epoch": 0.4396551724137931,
      "grad_norm": 0.2478281408548355,
      "learning_rate": 9.020541628443395e-06,
      "loss": 1.136,
      "step": 102
    },
    {
      "epoch": 0.44396551724137934,
      "grad_norm": 0.28689947724342346,
      "learning_rate": 8.99987700065786e-06,
      "loss": 1.1827,
      "step": 103
    },
    {
      "epoch": 0.4482758620689655,
      "grad_norm": 0.2534964084625244,
      "learning_rate": 8.979020844576982e-06,
      "loss": 1.2091,
      "step": 104
    },
    {
      "epoch": 0.4525862068965517,
      "grad_norm": 0.23414653539657593,
      "learning_rate": 8.95797415886747e-06,
      "loss": 1.205,
      "step": 105
    },
    {
      "epoch": 0.45689655172413796,
      "grad_norm": 0.26888808608055115,
      "learning_rate": 8.936737951319276e-06,
      "loss": 1.1838,
      "step": 106
    },
    {
      "epoch": 0.46120689655172414,
      "grad_norm": 0.2613222897052765,
      "learning_rate": 8.915313238797327e-06,
      "loss": 1.2293,
      "step": 107
    },
    {
      "epoch": 0.46551724137931033,
      "grad_norm": 0.2542130649089813,
      "learning_rate": 8.893701047192832e-06,
      "loss": 1.2118,
      "step": 108
    },
    {
      "epoch": 0.4698275862068966,
      "grad_norm": 0.2766372561454773,
      "learning_rate": 8.871902411374173e-06,
      "loss": 1.1278,
      "step": 109
    },
    {
      "epoch": 0.47413793103448276,
      "grad_norm": 0.2918456494808197,
      "learning_rate": 8.84991837513733e-06,
      "loss": 1.1893,
      "step": 110
    },
    {
      "epoch": 0.47844827586206895,
      "grad_norm": 0.2817043364048004,
      "learning_rate": 8.827749991155924e-06,
      "loss": 1.199,
      "step": 111
    },
    {
      "epoch": 0.4827586206896552,
      "grad_norm": 0.27356210350990295,
      "learning_rate": 8.805398320930792e-06,
      "loss": 1.146,
      "step": 112
    },
    {
      "epoch": 0.4870689655172414,
      "grad_norm": 0.26965269446372986,
      "learning_rate": 8.782864434739169e-06,
      "loss": 1.1373,
      "step": 113
    },
    {
      "epoch": 0.49137931034482757,
      "grad_norm": 0.2662865221500397,
      "learning_rate": 8.760149411583436e-06,
      "loss": 1.1677,
      "step": 114
    },
    {
      "epoch": 0.4956896551724138,
      "grad_norm": 0.25307050347328186,
      "learning_rate": 8.737254339139457e-06,
      "loss": 1.1244,
      "step": 115
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.28164368867874146,
      "learning_rate": 8.71418031370449e-06,
      "loss": 1.1657,
      "step": 116
    },
    {
      "epoch": 0.5,
      "eval_loss": 1.22263503074646,
      "eval_runtime": 38.8381,
      "eval_samples_per_second": 11.406,
      "eval_steps_per_second": 1.442,
      "step": 116
    },
    {
      "epoch": 0.5043103448275862,
      "grad_norm": 0.2570449709892273,
      "learning_rate": 8.690928440144701e-06,
      "loss": 1.1423,
      "step": 117
    },
    {
      "epoch": 0.5086206896551724,
      "grad_norm": 0.27080610394477844,
      "learning_rate": 8.667499831842252e-06,
      "loss": 1.1379,
      "step": 118
    },
    {
      "epoch": 0.5129310344827587,
      "grad_norm": 0.23152929544448853,
      "learning_rate": 8.643895610641993e-06,
      "loss": 1.1484,
      "step": 119
    },
    {
      "epoch": 0.5172413793103449,
      "grad_norm": 0.25903040170669556,
      "learning_rate": 8.62011690679774e-06,
      "loss": 1.124,
      "step": 120
    },
    {
      "epoch": 0.521551724137931,
      "grad_norm": 0.2430315613746643,
      "learning_rate": 8.596164858918158e-06,
      "loss": 1.1558,
      "step": 121
    },
    {
      "epoch": 0.5258620689655172,
      "grad_norm": 0.2497812658548355,
      "learning_rate": 8.572040613912241e-06,
      "loss": 1.1659,
      "step": 122
    },
    {
      "epoch": 0.5301724137931034,
      "grad_norm": 0.27267956733703613,
      "learning_rate": 8.54774532693439e-06,
      "loss": 1.1539,
      "step": 123
    },
    {
      "epoch": 0.5344827586206896,
      "grad_norm": 0.2700651288032532,
      "learning_rate": 8.5232801613291e-06,
      "loss": 1.1786,
      "step": 124
    },
    {
      "epoch": 0.5387931034482759,
      "grad_norm": 0.26759159564971924,
      "learning_rate": 8.498646288575265e-06,
      "loss": 1.133,
      "step": 125
    },
    {
      "epoch": 0.5431034482758621,
      "grad_norm": 0.25706982612609863,
      "learning_rate": 8.473844888230065e-06,
      "loss": 1.1783,
      "step": 126
    },
    {
      "epoch": 0.5474137931034483,
      "grad_norm": 0.2711213231086731,
      "learning_rate": 8.448877147872505e-06,
      "loss": 1.1887,
      "step": 127
    },
    {
      "epoch": 0.5517241379310345,
      "grad_norm": 0.26545941829681396,
      "learning_rate": 8.42374426304653e-06,
      "loss": 1.1341,
      "step": 128
    },
    {
      "epoch": 0.5560344827586207,
      "grad_norm": 0.30440106987953186,
      "learning_rate": 8.398447437203799e-06,
      "loss": 1.1609,
      "step": 129
    },
    {
      "epoch": 0.5603448275862069,
      "grad_norm": 0.2716272473335266,
      "learning_rate": 8.372987881646036e-06,
      "loss": 1.1327,
      "step": 130
    },
    {
      "epoch": 0.5646551724137931,
      "grad_norm": 0.27410513162612915,
      "learning_rate": 8.347366815467051e-06,
      "loss": 1.1144,
      "step": 131
    },
    {
      "epoch": 0.5689655172413793,
      "grad_norm": 0.25738298892974854,
      "learning_rate": 8.32158546549435e-06,
      "loss": 1.1528,
      "step": 132
    },
    {
      "epoch": 0.5732758620689655,
      "grad_norm": 0.28856974840164185,
      "learning_rate": 8.295645066230396e-06,
      "loss": 1.1565,
      "step": 133
    },
    {
      "epoch": 0.5775862068965517,
      "grad_norm": 0.2661442160606384,
      "learning_rate": 8.269546859793499e-06,
      "loss": 1.162,
      "step": 134
    },
    {
      "epoch": 0.5818965517241379,
      "grad_norm": 0.26686185598373413,
      "learning_rate": 8.24329209585833e-06,
      "loss": 1.1464,
      "step": 135
    },
    {
      "epoch": 0.5862068965517241,
      "grad_norm": 0.2854245901107788,
      "learning_rate": 8.216882031596098e-06,
      "loss": 1.1143,
      "step": 136
    },
    {
      "epoch": 0.5905172413793104,
      "grad_norm": 0.24192146956920624,
      "learning_rate": 8.190317931614332e-06,
      "loss": 1.1308,
      "step": 137
    },
    {
      "epoch": 0.5948275862068966,
      "grad_norm": 0.23887085914611816,
      "learning_rate": 8.163601067896344e-06,
      "loss": 1.1545,
      "step": 138
    },
    {
      "epoch": 0.5991379310344828,
      "grad_norm": 0.26864567399024963,
      "learning_rate": 8.13673271974031e-06,
      "loss": 1.1171,
      "step": 139
    },
    {
      "epoch": 0.603448275862069,
      "grad_norm": 0.26497048139572144,
      "learning_rate": 8.109714173698027e-06,
      "loss": 1.1368,
      "step": 140
    },
    {
      "epoch": 0.6077586206896551,
      "grad_norm": 0.24038733541965485,
      "learning_rate": 8.082546723513289e-06,
      "loss": 1.1512,
      "step": 141
    },
    {
      "epoch": 0.6120689655172413,
      "grad_norm": 0.2721477150917053,
      "learning_rate": 8.055231670059958e-06,
      "loss": 1.0611,
      "step": 142
    },
    {
      "epoch": 0.6163793103448276,
      "grad_norm": 0.2796621322631836,
      "learning_rate": 8.027770321279654e-06,
      "loss": 1.1109,
      "step": 143
    },
    {
      "epoch": 0.6206896551724138,
      "grad_norm": 0.29094719886779785,
      "learning_rate": 8.000163992119146e-06,
      "loss": 1.1781,
      "step": 144
    },
    {
      "epoch": 0.625,
      "grad_norm": 0.26753732562065125,
      "learning_rate": 7.97241400446737e-06,
      "loss": 1.1694,
      "step": 145
    },
    {
      "epoch": 0.6293103448275862,
      "grad_norm": 0.2574915587902069,
      "learning_rate": 7.944521687092143e-06,
      "loss": 1.0821,
      "step": 146
    },
    {
      "epoch": 0.6336206896551724,
      "grad_norm": 0.2696407735347748,
      "learning_rate": 7.916488375576538e-06,
      "loss": 1.1272,
      "step": 147
    },
    {
      "epoch": 0.6379310344827587,
      "grad_norm": 0.29391637444496155,
      "learning_rate": 7.888315412254921e-06,
      "loss": 1.1787,
      "step": 148
    },
    {
      "epoch": 0.6422413793103449,
      "grad_norm": 0.3649088740348816,
      "learning_rate": 7.860004146148683e-06,
      "loss": 1.1116,
      "step": 149
    },
    {
      "epoch": 0.646551724137931,
      "grad_norm": 0.30176007747650146,
      "learning_rate": 7.831555932901642e-06,
      "loss": 1.1539,
      "step": 150
    },
    {
      "epoch": 0.6508620689655172,
      "grad_norm": 0.25577977299690247,
      "learning_rate": 7.802972134715138e-06,
      "loss": 1.1014,
      "step": 151
    },
    {
      "epoch": 0.6551724137931034,
      "grad_norm": 0.25991085171699524,
      "learning_rate": 7.774254120282792e-06,
      "loss": 1.0917,
      "step": 152
    },
    {
      "epoch": 0.6594827586206896,
      "grad_norm": 0.2913319170475006,
      "learning_rate": 7.745403264724973e-06,
      "loss": 1.1445,
      "step": 153
    },
    {
      "epoch": 0.6637931034482759,
      "grad_norm": 0.2821566164493561,
      "learning_rate": 7.71642094952296e-06,
      "loss": 1.1306,
      "step": 154
    },
    {
      "epoch": 0.6681034482758621,
      "grad_norm": 0.28270649909973145,
      "learning_rate": 7.687308562452783e-06,
      "loss": 1.1326,
      "step": 155
    },
    {
      "epoch": 0.6724137931034483,
      "grad_norm": 0.26017746329307556,
      "learning_rate": 7.658067497518773e-06,
      "loss": 1.1314,
      "step": 156
    },
    {
      "epoch": 0.6767241379310345,
      "grad_norm": 0.333617240190506,
      "learning_rate": 7.628699154886817e-06,
      "loss": 1.1631,
      "step": 157
    },
    {
      "epoch": 0.6810344827586207,
      "grad_norm": 0.26498985290527344,
      "learning_rate": 7.599204940817309e-06,
      "loss": 1.1347,
      "step": 158
    },
    {
      "epoch": 0.6853448275862069,
      "grad_norm": 0.2794323265552521,
      "learning_rate": 7.5695862675978085e-06,
      "loss": 1.1213,
      "step": 159
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 0.29901158809661865,
      "learning_rate": 7.539844553475427e-06,
      "loss": 1.1851,
      "step": 160
    },
    {
      "epoch": 0.6939655172413793,
      "grad_norm": 0.2878389060497284,
      "learning_rate": 7.509981222588909e-06,
      "loss": 1.1201,
      "step": 161
    },
    {
      "epoch": 0.6982758620689655,
      "grad_norm": 0.3090667426586151,
      "learning_rate": 7.479997704900437e-06,
      "loss": 1.1283,
      "step": 162
    },
    {
      "epoch": 0.7025862068965517,
      "grad_norm": 0.3226082921028137,
      "learning_rate": 7.449895436127169e-06,
      "loss": 1.1655,
      "step": 163
    },
    {
      "epoch": 0.7068965517241379,
      "grad_norm": 0.2934945225715637,
      "learning_rate": 7.4196758576724835e-06,
      "loss": 1.1251,
      "step": 164
    },
    {
      "epoch": 0.7112068965517241,
      "grad_norm": 0.3038388788700104,
      "learning_rate": 7.389340416556964e-06,
      "loss": 1.1198,
      "step": 165
    },
    {
      "epoch": 0.7155172413793104,
      "grad_norm": 0.2832164466381073,
      "learning_rate": 7.358890565349106e-06,
      "loss": 1.1653,
      "step": 166
    },
    {
      "epoch": 0.7198275862068966,
      "grad_norm": 0.3079414367675781,
      "learning_rate": 7.328327762095769e-06,
      "loss": 1.0996,
      "step": 167
    },
    {
      "epoch": 0.7241379310344828,
      "grad_norm": 0.29658767580986023,
      "learning_rate": 7.297653470252359e-06,
      "loss": 1.1167,
      "step": 168
    },
    {
      "epoch": 0.728448275862069,
      "grad_norm": 0.29325637221336365,
      "learning_rate": 7.266869158612743e-06,
      "loss": 1.1,
      "step": 169
    },
    {
      "epoch": 0.7327586206896551,
      "grad_norm": 0.2616899907588959,
      "learning_rate": 7.235976301238933e-06,
      "loss": 1.0893,
      "step": 170
    },
    {
      "epoch": 0.7370689655172413,
      "grad_norm": 0.3306714594364166,
      "learning_rate": 7.2049763773904955e-06,
      "loss": 1.1118,
      "step": 171
    },
    {
      "epoch": 0.7413793103448276,
      "grad_norm": 0.29951155185699463,
      "learning_rate": 7.1738708714537165e-06,
      "loss": 1.1483,
      "step": 172
    },
    {
      "epoch": 0.7456896551724138,
      "grad_norm": 0.2952785789966583,
      "learning_rate": 7.142661272870527e-06,
      "loss": 1.1043,
      "step": 173
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.26570263504981995,
      "learning_rate": 7.111349076067186e-06,
      "loss": 1.1209,
      "step": 174
    },
    {
      "epoch": 0.75,
      "eval_loss": 1.1805130243301392,
      "eval_runtime": 38.8981,
      "eval_samples_per_second": 11.389,
      "eval_steps_per_second": 1.44,
      "step": 174
    },
    {
      "epoch": 0.7543103448275862,
      "grad_norm": 0.32131871581077576,
      "learning_rate": 7.079935780382716e-06,
      "loss": 1.1453,
      "step": 175
    },
    {
      "epoch": 0.7586206896551724,
      "grad_norm": 0.3396710455417633,
      "learning_rate": 7.048422889997115e-06,
      "loss": 1.1384,
      "step": 176
    },
    {
      "epoch": 0.7629310344827587,
      "grad_norm": 0.3742856979370117,
      "learning_rate": 7.016811913859325e-06,
      "loss": 1.1325,
      "step": 177
    },
    {
      "epoch": 0.7672413793103449,
      "grad_norm": 0.2917105555534363,
      "learning_rate": 6.985104365614987e-06,
      "loss": 1.0813,
      "step": 178
    },
    {
      "epoch": 0.771551724137931,
      "grad_norm": 0.301831990480423,
      "learning_rate": 6.953301763533951e-06,
      "loss": 1.1406,
      "step": 179
    },
    {
      "epoch": 0.7758620689655172,
      "grad_norm": 0.3077235817909241,
      "learning_rate": 6.921405630437585e-06,
      "loss": 1.0814,
      "step": 180
    },
    {
      "epoch": 0.7801724137931034,
      "grad_norm": 0.271220862865448,
      "learning_rate": 6.889417493625854e-06,
      "loss": 1.0846,
      "step": 181
    },
    {
      "epoch": 0.7844827586206896,
      "grad_norm": 0.3000980615615845,
      "learning_rate": 6.857338884804185e-06,
      "loss": 1.1146,
      "step": 182
    },
    {
      "epoch": 0.7887931034482759,
      "grad_norm": 0.3144089877605438,
      "learning_rate": 6.82517134001013e-06,
      "loss": 1.097,
      "step": 183
    },
    {
      "epoch": 0.7931034482758621,
      "grad_norm": 0.3334217071533203,
      "learning_rate": 6.792916399539805e-06,
      "loss": 1.087,
      "step": 184
    },
    {
      "epoch": 0.7974137931034483,
      "grad_norm": 0.29255741834640503,
      "learning_rate": 6.760575607874145e-06,
      "loss": 1.093,
      "step": 185
    },
    {
      "epoch": 0.8017241379310345,
      "grad_norm": 0.3156846761703491,
      "learning_rate": 6.728150513604942e-06,
      "loss": 1.1933,
      "step": 186
    },
    {
      "epoch": 0.8060344827586207,
      "grad_norm": 0.2973058223724365,
      "learning_rate": 6.6956426693607e-06,
      "loss": 1.2163,
      "step": 187
    },
    {
      "epoch": 0.8103448275862069,
      "grad_norm": 0.3086062967777252,
      "learning_rate": 6.663053631732279e-06,
      "loss": 1.0922,
      "step": 188
    },
    {
      "epoch": 0.8146551724137931,
      "grad_norm": 0.3231525421142578,
      "learning_rate": 6.630384961198371e-06,
      "loss": 1.0915,
      "step": 189
    },
    {
      "epoch": 0.8189655172413793,
      "grad_norm": 0.31745991110801697,
      "learning_rate": 6.597638222050773e-06,
      "loss": 1.153,
      "step": 190
    },
    {
      "epoch": 0.8232758620689655,
      "grad_norm": 0.3454311788082123,
      "learning_rate": 6.564814982319481e-06,
      "loss": 1.1638,
      "step": 191
    },
    {
      "epoch": 0.8275862068965517,
      "grad_norm": 0.3196294903755188,
      "learning_rate": 6.5319168136976155e-06,
      "loss": 1.0483,
      "step": 192
    },
    {
      "epoch": 0.8318965517241379,
      "grad_norm": 0.30666542053222656,
      "learning_rate": 6.4989452914661525e-06,
      "loss": 1.0661,
      "step": 193
    },
    {
      "epoch": 0.8362068965517241,
      "grad_norm": 0.31098195910453796,
      "learning_rate": 6.465901994418505e-06,
      "loss": 1.1715,
      "step": 194
    },
    {
      "epoch": 0.8405172413793104,
      "grad_norm": 0.33826136589050293,
      "learning_rate": 6.432788504784913e-06,
      "loss": 1.0956,
      "step": 195
    },
    {
      "epoch": 0.8448275862068966,
      "grad_norm": 0.32232627272605896,
      "learning_rate": 6.399606408156688e-06,
      "loss": 1.1055,
      "step": 196
    },
    {
      "epoch": 0.8491379310344828,
      "grad_norm": 0.29653576016426086,
      "learning_rate": 6.3663572934102915e-06,
      "loss": 1.0766,
      "step": 197
    },
    {
      "epoch": 0.853448275862069,
      "grad_norm": 0.2951408624649048,
      "learning_rate": 6.333042752631243e-06,
      "loss": 1.1221,
      "step": 198
    },
    {
      "epoch": 0.8577586206896551,
      "grad_norm": 0.31561148166656494,
      "learning_rate": 6.2996643810379e-06,
      "loss": 1.0901,
      "step": 199
    },
    {
      "epoch": 0.8620689655172413,
      "grad_norm": 0.28782910108566284,
      "learning_rate": 6.266223776905062e-06,
      "loss": 1.1135,
      "step": 200
    },
    {
      "epoch": 0.8663793103448276,
      "grad_norm": 0.28432968258857727,
      "learning_rate": 6.232722541487443e-06,
      "loss": 1.1482,
      "step": 201
    },
    {
      "epoch": 0.8706896551724138,
      "grad_norm": 0.30610454082489014,
      "learning_rate": 6.199162278942997e-06,
      "loss": 1.1433,
      "step": 202
    },
    {
      "epoch": 0.875,
      "grad_norm": 0.31622040271759033,
      "learning_rate": 6.165544596256109e-06,
      "loss": 1.0992,
      "step": 203
    },
    {
      "epoch": 0.8793103448275862,
      "grad_norm": 0.3249478042125702,
      "learning_rate": 6.131871103160644e-06,
      "loss": 1.0708,
      "step": 204
    },
    {
      "epoch": 0.8836206896551724,
      "grad_norm": 0.28761979937553406,
      "learning_rate": 6.098143412062864e-06,
      "loss": 1.1509,
      "step": 205
    },
    {
      "epoch": 0.8879310344827587,
      "grad_norm": 0.31169748306274414,
      "learning_rate": 6.064363137964225e-06,
      "loss": 1.1497,
      "step": 206
    },
    {
      "epoch": 0.8922413793103449,
      "grad_norm": 0.2758077383041382,
      "learning_rate": 6.030531898384045e-06,
      "loss": 1.1262,
      "step": 207
    },
    {
      "epoch": 0.896551724137931,
      "grad_norm": 0.3364175856113434,
      "learning_rate": 5.996651313282051e-06,
      "loss": 1.1345,
      "step": 208
    },
    {
      "epoch": 0.9008620689655172,
      "grad_norm": 0.287631630897522,
      "learning_rate": 5.962723004980804e-06,
      "loss": 1.133,
      "step": 209
    },
    {
      "epoch": 0.9051724137931034,
      "grad_norm": 0.29732412099838257,
      "learning_rate": 5.9287485980880245e-06,
      "loss": 1.124,
      "step": 210
    },
    {
      "epoch": 0.9094827586206896,
      "grad_norm": 0.2940175533294678,
      "learning_rate": 5.894729719418795e-06,
      "loss": 1.0874,
      "step": 211
    },
    {
      "epoch": 0.9137931034482759,
      "grad_norm": 0.31645113229751587,
      "learning_rate": 5.860667997917668e-06,
      "loss": 1.0723,
      "step": 212
    },
    {
      "epoch": 0.9181034482758621,
      "grad_norm": 0.2804202437400818,
      "learning_rate": 5.826565064580659e-06,
      "loss": 1.0918,
      "step": 213
    },
    {
      "epoch": 0.9224137931034483,
      "grad_norm": 0.34620022773742676,
      "learning_rate": 5.792422552377153e-06,
      "loss": 1.0861,
      "step": 214
    },
    {
      "epoch": 0.9267241379310345,
      "grad_norm": 0.2878914773464203,
      "learning_rate": 5.758242096171713e-06,
      "loss": 1.1481,
      "step": 215
    },
    {
      "epoch": 0.9310344827586207,
      "grad_norm": 0.2986612915992737,
      "learning_rate": 5.724025332645794e-06,
      "loss": 1.1088,
      "step": 216
    },
    {
      "epoch": 0.9353448275862069,
      "grad_norm": 0.3098682761192322,
      "learning_rate": 5.689773900219374e-06,
      "loss": 1.131,
      "step": 217
    },
    {
      "epoch": 0.9396551724137931,
      "grad_norm": 0.29040470719337463,
      "learning_rate": 5.655489438972503e-06,
      "loss": 1.0993,
      "step": 218
    },
    {
      "epoch": 0.9439655172413793,
      "grad_norm": 0.3034652769565582,
      "learning_rate": 5.6211735905667665e-06,
      "loss": 1.1125,
      "step": 219
    },
    {
      "epoch": 0.9482758620689655,
      "grad_norm": 0.34945690631866455,
      "learning_rate": 5.586827998166678e-06,
      "loss": 1.0605,
      "step": 220
    },
    {
      "epoch": 0.9525862068965517,
      "grad_norm": 0.3174704313278198,
      "learning_rate": 5.552454306361e-06,
      "loss": 1.1404,
      "step": 221
    },
    {
      "epoch": 0.9568965517241379,
      "grad_norm": 0.3093855082988739,
      "learning_rate": 5.518054161083994e-06,
      "loss": 1.1072,
      "step": 222
    },
    {
      "epoch": 0.9612068965517241,
      "grad_norm": 0.33938005566596985,
      "learning_rate": 5.483629209536609e-06,
      "loss": 1.0694,
      "step": 223
    },
    {
      "epoch": 0.9655172413793104,
      "grad_norm": 0.32714080810546875,
      "learning_rate": 5.449181100107599e-06,
      "loss": 1.0651,
      "step": 224
    },
    {
      "epoch": 0.9698275862068966,
      "grad_norm": 0.30905458331108093,
      "learning_rate": 5.41471148229461e-06,
      "loss": 1.0764,
      "step": 225
    },
    {
      "epoch": 0.9741379310344828,
      "grad_norm": 0.2898302674293518,
      "learning_rate": 5.38022200662518e-06,
      "loss": 1.1079,
      "step": 226
    },
    {
      "epoch": 0.978448275862069,
      "grad_norm": 0.2906856834888458,
      "learning_rate": 5.34571432457771e-06,
      "loss": 1.0981,
      "step": 227
    },
    {
      "epoch": 0.9827586206896551,
      "grad_norm": 0.2875739336013794,
      "learning_rate": 5.31119008850239e-06,
      "loss": 1.109,
      "step": 228
    },
    {
      "epoch": 0.9870689655172413,
      "grad_norm": 0.31278446316719055,
      "learning_rate": 5.2766509515420785e-06,
      "loss": 1.0993,
      "step": 229
    },
    {
      "epoch": 0.9913793103448276,
      "grad_norm": 0.317538857460022,
      "learning_rate": 5.242098567553133e-06,
      "loss": 1.1088,
      "step": 230
    },
    {
      "epoch": 0.9956896551724138,
      "grad_norm": 0.2955145239830017,
      "learning_rate": 5.2075345910262296e-06,
      "loss": 1.0803,
      "step": 231
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.33417847752571106,
      "learning_rate": 5.1729606770071395e-06,
      "loss": 1.0785,
      "step": 232
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.1590964794158936,
      "eval_runtime": 38.8432,
      "eval_samples_per_second": 11.405,
      "eval_steps_per_second": 1.442,
      "step": 232
    },
    {
      "epoch": 1.0043103448275863,
      "grad_norm": 0.29942837357521057,
      "learning_rate": 5.138378481017475e-06,
      "loss": 1.1076,
      "step": 233
    },
    {
      "epoch": 1.0086206896551724,
      "grad_norm": 0.30415764451026917,
      "learning_rate": 5.103789658975413e-06,
      "loss": 1.0919,
      "step": 234
    },
    {
      "epoch": 1.0129310344827587,
      "grad_norm": 0.32707205414772034,
      "learning_rate": 5.069195867116416e-06,
      "loss": 1.0154,
      "step": 235
    },
    {
      "epoch": 1.0172413793103448,
      "grad_norm": 0.3695222735404968,
      "learning_rate": 5.034598761913917e-06,
      "loss": 1.0878,
      "step": 236
    },
    {
      "epoch": 1.021551724137931,
      "grad_norm": 0.32564374804496765,
      "learning_rate": 5e-06,
      "loss": 1.1174,
      "step": 237
    },
    {
      "epoch": 1.0258620689655173,
      "grad_norm": 0.3132264316082001,
      "learning_rate": 4.965401238086084e-06,
      "loss": 1.1475,
      "step": 238
    },
    {
      "epoch": 1.0043103448275863,
      "grad_norm": 0.33977454900741577,
      "learning_rate": 4.930804132883584e-06,
      "loss": 1.0749,
      "step": 239
    },
    {
      "epoch": 1.0086206896551724,
      "grad_norm": 0.34106722474098206,
      "learning_rate": 4.896210341024587e-06,
      "loss": 1.0821,
      "step": 240
    },
    {
      "epoch": 1.0129310344827587,
      "grad_norm": 0.3282875716686249,
      "learning_rate": 4.861621518982527e-06,
      "loss": 1.0379,
      "step": 241
    },
    {
      "epoch": 1.0172413793103448,
      "grad_norm": 0.38841262459754944,
      "learning_rate": 4.827039322992861e-06,
      "loss": 1.1243,
      "step": 242
    },
    {
      "epoch": 1.021551724137931,
      "grad_norm": 0.28226733207702637,
      "learning_rate": 4.792465408973772e-06,
      "loss": 1.11,
      "step": 243
    },
    {
      "epoch": 1.0258620689655173,
      "grad_norm": 0.32415810227394104,
      "learning_rate": 4.75790143244687e-06,
      "loss": 1.106,
      "step": 244
    },
    {
      "epoch": 1.0301724137931034,
      "grad_norm": 0.30683305859565735,
      "learning_rate": 4.723349048457924e-06,
      "loss": 1.0732,
      "step": 245
    },
    {
      "epoch": 1.0344827586206897,
      "grad_norm": 0.352728933095932,
      "learning_rate": 4.68880991149761e-06,
      "loss": 1.1466,
      "step": 246
    },
    {
      "epoch": 1.0387931034482758,
      "grad_norm": 0.35975515842437744,
      "learning_rate": 4.654285675422293e-06,
      "loss": 1.0218,
      "step": 247
    },
    {
      "epoch": 1.043103448275862,
      "grad_norm": 0.27443352341651917,
      "learning_rate": 4.6197779933748226e-06,
      "loss": 1.0574,
      "step": 248
    },
    {
      "epoch": 1.0474137931034482,
      "grad_norm": 0.37412866950035095,
      "learning_rate": 4.585288517705392e-06,
      "loss": 1.0348,
      "step": 249
    },
    {
      "epoch": 1.0517241379310345,
      "grad_norm": 0.3572046756744385,
      "learning_rate": 4.550818899892402e-06,
      "loss": 1.041,
      "step": 250
    },
    {
      "epoch": 1.0560344827586208,
      "grad_norm": 0.3478338420391083,
      "learning_rate": 4.516370790463394e-06,
      "loss": 1.0869,
      "step": 251
    },
    {
      "epoch": 1.0603448275862069,
      "grad_norm": 0.344951868057251,
      "learning_rate": 4.481945838916006e-06,
      "loss": 1.0014,
      "step": 252
    },
    {
      "epoch": 1.0646551724137931,
      "grad_norm": 0.34624016284942627,
      "learning_rate": 4.447545693639e-06,
      "loss": 1.0854,
      "step": 253
    },
    {
      "epoch": 1.0689655172413792,
      "grad_norm": 0.3237141966819763,
      "learning_rate": 4.413172001833324e-06,
      "loss": 1.0955,
      "step": 254
    },
    {
      "epoch": 1.0732758620689655,
      "grad_norm": 0.3042526841163635,
      "learning_rate": 4.378826409433235e-06,
      "loss": 1.0767,
      "step": 255
    },
    {
      "epoch": 1.0775862068965518,
      "grad_norm": 0.34745466709136963,
      "learning_rate": 4.344510561027498e-06,
      "loss": 1.0641,
      "step": 256
    },
    {
      "epoch": 1.081896551724138,
      "grad_norm": 0.3786261975765228,
      "learning_rate": 4.310226099780627e-06,
      "loss": 1.0999,
      "step": 257
    },
    {
      "epoch": 1.0862068965517242,
      "grad_norm": 0.3591586947441101,
      "learning_rate": 4.275974667354208e-06,
      "loss": 1.1285,
      "step": 258
    },
    {
      "epoch": 1.0905172413793103,
      "grad_norm": 0.32906004786491394,
      "learning_rate": 4.241757903828288e-06,
      "loss": 1.0845,
      "step": 259
    },
    {
      "epoch": 1.0948275862068966,
      "grad_norm": 0.31499743461608887,
      "learning_rate": 4.207577447622849e-06,
      "loss": 1.0354,
      "step": 260
    },
    {
      "epoch": 1.0991379310344827,
      "grad_norm": 0.34880805015563965,
      "learning_rate": 4.173434935419342e-06,
      "loss": 1.0957,
      "step": 261
    },
    {
      "epoch": 1.103448275862069,
      "grad_norm": 0.3316938877105713,
      "learning_rate": 4.139332002082333e-06,
      "loss": 1.0089,
      "step": 262
    },
    {
      "epoch": 1.1077586206896552,
      "grad_norm": 0.3652949631214142,
      "learning_rate": 4.105270280581206e-06,
      "loss": 1.1545,
      "step": 263
    },
    {
      "epoch": 1.1120689655172413,
      "grad_norm": 0.33063212037086487,
      "learning_rate": 4.071251401911977e-06,
      "loss": 1.1157,
      "step": 264
    },
    {
      "epoch": 1.1163793103448276,
      "grad_norm": 0.3272898197174072,
      "learning_rate": 4.037276995019198e-06,
      "loss": 1.1023,
      "step": 265
    },
    {
      "epoch": 1.1206896551724137,
      "grad_norm": 0.34276431798934937,
      "learning_rate": 4.00334868671795e-06,
      "loss": 1.1385,
      "step": 266
    },
    {
      "epoch": 1.125,
      "grad_norm": 0.33519965410232544,
      "learning_rate": 3.969468101615956e-06,
      "loss": 1.0816,
      "step": 267
    },
    {
      "epoch": 1.1293103448275863,
      "grad_norm": 0.29600027203559875,
      "learning_rate": 3.935636862035776e-06,
      "loss": 1.0973,
      "step": 268
    },
    {
      "epoch": 1.1336206896551724,
      "grad_norm": 0.3042546510696411,
      "learning_rate": 3.901856587937138e-06,
      "loss": 1.1236,
      "step": 269
    },
    {
      "epoch": 1.1379310344827587,
      "grad_norm": 0.3240540027618408,
      "learning_rate": 3.868128896839357e-06,
      "loss": 1.047,
      "step": 270
    },
    {
      "epoch": 1.1422413793103448,
      "grad_norm": 0.3279811143875122,
      "learning_rate": 3.834455403743892e-06,
      "loss": 1.0486,
      "step": 271
    },
    {
      "epoch": 1.146551724137931,
      "grad_norm": 0.38778162002563477,
      "learning_rate": 3.8008377210570045e-06,
      "loss": 1.0099,
      "step": 272
    },
    {
      "epoch": 1.1508620689655173,
      "grad_norm": 0.3080158233642578,
      "learning_rate": 3.76727745851256e-06,
      "loss": 1.0861,
      "step": 273
    },
    {
      "epoch": 1.1551724137931034,
      "grad_norm": 0.3064788579940796,
      "learning_rate": 3.7337762230949397e-06,
      "loss": 1.0795,
      "step": 274
    },
    {
      "epoch": 1.1594827586206897,
      "grad_norm": 0.3312165141105652,
      "learning_rate": 3.700335618962101e-06,
      "loss": 1.0781,
      "step": 275
    },
    {
      "epoch": 1.1637931034482758,
      "grad_norm": 0.3970606327056885,
      "learning_rate": 3.6669572473687577e-06,
      "loss": 1.0798,
      "step": 276
    },
    {
      "epoch": 1.168103448275862,
      "grad_norm": 0.35045385360717773,
      "learning_rate": 3.6336427065897106e-06,
      "loss": 1.1505,
      "step": 277
    },
    {
      "epoch": 1.1724137931034484,
      "grad_norm": 0.3263322114944458,
      "learning_rate": 3.6003935918433124e-06,
      "loss": 1.0427,
      "step": 278
    },
    {
      "epoch": 1.1767241379310345,
      "grad_norm": 0.3410392999649048,
      "learning_rate": 3.567211495215088e-06,
      "loss": 1.0789,
      "step": 279
    },
    {
      "epoch": 1.1810344827586208,
      "grad_norm": 0.33315184712409973,
      "learning_rate": 3.534098005581497e-06,
      "loss": 1.0666,
      "step": 280
    },
    {
      "epoch": 1.1853448275862069,
      "grad_norm": 0.34163913130760193,
      "learning_rate": 3.5010547085338487e-06,
      "loss": 1.0769,
      "step": 281
    },
    {
      "epoch": 1.1896551724137931,
      "grad_norm": 0.3381548225879669,
      "learning_rate": 3.4680831863023866e-06,
      "loss": 1.074,
      "step": 282
    },
    {
      "epoch": 1.1939655172413792,
      "grad_norm": 0.3141576647758484,
      "learning_rate": 3.43518501768052e-06,
      "loss": 1.0957,
      "step": 283
    },
    {
      "epoch": 1.1982758620689655,
      "grad_norm": 0.2855520248413086,
      "learning_rate": 3.402361777949229e-06,
      "loss": 1.1114,
      "step": 284
    },
    {
      "epoch": 1.2025862068965516,
      "grad_norm": 0.3210565745830536,
      "learning_rate": 3.3696150388016295e-06,
      "loss": 1.1209,
      "step": 285
    },
    {
      "epoch": 1.206896551724138,
      "grad_norm": 0.3479798138141632,
      "learning_rate": 3.336946368267724e-06,
      "loss": 1.033,
      "step": 286
    },
    {
      "epoch": 1.2112068965517242,
      "grad_norm": 0.3301966190338135,
      "learning_rate": 3.304357330639303e-06,
      "loss": 1.1279,
      "step": 287
    },
    {
      "epoch": 1.2155172413793103,
      "grad_norm": 0.3064906895160675,
      "learning_rate": 3.271849486395059e-06,
      "loss": 1.1006,
      "step": 288
    },
    {
      "epoch": 1.2198275862068966,
      "grad_norm": 0.30745407938957214,
      "learning_rate": 3.2394243921258566e-06,
      "loss": 1.0949,
      "step": 289
    },
    {
      "epoch": 1.2241379310344827,
      "grad_norm": 0.31708669662475586,
      "learning_rate": 3.207083600460196e-06,
      "loss": 1.0999,
      "step": 290
    },
    {
      "epoch": 1.2241379310344827,
      "eval_loss": 1.1474053859710693,
      "eval_runtime": 38.8873,
      "eval_samples_per_second": 11.392,
      "eval_steps_per_second": 1.44,
      "step": 290
    },
    {
      "epoch": 1.228448275862069,
      "grad_norm": 0.35755884647369385,
      "learning_rate": 3.174828659989871e-06,
      "loss": 1.0781,
      "step": 291
    },
    {
      "epoch": 1.2327586206896552,
      "grad_norm": 0.3275066316127777,
      "learning_rate": 3.1426611151958146e-06,
      "loss": 1.1165,
      "step": 292
    },
    {
      "epoch": 1.2370689655172413,
      "grad_norm": 0.3440989851951599,
      "learning_rate": 3.1105825063741486e-06,
      "loss": 1.1444,
      "step": 293
    },
    {
      "epoch": 1.2413793103448276,
      "grad_norm": 0.3393403887748718,
      "learning_rate": 3.078594369562417e-06,
      "loss": 1.0563,
      "step": 294
    },
    {
      "epoch": 1.2456896551724137,
      "grad_norm": 0.3146829307079315,
      "learning_rate": 3.0466982364660514e-06,
      "loss": 1.0335,
      "step": 295
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.30042600631713867,
      "learning_rate": 3.0148956343850143e-06,
      "loss": 1.0717,
      "step": 296
    },
    {
      "epoch": 1.2543103448275863,
      "grad_norm": 0.3386390209197998,
      "learning_rate": 2.9831880861406747e-06,
      "loss": 1.1232,
      "step": 297
    },
    {
      "epoch": 1.2586206896551724,
      "grad_norm": 0.3249225914478302,
      "learning_rate": 2.9515771100028854e-06,
      "loss": 1.1093,
      "step": 298
    },
    {
      "epoch": 1.2629310344827587,
      "grad_norm": 0.3327488899230957,
      "learning_rate": 2.9200642196172855e-06,
      "loss": 1.0943,
      "step": 299
    },
    {
      "epoch": 1.2672413793103448,
      "grad_norm": 0.32572102546691895,
      "learning_rate": 2.888650923932815e-06,
      "loss": 1.0842,
      "step": 300
    },
    {
      "epoch": 1.271551724137931,
      "grad_norm": 0.3248356580734253,
      "learning_rate": 2.8573387271294734e-06,
      "loss": 1.1394,
      "step": 301
    },
    {
      "epoch": 1.2758620689655173,
      "grad_norm": 0.33868080377578735,
      "learning_rate": 2.8261291285462843e-06,
      "loss": 1.059,
      "step": 302
    },
    {
      "epoch": 1.2801724137931034,
      "grad_norm": 0.34686562418937683,
      "learning_rate": 2.7950236226095044e-06,
      "loss": 1.0545,
      "step": 303
    },
    {
      "epoch": 1.2844827586206897,
      "grad_norm": 0.3430519700050354,
      "learning_rate": 2.7640236987610662e-06,
      "loss": 1.0411,
      "step": 304
    },
    {
      "epoch": 1.2887931034482758,
      "grad_norm": 0.36565178632736206,
      "learning_rate": 2.7331308413872593e-06,
      "loss": 1.1237,
      "step": 305
    },
    {
      "epoch": 1.293103448275862,
      "grad_norm": 0.31981322169303894,
      "learning_rate": 2.7023465297476426e-06,
      "loss": 1.0985,
      "step": 306
    },
    {
      "epoch": 1.2974137931034484,
      "grad_norm": 0.33923840522766113,
      "learning_rate": 2.6716722379042303e-06,
      "loss": 1.0989,
      "step": 307
    },
    {
      "epoch": 1.3017241379310345,
      "grad_norm": 0.311847984790802,
      "learning_rate": 2.641109434650894e-06,
      "loss": 1.0775,
      "step": 308
    },
    {
      "epoch": 1.3060344827586206,
      "grad_norm": 0.34916892647743225,
      "learning_rate": 2.6106595834430366e-06,
      "loss": 1.0737,
      "step": 309
    },
    {
      "epoch": 1.3103448275862069,
      "grad_norm": 0.35223567485809326,
      "learning_rate": 2.580324142327516e-06,
      "loss": 0.9688,
      "step": 310
    },
    {
      "epoch": 1.3146551724137931,
      "grad_norm": 0.2887548804283142,
      "learning_rate": 2.5501045638728307e-06,
      "loss": 1.0723,
      "step": 311
    },
    {
      "epoch": 1.3189655172413794,
      "grad_norm": 0.3438979685306549,
      "learning_rate": 2.520002295099564e-06,
      "loss": 1.0939,
      "step": 312
    },
    {
      "epoch": 1.3232758620689655,
      "grad_norm": 0.3115052282810211,
      "learning_rate": 2.4900187774110923e-06,
      "loss": 1.0921,
      "step": 313
    },
    {
      "epoch": 1.3275862068965516,
      "grad_norm": 0.3779308795928955,
      "learning_rate": 2.460155446524573e-06,
      "loss": 1.1631,
      "step": 314
    },
    {
      "epoch": 1.331896551724138,
      "grad_norm": 0.305646151304245,
      "learning_rate": 2.4304137324021915e-06,
      "loss": 1.0457,
      "step": 315
    },
    {
      "epoch": 1.3362068965517242,
      "grad_norm": 0.356973260641098,
      "learning_rate": 2.400795059182692e-06,
      "loss": 1.0929,
      "step": 316
    },
    {
      "epoch": 1.3405172413793103,
      "grad_norm": 0.36015981435775757,
      "learning_rate": 2.371300845113182e-06,
      "loss": 1.0143,
      "step": 317
    },
    {
      "epoch": 1.3448275862068966,
      "grad_norm": 0.38384366035461426,
      "learning_rate": 2.341932502481226e-06,
      "loss": 1.0644,
      "step": 318
    },
    {
      "epoch": 1.3491379310344827,
      "grad_norm": 0.3105123043060303,
      "learning_rate": 2.3126914375472185e-06,
      "loss": 1.0738,
      "step": 319
    },
    {
      "epoch": 1.353448275862069,
      "grad_norm": 0.32959380745887756,
      "learning_rate": 2.283579050477042e-06,
      "loss": 1.0785,
      "step": 320
    },
    {
      "epoch": 1.3577586206896552,
      "grad_norm": 0.3154717683792114,
      "learning_rate": 2.254596735275028e-06,
      "loss": 1.0562,
      "step": 321
    },
    {
      "epoch": 1.3620689655172413,
      "grad_norm": 0.32698148488998413,
      "learning_rate": 2.2257458797172093e-06,
      "loss": 1.0658,
      "step": 322
    },
    {
      "epoch": 1.3663793103448276,
      "grad_norm": 0.3278767764568329,
      "learning_rate": 2.1970278652848615e-06,
      "loss": 1.0972,
      "step": 323
    },
    {
      "epoch": 1.3706896551724137,
      "grad_norm": 0.3235674500465393,
      "learning_rate": 2.1684440670983568e-06,
      "loss": 1.0611,
      "step": 324
    },
    {
      "epoch": 1.375,
      "grad_norm": 0.398926705121994,
      "learning_rate": 2.1399958538513197e-06,
      "loss": 1.0892,
      "step": 325
    },
    {
      "epoch": 1.3793103448275863,
      "grad_norm": 0.35341110825538635,
      "learning_rate": 2.111684587745081e-06,
      "loss": 1.0991,
      "step": 326
    },
    {
      "epoch": 1.3836206896551724,
      "grad_norm": 0.36299997568130493,
      "learning_rate": 2.0835116244234625e-06,
      "loss": 1.0954,
      "step": 327
    },
    {
      "epoch": 1.3879310344827587,
      "grad_norm": 0.3728746175765991,
      "learning_rate": 2.0554783129078564e-06,
      "loss": 1.0784,
      "step": 328
    },
    {
      "epoch": 1.3922413793103448,
      "grad_norm": 0.3647480905056,
      "learning_rate": 2.027585995532631e-06,
      "loss": 1.0874,
      "step": 329
    },
    {
      "epoch": 1.396551724137931,
      "grad_norm": 0.3591470718383789,
      "learning_rate": 1.9998360078808547e-06,
      "loss": 1.0999,
      "step": 330
    },
    {
      "epoch": 1.4008620689655173,
      "grad_norm": 0.3867989480495453,
      "learning_rate": 1.972229678720346e-06,
      "loss": 1.0829,
      "step": 331
    },
    {
      "epoch": 1.4051724137931034,
      "grad_norm": 0.3600481450557709,
      "learning_rate": 1.944768329940045e-06,
      "loss": 1.0679,
      "step": 332
    },
    {
      "epoch": 1.4094827586206897,
      "grad_norm": 0.35227489471435547,
      "learning_rate": 1.917453276486712e-06,
      "loss": 1.1076,
      "step": 333
    },
    {
      "epoch": 1.4137931034482758,
      "grad_norm": 0.34222954511642456,
      "learning_rate": 1.8902858263019746e-06,
      "loss": 1.0673,
      "step": 334
    },
    {
      "epoch": 1.418103448275862,
      "grad_norm": 0.3176562488079071,
      "learning_rate": 1.8632672802596907e-06,
      "loss": 1.0518,
      "step": 335
    },
    {
      "epoch": 1.4224137931034484,
      "grad_norm": 0.3231894075870514,
      "learning_rate": 1.836398932103658e-06,
      "loss": 1.032,
      "step": 336
    },
    {
      "epoch": 1.4267241379310345,
      "grad_norm": 0.3324163258075714,
      "learning_rate": 1.8096820683856687e-06,
      "loss": 1.0684,
      "step": 337
    },
    {
      "epoch": 1.4310344827586206,
      "grad_norm": 0.32313287258148193,
      "learning_rate": 1.7831179684039041e-06,
      "loss": 1.0558,
      "step": 338
    },
    {
      "epoch": 1.4353448275862069,
      "grad_norm": 0.2836117148399353,
      "learning_rate": 1.7567079041416706e-06,
      "loss": 1.097,
      "step": 339
    },
    {
      "epoch": 1.4396551724137931,
      "grad_norm": 0.3591717779636383,
      "learning_rate": 1.7304531402065033e-06,
      "loss": 1.0722,
      "step": 340
    },
    {
      "epoch": 1.4439655172413794,
      "grad_norm": 0.3194042146205902,
      "learning_rate": 1.7043549337696053e-06,
      "loss": 1.0532,
      "step": 341
    },
    {
      "epoch": 1.4482758620689655,
      "grad_norm": 0.3645802438259125,
      "learning_rate": 1.6784145345056519e-06,
      "loss": 1.0791,
      "step": 342
    },
    {
      "epoch": 1.4525862068965516,
      "grad_norm": 0.3621852397918701,
      "learning_rate": 1.6526331845329508e-06,
      "loss": 1.074,
      "step": 343
    },
    {
      "epoch": 1.456896551724138,
      "grad_norm": 0.3067796528339386,
      "learning_rate": 1.627012118353965e-06,
      "loss": 1.0218,
      "step": 344
    },
    {
      "epoch": 1.4612068965517242,
      "grad_norm": 0.3126223385334015,
      "learning_rate": 1.6015525627962041e-06,
      "loss": 1.0597,
      "step": 345
    },
    {
      "epoch": 1.4655172413793103,
      "grad_norm": 0.34759384393692017,
      "learning_rate": 1.5762557369534709e-06,
      "loss": 1.0818,
      "step": 346
    },
    {
      "epoch": 1.4698275862068966,
      "grad_norm": 0.31952786445617676,
      "learning_rate": 1.5511228521274973e-06,
      "loss": 1.065,
      "step": 347
    },
    {
      "epoch": 1.4741379310344827,
      "grad_norm": 0.31419798731803894,
      "learning_rate": 1.5261551117699358e-06,
      "loss": 1.1012,
      "step": 348
    },
    {
      "epoch": 1.4741379310344827,
      "eval_loss": 1.1407095193862915,
      "eval_runtime": 38.8848,
      "eval_samples_per_second": 11.393,
      "eval_steps_per_second": 1.44,
      "step": 348
    },
{ |
|
"epoch": 1.478448275862069, |
|
"grad_norm": 0.37249940633773804, |
|
"learning_rate": 1.5013537114247362e-06, |
|
"loss": 1.0744, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 1.4827586206896552, |
|
"grad_norm": 0.32196739315986633, |
|
"learning_rate": 1.4767198386708998e-06, |
|
"loss": 1.0961, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.4870689655172413, |
|
"grad_norm": 0.35418596863746643, |
|
"learning_rate": 1.452254673065613e-06, |
|
"loss": 1.1137, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 1.4913793103448276, |
|
"grad_norm": 0.3167632520198822, |
|
"learning_rate": 1.427959386087761e-06, |
|
"loss": 1.1096, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 1.4956896551724137, |
|
"grad_norm": 0.33386534452438354, |
|
"learning_rate": 1.4038351410818434e-06, |
|
"loss": 1.0794, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.3341921865940094, |
|
"learning_rate": 1.3798830932022616e-06, |
|
"loss": 1.1006, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 1.5043103448275863, |
|
"grad_norm": 0.3526374101638794, |
|
"learning_rate": 1.3561043893580084e-06, |
|
"loss": 1.0801, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.5086206896551724, |
|
"grad_norm": 0.31550395488739014, |
|
"learning_rate": 1.3325001681577482e-06, |
|
"loss": 1.0564, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 1.5129310344827587, |
|
"grad_norm": 0.35600364208221436, |
|
"learning_rate": 1.3090715598553e-06, |
|
"loss": 1.1088, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 1.5172413793103448, |
|
"grad_norm": 0.36724480986595154, |
|
"learning_rate": 1.2858196862955108e-06, |
|
"loss": 1.0891, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 1.521551724137931, |
|
"grad_norm": 0.35326990485191345, |
|
"learning_rate": 1.2627456608605442e-06, |
|
"loss": 1.0839, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 1.5258620689655173, |
|
"grad_norm": 0.3355065882205963, |
|
"learning_rate": 1.2398505884165652e-06, |
|
"loss": 1.1194, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.5301724137931034, |
|
"grad_norm": 0.3283027708530426, |
|
"learning_rate": 1.217135565260833e-06, |
|
"loss": 1.0854, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 1.5344827586206895, |
|
"grad_norm": 0.31447839736938477, |
|
"learning_rate": 1.1946016790692094e-06, |
|
"loss": 1.0984, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 1.5387931034482758, |
|
"grad_norm": 0.32135719060897827, |
|
"learning_rate": 1.172250008844077e-06, |
|
"loss": 1.1079, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 1.543103448275862, |
|
"grad_norm": 0.33316370844841003, |
|
"learning_rate": 1.1500816248626711e-06, |
|
"loss": 1.0484, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 1.5474137931034484, |
|
"grad_norm": 0.3206438720226288, |
|
"learning_rate": 1.1280975886258294e-06, |
|
"loss": 1.0532, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.5517241379310345, |
|
"grad_norm": 0.3373902440071106, |
|
"learning_rate": 1.1062989528071683e-06, |
|
"loss": 1.0786, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 1.5560344827586206, |
|
"grad_norm": 0.33597832918167114, |
|
"learning_rate": 1.0846867612026746e-06, |
|
"loss": 1.153, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 1.5603448275862069, |
|
"grad_norm": 0.34479406476020813, |
|
"learning_rate": 1.0632620486807244e-06, |
|
"loss": 1.0782, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.5646551724137931, |
|
"grad_norm": 0.37828493118286133, |
|
"learning_rate": 1.0420258411325308e-06, |
|
"loss": 1.0208, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.5689655172413794, |
|
"grad_norm": 0.32772085070610046, |
|
"learning_rate": 1.0209791554230209e-06, |
|
"loss": 1.1206, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.5732758620689655, |
|
"grad_norm": 0.3135930299758911, |
|
"learning_rate": 1.0001229993421412e-06, |
|
"loss": 1.0934, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 1.5775862068965516, |
|
"grad_norm": 0.29522621631622314, |
|
"learning_rate": 9.79458371556607e-07, |
|
"loss": 1.1179, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.581896551724138, |
|
"grad_norm": 0.3688337802886963, |
|
"learning_rate": 9.589862615620782e-07, |
|
"loss": 1.0647, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 1.5862068965517242, |
|
"grad_norm": 0.30966484546661377, |
|
"learning_rate": 9.387076496357805e-07, |
|
"loss": 1.0978, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 1.5905172413793105, |
|
"grad_norm": 0.30171751976013184, |
|
"learning_rate": 9.186235067895672e-07, |
|
"loss": 1.0483, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.5948275862068966, |
|
"grad_norm": 0.370378315448761, |
|
"learning_rate": 8.987347947234193e-07, |
|
"loss": 1.068, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 1.5991379310344827, |
|
"grad_norm": 0.3673414885997772, |
|
"learning_rate": 8.790424657794034e-07, |
|
"loss": 1.0745, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 1.603448275862069, |
|
"grad_norm": 0.3022395968437195, |
|
"learning_rate": 8.595474628960598e-07, |
|
"loss": 1.0693, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 1.6077586206896552, |
|
"grad_norm": 0.3871013820171356, |
|
"learning_rate": 8.402507195632625e-07, |
|
"loss": 1.0281, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 1.6120689655172413, |
|
"grad_norm": 0.3459872901439667, |
|
"learning_rate": 8.211531597775136e-07, |
|
"loss": 1.0435, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.6163793103448276, |
|
"grad_norm": 0.3100816309452057, |
|
"learning_rate": 8.022556979976992e-07, |
|
"loss": 1.1093, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.6206896551724137, |
|
"grad_norm": 0.32473233342170715, |
|
"learning_rate": 7.835592391013053e-07, |
|
"loss": 1.0207, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"grad_norm": 0.3091509938240051, |
|
"learning_rate": 7.650646783410875e-07, |
|
"loss": 1.1253, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 1.6293103448275863, |
|
"grad_norm": 0.3101778030395508, |
|
"learning_rate": 7.467729013021979e-07, |
|
"loss": 1.0307, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.6336206896551724, |
|
"grad_norm": 0.31698641180992126, |
|
"learning_rate": 7.286847838597905e-07, |
|
"loss": 1.064, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.6379310344827587, |
|
"grad_norm": 0.34377050399780273, |
|
"learning_rate": 7.108011921370728e-07, |
|
"loss": 1.0792, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 1.6422413793103448, |
|
"grad_norm": 0.3562955856323242, |
|
"learning_rate": 6.931229824638358e-07, |
|
"loss": 1.1161, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.646551724137931, |
|
"grad_norm": 0.3532198965549469, |
|
"learning_rate": 6.756510013354512e-07, |
|
"loss": 1.0326, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.6508620689655173, |
|
"grad_norm": 0.3240654468536377, |
|
"learning_rate": 6.583860853723339e-07, |
|
"loss": 1.0902, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 1.6551724137931034, |
|
"grad_norm": 0.3335769474506378, |
|
"learning_rate": 6.413290612798883e-07, |
|
"loss": 1.0795, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.6594827586206895, |
|
"grad_norm": 0.3507966995239258, |
|
"learning_rate": 6.24480745808913e-07, |
|
"loss": 1.0312, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 1.6637931034482758, |
|
"grad_norm": 0.3086683750152588, |
|
"learning_rate": 6.078419457165036e-07, |
|
"loss": 1.046, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 1.668103448275862, |
|
"grad_norm": 0.33030492067337036, |
|
"learning_rate": 5.914134577274122e-07, |
|
"loss": 1.0605, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.6724137931034484, |
|
"grad_norm": 0.3030514717102051, |
|
"learning_rate": 5.751960684959046e-07, |
|
"loss": 1.0724, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.6767241379310345, |
|
"grad_norm": 0.33980438113212585, |
|
"learning_rate": 5.59190554568087e-07, |
|
"loss": 1.0998, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.6810344827586206, |
|
"grad_norm": 0.3229700028896332, |
|
"learning_rate": 5.433976823447262e-07, |
|
"loss": 1.0422, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.6853448275862069, |
|
"grad_norm": 0.3324540853500366, |
|
"learning_rate": 5.27818208044551e-07, |
|
"loss": 1.0356, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 1.6896551724137931, |
|
"grad_norm": 0.3236718773841858, |
|
"learning_rate": 5.124528776680371e-07, |
|
"loss": 1.05, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.6939655172413794, |
|
"grad_norm": 0.3516765832901001, |
|
"learning_rate": 4.973024269616933e-07, |
|
"loss": 1.0468, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.6982758620689655, |
|
"grad_norm": 0.3242420554161072, |
|
"learning_rate": 4.823675813828271e-07, |
|
"loss": 1.0682, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.7025862068965516, |
|
"grad_norm": 0.2817647159099579, |
|
"learning_rate": 4.676490560648067e-07, |
|
"loss": 1.0515, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 1.706896551724138, |
|
"grad_norm": 0.3226347267627716, |
|
"learning_rate": 4.531475557828202e-07, |
|
"loss": 1.0865, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.7112068965517242, |
|
"grad_norm": 0.3294438123703003, |
|
"learning_rate": 4.388637749201274e-07, |
|
"loss": 1.0515, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 1.7155172413793105, |
|
"grad_norm": 0.3420010209083557, |
|
"learning_rate": 4.2479839743480965e-07, |
|
"loss": 1.1431, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 1.7198275862068966, |
|
"grad_norm": 0.34435805678367615, |
|
"learning_rate": 4.1095209682701977e-07, |
|
"loss": 1.1311, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.7241379310344827, |
|
"grad_norm": 0.337528258562088, |
|
"learning_rate": 3.9732553610673465e-07, |
|
"loss": 1.0779, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 1.7241379310344827, |
|
"eval_loss": 1.1384811401367188, |
|
"eval_runtime": 38.9249, |
|
"eval_samples_per_second": 11.381, |
|
"eval_steps_per_second": 1.439, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 1.728448275862069, |
|
"grad_norm": 0.3463694453239441, |
|
"learning_rate": 3.839193677620029e-07, |
|
"loss": 1.0895, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 1.7327586206896552, |
|
"grad_norm": 0.311787486076355, |
|
"learning_rate": 3.7073423372770754e-07, |
|
"loss": 1.0449, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.7370689655172413, |
|
"grad_norm": 0.3618296980857849, |
|
"learning_rate": 3.577707653548229e-07, |
|
"loss": 1.1474, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 1.7413793103448276, |
|
"grad_norm": 0.35903745889663696, |
|
"learning_rate": 3.4502958338018754e-07, |
|
"loss": 1.0572, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.7456896551724137, |
|
"grad_norm": 0.33760443329811096, |
|
"learning_rate": 3.325112978967776e-07, |
|
"loss": 1.0408, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.3630852997303009, |
|
"learning_rate": 3.20216508324494e-07, |
|
"loss": 1.0523, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 1.7543103448275863, |
|
"grad_norm": 0.3185005784034729, |
|
"learning_rate": 3.081458033814627e-07, |
|
"loss": 1.1005, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 1.7586206896551724, |
|
"grad_norm": 0.35955214500427246, |
|
"learning_rate": 2.9629976105584266e-07, |
|
"loss": 1.1021, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.7629310344827587, |
|
"grad_norm": 0.2948373854160309, |
|
"learning_rate": 2.8467894857814814e-07, |
|
"loss": 1.0647, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.7672413793103448, |
|
"grad_norm": 0.3243519961833954, |
|
"learning_rate": 2.732839223940914e-07, |
|
"loss": 1.0736, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.771551724137931, |
|
"grad_norm": 0.33327266573905945, |
|
"learning_rate": 2.621152281379352e-07, |
|
"loss": 1.1213, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 1.7758620689655173, |
|
"grad_norm": 0.32360267639160156, |
|
"learning_rate": 2.5117340060636817e-07, |
|
"loss": 1.045, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 1.7801724137931034, |
|
"grad_norm": 0.34389981627464294, |
|
"learning_rate": 2.404589637328947e-07, |
|
"loss": 1.1223, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 1.7844827586206895, |
|
"grad_norm": 0.372776061296463, |
|
"learning_rate": 2.2997243056274822e-07, |
|
"loss": 1.0633, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.7887931034482758, |
|
"grad_norm": 0.3406997621059418, |
|
"learning_rate": 2.1971430322832553e-07, |
|
"loss": 1.113, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 1.793103448275862, |
|
"grad_norm": 0.3557218015193939, |
|
"learning_rate": 2.096850729251404e-07, |
|
"loss": 1.0903, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 1.7974137931034484, |
|
"grad_norm": 0.3280501067638397, |
|
"learning_rate": 1.998852198883061e-07, |
|
"loss": 1.086, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 1.8017241379310345, |
|
"grad_norm": 0.3395581543445587, |
|
"learning_rate": 1.903152133695385e-07, |
|
"loss": 1.1108, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 1.8060344827586206, |
|
"grad_norm": 0.33143341541290283, |
|
"learning_rate": 1.8097551161468773e-07, |
|
"loss": 1.0399, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.8103448275862069, |
|
"grad_norm": 0.30996373295783997, |
|
"learning_rate": 1.7186656184179475e-07, |
|
"loss": 1.0828, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.8146551724137931, |
|
"grad_norm": 0.3189036250114441, |
|
"learning_rate": 1.6298880021967667e-07, |
|
"loss": 1.0703, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 1.8189655172413794, |
|
"grad_norm": 0.3535940945148468, |
|
"learning_rate": 1.543426518470431e-07, |
|
"loss": 0.996, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 1.8232758620689655, |
|
"grad_norm": 0.3753534257411957, |
|
"learning_rate": 1.4592853073214007e-07, |
|
"loss": 1.0818, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 1.8275862068965516, |
|
"grad_norm": 0.3219975531101227, |
|
"learning_rate": 1.3774683977292426e-07, |
|
"loss": 1.1153, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.831896551724138, |
|
"grad_norm": 0.3368397355079651, |
|
"learning_rate": 1.2979797073777333e-07, |
|
"loss": 1.0769, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 1.8362068965517242, |
|
"grad_norm": 0.36413589119911194, |
|
"learning_rate": 1.2208230424672562e-07, |
|
"loss": 1.0734, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.8405172413793105, |
|
"grad_norm": 0.3497381806373596, |
|
"learning_rate": 1.1460020975325392e-07, |
|
"loss": 1.1006, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 1.8448275862068966, |
|
"grad_norm": 0.3364497125148773, |
|
"learning_rate": 1.0735204552657641e-07, |
|
"loss": 1.0738, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 1.8491379310344827, |
|
"grad_norm": 0.34419310092926025, |
|
"learning_rate": 1.003381586344998e-07, |
|
"loss": 1.0152, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.853448275862069, |
|
"grad_norm": 0.325958251953125, |
|
"learning_rate": 9.355888492680155e-08, |
|
"loss": 1.0799, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 1.8577586206896552, |
|
"grad_norm": 0.3611970543861389, |
|
"learning_rate": 8.701454901914764e-08, |
|
"loss": 1.1206, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 1.8620689655172413, |
|
"grad_norm": 0.3344848155975342, |
|
"learning_rate": 8.070546427754899e-08, |
|
"loss": 1.0546, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.8663793103448276, |
|
"grad_norm": 0.3656860589981079, |
|
"learning_rate": 7.463193280335679e-08, |
|
"loss": 1.0763, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 1.8706896551724137, |
|
"grad_norm": 0.3223496675491333, |
|
"learning_rate": 6.879424541879676e-08, |
|
"loss": 1.0854, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 0.3349456191062927, |
|
"learning_rate": 6.319268165304204e-08, |
|
"loss": 1.1127, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 1.8793103448275863, |
|
"grad_norm": 0.32300493121147156, |
|
"learning_rate": 5.782750972883111e-08, |
|
"loss": 1.0453, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.8836206896551724, |
|
"grad_norm": 0.34070324897766113, |
|
"learning_rate": 5.26989865496208e-08, |
|
"loss": 1.113, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 1.8879310344827587, |
|
"grad_norm": 0.3265235722064972, |
|
"learning_rate": 4.780735768728895e-08, |
|
"loss": 1.0802, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.8922413793103448, |
|
"grad_norm": 0.33940497040748596, |
|
"learning_rate": 4.315285737037156e-08, |
|
"loss": 1.0514, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.896551724137931, |
|
"grad_norm": 0.32045885920524597, |
|
"learning_rate": 3.873570847285013e-08, |
|
"loss": 1.0475, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 1.9008620689655173, |
|
"grad_norm": 0.3269757926464081, |
|
"learning_rate": 3.455612250347851e-08, |
|
"loss": 1.0889, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 1.9051724137931034, |
|
"grad_norm": 0.40040749311447144, |
|
"learning_rate": 3.0614299595654875e-08, |
|
"loss": 1.0693, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 1.9094827586206895, |
|
"grad_norm": 0.3067898750305176, |
|
"learning_rate": 2.691042849783776e-08, |
|
"loss": 1.0847, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 1.9137931034482758, |
|
"grad_norm": 0.3403901159763336, |
|
"learning_rate": 2.3444686564511042e-08, |
|
"loss": 1.0298, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.918103448275862, |
|
"grad_norm": 0.3210541009902954, |
|
"learning_rate": 2.0217239747689077e-08, |
|
"loss": 1.0751, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 1.9224137931034484, |
|
"grad_norm": 0.3297666907310486, |
|
"learning_rate": 1.7228242588969714e-08, |
|
"loss": 1.0646, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 1.9267241379310345, |
|
"grad_norm": 0.35611140727996826, |
|
"learning_rate": 1.447783821213744e-08, |
|
"loss": 1.0604, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 1.9310344827586206, |
|
"grad_norm": 0.3229893743991852, |
|
"learning_rate": 1.1966158316307208e-08, |
|
"loss": 1.1214, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 1.9353448275862069, |
|
"grad_norm": 0.3192938268184662, |
|
"learning_rate": 9.693323169619463e-09, |
|
"loss": 1.0673, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.9396551724137931, |
|
"grad_norm": 0.3141050338745117, |
|
"learning_rate": 7.65944160348142e-09, |
|
"loss": 1.0865, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 1.9439655172413794, |
|
"grad_norm": 0.3222709000110626, |
|
"learning_rate": 5.864611007354581e-09, |
|
"loss": 1.0105, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 1.9482758620689655, |
|
"grad_norm": 0.3701346516609192, |
|
"learning_rate": 4.308917324092887e-09, |
|
"loss": 1.0566, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 1.9525862068965516, |
|
"grad_norm": 0.34899988770484924, |
|
"learning_rate": 2.9924350458271357e-09, |
|
"loss": 1.0494, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 1.956896551724138, |
|
"grad_norm": 0.34436851739883423, |
|
"learning_rate": 1.9152272103972746e-09, |
|
"loss": 1.1125, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.9612068965517242, |
|
"grad_norm": 0.3521319329738617, |
|
"learning_rate": 1.077345398334262e-09, |
|
"loss": 1.0679, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 1.9655172413793105, |
|
"grad_norm": 0.3282637894153595, |
|
"learning_rate": 4.788297303903732e-10, |
|
"loss": 1.045, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 1.9698275862068966, |
|
"grad_norm": 0.31886130571365356, |
|
"learning_rate": 1.1970886561907257e-10, |
|
"loss": 1.0774, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 1.9741379310344827, |
|
"grad_norm": 0.32838234305381775, |
|
"learning_rate": 0.0, |
|
"loss": 1.0681, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 1.9741379310344827, |
|
"eval_loss": 1.1381434202194214, |
|
"eval_runtime": 38.7863, |
|
"eval_samples_per_second": 11.422, |
|
"eval_steps_per_second": 1.444, |
|
"step": 464 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 464, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 116, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.566697049072337e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|