diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5804 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.990689013035381, + "eval_steps": 5, + "global_step": 670, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0074487895716946, + "grad_norm": 0.09585436433553696, + "learning_rate": 1.4925373134328358e-06, + "loss": 0.057, + "step": 1 + }, + { + "epoch": 0.0148975791433892, + "grad_norm": 0.0947771668434143, + "learning_rate": 2.9850746268656716e-06, + "loss": 0.0606, + "step": 2 + }, + { + "epoch": 0.0223463687150838, + "grad_norm": 0.09924148768186569, + "learning_rate": 4.477611940298508e-06, + "loss": 0.0588, + "step": 3 + }, + { + "epoch": 0.0297951582867784, + "grad_norm": 0.09715887159109116, + "learning_rate": 5.970149253731343e-06, + "loss": 0.058, + "step": 4 + }, + { + "epoch": 0.037243947858473, + "grad_norm": 0.08674579113721848, + "learning_rate": 7.4626865671641785e-06, + "loss": 0.0567, + "step": 5 + }, + { + "epoch": 0.037243947858473, + "eval_loss": 0.05475037544965744, + "eval_runtime": 1.4827, + "eval_samples_per_second": 5.396, + "eval_steps_per_second": 1.349, + "step": 5 + }, + { + "epoch": 0.0446927374301676, + "grad_norm": 0.08492287993431091, + "learning_rate": 8.955223880597016e-06, + "loss": 0.0544, + "step": 6 + }, + { + "epoch": 0.0521415270018622, + "grad_norm": 0.07947102189064026, + "learning_rate": 1.0447761194029851e-05, + "loss": 0.0533, + "step": 7 + }, + { + "epoch": 0.0595903165735568, + "grad_norm": 0.06775230914354324, + "learning_rate": 1.1940298507462686e-05, + "loss": 0.0481, + "step": 8 + }, + { + "epoch": 0.0670391061452514, + "grad_norm": 0.05250588804483414, + "learning_rate": 1.3432835820895523e-05, + "loss": 0.044, + "step": 9 + }, + { + "epoch": 0.074487895716946, + "grad_norm": 0.04540105536580086, + "learning_rate": 1.4925373134328357e-05, + "loss": 0.0421, + "step": 10 + }, + { + "epoch": 0.074487895716946, + "eval_loss": 0.043365783989429474, + "eval_runtime": 1.4321, + "eval_samples_per_second": 5.586, + "eval_steps_per_second": 1.397, + "step": 10 + }, + { + "epoch": 0.08193668528864059, + "grad_norm": 0.04332689195871353, + "learning_rate": 1.6417910447761194e-05, + "loss": 0.0405, + "step": 11 + }, + { + "epoch": 0.0893854748603352, + "grad_norm": 0.04583241418004036, + "learning_rate": 1.791044776119403e-05, + "loss": 0.0395, + "step": 12 + }, + { + "epoch": 0.09683426443202979, + "grad_norm": 0.043215200304985046, + "learning_rate": 1.9402985074626868e-05, + "loss": 0.0397, + "step": 13 + }, + { + "epoch": 0.1042830540037244, + "grad_norm": 0.03956913948059082, + "learning_rate": 2.0895522388059702e-05, + "loss": 0.0357, + "step": 14 + }, + { + "epoch": 0.11173184357541899, + "grad_norm": 0.04003346711397171, + "learning_rate": 2.238805970149254e-05, + "loss": 0.0347, + "step": 15 + }, + { + "epoch": 0.11173184357541899, + "eval_loss": 0.038371481001377106, + "eval_runtime": 1.4391, + "eval_samples_per_second": 5.559, + "eval_steps_per_second": 1.39, + "step": 15 + }, + { + "epoch": 0.1191806331471136, + "grad_norm": 0.03616767376661301, + "learning_rate": 2.3880597014925373e-05, + "loss": 0.037, + "step": 16 + }, + { + "epoch": 0.1266294227188082, + "grad_norm": 0.03483457863330841, + "learning_rate": 2.537313432835821e-05, + "loss": 0.0341, + "step": 17 + }, + { + "epoch": 0.1340782122905028, + "grad_norm": 0.030281590297818184, + "learning_rate": 2.6865671641791047e-05, + "loss": 0.0325, + "step": 18 + }, + { + "epoch": 0.14152700186219738, + "grad_norm": 0.031204624101519585, + "learning_rate": 2.835820895522388e-05, + "loss": 0.0322, + "step": 19 + }, + { + "epoch": 0.148975791433892, + "grad_norm": 0.02826506830751896, + "learning_rate": 2.9850746268656714e-05, + "loss": 0.0306, + "step": 20 + }, + { + "epoch": 0.148975791433892, + "eval_loss": 0.034415338188409805, + "eval_runtime": 1.4411, + "eval_samples_per_second": 5.551, + "eval_steps_per_second": 1.388, + "step": 20 + }, + { + "epoch": 0.1564245810055866, + "grad_norm": 0.026216600090265274, + "learning_rate": 3.1343283582089554e-05, + "loss": 0.0285, + "step": 21 + }, + { + "epoch": 0.16387337057728119, + "grad_norm": 0.02381393499672413, + "learning_rate": 3.283582089552239e-05, + "loss": 0.0287, + "step": 22 + }, + { + "epoch": 0.1713221601489758, + "grad_norm": 0.022704176604747772, + "learning_rate": 3.432835820895522e-05, + "loss": 0.0301, + "step": 23 + }, + { + "epoch": 0.1787709497206704, + "grad_norm": 0.021720608696341515, + "learning_rate": 3.582089552238806e-05, + "loss": 0.0253, + "step": 24 + }, + { + "epoch": 0.186219739292365, + "grad_norm": 0.024361221119761467, + "learning_rate": 3.73134328358209e-05, + "loss": 0.0325, + "step": 25 + }, + { + "epoch": 0.186219739292365, + "eval_loss": 0.030194800347089767, + "eval_runtime": 1.4369, + "eval_samples_per_second": 5.567, + "eval_steps_per_second": 1.392, + "step": 25 + }, + { + "epoch": 0.19366852886405958, + "grad_norm": 0.022885512560606003, + "learning_rate": 3.8805970149253736e-05, + "loss": 0.0249, + "step": 26 + }, + { + "epoch": 0.2011173184357542, + "grad_norm": 0.020435787737369537, + "learning_rate": 4.029850746268657e-05, + "loss": 0.0261, + "step": 27 + }, + { + "epoch": 0.2085661080074488, + "grad_norm": 0.022251691669225693, + "learning_rate": 4.1791044776119404e-05, + "loss": 0.0248, + "step": 28 + }, + { + "epoch": 0.21601489757914338, + "grad_norm": 0.021197110414505005, + "learning_rate": 4.328358208955224e-05, + "loss": 0.0276, + "step": 29 + }, + { + "epoch": 0.22346368715083798, + "grad_norm": 0.018538013100624084, + "learning_rate": 4.477611940298508e-05, + "loss": 0.022, + "step": 30 + }, + { + "epoch": 0.22346368715083798, + "eval_loss": 0.026628218591213226, + "eval_runtime": 1.4417, + "eval_samples_per_second": 5.549, + "eval_steps_per_second": 1.387, + "step": 30 + }, + { + "epoch": 0.2309124767225326, + "grad_norm": 0.020026978105306625, + "learning_rate": 4.626865671641791e-05, + "loss": 0.0272, + "step": 31 + }, + { + "epoch": 0.2383612662942272, + "grad_norm": 0.01873094029724598, + "learning_rate": 4.7761194029850745e-05, + "loss": 0.021, + "step": 32 + }, + { + "epoch": 0.24581005586592178, + "grad_norm": 0.017488041892647743, + "learning_rate": 4.9253731343283586e-05, + "loss": 0.0244, + "step": 33 + }, + { + "epoch": 0.2532588454376164, + "grad_norm": 0.017643896862864494, + "learning_rate": 5.074626865671642e-05, + "loss": 0.0223, + "step": 34 + }, + { + "epoch": 0.260707635009311, + "grad_norm": 0.018672725185751915, + "learning_rate": 5.223880597014925e-05, + "loss": 0.0251, + "step": 35 + }, + { + "epoch": 0.260707635009311, + "eval_loss": 0.024099677801132202, + "eval_runtime": 1.4381, + "eval_samples_per_second": 5.563, + "eval_steps_per_second": 1.391, + "step": 35 + }, + { + "epoch": 0.2681564245810056, + "grad_norm": 0.018858693540096283, + "learning_rate": 5.373134328358209e-05, + "loss": 0.0227, + "step": 36 + }, + { + "epoch": 0.2756052141527002, + "grad_norm": 0.019595002755522728, + "learning_rate": 5.5223880597014934e-05, + "loss": 0.0243, + "step": 37 + }, + { + "epoch": 0.28305400372439476, + "grad_norm": 0.01909262128174305, + "learning_rate": 5.671641791044776e-05, + "loss": 0.0237, + "step": 38 + }, + { + "epoch": 0.2905027932960894, + "grad_norm": 0.017081189900636673, + "learning_rate": 5.82089552238806e-05, + "loss": 0.0203, + "step": 39 + }, + { + "epoch": 0.297951582867784, + "grad_norm": 0.0175361055880785, + "learning_rate": 5.970149253731343e-05, + "loss": 0.0223, + "step": 40 + }, + { + "epoch": 0.297951582867784, + "eval_loss": 0.02209121733903885, + "eval_runtime": 1.4363, + "eval_samples_per_second": 5.57, + "eval_steps_per_second": 1.392, + "step": 40 + }, + { + "epoch": 0.3054003724394786, + "grad_norm": 0.016245298087596893, + "learning_rate": 6.119402985074628e-05, + "loss": 0.0203, + "step": 41 + }, + { + "epoch": 0.3128491620111732, + "grad_norm": 0.01772252470254898, + "learning_rate": 6.268656716417911e-05, + "loss": 0.0202, + "step": 42 + }, + { + "epoch": 0.3202979515828678, + "grad_norm": 0.02075192704796791, + "learning_rate": 6.417910447761194e-05, + "loss": 0.0251, + "step": 43 + }, + { + "epoch": 0.32774674115456237, + "grad_norm": 0.018841199576854706, + "learning_rate": 6.567164179104478e-05, + "loss": 0.0247, + "step": 44 + }, + { + "epoch": 0.33519553072625696, + "grad_norm": 0.01515793427824974, + "learning_rate": 6.716417910447762e-05, + "loss": 0.0174, + "step": 45 + }, + { + "epoch": 0.33519553072625696, + "eval_loss": 0.020781710743904114, + "eval_runtime": 1.4378, + "eval_samples_per_second": 5.564, + "eval_steps_per_second": 1.391, + "step": 45 + }, + { + "epoch": 0.3426443202979516, + "grad_norm": 0.018361978232860565, + "learning_rate": 6.865671641791044e-05, + "loss": 0.023, + "step": 46 + }, + { + "epoch": 0.3500931098696462, + "grad_norm": 0.016451209783554077, + "learning_rate": 7.014925373134329e-05, + "loss": 0.0204, + "step": 47 + }, + { + "epoch": 0.3575418994413408, + "grad_norm": 0.01562649942934513, + "learning_rate": 7.164179104477612e-05, + "loss": 0.0226, + "step": 48 + }, + { + "epoch": 0.3649906890130354, + "grad_norm": 0.015697643160820007, + "learning_rate": 7.313432835820896e-05, + "loss": 0.0187, + "step": 49 + }, + { + "epoch": 0.37243947858473, + "grad_norm": 0.017037643119692802, + "learning_rate": 7.46268656716418e-05, + "loss": 0.0218, + "step": 50 + }, + { + "epoch": 0.37243947858473, + "eval_loss": 0.019308224320411682, + "eval_runtime": 1.4373, + "eval_samples_per_second": 5.566, + "eval_steps_per_second": 1.391, + "step": 50 + }, + { + "epoch": 0.37988826815642457, + "grad_norm": 0.014953936450183392, + "learning_rate": 7.611940298507463e-05, + "loss": 0.0201, + "step": 51 + }, + { + "epoch": 0.38733705772811916, + "grad_norm": 0.01794457994401455, + "learning_rate": 7.761194029850747e-05, + "loss": 0.0199, + "step": 52 + }, + { + "epoch": 0.3947858472998138, + "grad_norm": 0.01501704752445221, + "learning_rate": 7.910447761194029e-05, + "loss": 0.0194, + "step": 53 + }, + { + "epoch": 0.4022346368715084, + "grad_norm": 0.013597317971289158, + "learning_rate": 8.059701492537314e-05, + "loss": 0.0183, + "step": 54 + }, + { + "epoch": 0.409683426443203, + "grad_norm": 0.01587977632880211, + "learning_rate": 8.208955223880597e-05, + "loss": 0.0208, + "step": 55 + }, + { + "epoch": 0.409683426443203, + "eval_loss": 0.018861299380660057, + "eval_runtime": 1.4302, + "eval_samples_per_second": 5.593, + "eval_steps_per_second": 1.398, + "step": 55 + }, + { + "epoch": 0.4171322160148976, + "grad_norm": 0.015953266993165016, + "learning_rate": 8.358208955223881e-05, + "loss": 0.0207, + "step": 56 + }, + { + "epoch": 0.4245810055865922, + "grad_norm": 0.015235635451972485, + "learning_rate": 8.507462686567164e-05, + "loss": 0.0183, + "step": 57 + }, + { + "epoch": 0.43202979515828677, + "grad_norm": 0.017862174659967422, + "learning_rate": 8.656716417910447e-05, + "loss": 0.0197, + "step": 58 + }, + { + "epoch": 0.43947858472998136, + "grad_norm": 0.014502939768135548, + "learning_rate": 8.805970149253732e-05, + "loss": 0.0176, + "step": 59 + }, + { + "epoch": 0.44692737430167595, + "grad_norm": 0.015043440274894238, + "learning_rate": 8.955223880597016e-05, + "loss": 0.0193, + "step": 60 + }, + { + "epoch": 0.44692737430167595, + "eval_loss": 0.01754169538617134, + "eval_runtime": 1.4329, + "eval_samples_per_second": 5.583, + "eval_steps_per_second": 1.396, + "step": 60 + }, + { + "epoch": 0.4543761638733706, + "grad_norm": 0.018948890268802643, + "learning_rate": 9.104477611940299e-05, + "loss": 0.0195, + "step": 61 + }, + { + "epoch": 0.4618249534450652, + "grad_norm": 0.016225658357143402, + "learning_rate": 9.253731343283582e-05, + "loss": 0.0175, + "step": 62 + }, + { + "epoch": 0.4692737430167598, + "grad_norm": 0.015083406120538712, + "learning_rate": 9.402985074626867e-05, + "loss": 0.0184, + "step": 63 + }, + { + "epoch": 0.4767225325884544, + "grad_norm": 0.018532050773501396, + "learning_rate": 9.552238805970149e-05, + "loss": 0.0176, + "step": 64 + }, + { + "epoch": 0.48417132216014896, + "grad_norm": 0.01437497977167368, + "learning_rate": 9.701492537313434e-05, + "loss": 0.0178, + "step": 65 + }, + { + "epoch": 0.48417132216014896, + "eval_loss": 0.016698401421308517, + "eval_runtime": 1.4373, + "eval_samples_per_second": 5.566, + "eval_steps_per_second": 1.392, + "step": 65 + }, + { + "epoch": 0.49162011173184356, + "grad_norm": 0.014234500005841255, + "learning_rate": 9.850746268656717e-05, + "loss": 0.0152, + "step": 66 + }, + { + "epoch": 0.49906890130353815, + "grad_norm": 0.017935309559106827, + "learning_rate": 0.0001, + "loss": 0.0206, + "step": 67 + }, + { + "epoch": 0.5065176908752328, + "grad_norm": 0.018849356099963188, + "learning_rate": 9.999932141516873e-05, + "loss": 0.0233, + "step": 68 + }, + { + "epoch": 0.5139664804469274, + "grad_norm": 0.01653886027634144, + "learning_rate": 9.999728567909403e-05, + "loss": 0.019, + "step": 69 + }, + { + "epoch": 0.521415270018622, + "grad_norm": 0.01674569770693779, + "learning_rate": 9.999389284703265e-05, + "loss": 0.017, + "step": 70 + }, + { + "epoch": 0.521415270018622, + "eval_loss": 0.01588435284793377, + "eval_runtime": 1.4379, + "eval_samples_per_second": 5.564, + "eval_steps_per_second": 1.391, + "step": 70 + }, + { + "epoch": 0.5288640595903166, + "grad_norm": 0.015146244317293167, + "learning_rate": 9.99891430110776e-05, + "loss": 0.0195, + "step": 71 + }, + { + "epoch": 0.5363128491620112, + "grad_norm": 0.015625057741999626, + "learning_rate": 9.998303630015553e-05, + "loss": 0.0156, + "step": 72 + }, + { + "epoch": 0.5437616387337058, + "grad_norm": 0.016596440225839615, + "learning_rate": 9.99755728800233e-05, + "loss": 0.0173, + "step": 73 + }, + { + "epoch": 0.5512104283054003, + "grad_norm": 0.015392429195344448, + "learning_rate": 9.996675295326346e-05, + "loss": 0.0167, + "step": 74 + }, + { + "epoch": 0.5586592178770949, + "grad_norm": 0.018201902508735657, + "learning_rate": 9.995657675927874e-05, + "loss": 0.0199, + "step": 75 + }, + { + "epoch": 0.5586592178770949, + "eval_loss": 0.015027389861643314, + "eval_runtime": 1.4361, + "eval_samples_per_second": 5.571, + "eval_steps_per_second": 1.393, + "step": 75 + }, + { + "epoch": 0.5661080074487895, + "grad_norm": 0.014863165095448494, + "learning_rate": 9.994504457428558e-05, + "loss": 0.0152, + "step": 76 + }, + { + "epoch": 0.5735567970204841, + "grad_norm": 0.016625454649329185, + "learning_rate": 9.993215671130662e-05, + "loss": 0.018, + "step": 77 + }, + { + "epoch": 0.5810055865921788, + "grad_norm": 0.018686765804886818, + "learning_rate": 9.991791352016217e-05, + "loss": 0.0154, + "step": 78 + }, + { + "epoch": 0.5884543761638734, + "grad_norm": 0.014789015986025333, + "learning_rate": 9.99023153874608e-05, + "loss": 0.0166, + "step": 79 + }, + { + "epoch": 0.595903165735568, + "grad_norm": 0.0178856011480093, + "learning_rate": 9.988536273658876e-05, + "loss": 0.0185, + "step": 80 + }, + { + "epoch": 0.595903165735568, + "eval_loss": 0.014966826885938644, + "eval_runtime": 1.4396, + "eval_samples_per_second": 5.557, + "eval_steps_per_second": 1.389, + "step": 80 + }, + { + "epoch": 0.6033519553072626, + "grad_norm": 0.015204566530883312, + "learning_rate": 9.986705602769847e-05, + "loss": 0.0168, + "step": 81 + }, + { + "epoch": 0.6108007448789572, + "grad_norm": 0.014793118461966515, + "learning_rate": 9.984739575769618e-05, + "loss": 0.0148, + "step": 82 + }, + { + "epoch": 0.6182495344506518, + "grad_norm": 0.015269034542143345, + "learning_rate": 9.982638246022831e-05, + "loss": 0.0159, + "step": 83 + }, + { + "epoch": 0.6256983240223464, + "grad_norm": 0.014847309328615665, + "learning_rate": 9.980401670566706e-05, + "loss": 0.0141, + "step": 84 + }, + { + "epoch": 0.633147113594041, + "grad_norm": 0.016128500923514366, + "learning_rate": 9.978029910109491e-05, + "loss": 0.0167, + "step": 85 + }, + { + "epoch": 0.633147113594041, + "eval_loss": 0.014796840026974678, + "eval_runtime": 1.4367, + "eval_samples_per_second": 5.568, + "eval_steps_per_second": 1.392, + "step": 85 + }, + { + "epoch": 0.6405959031657356, + "grad_norm": 0.01574038155376911, + "learning_rate": 9.975523029028811e-05, + "loss": 0.0181, + "step": 86 + }, + { + "epoch": 0.6480446927374302, + "grad_norm": 0.01712299883365631, + "learning_rate": 9.972881095369926e-05, + "loss": 0.0175, + "step": 87 + }, + { + "epoch": 0.6554934823091247, + "grad_norm": 0.015495178289711475, + "learning_rate": 9.97010418084388e-05, + "loss": 0.0173, + "step": 88 + }, + { + "epoch": 0.6629422718808193, + "grad_norm": 0.014881464652717113, + "learning_rate": 9.967192360825557e-05, + "loss": 0.0138, + "step": 89 + }, + { + "epoch": 0.6703910614525139, + "grad_norm": 0.017920853570103645, + "learning_rate": 9.964145714351631e-05, + "loss": 0.0159, + "step": 90 + }, + { + "epoch": 0.6703910614525139, + "eval_loss": 0.014338547363877296, + "eval_runtime": 1.4382, + "eval_samples_per_second": 5.562, + "eval_steps_per_second": 1.391, + "step": 90 + }, + { + "epoch": 0.6778398510242085, + "grad_norm": 0.015727294608950615, + "learning_rate": 9.960964324118426e-05, + "loss": 0.0169, + "step": 91 + }, + { + "epoch": 0.6852886405959032, + "grad_norm": 0.014029323123395443, + "learning_rate": 9.95764827647967e-05, + "loss": 0.0152, + "step": 92 + }, + { + "epoch": 0.6927374301675978, + "grad_norm": 0.014262300916016102, + "learning_rate": 9.954197661444147e-05, + "loss": 0.0135, + "step": 93 + }, + { + "epoch": 0.7001862197392924, + "grad_norm": 0.014806578867137432, + "learning_rate": 9.950612572673255e-05, + "loss": 0.0177, + "step": 94 + }, + { + "epoch": 0.707635009310987, + "grad_norm": 0.015095021575689316, + "learning_rate": 9.946893107478473e-05, + "loss": 0.0153, + "step": 95 + }, + { + "epoch": 0.707635009310987, + "eval_loss": 0.013785483315587044, + "eval_runtime": 1.4349, + "eval_samples_per_second": 5.575, + "eval_steps_per_second": 1.394, + "step": 95 + }, + { + "epoch": 0.7150837988826816, + "grad_norm": 0.015151964500546455, + "learning_rate": 9.943039366818704e-05, + "loss": 0.0154, + "step": 96 + }, + { + "epoch": 0.7225325884543762, + "grad_norm": 0.0159757137298584, + "learning_rate": 9.939051455297547e-05, + "loss": 0.0163, + "step": 97 + }, + { + "epoch": 0.7299813780260708, + "grad_norm": 0.016340378671884537, + "learning_rate": 9.934929481160455e-05, + "loss": 0.0146, + "step": 98 + }, + { + "epoch": 0.7374301675977654, + "grad_norm": 0.01402219571173191, + "learning_rate": 9.93067355629179e-05, + "loss": 0.0139, + "step": 99 + }, + { + "epoch": 0.74487895716946, + "grad_norm": 0.014446019195020199, + "learning_rate": 9.926283796211795e-05, + "loss": 0.0144, + "step": 100 + }, + { + "epoch": 0.74487895716946, + "eval_loss": 0.013594349846243858, + "eval_runtime": 1.4357, + "eval_samples_per_second": 5.572, + "eval_steps_per_second": 1.393, + "step": 100 + }, + { + "epoch": 0.7523277467411545, + "grad_norm": 0.015092196874320507, + "learning_rate": 9.921760320073456e-05, + "loss": 0.0147, + "step": 101 + }, + { + "epoch": 0.7597765363128491, + "grad_norm": 0.01714695431292057, + "learning_rate": 9.917103250659262e-05, + "loss": 0.0157, + "step": 102 + }, + { + "epoch": 0.7672253258845437, + "grad_norm": 0.0149180693551898, + "learning_rate": 9.91231271437788e-05, + "loss": 0.0152, + "step": 103 + }, + { + "epoch": 0.7746741154562383, + "grad_norm": 0.014800818637013435, + "learning_rate": 9.907388841260723e-05, + "loss": 0.0121, + "step": 104 + }, + { + "epoch": 0.7821229050279329, + "grad_norm": 0.015027027577161789, + "learning_rate": 9.902331764958413e-05, + "loss": 0.0141, + "step": 105 + }, + { + "epoch": 0.7821229050279329, + "eval_loss": 0.013118552044034004, + "eval_runtime": 1.4365, + "eval_samples_per_second": 5.569, + "eval_steps_per_second": 1.392, + "step": 105 + }, + { + "epoch": 0.7895716945996276, + "grad_norm": 0.013868089765310287, + "learning_rate": 9.89714162273716e-05, + "loss": 0.0141, + "step": 106 + }, + { + "epoch": 0.7970204841713222, + "grad_norm": 0.012074757367372513, + "learning_rate": 9.891818555475037e-05, + "loss": 0.0127, + "step": 107 + }, + { + "epoch": 0.8044692737430168, + "grad_norm": 0.013333679176867008, + "learning_rate": 9.886362707658152e-05, + "loss": 0.0146, + "step": 108 + }, + { + "epoch": 0.8119180633147114, + "grad_norm": 0.013119596056640148, + "learning_rate": 9.880774227376726e-05, + "loss": 0.0136, + "step": 109 + }, + { + "epoch": 0.819366852886406, + "grad_norm": 0.01487047877162695, + "learning_rate": 9.87505326632108e-05, + "loss": 0.0156, + "step": 110 + }, + { + "epoch": 0.819366852886406, + "eval_loss": 0.01288369670510292, + "eval_runtime": 1.4366, + "eval_samples_per_second": 5.569, + "eval_steps_per_second": 1.392, + "step": 110 + }, + { + "epoch": 0.8268156424581006, + "grad_norm": 0.013432069681584835, + "learning_rate": 9.869199979777505e-05, + "loss": 0.0136, + "step": 111 + }, + { + "epoch": 0.8342644320297952, + "grad_norm": 0.014393101446330547, + "learning_rate": 9.863214526624065e-05, + "loss": 0.013, + "step": 112 + }, + { + "epoch": 0.8417132216014898, + "grad_norm": 0.01358636375516653, + "learning_rate": 9.857097069326267e-05, + "loss": 0.0138, + "step": 113 + }, + { + "epoch": 0.8491620111731844, + "grad_norm": 0.015925254672765732, + "learning_rate": 9.850847773932656e-05, + "loss": 0.0162, + "step": 114 + }, + { + "epoch": 0.8566108007448789, + "grad_norm": 0.014131754636764526, + "learning_rate": 9.844466810070319e-05, + "loss": 0.0116, + "step": 115 + }, + { + "epoch": 0.8566108007448789, + "eval_loss": 0.012635525315999985, + "eval_runtime": 1.4358, + "eval_samples_per_second": 5.572, + "eval_steps_per_second": 1.393, + "step": 115 + }, + { + "epoch": 0.8640595903165735, + "grad_norm": 0.01492136251181364, + "learning_rate": 9.837954350940266e-05, + "loss": 0.0137, + "step": 116 + }, + { + "epoch": 0.8715083798882681, + "grad_norm": 0.014648137614130974, + "learning_rate": 9.831310573312736e-05, + "loss": 0.0143, + "step": 117 + }, + { + "epoch": 0.8789571694599627, + "grad_norm": 0.013955476693809032, + "learning_rate": 9.824535657522398e-05, + "loss": 0.0143, + "step": 118 + }, + { + "epoch": 0.8864059590316573, + "grad_norm": 0.012400391511619091, + "learning_rate": 9.817629787463456e-05, + "loss": 0.0126, + "step": 119 + }, + { + "epoch": 0.8938547486033519, + "grad_norm": 0.01355024054646492, + "learning_rate": 9.810593150584658e-05, + "loss": 0.0154, + "step": 120 + }, + { + "epoch": 0.8938547486033519, + "eval_loss": 0.012325276620686054, + "eval_runtime": 1.4403, + "eval_samples_per_second": 5.554, + "eval_steps_per_second": 1.389, + "step": 120 + }, + { + "epoch": 0.9013035381750466, + "grad_norm": 0.01315679494291544, + "learning_rate": 9.8034259378842e-05, + "loss": 0.0145, + "step": 121 + }, + { + "epoch": 0.9087523277467412, + "grad_norm": 0.012878386303782463, + "learning_rate": 9.796128343904562e-05, + "loss": 0.0158, + "step": 122 + }, + { + "epoch": 0.9162011173184358, + "grad_norm": 0.014513295143842697, + "learning_rate": 9.788700566727205e-05, + "loss": 0.0175, + "step": 123 + }, + { + "epoch": 0.9236499068901304, + "grad_norm": 0.013689885847270489, + "learning_rate": 9.781142807967205e-05, + "loss": 0.0134, + "step": 124 + }, + { + "epoch": 0.931098696461825, + "grad_norm": 0.01449587196111679, + "learning_rate": 9.773455272767779e-05, + "loss": 0.0116, + "step": 125 + }, + { + "epoch": 0.931098696461825, + "eval_loss": 0.012087415903806686, + "eval_runtime": 1.4375, + "eval_samples_per_second": 5.565, + "eval_steps_per_second": 1.391, + "step": 125 + }, + { + "epoch": 0.9385474860335196, + "grad_norm": 0.01300882175564766, + "learning_rate": 9.765638169794719e-05, + "loss": 0.0133, + "step": 126 + }, + { + "epoch": 0.9459962756052142, + "grad_norm": 0.015073812566697598, + "learning_rate": 9.757691711230727e-05, + "loss": 0.0162, + "step": 127 + }, + { + "epoch": 0.9534450651769087, + "grad_norm": 0.01347210630774498, + "learning_rate": 9.74961611276965e-05, + "loss": 0.0136, + "step": 128 + }, + { + "epoch": 0.9608938547486033, + "grad_norm": 0.015565160661935806, + "learning_rate": 9.741411593610635e-05, + "loss": 0.0169, + "step": 129 + }, + { + "epoch": 0.9683426443202979, + "grad_norm": 0.015679839998483658, + "learning_rate": 9.733078376452171e-05, + "loss": 0.0167, + "step": 130 + }, + { + "epoch": 0.9683426443202979, + "eval_loss": 0.011777696199715137, + "eval_runtime": 1.4383, + "eval_samples_per_second": 5.562, + "eval_steps_per_second": 1.39, + "step": 130 + }, + { + "epoch": 0.9757914338919925, + "grad_norm": 0.014082420617341995, + "learning_rate": 9.724616687486048e-05, + "loss": 0.0148, + "step": 131 + }, + { + "epoch": 0.9832402234636871, + "grad_norm": 0.01425888855010271, + "learning_rate": 9.716026756391217e-05, + "loss": 0.0121, + "step": 132 + }, + { + "epoch": 0.9906890130353817, + "grad_norm": 0.011250928975641727, + "learning_rate": 9.707308816327557e-05, + "loss": 0.0112, + "step": 133 + }, + { + "epoch": 0.9981378026070763, + "grad_norm": 0.01310779619961977, + "learning_rate": 9.698463103929542e-05, + "loss": 0.0131, + "step": 134 + }, + { + "epoch": 1.005586592178771, + "grad_norm": 0.02536887675523758, + "learning_rate": 9.689489859299823e-05, + "loss": 0.0202, + "step": 135 + }, + { + "epoch": 1.005586592178771, + "eval_loss": 0.011471766978502274, + "eval_runtime": 1.4396, + "eval_samples_per_second": 5.557, + "eval_steps_per_second": 1.389, + "step": 135 + }, + { + "epoch": 1.0130353817504656, + "grad_norm": 0.012562491931021214, + "learning_rate": 9.680389326002708e-05, + "loss": 0.0118, + "step": 136 + }, + { + "epoch": 1.0204841713221602, + "grad_norm": 0.014537862502038479, + "learning_rate": 9.671161751057551e-05, + "loss": 0.013, + "step": 137 + }, + { + "epoch": 1.0279329608938548, + "grad_norm": 0.015786582604050636, + "learning_rate": 9.661807384932047e-05, + "loss": 0.013, + "step": 138 + }, + { + "epoch": 1.0353817504655494, + "grad_norm": 0.014025253243744373, + "learning_rate": 9.652326481535435e-05, + "loss": 0.0122, + "step": 139 + }, + { + "epoch": 1.042830540037244, + "grad_norm": 0.013662833720445633, + "learning_rate": 9.642719298211602e-05, + "loss": 0.0126, + "step": 140 + }, + { + "epoch": 1.042830540037244, + "eval_loss": 0.01144600659608841, + "eval_runtime": 1.4367, + "eval_samples_per_second": 5.568, + "eval_steps_per_second": 1.392, + "step": 140 + }, + { + "epoch": 1.0502793296089385, + "grad_norm": 0.013206909410655499, + "learning_rate": 9.632986095732107e-05, + "loss": 0.0105, + "step": 141 + }, + { + "epoch": 1.0577281191806331, + "grad_norm": 0.015509011223912239, + "learning_rate": 9.623127138289087e-05, + "loss": 0.0132, + "step": 142 + }, + { + "epoch": 1.0651769087523277, + "grad_norm": 0.013073816895484924, + "learning_rate": 9.613142693488106e-05, + "loss": 0.0109, + "step": 143 + }, + { + "epoch": 1.0726256983240223, + "grad_norm": 0.012292170897126198, + "learning_rate": 9.603033032340875e-05, + "loss": 0.01, + "step": 144 + }, + { + "epoch": 1.080074487895717, + "grad_norm": 0.013037611730396748, + "learning_rate": 9.5927984292579e-05, + "loss": 0.0122, + "step": 145 + }, + { + "epoch": 1.080074487895717, + "eval_loss": 0.011396262794733047, + "eval_runtime": 1.4375, + "eval_samples_per_second": 5.565, + "eval_steps_per_second": 1.391, + "step": 145 + }, + { + "epoch": 1.0875232774674115, + "grad_norm": 0.012869670055806637, + "learning_rate": 9.582439162041037e-05, + "loss": 0.0113, + "step": 146 + }, + { + "epoch": 1.094972067039106, + "grad_norm": 0.013465145602822304, + "learning_rate": 9.571955511875954e-05, + "loss": 0.0104, + "step": 147 + }, + { + "epoch": 1.1024208566108007, + "grad_norm": 0.013149636797606945, + "learning_rate": 9.561347763324484e-05, + "loss": 0.0115, + "step": 148 + }, + { + "epoch": 1.1098696461824953, + "grad_norm": 0.014216437004506588, + "learning_rate": 9.550616204316922e-05, + "loss": 0.0125, + "step": 149 + }, + { + "epoch": 1.1173184357541899, + "grad_norm": 0.014135920442640781, + "learning_rate": 9.539761126144193e-05, + "loss": 0.0126, + "step": 150 + }, + { + "epoch": 1.1173184357541899, + "eval_loss": 0.011353620328009129, + "eval_runtime": 1.4376, + "eval_samples_per_second": 5.565, + "eval_steps_per_second": 1.391, + "step": 150 + }, + { + "epoch": 1.1247672253258845, + "grad_norm": 0.012843444012105465, + "learning_rate": 9.528782823449954e-05, + "loss": 0.0119, + "step": 151 + }, + { + "epoch": 1.132216014897579, + "grad_norm": 0.01245823036879301, + "learning_rate": 9.517681594222589e-05, + "loss": 0.0111, + "step": 152 + }, + { + "epoch": 1.1396648044692737, + "grad_norm": 0.014410407282412052, + "learning_rate": 9.506457739787132e-05, + "loss": 0.014, + "step": 153 + }, + { + "epoch": 1.1471135940409685, + "grad_norm": 0.012295592576265335, + "learning_rate": 9.495111564797074e-05, + "loss": 0.0127, + "step": 154 + }, + { + "epoch": 1.1545623836126628, + "grad_norm": 0.011407362297177315, + "learning_rate": 9.483643377226107e-05, + "loss": 0.0097, + "step": 155 + }, + { + "epoch": 1.1545623836126628, + "eval_loss": 0.011653873138129711, + "eval_runtime": 1.4458, + "eval_samples_per_second": 5.533, + "eval_steps_per_second": 1.383, + "step": 155 + }, + { + "epoch": 1.1620111731843576, + "grad_norm": 0.011912810616195202, + "learning_rate": 9.472053488359757e-05, + "loss": 0.0104, + "step": 156 + }, + { + "epoch": 1.169459962756052, + "grad_norm": 0.01219869963824749, + "learning_rate": 9.460342212786932e-05, + "loss": 0.0109, + "step": 157 + }, + { + "epoch": 1.1769087523277468, + "grad_norm": 0.013597341254353523, + "learning_rate": 9.448509868391395e-05, + "loss": 0.0115, + "step": 158 + }, + { + "epoch": 1.1843575418994414, + "grad_norm": 0.012947805225849152, + "learning_rate": 9.43655677634312e-05, + "loss": 0.0117, + "step": 159 + }, + { + "epoch": 1.191806331471136, + "grad_norm": 0.013273722492158413, + "learning_rate": 9.424483261089584e-05, + "loss": 0.01, + "step": 160 + }, + { + "epoch": 1.191806331471136, + "eval_loss": 0.011705568060278893, + "eval_runtime": 1.4386, + "eval_samples_per_second": 5.561, + "eval_steps_per_second": 1.39, + "step": 160 + }, + { + "epoch": 1.1992551210428306, + "grad_norm": 0.012726851738989353, + "learning_rate": 9.412289650346961e-05, + "loss": 0.0123, + "step": 161 + }, + { + "epoch": 1.2067039106145252, + "grad_norm": 0.011661297641694546, + "learning_rate": 9.399976275091223e-05, + "loss": 0.0102, + "step": 162 + }, + { + "epoch": 1.2141527001862198, + "grad_norm": 0.013693406246602535, + "learning_rate": 9.387543469549156e-05, + "loss": 0.0122, + "step": 163 + }, + { + "epoch": 1.2216014897579144, + "grad_norm": 0.013621781021356583, + "learning_rate": 9.374991571189291e-05, + "loss": 0.0139, + "step": 164 + }, + { + "epoch": 1.229050279329609, + "grad_norm": 0.012945972383022308, + "learning_rate": 9.362320920712739e-05, + "loss": 0.0112, + "step": 165 + }, + { + "epoch": 1.229050279329609, + "eval_loss": 0.011136394925415516, + "eval_runtime": 1.4375, + "eval_samples_per_second": 5.565, + "eval_steps_per_second": 1.391, + "step": 165 + }, + { + "epoch": 1.2364990689013036, + "grad_norm": 0.011862865649163723, + "learning_rate": 9.349531862043952e-05, + "loss": 0.011, + "step": 166 + }, + { + "epoch": 1.2439478584729982, + "grad_norm": 0.011740331538021564, + "learning_rate": 9.336624742321375e-05, + "loss": 0.0106, + "step": 167 + }, + { + "epoch": 1.2513966480446927, + "grad_norm": 0.012267290614545345, + "learning_rate": 9.323599911888037e-05, + "loss": 0.0107, + "step": 168 + }, + { + "epoch": 1.2588454376163873, + "grad_norm": 0.014287952333688736, + "learning_rate": 9.310457724282034e-05, + "loss": 0.012, + "step": 169 + }, + { + "epoch": 1.266294227188082, + "grad_norm": 0.011434430256485939, + "learning_rate": 9.297198536226928e-05, + "loss": 0.0102, + "step": 170 + }, + { + "epoch": 1.266294227188082, + "eval_loss": 0.010175629518926144, + "eval_runtime": 1.4373, + "eval_samples_per_second": 5.566, + "eval_steps_per_second": 1.391, + "step": 170 + }, + { + "epoch": 1.2737430167597765, + "grad_norm": 0.012829802930355072, + "learning_rate": 9.283822707622075e-05, + "loss": 0.0109, + "step": 171 + }, + { + "epoch": 1.2811918063314711, + "grad_norm": 0.013556810095906258, + "learning_rate": 9.270330601532855e-05, + "loss": 0.0104, + "step": 172 + }, + { + "epoch": 1.2886405959031657, + "grad_norm": 0.013354483060538769, + "learning_rate": 9.256722584180806e-05, + "loss": 0.0123, + "step": 173 + }, + { + "epoch": 1.2960893854748603, + "grad_norm": 0.014078876003623009, + "learning_rate": 9.242999024933694e-05, + "loss": 0.0119, + "step": 174 + }, + { + "epoch": 1.303538175046555, + "grad_norm": 0.013081400655210018, + "learning_rate": 9.229160296295488e-05, + "loss": 0.0114, + "step": 175 + }, + { + "epoch": 1.303538175046555, + "eval_loss": 0.009631536900997162, + "eval_runtime": 1.437, + "eval_samples_per_second": 5.567, + "eval_steps_per_second": 1.392, + "step": 175 + }, + { + "epoch": 1.3109869646182495, + "grad_norm": 0.012357114814221859, + "learning_rate": 9.215206773896237e-05, + "loss": 0.0109, + "step": 176 + }, + { + "epoch": 1.318435754189944, + "grad_norm": 0.014305895194411278, + "learning_rate": 9.201138836481891e-05, + "loss": 0.0121, + "step": 177 + }, + { + "epoch": 1.3258845437616387, + "grad_norm": 0.012040902860462666, + "learning_rate": 9.186956865904003e-05, + "loss": 0.0083, + "step": 178 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.015873685479164124, + "learning_rate": 9.172661247109382e-05, + "loss": 0.014, + "step": 179 + }, + { + "epoch": 1.3407821229050279, + "grad_norm": 0.013044324703514576, + "learning_rate": 9.158252368129628e-05, + "loss": 0.0109, + "step": 180 + }, + { + "epoch": 1.3407821229050279, + "eval_loss": 0.009440160356462002, + "eval_runtime": 1.442, + "eval_samples_per_second": 5.548, + "eval_steps_per_second": 1.387, + "step": 180 + }, + { + "epoch": 1.3482309124767227, + "grad_norm": 0.013596507720649242, + "learning_rate": 9.143730620070608e-05, + "loss": 0.0143, + "step": 181 + }, + { + "epoch": 1.355679702048417, + "grad_norm": 0.011726610362529755, + "learning_rate": 9.129096397101843e-05, + "loss": 0.0087, + "step": 182 + }, + { + "epoch": 1.3631284916201118, + "grad_norm": 0.014253761619329453, + "learning_rate": 9.114350096445803e-05, + "loss": 0.0111, + "step": 183 + }, + { + "epoch": 1.3705772811918062, + "grad_norm": 0.01339323353022337, + "learning_rate": 9.099492118367123e-05, + "loss": 0.0093, + "step": 184 + }, + { + "epoch": 1.378026070763501, + "grad_norm": 0.015108811669051647, + "learning_rate": 9.084522866161746e-05, + "loss": 0.0119, + "step": 185 + }, + { + "epoch": 1.378026070763501, + "eval_loss": 0.009605261497199535, + "eval_runtime": 1.438, + "eval_samples_per_second": 5.563, + "eval_steps_per_second": 1.391, + "step": 185 + }, + { + "epoch": 1.3854748603351954, + "grad_norm": 0.013058802112936974, + "learning_rate": 9.069442746145971e-05, + "loss": 0.0115, + "step": 186 + }, + { + "epoch": 1.3929236499068902, + "grad_norm": 0.01210273988544941, + "learning_rate": 9.054252167645425e-05, + "loss": 0.0097, + "step": 187 + }, + { + "epoch": 1.4003724394785848, + "grad_norm": 0.012483786791563034, + "learning_rate": 9.038951542983956e-05, + "loss": 0.0117, + "step": 188 + }, + { + "epoch": 1.4078212290502794, + "grad_norm": 0.01438519824296236, + "learning_rate": 9.023541287472435e-05, + "loss": 0.012, + "step": 189 + }, + { + "epoch": 1.415270018621974, + "grad_norm": 0.012437263503670692, + "learning_rate": 9.008021819397487e-05, + "loss": 0.0099, + "step": 190 + }, + { + "epoch": 1.415270018621974, + "eval_loss": 0.0095137320458889, + "eval_runtime": 1.4411, + "eval_samples_per_second": 5.551, + "eval_steps_per_second": 1.388, + "step": 190 + }, + { + "epoch": 1.4227188081936686, + "grad_norm": 0.01274897251278162, + "learning_rate": 8.992393560010137e-05, + "loss": 0.0096, + "step": 191 + }, + { + "epoch": 1.4301675977653632, + "grad_norm": 0.013629659079015255, + "learning_rate": 8.976656933514378e-05, + "loss": 0.012, + "step": 192 + }, + { + "epoch": 1.4376163873370578, + "grad_norm": 0.014524271711707115, + "learning_rate": 8.960812367055646e-05, + "loss": 0.0126, + "step": 193 + }, + { + "epoch": 1.4450651769087524, + "grad_norm": 0.01249483972787857, + "learning_rate": 8.944860290709244e-05, + "loss": 0.0118, + "step": 194 + }, + { + "epoch": 1.452513966480447, + "grad_norm": 0.012390038929879665, + "learning_rate": 8.928801137468654e-05, + "loss": 0.01, + "step": 195 + }, + { + "epoch": 1.452513966480447, + "eval_loss": 0.009372740983963013, + "eval_runtime": 1.437, + "eval_samples_per_second": 5.567, + "eval_steps_per_second": 1.392, + "step": 195 + }, + { + "epoch": 1.4599627560521415, + "grad_norm": 0.013015707023441792, + "learning_rate": 8.912635343233784e-05, + "loss": 0.0101, + "step": 196 + }, + { + "epoch": 1.4674115456238361, + "grad_norm": 0.013453744351863861, + "learning_rate": 8.896363346799146e-05, + "loss": 0.0111, + "step": 197 + }, + { + "epoch": 1.4748603351955307, + "grad_norm": 0.012541916221380234, + "learning_rate": 8.879985589841937e-05, + "loss": 0.0109, + "step": 198 + }, + { + "epoch": 1.4823091247672253, + "grad_norm": 0.014430958777666092, + "learning_rate": 8.863502516910058e-05, + "loss": 0.0126, + "step": 199 + }, + { + "epoch": 1.48975791433892, + "grad_norm": 0.013778624124825, + "learning_rate": 8.846914575410034e-05, + "loss": 0.0117, + "step": 200 + }, + { + "epoch": 1.48975791433892, + "eval_loss": 0.009262695908546448, + "eval_runtime": 1.4417, + "eval_samples_per_second": 5.549, + "eval_steps_per_second": 1.387, + "step": 200 + }, + { + "epoch": 1.4972067039106145, + "grad_norm": 0.01135862898081541, + "learning_rate": 8.83022221559489e-05, + "loss": 0.0094, + "step": 201 + }, + { + "epoch": 1.504655493482309, + "grad_norm": 0.012235710397362709, + "learning_rate": 8.81342589055191e-05, + "loss": 0.0109, + "step": 202 + }, + { + "epoch": 1.5121042830540037, + "grad_norm": 0.012683708220720291, + "learning_rate": 8.79652605619035e-05, + "loss": 0.0118, + "step": 203 + }, + { + "epoch": 1.5195530726256983, + "grad_norm": 0.01422063261270523, + "learning_rate": 8.77952317122906e-05, + "loss": 0.0105, + "step": 204 + }, + { + "epoch": 1.5270018621973929, + "grad_norm": 0.013863672502338886, + "learning_rate": 8.762417697184033e-05, + "loss": 0.0121, + "step": 205 + }, + { + "epoch": 1.5270018621973929, + "eval_loss": 0.009015379473567009, + "eval_runtime": 1.4354, + "eval_samples_per_second": 5.574, + "eval_steps_per_second": 1.393, + "step": 205 + }, + { + "epoch": 1.5344506517690877, + "grad_norm": 0.013202273286879063, + "learning_rate": 8.745210098355878e-05, + "loss": 0.0102, + "step": 206 + }, + { + "epoch": 1.541899441340782, + "grad_norm": 0.014716023579239845, + "learning_rate": 8.727900841817215e-05, + "loss": 0.0121, + "step": 207 + }, + { + "epoch": 1.5493482309124769, + "grad_norm": 0.013146266341209412, + "learning_rate": 8.710490397400006e-05, + "loss": 0.0118, + "step": 208 + }, + { + "epoch": 1.5567970204841712, + "grad_norm": 0.012007774785161018, + "learning_rate": 8.692979237682786e-05, + "loss": 0.0096, + "step": 209 + }, + { + "epoch": 1.564245810055866, + "grad_norm": 0.012724281288683414, + "learning_rate": 8.675367837977849e-05, + "loss": 0.0104, + "step": 210 + }, + { + "epoch": 1.564245810055866, + "eval_loss": 0.008786162361502647, + "eval_runtime": 1.4355, + "eval_samples_per_second": 5.573, + "eval_steps_per_second": 1.393, + "step": 210 + }, + { + "epoch": 1.5716945996275604, + "grad_norm": 0.01193510927259922, + "learning_rate": 8.657656676318346e-05, + "loss": 0.0102, + "step": 211 + }, + { + "epoch": 1.5791433891992552, + "grad_norm": 0.012191027402877808, + "learning_rate": 8.639846233445301e-05, + "loss": 0.0103, + "step": 212 + }, + { + "epoch": 1.5865921787709496, + "grad_norm": 0.013887192122638226, + "learning_rate": 8.621936992794568e-05, + "loss": 0.0115, + "step": 213 + }, + { + "epoch": 1.5940409683426444, + "grad_norm": 0.013555426150560379, + "learning_rate": 8.603929440483713e-05, + "loss": 0.0119, + "step": 214 + }, + { + "epoch": 1.6014897579143388, + "grad_norm": 0.014149404130876064, + "learning_rate": 8.585824065298806e-05, + "loss": 0.0123, + "step": 215 + }, + { + "epoch": 1.6014897579143388, + "eval_loss": 0.008579553104937077, + "eval_runtime": 1.4367, + "eval_samples_per_second": 5.568, + "eval_steps_per_second": 1.392, + "step": 215 + }, + { + "epoch": 1.6089385474860336, + "grad_norm": 0.013324511237442493, + "learning_rate": 8.567621358681165e-05, + "loss": 0.0089, + "step": 216 + }, + { + "epoch": 1.616387337057728, + "grad_norm": 0.015030642040073872, + "learning_rate": 8.549321814714018e-05, + "loss": 0.012, + "step": 217 + }, + { + "epoch": 1.6238361266294228, + "grad_norm": 0.012063059955835342, + "learning_rate": 8.530925930109078e-05, + "loss": 0.0095, + "step": 218 + }, + { + "epoch": 1.6312849162011172, + "grad_norm": 0.013545147143304348, + "learning_rate": 8.51243420419308e-05, + "loss": 0.0104, + "step": 219 + }, + { + "epoch": 1.638733705772812, + "grad_norm": 0.01237440388649702, + "learning_rate": 8.493847138894209e-05, + "loss": 0.0092, + "step": 220 + }, + { + "epoch": 1.638733705772812, + "eval_loss": 0.008445178158581257, + "eval_runtime": 1.4363, + "eval_samples_per_second": 5.57, + "eval_steps_per_second": 1.392, + "step": 220 + }, + { + "epoch": 1.6461824953445066, + "grad_norm": 0.011756625957787037, + "learning_rate": 8.475165238728489e-05, + "loss": 0.0095, + "step": 221 + }, + { + "epoch": 1.6536312849162011, + "grad_norm": 0.013697388581931591, + "learning_rate": 8.456389010786083e-05, + "loss": 0.0113, + "step": 222 + }, + { + "epoch": 1.6610800744878957, + "grad_norm": 0.013174861669540405, + "learning_rate": 8.43751896471753e-05, + "loss": 0.0112, + "step": 223 + }, + { + "epoch": 1.6685288640595903, + "grad_norm": 0.01325511746108532, + "learning_rate": 8.418555612719911e-05, + "loss": 0.0109, + "step": 224 + }, + { + "epoch": 1.675977653631285, + "grad_norm": 0.014326260425150394, + "learning_rate": 8.399499469522947e-05, + "loss": 0.012, + "step": 225 + }, + { + "epoch": 1.675977653631285, + "eval_loss": 0.008550510741770267, + "eval_runtime": 1.4378, + "eval_samples_per_second": 5.564, + "eval_steps_per_second": 1.391, + "step": 225 + }, + { + "epoch": 1.6834264432029795, + "grad_norm": 0.013872465118765831, + "learning_rate": 8.380351052375022e-05, + "loss": 0.0125, + "step": 226 + }, + { + "epoch": 1.690875232774674, + "grad_norm": 0.014252915978431702, + "learning_rate": 8.361110881029161e-05, + "loss": 0.0102, + "step": 227 + }, + { + "epoch": 1.6983240223463687, + "grad_norm": 0.012954461388289928, + "learning_rate": 8.341779477728896e-05, + "loss": 0.0099, + "step": 228 + }, + { + "epoch": 1.7057728119180633, + "grad_norm": 0.013365969061851501, + "learning_rate": 8.322357367194109e-05, + "loss": 0.0106, + "step": 229 + }, + { + "epoch": 1.7132216014897579, + "grad_norm": 0.012612243182957172, + "learning_rate": 8.302845076606786e-05, + "loss": 0.0088, + "step": 230 + }, + { + "epoch": 1.7132216014897579, + "eval_loss": 0.008591416291892529, + "eval_runtime": 1.4362, + "eval_samples_per_second": 5.57, + "eval_steps_per_second": 1.393, + "step": 230 + }, + { + "epoch": 1.7206703910614525, + "grad_norm": 0.015234209597110748, + "learning_rate": 8.283243135596701e-05, + "loss": 0.0122, + "step": 231 + }, + { + "epoch": 1.728119180633147, + "grad_norm": 0.014029356651008129, + "learning_rate": 8.263552076227048e-05, + "loss": 0.0103, + "step": 232 + }, + { + "epoch": 1.7355679702048417, + "grad_norm": 0.012033418752253056, + "learning_rate": 8.243772432979997e-05, + "loss": 0.0096, + "step": 233 + }, + { + "epoch": 1.7430167597765363, + "grad_norm": 0.013875091448426247, + "learning_rate": 8.223904742742181e-05, + "loss": 0.0115, + "step": 234 + }, + { + "epoch": 1.750465549348231, + "grad_norm": 0.01745481789112091, + "learning_rate": 8.203949544790131e-05, + "loss": 0.0098, + "step": 235 + }, + { + "epoch": 1.750465549348231, + "eval_loss": 0.008027992211282253, + "eval_runtime": 1.4388, + "eval_samples_per_second": 5.56, + "eval_steps_per_second": 1.39, + "step": 235 + }, + { + "epoch": 1.7579143389199254, + "grad_norm": 0.01303934957832098, + "learning_rate": 8.183907380775631e-05, + "loss": 0.0105, + "step": 236 + }, + { + "epoch": 1.7653631284916202, + "grad_norm": 0.013825961388647556, + "learning_rate": 8.163778794711019e-05, + "loss": 0.0115, + "step": 237 + }, + { + "epoch": 1.7728119180633146, + "grad_norm": 0.014336783438920975, + "learning_rate": 8.143564332954425e-05, + "loss": 0.0091, + "step": 238 + }, + { + "epoch": 1.7802607076350094, + "grad_norm": 0.013591837137937546, + "learning_rate": 8.123264544194933e-05, + "loss": 0.0097, + "step": 239 + }, + { + "epoch": 1.7877094972067038, + "grad_norm": 0.013703561387956142, + "learning_rate": 8.10287997943769e-05, + "loss": 0.01, + "step": 240 + }, + { + "epoch": 1.7877094972067038, + "eval_loss": 0.008301131427288055, + "eval_runtime": 1.438, + "eval_samples_per_second": 5.563, + "eval_steps_per_second": 1.391, + "step": 240 + }, + { + "epoch": 1.7951582867783986, + "grad_norm": 0.01494829636067152, + "learning_rate": 8.082411191988957e-05, + "loss": 0.0102, + "step": 241 + }, + { + "epoch": 1.802607076350093, + "grad_norm": 0.014247076585888863, + "learning_rate": 8.061858737441078e-05, + "loss": 0.0091, + "step": 242 + }, + { + "epoch": 1.8100558659217878, + "grad_norm": 0.012268093414604664, + "learning_rate": 8.04122317365741e-05, + "loss": 0.0088, + "step": 243 + }, + { + "epoch": 1.8175046554934822, + "grad_norm": 0.015215467661619186, + "learning_rate": 8.020505060757179e-05, + "loss": 0.01, + "step": 244 + }, + { + "epoch": 1.824953445065177, + "grad_norm": 0.012957314029335976, + "learning_rate": 7.999704961100266e-05, + "loss": 0.0089, + "step": 245 + }, + { + "epoch": 1.824953445065177, + "eval_loss": 0.00799483247101307, + "eval_runtime": 1.4389, + "eval_samples_per_second": 5.56, + "eval_steps_per_second": 1.39, + "step": 245 + }, + { + "epoch": 1.8324022346368714, + "grad_norm": 0.012614929117262363, + "learning_rate": 7.978823439271958e-05, + "loss": 0.0086, + "step": 246 + }, + { + "epoch": 1.8398510242085662, + "grad_norm": 0.013070178218185902, + "learning_rate": 7.957861062067614e-05, + "loss": 0.0093, + "step": 247 + }, + { + "epoch": 1.8472998137802608, + "grad_norm": 0.012843037955462933, + "learning_rate": 7.936818398477279e-05, + "loss": 0.009, + "step": 248 + }, + { + "epoch": 1.8547486033519553, + "grad_norm": 0.015133730135858059, + "learning_rate": 7.915696019670249e-05, + "loss": 0.0121, + "step": 249 + }, + { + "epoch": 1.86219739292365, + "grad_norm": 0.012439730577170849, + "learning_rate": 7.894494498979557e-05, + "loss": 0.0094, + "step": 250 + }, + { + "epoch": 1.86219739292365, + "eval_loss": 0.008173057809472084, + "eval_runtime": 1.4372, + "eval_samples_per_second": 5.567, + "eval_steps_per_second": 1.392, + "step": 250 + }, + { + "epoch": 1.8696461824953445, + "grad_norm": 0.012256564572453499, + "learning_rate": 7.873214411886419e-05, + "loss": 0.0092, + "step": 251 + }, + { + "epoch": 1.8770949720670391, + "grad_norm": 0.012881455942988396, + "learning_rate": 7.851856336004604e-05, + "loss": 0.0105, + "step": 252 + }, + { + "epoch": 1.8845437616387337, + "grad_norm": 0.013479229062795639, + "learning_rate": 7.830420851064766e-05, + "loss": 0.0106, + "step": 253 + }, + { + "epoch": 1.8919925512104283, + "grad_norm": 0.01145131979137659, + "learning_rate": 7.808908538898702e-05, + "loss": 0.0088, + "step": 254 + }, + { + "epoch": 1.899441340782123, + "grad_norm": 0.013347066007554531, + "learning_rate": 7.787319983423563e-05, + "loss": 0.0086, + "step": 255 + }, + { + "epoch": 1.899441340782123, + "eval_loss": 0.008111419156193733, + "eval_runtime": 1.4358, + "eval_samples_per_second": 5.572, + "eval_steps_per_second": 1.393, + "step": 255 + }, + { + "epoch": 1.9068901303538175, + "grad_norm": 0.01261739619076252, + "learning_rate": 7.765655770625997e-05, + "loss": 0.0097, + "step": 256 + }, + { + "epoch": 1.914338919925512, + "grad_norm": 0.012597484514117241, + "learning_rate": 7.743916488546254e-05, + "loss": 0.009, + "step": 257 + }, + { + "epoch": 1.9217877094972067, + "grad_norm": 0.013339761644601822, + "learning_rate": 7.722102727262215e-05, + "loss": 0.009, + "step": 258 + }, + { + "epoch": 1.9292364990689013, + "grad_norm": 0.015243194065988064, + "learning_rate": 7.700215078873379e-05, + "loss": 0.0084, + "step": 259 + }, + { + "epoch": 1.9366852886405959, + "grad_norm": 0.013798700645565987, + "learning_rate": 7.678254137484797e-05, + "loss": 0.0092, + "step": 260 + }, + { + "epoch": 1.9366852886405959, + "eval_loss": 0.008017845451831818, + "eval_runtime": 1.4396, + "eval_samples_per_second": 5.557, + "eval_steps_per_second": 1.389, + "step": 260 + }, + { + "epoch": 1.9441340782122905, + "grad_norm": 0.013677353039383888, + "learning_rate": 7.656220499190936e-05, + "loss": 0.0103, + "step": 261 + }, + { + "epoch": 1.9515828677839853, + "grad_norm": 0.014048004522919655, + "learning_rate": 7.634114762059504e-05, + "loss": 0.0118, + "step": 262 + }, + { + "epoch": 1.9590316573556796, + "grad_norm": 0.012328066863119602, + "learning_rate": 7.611937526115218e-05, + "loss": 0.0085, + "step": 263 + }, + { + "epoch": 1.9664804469273744, + "grad_norm": 0.013239427469670773, + "learning_rate": 7.589689393323514e-05, + "loss": 0.0108, + "step": 264 + }, + { + "epoch": 1.9739292364990688, + "grad_norm": 0.011700286529958248, + "learning_rate": 7.56737096757421e-05, + "loss": 0.0097, + "step": 265 + }, + { + "epoch": 1.9739292364990688, + "eval_loss": 0.00809735618531704, + "eval_runtime": 1.436, + "eval_samples_per_second": 5.571, + "eval_steps_per_second": 1.393, + "step": 265 + }, + { + "epoch": 1.9813780260707636, + "grad_norm": 0.013501322828233242, + "learning_rate": 7.544982854665113e-05, + "loss": 0.0102, + "step": 266 + }, + { + "epoch": 1.988826815642458, + "grad_norm": 0.012474549934267998, + "learning_rate": 7.522525662285575e-05, + "loss": 0.01, + "step": 267 + }, + { + "epoch": 1.9962756052141528, + "grad_norm": 0.01261069905012846, + "learning_rate": 7.500000000000001e-05, + "loss": 0.0097, + "step": 268 + }, + { + "epoch": 2.003724394785847, + "grad_norm": 0.025947801768779755, + "learning_rate": 7.4774064792313e-05, + "loss": 0.0152, + "step": 269 + }, + { + "epoch": 2.011173184357542, + "grad_norm": 0.01134838629513979, + "learning_rate": 7.454745713244289e-05, + "loss": 0.0074, + "step": 270 + }, + { + "epoch": 2.011173184357542, + "eval_loss": 0.00787295214831829, + "eval_runtime": 1.4411, + "eval_samples_per_second": 5.551, + "eval_steps_per_second": 1.388, + "step": 270 + }, + { + "epoch": 2.0186219739292364, + "grad_norm": 0.011715228669345379, + "learning_rate": 7.432018317129056e-05, + "loss": 0.0075, + "step": 271 + }, + { + "epoch": 2.026070763500931, + "grad_norm": 0.01580004207789898, + "learning_rate": 7.409224907784247e-05, + "loss": 0.0086, + "step": 272 + }, + { + "epoch": 2.0335195530726256, + "grad_norm": 0.01578812301158905, + "learning_rate": 7.386366103900337e-05, + "loss": 0.0089, + "step": 273 + }, + { + "epoch": 2.0409683426443204, + "grad_norm": 0.015953512862324715, + "learning_rate": 7.363442525942826e-05, + "loss": 0.0082, + "step": 274 + }, + { + "epoch": 2.0484171322160147, + "grad_norm": 0.014860575087368488, + "learning_rate": 7.34045479613541e-05, + "loss": 0.0071, + "step": 275 + }, + { + "epoch": 2.0484171322160147, + "eval_loss": 0.008008327335119247, + "eval_runtime": 1.4371, + "eval_samples_per_second": 5.567, + "eval_steps_per_second": 1.392, + "step": 275 + }, + { + "epoch": 2.0558659217877095, + "grad_norm": 0.0140910055488348, + "learning_rate": 7.317403538443078e-05, + "loss": 0.0069, + "step": 276 + }, + { + "epoch": 2.063314711359404, + "grad_norm": 0.015318790450692177, + "learning_rate": 7.294289378555179e-05, + "loss": 0.0074, + "step": 277 + }, + { + "epoch": 2.0707635009310987, + "grad_norm": 0.014569014310836792, + "learning_rate": 7.271112943868448e-05, + "loss": 0.0076, + "step": 278 + }, + { + "epoch": 2.078212290502793, + "grad_norm": 0.012695319019258022, + "learning_rate": 7.247874863469964e-05, + "loss": 0.0075, + "step": 279 + }, + { + "epoch": 2.085661080074488, + "grad_norm": 0.013956272974610329, + "learning_rate": 7.224575768120083e-05, + "loss": 0.0087, + "step": 280 + }, + { + "epoch": 2.085661080074488, + "eval_loss": 0.007895184680819511, + "eval_runtime": 1.438, + "eval_samples_per_second": 5.563, + "eval_steps_per_second": 1.391, + "step": 280 + }, + { + "epoch": 2.0931098696461823, + "grad_norm": 0.011924992315471172, + "learning_rate": 7.201216290235312e-05, + "loss": 0.0068, + "step": 281 + }, + { + "epoch": 2.100558659217877, + "grad_norm": 0.013794896192848682, + "learning_rate": 7.177797063871146e-05, + "loss": 0.0081, + "step": 282 + }, + { + "epoch": 2.1080074487895715, + "grad_norm": 0.012423375621438026, + "learning_rate": 7.154318724704853e-05, + "loss": 0.0067, + "step": 283 + }, + { + "epoch": 2.1154562383612663, + "grad_norm": 0.014208446256816387, + "learning_rate": 7.130781910018227e-05, + "loss": 0.0085, + "step": 284 + }, + { + "epoch": 2.122905027932961, + "grad_norm": 0.01337914913892746, + "learning_rate": 7.107187258680287e-05, + "loss": 0.0078, + "step": 285 + }, + { + "epoch": 2.122905027932961, + "eval_loss": 0.007793488912284374, + "eval_runtime": 1.4398, + "eval_samples_per_second": 5.556, + "eval_steps_per_second": 1.389, + "step": 285 + }, + { + "epoch": 2.1303538175046555, + "grad_norm": 0.012681729160249233, + "learning_rate": 7.083535411129933e-05, + "loss": 0.0065, + "step": 286 + }, + { + "epoch": 2.1378026070763503, + "grad_norm": 0.01357929315418005, + "learning_rate": 7.059827009358563e-05, + "loss": 0.0078, + "step": 287 + }, + { + "epoch": 2.1452513966480447, + "grad_norm": 0.011913263238966465, + "learning_rate": 7.036062696892648e-05, + "loss": 0.0068, + "step": 288 + }, + { + "epoch": 2.1527001862197395, + "grad_norm": 0.011856846511363983, + "learning_rate": 7.012243118776269e-05, + "loss": 0.0064, + "step": 289 + }, + { + "epoch": 2.160148975791434, + "grad_norm": 0.013001404702663422, + "learning_rate": 6.988368921553601e-05, + "loss": 0.0071, + "step": 290 + }, + { + "epoch": 2.160148975791434, + "eval_loss": 0.007797461934387684, + "eval_runtime": 1.4396, + "eval_samples_per_second": 5.557, + "eval_steps_per_second": 1.389, + "step": 290 + }, + { + "epoch": 2.1675977653631286, + "grad_norm": 0.014474976807832718, + "learning_rate": 6.964440753251366e-05, + "loss": 0.0072, + "step": 291 + }, + { + "epoch": 2.175046554934823, + "grad_norm": 0.012461818754673004, + "learning_rate": 6.940459263361249e-05, + "loss": 0.007, + "step": 292 + }, + { + "epoch": 2.182495344506518, + "grad_norm": 0.012863445095717907, + "learning_rate": 6.91642510282226e-05, + "loss": 0.0077, + "step": 293 + }, + { + "epoch": 2.189944134078212, + "grad_norm": 0.013767079450190067, + "learning_rate": 6.892338924003067e-05, + "loss": 0.009, + "step": 294 + }, + { + "epoch": 2.197392923649907, + "grad_norm": 0.012428815476596355, + "learning_rate": 6.868201380684299e-05, + "loss": 0.0062, + "step": 295 + }, + { + "epoch": 2.197392923649907, + "eval_loss": 0.0076547181233763695, + "eval_runtime": 1.4504, + "eval_samples_per_second": 5.516, + "eval_steps_per_second": 1.379, + "step": 295 + }, + { + "epoch": 2.2048417132216014, + "grad_norm": 0.012883047573268414, + "learning_rate": 6.844013128040782e-05, + "loss": 0.0067, + "step": 296 + }, + { + "epoch": 2.212290502793296, + "grad_norm": 0.01364380493760109, + "learning_rate": 6.819774822623772e-05, + "loss": 0.007, + "step": 297 + }, + { + "epoch": 2.2197392923649906, + "grad_norm": 0.014654329977929592, + "learning_rate": 6.795487122343124e-05, + "loss": 0.0091, + "step": 298 + }, + { + "epoch": 2.2271880819366854, + "grad_norm": 0.013505183160305023, + "learning_rate": 6.771150686449436e-05, + "loss": 0.0079, + "step": 299 + }, + { + "epoch": 2.2346368715083798, + "grad_norm": 0.013163126073777676, + "learning_rate": 6.74676617551616e-05, + "loss": 0.0072, + "step": 300 + }, + { + "epoch": 2.2346368715083798, + "eval_loss": 0.007786765694618225, + "eval_runtime": 1.4387, + "eval_samples_per_second": 5.561, + "eval_steps_per_second": 1.39, + "step": 300 + }, + { + "epoch": 2.2420856610800746, + "grad_norm": 0.012463901191949844, + "learning_rate": 6.722334251421665e-05, + "loss": 0.0068, + "step": 301 + }, + { + "epoch": 2.249534450651769, + "grad_norm": 0.014410431496798992, + "learning_rate": 6.697855577331274e-05, + "loss": 0.0076, + "step": 302 + }, + { + "epoch": 2.2569832402234637, + "grad_norm": 0.013395678251981735, + "learning_rate": 6.673330817679265e-05, + "loss": 0.0079, + "step": 303 + }, + { + "epoch": 2.264432029795158, + "grad_norm": 0.01278934720903635, + "learning_rate": 6.648760638150832e-05, + "loss": 0.0065, + "step": 304 + }, + { + "epoch": 2.271880819366853, + "grad_norm": 0.01424480602145195, + "learning_rate": 6.624145705664023e-05, + "loss": 0.0078, + "step": 305 + }, + { + "epoch": 2.271880819366853, + "eval_loss": 0.007918687537312508, + "eval_runtime": 1.4308, + "eval_samples_per_second": 5.591, + "eval_steps_per_second": 1.398, + "step": 305 + }, + { + "epoch": 2.2793296089385473, + "grad_norm": 0.013303694315254688, + "learning_rate": 6.599486688351628e-05, + "loss": 0.0064, + "step": 306 + }, + { + "epoch": 2.286778398510242, + "grad_norm": 0.011955632828176022, + "learning_rate": 6.574784255543051e-05, + "loss": 0.007, + "step": 307 + }, + { + "epoch": 2.294227188081937, + "grad_norm": 0.01262418832629919, + "learning_rate": 6.550039077746142e-05, + "loss": 0.0069, + "step": 308 + }, + { + "epoch": 2.3016759776536313, + "grad_norm": 0.012786969542503357, + "learning_rate": 6.525251826628991e-05, + "loss": 0.0068, + "step": 309 + }, + { + "epoch": 2.3091247672253257, + "grad_norm": 0.013325291685760021, + "learning_rate": 6.500423175001705e-05, + "loss": 0.0071, + "step": 310 + }, + { + "epoch": 2.3091247672253257, + "eval_loss": 0.007905280217528343, + "eval_runtime": 1.4495, + "eval_samples_per_second": 5.519, + "eval_steps_per_second": 1.38, + "step": 310 + }, + { + "epoch": 2.3165735567970205, + "grad_norm": 0.011358117684721947, + "learning_rate": 6.475553796798135e-05, + "loss": 0.0068, + "step": 311 + }, + { + "epoch": 2.3240223463687153, + "grad_norm": 0.012326772324740887, + "learning_rate": 6.450644367057597e-05, + "loss": 0.0064, + "step": 312 + }, + { + "epoch": 2.3314711359404097, + "grad_norm": 0.012478802353143692, + "learning_rate": 6.425695561906537e-05, + "loss": 0.0073, + "step": 313 + }, + { + "epoch": 2.338919925512104, + "grad_norm": 0.012162077240645885, + "learning_rate": 6.400708058540182e-05, + "loss": 0.0071, + "step": 314 + }, + { + "epoch": 2.346368715083799, + "grad_norm": 0.011681130155920982, + "learning_rate": 6.375682535204167e-05, + "loss": 0.0064, + "step": 315 + }, + { + "epoch": 2.346368715083799, + "eval_loss": 0.00784903485327959, + "eval_runtime": 1.4437, + "eval_samples_per_second": 5.541, + "eval_steps_per_second": 1.385, + "step": 315 + }, + { + "epoch": 2.3538175046554937, + "grad_norm": 0.012769734486937523, + "learning_rate": 6.350619671176111e-05, + "loss": 0.0065, + "step": 316 + }, + { + "epoch": 2.361266294227188, + "grad_norm": 0.014505505561828613, + "learning_rate": 6.325520146747189e-05, + "loss": 0.0078, + "step": 317 + }, + { + "epoch": 2.368715083798883, + "grad_norm": 0.012786807492375374, + "learning_rate": 6.30038464320366e-05, + "loss": 0.0076, + "step": 318 + }, + { + "epoch": 2.376163873370577, + "grad_norm": 0.011977126821875572, + "learning_rate": 6.275213842808383e-05, + "loss": 0.006, + "step": 319 + }, + { + "epoch": 2.383612662942272, + "grad_norm": 0.013352525420486927, + "learning_rate": 6.250008428782292e-05, + "loss": 0.0075, + "step": 320 + }, + { + "epoch": 2.383612662942272, + "eval_loss": 0.007701109163463116, + "eval_runtime": 1.4383, + "eval_samples_per_second": 5.562, + "eval_steps_per_second": 1.391, + "step": 320 + }, + { + "epoch": 2.3910614525139664, + "grad_norm": 0.013219831511378288, + "learning_rate": 6.224769085285854e-05, + "loss": 0.0069, + "step": 321 + }, + { + "epoch": 2.398510242085661, + "grad_norm": 0.01240642461925745, + "learning_rate": 6.19949649740049e-05, + "loss": 0.0066, + "step": 322 + }, + { + "epoch": 2.4059590316573556, + "grad_norm": 0.012310575693845749, + "learning_rate": 6.174191351109995e-05, + "loss": 0.0068, + "step": 323 + }, + { + "epoch": 2.4134078212290504, + "grad_norm": 0.013844011351466179, + "learning_rate": 6.148854333281905e-05, + "loss": 0.0067, + "step": 324 + }, + { + "epoch": 2.4208566108007448, + "grad_norm": 0.013616068288683891, + "learning_rate": 6.12348613164886e-05, + "loss": 0.0075, + "step": 325 + }, + { + "epoch": 2.4208566108007448, + "eval_loss": 0.007413792423903942, + "eval_runtime": 1.4424, + "eval_samples_per_second": 5.546, + "eval_steps_per_second": 1.387, + "step": 325 + }, + { + "epoch": 2.4283054003724396, + "grad_norm": 0.013439241796731949, + "learning_rate": 6.098087434789931e-05, + "loss": 0.0069, + "step": 326 + }, + { + "epoch": 2.435754189944134, + "grad_norm": 0.01345871016383171, + "learning_rate": 6.0726589321119364e-05, + "loss": 0.0071, + "step": 327 + }, + { + "epoch": 2.4432029795158288, + "grad_norm": 0.01571730151772499, + "learning_rate": 6.0472013138307235e-05, + "loss": 0.0075, + "step": 328 + }, + { + "epoch": 2.450651769087523, + "grad_norm": 0.012418650090694427, + "learning_rate": 6.021715270952435e-05, + "loss": 0.0075, + "step": 329 + }, + { + "epoch": 2.458100558659218, + "grad_norm": 0.011913064867258072, + "learning_rate": 5.9962014952547575e-05, + "loss": 0.007, + "step": 330 + }, + { + "epoch": 2.458100558659218, + "eval_loss": 0.007491931319236755, + "eval_runtime": 1.4428, + "eval_samples_per_second": 5.545, + "eval_steps_per_second": 1.386, + "step": 330 + }, + { + "epoch": 2.4655493482309123, + "grad_norm": 0.011813613586127758, + "learning_rate": 5.970660679268138e-05, + "loss": 0.0065, + "step": 331 + }, + { + "epoch": 2.472998137802607, + "grad_norm": 0.012969112023711205, + "learning_rate": 5.945093516256989e-05, + "loss": 0.0074, + "step": 332 + }, + { + "epoch": 2.4804469273743015, + "grad_norm": 0.014279666356742382, + "learning_rate": 5.9195007002008685e-05, + "loss": 0.0073, + "step": 333 + }, + { + "epoch": 2.4878957169459963, + "grad_norm": 0.014202989637851715, + "learning_rate": 5.893882925775648e-05, + "loss": 0.0078, + "step": 334 + }, + { + "epoch": 2.4953445065176907, + "grad_norm": 0.013433150015771389, + "learning_rate": 5.868240888334653e-05, + "loss": 0.0067, + "step": 335 + }, + { + "epoch": 2.4953445065176907, + "eval_loss": 0.007440233137458563, + "eval_runtime": 1.4382, + "eval_samples_per_second": 5.563, + "eval_steps_per_second": 1.391, + "step": 335 + }, + { + "epoch": 2.5027932960893855, + "grad_norm": 0.012475401163101196, + "learning_rate": 5.842575283889789e-05, + "loss": 0.0061, + "step": 336 + }, + { + "epoch": 2.51024208566108, + "grad_norm": 0.015293211676180363, + "learning_rate": 5.816886809092651e-05, + "loss": 0.0084, + "step": 337 + }, + { + "epoch": 2.5176908752327747, + "grad_norm": 0.0140406908467412, + "learning_rate": 5.7911761612156135e-05, + "loss": 0.0061, + "step": 338 + }, + { + "epoch": 2.5251396648044695, + "grad_norm": 0.013416999019682407, + "learning_rate": 5.765444038132901e-05, + "loss": 0.0073, + "step": 339 + }, + { + "epoch": 2.532588454376164, + "grad_norm": 0.011545198038220406, + "learning_rate": 5.73969113830165e-05, + "loss": 0.0054, + "step": 340 + }, + { + "epoch": 2.532588454376164, + "eval_loss": 0.007562604267150164, + "eval_runtime": 1.4388, + "eval_samples_per_second": 5.56, + "eval_steps_per_second": 1.39, + "step": 340 + }, + { + "epoch": 2.5400372439478582, + "grad_norm": 0.01364484615623951, + "learning_rate": 5.713918160742948e-05, + "loss": 0.0076, + "step": 341 + }, + { + "epoch": 2.547486033519553, + "grad_norm": 0.012668787501752377, + "learning_rate": 5.688125805022861e-05, + "loss": 0.0063, + "step": 342 + }, + { + "epoch": 2.554934823091248, + "grad_norm": 0.013231166638433933, + "learning_rate": 5.66231477123344e-05, + "loss": 0.0068, + "step": 343 + }, + { + "epoch": 2.5623836126629422, + "grad_norm": 0.01256707962602377, + "learning_rate": 5.636485759973729e-05, + "loss": 0.0062, + "step": 344 + }, + { + "epoch": 2.5698324022346366, + "grad_norm": 0.013357976451516151, + "learning_rate": 5.6106394723307365e-05, + "loss": 0.006, + "step": 345 + }, + { + "epoch": 2.5698324022346366, + "eval_loss": 0.006900668144226074, + "eval_runtime": 1.4392, + "eval_samples_per_second": 5.559, + "eval_steps_per_second": 1.39, + "step": 345 + }, + { + "epoch": 2.5772811918063314, + "grad_norm": 0.0140624875202775, + "learning_rate": 5.584776609860414e-05, + "loss": 0.0076, + "step": 346 + }, + { + "epoch": 2.5847299813780262, + "grad_norm": 0.011903064325451851, + "learning_rate": 5.558897874568604e-05, + "loss": 0.0059, + "step": 347 + }, + { + "epoch": 2.5921787709497206, + "grad_norm": 0.014448638074100018, + "learning_rate": 5.533003968891998e-05, + "loss": 0.0068, + "step": 348 + }, + { + "epoch": 2.5996275605214154, + "grad_norm": 0.012435466051101685, + "learning_rate": 5.5070955956790594e-05, + "loss": 0.0058, + "step": 349 + }, + { + "epoch": 2.60707635009311, + "grad_norm": 0.01535428874194622, + "learning_rate": 5.4811734581709514e-05, + "loss": 0.007, + "step": 350 + }, + { + "epoch": 2.60707635009311, + "eval_loss": 0.0068687270395457745, + "eval_runtime": 1.4392, + "eval_samples_per_second": 5.559, + "eval_steps_per_second": 1.39, + "step": 350 + }, + { + "epoch": 2.6145251396648046, + "grad_norm": 0.014365176670253277, + "learning_rate": 5.455238259982448e-05, + "loss": 0.0076, + "step": 351 + }, + { + "epoch": 2.621973929236499, + "grad_norm": 0.012560203671455383, + "learning_rate": 5.42929070508283e-05, + "loss": 0.0068, + "step": 352 + }, + { + "epoch": 2.629422718808194, + "grad_norm": 0.011918047443032265, + "learning_rate": 5.4033314977767856e-05, + "loss": 0.0061, + "step": 353 + }, + { + "epoch": 2.636871508379888, + "grad_norm": 0.011303206905722618, + "learning_rate": 5.377361342685287e-05, + "loss": 0.0054, + "step": 354 + }, + { + "epoch": 2.644320297951583, + "grad_norm": 0.014169976115226746, + "learning_rate": 5.351380944726465e-05, + "loss": 0.0058, + "step": 355 + }, + { + "epoch": 2.644320297951583, + "eval_loss": 0.006901645101606846, + "eval_runtime": 1.4386, + "eval_samples_per_second": 5.561, + "eval_steps_per_second": 1.39, + "step": 355 + }, + { + "epoch": 2.6517690875232773, + "grad_norm": 0.014690485782921314, + "learning_rate": 5.325391009096481e-05, + "loss": 0.0075, + "step": 356 + }, + { + "epoch": 2.659217877094972, + "grad_norm": 0.012094467878341675, + "learning_rate": 5.299392241250376e-05, + "loss": 0.0059, + "step": 357 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.013897833414375782, + "learning_rate": 5.27338534688293e-05, + "loss": 0.0068, + "step": 358 + }, + { + "epoch": 2.6741154562383613, + "grad_norm": 0.013070980086922646, + "learning_rate": 5.247371031909505e-05, + "loss": 0.0066, + "step": 359 + }, + { + "epoch": 2.6815642458100557, + "grad_norm": 0.012155570089817047, + "learning_rate": 5.221350002446882e-05, + "loss": 0.0062, + "step": 360 + }, + { + "epoch": 2.6815642458100557, + "eval_loss": 0.006993839051574469, + "eval_runtime": 1.439, + "eval_samples_per_second": 5.559, + "eval_steps_per_second": 1.39, + "step": 360 + }, + { + "epoch": 2.6890130353817505, + "grad_norm": 0.0141450809314847, + "learning_rate": 5.195322964794098e-05, + "loss": 0.0064, + "step": 361 + }, + { + "epoch": 2.6964618249534453, + "grad_norm": 0.013800080865621567, + "learning_rate": 5.169290625413268e-05, + "loss": 0.0077, + "step": 362 + }, + { + "epoch": 2.7039106145251397, + "grad_norm": 0.014076444320380688, + "learning_rate": 5.143253690910419e-05, + "loss": 0.0085, + "step": 363 + }, + { + "epoch": 2.711359404096834, + "grad_norm": 0.01326959952712059, + "learning_rate": 5.117212868016303e-05, + "loss": 0.0062, + "step": 364 + }, + { + "epoch": 2.718808193668529, + "grad_norm": 0.012991178780794144, + "learning_rate": 5.091168863567215e-05, + "loss": 0.0075, + "step": 365 + }, + { + "epoch": 2.718808193668529, + "eval_loss": 0.006982033606618643, + "eval_runtime": 1.4397, + "eval_samples_per_second": 5.557, + "eval_steps_per_second": 1.389, + "step": 365 + }, + { + "epoch": 2.7262569832402237, + "grad_norm": 0.01416806224733591, + "learning_rate": 5.065122384485814e-05, + "loss": 0.0081, + "step": 366 + }, + { + "epoch": 2.733705772811918, + "grad_norm": 0.012682228349149227, + "learning_rate": 5.03907413776192e-05, + "loss": 0.0074, + "step": 367 + }, + { + "epoch": 2.7411545623836124, + "grad_norm": 0.013895339332520962, + "learning_rate": 5.013024830433338e-05, + "loss": 0.0071, + "step": 368 + }, + { + "epoch": 2.7486033519553073, + "grad_norm": 0.014080525375902653, + "learning_rate": 4.9869751695666615e-05, + "loss": 0.007, + "step": 369 + }, + { + "epoch": 2.756052141527002, + "grad_norm": 0.012255143374204636, + "learning_rate": 4.96092586223808e-05, + "loss": 0.0062, + "step": 370 + }, + { + "epoch": 2.756052141527002, + "eval_loss": 0.006682597566395998, + "eval_runtime": 1.4391, + "eval_samples_per_second": 5.559, + "eval_steps_per_second": 1.39, + "step": 370 + }, + { + "epoch": 2.7635009310986964, + "grad_norm": 0.012831464409828186, + "learning_rate": 4.9348776155141876e-05, + "loss": 0.0059, + "step": 371 + }, + { + "epoch": 2.770949720670391, + "grad_norm": 0.0137475011870265, + "learning_rate": 4.908831136432784e-05, + "loss": 0.007, + "step": 372 + }, + { + "epoch": 2.7783985102420856, + "grad_norm": 0.014217370189726353, + "learning_rate": 4.882787131983698e-05, + "loss": 0.007, + "step": 373 + }, + { + "epoch": 2.7858472998137804, + "grad_norm": 0.013658503070473671, + "learning_rate": 4.856746309089582e-05, + "loss": 0.0065, + "step": 374 + }, + { + "epoch": 2.793296089385475, + "grad_norm": 0.013131371699273586, + "learning_rate": 4.8307093745867335e-05, + "loss": 0.0064, + "step": 375 + }, + { + "epoch": 2.793296089385475, + "eval_loss": 0.00666953856125474, + "eval_runtime": 1.4378, + "eval_samples_per_second": 5.564, + "eval_steps_per_second": 1.391, + "step": 375 + }, + { + "epoch": 2.8007448789571696, + "grad_norm": 0.010836012661457062, + "learning_rate": 4.804677035205903e-05, + "loss": 0.006, + "step": 376 + }, + { + "epoch": 2.808193668528864, + "grad_norm": 0.014964509755373001, + "learning_rate": 4.778649997553119e-05, + "loss": 0.0071, + "step": 377 + }, + { + "epoch": 2.815642458100559, + "grad_norm": 0.011445709504187107, + "learning_rate": 4.752628968090496e-05, + "loss": 0.005, + "step": 378 + }, + { + "epoch": 2.823091247672253, + "grad_norm": 0.011256701312959194, + "learning_rate": 4.726614653117071e-05, + "loss": 0.0052, + "step": 379 + }, + { + "epoch": 2.830540037243948, + "grad_norm": 0.01327276136726141, + "learning_rate": 4.700607758749625e-05, + "loss": 0.0076, + "step": 380 + }, + { + "epoch": 2.830540037243948, + "eval_loss": 0.006651410833001137, + "eval_runtime": 1.4359, + "eval_samples_per_second": 5.572, + "eval_steps_per_second": 1.393, + "step": 380 + }, + { + "epoch": 2.8379888268156424, + "grad_norm": 0.011966750957071781, + "learning_rate": 4.674608990903521e-05, + "loss": 0.0065, + "step": 381 + }, + { + "epoch": 2.845437616387337, + "grad_norm": 0.013033094815909863, + "learning_rate": 4.648619055273537e-05, + "loss": 0.0061, + "step": 382 + }, + { + "epoch": 2.8528864059590315, + "grad_norm": 0.0130838043987751, + "learning_rate": 4.622638657314716e-05, + "loss": 0.0066, + "step": 383 + }, + { + "epoch": 2.8603351955307263, + "grad_norm": 0.014701674692332745, + "learning_rate": 4.596668502223214e-05, + "loss": 0.0085, + "step": 384 + }, + { + "epoch": 2.8677839851024207, + "grad_norm": 0.012755104340612888, + "learning_rate": 4.5707092949171697e-05, + "loss": 0.0062, + "step": 385 + }, + { + "epoch": 2.8677839851024207, + "eval_loss": 0.006667222827672958, + "eval_runtime": 1.4375, + "eval_samples_per_second": 5.565, + "eval_steps_per_second": 1.391, + "step": 385 + }, + { + "epoch": 2.8752327746741155, + "grad_norm": 0.013360305689275265, + "learning_rate": 4.544761740017553e-05, + "loss": 0.0075, + "step": 386 + }, + { + "epoch": 2.88268156424581, + "grad_norm": 0.01437696535140276, + "learning_rate": 4.518826541829049e-05, + "loss": 0.0083, + "step": 387 + }, + { + "epoch": 2.8901303538175047, + "grad_norm": 0.014352986589074135, + "learning_rate": 4.492904404320942e-05, + "loss": 0.0077, + "step": 388 + }, + { + "epoch": 2.8975791433891995, + "grad_norm": 0.012372603639960289, + "learning_rate": 4.466996031108004e-05, + "loss": 0.0052, + "step": 389 + }, + { + "epoch": 2.905027932960894, + "grad_norm": 0.014959496445953846, + "learning_rate": 4.441102125431398e-05, + "loss": 0.0076, + "step": 390 + }, + { + "epoch": 2.905027932960894, + "eval_loss": 0.006536586210131645, + "eval_runtime": 1.4373, + "eval_samples_per_second": 5.566, + "eval_steps_per_second": 1.392, + "step": 390 + }, + { + "epoch": 2.9124767225325883, + "grad_norm": 0.013327567838132381, + "learning_rate": 4.415223390139588e-05, + "loss": 0.0063, + "step": 391 + }, + { + "epoch": 2.919925512104283, + "grad_norm": 0.01268142368644476, + "learning_rate": 4.3893605276692646e-05, + "loss": 0.0061, + "step": 392 + }, + { + "epoch": 2.927374301675978, + "grad_norm": 0.014645962044596672, + "learning_rate": 4.3635142400262715e-05, + "loss": 0.0066, + "step": 393 + }, + { + "epoch": 2.9348230912476723, + "grad_norm": 0.013157113455235958, + "learning_rate": 4.3376852287665606e-05, + "loss": 0.0068, + "step": 394 + }, + { + "epoch": 2.9422718808193666, + "grad_norm": 0.013522828929126263, + "learning_rate": 4.311874194977141e-05, + "loss": 0.0064, + "step": 395 + }, + { + "epoch": 2.9422718808193666, + "eval_loss": 0.006440032739192247, + "eval_runtime": 1.4353, + "eval_samples_per_second": 5.574, + "eval_steps_per_second": 1.393, + "step": 395 + }, + { + "epoch": 2.9497206703910615, + "grad_norm": 0.01399063877761364, + "learning_rate": 4.2860818392570535e-05, + "loss": 0.0067, + "step": 396 + }, + { + "epoch": 2.9571694599627563, + "grad_norm": 0.0129147469997406, + "learning_rate": 4.260308861698351e-05, + "loss": 0.0063, + "step": 397 + }, + { + "epoch": 2.9646182495344506, + "grad_norm": 0.014884996227920055, + "learning_rate": 4.234555961867099e-05, + "loss": 0.0072, + "step": 398 + }, + { + "epoch": 2.972067039106145, + "grad_norm": 0.012328519485890865, + "learning_rate": 4.208823838784386e-05, + "loss": 0.006, + "step": 399 + }, + { + "epoch": 2.97951582867784, + "grad_norm": 0.012001237832009792, + "learning_rate": 4.183113190907349e-05, + "loss": 0.006, + "step": 400 + }, + { + "epoch": 2.97951582867784, + "eval_loss": 0.006503336131572723, + "eval_runtime": 1.4423, + "eval_samples_per_second": 5.547, + "eval_steps_per_second": 1.387, + "step": 400 + }, + { + "epoch": 2.9869646182495346, + "grad_norm": 0.01248820498585701, + "learning_rate": 4.157424716110212e-05, + "loss": 0.0062, + "step": 401 + }, + { + "epoch": 2.994413407821229, + "grad_norm": 0.012790112756192684, + "learning_rate": 4.131759111665349e-05, + "loss": 0.006, + "step": 402 + }, + { + "epoch": 3.001862197392924, + "grad_norm": 0.029748283326625824, + "learning_rate": 4.106117074224354e-05, + "loss": 0.0109, + "step": 403 + }, + { + "epoch": 3.009310986964618, + "grad_norm": 0.011120783165097237, + "learning_rate": 4.080499299799133e-05, + "loss": 0.0046, + "step": 404 + }, + { + "epoch": 3.016759776536313, + "grad_norm": 0.010844327509403229, + "learning_rate": 4.0549064837430124e-05, + "loss": 0.0045, + "step": 405 + }, + { + "epoch": 3.016759776536313, + "eval_loss": 0.006626332178711891, + "eval_runtime": 1.4413, + "eval_samples_per_second": 5.55, + "eval_steps_per_second": 1.388, + "step": 405 + }, + { + "epoch": 3.0242085661080074, + "grad_norm": 0.012453730218112469, + "learning_rate": 4.029339320731862e-05, + "loss": 0.0047, + "step": 406 + }, + { + "epoch": 3.031657355679702, + "grad_norm": 0.012017901986837387, + "learning_rate": 4.003798504745243e-05, + "loss": 0.0046, + "step": 407 + }, + { + "epoch": 3.0391061452513966, + "grad_norm": 0.01223169919103384, + "learning_rate": 3.978284729047567e-05, + "loss": 0.0048, + "step": 408 + }, + { + "epoch": 3.0465549348230914, + "grad_norm": 0.01258036494255066, + "learning_rate": 3.952798686169279e-05, + "loss": 0.0046, + "step": 409 + }, + { + "epoch": 3.0540037243947857, + "grad_norm": 0.01578667014837265, + "learning_rate": 3.9273410678880654e-05, + "loss": 0.0043, + "step": 410 + }, + { + "epoch": 3.0540037243947857, + "eval_loss": 0.006709449924528599, + "eval_runtime": 1.4385, + "eval_samples_per_second": 5.561, + "eval_steps_per_second": 1.39, + "step": 410 + }, + { + "epoch": 3.0614525139664805, + "grad_norm": 0.013783026486635208, + "learning_rate": 3.901912565210071e-05, + "loss": 0.0047, + "step": 411 + }, + { + "epoch": 3.068901303538175, + "grad_norm": 0.01629786752164364, + "learning_rate": 3.876513868351142e-05, + "loss": 0.005, + "step": 412 + }, + { + "epoch": 3.0763500931098697, + "grad_norm": 0.013088533654808998, + "learning_rate": 3.851145666718095e-05, + "loss": 0.0046, + "step": 413 + }, + { + "epoch": 3.083798882681564, + "grad_norm": 0.013530509546399117, + "learning_rate": 3.825808648890005e-05, + "loss": 0.0055, + "step": 414 + }, + { + "epoch": 3.091247672253259, + "grad_norm": 0.01278090663254261, + "learning_rate": 3.8005035025995104e-05, + "loss": 0.0045, + "step": 415 + }, + { + "epoch": 3.091247672253259, + "eval_loss": 0.006619194056838751, + "eval_runtime": 1.4393, + "eval_samples_per_second": 5.558, + "eval_steps_per_second": 1.39, + "step": 415 + }, + { + "epoch": 3.0986964618249533, + "grad_norm": 0.013418780639767647, + "learning_rate": 3.775230914714149e-05, + "loss": 0.0052, + "step": 416 + }, + { + "epoch": 3.106145251396648, + "grad_norm": 0.012641118839383125, + "learning_rate": 3.7499915712177094e-05, + "loss": 0.0046, + "step": 417 + }, + { + "epoch": 3.1135940409683425, + "grad_norm": 0.013218709267675877, + "learning_rate": 3.7247861571916185e-05, + "loss": 0.0048, + "step": 418 + }, + { + "epoch": 3.1210428305400373, + "grad_norm": 0.011562489904463291, + "learning_rate": 3.699615356796342e-05, + "loss": 0.0042, + "step": 419 + }, + { + "epoch": 3.1284916201117317, + "grad_norm": 0.011314046569168568, + "learning_rate": 3.674479853252813e-05, + "loss": 0.0038, + "step": 420 + }, + { + "epoch": 3.1284916201117317, + "eval_loss": 0.006657813210040331, + "eval_runtime": 1.4455, + "eval_samples_per_second": 5.534, + "eval_steps_per_second": 1.384, + "step": 420 + }, + { + "epoch": 3.1359404096834265, + "grad_norm": 0.01162840984761715, + "learning_rate": 3.6493803288238896e-05, + "loss": 0.0041, + "step": 421 + }, + { + "epoch": 3.143389199255121, + "grad_norm": 0.014425015076994896, + "learning_rate": 3.624317464795834e-05, + "loss": 0.004, + "step": 422 + }, + { + "epoch": 3.1508379888268156, + "grad_norm": 0.0132564976811409, + "learning_rate": 3.599291941459818e-05, + "loss": 0.0049, + "step": 423 + }, + { + "epoch": 3.1582867783985105, + "grad_norm": 0.01422976702451706, + "learning_rate": 3.574304438093466e-05, + "loss": 0.0048, + "step": 424 + }, + { + "epoch": 3.165735567970205, + "grad_norm": 0.013297994621098042, + "learning_rate": 3.549355632942405e-05, + "loss": 0.0041, + "step": 425 + }, + { + "epoch": 3.165735567970205, + "eval_loss": 0.006783606484532356, + "eval_runtime": 1.4402, + "eval_samples_per_second": 5.555, + "eval_steps_per_second": 1.389, + "step": 425 + }, + { + "epoch": 3.1731843575418996, + "grad_norm": 0.015820320695638657, + "learning_rate": 3.5244462032018666e-05, + "loss": 0.0052, + "step": 426 + }, + { + "epoch": 3.180633147113594, + "grad_norm": 0.016552533954381943, + "learning_rate": 3.499576824998298e-05, + "loss": 0.005, + "step": 427 + }, + { + "epoch": 3.188081936685289, + "grad_norm": 0.013990816660225391, + "learning_rate": 3.474748173371008e-05, + "loss": 0.0045, + "step": 428 + }, + { + "epoch": 3.195530726256983, + "grad_norm": 0.012228915467858315, + "learning_rate": 3.4499609222538576e-05, + "loss": 0.0041, + "step": 429 + }, + { + "epoch": 3.202979515828678, + "grad_norm": 0.013655752874910831, + "learning_rate": 3.425215744456948e-05, + "loss": 0.0042, + "step": 430 + }, + { + "epoch": 3.202979515828678, + "eval_loss": 0.0066948262974619865, + "eval_runtime": 1.4397, + "eval_samples_per_second": 5.557, + "eval_steps_per_second": 1.389, + "step": 430 + }, + { + "epoch": 3.2104283054003724, + "grad_norm": 0.013641907833516598, + "learning_rate": 3.400513311648372e-05, + "loss": 0.0045, + "step": 431 + }, + { + "epoch": 3.217877094972067, + "grad_norm": 0.012657403945922852, + "learning_rate": 3.375854294335977e-05, + "loss": 0.004, + "step": 432 + }, + { + "epoch": 3.2253258845437616, + "grad_norm": 0.014040276408195496, + "learning_rate": 3.3512393618491676e-05, + "loss": 0.0049, + "step": 433 + }, + { + "epoch": 3.2327746741154564, + "grad_norm": 0.015839368104934692, + "learning_rate": 3.326669182320736e-05, + "loss": 0.0052, + "step": 434 + }, + { + "epoch": 3.2402234636871508, + "grad_norm": 0.013753394596278667, + "learning_rate": 3.302144422668726e-05, + "loss": 0.0046, + "step": 435 + }, + { + "epoch": 3.2402234636871508, + "eval_loss": 0.006585339084267616, + "eval_runtime": 1.4383, + "eval_samples_per_second": 5.562, + "eval_steps_per_second": 1.39, + "step": 435 + }, + { + "epoch": 3.2476722532588456, + "grad_norm": 0.012829342857003212, + "learning_rate": 3.277665748578336e-05, + "loss": 0.0046, + "step": 436 + }, + { + "epoch": 3.25512104283054, + "grad_norm": 0.01261841505765915, + "learning_rate": 3.2532338244838415e-05, + "loss": 0.0044, + "step": 437 + }, + { + "epoch": 3.2625698324022347, + "grad_norm": 0.014598112553358078, + "learning_rate": 3.228849313550566e-05, + "loss": 0.0049, + "step": 438 + }, + { + "epoch": 3.270018621973929, + "grad_norm": 0.01227685995399952, + "learning_rate": 3.2045128776568784e-05, + "loss": 0.0048, + "step": 439 + }, + { + "epoch": 3.277467411545624, + "grad_norm": 0.012074559926986694, + "learning_rate": 3.180225177376229e-05, + "loss": 0.0047, + "step": 440 + }, + { + "epoch": 3.277467411545624, + "eval_loss": 0.006581705529242754, + "eval_runtime": 1.4369, + "eval_samples_per_second": 5.568, + "eval_steps_per_second": 1.392, + "step": 440 + }, + { + "epoch": 3.2849162011173183, + "grad_norm": 0.014227618463337421, + "learning_rate": 3.155986871959219e-05, + "loss": 0.0051, + "step": 441 + }, + { + "epoch": 3.292364990689013, + "grad_norm": 0.012186282314360142, + "learning_rate": 3.131798619315702e-05, + "loss": 0.0041, + "step": 442 + }, + { + "epoch": 3.2998137802607075, + "grad_norm": 0.012706179171800613, + "learning_rate": 3.107661075996932e-05, + "loss": 0.004, + "step": 443 + }, + { + "epoch": 3.3072625698324023, + "grad_norm": 0.016201509162783623, + "learning_rate": 3.083574897177741e-05, + "loss": 0.0055, + "step": 444 + }, + { + "epoch": 3.3147113594040967, + "grad_norm": 0.013101043179631233, + "learning_rate": 3.0595407366387504e-05, + "loss": 0.0045, + "step": 445 + }, + { + "epoch": 3.3147113594040967, + "eval_loss": 0.006452564150094986, + "eval_runtime": 1.4384, + "eval_samples_per_second": 5.562, + "eval_steps_per_second": 1.39, + "step": 445 + }, + { + "epoch": 3.3221601489757915, + "grad_norm": 0.014754964038729668, + "learning_rate": 3.035559246748635e-05, + "loss": 0.0055, + "step": 446 + }, + { + "epoch": 3.329608938547486, + "grad_norm": 0.014629971235990524, + "learning_rate": 3.0116310784464008e-05, + "loss": 0.0053, + "step": 447 + }, + { + "epoch": 3.3370577281191807, + "grad_norm": 0.013362464495003223, + "learning_rate": 2.987756881223732e-05, + "loss": 0.0048, + "step": 448 + }, + { + "epoch": 3.344506517690875, + "grad_norm": 0.013100212439894676, + "learning_rate": 2.9639373031073525e-05, + "loss": 0.0048, + "step": 449 + }, + { + "epoch": 3.35195530726257, + "grad_norm": 0.013729127123951912, + "learning_rate": 2.940172990641438e-05, + "loss": 0.005, + "step": 450 + }, + { + "epoch": 3.35195530726257, + "eval_loss": 0.006514144595712423, + "eval_runtime": 1.4367, + "eval_samples_per_second": 5.569, + "eval_steps_per_second": 1.392, + "step": 450 + }, + { + "epoch": 3.3594040968342647, + "grad_norm": 0.014590530656278133, + "learning_rate": 2.916464588870067e-05, + "loss": 0.0044, + "step": 451 + }, + { + "epoch": 3.366852886405959, + "grad_norm": 0.012314187362790108, + "learning_rate": 2.8928127413197124e-05, + "loss": 0.0048, + "step": 452 + }, + { + "epoch": 3.3743016759776534, + "grad_norm": 0.01219462975859642, + "learning_rate": 2.869218089981772e-05, + "loss": 0.0039, + "step": 453 + }, + { + "epoch": 3.381750465549348, + "grad_norm": 0.015480121597647667, + "learning_rate": 2.8456812752951485e-05, + "loss": 0.0043, + "step": 454 + }, + { + "epoch": 3.389199255121043, + "grad_norm": 0.014719086699187756, + "learning_rate": 2.8222029361288583e-05, + "loss": 0.0049, + "step": 455 + }, + { + "epoch": 3.389199255121043, + "eval_loss": 0.00670338049530983, + "eval_runtime": 1.4344, + "eval_samples_per_second": 5.577, + "eval_steps_per_second": 1.394, + "step": 455 + }, + { + "epoch": 3.3966480446927374, + "grad_norm": 0.013700945302844048, + "learning_rate": 2.7987837097646908e-05, + "loss": 0.0047, + "step": 456 + }, + { + "epoch": 3.404096834264432, + "grad_norm": 0.013997942209243774, + "learning_rate": 2.7754242318799174e-05, + "loss": 0.0049, + "step": 457 + }, + { + "epoch": 3.4115456238361266, + "grad_norm": 0.011925108730793, + "learning_rate": 2.752125136530036e-05, + "loss": 0.0039, + "step": 458 + }, + { + "epoch": 3.4189944134078214, + "grad_norm": 0.012956595979630947, + "learning_rate": 2.7288870561315527e-05, + "loss": 0.0045, + "step": 459 + }, + { + "epoch": 3.4264432029795158, + "grad_norm": 0.01354091614484787, + "learning_rate": 2.7057106214448212e-05, + "loss": 0.0044, + "step": 460 + }, + { + "epoch": 3.4264432029795158, + "eval_loss": 0.006489352323114872, + "eval_runtime": 1.4364, + "eval_samples_per_second": 5.569, + "eval_steps_per_second": 1.392, + "step": 460 + }, + { + "epoch": 3.4338919925512106, + "grad_norm": 0.012749075889587402, + "learning_rate": 2.6825964615569233e-05, + "loss": 0.0036, + "step": 461 + }, + { + "epoch": 3.441340782122905, + "grad_norm": 0.013043111190199852, + "learning_rate": 2.6595452038645897e-05, + "loss": 0.0041, + "step": 462 + }, + { + "epoch": 3.4487895716945998, + "grad_norm": 0.013550251722335815, + "learning_rate": 2.636557474057173e-05, + "loss": 0.0048, + "step": 463 + }, + { + "epoch": 3.456238361266294, + "grad_norm": 0.012522494420409203, + "learning_rate": 2.6136338960996666e-05, + "loss": 0.0042, + "step": 464 + }, + { + "epoch": 3.463687150837989, + "grad_norm": 0.014970463700592518, + "learning_rate": 2.5907750922157552e-05, + "loss": 0.0054, + "step": 465 + }, + { + "epoch": 3.463687150837989, + "eval_loss": 0.006429262459278107, + "eval_runtime": 1.4383, + "eval_samples_per_second": 5.562, + "eval_steps_per_second": 1.391, + "step": 465 + }, + { + "epoch": 3.4711359404096833, + "grad_norm": 0.0113364327698946, + "learning_rate": 2.5679816828709458e-05, + "loss": 0.0043, + "step": 466 + }, + { + "epoch": 3.478584729981378, + "grad_norm": 0.012430958449840546, + "learning_rate": 2.5452542867557117e-05, + "loss": 0.0042, + "step": 467 + }, + { + "epoch": 3.4860335195530725, + "grad_norm": 0.012101479806005955, + "learning_rate": 2.5225935207687025e-05, + "loss": 0.0043, + "step": 468 + }, + { + "epoch": 3.4934823091247673, + "grad_norm": 0.01122524868696928, + "learning_rate": 2.500000000000001e-05, + "loss": 0.0043, + "step": 469 + }, + { + "epoch": 3.5009310986964617, + "grad_norm": 0.012876059859991074, + "learning_rate": 2.4774743377144265e-05, + "loss": 0.0045, + "step": 470 + }, + { + "epoch": 3.5009310986964617, + "eval_loss": 0.006419518496841192, + "eval_runtime": 1.444, + "eval_samples_per_second": 5.54, + "eval_steps_per_second": 1.385, + "step": 470 + }, + { + "epoch": 3.5083798882681565, + "grad_norm": 0.01279713399708271, + "learning_rate": 2.4550171453348887e-05, + "loss": 0.004, + "step": 471 + }, + { + "epoch": 3.515828677839851, + "grad_norm": 0.0135357566177845, + "learning_rate": 2.4326290324257894e-05, + "loss": 0.0045, + "step": 472 + }, + { + "epoch": 3.5232774674115457, + "grad_norm": 0.013147437013685703, + "learning_rate": 2.410310606676485e-05, + "loss": 0.0045, + "step": 473 + }, + { + "epoch": 3.5307262569832405, + "grad_norm": 0.012606249190866947, + "learning_rate": 2.3880624738847835e-05, + "loss": 0.0043, + "step": 474 + }, + { + "epoch": 3.538175046554935, + "grad_norm": 0.014670198783278465, + "learning_rate": 2.3658852379404973e-05, + "loss": 0.0037, + "step": 475 + }, + { + "epoch": 3.538175046554935, + "eval_loss": 0.0063990759663283825, + "eval_runtime": 1.4485, + "eval_samples_per_second": 5.523, + "eval_steps_per_second": 1.381, + "step": 475 + }, + { + "epoch": 3.5456238361266292, + "grad_norm": 0.01415963377803564, + "learning_rate": 2.3437795008090656e-05, + "loss": 0.0033, + "step": 476 + }, + { + "epoch": 3.553072625698324, + "grad_norm": 0.01465427502989769, + "learning_rate": 2.3217458625152038e-05, + "loss": 0.0047, + "step": 477 + }, + { + "epoch": 3.560521415270019, + "grad_norm": 0.012740039266645908, + "learning_rate": 2.2997849211266222e-05, + "loss": 0.0039, + "step": 478 + }, + { + "epoch": 3.5679702048417132, + "grad_norm": 0.012664888985455036, + "learning_rate": 2.2778972727377868e-05, + "loss": 0.0044, + "step": 479 + }, + { + "epoch": 3.5754189944134076, + "grad_norm": 0.012201756238937378, + "learning_rate": 2.256083511453747e-05, + "loss": 0.0039, + "step": 480 + }, + { + "epoch": 3.5754189944134076, + "eval_loss": 0.006340866908431053, + "eval_runtime": 1.4396, + "eval_samples_per_second": 5.557, + "eval_steps_per_second": 1.389, + "step": 480 + }, + { + "epoch": 3.5828677839851024, + "grad_norm": 0.011587726883590221, + "learning_rate": 2.234344229374003e-05, + "loss": 0.0036, + "step": 481 + }, + { + "epoch": 3.5903165735567972, + "grad_norm": 0.013236557133495808, + "learning_rate": 2.2126800165764378e-05, + "loss": 0.0042, + "step": 482 + }, + { + "epoch": 3.5977653631284916, + "grad_norm": 0.012902887538075447, + "learning_rate": 2.191091461101298e-05, + "loss": 0.0038, + "step": 483 + }, + { + "epoch": 3.605214152700186, + "grad_norm": 0.014430266804993153, + "learning_rate": 2.1695791489352345e-05, + "loss": 0.0051, + "step": 484 + }, + { + "epoch": 3.612662942271881, + "grad_norm": 0.01229032315313816, + "learning_rate": 2.1481436639953984e-05, + "loss": 0.0044, + "step": 485 + }, + { + "epoch": 3.612662942271881, + "eval_loss": 0.006252099294215441, + "eval_runtime": 1.4435, + "eval_samples_per_second": 5.542, + "eval_steps_per_second": 1.385, + "step": 485 + }, + { + "epoch": 3.6201117318435756, + "grad_norm": 0.013799392618238926, + "learning_rate": 2.126785588113584e-05, + "loss": 0.0041, + "step": 486 + }, + { + "epoch": 3.62756052141527, + "grad_norm": 0.012603668496012688, + "learning_rate": 2.1055055010204427e-05, + "loss": 0.0037, + "step": 487 + }, + { + "epoch": 3.635009310986965, + "grad_norm": 0.012317496351897717, + "learning_rate": 2.0843039803297516e-05, + "loss": 0.0037, + "step": 488 + }, + { + "epoch": 3.642458100558659, + "grad_norm": 0.011733555234968662, + "learning_rate": 2.0631816015227218e-05, + "loss": 0.0033, + "step": 489 + }, + { + "epoch": 3.649906890130354, + "grad_norm": 0.013048346154391766, + "learning_rate": 2.042138937932388e-05, + "loss": 0.0039, + "step": 490 + }, + { + "epoch": 3.649906890130354, + "eval_loss": 0.0063432203605771065, + "eval_runtime": 1.4423, + "eval_samples_per_second": 5.547, + "eval_steps_per_second": 1.387, + "step": 490 + }, + { + "epoch": 3.6573556797020483, + "grad_norm": 0.013012276031076908, + "learning_rate": 2.021176560728043e-05, + "loss": 0.0045, + "step": 491 + }, + { + "epoch": 3.664804469273743, + "grad_norm": 0.013149777427315712, + "learning_rate": 2.0002950388997345e-05, + "loss": 0.004, + "step": 492 + }, + { + "epoch": 3.6722532588454375, + "grad_norm": 0.01237794291228056, + "learning_rate": 1.979494939242822e-05, + "loss": 0.0037, + "step": 493 + }, + { + "epoch": 3.6797020484171323, + "grad_norm": 0.013679184019565582, + "learning_rate": 1.9587768263425886e-05, + "loss": 0.0037, + "step": 494 + }, + { + "epoch": 3.6871508379888267, + "grad_norm": 0.014831073582172394, + "learning_rate": 1.9381412625589234e-05, + "loss": 0.0045, + "step": 495 + }, + { + "epoch": 3.6871508379888267, + "eval_loss": 0.00640025082975626, + "eval_runtime": 1.4396, + "eval_samples_per_second": 5.557, + "eval_steps_per_second": 1.389, + "step": 495 + }, + { + "epoch": 3.6945996275605215, + "grad_norm": 0.013730873353779316, + "learning_rate": 1.917588808011045e-05, + "loss": 0.0043, + "step": 496 + }, + { + "epoch": 3.702048417132216, + "grad_norm": 0.014336715452373028, + "learning_rate": 1.897120020562311e-05, + "loss": 0.0049, + "step": 497 + }, + { + "epoch": 3.7094972067039107, + "grad_norm": 0.01304208766669035, + "learning_rate": 1.8767354558050693e-05, + "loss": 0.004, + "step": 498 + }, + { + "epoch": 3.716945996275605, + "grad_norm": 0.013440731912851334, + "learning_rate": 1.8564356670455767e-05, + "loss": 0.0042, + "step": 499 + }, + { + "epoch": 3.7243947858473, + "grad_norm": 0.012896439991891384, + "learning_rate": 1.8362212052889826e-05, + "loss": 0.0042, + "step": 500 + }, + { + "epoch": 3.7243947858473, + "eval_loss": 0.006389484740793705, + "eval_runtime": 1.4401, + "eval_samples_per_second": 5.555, + "eval_steps_per_second": 1.389, + "step": 500 + }, + { + "epoch": 3.7318435754189943, + "grad_norm": 0.012527666985988617, + "learning_rate": 1.8160926192243698e-05, + "loss": 0.0041, + "step": 501 + }, + { + "epoch": 3.739292364990689, + "grad_norm": 0.01260663103312254, + "learning_rate": 1.796050455209869e-05, + "loss": 0.0044, + "step": 502 + }, + { + "epoch": 3.7467411545623834, + "grad_norm": 0.013085776939988136, + "learning_rate": 1.7760952572578182e-05, + "loss": 0.0041, + "step": 503 + }, + { + "epoch": 3.7541899441340782, + "grad_norm": 0.013910362496972084, + "learning_rate": 1.756227567020004e-05, + "loss": 0.0047, + "step": 504 + }, + { + "epoch": 3.761638733705773, + "grad_norm": 0.01386139914393425, + "learning_rate": 1.7364479237729526e-05, + "loss": 0.0044, + "step": 505 + }, + { + "epoch": 3.761638733705773, + "eval_loss": 0.006338322069495916, + "eval_runtime": 1.4377, + "eval_samples_per_second": 5.564, + "eval_steps_per_second": 1.391, + "step": 505 + }, + { + "epoch": 3.7690875232774674, + "grad_norm": 0.012698731385171413, + "learning_rate": 1.7167568644033005e-05, + "loss": 0.0035, + "step": 506 + }, + { + "epoch": 3.776536312849162, + "grad_norm": 0.012162290513515472, + "learning_rate": 1.697154923393216e-05, + "loss": 0.0038, + "step": 507 + }, + { + "epoch": 3.7839851024208566, + "grad_norm": 0.01324171107262373, + "learning_rate": 1.677642632805892e-05, + "loss": 0.0044, + "step": 508 + }, + { + "epoch": 3.7914338919925514, + "grad_norm": 0.011998174712061882, + "learning_rate": 1.658220522271105e-05, + "loss": 0.004, + "step": 509 + }, + { + "epoch": 3.798882681564246, + "grad_norm": 0.013846911489963531, + "learning_rate": 1.63888911897084e-05, + "loss": 0.0045, + "step": 510 + }, + { + "epoch": 3.798882681564246, + "eval_loss": 0.006283560302108526, + "eval_runtime": 1.4387, + "eval_samples_per_second": 5.561, + "eval_steps_per_second": 1.39, + "step": 510 + }, + { + "epoch": 3.80633147113594, + "grad_norm": 0.015946250408887863, + "learning_rate": 1.6196489476249777e-05, + "loss": 0.0044, + "step": 511 + }, + { + "epoch": 3.813780260707635, + "grad_norm": 0.01330247800797224, + "learning_rate": 1.6005005304770552e-05, + "loss": 0.0041, + "step": 512 + }, + { + "epoch": 3.82122905027933, + "grad_norm": 0.013271212577819824, + "learning_rate": 1.5814443872800906e-05, + "loss": 0.0041, + "step": 513 + }, + { + "epoch": 3.828677839851024, + "grad_norm": 0.012470326386392117, + "learning_rate": 1.562481035282471e-05, + "loss": 0.0035, + "step": 514 + }, + { + "epoch": 3.8361266294227185, + "grad_norm": 0.013661223463714123, + "learning_rate": 1.5436109892139177e-05, + "loss": 0.0041, + "step": 515 + }, + { + "epoch": 3.8361266294227185, + "eval_loss": 0.006311333738267422, + "eval_runtime": 1.4372, + "eval_samples_per_second": 5.566, + "eval_steps_per_second": 1.392, + "step": 515 + }, + { + "epoch": 3.8435754189944134, + "grad_norm": 0.013829373754560947, + "learning_rate": 1.5248347612715119e-05, + "loss": 0.0047, + "step": 516 + }, + { + "epoch": 3.851024208566108, + "grad_norm": 0.012702974490821362, + "learning_rate": 1.5061528611057918e-05, + "loss": 0.0042, + "step": 517 + }, + { + "epoch": 3.8584729981378025, + "grad_norm": 0.014550411142408848, + "learning_rate": 1.4875657958069212e-05, + "loss": 0.0042, + "step": 518 + }, + { + "epoch": 3.8659217877094973, + "grad_norm": 0.012538805603981018, + "learning_rate": 1.4690740698909222e-05, + "loss": 0.0038, + "step": 519 + }, + { + "epoch": 3.8733705772811917, + "grad_norm": 0.013346507214009762, + "learning_rate": 1.4506781852859835e-05, + "loss": 0.0042, + "step": 520 + }, + { + "epoch": 3.8733705772811917, + "eval_loss": 0.006348676513880491, + "eval_runtime": 1.4369, + "eval_samples_per_second": 5.567, + "eval_steps_per_second": 1.392, + "step": 520 + }, + { + "epoch": 3.8808193668528865, + "grad_norm": 0.012295892462134361, + "learning_rate": 1.432378641318835e-05, + "loss": 0.0038, + "step": 521 + }, + { + "epoch": 3.888268156424581, + "grad_norm": 0.01254743617027998, + "learning_rate": 1.4141759347011952e-05, + "loss": 0.0042, + "step": 522 + }, + { + "epoch": 3.8957169459962757, + "grad_norm": 0.011672910302877426, + "learning_rate": 1.3960705595162876e-05, + "loss": 0.0036, + "step": 523 + }, + { + "epoch": 3.90316573556797, + "grad_norm": 0.011820181272923946, + "learning_rate": 1.3780630072054313e-05, + "loss": 0.0042, + "step": 524 + }, + { + "epoch": 3.910614525139665, + "grad_norm": 0.011941757053136826, + "learning_rate": 1.3601537665547009e-05, + "loss": 0.004, + "step": 525 + }, + { + "epoch": 3.910614525139665, + "eval_loss": 0.0064041148871183395, + "eval_runtime": 1.436, + "eval_samples_per_second": 5.571, + "eval_steps_per_second": 1.393, + "step": 525 + }, + { + "epoch": 3.9180633147113593, + "grad_norm": 0.014317545108497143, + "learning_rate": 1.3423433236816563e-05, + "loss": 0.004, + "step": 526 + }, + { + "epoch": 3.925512104283054, + "grad_norm": 0.013467320241034031, + "learning_rate": 1.324632162022153e-05, + "loss": 0.0039, + "step": 527 + }, + { + "epoch": 3.9329608938547485, + "grad_norm": 0.015422320924699306, + "learning_rate": 1.307020762317217e-05, + "loss": 0.0049, + "step": 528 + }, + { + "epoch": 3.9404096834264433, + "grad_norm": 0.011441254056990147, + "learning_rate": 1.289509602599996e-05, + "loss": 0.0037, + "step": 529 + }, + { + "epoch": 3.9478584729981376, + "grad_norm": 0.013289231806993484, + "learning_rate": 1.272099158182785e-05, + "loss": 0.0042, + "step": 530 + }, + { + "epoch": 3.9478584729981376, + "eval_loss": 0.006392916664481163, + "eval_runtime": 1.4359, + "eval_samples_per_second": 5.572, + "eval_steps_per_second": 1.393, + "step": 530 + }, + { + "epoch": 3.9553072625698324, + "grad_norm": 0.013485637493431568, + "learning_rate": 1.2547899016441222e-05, + "loss": 0.0049, + "step": 531 + }, + { + "epoch": 3.9627560521415273, + "grad_norm": 0.012843563221395016, + "learning_rate": 1.2375823028159667e-05, + "loss": 0.0045, + "step": 532 + }, + { + "epoch": 3.9702048417132216, + "grad_norm": 0.012260922230780125, + "learning_rate": 1.2204768287709395e-05, + "loss": 0.0036, + "step": 533 + }, + { + "epoch": 3.977653631284916, + "grad_norm": 0.013904851861298084, + "learning_rate": 1.203473943809651e-05, + "loss": 0.0041, + "step": 534 + }, + { + "epoch": 3.985102420856611, + "grad_norm": 0.01265387050807476, + "learning_rate": 1.1865741094480909e-05, + "loss": 0.0043, + "step": 535 + }, + { + "epoch": 3.985102420856611, + "eval_loss": 0.006239964161068201, + "eval_runtime": 1.4414, + "eval_samples_per_second": 5.55, + "eval_steps_per_second": 1.388, + "step": 535 + }, + { + "epoch": 3.9925512104283056, + "grad_norm": 0.01192470733076334, + "learning_rate": 1.1697777844051105e-05, + "loss": 0.0041, + "step": 536 + }, + { + "epoch": 4.0, + "grad_norm": 0.02570340596139431, + "learning_rate": 1.1530854245899659e-05, + "loss": 0.0061, + "step": 537 + }, + { + "epoch": 4.007448789571694, + "grad_norm": 0.009994372725486755, + "learning_rate": 1.1364974830899439e-05, + "loss": 0.0031, + "step": 538 + }, + { + "epoch": 4.01489757914339, + "grad_norm": 0.010831528343260288, + "learning_rate": 1.1200144101580635e-05, + "loss": 0.0029, + "step": 539 + }, + { + "epoch": 4.022346368715084, + "grad_norm": 0.010082140564918518, + "learning_rate": 1.1036366532008552e-05, + "loss": 0.003, + "step": 540 + }, + { + "epoch": 4.022346368715084, + "eval_loss": 0.0061585381627082825, + "eval_runtime": 1.4387, + "eval_samples_per_second": 5.561, + "eval_steps_per_second": 1.39, + "step": 540 + }, + { + "epoch": 4.029795158286778, + "grad_norm": 0.011410355567932129, + "learning_rate": 1.0873646567662165e-05, + "loss": 0.0033, + "step": 541 + }, + { + "epoch": 4.037243947858473, + "grad_norm": 0.010611123405396938, + "learning_rate": 1.0711988625313468e-05, + "loss": 0.0031, + "step": 542 + }, + { + "epoch": 4.044692737430168, + "grad_norm": 0.012108752503991127, + "learning_rate": 1.055139709290755e-05, + "loss": 0.0037, + "step": 543 + }, + { + "epoch": 4.052141527001862, + "grad_norm": 0.011806153692305088, + "learning_rate": 1.0391876329443533e-05, + "loss": 0.0033, + "step": 544 + }, + { + "epoch": 4.059590316573557, + "grad_norm": 0.012824393808841705, + "learning_rate": 1.0233430664856236e-05, + "loss": 0.003, + "step": 545 + }, + { + "epoch": 4.059590316573557, + "eval_loss": 0.006417661905288696, + "eval_runtime": 1.4406, + "eval_samples_per_second": 5.553, + "eval_steps_per_second": 1.388, + "step": 545 + }, + { + "epoch": 4.067039106145251, + "grad_norm": 0.012239106930792332, + "learning_rate": 1.0076064399898627e-05, + "loss": 0.0029, + "step": 546 + }, + { + "epoch": 4.074487895716946, + "grad_norm": 0.01317081693559885, + "learning_rate": 9.919781806025135e-06, + "loss": 0.0031, + "step": 547 + }, + { + "epoch": 4.081936685288641, + "grad_norm": 0.013704821467399597, + "learning_rate": 9.764587125275654e-06, + "loss": 0.0031, + "step": 548 + }, + { + "epoch": 4.089385474860335, + "grad_norm": 0.013110890984535217, + "learning_rate": 9.610484570160444e-06, + "loss": 0.0028, + "step": 549 + }, + { + "epoch": 4.0968342644320295, + "grad_norm": 0.014365245588123798, + "learning_rate": 9.45747832354575e-06, + "loss": 0.0038, + "step": 550 + }, + { + "epoch": 4.0968342644320295, + "eval_loss": 0.006416608579456806, + "eval_runtime": 1.4387, + "eval_samples_per_second": 5.56, + "eval_steps_per_second": 1.39, + "step": 550 + }, + { + "epoch": 4.104283054003725, + "grad_norm": 0.011451455764472485, + "learning_rate": 9.305572538540296e-06, + "loss": 0.0025, + "step": 551 + }, + { + "epoch": 4.111731843575419, + "grad_norm": 0.014280877076089382, + "learning_rate": 9.154771338382545e-06, + "loss": 0.0032, + "step": 552 + }, + { + "epoch": 4.1191806331471135, + "grad_norm": 0.01055363193154335, + "learning_rate": 9.005078816328771e-06, + "loss": 0.0026, + "step": 553 + }, + { + "epoch": 4.126629422718808, + "grad_norm": 0.013088367879390717, + "learning_rate": 8.856499035541971e-06, + "loss": 0.0032, + "step": 554 + }, + { + "epoch": 4.134078212290503, + "grad_norm": 0.012816871516406536, + "learning_rate": 8.70903602898157e-06, + "loss": 0.0032, + "step": 555 + }, + { + "epoch": 4.134078212290503, + "eval_loss": 0.006293036043643951, + "eval_runtime": 1.4409, + "eval_samples_per_second": 5.552, + "eval_steps_per_second": 1.388, + "step": 555 + }, + { + "epoch": 4.1415270018621975, + "grad_norm": 0.013055169954895973, + "learning_rate": 8.562693799293931e-06, + "loss": 0.0029, + "step": 556 + }, + { + "epoch": 4.148975791433892, + "grad_norm": 0.011857760138809681, + "learning_rate": 8.417476318703744e-06, + "loss": 0.0024, + "step": 557 + }, + { + "epoch": 4.156424581005586, + "grad_norm": 0.01217963919043541, + "learning_rate": 8.2733875289062e-06, + "loss": 0.0031, + "step": 558 + }, + { + "epoch": 4.1638733705772815, + "grad_norm": 0.010157955810427666, + "learning_rate": 8.130431340959981e-06, + "loss": 0.0021, + "step": 559 + }, + { + "epoch": 4.171322160148976, + "grad_norm": 0.011829957365989685, + "learning_rate": 7.988611635181098e-06, + "loss": 0.003, + "step": 560 + }, + { + "epoch": 4.171322160148976, + "eval_loss": 0.006214559078216553, + "eval_runtime": 1.439, + "eval_samples_per_second": 5.559, + "eval_steps_per_second": 1.39, + "step": 560 + }, + { + "epoch": 4.17877094972067, + "grad_norm": 0.012243757024407387, + "learning_rate": 7.847932261037627e-06, + "loss": 0.003, + "step": 561 + }, + { + "epoch": 4.186219739292365, + "grad_norm": 0.012754394672811031, + "learning_rate": 7.708397037045129e-06, + "loss": 0.0026, + "step": 562 + }, + { + "epoch": 4.19366852886406, + "grad_norm": 0.013026714324951172, + "learning_rate": 7.570009750663054e-06, + "loss": 0.0028, + "step": 563 + }, + { + "epoch": 4.201117318435754, + "grad_norm": 0.011803582310676575, + "learning_rate": 7.432774158191946e-06, + "loss": 0.003, + "step": 564 + }, + { + "epoch": 4.208566108007449, + "grad_norm": 0.011542431078851223, + "learning_rate": 7.296693984671465e-06, + "loss": 0.0025, + "step": 565 + }, + { + "epoch": 4.208566108007449, + "eval_loss": 0.006258544512093067, + "eval_runtime": 1.4403, + "eval_samples_per_second": 5.554, + "eval_steps_per_second": 1.389, + "step": 565 + }, + { + "epoch": 4.216014897579143, + "grad_norm": 0.012065926566720009, + "learning_rate": 7.161772923779258e-06, + "loss": 0.0031, + "step": 566 + }, + { + "epoch": 4.223463687150838, + "grad_norm": 0.012815115042030811, + "learning_rate": 7.0280146377307395e-06, + "loss": 0.003, + "step": 567 + }, + { + "epoch": 4.230912476722533, + "grad_norm": 0.013720634393393993, + "learning_rate": 6.8954227571796815e-06, + "loss": 0.003, + "step": 568 + }, + { + "epoch": 4.238361266294227, + "grad_norm": 0.010038006119430065, + "learning_rate": 6.764000881119631e-06, + "loss": 0.0025, + "step": 569 + }, + { + "epoch": 4.245810055865922, + "grad_norm": 0.010450965724885464, + "learning_rate": 6.6337525767862505e-06, + "loss": 0.0025, + "step": 570 + }, + { + "epoch": 4.245810055865922, + "eval_loss": 0.006230730097740889, + "eval_runtime": 1.4418, + "eval_samples_per_second": 5.549, + "eval_steps_per_second": 1.387, + "step": 570 + }, + { + "epoch": 4.253258845437617, + "grad_norm": 0.014813080430030823, + "learning_rate": 6.50468137956049e-06, + "loss": 0.0027, + "step": 571 + }, + { + "epoch": 4.260707635009311, + "grad_norm": 0.012365314178168774, + "learning_rate": 6.376790792872611e-06, + "loss": 0.0033, + "step": 572 + }, + { + "epoch": 4.268156424581005, + "grad_norm": 0.012182981707155704, + "learning_rate": 6.2500842881071e-06, + "loss": 0.0025, + "step": 573 + }, + { + "epoch": 4.275605214152701, + "grad_norm": 0.011286159977316856, + "learning_rate": 6.124565304508439e-06, + "loss": 0.0024, + "step": 574 + }, + { + "epoch": 4.283054003724395, + "grad_norm": 0.012753508985042572, + "learning_rate": 6.000237249087776e-06, + "loss": 0.0029, + "step": 575 + }, + { + "epoch": 4.283054003724395, + "eval_loss": 0.006264370400458574, + "eval_runtime": 1.441, + "eval_samples_per_second": 5.552, + "eval_steps_per_second": 1.388, + "step": 575 + }, + { + "epoch": 4.290502793296089, + "grad_norm": 0.011141075752675533, + "learning_rate": 5.877103496530395e-06, + "loss": 0.0026, + "step": 576 + }, + { + "epoch": 4.297951582867784, + "grad_norm": 0.014343509450554848, + "learning_rate": 5.755167389104166e-06, + "loss": 0.0032, + "step": 577 + }, + { + "epoch": 4.305400372439479, + "grad_norm": 0.012262287549674511, + "learning_rate": 5.634432236568815e-06, + "loss": 0.0025, + "step": 578 + }, + { + "epoch": 4.312849162011173, + "grad_norm": 0.010775357484817505, + "learning_rate": 5.514901316086057e-06, + "loss": 0.0026, + "step": 579 + }, + { + "epoch": 4.320297951582868, + "grad_norm": 0.011888024397194386, + "learning_rate": 5.3965778721306755e-06, + "loss": 0.0027, + "step": 580 + }, + { + "epoch": 4.320297951582868, + "eval_loss": 0.0062323142774403095, + "eval_runtime": 1.443, + "eval_samples_per_second": 5.544, + "eval_steps_per_second": 1.386, + "step": 580 + }, + { + "epoch": 4.327746741154562, + "grad_norm": 0.012819167226552963, + "learning_rate": 5.279465116402438e-06, + "loss": 0.0031, + "step": 581 + }, + { + "epoch": 4.335195530726257, + "grad_norm": 0.014666089788079262, + "learning_rate": 5.163566227738936e-06, + "loss": 0.0029, + "step": 582 + }, + { + "epoch": 4.342644320297952, + "grad_norm": 0.012702619656920433, + "learning_rate": 5.048884352029271e-06, + "loss": 0.0028, + "step": 583 + }, + { + "epoch": 4.350093109869646, + "grad_norm": 0.013895975425839424, + "learning_rate": 4.935422602128697e-06, + "loss": 0.0027, + "step": 584 + }, + { + "epoch": 4.35754189944134, + "grad_norm": 0.011706478893756866, + "learning_rate": 4.823184057774116e-06, + "loss": 0.0029, + "step": 585 + }, + { + "epoch": 4.35754189944134, + "eval_loss": 0.006300531793385744, + "eval_runtime": 1.4393, + "eval_samples_per_second": 5.558, + "eval_steps_per_second": 1.39, + "step": 585 + }, + { + "epoch": 4.364990689013036, + "grad_norm": 0.011368724517524242, + "learning_rate": 4.712171765500484e-06, + "loss": 0.0024, + "step": 586 + }, + { + "epoch": 4.37243947858473, + "grad_norm": 0.012796067632734776, + "learning_rate": 4.602388738558078e-06, + "loss": 0.0029, + "step": 587 + }, + { + "epoch": 4.379888268156424, + "grad_norm": 0.012198933400213718, + "learning_rate": 4.493837956830788e-06, + "loss": 0.0031, + "step": 588 + }, + { + "epoch": 4.387337057728119, + "grad_norm": 0.011977463960647583, + "learning_rate": 4.386522366755169e-06, + "loss": 0.0028, + "step": 589 + }, + { + "epoch": 4.394785847299814, + "grad_norm": 0.011961457319557667, + "learning_rate": 4.280444881240475e-06, + "loss": 0.0029, + "step": 590 + }, + { + "epoch": 4.394785847299814, + "eval_loss": 0.006288326345384121, + "eval_runtime": 1.4396, + "eval_samples_per_second": 5.557, + "eval_steps_per_second": 1.389, + "step": 590 + }, + { + "epoch": 4.402234636871508, + "grad_norm": 0.012083463370800018, + "learning_rate": 4.175608379589624e-06, + "loss": 0.0028, + "step": 591 + }, + { + "epoch": 4.409683426443203, + "grad_norm": 0.018090050667524338, + "learning_rate": 4.072015707421006e-06, + "loss": 0.0025, + "step": 592 + }, + { + "epoch": 4.417132216014897, + "grad_norm": 0.01615767925977707, + "learning_rate": 3.969669676591259e-06, + "loss": 0.0036, + "step": 593 + }, + { + "epoch": 4.424581005586592, + "grad_norm": 0.011792131699621677, + "learning_rate": 3.868573065118936e-06, + "loss": 0.0027, + "step": 594 + }, + { + "epoch": 4.432029795158287, + "grad_norm": 0.013308114372193813, + "learning_rate": 3.768728617109135e-06, + "loss": 0.0029, + "step": 595 + }, + { + "epoch": 4.432029795158287, + "eval_loss": 0.006269657053053379, + "eval_runtime": 1.4369, + "eval_samples_per_second": 5.567, + "eval_steps_per_second": 1.392, + "step": 595 + }, + { + "epoch": 4.439478584729981, + "grad_norm": 0.011954352259635925, + "learning_rate": 3.670139042678955e-06, + "loss": 0.0029, + "step": 596 + }, + { + "epoch": 4.446927374301676, + "grad_norm": 0.011767936870455742, + "learning_rate": 3.5728070178839943e-06, + "loss": 0.0028, + "step": 597 + }, + { + "epoch": 4.454376163873371, + "grad_norm": 0.01164400763809681, + "learning_rate": 3.476735184645674e-06, + "loss": 0.0028, + "step": 598 + }, + { + "epoch": 4.461824953445065, + "grad_norm": 0.012548502534627914, + "learning_rate": 3.381926150679543e-06, + "loss": 0.0025, + "step": 599 + }, + { + "epoch": 4.4692737430167595, + "grad_norm": 0.011795224621891975, + "learning_rate": 3.288382489424502e-06, + "loss": 0.0028, + "step": 600 + }, + { + "epoch": 4.4692737430167595, + "eval_loss": 0.006181993521749973, + "eval_runtime": 1.4382, + "eval_samples_per_second": 5.562, + "eval_steps_per_second": 1.391, + "step": 600 + }, + { + "epoch": 4.476722532588455, + "grad_norm": 0.011075781658291817, + "learning_rate": 3.196106739972926e-06, + "loss": 0.0027, + "step": 601 + }, + { + "epoch": 4.484171322160149, + "grad_norm": 0.011777203530073166, + "learning_rate": 3.10510140700177e-06, + "loss": 0.0027, + "step": 602 + }, + { + "epoch": 4.4916201117318435, + "grad_norm": 0.012288088910281658, + "learning_rate": 3.0153689607045845e-06, + "loss": 0.0033, + "step": 603 + }, + { + "epoch": 4.499068901303538, + "grad_norm": 0.013129458762705326, + "learning_rate": 2.9269118367244385e-06, + "loss": 0.0033, + "step": 604 + }, + { + "epoch": 4.506517690875233, + "grad_norm": 0.01386756356805563, + "learning_rate": 2.839732436087833e-06, + "loss": 0.0035, + "step": 605 + }, + { + "epoch": 4.506517690875233, + "eval_loss": 0.006196299567818642, + "eval_runtime": 1.4452, + "eval_samples_per_second": 5.536, + "eval_steps_per_second": 1.384, + "step": 605 + }, + { + "epoch": 4.5139664804469275, + "grad_norm": 0.01066260039806366, + "learning_rate": 2.7538331251395266e-06, + "loss": 0.0024, + "step": 606 + }, + { + "epoch": 4.521415270018622, + "grad_norm": 0.012867064215242863, + "learning_rate": 2.6692162354782944e-06, + "loss": 0.0027, + "step": 607 + }, + { + "epoch": 4.528864059590316, + "grad_norm": 0.010836225934326649, + "learning_rate": 2.585884063893651e-06, + "loss": 0.0026, + "step": 608 + }, + { + "epoch": 4.5363128491620115, + "grad_norm": 0.010976696386933327, + "learning_rate": 2.5038388723034932e-06, + "loss": 0.0029, + "step": 609 + }, + { + "epoch": 4.543761638733706, + "grad_norm": 0.011194245889782906, + "learning_rate": 2.4230828876927294e-06, + "loss": 0.0024, + "step": 610 + }, + { + "epoch": 4.543761638733706, + "eval_loss": 0.006163444370031357, + "eval_runtime": 1.4365, + "eval_samples_per_second": 5.569, + "eval_steps_per_second": 1.392, + "step": 610 + }, + { + "epoch": 4.5512104283054, + "grad_norm": 0.011830671690404415, + "learning_rate": 2.343618302052808e-06, + "loss": 0.0028, + "step": 611 + }, + { + "epoch": 4.558659217877095, + "grad_norm": 0.013033718802034855, + "learning_rate": 2.265447272322213e-06, + "loss": 0.0033, + "step": 612 + }, + { + "epoch": 4.56610800744879, + "grad_norm": 0.011297180317342281, + "learning_rate": 2.1885719203279588e-06, + "loss": 0.0028, + "step": 613 + }, + { + "epoch": 4.573556797020484, + "grad_norm": 0.010839039459824562, + "learning_rate": 2.112994332727952e-06, + "loss": 0.0026, + "step": 614 + }, + { + "epoch": 4.581005586592179, + "grad_norm": 0.01038146112114191, + "learning_rate": 2.0387165609543736e-06, + "loss": 0.0026, + "step": 615 + }, + { + "epoch": 4.581005586592179, + "eval_loss": 0.006171726621687412, + "eval_runtime": 1.4427, + "eval_samples_per_second": 5.545, + "eval_steps_per_second": 1.386, + "step": 615 + }, + { + "epoch": 4.588454376163874, + "grad_norm": 0.01402036752551794, + "learning_rate": 1.9657406211579966e-06, + "loss": 0.0032, + "step": 616 + }, + { + "epoch": 4.595903165735568, + "grad_norm": 0.011762767098844051, + "learning_rate": 1.8940684941534392e-06, + "loss": 0.0026, + "step": 617 + }, + { + "epoch": 4.603351955307263, + "grad_norm": 0.012213380075991154, + "learning_rate": 1.8237021253654396e-06, + "loss": 0.003, + "step": 618 + }, + { + "epoch": 4.610800744878957, + "grad_norm": 0.011311118490993977, + "learning_rate": 1.7546434247760146e-06, + "loss": 0.0027, + "step": 619 + }, + { + "epoch": 4.618249534450651, + "grad_norm": 0.011504790745675564, + "learning_rate": 1.6868942668726407e-06, + "loss": 0.0028, + "step": 620 + }, + { + "epoch": 4.618249534450651, + "eval_loss": 0.006179572083055973, + "eval_runtime": 1.4385, + "eval_samples_per_second": 5.561, + "eval_steps_per_second": 1.39, + "step": 620 + }, + { + "epoch": 4.625698324022347, + "grad_norm": 0.011975683271884918, + "learning_rate": 1.6204564905973386e-06, + "loss": 0.0029, + "step": 621 + }, + { + "epoch": 4.633147113594041, + "grad_norm": 0.012049319222569466, + "learning_rate": 1.555331899296808e-06, + "loss": 0.003, + "step": 622 + }, + { + "epoch": 4.640595903165735, + "grad_norm": 0.011926544830203056, + "learning_rate": 1.4915222606734392e-06, + "loss": 0.003, + "step": 623 + }, + { + "epoch": 4.648044692737431, + "grad_norm": 0.014196524396538734, + "learning_rate": 1.429029306737345e-06, + "loss": 0.0029, + "step": 624 + }, + { + "epoch": 4.655493482309125, + "grad_norm": 0.011004266329109669, + "learning_rate": 1.3678547337593494e-06, + "loss": 0.0024, + "step": 625 + }, + { + "epoch": 4.655493482309125, + "eval_loss": 0.006184046622365713, + "eval_runtime": 1.4382, + "eval_samples_per_second": 5.563, + "eval_steps_per_second": 1.391, + "step": 625 + }, + { + "epoch": 4.662942271880819, + "grad_norm": 0.01366499625146389, + "learning_rate": 1.3080002022249405e-06, + "loss": 0.0038, + "step": 626 + }, + { + "epoch": 4.670391061452514, + "grad_norm": 0.011911512352526188, + "learning_rate": 1.2494673367892062e-06, + "loss": 0.0028, + "step": 627 + }, + { + "epoch": 4.677839851024208, + "grad_norm": 0.012180446647107601, + "learning_rate": 1.1922577262327373e-06, + "loss": 0.0028, + "step": 628 + }, + { + "epoch": 4.685288640595903, + "grad_norm": 0.011556231416761875, + "learning_rate": 1.1363729234184827e-06, + "loss": 0.0029, + "step": 629 + }, + { + "epoch": 4.692737430167598, + "grad_norm": 0.011940409429371357, + "learning_rate": 1.0818144452496292e-06, + "loss": 0.0031, + "step": 630 + }, + { + "epoch": 4.692737430167598, + "eval_loss": 0.006148128770291805, + "eval_runtime": 1.4371, + "eval_samples_per_second": 5.567, + "eval_steps_per_second": 1.392, + "step": 630 + }, + { + "epoch": 4.700186219739292, + "grad_norm": 0.011379748582839966, + "learning_rate": 1.0285837726283998e-06, + "loss": 0.0027, + "step": 631 + }, + { + "epoch": 4.707635009310987, + "grad_norm": 0.010856245644390583, + "learning_rate": 9.76682350415875e-07, + "loss": 0.0024, + "step": 632 + }, + { + "epoch": 4.715083798882682, + "grad_norm": 0.012437484227120876, + "learning_rate": 9.261115873927695e-07, + "loss": 0.0031, + "step": 633 + }, + { + "epoch": 4.722532588454376, + "grad_norm": 0.014863853342831135, + "learning_rate": 8.768728562211947e-07, + "loss": 0.0027, + "step": 634 + }, + { + "epoch": 4.72998137802607, + "grad_norm": 0.013696379959583282, + "learning_rate": 8.289674934073844e-07, + "loss": 0.0028, + "step": 635 + }, + { + "epoch": 4.72998137802607, + "eval_loss": 0.006156946066766977, + "eval_runtime": 1.4405, + "eval_samples_per_second": 5.554, + "eval_steps_per_second": 1.388, + "step": 635 + }, + { + "epoch": 4.737430167597766, + "grad_norm": 0.011920612305402756, + "learning_rate": 7.823967992654502e-07, + "loss": 0.0027, + "step": 636 + }, + { + "epoch": 4.74487895716946, + "grad_norm": 0.011787285096943378, + "learning_rate": 7.371620378820554e-07, + "loss": 0.003, + "step": 637 + }, + { + "epoch": 4.752327746741154, + "grad_norm": 0.012289394624531269, + "learning_rate": 6.932644370821085e-07, + "loss": 0.003, + "step": 638 + }, + { + "epoch": 4.759776536312849, + "grad_norm": 0.011694079264998436, + "learning_rate": 6.507051883954618e-07, + "loss": 0.0027, + "step": 639 + }, + { + "epoch": 4.767225325884544, + "grad_norm": 0.011612670496106148, + "learning_rate": 6.094854470245326e-07, + "loss": 0.0025, + "step": 640 + }, + { + "epoch": 4.767225325884544, + "eval_loss": 0.00619715079665184, + "eval_runtime": 1.4397, + "eval_samples_per_second": 5.557, + "eval_steps_per_second": 1.389, + "step": 640 + }, + { + "epoch": 4.774674115456238, + "grad_norm": 0.012073226273059845, + "learning_rate": 5.696063318129663e-07, + "loss": 0.0029, + "step": 641 + }, + { + "epoch": 4.782122905027933, + "grad_norm": 0.012153049930930138, + "learning_rate": 5.310689252152834e-07, + "loss": 0.0027, + "step": 642 + }, + { + "epoch": 4.789571694599628, + "grad_norm": 0.011195520870387554, + "learning_rate": 4.938742732674529e-07, + "loss": 0.0029, + "step": 643 + }, + { + "epoch": 4.797020484171322, + "grad_norm": 0.011535811237990856, + "learning_rate": 4.5802338555854254e-07, + "loss": 0.0027, + "step": 644 + }, + { + "epoch": 4.804469273743017, + "grad_norm": 0.01212665531784296, + "learning_rate": 4.235172352033023e-07, + "loss": 0.003, + "step": 645 + }, + { + "epoch": 4.804469273743017, + "eval_loss": 0.006184935569763184, + "eval_runtime": 1.4377, + "eval_samples_per_second": 5.565, + "eval_steps_per_second": 1.391, + "step": 645 + }, + { + "epoch": 4.811918063314711, + "grad_norm": 0.011656287126243114, + "learning_rate": 3.903567588157353e-07, + "loss": 0.0026, + "step": 646 + }, + { + "epoch": 4.8193668528864055, + "grad_norm": 0.01080574281513691, + "learning_rate": 3.585428564836957e-07, + "loss": 0.0029, + "step": 647 + }, + { + "epoch": 4.826815642458101, + "grad_norm": 0.011630838736891747, + "learning_rate": 3.280763917444363e-07, + "loss": 0.003, + "step": 648 + }, + { + "epoch": 4.834264432029795, + "grad_norm": 0.012125520035624504, + "learning_rate": 2.9895819156119943e-07, + "loss": 0.0027, + "step": 649 + }, + { + "epoch": 4.8417132216014895, + "grad_norm": 0.011449109762907028, + "learning_rate": 2.711890463007405e-07, + "loss": 0.0027, + "step": 650 + }, + { + "epoch": 4.8417132216014895, + "eval_loss": 0.006186852231621742, + "eval_runtime": 1.4384, + "eval_samples_per_second": 5.562, + "eval_steps_per_second": 1.39, + "step": 650 + }, + { + "epoch": 4.849162011173185, + "grad_norm": 0.011429588310420513, + "learning_rate": 2.447697097118951e-07, + "loss": 0.0025, + "step": 651 + }, + { + "epoch": 4.856610800744879, + "grad_norm": 0.011758127249777317, + "learning_rate": 2.1970089890509527e-07, + "loss": 0.003, + "step": 652 + }, + { + "epoch": 4.8640595903165735, + "grad_norm": 0.012119573540985584, + "learning_rate": 1.9598329433293538e-07, + "loss": 0.003, + "step": 653 + }, + { + "epoch": 4.871508379888268, + "grad_norm": 0.015237159095704556, + "learning_rate": 1.7361753977169215e-07, + "loss": 0.0036, + "step": 654 + }, + { + "epoch": 4.878957169459962, + "grad_norm": 0.011369496583938599, + "learning_rate": 1.5260424230382763e-07, + "loss": 0.0027, + "step": 655 + }, + { + "epoch": 4.878957169459962, + "eval_loss": 0.006174933630973101, + "eval_runtime": 1.4412, + "eval_samples_per_second": 5.551, + "eval_steps_per_second": 1.388, + "step": 655 + }, + { + "epoch": 4.8864059590316575, + "grad_norm": 0.01241251826286316, + "learning_rate": 1.3294397230153577e-07, + "loss": 0.0031, + "step": 656 + }, + { + "epoch": 4.893854748603352, + "grad_norm": 0.01049934420734644, + "learning_rate": 1.1463726341126025e-07, + "loss": 0.0025, + "step": 657 + }, + { + "epoch": 4.901303538175046, + "grad_norm": 0.012500923126935959, + "learning_rate": 9.768461253920614e-08, + "loss": 0.0022, + "step": 658 + }, + { + "epoch": 4.9087523277467415, + "grad_norm": 0.010567902587354183, + "learning_rate": 8.208647983782847e-08, + "loss": 0.0026, + "step": 659 + }, + { + "epoch": 4.916201117318436, + "grad_norm": 0.011495240963995457, + "learning_rate": 6.784328869339218e-08, + "loss": 0.0028, + "step": 660 + }, + { + "epoch": 4.916201117318436, + "eval_loss": 0.006185730919241905, + "eval_runtime": 1.4398, + "eval_samples_per_second": 5.556, + "eval_steps_per_second": 1.389, + "step": 660 + }, + { + "epoch": 4.92364990689013, + "grad_norm": 0.014018573798239231, + "learning_rate": 5.4955425714431353e-08, + "loss": 0.0028, + "step": 661 + }, + { + "epoch": 4.931098696461825, + "grad_norm": 0.011257159523665905, + "learning_rate": 4.3423240721268686e-08, + "loss": 0.0028, + "step": 662 + }, + { + "epoch": 4.93854748603352, + "grad_norm": 0.012268785387277603, + "learning_rate": 3.324704673655088e-08, + "loss": 0.0027, + "step": 663 + }, + { + "epoch": 4.945996275605214, + "grad_norm": 0.012677626684308052, + "learning_rate": 2.442711997670544e-08, + "loss": 0.003, + "step": 664 + }, + { + "epoch": 4.953445065176909, + "grad_norm": 0.015152164734899998, + "learning_rate": 1.6963699844474434e-08, + "loss": 0.0029, + "step": 665 + }, + { + "epoch": 4.953445065176909, + "eval_loss": 0.006173715926706791, + "eval_runtime": 1.4376, + "eval_samples_per_second": 5.565, + "eval_steps_per_second": 1.391, + "step": 665 + }, + { + "epoch": 4.960893854748603, + "grad_norm": 0.011466389521956444, + "learning_rate": 1.0856988922403056e-08, + "loss": 0.0023, + "step": 666 + }, + { + "epoch": 4.968342644320298, + "grad_norm": 0.011559192091226578, + "learning_rate": 6.107152967349539e-09, + "loss": 0.0027, + "step": 667 + }, + { + "epoch": 4.975791433891993, + "grad_norm": 0.01306323055177927, + "learning_rate": 2.714320905977674e-09, + "loss": 0.0033, + "step": 668 + }, + { + "epoch": 4.983240223463687, + "grad_norm": 0.012209140695631504, + "learning_rate": 6.785848312707011e-10, + "loss": 0.0032, + "step": 669 + }, + { + "epoch": 4.990689013035381, + "grad_norm": 0.012395043857395649, + "learning_rate": 0.0, + "loss": 0.0029, + "step": 670 + }, + { + "epoch": 4.990689013035381, + "eval_loss": 0.006168271414935589, + "eval_runtime": 1.4372, + "eval_samples_per_second": 5.567, + "eval_steps_per_second": 1.392, + "step": 670 + }, + { + "epoch": 4.990689013035381, + "step": 670, + "total_flos": 1.295611262483497e+18, + "train_loss": 0.009448963423509763, + "train_runtime": 10028.0419, + "train_samples_per_second": 2.142, + "train_steps_per_second": 0.067 + } + ], + "logging_steps": 1, + "max_steps": 670, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.295611262483497e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}