{ "best_metric": null, "best_model_checkpoint": null, "epoch": 22.995433789954337, "eval_steps": 500, "global_step": 2518, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.182648401826484, "grad_norm": 0.4084379971027374, "learning_rate": 0.0001, "loss": 0.8312, "step": 20 }, { "epoch": 0.365296803652968, "grad_norm": 0.24725936353206635, "learning_rate": 0.0001, "loss": 0.1547, "step": 40 }, { "epoch": 0.547945205479452, "grad_norm": 0.1690889149904251, "learning_rate": 0.0001, "loss": 0.0644, "step": 60 }, { "epoch": 0.730593607305936, "grad_norm": 0.09192364662885666, "learning_rate": 0.0001, "loss": 0.0466, "step": 80 }, { "epoch": 0.91324200913242, "grad_norm": 0.08266641944646835, "learning_rate": 0.0001, "loss": 0.0385, "step": 100 }, { "epoch": 1.095890410958904, "grad_norm": 0.10168185085058212, "learning_rate": 0.0001, "loss": 0.0379, "step": 120 }, { "epoch": 1.278538812785388, "grad_norm": 0.10715723037719727, "learning_rate": 0.0001, "loss": 0.0337, "step": 140 }, { "epoch": 1.461187214611872, "grad_norm": 0.08185174316167831, "learning_rate": 0.0001, "loss": 0.0304, "step": 160 }, { "epoch": 1.643835616438356, "grad_norm": 0.0720980241894722, "learning_rate": 0.0001, "loss": 0.0342, "step": 180 }, { "epoch": 1.82648401826484, "grad_norm": 0.07974616438150406, "learning_rate": 0.0001, "loss": 0.0312, "step": 200 }, { "epoch": 2.009132420091324, "grad_norm": 0.08611268550157547, "learning_rate": 0.0001, "loss": 0.0315, "step": 220 }, { "epoch": 2.191780821917808, "grad_norm": 0.06699004024267197, "learning_rate": 0.0001, "loss": 0.0267, "step": 240 }, { "epoch": 2.374429223744292, "grad_norm": 0.1077587902545929, "learning_rate": 0.0001, "loss": 0.0246, "step": 260 }, { "epoch": 2.557077625570776, "grad_norm": 0.10352851450443268, "learning_rate": 0.0001, "loss": 0.0267, "step": 280 }, { "epoch": 2.73972602739726, "grad_norm": 0.08488716930150986, "learning_rate": 0.0001, "loss": 0.0297, "step": 300 }, { "epoch": 2.922374429223744, "grad_norm": 0.08407847583293915, "learning_rate": 0.0001, "loss": 0.0269, "step": 320 }, { "epoch": 3.105022831050228, "grad_norm": 0.0976366400718689, "learning_rate": 0.0001, "loss": 0.0251, "step": 340 }, { "epoch": 3.287671232876712, "grad_norm": 0.08240761607885361, "learning_rate": 0.0001, "loss": 0.0229, "step": 360 }, { "epoch": 3.470319634703196, "grad_norm": 0.0689239650964737, "learning_rate": 0.0001, "loss": 0.0232, "step": 380 }, { "epoch": 3.65296803652968, "grad_norm": 0.0607539638876915, "learning_rate": 0.0001, "loss": 0.0231, "step": 400 }, { "epoch": 3.8356164383561646, "grad_norm": 0.06858925521373749, "learning_rate": 0.0001, "loss": 0.023, "step": 420 }, { "epoch": 4.018264840182648, "grad_norm": 0.04049643874168396, "learning_rate": 0.0001, "loss": 0.0231, "step": 440 }, { "epoch": 4.200913242009133, "grad_norm": 0.08556920289993286, "learning_rate": 0.0001, "loss": 0.018, "step": 460 }, { "epoch": 4.383561643835616, "grad_norm": 0.05961354076862335, "learning_rate": 0.0001, "loss": 0.0183, "step": 480 }, { "epoch": 4.566210045662101, "grad_norm": 0.05691586434841156, "learning_rate": 0.0001, "loss": 0.02, "step": 500 }, { "epoch": 4.748858447488584, "grad_norm": 0.05423538759350777, "learning_rate": 0.0001, "loss": 0.0196, "step": 520 }, { "epoch": 4.931506849315069, "grad_norm": 0.10058747231960297, "learning_rate": 0.0001, "loss": 0.0206, "step": 540 }, { "epoch": 5.114155251141552, "grad_norm": 0.064676932990551, "learning_rate": 0.0001, "loss": 0.0177, "step": 560 }, { "epoch": 5.296803652968037, "grad_norm": 0.08128379285335541, "learning_rate": 0.0001, "loss": 0.0157, "step": 580 }, { "epoch": 5.47945205479452, "grad_norm": 0.10474538058042526, "learning_rate": 0.0001, "loss": 0.0169, "step": 600 }, { "epoch": 5.662100456621005, "grad_norm": 0.09420209378004074, "learning_rate": 0.0001, "loss": 0.0207, "step": 620 }, { "epoch": 5.844748858447488, "grad_norm": 0.07704417407512665, "learning_rate": 0.0001, "loss": 0.018, "step": 640 }, { "epoch": 6.027397260273973, "grad_norm": 0.044411078095436096, "learning_rate": 0.0001, "loss": 0.0168, "step": 660 }, { "epoch": 6.210045662100456, "grad_norm": 0.09763959795236588, "learning_rate": 0.0001, "loss": 0.0131, "step": 680 }, { "epoch": 6.392694063926941, "grad_norm": 0.08706251531839371, "learning_rate": 0.0001, "loss": 0.0146, "step": 700 }, { "epoch": 6.575342465753424, "grad_norm": 0.10404196381568909, "learning_rate": 0.0001, "loss": 0.0169, "step": 720 }, { "epoch": 6.757990867579909, "grad_norm": 0.1037658154964447, "learning_rate": 0.0001, "loss": 0.0165, "step": 740 }, { "epoch": 6.940639269406392, "grad_norm": 0.07572110742330551, "learning_rate": 0.0001, "loss": 0.0168, "step": 760 }, { "epoch": 7.123287671232877, "grad_norm": 0.06740553677082062, "learning_rate": 0.0001, "loss": 0.0139, "step": 780 }, { "epoch": 7.30593607305936, "grad_norm": 0.08043979108333588, "learning_rate": 0.0001, "loss": 0.014, "step": 800 }, { "epoch": 7.488584474885845, "grad_norm": 0.06607798486948013, "learning_rate": 0.0001, "loss": 0.0136, "step": 820 }, { "epoch": 7.671232876712329, "grad_norm": 0.11705009639263153, "learning_rate": 0.0001, "loss": 0.0146, "step": 840 }, { "epoch": 7.853881278538813, "grad_norm": 0.04560132324695587, "learning_rate": 0.0001, "loss": 0.0154, "step": 860 }, { "epoch": 8.036529680365296, "grad_norm": 0.05037812143564224, "learning_rate": 0.0001, "loss": 0.0129, "step": 880 }, { "epoch": 8.219178082191782, "grad_norm": 0.07135117053985596, "learning_rate": 0.0001, "loss": 0.0109, "step": 900 }, { "epoch": 8.401826484018265, "grad_norm": 0.05977578088641167, "learning_rate": 0.0001, "loss": 0.0117, "step": 920 }, { "epoch": 8.584474885844749, "grad_norm": 0.07411223649978638, "learning_rate": 0.0001, "loss": 0.0111, "step": 940 }, { "epoch": 8.767123287671232, "grad_norm": 0.08515261113643646, "learning_rate": 0.0001, "loss": 0.0122, "step": 960 }, { "epoch": 8.949771689497716, "grad_norm": 0.07383166998624802, "learning_rate": 0.0001, "loss": 0.0125, "step": 980 }, { "epoch": 9.132420091324201, "grad_norm": 0.041954681277275085, "learning_rate": 0.0001, "loss": 0.0105, "step": 1000 }, { "epoch": 9.315068493150685, "grad_norm": 0.09089387208223343, "learning_rate": 0.0001, "loss": 0.0105, "step": 1020 }, { "epoch": 9.497716894977168, "grad_norm": 0.08716876059770584, "learning_rate": 0.0001, "loss": 0.011, "step": 1040 }, { "epoch": 9.680365296803654, "grad_norm": 0.04927799850702286, "learning_rate": 0.0001, "loss": 0.0106, "step": 1060 }, { "epoch": 9.863013698630137, "grad_norm": 0.05259260907769203, "learning_rate": 0.0001, "loss": 0.0111, "step": 1080 }, { "epoch": 10.045662100456621, "grad_norm": 0.04412449151277542, "learning_rate": 0.0001, "loss": 0.0106, "step": 1100 }, { "epoch": 10.228310502283104, "grad_norm": 0.05673637241125107, "learning_rate": 0.0001, "loss": 0.0087, "step": 1120 }, { "epoch": 10.41095890410959, "grad_norm": 0.04577219486236572, "learning_rate": 0.0001, "loss": 0.0094, "step": 1140 }, { "epoch": 10.593607305936073, "grad_norm": 0.05691211298108101, "learning_rate": 0.0001, "loss": 0.0098, "step": 1160 }, { "epoch": 10.776255707762557, "grad_norm": 0.05354565382003784, "learning_rate": 0.0001, "loss": 0.01, "step": 1180 }, { "epoch": 10.95890410958904, "grad_norm": 0.06758158653974533, "learning_rate": 0.0001, "loss": 0.0104, "step": 1200 }, { "epoch": 11.141552511415526, "grad_norm": 0.05417347326874733, "learning_rate": 0.0001, "loss": 0.009, "step": 1220 }, { "epoch": 11.32420091324201, "grad_norm": 0.05120660364627838, "learning_rate": 0.0001, "loss": 0.0104, "step": 1240 }, { "epoch": 11.506849315068493, "grad_norm": 0.051275257021188736, "learning_rate": 0.0001, "loss": 0.0095, "step": 1260 }, { "epoch": 11.689497716894977, "grad_norm": 0.044794872403144836, "learning_rate": 0.0001, "loss": 0.0088, "step": 1280 }, { "epoch": 11.872146118721462, "grad_norm": 0.09698979556560516, "learning_rate": 0.0001, "loss": 0.009, "step": 1300 }, { "epoch": 12.054794520547945, "grad_norm": 0.060981862246990204, "learning_rate": 0.0001, "loss": 0.0091, "step": 1320 }, { "epoch": 12.237442922374429, "grad_norm": 0.0480022057890892, "learning_rate": 0.0001, "loss": 0.0085, "step": 1340 }, { "epoch": 12.420091324200913, "grad_norm": 0.05448669195175171, "learning_rate": 0.0001, "loss": 0.0082, "step": 1360 }, { "epoch": 12.602739726027398, "grad_norm": 0.06578750908374786, "learning_rate": 0.0001, "loss": 0.0086, "step": 1380 }, { "epoch": 12.785388127853881, "grad_norm": 0.0766950324177742, "learning_rate": 0.0001, "loss": 0.0089, "step": 1400 }, { "epoch": 12.968036529680365, "grad_norm": 0.05735301971435547, "learning_rate": 0.0001, "loss": 0.0094, "step": 1420 }, { "epoch": 13.150684931506849, "grad_norm": 0.05431370809674263, "learning_rate": 0.0001, "loss": 0.0094, "step": 1440 }, { "epoch": 13.333333333333334, "grad_norm": 0.04852620139718056, "learning_rate": 0.0001, "loss": 0.008, "step": 1460 }, { "epoch": 13.515981735159817, "grad_norm": 0.022798724472522736, "learning_rate": 0.0001, "loss": 0.0083, "step": 1480 }, { "epoch": 13.698630136986301, "grad_norm": 0.040976837277412415, "learning_rate": 0.0001, "loss": 0.0083, "step": 1500 }, { "epoch": 13.881278538812785, "grad_norm": 0.04464666545391083, "learning_rate": 0.0001, "loss": 0.0087, "step": 1520 }, { "epoch": 14.06392694063927, "grad_norm": 0.03699250891804695, "learning_rate": 0.0001, "loss": 0.0075, "step": 1540 }, { "epoch": 14.246575342465754, "grad_norm": 0.03735646605491638, "learning_rate": 0.0001, "loss": 0.008, "step": 1560 }, { "epoch": 14.429223744292237, "grad_norm": 0.04999354109168053, "learning_rate": 0.0001, "loss": 0.0077, "step": 1580 }, { "epoch": 14.61187214611872, "grad_norm": 0.032685693353414536, "learning_rate": 0.0001, "loss": 0.0078, "step": 1600 }, { "epoch": 14.794520547945206, "grad_norm": 0.051545485854148865, "learning_rate": 0.0001, "loss": 0.0076, "step": 1620 }, { "epoch": 14.97716894977169, "grad_norm": 0.046069201081991196, "learning_rate": 0.0001, "loss": 0.0088, "step": 1640 }, { "epoch": 15.159817351598173, "grad_norm": 0.07814318686723709, "learning_rate": 0.0001, "loss": 0.0088, "step": 1660 }, { "epoch": 15.342465753424657, "grad_norm": 0.05025108903646469, "learning_rate": 0.0001, "loss": 0.0076, "step": 1680 }, { "epoch": 15.525114155251142, "grad_norm": 0.05556974932551384, "learning_rate": 0.0001, "loss": 0.0079, "step": 1700 }, { "epoch": 15.707762557077626, "grad_norm": 0.16063013672828674, "learning_rate": 0.0001, "loss": 0.0091, "step": 1720 }, { "epoch": 15.89041095890411, "grad_norm": 0.05164189264178276, "learning_rate": 0.0001, "loss": 0.008, "step": 1740 }, { "epoch": 16.073059360730593, "grad_norm": 0.07564544677734375, "learning_rate": 0.0001, "loss": 0.0082, "step": 1760 }, { "epoch": 16.255707762557076, "grad_norm": 0.13074898719787598, "learning_rate": 0.0001, "loss": 0.0081, "step": 1780 }, { "epoch": 16.438356164383563, "grad_norm": 0.05894935131072998, "learning_rate": 0.0001, "loss": 0.0079, "step": 1800 }, { "epoch": 16.621004566210047, "grad_norm": 0.05766492336988449, "learning_rate": 0.0001, "loss": 0.0082, "step": 1820 }, { "epoch": 16.80365296803653, "grad_norm": 0.04949938878417015, "learning_rate": 0.0001, "loss": 0.0081, "step": 1840 }, { "epoch": 16.986301369863014, "grad_norm": 0.0840422660112381, "learning_rate": 0.0001, "loss": 0.0084, "step": 1860 }, { "epoch": 17.168949771689498, "grad_norm": 0.059242211282253265, "learning_rate": 0.0001, "loss": 0.0067, "step": 1880 }, { "epoch": 17.35159817351598, "grad_norm": 0.08060181140899658, "learning_rate": 0.0001, "loss": 0.0076, "step": 1900 }, { "epoch": 17.534246575342465, "grad_norm": 0.04570634663105011, "learning_rate": 0.0001, "loss": 0.0077, "step": 1920 }, { "epoch": 17.71689497716895, "grad_norm": 0.05903254821896553, "learning_rate": 0.0001, "loss": 0.0078, "step": 1940 }, { "epoch": 17.899543378995435, "grad_norm": 0.04280728101730347, "learning_rate": 0.0001, "loss": 0.0076, "step": 1960 }, { "epoch": 18.08219178082192, "grad_norm": 0.04099351540207863, "learning_rate": 0.0001, "loss": 0.0074, "step": 1980 }, { "epoch": 18.264840182648403, "grad_norm": 0.03019886650145054, "learning_rate": 0.0001, "loss": 0.0069, "step": 2000 }, { "epoch": 18.447488584474886, "grad_norm": 0.033835116773843765, "learning_rate": 0.0001, "loss": 0.0068, "step": 2020 }, { "epoch": 18.63013698630137, "grad_norm": 0.04315301775932312, "learning_rate": 0.0001, "loss": 0.0075, "step": 2040 }, { "epoch": 18.812785388127853, "grad_norm": 0.07251162081956863, "learning_rate": 0.0001, "loss": 0.0072, "step": 2060 }, { "epoch": 18.995433789954337, "grad_norm": 0.0530848354101181, "learning_rate": 0.0001, "loss": 0.0077, "step": 2080 }, { "epoch": 19.17808219178082, "grad_norm": 0.03507260978221893, "learning_rate": 0.0001, "loss": 0.0065, "step": 2100 }, { "epoch": 19.360730593607308, "grad_norm": 0.038022469729185104, "learning_rate": 0.0001, "loss": 0.0065, "step": 2120 }, { "epoch": 19.54337899543379, "grad_norm": 0.07193304598331451, "learning_rate": 0.0001, "loss": 0.0073, "step": 2140 }, { "epoch": 19.726027397260275, "grad_norm": 0.03509294614195824, "learning_rate": 0.0001, "loss": 0.0077, "step": 2160 }, { "epoch": 19.908675799086758, "grad_norm": 0.04408028721809387, "learning_rate": 0.0001, "loss": 0.0073, "step": 2180 }, { "epoch": 20.091324200913242, "grad_norm": 0.043653570115566254, "learning_rate": 0.0001, "loss": 0.0066, "step": 2200 }, { "epoch": 20.273972602739725, "grad_norm": 0.04543660581111908, "learning_rate": 0.0001, "loss": 0.0064, "step": 2220 }, { "epoch": 20.45662100456621, "grad_norm": 0.020475788041949272, "learning_rate": 0.0001, "loss": 0.0067, "step": 2240 }, { "epoch": 20.639269406392692, "grad_norm": 0.029115352779626846, "learning_rate": 0.0001, "loss": 0.0069, "step": 2260 }, { "epoch": 20.82191780821918, "grad_norm": 0.03670508787035942, "learning_rate": 0.0001, "loss": 0.0075, "step": 2280 }, { "epoch": 21.004566210045663, "grad_norm": 0.03692976012825966, "learning_rate": 0.0001, "loss": 0.007, "step": 2300 }, { "epoch": 21.187214611872147, "grad_norm": 0.02499617636203766, "learning_rate": 0.0001, "loss": 0.0064, "step": 2320 }, { "epoch": 21.36986301369863, "grad_norm": 0.02078641578555107, "learning_rate": 0.0001, "loss": 0.0061, "step": 2340 }, { "epoch": 21.552511415525114, "grad_norm": 0.03682105615735054, "learning_rate": 0.0001, "loss": 0.0065, "step": 2360 }, { "epoch": 21.735159817351597, "grad_norm": 0.02736218087375164, "learning_rate": 0.0001, "loss": 0.0069, "step": 2380 }, { "epoch": 21.91780821917808, "grad_norm": 0.04047287255525589, "learning_rate": 0.0001, "loss": 0.007, "step": 2400 }, { "epoch": 22.100456621004565, "grad_norm": 0.01508075650781393, "learning_rate": 0.0001, "loss": 0.0069, "step": 2420 }, { "epoch": 22.28310502283105, "grad_norm": 0.06898848712444305, "learning_rate": 0.0001, "loss": 0.0071, "step": 2440 }, { "epoch": 22.465753424657535, "grad_norm": 0.037652187049388885, "learning_rate": 0.0001, "loss": 0.0064, "step": 2460 }, { "epoch": 22.64840182648402, "grad_norm": 0.03481518477201462, "learning_rate": 0.0001, "loss": 0.0069, "step": 2480 }, { "epoch": 22.831050228310502, "grad_norm": 0.03326406702399254, "learning_rate": 0.0001, "loss": 0.007, "step": 2500 } ], "logging_steps": 20, "max_steps": 10900, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.910639205385347e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }