{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.6166281755196303, "eval_steps": 500, "global_step": 3500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004618937644341801, "grad_norm": 10.975909233093262, "learning_rate": 3.4642032332563515e-07, "loss": 2.4942, "step": 10 }, { "epoch": 0.009237875288683603, "grad_norm": 9.186948776245117, "learning_rate": 9.237875288683603e-07, "loss": 2.7582, "step": 20 }, { "epoch": 0.013856812933025405, "grad_norm": 10.907584190368652, "learning_rate": 1.443418013856813e-06, "loss": 2.6814, "step": 30 }, { "epoch": 0.018475750577367205, "grad_norm": 9.531168937683105, "learning_rate": 2.0207852193995383e-06, "loss": 2.6982, "step": 40 }, { "epoch": 0.023094688221709007, "grad_norm": 14.727725982666016, "learning_rate": 2.5981524249422633e-06, "loss": 2.0218, "step": 50 }, { "epoch": 0.02771362586605081, "grad_norm": 8.314309120178223, "learning_rate": 3.117782909930716e-06, "loss": 1.5595, "step": 60 }, { "epoch": 0.03233256351039261, "grad_norm": 4.944284915924072, "learning_rate": 3.6951501154734412e-06, "loss": 1.02, "step": 70 }, { "epoch": 0.03695150115473441, "grad_norm": 10.882843971252441, "learning_rate": 4.272517321016166e-06, "loss": 1.0419, "step": 80 }, { "epoch": 0.04157043879907621, "grad_norm": 12.310320854187012, "learning_rate": 4.849884526558892e-06, "loss": 1.0801, "step": 90 }, { "epoch": 0.046189376443418015, "grad_norm": 6.49992036819458, "learning_rate": 5.427251732101616e-06, "loss": 0.8444, "step": 100 }, { "epoch": 0.050808314087759814, "grad_norm": 2.89493465423584, "learning_rate": 6.004618937644342e-06, "loss": 0.8884, "step": 110 }, { "epoch": 0.05542725173210162, "grad_norm": 3.966763734817505, "learning_rate": 6.581986143187067e-06, "loss": 0.8672, "step": 120 }, { "epoch": 0.06004618937644342, "grad_norm": 4.442293167114258, "learning_rate": 7.159353348729793e-06, "loss": 0.8037, "step": 130 }, { "epoch": 0.06466512702078522, "grad_norm": 2.506918430328369, "learning_rate": 7.736720554272519e-06, "loss": 0.724, "step": 140 }, { "epoch": 0.06928406466512702, "grad_norm": 5.733686447143555, "learning_rate": 8.314087759815242e-06, "loss": 0.7692, "step": 150 }, { "epoch": 0.07390300230946882, "grad_norm": 4.161188125610352, "learning_rate": 8.891454965357968e-06, "loss": 0.7413, "step": 160 }, { "epoch": 0.07852193995381063, "grad_norm": 3.9434962272644043, "learning_rate": 9.468822170900693e-06, "loss": 0.7386, "step": 170 }, { "epoch": 0.08314087759815242, "grad_norm": 2.9100701808929443, "learning_rate": 1.0046189376443418e-05, "loss": 0.6942, "step": 180 }, { "epoch": 0.08775981524249422, "grad_norm": 5.367318153381348, "learning_rate": 1.0623556581986144e-05, "loss": 0.8011, "step": 190 }, { "epoch": 0.09237875288683603, "grad_norm": 3.1690614223480225, "learning_rate": 1.1200923787528869e-05, "loss": 0.6816, "step": 200 }, { "epoch": 0.09699769053117784, "grad_norm": 4.35976505279541, "learning_rate": 1.1778290993071595e-05, "loss": 0.7408, "step": 210 }, { "epoch": 0.10161662817551963, "grad_norm": 3.330937623977661, "learning_rate": 1.235565819861432e-05, "loss": 0.7159, "step": 220 }, { "epoch": 0.10623556581986143, "grad_norm": 5.761129379272461, "learning_rate": 1.2933025404157046e-05, "loss": 0.6838, "step": 230 }, { "epoch": 0.11085450346420324, "grad_norm": 7.05668830871582, "learning_rate": 1.351039260969977e-05, "loss": 0.7135, "step": 240 }, { "epoch": 0.11547344110854503, "grad_norm": 3.7135939598083496, "learning_rate": 1.4087759815242497e-05, "loss": 0.6385, "step": 250 }, { "epoch": 0.12009237875288684, "grad_norm": 5.477907657623291, "learning_rate": 1.4665127020785218e-05, "loss": 0.6292, "step": 260 }, { "epoch": 0.12471131639722864, "grad_norm": 6.577059268951416, "learning_rate": 1.5242494226327944e-05, "loss": 0.6921, "step": 270 }, { "epoch": 0.12933025404157045, "grad_norm": 3.6328892707824707, "learning_rate": 1.581986143187067e-05, "loss": 0.6621, "step": 280 }, { "epoch": 0.13394919168591224, "grad_norm": 4.084783554077148, "learning_rate": 1.6397228637413393e-05, "loss": 0.6667, "step": 290 }, { "epoch": 0.13856812933025403, "grad_norm": 3.8719701766967773, "learning_rate": 1.697459584295612e-05, "loss": 0.6692, "step": 300 }, { "epoch": 0.14318706697459585, "grad_norm": 7.860931873321533, "learning_rate": 1.7551963048498846e-05, "loss": 0.6251, "step": 310 }, { "epoch": 0.14780600461893764, "grad_norm": 4.381837368011475, "learning_rate": 1.812933025404157e-05, "loss": 0.6297, "step": 320 }, { "epoch": 0.15242494226327943, "grad_norm": 3.7145886421203613, "learning_rate": 1.8706697459584295e-05, "loss": 0.6483, "step": 330 }, { "epoch": 0.15704387990762125, "grad_norm": 2.609006643295288, "learning_rate": 1.9284064665127023e-05, "loss": 0.6149, "step": 340 }, { "epoch": 0.16166281755196305, "grad_norm": 4.774081230163574, "learning_rate": 1.9861431870669748e-05, "loss": 0.6034, "step": 350 }, { "epoch": 0.16628175519630484, "grad_norm": 7.305100440979004, "learning_rate": 2.0438799076212473e-05, "loss": 0.6496, "step": 360 }, { "epoch": 0.17090069284064666, "grad_norm": 5.507181644439697, "learning_rate": 2.1016166281755197e-05, "loss": 0.643, "step": 370 }, { "epoch": 0.17551963048498845, "grad_norm": 4.033135890960693, "learning_rate": 2.1593533487297922e-05, "loss": 0.6186, "step": 380 }, { "epoch": 0.18013856812933027, "grad_norm": 3.903007745742798, "learning_rate": 2.217090069284065e-05, "loss": 0.6041, "step": 390 }, { "epoch": 0.18475750577367206, "grad_norm": 4.785562992095947, "learning_rate": 2.2748267898383374e-05, "loss": 0.5527, "step": 400 }, { "epoch": 0.18937644341801385, "grad_norm": 3.4289231300354004, "learning_rate": 2.3325635103926096e-05, "loss": 0.5936, "step": 410 }, { "epoch": 0.19399538106235567, "grad_norm": 2.384840965270996, "learning_rate": 2.3903002309468824e-05, "loss": 0.5421, "step": 420 }, { "epoch": 0.19861431870669746, "grad_norm": 4.025755882263184, "learning_rate": 2.448036951501155e-05, "loss": 0.5839, "step": 430 }, { "epoch": 0.20323325635103925, "grad_norm": 4.832013130187988, "learning_rate": 2.5057736720554276e-05, "loss": 0.5938, "step": 440 }, { "epoch": 0.20785219399538107, "grad_norm": 3.66886305809021, "learning_rate": 2.5635103926096998e-05, "loss": 0.5607, "step": 450 }, { "epoch": 0.21247113163972287, "grad_norm": 3.7285852432250977, "learning_rate": 2.6212471131639726e-05, "loss": 0.5457, "step": 460 }, { "epoch": 0.21709006928406466, "grad_norm": 3.755711555480957, "learning_rate": 2.678983833718245e-05, "loss": 0.5721, "step": 470 }, { "epoch": 0.22170900692840648, "grad_norm": 4.016116619110107, "learning_rate": 2.7367205542725178e-05, "loss": 0.59, "step": 480 }, { "epoch": 0.22632794457274827, "grad_norm": 6.123377799987793, "learning_rate": 2.79445727482679e-05, "loss": 0.6236, "step": 490 }, { "epoch": 0.23094688221709006, "grad_norm": 3.77093505859375, "learning_rate": 2.8521939953810624e-05, "loss": 0.6306, "step": 500 }, { "epoch": 0.23556581986143188, "grad_norm": 5.101199626922607, "learning_rate": 2.9099307159353352e-05, "loss": 0.5806, "step": 510 }, { "epoch": 0.24018475750577367, "grad_norm": 4.425400257110596, "learning_rate": 2.9676674364896073e-05, "loss": 0.5644, "step": 520 }, { "epoch": 0.24480369515011546, "grad_norm": 3.661381244659424, "learning_rate": 3.02540415704388e-05, "loss": 0.538, "step": 530 }, { "epoch": 0.24942263279445728, "grad_norm": 3.271655559539795, "learning_rate": 3.0831408775981526e-05, "loss": 0.5927, "step": 540 }, { "epoch": 0.2540415704387991, "grad_norm": 4.603795051574707, "learning_rate": 3.140877598152425e-05, "loss": 0.5772, "step": 550 }, { "epoch": 0.2586605080831409, "grad_norm": 3.379786968231201, "learning_rate": 3.1986143187066975e-05, "loss": 0.5577, "step": 560 }, { "epoch": 0.2632794457274827, "grad_norm": 3.3702409267425537, "learning_rate": 3.25635103926097e-05, "loss": 0.6258, "step": 570 }, { "epoch": 0.2678983833718245, "grad_norm": 3.1498706340789795, "learning_rate": 3.3140877598152424e-05, "loss": 0.5631, "step": 580 }, { "epoch": 0.27251732101616627, "grad_norm": 4.846761703491211, "learning_rate": 3.3718244803695156e-05, "loss": 0.5944, "step": 590 }, { "epoch": 0.27713625866050806, "grad_norm": 4.397818088531494, "learning_rate": 3.4295612009237874e-05, "loss": 0.5698, "step": 600 }, { "epoch": 0.2817551963048499, "grad_norm": 2.2724201679229736, "learning_rate": 3.4872979214780605e-05, "loss": 0.5468, "step": 610 }, { "epoch": 0.2863741339491917, "grad_norm": 4.73584508895874, "learning_rate": 3.545034642032333e-05, "loss": 0.5697, "step": 620 }, { "epoch": 0.2909930715935335, "grad_norm": 3.557711124420166, "learning_rate": 3.6027713625866054e-05, "loss": 0.5195, "step": 630 }, { "epoch": 0.2956120092378753, "grad_norm": 4.273311614990234, "learning_rate": 3.660508083140878e-05, "loss": 0.5561, "step": 640 }, { "epoch": 0.3002309468822171, "grad_norm": 3.6489686965942383, "learning_rate": 3.7182448036951504e-05, "loss": 0.566, "step": 650 }, { "epoch": 0.30484988452655887, "grad_norm": 3.4011149406433105, "learning_rate": 3.775981524249423e-05, "loss": 0.5468, "step": 660 }, { "epoch": 0.3094688221709007, "grad_norm": 3.517822742462158, "learning_rate": 3.833718244803695e-05, "loss": 0.5834, "step": 670 }, { "epoch": 0.3140877598152425, "grad_norm": 2.7577602863311768, "learning_rate": 3.891454965357968e-05, "loss": 0.5489, "step": 680 }, { "epoch": 0.3187066974595843, "grad_norm": 2.856598138809204, "learning_rate": 3.94919168591224e-05, "loss": 0.4906, "step": 690 }, { "epoch": 0.3233256351039261, "grad_norm": 4.136464595794678, "learning_rate": 4.0069284064665133e-05, "loss": 0.5705, "step": 700 }, { "epoch": 0.3279445727482679, "grad_norm": 4.121008396148682, "learning_rate": 4.064665127020785e-05, "loss": 0.6241, "step": 710 }, { "epoch": 0.3325635103926097, "grad_norm": 3.611814498901367, "learning_rate": 4.122401847575058e-05, "loss": 0.5034, "step": 720 }, { "epoch": 0.3371824480369515, "grad_norm": 3.4624834060668945, "learning_rate": 4.18013856812933e-05, "loss": 0.5643, "step": 730 }, { "epoch": 0.3418013856812933, "grad_norm": 3.3586976528167725, "learning_rate": 4.237875288683603e-05, "loss": 0.5439, "step": 740 }, { "epoch": 0.3464203233256351, "grad_norm": 2.5129313468933105, "learning_rate": 4.2956120092378757e-05, "loss": 0.5062, "step": 750 }, { "epoch": 0.3510392609699769, "grad_norm": 3.0052690505981445, "learning_rate": 4.353348729792148e-05, "loss": 0.5219, "step": 760 }, { "epoch": 0.3556581986143187, "grad_norm": 3.7070388793945312, "learning_rate": 4.4110854503464206e-05, "loss": 0.5425, "step": 770 }, { "epoch": 0.36027713625866054, "grad_norm": 2.8378756046295166, "learning_rate": 4.468822170900693e-05, "loss": 0.5208, "step": 780 }, { "epoch": 0.3648960739030023, "grad_norm": 2.3988196849823, "learning_rate": 4.5265588914549655e-05, "loss": 0.5149, "step": 790 }, { "epoch": 0.3695150115473441, "grad_norm": 2.2305569648742676, "learning_rate": 4.584295612009238e-05, "loss": 0.5028, "step": 800 }, { "epoch": 0.3741339491916859, "grad_norm": 3.0817391872406006, "learning_rate": 4.6420323325635104e-05, "loss": 0.5419, "step": 810 }, { "epoch": 0.3787528868360277, "grad_norm": 2.767381429672241, "learning_rate": 4.699769053117783e-05, "loss": 0.5036, "step": 820 }, { "epoch": 0.3833718244803695, "grad_norm": 2.8563129901885986, "learning_rate": 4.757505773672056e-05, "loss": 0.5079, "step": 830 }, { "epoch": 0.38799076212471134, "grad_norm": 4.459218978881836, "learning_rate": 4.815242494226328e-05, "loss": 0.4891, "step": 840 }, { "epoch": 0.39260969976905313, "grad_norm": 2.825631856918335, "learning_rate": 4.872979214780601e-05, "loss": 0.4921, "step": 850 }, { "epoch": 0.3972286374133949, "grad_norm": 2.835643768310547, "learning_rate": 4.9307159353348734e-05, "loss": 0.4736, "step": 860 }, { "epoch": 0.4018475750577367, "grad_norm": 3.237922430038452, "learning_rate": 4.988452655889146e-05, "loss": 0.5254, "step": 870 }, { "epoch": 0.4064665127020785, "grad_norm": 3.913771390914917, "learning_rate": 4.9999870022388165e-05, "loss": 0.5212, "step": 880 }, { "epoch": 0.4110854503464203, "grad_norm": 4.357654094696045, "learning_rate": 4.999934199065641e-05, "loss": 0.6004, "step": 890 }, { "epoch": 0.41570438799076215, "grad_norm": 2.1519246101379395, "learning_rate": 4.999840778977644e-05, "loss": 0.5106, "step": 900 }, { "epoch": 0.42032332563510394, "grad_norm": 3.196580410003662, "learning_rate": 4.9997067434926386e-05, "loss": 0.4708, "step": 910 }, { "epoch": 0.42494226327944573, "grad_norm": 3.0933518409729004, "learning_rate": 4.9995320947883265e-05, "loss": 0.5228, "step": 920 }, { "epoch": 0.4295612009237875, "grad_norm": 2.5883092880249023, "learning_rate": 4.999316835702259e-05, "loss": 0.54, "step": 930 }, { "epoch": 0.4341801385681293, "grad_norm": 2.738126277923584, "learning_rate": 4.9990609697317916e-05, "loss": 0.4965, "step": 940 }, { "epoch": 0.4387990762124711, "grad_norm": 2.903364419937134, "learning_rate": 4.998764501034028e-05, "loss": 0.4732, "step": 950 }, { "epoch": 0.44341801385681295, "grad_norm": 1.895765781402588, "learning_rate": 4.998427434425753e-05, "loss": 0.4857, "step": 960 }, { "epoch": 0.44803695150115475, "grad_norm": 5.765718936920166, "learning_rate": 4.998049775383353e-05, "loss": 0.4901, "step": 970 }, { "epoch": 0.45265588914549654, "grad_norm": 3.9257349967956543, "learning_rate": 4.997631530042727e-05, "loss": 0.493, "step": 980 }, { "epoch": 0.45727482678983833, "grad_norm": 2.3068912029266357, "learning_rate": 4.997172705199189e-05, "loss": 0.5061, "step": 990 }, { "epoch": 0.4618937644341801, "grad_norm": 2.966820001602173, "learning_rate": 4.996673308307355e-05, "loss": 0.4557, "step": 1000 }, { "epoch": 0.4665127020785219, "grad_norm": 3.704120397567749, "learning_rate": 4.996133347481021e-05, "loss": 0.5302, "step": 1010 }, { "epoch": 0.47113163972286376, "grad_norm": 2.066206216812134, "learning_rate": 4.9955528314930376e-05, "loss": 0.5035, "step": 1020 }, { "epoch": 0.47575057736720555, "grad_norm": 3.7750158309936523, "learning_rate": 4.9949317697751596e-05, "loss": 0.5327, "step": 1030 }, { "epoch": 0.48036951501154734, "grad_norm": 2.5127952098846436, "learning_rate": 4.9942701724178965e-05, "loss": 0.4929, "step": 1040 }, { "epoch": 0.48498845265588914, "grad_norm": 2.594442367553711, "learning_rate": 4.9935680501703485e-05, "loss": 0.4795, "step": 1050 }, { "epoch": 0.4896073903002309, "grad_norm": 4.830276966094971, "learning_rate": 4.992825414440032e-05, "loss": 0.5268, "step": 1060 }, { "epoch": 0.4942263279445728, "grad_norm": 2.404269218444824, "learning_rate": 4.9920422772926933e-05, "loss": 0.5199, "step": 1070 }, { "epoch": 0.49884526558891457, "grad_norm": 2.622856378555298, "learning_rate": 4.991218651452114e-05, "loss": 0.4928, "step": 1080 }, { "epoch": 0.5034642032332564, "grad_norm": 2.660313844680786, "learning_rate": 4.9903545502999014e-05, "loss": 0.516, "step": 1090 }, { "epoch": 0.5080831408775982, "grad_norm": 2.2060108184814453, "learning_rate": 4.9894499878752744e-05, "loss": 0.474, "step": 1100 }, { "epoch": 0.5127020785219399, "grad_norm": 2.774624824523926, "learning_rate": 4.988504978874835e-05, "loss": 0.5053, "step": 1110 }, { "epoch": 0.5173210161662818, "grad_norm": 2.0086095333099365, "learning_rate": 4.987519538652326e-05, "loss": 0.4478, "step": 1120 }, { "epoch": 0.5219399538106235, "grad_norm": 2.7462918758392334, "learning_rate": 4.986493683218386e-05, "loss": 0.4872, "step": 1130 }, { "epoch": 0.5265588914549654, "grad_norm": 2.2208642959594727, "learning_rate": 4.985427429240286e-05, "loss": 0.4617, "step": 1140 }, { "epoch": 0.5311778290993071, "grad_norm": 3.3714964389801025, "learning_rate": 4.984320794041662e-05, "loss": 0.5293, "step": 1150 }, { "epoch": 0.535796766743649, "grad_norm": 2.732804298400879, "learning_rate": 4.98317379560223e-05, "loss": 0.4582, "step": 1160 }, { "epoch": 0.5404157043879908, "grad_norm": 3.4411492347717285, "learning_rate": 4.981986452557495e-05, "loss": 0.5359, "step": 1170 }, { "epoch": 0.5450346420323325, "grad_norm": 2.5346171855926514, "learning_rate": 4.9807587841984494e-05, "loss": 0.438, "step": 1180 }, { "epoch": 0.5496535796766744, "grad_norm": 2.068873882293701, "learning_rate": 4.9794908104712586e-05, "loss": 0.4541, "step": 1190 }, { "epoch": 0.5542725173210161, "grad_norm": 3.7652342319488525, "learning_rate": 4.978182551976939e-05, "loss": 0.5372, "step": 1200 }, { "epoch": 0.558891454965358, "grad_norm": 2.7133548259735107, "learning_rate": 4.976834029971017e-05, "loss": 0.4949, "step": 1210 }, { "epoch": 0.5635103926096998, "grad_norm": 2.7093558311462402, "learning_rate": 4.975445266363191e-05, "loss": 0.4484, "step": 1220 }, { "epoch": 0.5681293302540416, "grad_norm": 2.551117420196533, "learning_rate": 4.974016283716974e-05, "loss": 0.4865, "step": 1230 }, { "epoch": 0.5727482678983834, "grad_norm": 3.5680108070373535, "learning_rate": 4.9725471052493225e-05, "loss": 0.512, "step": 1240 }, { "epoch": 0.5773672055427251, "grad_norm": 3.1270763874053955, "learning_rate": 4.9710377548302636e-05, "loss": 0.4658, "step": 1250 }, { "epoch": 0.581986143187067, "grad_norm": 2.7450294494628906, "learning_rate": 4.9694882569825045e-05, "loss": 0.5726, "step": 1260 }, { "epoch": 0.5866050808314087, "grad_norm": 2.192270040512085, "learning_rate": 4.967898636881039e-05, "loss": 0.456, "step": 1270 }, { "epoch": 0.5912240184757506, "grad_norm": 3.5508313179016113, "learning_rate": 4.9662689203527304e-05, "loss": 0.4363, "step": 1280 }, { "epoch": 0.5958429561200924, "grad_norm": 3.789588212966919, "learning_rate": 4.964599133875899e-05, "loss": 0.3894, "step": 1290 }, { "epoch": 0.6004618937644342, "grad_norm": 2.185136556625366, "learning_rate": 4.9628893045798905e-05, "loss": 0.4388, "step": 1300 }, { "epoch": 0.605080831408776, "grad_norm": 3.9445693492889404, "learning_rate": 4.961139460244631e-05, "loss": 0.4761, "step": 1310 }, { "epoch": 0.6096997690531177, "grad_norm": 2.027426242828369, "learning_rate": 4.95934962930018e-05, "loss": 0.4726, "step": 1320 }, { "epoch": 0.6143187066974596, "grad_norm": 2.555227518081665, "learning_rate": 4.957519840826268e-05, "loss": 0.4546, "step": 1330 }, { "epoch": 0.6189376443418014, "grad_norm": 3.800353765487671, "learning_rate": 4.955650124551823e-05, "loss": 0.4657, "step": 1340 }, { "epoch": 0.6235565819861432, "grad_norm": 1.877545952796936, "learning_rate": 4.953740510854485e-05, "loss": 0.46, "step": 1350 }, { "epoch": 0.628175519630485, "grad_norm": 2.205172538757324, "learning_rate": 4.9517910307601204e-05, "loss": 0.4845, "step": 1360 }, { "epoch": 0.6327944572748267, "grad_norm": 4.207221031188965, "learning_rate": 4.949801715942306e-05, "loss": 0.4784, "step": 1370 }, { "epoch": 0.6374133949191686, "grad_norm": 2.3010756969451904, "learning_rate": 4.947772598721828e-05, "loss": 0.4679, "step": 1380 }, { "epoch": 0.6420323325635104, "grad_norm": 1.776963233947754, "learning_rate": 4.9457037120661455e-05, "loss": 0.4405, "step": 1390 }, { "epoch": 0.6466512702078522, "grad_norm": 2.8734822273254395, "learning_rate": 4.9435950895888604e-05, "loss": 0.474, "step": 1400 }, { "epoch": 0.651270207852194, "grad_norm": 3.049509286880493, "learning_rate": 4.9414467655491695e-05, "loss": 0.4334, "step": 1410 }, { "epoch": 0.6558891454965358, "grad_norm": 3.274083375930786, "learning_rate": 4.9392587748513105e-05, "loss": 0.4519, "step": 1420 }, { "epoch": 0.6605080831408776, "grad_norm": 2.149959087371826, "learning_rate": 4.937031153043991e-05, "loss": 0.4625, "step": 1430 }, { "epoch": 0.6651270207852193, "grad_norm": 2.3562982082366943, "learning_rate": 4.934763936319814e-05, "loss": 0.4487, "step": 1440 }, { "epoch": 0.6697459584295612, "grad_norm": 2.5950613021850586, "learning_rate": 4.932457161514689e-05, "loss": 0.4979, "step": 1450 }, { "epoch": 0.674364896073903, "grad_norm": 2.4045233726501465, "learning_rate": 4.9301108661072315e-05, "loss": 0.4442, "step": 1460 }, { "epoch": 0.6789838337182448, "grad_norm": 2.833594799041748, "learning_rate": 4.9277250882181575e-05, "loss": 0.4408, "step": 1470 }, { "epoch": 0.6836027713625866, "grad_norm": 1.7960981130599976, "learning_rate": 4.9252998666096625e-05, "loss": 0.4672, "step": 1480 }, { "epoch": 0.6882217090069284, "grad_norm": 2.261974811553955, "learning_rate": 4.922835240684792e-05, "loss": 0.3947, "step": 1490 }, { "epoch": 0.6928406466512702, "grad_norm": 2.966799020767212, "learning_rate": 4.9203312504867994e-05, "loss": 0.4546, "step": 1500 }, { "epoch": 0.6974595842956121, "grad_norm": 2.1086158752441406, "learning_rate": 4.9177879366984985e-05, "loss": 0.4226, "step": 1510 }, { "epoch": 0.7020785219399538, "grad_norm": 2.4200305938720703, "learning_rate": 4.915205340641601e-05, "loss": 0.4481, "step": 1520 }, { "epoch": 0.7066974595842956, "grad_norm": 2.486839771270752, "learning_rate": 4.912583504276045e-05, "loss": 0.4594, "step": 1530 }, { "epoch": 0.7113163972286374, "grad_norm": 1.7007755041122437, "learning_rate": 4.9099224701993115e-05, "loss": 0.4472, "step": 1540 }, { "epoch": 0.7159353348729792, "grad_norm": 1.6047892570495605, "learning_rate": 4.907222281645739e-05, "loss": 0.4734, "step": 1550 }, { "epoch": 0.7205542725173211, "grad_norm": 3.141820192337036, "learning_rate": 4.904482982485813e-05, "loss": 0.4777, "step": 1560 }, { "epoch": 0.7251732101616628, "grad_norm": 2.6380691528320312, "learning_rate": 4.901704617225455e-05, "loss": 0.4438, "step": 1570 }, { "epoch": 0.7297921478060047, "grad_norm": 2.238063335418701, "learning_rate": 4.898887231005306e-05, "loss": 0.4424, "step": 1580 }, { "epoch": 0.7344110854503464, "grad_norm": 2.1911888122558594, "learning_rate": 4.896030869599983e-05, "loss": 0.437, "step": 1590 }, { "epoch": 0.7390300230946882, "grad_norm": 2.0892059803009033, "learning_rate": 4.893135579417345e-05, "loss": 0.4341, "step": 1600 }, { "epoch": 0.74364896073903, "grad_norm": 1.727277398109436, "learning_rate": 4.8902014074977295e-05, "loss": 0.4428, "step": 1610 }, { "epoch": 0.7482678983833718, "grad_norm": 2.6759467124938965, "learning_rate": 4.8872284015131965e-05, "loss": 0.411, "step": 1620 }, { "epoch": 0.7528868360277137, "grad_norm": 3.616370439529419, "learning_rate": 4.8842166097667505e-05, "loss": 0.4376, "step": 1630 }, { "epoch": 0.7575057736720554, "grad_norm": 1.77035653591156, "learning_rate": 4.881166081191555e-05, "loss": 0.4217, "step": 1640 }, { "epoch": 0.7621247113163973, "grad_norm": 2.7078516483306885, "learning_rate": 4.878076865350136e-05, "loss": 0.4451, "step": 1650 }, { "epoch": 0.766743648960739, "grad_norm": 1.7627867460250854, "learning_rate": 4.874949012433584e-05, "loss": 0.4399, "step": 1660 }, { "epoch": 0.7713625866050808, "grad_norm": 1.7273634672164917, "learning_rate": 4.871782573260729e-05, "loss": 0.4202, "step": 1670 }, { "epoch": 0.7759815242494227, "grad_norm": 2.0963988304138184, "learning_rate": 4.868577599277322e-05, "loss": 0.4447, "step": 1680 }, { "epoch": 0.7806004618937644, "grad_norm": 2.144355297088623, "learning_rate": 4.865334142555196e-05, "loss": 0.4387, "step": 1690 }, { "epoch": 0.7852193995381063, "grad_norm": 2.0202884674072266, "learning_rate": 4.862052255791419e-05, "loss": 0.4124, "step": 1700 }, { "epoch": 0.789838337182448, "grad_norm": 1.6518304347991943, "learning_rate": 4.858731992307441e-05, "loss": 0.4222, "step": 1710 }, { "epoch": 0.7944572748267898, "grad_norm": 1.3399207592010498, "learning_rate": 4.855373406048226e-05, "loss": 0.4713, "step": 1720 }, { "epoch": 0.7990762124711316, "grad_norm": 1.7980890274047852, "learning_rate": 4.8519765515813744e-05, "loss": 0.4484, "step": 1730 }, { "epoch": 0.8036951501154734, "grad_norm": 3.4402995109558105, "learning_rate": 4.8485414840962384e-05, "loss": 0.3777, "step": 1740 }, { "epoch": 0.8083140877598153, "grad_norm": 1.9399069547653198, "learning_rate": 4.845068259403024e-05, "loss": 0.3942, "step": 1750 }, { "epoch": 0.812933025404157, "grad_norm": 1.8960610628128052, "learning_rate": 4.841556933931886e-05, "loss": 0.3738, "step": 1760 }, { "epoch": 0.8175519630484989, "grad_norm": 2.6203131675720215, "learning_rate": 4.838007564732008e-05, "loss": 0.4725, "step": 1770 }, { "epoch": 0.8221709006928406, "grad_norm": 2.441143274307251, "learning_rate": 4.834420209470679e-05, "loss": 0.4287, "step": 1780 }, { "epoch": 0.8267898383371824, "grad_norm": 3.31255841255188, "learning_rate": 4.830794926432355e-05, "loss": 0.4215, "step": 1790 }, { "epoch": 0.8314087759815243, "grad_norm": 2.7491471767425537, "learning_rate": 4.827131774517709e-05, "loss": 0.4301, "step": 1800 }, { "epoch": 0.836027713625866, "grad_norm": 1.9549751281738281, "learning_rate": 4.8234308132426807e-05, "loss": 0.4379, "step": 1810 }, { "epoch": 0.8406466512702079, "grad_norm": 2.4840948581695557, "learning_rate": 4.819692102737505e-05, "loss": 0.4474, "step": 1820 }, { "epoch": 0.8452655889145496, "grad_norm": 2.865260601043701, "learning_rate": 4.815915703745734e-05, "loss": 0.4608, "step": 1830 }, { "epoch": 0.8498845265588915, "grad_norm": 2.3620669841766357, "learning_rate": 4.812101677623254e-05, "loss": 0.4311, "step": 1840 }, { "epoch": 0.8545034642032333, "grad_norm": 3.4287946224212646, "learning_rate": 4.808250086337284e-05, "loss": 0.4035, "step": 1850 }, { "epoch": 0.859122401847575, "grad_norm": 1.6560109853744507, "learning_rate": 4.8043609924653745e-05, "loss": 0.4004, "step": 1860 }, { "epoch": 0.8637413394919169, "grad_norm": 2.310784339904785, "learning_rate": 4.800434459194386e-05, "loss": 0.4219, "step": 1870 }, { "epoch": 0.8683602771362586, "grad_norm": 1.660062313079834, "learning_rate": 4.796470550319465e-05, "loss": 0.4102, "step": 1880 }, { "epoch": 0.8729792147806005, "grad_norm": 2.578974723815918, "learning_rate": 4.792469330243007e-05, "loss": 0.4375, "step": 1890 }, { "epoch": 0.8775981524249422, "grad_norm": 2.095388412475586, "learning_rate": 4.788430863973607e-05, "loss": 0.4385, "step": 1900 }, { "epoch": 0.8822170900692841, "grad_norm": 2.4962880611419678, "learning_rate": 4.7843552171250085e-05, "loss": 0.4399, "step": 1910 }, { "epoch": 0.8868360277136259, "grad_norm": 2.8000426292419434, "learning_rate": 4.780242455915036e-05, "loss": 0.4344, "step": 1920 }, { "epoch": 0.8914549653579676, "grad_norm": 2.0987462997436523, "learning_rate": 4.776092647164516e-05, "loss": 0.3961, "step": 1930 }, { "epoch": 0.8960739030023095, "grad_norm": 3.531191825866699, "learning_rate": 4.771905858296195e-05, "loss": 0.4463, "step": 1940 }, { "epoch": 0.9006928406466512, "grad_norm": 2.8484129905700684, "learning_rate": 4.767682157333643e-05, "loss": 0.4993, "step": 1950 }, { "epoch": 0.9053117782909931, "grad_norm": 2.164492607116699, "learning_rate": 4.7634216129001453e-05, "loss": 0.4545, "step": 1960 }, { "epoch": 0.9099307159353349, "grad_norm": 2.37625789642334, "learning_rate": 4.7591242942175924e-05, "loss": 0.3965, "step": 1970 }, { "epoch": 0.9145496535796767, "grad_norm": 2.488903284072876, "learning_rate": 4.7547902711053535e-05, "loss": 0.4459, "step": 1980 }, { "epoch": 0.9191685912240185, "grad_norm": 3.028672933578491, "learning_rate": 4.7504196139791405e-05, "loss": 0.452, "step": 1990 }, { "epoch": 0.9237875288683602, "grad_norm": 2.084078073501587, "learning_rate": 4.746012393849866e-05, "loss": 0.4078, "step": 2000 }, { "epoch": 0.9284064665127021, "grad_norm": 2.061680555343628, "learning_rate": 4.741568682322488e-05, "loss": 0.3691, "step": 2010 }, { "epoch": 0.9330254041570438, "grad_norm": 2.4597408771514893, "learning_rate": 4.7370885515948486e-05, "loss": 0.3844, "step": 2020 }, { "epoch": 0.9376443418013857, "grad_norm": 2.8126583099365234, "learning_rate": 4.732572074456498e-05, "loss": 0.392, "step": 2030 }, { "epoch": 0.9422632794457275, "grad_norm": 2.3137059211730957, "learning_rate": 4.728019324287516e-05, "loss": 0.4237, "step": 2040 }, { "epoch": 0.9468822170900693, "grad_norm": 3.646803855895996, "learning_rate": 4.723430375057314e-05, "loss": 0.3857, "step": 2050 }, { "epoch": 0.9515011547344111, "grad_norm": 1.3006259202957153, "learning_rate": 4.718805301323439e-05, "loss": 0.4193, "step": 2060 }, { "epoch": 0.9561200923787528, "grad_norm": 3.648193359375, "learning_rate": 4.71414417823036e-05, "loss": 0.3896, "step": 2070 }, { "epoch": 0.9607390300230947, "grad_norm": 2.0217714309692383, "learning_rate": 4.7094470815082443e-05, "loss": 0.4315, "step": 2080 }, { "epoch": 0.9653579676674365, "grad_norm": 1.8971320390701294, "learning_rate": 4.704714087471733e-05, "loss": 0.4643, "step": 2090 }, { "epoch": 0.9699769053117783, "grad_norm": 2.0947530269622803, "learning_rate": 4.6999452730186966e-05, "loss": 0.4065, "step": 2100 }, { "epoch": 0.9745958429561201, "grad_norm": 2.21998929977417, "learning_rate": 4.6951407156289856e-05, "loss": 0.4102, "step": 2110 }, { "epoch": 0.9792147806004619, "grad_norm": 2.2499032020568848, "learning_rate": 4.6903004933631745e-05, "loss": 0.4339, "step": 2120 }, { "epoch": 0.9838337182448037, "grad_norm": 2.322390079498291, "learning_rate": 4.68542468486129e-05, "loss": 0.41, "step": 2130 }, { "epoch": 0.9884526558891455, "grad_norm": 2.2508604526519775, "learning_rate": 4.680513369341539e-05, "loss": 0.4142, "step": 2140 }, { "epoch": 0.9930715935334873, "grad_norm": 2.1183762550354004, "learning_rate": 4.675566626599014e-05, "loss": 0.4064, "step": 2150 }, { "epoch": 0.9976905311778291, "grad_norm": 1.6184558868408203, "learning_rate": 4.6705845370044015e-05, "loss": 0.4167, "step": 2160 }, { "epoch": 1.002309468822171, "grad_norm": 2.536665678024292, "learning_rate": 4.665567181502676e-05, "loss": 0.3415, "step": 2170 }, { "epoch": 1.0069284064665127, "grad_norm": 1.4247705936431885, "learning_rate": 4.6605146416117835e-05, "loss": 0.2922, "step": 2180 }, { "epoch": 1.0115473441108545, "grad_norm": 2.4373302459716797, "learning_rate": 4.655426999421317e-05, "loss": 0.346, "step": 2190 }, { "epoch": 1.0161662817551964, "grad_norm": 2.1059627532958984, "learning_rate": 4.650304337591185e-05, "loss": 0.3091, "step": 2200 }, { "epoch": 1.0207852193995381, "grad_norm": 1.6682378053665161, "learning_rate": 4.6451467393502644e-05, "loss": 0.286, "step": 2210 }, { "epoch": 1.0254041570438799, "grad_norm": 2.1200857162475586, "learning_rate": 4.639954288495054e-05, "loss": 0.3246, "step": 2220 }, { "epoch": 1.0300230946882216, "grad_norm": 2.503147602081299, "learning_rate": 4.634727069388306e-05, "loss": 0.3594, "step": 2230 }, { "epoch": 1.0346420323325636, "grad_norm": 1.5546462535858154, "learning_rate": 4.629465166957662e-05, "loss": 0.3154, "step": 2240 }, { "epoch": 1.0392609699769053, "grad_norm": 1.6011874675750732, "learning_rate": 4.6241686666942694e-05, "loss": 0.3309, "step": 2250 }, { "epoch": 1.043879907621247, "grad_norm": 1.319419026374817, "learning_rate": 4.618837654651393e-05, "loss": 0.3198, "step": 2260 }, { "epoch": 1.048498845265589, "grad_norm": 1.689467191696167, "learning_rate": 4.613472217443017e-05, "loss": 0.3203, "step": 2270 }, { "epoch": 1.0531177829099307, "grad_norm": 3.2122750282287598, "learning_rate": 4.608072442242439e-05, "loss": 0.3269, "step": 2280 }, { "epoch": 1.0577367205542725, "grad_norm": 1.627459168434143, "learning_rate": 4.602638416780851e-05, "loss": 0.3094, "step": 2290 }, { "epoch": 1.0623556581986142, "grad_norm": 2.7835917472839355, "learning_rate": 4.5971702293459163e-05, "loss": 0.3146, "step": 2300 }, { "epoch": 1.0669745958429562, "grad_norm": 1.5927238464355469, "learning_rate": 4.591667968780336e-05, "loss": 0.3058, "step": 2310 }, { "epoch": 1.071593533487298, "grad_norm": 1.8358134031295776, "learning_rate": 4.5861317244804015e-05, "loss": 0.3079, "step": 2320 }, { "epoch": 1.0762124711316396, "grad_norm": 1.3477319478988647, "learning_rate": 4.580561586394545e-05, "loss": 0.3355, "step": 2330 }, { "epoch": 1.0808314087759816, "grad_norm": 2.179304361343384, "learning_rate": 4.57495764502188e-05, "loss": 0.3301, "step": 2340 }, { "epoch": 1.0854503464203233, "grad_norm": 1.5659071207046509, "learning_rate": 4.569319991410725e-05, "loss": 0.3133, "step": 2350 }, { "epoch": 1.090069284064665, "grad_norm": 1.8552881479263306, "learning_rate": 4.563648717157131e-05, "loss": 0.3287, "step": 2360 }, { "epoch": 1.0946882217090068, "grad_norm": 2.5701863765716553, "learning_rate": 4.557943914403386e-05, "loss": 0.3219, "step": 2370 }, { "epoch": 1.0993071593533488, "grad_norm": 1.5083070993423462, "learning_rate": 4.552205675836527e-05, "loss": 0.317, "step": 2380 }, { "epoch": 1.1039260969976905, "grad_norm": 2.8584365844726562, "learning_rate": 4.5464340946868256e-05, "loss": 0.2864, "step": 2390 }, { "epoch": 1.1085450346420322, "grad_norm": 1.630210518836975, "learning_rate": 4.540629264726278e-05, "loss": 0.2998, "step": 2400 }, { "epoch": 1.1131639722863742, "grad_norm": 1.8594950437545776, "learning_rate": 4.53479128026708e-05, "loss": 0.3156, "step": 2410 }, { "epoch": 1.117782909930716, "grad_norm": 1.5817375183105469, "learning_rate": 4.528920236160096e-05, "loss": 0.3201, "step": 2420 }, { "epoch": 1.1224018475750577, "grad_norm": 2.3912880420684814, "learning_rate": 4.523016227793315e-05, "loss": 0.2986, "step": 2430 }, { "epoch": 1.1270207852193996, "grad_norm": 1.5563384294509888, "learning_rate": 4.5170793510903046e-05, "loss": 0.3021, "step": 2440 }, { "epoch": 1.1316397228637414, "grad_norm": 1.8241164684295654, "learning_rate": 4.511109702508648e-05, "loss": 0.3316, "step": 2450 }, { "epoch": 1.136258660508083, "grad_norm": 2.19366192817688, "learning_rate": 4.505107379038384e-05, "loss": 0.3255, "step": 2460 }, { "epoch": 1.140877598152425, "grad_norm": 1.471584439277649, "learning_rate": 4.499072478200421e-05, "loss": 0.3245, "step": 2470 }, { "epoch": 1.1454965357967668, "grad_norm": 1.6688241958618164, "learning_rate": 4.493005098044963e-05, "loss": 0.3336, "step": 2480 }, { "epoch": 1.1501154734411085, "grad_norm": 1.9700522422790527, "learning_rate": 4.486905337149909e-05, "loss": 0.3277, "step": 2490 }, { "epoch": 1.1547344110854503, "grad_norm": 2.3884382247924805, "learning_rate": 4.480773294619255e-05, "loss": 0.2978, "step": 2500 }, { "epoch": 1.1593533487297922, "grad_norm": 2.134876251220703, "learning_rate": 4.474609070081483e-05, "loss": 0.3168, "step": 2510 }, { "epoch": 1.163972286374134, "grad_norm": 1.8444334268569946, "learning_rate": 4.468412763687942e-05, "loss": 0.3228, "step": 2520 }, { "epoch": 1.1685912240184757, "grad_norm": 1.6720913648605347, "learning_rate": 4.4621844761112216e-05, "loss": 0.3174, "step": 2530 }, { "epoch": 1.1732101616628174, "grad_norm": 1.8607888221740723, "learning_rate": 4.4559243085435154e-05, "loss": 0.265, "step": 2540 }, { "epoch": 1.1778290993071594, "grad_norm": 1.9527697563171387, "learning_rate": 4.449632362694978e-05, "loss": 0.3143, "step": 2550 }, { "epoch": 1.1824480369515011, "grad_norm": 2.462707757949829, "learning_rate": 4.443308740792072e-05, "loss": 0.3007, "step": 2560 }, { "epoch": 1.1870669745958429, "grad_norm": 1.9817255735397339, "learning_rate": 4.436953545575904e-05, "loss": 0.3135, "step": 2570 }, { "epoch": 1.1916859122401848, "grad_norm": 2.517148971557617, "learning_rate": 4.430566880300563e-05, "loss": 0.3346, "step": 2580 }, { "epoch": 1.1963048498845266, "grad_norm": 2.951472282409668, "learning_rate": 4.4241488487314365e-05, "loss": 0.3111, "step": 2590 }, { "epoch": 1.2009237875288683, "grad_norm": 3.675570249557495, "learning_rate": 4.417699555143523e-05, "loss": 0.3082, "step": 2600 }, { "epoch": 1.2055427251732103, "grad_norm": 1.6169134378433228, "learning_rate": 4.411219104319746e-05, "loss": 0.3247, "step": 2610 }, { "epoch": 1.210161662817552, "grad_norm": 4.280526638031006, "learning_rate": 4.404707601549244e-05, "loss": 0.3309, "step": 2620 }, { "epoch": 1.2147806004618937, "grad_norm": 1.2955102920532227, "learning_rate": 4.398165152625663e-05, "loss": 0.2845, "step": 2630 }, { "epoch": 1.2193995381062355, "grad_norm": 2.1361026763916016, "learning_rate": 4.391591863845436e-05, "loss": 0.3143, "step": 2640 }, { "epoch": 1.2240184757505774, "grad_norm": 2.1552889347076416, "learning_rate": 4.384987842006059e-05, "loss": 0.3042, "step": 2650 }, { "epoch": 1.2286374133949192, "grad_norm": 2.199091672897339, "learning_rate": 4.378353194404352e-05, "loss": 0.3426, "step": 2660 }, { "epoch": 1.233256351039261, "grad_norm": 2.23543119430542, "learning_rate": 4.371688028834721e-05, "loss": 0.3047, "step": 2670 }, { "epoch": 1.2378752886836029, "grad_norm": 1.7668689489364624, "learning_rate": 4.3649924535873994e-05, "loss": 0.3269, "step": 2680 }, { "epoch": 1.2424942263279446, "grad_norm": 1.9710248708724976, "learning_rate": 4.3582665774466946e-05, "loss": 0.3252, "step": 2690 }, { "epoch": 1.2471131639722863, "grad_norm": 1.9266043901443481, "learning_rate": 4.351510509689218e-05, "loss": 0.2913, "step": 2700 }, { "epoch": 1.251732101616628, "grad_norm": 1.4974067211151123, "learning_rate": 4.34472436008211e-05, "loss": 0.3071, "step": 2710 }, { "epoch": 1.25635103926097, "grad_norm": 1.7189793586730957, "learning_rate": 4.337908238881258e-05, "loss": 0.321, "step": 2720 }, { "epoch": 1.2609699769053118, "grad_norm": 1.8770833015441895, "learning_rate": 4.331062256829501e-05, "loss": 0.3252, "step": 2730 }, { "epoch": 1.2655889145496535, "grad_norm": 2.516841411590576, "learning_rate": 4.324186525154835e-05, "loss": 0.2971, "step": 2740 }, { "epoch": 1.2702078521939955, "grad_norm": 1.8335446119308472, "learning_rate": 4.317281155568604e-05, "loss": 0.3137, "step": 2750 }, { "epoch": 1.2748267898383372, "grad_norm": 1.632881999015808, "learning_rate": 4.3103462602636845e-05, "loss": 0.3096, "step": 2760 }, { "epoch": 1.279445727482679, "grad_norm": 1.6128835678100586, "learning_rate": 4.303381951912661e-05, "loss": 0.3228, "step": 2770 }, { "epoch": 1.2840646651270209, "grad_norm": 2.278327703475952, "learning_rate": 4.296388343666001e-05, "loss": 0.2835, "step": 2780 }, { "epoch": 1.2886836027713626, "grad_norm": 1.4413753747940063, "learning_rate": 4.2893655491502105e-05, "loss": 0.2993, "step": 2790 }, { "epoch": 1.2933025404157044, "grad_norm": 2.016895055770874, "learning_rate": 4.282313682465992e-05, "loss": 0.2721, "step": 2800 }, { "epoch": 1.2979214780600463, "grad_norm": 1.687320590019226, "learning_rate": 4.275232858186388e-05, "loss": 0.2811, "step": 2810 }, { "epoch": 1.302540415704388, "grad_norm": 1.556947112083435, "learning_rate": 4.268123191354921e-05, "loss": 0.3164, "step": 2820 }, { "epoch": 1.3071593533487298, "grad_norm": 1.7516964673995972, "learning_rate": 4.260984797483725e-05, "loss": 0.2722, "step": 2830 }, { "epoch": 1.3117782909930715, "grad_norm": 1.999335765838623, "learning_rate": 4.2538177925516665e-05, "loss": 0.289, "step": 2840 }, { "epoch": 1.3163972286374133, "grad_norm": 2.1948471069335938, "learning_rate": 4.2466222930024616e-05, "loss": 0.2969, "step": 2850 }, { "epoch": 1.3210161662817552, "grad_norm": 3.0124406814575195, "learning_rate": 4.239398415742784e-05, "loss": 0.336, "step": 2860 }, { "epoch": 1.325635103926097, "grad_norm": 2.1753032207489014, "learning_rate": 4.232146278140366e-05, "loss": 0.3086, "step": 2870 }, { "epoch": 1.3302540415704387, "grad_norm": 1.9752156734466553, "learning_rate": 4.224865998022092e-05, "loss": 0.2998, "step": 2880 }, { "epoch": 1.3348729792147807, "grad_norm": 2.4908149242401123, "learning_rate": 4.2175576936720805e-05, "loss": 0.3096, "step": 2890 }, { "epoch": 1.3394919168591224, "grad_norm": 1.8155696392059326, "learning_rate": 4.2102214838297696e-05, "loss": 0.3046, "step": 2900 }, { "epoch": 1.3441108545034641, "grad_norm": 2.2324860095977783, "learning_rate": 4.202857487687977e-05, "loss": 0.3065, "step": 2910 }, { "epoch": 1.348729792147806, "grad_norm": 1.9542967081069946, "learning_rate": 4.1954658248909764e-05, "loss": 0.284, "step": 2920 }, { "epoch": 1.3533487297921478, "grad_norm": 1.6855888366699219, "learning_rate": 4.188046615532541e-05, "loss": 0.2928, "step": 2930 }, { "epoch": 1.3579676674364896, "grad_norm": 2.32239031791687, "learning_rate": 4.180599980154005e-05, "loss": 0.3303, "step": 2940 }, { "epoch": 1.3625866050808315, "grad_norm": 1.3265233039855957, "learning_rate": 4.173126039742292e-05, "loss": 0.2994, "step": 2950 }, { "epoch": 1.3672055427251733, "grad_norm": 1.8445942401885986, "learning_rate": 4.16562491572796e-05, "loss": 0.2883, "step": 2960 }, { "epoch": 1.371824480369515, "grad_norm": 1.4618576765060425, "learning_rate": 4.158096729983222e-05, "loss": 0.3107, "step": 2970 }, { "epoch": 1.376443418013857, "grad_norm": 1.7024414539337158, "learning_rate": 4.150541604819969e-05, "loss": 0.2596, "step": 2980 }, { "epoch": 1.3810623556581987, "grad_norm": 1.2643704414367676, "learning_rate": 4.142959662987783e-05, "loss": 0.311, "step": 2990 }, { "epoch": 1.3856812933025404, "grad_norm": 2.466792345046997, "learning_rate": 4.1353510276719386e-05, "loss": 0.2991, "step": 3000 }, { "epoch": 1.3903002309468822, "grad_norm": 2.7137906551361084, "learning_rate": 4.127715822491408e-05, "loss": 0.3034, "step": 3010 }, { "epoch": 1.394919168591224, "grad_norm": 1.9120469093322754, "learning_rate": 4.120054171496847e-05, "loss": 0.2926, "step": 3020 }, { "epoch": 1.3995381062355658, "grad_norm": 1.790726900100708, "learning_rate": 4.1123661991685826e-05, "loss": 0.2634, "step": 3030 }, { "epoch": 1.4041570438799076, "grad_norm": 1.786881446838379, "learning_rate": 4.1046520304145884e-05, "loss": 0.2932, "step": 3040 }, { "epoch": 1.4087759815242493, "grad_norm": 1.7593064308166504, "learning_rate": 4.096911790568459e-05, "loss": 0.3173, "step": 3050 }, { "epoch": 1.4133949191685913, "grad_norm": 2.030292510986328, "learning_rate": 4.089145605387368e-05, "loss": 0.3248, "step": 3060 }, { "epoch": 1.418013856812933, "grad_norm": 1.089842438697815, "learning_rate": 4.08135360105003e-05, "loss": 0.2823, "step": 3070 }, { "epoch": 1.4226327944572748, "grad_norm": 3.046901226043701, "learning_rate": 4.0735359041546476e-05, "loss": 0.287, "step": 3080 }, { "epoch": 1.4272517321016167, "grad_norm": 1.6824251413345337, "learning_rate": 4.065692641716855e-05, "loss": 0.2727, "step": 3090 }, { "epoch": 1.4318706697459584, "grad_norm": 2.4673869609832764, "learning_rate": 4.0578239411676556e-05, "loss": 0.3161, "step": 3100 }, { "epoch": 1.4364896073903002, "grad_norm": 1.6147607564926147, "learning_rate": 4.049929930351349e-05, "loss": 0.338, "step": 3110 }, { "epoch": 1.4411085450346421, "grad_norm": 1.293908953666687, "learning_rate": 4.04201073752346e-05, "loss": 0.2863, "step": 3120 }, { "epoch": 1.4457274826789839, "grad_norm": 2.096881866455078, "learning_rate": 4.034066491348645e-05, "loss": 0.3121, "step": 3130 }, { "epoch": 1.4503464203233256, "grad_norm": 1.5880385637283325, "learning_rate": 4.026097320898609e-05, "loss": 0.2995, "step": 3140 }, { "epoch": 1.4549653579676676, "grad_norm": 1.1587382555007935, "learning_rate": 4.0181033556500074e-05, "loss": 0.3035, "step": 3150 }, { "epoch": 1.4595842956120093, "grad_norm": 4.055675029754639, "learning_rate": 4.0100847254823414e-05, "loss": 0.3209, "step": 3160 }, { "epoch": 1.464203233256351, "grad_norm": 2.2304954528808594, "learning_rate": 4.0020415606758474e-05, "loss": 0.2716, "step": 3170 }, { "epoch": 1.4688221709006928, "grad_norm": 1.7061644792556763, "learning_rate": 3.99397399190938e-05, "loss": 0.297, "step": 3180 }, { "epoch": 1.4734411085450345, "grad_norm": 2.139970302581787, "learning_rate": 3.985882150258291e-05, "loss": 0.2978, "step": 3190 }, { "epoch": 1.4780600461893765, "grad_norm": 1.6300883293151855, "learning_rate": 3.9777661671922984e-05, "loss": 0.2923, "step": 3200 }, { "epoch": 1.4826789838337182, "grad_norm": 1.210604190826416, "learning_rate": 3.96962617457335e-05, "loss": 0.3015, "step": 3210 }, { "epoch": 1.48729792147806, "grad_norm": 8.268691062927246, "learning_rate": 3.96146230465348e-05, "loss": 0.3306, "step": 3220 }, { "epoch": 1.491916859122402, "grad_norm": 2.2136807441711426, "learning_rate": 3.9532746900726645e-05, "loss": 0.3133, "step": 3230 }, { "epoch": 1.4965357967667436, "grad_norm": 2.0368247032165527, "learning_rate": 3.9450634638566624e-05, "loss": 0.3387, "step": 3240 }, { "epoch": 1.5011547344110854, "grad_norm": 2.5188982486724854, "learning_rate": 3.9368287594148555e-05, "loss": 0.3095, "step": 3250 }, { "epoch": 1.5057736720554273, "grad_norm": 3.128319263458252, "learning_rate": 3.92857071053808e-05, "loss": 0.2646, "step": 3260 }, { "epoch": 1.510392609699769, "grad_norm": 2.266287088394165, "learning_rate": 3.920289451396455e-05, "loss": 0.2996, "step": 3270 }, { "epoch": 1.5150115473441108, "grad_norm": 2.842649221420288, "learning_rate": 3.9119851165372e-05, "loss": 0.2782, "step": 3280 }, { "epoch": 1.5196304849884528, "grad_norm": 2.6462066173553467, "learning_rate": 3.903657840882453e-05, "loss": 0.2967, "step": 3290 }, { "epoch": 1.5242494226327945, "grad_norm": 2.3835110664367676, "learning_rate": 3.89530775972707e-05, "loss": 0.2774, "step": 3300 }, { "epoch": 1.5288683602771362, "grad_norm": 3.416583776473999, "learning_rate": 3.886935008736439e-05, "loss": 0.2862, "step": 3310 }, { "epoch": 1.5334872979214782, "grad_norm": 2.1469576358795166, "learning_rate": 3.8785397239442636e-05, "loss": 0.2783, "step": 3320 }, { "epoch": 1.5381062355658197, "grad_norm": 2.151310682296753, "learning_rate": 3.870122041750363e-05, "loss": 0.3143, "step": 3330 }, { "epoch": 1.5427251732101617, "grad_norm": 1.654341220855713, "learning_rate": 3.861682098918447e-05, "loss": 0.2778, "step": 3340 }, { "epoch": 1.5473441108545036, "grad_norm": 2.1126046180725098, "learning_rate": 3.853220032573902e-05, "loss": 0.2945, "step": 3350 }, { "epoch": 1.5519630484988451, "grad_norm": 1.182999610900879, "learning_rate": 3.844735980201557e-05, "loss": 0.2757, "step": 3360 }, { "epoch": 1.556581986143187, "grad_norm": 1.8858295679092407, "learning_rate": 3.836230079643452e-05, "loss": 0.2963, "step": 3370 }, { "epoch": 1.5612009237875288, "grad_norm": 2.020688533782959, "learning_rate": 3.8277024690966034e-05, "loss": 0.2638, "step": 3380 }, { "epoch": 1.5658198614318706, "grad_norm": 3.3935155868530273, "learning_rate": 3.819153287110746e-05, "loss": 0.2949, "step": 3390 }, { "epoch": 1.5704387990762125, "grad_norm": 2.6277477741241455, "learning_rate": 3.8105826725860976e-05, "loss": 0.3307, "step": 3400 }, { "epoch": 1.5750577367205543, "grad_norm": 1.7885854244232178, "learning_rate": 3.801990764771089e-05, "loss": 0.2866, "step": 3410 }, { "epoch": 1.579676674364896, "grad_norm": 2.0295510292053223, "learning_rate": 3.793377703260112e-05, "loss": 0.3182, "step": 3420 }, { "epoch": 1.584295612009238, "grad_norm": 1.3024362325668335, "learning_rate": 3.784743627991243e-05, "loss": 0.2969, "step": 3430 }, { "epoch": 1.5889145496535797, "grad_norm": 1.4249627590179443, "learning_rate": 3.7760886792439724e-05, "loss": 0.2748, "step": 3440 }, { "epoch": 1.5935334872979214, "grad_norm": 1.7537922859191895, "learning_rate": 3.767412997636929e-05, "loss": 0.28, "step": 3450 }, { "epoch": 1.5981524249422634, "grad_norm": 1.3318328857421875, "learning_rate": 3.758716724125592e-05, "loss": 0.2595, "step": 3460 }, { "epoch": 1.6027713625866051, "grad_norm": 2.035454273223877, "learning_rate": 3.7500000000000003e-05, "loss": 0.3124, "step": 3470 }, { "epoch": 1.6073903002309469, "grad_norm": 1.43570876121521, "learning_rate": 3.7412629668824575e-05, "loss": 0.297, "step": 3480 }, { "epoch": 1.6120092378752888, "grad_norm": 2.0970613956451416, "learning_rate": 3.732505766725231e-05, "loss": 0.2931, "step": 3490 }, { "epoch": 1.6166281755196303, "grad_norm": 1.3278722763061523, "learning_rate": 3.723728541808247e-05, "loss": 0.2814, "step": 3500 } ], "logging_steps": 10, "max_steps": 8660, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.605425304646451e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }