{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.540415704387991, "eval_steps": 500, "global_step": 5500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004618937644341801, "grad_norm": 10.975909233093262, "learning_rate": 3.4642032332563515e-07, "loss": 2.4942, "step": 10 }, { "epoch": 0.009237875288683603, "grad_norm": 9.186948776245117, "learning_rate": 9.237875288683603e-07, "loss": 2.7582, "step": 20 }, { "epoch": 0.013856812933025405, "grad_norm": 10.907584190368652, "learning_rate": 1.443418013856813e-06, "loss": 2.6814, "step": 30 }, { "epoch": 0.018475750577367205, "grad_norm": 9.531168937683105, "learning_rate": 2.0207852193995383e-06, "loss": 2.6982, "step": 40 }, { "epoch": 0.023094688221709007, "grad_norm": 14.727725982666016, "learning_rate": 2.5981524249422633e-06, "loss": 2.0218, "step": 50 }, { "epoch": 0.02771362586605081, "grad_norm": 8.314309120178223, "learning_rate": 3.117782909930716e-06, "loss": 1.5595, "step": 60 }, { "epoch": 0.03233256351039261, "grad_norm": 4.944284915924072, "learning_rate": 3.6951501154734412e-06, "loss": 1.02, "step": 70 }, { "epoch": 0.03695150115473441, "grad_norm": 10.882843971252441, "learning_rate": 4.272517321016166e-06, "loss": 1.0419, "step": 80 }, { "epoch": 0.04157043879907621, "grad_norm": 12.310320854187012, "learning_rate": 4.849884526558892e-06, "loss": 1.0801, "step": 90 }, { "epoch": 0.046189376443418015, "grad_norm": 6.49992036819458, "learning_rate": 5.427251732101616e-06, "loss": 0.8444, "step": 100 }, { "epoch": 0.050808314087759814, "grad_norm": 2.89493465423584, "learning_rate": 6.004618937644342e-06, "loss": 0.8884, "step": 110 }, { "epoch": 0.05542725173210162, "grad_norm": 3.966763734817505, "learning_rate": 6.581986143187067e-06, "loss": 0.8672, "step": 120 }, { "epoch": 0.06004618937644342, "grad_norm": 4.442293167114258, "learning_rate": 7.159353348729793e-06, "loss": 0.8037, "step": 130 }, { "epoch": 0.06466512702078522, "grad_norm": 2.506918430328369, "learning_rate": 7.736720554272519e-06, "loss": 0.724, "step": 140 }, { "epoch": 0.06928406466512702, "grad_norm": 5.733686447143555, "learning_rate": 8.314087759815242e-06, "loss": 0.7692, "step": 150 }, { "epoch": 0.07390300230946882, "grad_norm": 4.161188125610352, "learning_rate": 8.891454965357968e-06, "loss": 0.7413, "step": 160 }, { "epoch": 0.07852193995381063, "grad_norm": 3.9434962272644043, "learning_rate": 9.468822170900693e-06, "loss": 0.7386, "step": 170 }, { "epoch": 0.08314087759815242, "grad_norm": 2.9100701808929443, "learning_rate": 1.0046189376443418e-05, "loss": 0.6942, "step": 180 }, { "epoch": 0.08775981524249422, "grad_norm": 5.367318153381348, "learning_rate": 1.0623556581986144e-05, "loss": 0.8011, "step": 190 }, { "epoch": 0.09237875288683603, "grad_norm": 3.1690614223480225, "learning_rate": 1.1200923787528869e-05, "loss": 0.6816, "step": 200 }, { "epoch": 0.09699769053117784, "grad_norm": 4.35976505279541, "learning_rate": 1.1778290993071595e-05, "loss": 0.7408, "step": 210 }, { "epoch": 0.10161662817551963, "grad_norm": 3.330937623977661, "learning_rate": 1.235565819861432e-05, "loss": 0.7159, "step": 220 }, { "epoch": 0.10623556581986143, "grad_norm": 5.761129379272461, "learning_rate": 1.2933025404157046e-05, "loss": 0.6838, "step": 230 }, { "epoch": 0.11085450346420324, "grad_norm": 7.05668830871582, "learning_rate": 1.351039260969977e-05, "loss": 0.7135, "step": 240 }, { "epoch": 0.11547344110854503, "grad_norm": 3.7135939598083496, "learning_rate": 1.4087759815242497e-05, "loss": 0.6385, "step": 250 }, { "epoch": 0.12009237875288684, "grad_norm": 5.477907657623291, "learning_rate": 1.4665127020785218e-05, "loss": 0.6292, "step": 260 }, { "epoch": 0.12471131639722864, "grad_norm": 6.577059268951416, "learning_rate": 1.5242494226327944e-05, "loss": 0.6921, "step": 270 }, { "epoch": 0.12933025404157045, "grad_norm": 3.6328892707824707, "learning_rate": 1.581986143187067e-05, "loss": 0.6621, "step": 280 }, { "epoch": 0.13394919168591224, "grad_norm": 4.084783554077148, "learning_rate": 1.6397228637413393e-05, "loss": 0.6667, "step": 290 }, { "epoch": 0.13856812933025403, "grad_norm": 3.8719701766967773, "learning_rate": 1.697459584295612e-05, "loss": 0.6692, "step": 300 }, { "epoch": 0.14318706697459585, "grad_norm": 7.860931873321533, "learning_rate": 1.7551963048498846e-05, "loss": 0.6251, "step": 310 }, { "epoch": 0.14780600461893764, "grad_norm": 4.381837368011475, "learning_rate": 1.812933025404157e-05, "loss": 0.6297, "step": 320 }, { "epoch": 0.15242494226327943, "grad_norm": 3.7145886421203613, "learning_rate": 1.8706697459584295e-05, "loss": 0.6483, "step": 330 }, { "epoch": 0.15704387990762125, "grad_norm": 2.609006643295288, "learning_rate": 1.9284064665127023e-05, "loss": 0.6149, "step": 340 }, { "epoch": 0.16166281755196305, "grad_norm": 4.774081230163574, "learning_rate": 1.9861431870669748e-05, "loss": 0.6034, "step": 350 }, { "epoch": 0.16628175519630484, "grad_norm": 7.305100440979004, "learning_rate": 2.0438799076212473e-05, "loss": 0.6496, "step": 360 }, { "epoch": 0.17090069284064666, "grad_norm": 5.507181644439697, "learning_rate": 2.1016166281755197e-05, "loss": 0.643, "step": 370 }, { "epoch": 0.17551963048498845, "grad_norm": 4.033135890960693, "learning_rate": 2.1593533487297922e-05, "loss": 0.6186, "step": 380 }, { "epoch": 0.18013856812933027, "grad_norm": 3.903007745742798, "learning_rate": 2.217090069284065e-05, "loss": 0.6041, "step": 390 }, { "epoch": 0.18475750577367206, "grad_norm": 4.785562992095947, "learning_rate": 2.2748267898383374e-05, "loss": 0.5527, "step": 400 }, { "epoch": 0.18937644341801385, "grad_norm": 3.4289231300354004, "learning_rate": 2.3325635103926096e-05, "loss": 0.5936, "step": 410 }, { "epoch": 0.19399538106235567, "grad_norm": 2.384840965270996, "learning_rate": 2.3903002309468824e-05, "loss": 0.5421, "step": 420 }, { "epoch": 0.19861431870669746, "grad_norm": 4.025755882263184, "learning_rate": 2.448036951501155e-05, "loss": 0.5839, "step": 430 }, { "epoch": 0.20323325635103925, "grad_norm": 4.832013130187988, "learning_rate": 2.5057736720554276e-05, "loss": 0.5938, "step": 440 }, { "epoch": 0.20785219399538107, "grad_norm": 3.66886305809021, "learning_rate": 2.5635103926096998e-05, "loss": 0.5607, "step": 450 }, { "epoch": 0.21247113163972287, "grad_norm": 3.7285852432250977, "learning_rate": 2.6212471131639726e-05, "loss": 0.5457, "step": 460 }, { "epoch": 0.21709006928406466, "grad_norm": 3.755711555480957, "learning_rate": 2.678983833718245e-05, "loss": 0.5721, "step": 470 }, { "epoch": 0.22170900692840648, "grad_norm": 4.016116619110107, "learning_rate": 2.7367205542725178e-05, "loss": 0.59, "step": 480 }, { "epoch": 0.22632794457274827, "grad_norm": 6.123377799987793, "learning_rate": 2.79445727482679e-05, "loss": 0.6236, "step": 490 }, { "epoch": 0.23094688221709006, "grad_norm": 3.77093505859375, "learning_rate": 2.8521939953810624e-05, "loss": 0.6306, "step": 500 }, { "epoch": 0.23556581986143188, "grad_norm": 5.101199626922607, "learning_rate": 2.9099307159353352e-05, "loss": 0.5806, "step": 510 }, { "epoch": 0.24018475750577367, "grad_norm": 4.425400257110596, "learning_rate": 2.9676674364896073e-05, "loss": 0.5644, "step": 520 }, { "epoch": 0.24480369515011546, "grad_norm": 3.661381244659424, "learning_rate": 3.02540415704388e-05, "loss": 0.538, "step": 530 }, { "epoch": 0.24942263279445728, "grad_norm": 3.271655559539795, "learning_rate": 3.0831408775981526e-05, "loss": 0.5927, "step": 540 }, { "epoch": 0.2540415704387991, "grad_norm": 4.603795051574707, "learning_rate": 3.140877598152425e-05, "loss": 0.5772, "step": 550 }, { "epoch": 0.2586605080831409, "grad_norm": 3.379786968231201, "learning_rate": 3.1986143187066975e-05, "loss": 0.5577, "step": 560 }, { "epoch": 0.2632794457274827, "grad_norm": 3.3702409267425537, "learning_rate": 3.25635103926097e-05, "loss": 0.6258, "step": 570 }, { "epoch": 0.2678983833718245, "grad_norm": 3.1498706340789795, "learning_rate": 3.3140877598152424e-05, "loss": 0.5631, "step": 580 }, { "epoch": 0.27251732101616627, "grad_norm": 4.846761703491211, "learning_rate": 3.3718244803695156e-05, "loss": 0.5944, "step": 590 }, { "epoch": 0.27713625866050806, "grad_norm": 4.397818088531494, "learning_rate": 3.4295612009237874e-05, "loss": 0.5698, "step": 600 }, { "epoch": 0.2817551963048499, "grad_norm": 2.2724201679229736, "learning_rate": 3.4872979214780605e-05, "loss": 0.5468, "step": 610 }, { "epoch": 0.2863741339491917, "grad_norm": 4.73584508895874, "learning_rate": 3.545034642032333e-05, "loss": 0.5697, "step": 620 }, { "epoch": 0.2909930715935335, "grad_norm": 3.557711124420166, "learning_rate": 3.6027713625866054e-05, "loss": 0.5195, "step": 630 }, { "epoch": 0.2956120092378753, "grad_norm": 4.273311614990234, "learning_rate": 3.660508083140878e-05, "loss": 0.5561, "step": 640 }, { "epoch": 0.3002309468822171, "grad_norm": 3.6489686965942383, "learning_rate": 3.7182448036951504e-05, "loss": 0.566, "step": 650 }, { "epoch": 0.30484988452655887, "grad_norm": 3.4011149406433105, "learning_rate": 3.775981524249423e-05, "loss": 0.5468, "step": 660 }, { "epoch": 0.3094688221709007, "grad_norm": 3.517822742462158, "learning_rate": 3.833718244803695e-05, "loss": 0.5834, "step": 670 }, { "epoch": 0.3140877598152425, "grad_norm": 2.7577602863311768, "learning_rate": 3.891454965357968e-05, "loss": 0.5489, "step": 680 }, { "epoch": 0.3187066974595843, "grad_norm": 2.856598138809204, "learning_rate": 3.94919168591224e-05, "loss": 0.4906, "step": 690 }, { "epoch": 0.3233256351039261, "grad_norm": 4.136464595794678, "learning_rate": 4.0069284064665133e-05, "loss": 0.5705, "step": 700 }, { "epoch": 0.3279445727482679, "grad_norm": 4.121008396148682, "learning_rate": 4.064665127020785e-05, "loss": 0.6241, "step": 710 }, { "epoch": 0.3325635103926097, "grad_norm": 3.611814498901367, "learning_rate": 4.122401847575058e-05, "loss": 0.5034, "step": 720 }, { "epoch": 0.3371824480369515, "grad_norm": 3.4624834060668945, "learning_rate": 4.18013856812933e-05, "loss": 0.5643, "step": 730 }, { "epoch": 0.3418013856812933, "grad_norm": 3.3586976528167725, "learning_rate": 4.237875288683603e-05, "loss": 0.5439, "step": 740 }, { "epoch": 0.3464203233256351, "grad_norm": 2.5129313468933105, "learning_rate": 4.2956120092378757e-05, "loss": 0.5062, "step": 750 }, { "epoch": 0.3510392609699769, "grad_norm": 3.0052690505981445, "learning_rate": 4.353348729792148e-05, "loss": 0.5219, "step": 760 }, { "epoch": 0.3556581986143187, "grad_norm": 3.7070388793945312, "learning_rate": 4.4110854503464206e-05, "loss": 0.5425, "step": 770 }, { "epoch": 0.36027713625866054, "grad_norm": 2.8378756046295166, "learning_rate": 4.468822170900693e-05, "loss": 0.5208, "step": 780 }, { "epoch": 0.3648960739030023, "grad_norm": 2.3988196849823, "learning_rate": 4.5265588914549655e-05, "loss": 0.5149, "step": 790 }, { "epoch": 0.3695150115473441, "grad_norm": 2.2305569648742676, "learning_rate": 4.584295612009238e-05, "loss": 0.5028, "step": 800 }, { "epoch": 0.3741339491916859, "grad_norm": 3.0817391872406006, "learning_rate": 4.6420323325635104e-05, "loss": 0.5419, "step": 810 }, { "epoch": 0.3787528868360277, "grad_norm": 2.767381429672241, "learning_rate": 4.699769053117783e-05, "loss": 0.5036, "step": 820 }, { "epoch": 0.3833718244803695, "grad_norm": 2.8563129901885986, "learning_rate": 4.757505773672056e-05, "loss": 0.5079, "step": 830 }, { "epoch": 0.38799076212471134, "grad_norm": 4.459218978881836, "learning_rate": 4.815242494226328e-05, "loss": 0.4891, "step": 840 }, { "epoch": 0.39260969976905313, "grad_norm": 2.825631856918335, "learning_rate": 4.872979214780601e-05, "loss": 0.4921, "step": 850 }, { "epoch": 0.3972286374133949, "grad_norm": 2.835643768310547, "learning_rate": 4.9307159353348734e-05, "loss": 0.4736, "step": 860 }, { "epoch": 0.4018475750577367, "grad_norm": 3.237922430038452, "learning_rate": 4.988452655889146e-05, "loss": 0.5254, "step": 870 }, { "epoch": 0.4064665127020785, "grad_norm": 3.913771390914917, "learning_rate": 4.9999870022388165e-05, "loss": 0.5212, "step": 880 }, { "epoch": 0.4110854503464203, "grad_norm": 4.357654094696045, "learning_rate": 4.999934199065641e-05, "loss": 0.6004, "step": 890 }, { "epoch": 0.41570438799076215, "grad_norm": 2.1519246101379395, "learning_rate": 4.999840778977644e-05, "loss": 0.5106, "step": 900 }, { "epoch": 0.42032332563510394, "grad_norm": 3.196580410003662, "learning_rate": 4.9997067434926386e-05, "loss": 0.4708, "step": 910 }, { "epoch": 0.42494226327944573, "grad_norm": 3.0933518409729004, "learning_rate": 4.9995320947883265e-05, "loss": 0.5228, "step": 920 }, { "epoch": 0.4295612009237875, "grad_norm": 2.5883092880249023, "learning_rate": 4.999316835702259e-05, "loss": 0.54, "step": 930 }, { "epoch": 0.4341801385681293, "grad_norm": 2.738126277923584, "learning_rate": 4.9990609697317916e-05, "loss": 0.4965, "step": 940 }, { "epoch": 0.4387990762124711, "grad_norm": 2.903364419937134, "learning_rate": 4.998764501034028e-05, "loss": 0.4732, "step": 950 }, { "epoch": 0.44341801385681295, "grad_norm": 1.895765781402588, "learning_rate": 4.998427434425753e-05, "loss": 0.4857, "step": 960 }, { "epoch": 0.44803695150115475, "grad_norm": 5.765718936920166, "learning_rate": 4.998049775383353e-05, "loss": 0.4901, "step": 970 }, { "epoch": 0.45265588914549654, "grad_norm": 3.9257349967956543, "learning_rate": 4.997631530042727e-05, "loss": 0.493, "step": 980 }, { "epoch": 0.45727482678983833, "grad_norm": 2.3068912029266357, "learning_rate": 4.997172705199189e-05, "loss": 0.5061, "step": 990 }, { "epoch": 0.4618937644341801, "grad_norm": 2.966820001602173, "learning_rate": 4.996673308307355e-05, "loss": 0.4557, "step": 1000 }, { "epoch": 0.4665127020785219, "grad_norm": 3.704120397567749, "learning_rate": 4.996133347481021e-05, "loss": 0.5302, "step": 1010 }, { "epoch": 0.47113163972286376, "grad_norm": 2.066206216812134, "learning_rate": 4.9955528314930376e-05, "loss": 0.5035, "step": 1020 }, { "epoch": 0.47575057736720555, "grad_norm": 3.7750158309936523, "learning_rate": 4.9949317697751596e-05, "loss": 0.5327, "step": 1030 }, { "epoch": 0.48036951501154734, "grad_norm": 2.5127952098846436, "learning_rate": 4.9942701724178965e-05, "loss": 0.4929, "step": 1040 }, { "epoch": 0.48498845265588914, "grad_norm": 2.594442367553711, "learning_rate": 4.9935680501703485e-05, "loss": 0.4795, "step": 1050 }, { "epoch": 0.4896073903002309, "grad_norm": 4.830276966094971, "learning_rate": 4.992825414440032e-05, "loss": 0.5268, "step": 1060 }, { "epoch": 0.4942263279445728, "grad_norm": 2.404269218444824, "learning_rate": 4.9920422772926933e-05, "loss": 0.5199, "step": 1070 }, { "epoch": 0.49884526558891457, "grad_norm": 2.622856378555298, "learning_rate": 4.991218651452114e-05, "loss": 0.4928, "step": 1080 }, { "epoch": 0.5034642032332564, "grad_norm": 2.660313844680786, "learning_rate": 4.9903545502999014e-05, "loss": 0.516, "step": 1090 }, { "epoch": 0.5080831408775982, "grad_norm": 2.2060108184814453, "learning_rate": 4.9894499878752744e-05, "loss": 0.474, "step": 1100 }, { "epoch": 0.5127020785219399, "grad_norm": 2.774624824523926, "learning_rate": 4.988504978874835e-05, "loss": 0.5053, "step": 1110 }, { "epoch": 0.5173210161662818, "grad_norm": 2.0086095333099365, "learning_rate": 4.987519538652326e-05, "loss": 0.4478, "step": 1120 }, { "epoch": 0.5219399538106235, "grad_norm": 2.7462918758392334, "learning_rate": 4.986493683218386e-05, "loss": 0.4872, "step": 1130 }, { "epoch": 0.5265588914549654, "grad_norm": 2.2208642959594727, "learning_rate": 4.985427429240286e-05, "loss": 0.4617, "step": 1140 }, { "epoch": 0.5311778290993071, "grad_norm": 3.3714964389801025, "learning_rate": 4.984320794041662e-05, "loss": 0.5293, "step": 1150 }, { "epoch": 0.535796766743649, "grad_norm": 2.732804298400879, "learning_rate": 4.98317379560223e-05, "loss": 0.4582, "step": 1160 }, { "epoch": 0.5404157043879908, "grad_norm": 3.4411492347717285, "learning_rate": 4.981986452557495e-05, "loss": 0.5359, "step": 1170 }, { "epoch": 0.5450346420323325, "grad_norm": 2.5346171855926514, "learning_rate": 4.9807587841984494e-05, "loss": 0.438, "step": 1180 }, { "epoch": 0.5496535796766744, "grad_norm": 2.068873882293701, "learning_rate": 4.9794908104712586e-05, "loss": 0.4541, "step": 1190 }, { "epoch": 0.5542725173210161, "grad_norm": 3.7652342319488525, "learning_rate": 4.978182551976939e-05, "loss": 0.5372, "step": 1200 }, { "epoch": 0.558891454965358, "grad_norm": 2.7133548259735107, "learning_rate": 4.976834029971017e-05, "loss": 0.4949, "step": 1210 }, { "epoch": 0.5635103926096998, "grad_norm": 2.7093558311462402, "learning_rate": 4.975445266363191e-05, "loss": 0.4484, "step": 1220 }, { "epoch": 0.5681293302540416, "grad_norm": 2.551117420196533, "learning_rate": 4.974016283716974e-05, "loss": 0.4865, "step": 1230 }, { "epoch": 0.5727482678983834, "grad_norm": 3.5680108070373535, "learning_rate": 4.9725471052493225e-05, "loss": 0.512, "step": 1240 }, { "epoch": 0.5773672055427251, "grad_norm": 3.1270763874053955, "learning_rate": 4.9710377548302636e-05, "loss": 0.4658, "step": 1250 }, { "epoch": 0.581986143187067, "grad_norm": 2.7450294494628906, "learning_rate": 4.9694882569825045e-05, "loss": 0.5726, "step": 1260 }, { "epoch": 0.5866050808314087, "grad_norm": 2.192270040512085, "learning_rate": 4.967898636881039e-05, "loss": 0.456, "step": 1270 }, { "epoch": 0.5912240184757506, "grad_norm": 3.5508313179016113, "learning_rate": 4.9662689203527304e-05, "loss": 0.4363, "step": 1280 }, { "epoch": 0.5958429561200924, "grad_norm": 3.789588212966919, "learning_rate": 4.964599133875899e-05, "loss": 0.3894, "step": 1290 }, { "epoch": 0.6004618937644342, "grad_norm": 2.185136556625366, "learning_rate": 4.9628893045798905e-05, "loss": 0.4388, "step": 1300 }, { "epoch": 0.605080831408776, "grad_norm": 3.9445693492889404, "learning_rate": 4.961139460244631e-05, "loss": 0.4761, "step": 1310 }, { "epoch": 0.6096997690531177, "grad_norm": 2.027426242828369, "learning_rate": 4.95934962930018e-05, "loss": 0.4726, "step": 1320 }, { "epoch": 0.6143187066974596, "grad_norm": 2.555227518081665, "learning_rate": 4.957519840826268e-05, "loss": 0.4546, "step": 1330 }, { "epoch": 0.6189376443418014, "grad_norm": 3.800353765487671, "learning_rate": 4.955650124551823e-05, "loss": 0.4657, "step": 1340 }, { "epoch": 0.6235565819861432, "grad_norm": 1.877545952796936, "learning_rate": 4.953740510854485e-05, "loss": 0.46, "step": 1350 }, { "epoch": 0.628175519630485, "grad_norm": 2.205172538757324, "learning_rate": 4.9517910307601204e-05, "loss": 0.4845, "step": 1360 }, { "epoch": 0.6327944572748267, "grad_norm": 4.207221031188965, "learning_rate": 4.949801715942306e-05, "loss": 0.4784, "step": 1370 }, { "epoch": 0.6374133949191686, "grad_norm": 2.3010756969451904, "learning_rate": 4.947772598721828e-05, "loss": 0.4679, "step": 1380 }, { "epoch": 0.6420323325635104, "grad_norm": 1.776963233947754, "learning_rate": 4.9457037120661455e-05, "loss": 0.4405, "step": 1390 }, { "epoch": 0.6466512702078522, "grad_norm": 2.8734822273254395, "learning_rate": 4.9435950895888604e-05, "loss": 0.474, "step": 1400 }, { "epoch": 0.651270207852194, "grad_norm": 3.049509286880493, "learning_rate": 4.9414467655491695e-05, "loss": 0.4334, "step": 1410 }, { "epoch": 0.6558891454965358, "grad_norm": 3.274083375930786, "learning_rate": 4.9392587748513105e-05, "loss": 0.4519, "step": 1420 }, { "epoch": 0.6605080831408776, "grad_norm": 2.149959087371826, "learning_rate": 4.937031153043991e-05, "loss": 0.4625, "step": 1430 }, { "epoch": 0.6651270207852193, "grad_norm": 2.3562982082366943, "learning_rate": 4.934763936319814e-05, "loss": 0.4487, "step": 1440 }, { "epoch": 0.6697459584295612, "grad_norm": 2.5950613021850586, "learning_rate": 4.932457161514689e-05, "loss": 0.4979, "step": 1450 }, { "epoch": 0.674364896073903, "grad_norm": 2.4045233726501465, "learning_rate": 4.9301108661072315e-05, "loss": 0.4442, "step": 1460 }, { "epoch": 0.6789838337182448, "grad_norm": 2.833594799041748, "learning_rate": 4.9277250882181575e-05, "loss": 0.4408, "step": 1470 }, { "epoch": 0.6836027713625866, "grad_norm": 1.7960981130599976, "learning_rate": 4.9252998666096625e-05, "loss": 0.4672, "step": 1480 }, { "epoch": 0.6882217090069284, "grad_norm": 2.261974811553955, "learning_rate": 4.922835240684792e-05, "loss": 0.3947, "step": 1490 }, { "epoch": 0.6928406466512702, "grad_norm": 2.966799020767212, "learning_rate": 4.9203312504867994e-05, "loss": 0.4546, "step": 1500 }, { "epoch": 0.6974595842956121, "grad_norm": 2.1086158752441406, "learning_rate": 4.9177879366984985e-05, "loss": 0.4226, "step": 1510 }, { "epoch": 0.7020785219399538, "grad_norm": 2.4200305938720703, "learning_rate": 4.915205340641601e-05, "loss": 0.4481, "step": 1520 }, { "epoch": 0.7066974595842956, "grad_norm": 2.486839771270752, "learning_rate": 4.912583504276045e-05, "loss": 0.4594, "step": 1530 }, { "epoch": 0.7113163972286374, "grad_norm": 1.7007755041122437, "learning_rate": 4.9099224701993115e-05, "loss": 0.4472, "step": 1540 }, { "epoch": 0.7159353348729792, "grad_norm": 1.6047892570495605, "learning_rate": 4.907222281645739e-05, "loss": 0.4734, "step": 1550 }, { "epoch": 0.7205542725173211, "grad_norm": 3.141820192337036, "learning_rate": 4.904482982485813e-05, "loss": 0.4777, "step": 1560 }, { "epoch": 0.7251732101616628, "grad_norm": 2.6380691528320312, "learning_rate": 4.901704617225455e-05, "loss": 0.4438, "step": 1570 }, { "epoch": 0.7297921478060047, "grad_norm": 2.238063335418701, "learning_rate": 4.898887231005306e-05, "loss": 0.4424, "step": 1580 }, { "epoch": 0.7344110854503464, "grad_norm": 2.1911888122558594, "learning_rate": 4.896030869599983e-05, "loss": 0.437, "step": 1590 }, { "epoch": 0.7390300230946882, "grad_norm": 2.0892059803009033, "learning_rate": 4.893135579417345e-05, "loss": 0.4341, "step": 1600 }, { "epoch": 0.74364896073903, "grad_norm": 1.727277398109436, "learning_rate": 4.8902014074977295e-05, "loss": 0.4428, "step": 1610 }, { "epoch": 0.7482678983833718, "grad_norm": 2.6759467124938965, "learning_rate": 4.8872284015131965e-05, "loss": 0.411, "step": 1620 }, { "epoch": 0.7528868360277137, "grad_norm": 3.616370439529419, "learning_rate": 4.8842166097667505e-05, "loss": 0.4376, "step": 1630 }, { "epoch": 0.7575057736720554, "grad_norm": 1.77035653591156, "learning_rate": 4.881166081191555e-05, "loss": 0.4217, "step": 1640 }, { "epoch": 0.7621247113163973, "grad_norm": 2.7078516483306885, "learning_rate": 4.878076865350136e-05, "loss": 0.4451, "step": 1650 }, { "epoch": 0.766743648960739, "grad_norm": 1.7627867460250854, "learning_rate": 4.874949012433584e-05, "loss": 0.4399, "step": 1660 }, { "epoch": 0.7713625866050808, "grad_norm": 1.7273634672164917, "learning_rate": 4.871782573260729e-05, "loss": 0.4202, "step": 1670 }, { "epoch": 0.7759815242494227, "grad_norm": 2.0963988304138184, "learning_rate": 4.868577599277322e-05, "loss": 0.4447, "step": 1680 }, { "epoch": 0.7806004618937644, "grad_norm": 2.144355297088623, "learning_rate": 4.865334142555196e-05, "loss": 0.4387, "step": 1690 }, { "epoch": 0.7852193995381063, "grad_norm": 2.0202884674072266, "learning_rate": 4.862052255791419e-05, "loss": 0.4124, "step": 1700 }, { "epoch": 0.789838337182448, "grad_norm": 1.6518304347991943, "learning_rate": 4.858731992307441e-05, "loss": 0.4222, "step": 1710 }, { "epoch": 0.7944572748267898, "grad_norm": 1.3399207592010498, "learning_rate": 4.855373406048226e-05, "loss": 0.4713, "step": 1720 }, { "epoch": 0.7990762124711316, "grad_norm": 1.7980890274047852, "learning_rate": 4.8519765515813744e-05, "loss": 0.4484, "step": 1730 }, { "epoch": 0.8036951501154734, "grad_norm": 3.4402995109558105, "learning_rate": 4.8485414840962384e-05, "loss": 0.3777, "step": 1740 }, { "epoch": 0.8083140877598153, "grad_norm": 1.9399069547653198, "learning_rate": 4.845068259403024e-05, "loss": 0.3942, "step": 1750 }, { "epoch": 0.812933025404157, "grad_norm": 1.8960610628128052, "learning_rate": 4.841556933931886e-05, "loss": 0.3738, "step": 1760 }, { "epoch": 0.8175519630484989, "grad_norm": 2.6203131675720215, "learning_rate": 4.838007564732008e-05, "loss": 0.4725, "step": 1770 }, { "epoch": 0.8221709006928406, "grad_norm": 2.441143274307251, "learning_rate": 4.834420209470679e-05, "loss": 0.4287, "step": 1780 }, { "epoch": 0.8267898383371824, "grad_norm": 3.31255841255188, "learning_rate": 4.830794926432355e-05, "loss": 0.4215, "step": 1790 }, { "epoch": 0.8314087759815243, "grad_norm": 2.7491471767425537, "learning_rate": 4.827131774517709e-05, "loss": 0.4301, "step": 1800 }, { "epoch": 0.836027713625866, "grad_norm": 1.9549751281738281, "learning_rate": 4.8234308132426807e-05, "loss": 0.4379, "step": 1810 }, { "epoch": 0.8406466512702079, "grad_norm": 2.4840948581695557, "learning_rate": 4.819692102737505e-05, "loss": 0.4474, "step": 1820 }, { "epoch": 0.8452655889145496, "grad_norm": 2.865260601043701, "learning_rate": 4.815915703745734e-05, "loss": 0.4608, "step": 1830 }, { "epoch": 0.8498845265588915, "grad_norm": 2.3620669841766357, "learning_rate": 4.812101677623254e-05, "loss": 0.4311, "step": 1840 }, { "epoch": 0.8545034642032333, "grad_norm": 3.4287946224212646, "learning_rate": 4.808250086337284e-05, "loss": 0.4035, "step": 1850 }, { "epoch": 0.859122401847575, "grad_norm": 1.6560109853744507, "learning_rate": 4.8043609924653745e-05, "loss": 0.4004, "step": 1860 }, { "epoch": 0.8637413394919169, "grad_norm": 2.310784339904785, "learning_rate": 4.800434459194386e-05, "loss": 0.4219, "step": 1870 }, { "epoch": 0.8683602771362586, "grad_norm": 1.660062313079834, "learning_rate": 4.796470550319465e-05, "loss": 0.4102, "step": 1880 }, { "epoch": 0.8729792147806005, "grad_norm": 2.578974723815918, "learning_rate": 4.792469330243007e-05, "loss": 0.4375, "step": 1890 }, { "epoch": 0.8775981524249422, "grad_norm": 2.095388412475586, "learning_rate": 4.788430863973607e-05, "loss": 0.4385, "step": 1900 }, { "epoch": 0.8822170900692841, "grad_norm": 2.4962880611419678, "learning_rate": 4.7843552171250085e-05, "loss": 0.4399, "step": 1910 }, { "epoch": 0.8868360277136259, "grad_norm": 2.8000426292419434, "learning_rate": 4.780242455915036e-05, "loss": 0.4344, "step": 1920 }, { "epoch": 0.8914549653579676, "grad_norm": 2.0987462997436523, "learning_rate": 4.776092647164516e-05, "loss": 0.3961, "step": 1930 }, { "epoch": 0.8960739030023095, "grad_norm": 3.531191825866699, "learning_rate": 4.771905858296195e-05, "loss": 0.4463, "step": 1940 }, { "epoch": 0.9006928406466512, "grad_norm": 2.8484129905700684, "learning_rate": 4.767682157333643e-05, "loss": 0.4993, "step": 1950 }, { "epoch": 0.9053117782909931, "grad_norm": 2.164492607116699, "learning_rate": 4.7634216129001453e-05, "loss": 0.4545, "step": 1960 }, { "epoch": 0.9099307159353349, "grad_norm": 2.37625789642334, "learning_rate": 4.7591242942175924e-05, "loss": 0.3965, "step": 1970 }, { "epoch": 0.9145496535796767, "grad_norm": 2.488903284072876, "learning_rate": 4.7547902711053535e-05, "loss": 0.4459, "step": 1980 }, { "epoch": 0.9191685912240185, "grad_norm": 3.028672933578491, "learning_rate": 4.7504196139791405e-05, "loss": 0.452, "step": 1990 }, { "epoch": 0.9237875288683602, "grad_norm": 2.084078073501587, "learning_rate": 4.746012393849866e-05, "loss": 0.4078, "step": 2000 }, { "epoch": 0.9284064665127021, "grad_norm": 2.061680555343628, "learning_rate": 4.741568682322488e-05, "loss": 0.3691, "step": 2010 }, { "epoch": 0.9330254041570438, "grad_norm": 2.4597408771514893, "learning_rate": 4.7370885515948486e-05, "loss": 0.3844, "step": 2020 }, { "epoch": 0.9376443418013857, "grad_norm": 2.8126583099365234, "learning_rate": 4.732572074456498e-05, "loss": 0.392, "step": 2030 }, { "epoch": 0.9422632794457275, "grad_norm": 2.3137059211730957, "learning_rate": 4.728019324287516e-05, "loss": 0.4237, "step": 2040 }, { "epoch": 0.9468822170900693, "grad_norm": 3.646803855895996, "learning_rate": 4.723430375057314e-05, "loss": 0.3857, "step": 2050 }, { "epoch": 0.9515011547344111, "grad_norm": 1.3006259202957153, "learning_rate": 4.718805301323439e-05, "loss": 0.4193, "step": 2060 }, { "epoch": 0.9561200923787528, "grad_norm": 3.648193359375, "learning_rate": 4.71414417823036e-05, "loss": 0.3896, "step": 2070 }, { "epoch": 0.9607390300230947, "grad_norm": 2.0217714309692383, "learning_rate": 4.7094470815082443e-05, "loss": 0.4315, "step": 2080 }, { "epoch": 0.9653579676674365, "grad_norm": 1.8971320390701294, "learning_rate": 4.704714087471733e-05, "loss": 0.4643, "step": 2090 }, { "epoch": 0.9699769053117783, "grad_norm": 2.0947530269622803, "learning_rate": 4.6999452730186966e-05, "loss": 0.4065, "step": 2100 }, { "epoch": 0.9745958429561201, "grad_norm": 2.21998929977417, "learning_rate": 4.6951407156289856e-05, "loss": 0.4102, "step": 2110 }, { "epoch": 0.9792147806004619, "grad_norm": 2.2499032020568848, "learning_rate": 4.6903004933631745e-05, "loss": 0.4339, "step": 2120 }, { "epoch": 0.9838337182448037, "grad_norm": 2.322390079498291, "learning_rate": 4.68542468486129e-05, "loss": 0.41, "step": 2130 }, { "epoch": 0.9884526558891455, "grad_norm": 2.2508604526519775, "learning_rate": 4.680513369341539e-05, "loss": 0.4142, "step": 2140 }, { "epoch": 0.9930715935334873, "grad_norm": 2.1183762550354004, "learning_rate": 4.675566626599014e-05, "loss": 0.4064, "step": 2150 }, { "epoch": 0.9976905311778291, "grad_norm": 1.6184558868408203, "learning_rate": 4.6705845370044015e-05, "loss": 0.4167, "step": 2160 }, { "epoch": 1.002309468822171, "grad_norm": 2.536665678024292, "learning_rate": 4.665567181502676e-05, "loss": 0.3415, "step": 2170 }, { "epoch": 1.0069284064665127, "grad_norm": 1.4247705936431885, "learning_rate": 4.6605146416117835e-05, "loss": 0.2922, "step": 2180 }, { "epoch": 1.0115473441108545, "grad_norm": 2.4373302459716797, "learning_rate": 4.655426999421317e-05, "loss": 0.346, "step": 2190 }, { "epoch": 1.0161662817551964, "grad_norm": 2.1059627532958984, "learning_rate": 4.650304337591185e-05, "loss": 0.3091, "step": 2200 }, { "epoch": 1.0207852193995381, "grad_norm": 1.6682378053665161, "learning_rate": 4.6451467393502644e-05, "loss": 0.286, "step": 2210 }, { "epoch": 1.0254041570438799, "grad_norm": 2.1200857162475586, "learning_rate": 4.639954288495054e-05, "loss": 0.3246, "step": 2220 }, { "epoch": 1.0300230946882216, "grad_norm": 2.503147602081299, "learning_rate": 4.634727069388306e-05, "loss": 0.3594, "step": 2230 }, { "epoch": 1.0346420323325636, "grad_norm": 1.5546462535858154, "learning_rate": 4.629465166957662e-05, "loss": 0.3154, "step": 2240 }, { "epoch": 1.0392609699769053, "grad_norm": 1.6011874675750732, "learning_rate": 4.6241686666942694e-05, "loss": 0.3309, "step": 2250 }, { "epoch": 1.043879907621247, "grad_norm": 1.319419026374817, "learning_rate": 4.618837654651393e-05, "loss": 0.3198, "step": 2260 }, { "epoch": 1.048498845265589, "grad_norm": 1.689467191696167, "learning_rate": 4.613472217443017e-05, "loss": 0.3203, "step": 2270 }, { "epoch": 1.0531177829099307, "grad_norm": 3.2122750282287598, "learning_rate": 4.608072442242439e-05, "loss": 0.3269, "step": 2280 }, { "epoch": 1.0577367205542725, "grad_norm": 1.627459168434143, "learning_rate": 4.602638416780851e-05, "loss": 0.3094, "step": 2290 }, { "epoch": 1.0623556581986142, "grad_norm": 2.7835917472839355, "learning_rate": 4.5971702293459163e-05, "loss": 0.3146, "step": 2300 }, { "epoch": 1.0669745958429562, "grad_norm": 1.5927238464355469, "learning_rate": 4.591667968780336e-05, "loss": 0.3058, "step": 2310 }, { "epoch": 1.071593533487298, "grad_norm": 1.8358134031295776, "learning_rate": 4.5861317244804015e-05, "loss": 0.3079, "step": 2320 }, { "epoch": 1.0762124711316396, "grad_norm": 1.3477319478988647, "learning_rate": 4.580561586394545e-05, "loss": 0.3355, "step": 2330 }, { "epoch": 1.0808314087759816, "grad_norm": 2.179304361343384, "learning_rate": 4.57495764502188e-05, "loss": 0.3301, "step": 2340 }, { "epoch": 1.0854503464203233, "grad_norm": 1.5659071207046509, "learning_rate": 4.569319991410725e-05, "loss": 0.3133, "step": 2350 }, { "epoch": 1.090069284064665, "grad_norm": 1.8552881479263306, "learning_rate": 4.563648717157131e-05, "loss": 0.3287, "step": 2360 }, { "epoch": 1.0946882217090068, "grad_norm": 2.5701863765716553, "learning_rate": 4.557943914403386e-05, "loss": 0.3219, "step": 2370 }, { "epoch": 1.0993071593533488, "grad_norm": 1.5083070993423462, "learning_rate": 4.552205675836527e-05, "loss": 0.317, "step": 2380 }, { "epoch": 1.1039260969976905, "grad_norm": 2.8584365844726562, "learning_rate": 4.5464340946868256e-05, "loss": 0.2864, "step": 2390 }, { "epoch": 1.1085450346420322, "grad_norm": 1.630210518836975, "learning_rate": 4.540629264726278e-05, "loss": 0.2998, "step": 2400 }, { "epoch": 1.1131639722863742, "grad_norm": 1.8594950437545776, "learning_rate": 4.53479128026708e-05, "loss": 0.3156, "step": 2410 }, { "epoch": 1.117782909930716, "grad_norm": 1.5817375183105469, "learning_rate": 4.528920236160096e-05, "loss": 0.3201, "step": 2420 }, { "epoch": 1.1224018475750577, "grad_norm": 2.3912880420684814, "learning_rate": 4.523016227793315e-05, "loss": 0.2986, "step": 2430 }, { "epoch": 1.1270207852193996, "grad_norm": 1.5563384294509888, "learning_rate": 4.5170793510903046e-05, "loss": 0.3021, "step": 2440 }, { "epoch": 1.1316397228637414, "grad_norm": 1.8241164684295654, "learning_rate": 4.511109702508648e-05, "loss": 0.3316, "step": 2450 }, { "epoch": 1.136258660508083, "grad_norm": 2.19366192817688, "learning_rate": 4.505107379038384e-05, "loss": 0.3255, "step": 2460 }, { "epoch": 1.140877598152425, "grad_norm": 1.471584439277649, "learning_rate": 4.499072478200421e-05, "loss": 0.3245, "step": 2470 }, { "epoch": 1.1454965357967668, "grad_norm": 1.6688241958618164, "learning_rate": 4.493005098044963e-05, "loss": 0.3336, "step": 2480 }, { "epoch": 1.1501154734411085, "grad_norm": 1.9700522422790527, "learning_rate": 4.486905337149909e-05, "loss": 0.3277, "step": 2490 }, { "epoch": 1.1547344110854503, "grad_norm": 2.3884382247924805, "learning_rate": 4.480773294619255e-05, "loss": 0.2978, "step": 2500 }, { "epoch": 1.1593533487297922, "grad_norm": 2.134876251220703, "learning_rate": 4.474609070081483e-05, "loss": 0.3168, "step": 2510 }, { "epoch": 1.163972286374134, "grad_norm": 1.8444334268569946, "learning_rate": 4.468412763687942e-05, "loss": 0.3228, "step": 2520 }, { "epoch": 1.1685912240184757, "grad_norm": 1.6720913648605347, "learning_rate": 4.4621844761112216e-05, "loss": 0.3174, "step": 2530 }, { "epoch": 1.1732101616628174, "grad_norm": 1.8607888221740723, "learning_rate": 4.4559243085435154e-05, "loss": 0.265, "step": 2540 }, { "epoch": 1.1778290993071594, "grad_norm": 1.9527697563171387, "learning_rate": 4.449632362694978e-05, "loss": 0.3143, "step": 2550 }, { "epoch": 1.1824480369515011, "grad_norm": 2.462707757949829, "learning_rate": 4.443308740792072e-05, "loss": 0.3007, "step": 2560 }, { "epoch": 1.1870669745958429, "grad_norm": 1.9817255735397339, "learning_rate": 4.436953545575904e-05, "loss": 0.3135, "step": 2570 }, { "epoch": 1.1916859122401848, "grad_norm": 2.517148971557617, "learning_rate": 4.430566880300563e-05, "loss": 0.3346, "step": 2580 }, { "epoch": 1.1963048498845266, "grad_norm": 2.951472282409668, "learning_rate": 4.4241488487314365e-05, "loss": 0.3111, "step": 2590 }, { "epoch": 1.2009237875288683, "grad_norm": 3.675570249557495, "learning_rate": 4.417699555143523e-05, "loss": 0.3082, "step": 2600 }, { "epoch": 1.2055427251732103, "grad_norm": 1.6169134378433228, "learning_rate": 4.411219104319746e-05, "loss": 0.3247, "step": 2610 }, { "epoch": 1.210161662817552, "grad_norm": 4.280526638031006, "learning_rate": 4.404707601549244e-05, "loss": 0.3309, "step": 2620 }, { "epoch": 1.2147806004618937, "grad_norm": 1.2955102920532227, "learning_rate": 4.398165152625663e-05, "loss": 0.2845, "step": 2630 }, { "epoch": 1.2193995381062355, "grad_norm": 2.1361026763916016, "learning_rate": 4.391591863845436e-05, "loss": 0.3143, "step": 2640 }, { "epoch": 1.2240184757505774, "grad_norm": 2.1552889347076416, "learning_rate": 4.384987842006059e-05, "loss": 0.3042, "step": 2650 }, { "epoch": 1.2286374133949192, "grad_norm": 2.199091672897339, "learning_rate": 4.378353194404352e-05, "loss": 0.3426, "step": 2660 }, { "epoch": 1.233256351039261, "grad_norm": 2.23543119430542, "learning_rate": 4.371688028834721e-05, "loss": 0.3047, "step": 2670 }, { "epoch": 1.2378752886836029, "grad_norm": 1.7668689489364624, "learning_rate": 4.3649924535873994e-05, "loss": 0.3269, "step": 2680 }, { "epoch": 1.2424942263279446, "grad_norm": 1.9710248708724976, "learning_rate": 4.3582665774466946e-05, "loss": 0.3252, "step": 2690 }, { "epoch": 1.2471131639722863, "grad_norm": 1.9266043901443481, "learning_rate": 4.351510509689218e-05, "loss": 0.2913, "step": 2700 }, { "epoch": 1.251732101616628, "grad_norm": 1.4974067211151123, "learning_rate": 4.34472436008211e-05, "loss": 0.3071, "step": 2710 }, { "epoch": 1.25635103926097, "grad_norm": 1.7189793586730957, "learning_rate": 4.337908238881258e-05, "loss": 0.321, "step": 2720 }, { "epoch": 1.2609699769053118, "grad_norm": 1.8770833015441895, "learning_rate": 4.331062256829501e-05, "loss": 0.3252, "step": 2730 }, { "epoch": 1.2655889145496535, "grad_norm": 2.516841411590576, "learning_rate": 4.324186525154835e-05, "loss": 0.2971, "step": 2740 }, { "epoch": 1.2702078521939955, "grad_norm": 1.8335446119308472, "learning_rate": 4.317281155568604e-05, "loss": 0.3137, "step": 2750 }, { "epoch": 1.2748267898383372, "grad_norm": 1.632881999015808, "learning_rate": 4.3103462602636845e-05, "loss": 0.3096, "step": 2760 }, { "epoch": 1.279445727482679, "grad_norm": 1.6128835678100586, "learning_rate": 4.303381951912661e-05, "loss": 0.3228, "step": 2770 }, { "epoch": 1.2840646651270209, "grad_norm": 2.278327703475952, "learning_rate": 4.296388343666001e-05, "loss": 0.2835, "step": 2780 }, { "epoch": 1.2886836027713626, "grad_norm": 1.4413753747940063, "learning_rate": 4.2893655491502105e-05, "loss": 0.2993, "step": 2790 }, { "epoch": 1.2933025404157044, "grad_norm": 2.016895055770874, "learning_rate": 4.282313682465992e-05, "loss": 0.2721, "step": 2800 }, { "epoch": 1.2979214780600463, "grad_norm": 1.687320590019226, "learning_rate": 4.275232858186388e-05, "loss": 0.2811, "step": 2810 }, { "epoch": 1.302540415704388, "grad_norm": 1.556947112083435, "learning_rate": 4.268123191354921e-05, "loss": 0.3164, "step": 2820 }, { "epoch": 1.3071593533487298, "grad_norm": 1.7516964673995972, "learning_rate": 4.260984797483725e-05, "loss": 0.2722, "step": 2830 }, { "epoch": 1.3117782909930715, "grad_norm": 1.999335765838623, "learning_rate": 4.2538177925516665e-05, "loss": 0.289, "step": 2840 }, { "epoch": 1.3163972286374133, "grad_norm": 2.1948471069335938, "learning_rate": 4.2466222930024616e-05, "loss": 0.2969, "step": 2850 }, { "epoch": 1.3210161662817552, "grad_norm": 3.0124406814575195, "learning_rate": 4.239398415742784e-05, "loss": 0.336, "step": 2860 }, { "epoch": 1.325635103926097, "grad_norm": 2.1753032207489014, "learning_rate": 4.232146278140366e-05, "loss": 0.3086, "step": 2870 }, { "epoch": 1.3302540415704387, "grad_norm": 1.9752156734466553, "learning_rate": 4.224865998022092e-05, "loss": 0.2998, "step": 2880 }, { "epoch": 1.3348729792147807, "grad_norm": 2.4908149242401123, "learning_rate": 4.2175576936720805e-05, "loss": 0.3096, "step": 2890 }, { "epoch": 1.3394919168591224, "grad_norm": 1.8155696392059326, "learning_rate": 4.2102214838297696e-05, "loss": 0.3046, "step": 2900 }, { "epoch": 1.3441108545034641, "grad_norm": 2.2324860095977783, "learning_rate": 4.202857487687977e-05, "loss": 0.3065, "step": 2910 }, { "epoch": 1.348729792147806, "grad_norm": 1.9542967081069946, "learning_rate": 4.1954658248909764e-05, "loss": 0.284, "step": 2920 }, { "epoch": 1.3533487297921478, "grad_norm": 1.6855888366699219, "learning_rate": 4.188046615532541e-05, "loss": 0.2928, "step": 2930 }, { "epoch": 1.3579676674364896, "grad_norm": 2.32239031791687, "learning_rate": 4.180599980154005e-05, "loss": 0.3303, "step": 2940 }, { "epoch": 1.3625866050808315, "grad_norm": 1.3265233039855957, "learning_rate": 4.173126039742292e-05, "loss": 0.2994, "step": 2950 }, { "epoch": 1.3672055427251733, "grad_norm": 1.8445942401885986, "learning_rate": 4.16562491572796e-05, "loss": 0.2883, "step": 2960 }, { "epoch": 1.371824480369515, "grad_norm": 1.4618576765060425, "learning_rate": 4.158096729983222e-05, "loss": 0.3107, "step": 2970 }, { "epoch": 1.376443418013857, "grad_norm": 1.7024414539337158, "learning_rate": 4.150541604819969e-05, "loss": 0.2596, "step": 2980 }, { "epoch": 1.3810623556581987, "grad_norm": 1.2643704414367676, "learning_rate": 4.142959662987783e-05, "loss": 0.311, "step": 2990 }, { "epoch": 1.3856812933025404, "grad_norm": 2.466792345046997, "learning_rate": 4.1353510276719386e-05, "loss": 0.2991, "step": 3000 }, { "epoch": 1.3903002309468822, "grad_norm": 2.7137906551361084, "learning_rate": 4.127715822491408e-05, "loss": 0.3034, "step": 3010 }, { "epoch": 1.394919168591224, "grad_norm": 1.9120469093322754, "learning_rate": 4.120054171496847e-05, "loss": 0.2926, "step": 3020 }, { "epoch": 1.3995381062355658, "grad_norm": 1.790726900100708, "learning_rate": 4.1123661991685826e-05, "loss": 0.2634, "step": 3030 }, { "epoch": 1.4041570438799076, "grad_norm": 1.786881446838379, "learning_rate": 4.1046520304145884e-05, "loss": 0.2932, "step": 3040 }, { "epoch": 1.4087759815242493, "grad_norm": 1.7593064308166504, "learning_rate": 4.096911790568459e-05, "loss": 0.3173, "step": 3050 }, { "epoch": 1.4133949191685913, "grad_norm": 2.030292510986328, "learning_rate": 4.089145605387368e-05, "loss": 0.3248, "step": 3060 }, { "epoch": 1.418013856812933, "grad_norm": 1.089842438697815, "learning_rate": 4.08135360105003e-05, "loss": 0.2823, "step": 3070 }, { "epoch": 1.4226327944572748, "grad_norm": 3.046901226043701, "learning_rate": 4.0735359041546476e-05, "loss": 0.287, "step": 3080 }, { "epoch": 1.4272517321016167, "grad_norm": 1.6824251413345337, "learning_rate": 4.065692641716855e-05, "loss": 0.2727, "step": 3090 }, { "epoch": 1.4318706697459584, "grad_norm": 2.4673869609832764, "learning_rate": 4.0578239411676556e-05, "loss": 0.3161, "step": 3100 }, { "epoch": 1.4364896073903002, "grad_norm": 1.6147607564926147, "learning_rate": 4.049929930351349e-05, "loss": 0.338, "step": 3110 }, { "epoch": 1.4411085450346421, "grad_norm": 1.293908953666687, "learning_rate": 4.04201073752346e-05, "loss": 0.2863, "step": 3120 }, { "epoch": 1.4457274826789839, "grad_norm": 2.096881866455078, "learning_rate": 4.034066491348645e-05, "loss": 0.3121, "step": 3130 }, { "epoch": 1.4503464203233256, "grad_norm": 1.5880385637283325, "learning_rate": 4.026097320898609e-05, "loss": 0.2995, "step": 3140 }, { "epoch": 1.4549653579676676, "grad_norm": 1.1587382555007935, "learning_rate": 4.0181033556500074e-05, "loss": 0.3035, "step": 3150 }, { "epoch": 1.4595842956120093, "grad_norm": 4.055675029754639, "learning_rate": 4.0100847254823414e-05, "loss": 0.3209, "step": 3160 }, { "epoch": 1.464203233256351, "grad_norm": 2.2304954528808594, "learning_rate": 4.0020415606758474e-05, "loss": 0.2716, "step": 3170 }, { "epoch": 1.4688221709006928, "grad_norm": 1.7061644792556763, "learning_rate": 3.99397399190938e-05, "loss": 0.297, "step": 3180 }, { "epoch": 1.4734411085450345, "grad_norm": 2.139970302581787, "learning_rate": 3.985882150258291e-05, "loss": 0.2978, "step": 3190 }, { "epoch": 1.4780600461893765, "grad_norm": 1.6300883293151855, "learning_rate": 3.9777661671922984e-05, "loss": 0.2923, "step": 3200 }, { "epoch": 1.4826789838337182, "grad_norm": 1.210604190826416, "learning_rate": 3.96962617457335e-05, "loss": 0.3015, "step": 3210 }, { "epoch": 1.48729792147806, "grad_norm": 8.268691062927246, "learning_rate": 3.96146230465348e-05, "loss": 0.3306, "step": 3220 }, { "epoch": 1.491916859122402, "grad_norm": 2.2136807441711426, "learning_rate": 3.9532746900726645e-05, "loss": 0.3133, "step": 3230 }, { "epoch": 1.4965357967667436, "grad_norm": 2.0368247032165527, "learning_rate": 3.9450634638566624e-05, "loss": 0.3387, "step": 3240 }, { "epoch": 1.5011547344110854, "grad_norm": 2.5188982486724854, "learning_rate": 3.9368287594148555e-05, "loss": 0.3095, "step": 3250 }, { "epoch": 1.5057736720554273, "grad_norm": 3.128319263458252, "learning_rate": 3.92857071053808e-05, "loss": 0.2646, "step": 3260 }, { "epoch": 1.510392609699769, "grad_norm": 2.266287088394165, "learning_rate": 3.920289451396455e-05, "loss": 0.2996, "step": 3270 }, { "epoch": 1.5150115473441108, "grad_norm": 2.842649221420288, "learning_rate": 3.9119851165372e-05, "loss": 0.2782, "step": 3280 }, { "epoch": 1.5196304849884528, "grad_norm": 2.6462066173553467, "learning_rate": 3.903657840882453e-05, "loss": 0.2967, "step": 3290 }, { "epoch": 1.5242494226327945, "grad_norm": 2.3835110664367676, "learning_rate": 3.89530775972707e-05, "loss": 0.2774, "step": 3300 }, { "epoch": 1.5288683602771362, "grad_norm": 3.416583776473999, "learning_rate": 3.886935008736439e-05, "loss": 0.2862, "step": 3310 }, { "epoch": 1.5334872979214782, "grad_norm": 2.1469576358795166, "learning_rate": 3.8785397239442636e-05, "loss": 0.2783, "step": 3320 }, { "epoch": 1.5381062355658197, "grad_norm": 2.151310682296753, "learning_rate": 3.870122041750363e-05, "loss": 0.3143, "step": 3330 }, { "epoch": 1.5427251732101617, "grad_norm": 1.654341220855713, "learning_rate": 3.861682098918447e-05, "loss": 0.2778, "step": 3340 }, { "epoch": 1.5473441108545036, "grad_norm": 2.1126046180725098, "learning_rate": 3.853220032573902e-05, "loss": 0.2945, "step": 3350 }, { "epoch": 1.5519630484988451, "grad_norm": 1.182999610900879, "learning_rate": 3.844735980201557e-05, "loss": 0.2757, "step": 3360 }, { "epoch": 1.556581986143187, "grad_norm": 1.8858295679092407, "learning_rate": 3.836230079643452e-05, "loss": 0.2963, "step": 3370 }, { "epoch": 1.5612009237875288, "grad_norm": 2.020688533782959, "learning_rate": 3.8277024690966034e-05, "loss": 0.2638, "step": 3380 }, { "epoch": 1.5658198614318706, "grad_norm": 3.3935155868530273, "learning_rate": 3.819153287110746e-05, "loss": 0.2949, "step": 3390 }, { "epoch": 1.5704387990762125, "grad_norm": 2.6277477741241455, "learning_rate": 3.8105826725860976e-05, "loss": 0.3307, "step": 3400 }, { "epoch": 1.5750577367205543, "grad_norm": 1.7885854244232178, "learning_rate": 3.801990764771089e-05, "loss": 0.2866, "step": 3410 }, { "epoch": 1.579676674364896, "grad_norm": 2.0295510292053223, "learning_rate": 3.793377703260112e-05, "loss": 0.3182, "step": 3420 }, { "epoch": 1.584295612009238, "grad_norm": 1.3024362325668335, "learning_rate": 3.784743627991243e-05, "loss": 0.2969, "step": 3430 }, { "epoch": 1.5889145496535797, "grad_norm": 1.4249627590179443, "learning_rate": 3.7760886792439724e-05, "loss": 0.2748, "step": 3440 }, { "epoch": 1.5935334872979214, "grad_norm": 1.7537922859191895, "learning_rate": 3.767412997636929e-05, "loss": 0.28, "step": 3450 }, { "epoch": 1.5981524249422634, "grad_norm": 1.3318328857421875, "learning_rate": 3.758716724125592e-05, "loss": 0.2595, "step": 3460 }, { "epoch": 1.6027713625866051, "grad_norm": 2.035454273223877, "learning_rate": 3.7500000000000003e-05, "loss": 0.3124, "step": 3470 }, { "epoch": 1.6073903002309469, "grad_norm": 1.43570876121521, "learning_rate": 3.7412629668824575e-05, "loss": 0.297, "step": 3480 }, { "epoch": 1.6120092378752888, "grad_norm": 2.0970613956451416, "learning_rate": 3.732505766725231e-05, "loss": 0.2931, "step": 3490 }, { "epoch": 1.6166281755196303, "grad_norm": 1.3278722763061523, "learning_rate": 3.723728541808247e-05, "loss": 0.2814, "step": 3500 }, { "epoch": 1.6212471131639723, "grad_norm": 2.9331254959106445, "learning_rate": 3.714931434736778e-05, "loss": 0.2789, "step": 3510 }, { "epoch": 1.625866050808314, "grad_norm": 1.5764210224151611, "learning_rate": 3.706114588439127e-05, "loss": 0.2792, "step": 3520 }, { "epoch": 1.6304849884526558, "grad_norm": 2.03232741355896, "learning_rate": 3.6972781461643e-05, "loss": 0.2995, "step": 3530 }, { "epoch": 1.6351039260969977, "grad_norm": 1.9853382110595703, "learning_rate": 3.688422251479686e-05, "loss": 0.3067, "step": 3540 }, { "epoch": 1.6397228637413395, "grad_norm": 1.4756333827972412, "learning_rate": 3.679547048268721e-05, "loss": 0.3201, "step": 3550 }, { "epoch": 1.6443418013856812, "grad_norm": 1.2364952564239502, "learning_rate": 3.670652680728548e-05, "loss": 0.2631, "step": 3560 }, { "epoch": 1.6489607390300232, "grad_norm": 1.897588849067688, "learning_rate": 3.661739293367678e-05, "loss": 0.2751, "step": 3570 }, { "epoch": 1.653579676674365, "grad_norm": 1.7648630142211914, "learning_rate": 3.652807031003642e-05, "loss": 0.2861, "step": 3580 }, { "epoch": 1.6581986143187066, "grad_norm": 1.905043125152588, "learning_rate": 3.643856038760634e-05, "loss": 0.3111, "step": 3590 }, { "epoch": 1.6628175519630486, "grad_norm": 1.5192134380340576, "learning_rate": 3.63488646206716e-05, "loss": 0.2962, "step": 3600 }, { "epoch": 1.6674364896073903, "grad_norm": 2.8806204795837402, "learning_rate": 3.625898446653666e-05, "loss": 0.3159, "step": 3610 }, { "epoch": 1.672055427251732, "grad_norm": 1.5190327167510986, "learning_rate": 3.6168921385501794e-05, "loss": 0.2874, "step": 3620 }, { "epoch": 1.676674364896074, "grad_norm": 2.227309226989746, "learning_rate": 3.60786768408393e-05, "loss": 0.2793, "step": 3630 }, { "epoch": 1.6812933025404158, "grad_norm": 1.3853188753128052, "learning_rate": 3.598825229876979e-05, "loss": 0.2809, "step": 3640 }, { "epoch": 1.6859122401847575, "grad_norm": 1.5016895532608032, "learning_rate": 3.589764922843828e-05, "loss": 0.2876, "step": 3650 }, { "epoch": 1.6905311778290995, "grad_norm": 1.5421347618103027, "learning_rate": 3.580686910189039e-05, "loss": 0.2872, "step": 3660 }, { "epoch": 1.695150115473441, "grad_norm": 1.6591922044754028, "learning_rate": 3.5715913394048425e-05, "loss": 0.2941, "step": 3670 }, { "epoch": 1.699769053117783, "grad_norm": 2.824263572692871, "learning_rate": 3.562478358268735e-05, "loss": 0.2751, "step": 3680 }, { "epoch": 1.7043879907621247, "grad_norm": 1.939245343208313, "learning_rate": 3.5533481148410875e-05, "loss": 0.2989, "step": 3690 }, { "epoch": 1.7090069284064664, "grad_norm": 2.3213257789611816, "learning_rate": 3.544200757462731e-05, "loss": 0.3177, "step": 3700 }, { "epoch": 1.7136258660508084, "grad_norm": 1.687760353088379, "learning_rate": 3.535036434752551e-05, "loss": 0.272, "step": 3710 }, { "epoch": 1.71824480369515, "grad_norm": 1.6622027158737183, "learning_rate": 3.525855295605074e-05, "loss": 0.2875, "step": 3720 }, { "epoch": 1.7228637413394918, "grad_norm": 1.5205411911010742, "learning_rate": 3.516657489188043e-05, "loss": 0.3006, "step": 3730 }, { "epoch": 1.7274826789838338, "grad_norm": 2.778884172439575, "learning_rate": 3.5074431649400004e-05, "loss": 0.2739, "step": 3740 }, { "epoch": 1.7321016166281755, "grad_norm": 1.613918423652649, "learning_rate": 3.498212472567855e-05, "loss": 0.2921, "step": 3750 }, { "epoch": 1.7367205542725173, "grad_norm": 1.5840065479278564, "learning_rate": 3.488965562044452e-05, "loss": 0.297, "step": 3760 }, { "epoch": 1.7413394919168592, "grad_norm": 1.6937522888183594, "learning_rate": 3.4797025836061384e-05, "loss": 0.2793, "step": 3770 }, { "epoch": 1.745958429561201, "grad_norm": 2.0421125888824463, "learning_rate": 3.470423687750317e-05, "loss": 0.2902, "step": 3780 }, { "epoch": 1.7505773672055427, "grad_norm": 1.6249983310699463, "learning_rate": 3.461129025233004e-05, "loss": 0.3115, "step": 3790 }, { "epoch": 1.7551963048498846, "grad_norm": 1.5829148292541504, "learning_rate": 3.451818747066381e-05, "loss": 0.2648, "step": 3800 }, { "epoch": 1.7598152424942262, "grad_norm": 1.2830034494400024, "learning_rate": 3.44249300451634e-05, "loss": 0.3059, "step": 3810 }, { "epoch": 1.7644341801385681, "grad_norm": 2.2198989391326904, "learning_rate": 3.433151949100024e-05, "loss": 0.2785, "step": 3820 }, { "epoch": 1.76905311778291, "grad_norm": 1.5156611204147339, "learning_rate": 3.42379573258337e-05, "loss": 0.2795, "step": 3830 }, { "epoch": 1.7736720554272516, "grad_norm": 2.114614725112915, "learning_rate": 3.4144245069786374e-05, "loss": 0.2755, "step": 3840 }, { "epoch": 1.7782909930715936, "grad_norm": 1.391946792602539, "learning_rate": 3.405038424541943e-05, "loss": 0.2885, "step": 3850 }, { "epoch": 1.7829099307159353, "grad_norm": 2.22967267036438, "learning_rate": 3.3956376377707854e-05, "loss": 0.282, "step": 3860 }, { "epoch": 1.787528868360277, "grad_norm": 1.859484076499939, "learning_rate": 3.3862222994015663e-05, "loss": 0.2648, "step": 3870 }, { "epoch": 1.792147806004619, "grad_norm": 1.2860459089279175, "learning_rate": 3.376792562407111e-05, "loss": 0.2858, "step": 3880 }, { "epoch": 1.7967667436489607, "grad_norm": 3.0899295806884766, "learning_rate": 3.3673485799941806e-05, "loss": 0.2963, "step": 3890 }, { "epoch": 1.8013856812933025, "grad_norm": 1.5559500455856323, "learning_rate": 3.357890505600988e-05, "loss": 0.2696, "step": 3900 }, { "epoch": 1.8060046189376444, "grad_norm": 2.3138883113861084, "learning_rate": 3.348418492894695e-05, "loss": 0.3045, "step": 3910 }, { "epoch": 1.8106235565819861, "grad_norm": 1.8770431280136108, "learning_rate": 3.3389326957689284e-05, "loss": 0.2659, "step": 3920 }, { "epoch": 1.8152424942263279, "grad_norm": 1.8959892988204956, "learning_rate": 3.3294332683412675e-05, "loss": 0.3247, "step": 3930 }, { "epoch": 1.8198614318706698, "grad_norm": 1.7467166185379028, "learning_rate": 3.3199203649507514e-05, "loss": 0.2596, "step": 3940 }, { "epoch": 1.8244803695150116, "grad_norm": 2.085278272628784, "learning_rate": 3.310394140155361e-05, "loss": 0.2882, "step": 3950 }, { "epoch": 1.8290993071593533, "grad_norm": 3.6550683975219727, "learning_rate": 3.300854748729515e-05, "loss": 0.2721, "step": 3960 }, { "epoch": 1.8337182448036953, "grad_norm": 1.661049723625183, "learning_rate": 3.2913023456615524e-05, "loss": 0.3035, "step": 3970 }, { "epoch": 1.8383371824480368, "grad_norm": 1.9763981103897095, "learning_rate": 3.281737086151214e-05, "loss": 0.2571, "step": 3980 }, { "epoch": 1.8429561200923787, "grad_norm": 1.6466537714004517, "learning_rate": 3.272159125607123e-05, "loss": 0.2801, "step": 3990 }, { "epoch": 1.8475750577367207, "grad_norm": 2.7753658294677734, "learning_rate": 3.262568619644259e-05, "loss": 0.2494, "step": 4000 }, { "epoch": 1.8521939953810622, "grad_norm": 2.5988552570343018, "learning_rate": 3.252965724081428e-05, "loss": 0.2717, "step": 4010 }, { "epoch": 1.8568129330254042, "grad_norm": 1.3616626262664795, "learning_rate": 3.2433505949387325e-05, "loss": 0.2617, "step": 4020 }, { "epoch": 1.861431870669746, "grad_norm": 1.6661334037780762, "learning_rate": 3.233723388435039e-05, "loss": 0.2463, "step": 4030 }, { "epoch": 1.8660508083140877, "grad_norm": 1.3578579425811768, "learning_rate": 3.224084260985432e-05, "loss": 0.3056, "step": 4040 }, { "epoch": 1.8706697459584296, "grad_norm": 1.414781928062439, "learning_rate": 3.214433369198682e-05, "loss": 0.2463, "step": 4050 }, { "epoch": 1.8752886836027713, "grad_norm": 2.290524959564209, "learning_rate": 3.204770869874697e-05, "loss": 0.2966, "step": 4060 }, { "epoch": 1.879907621247113, "grad_norm": 1.2253903150558472, "learning_rate": 3.1950969200019724e-05, "loss": 0.2646, "step": 4070 }, { "epoch": 1.884526558891455, "grad_norm": 2.6661570072174072, "learning_rate": 3.185411676755046e-05, "loss": 0.2779, "step": 4080 }, { "epoch": 1.8891454965357968, "grad_norm": 1.4080040454864502, "learning_rate": 3.175715297491938e-05, "loss": 0.3086, "step": 4090 }, { "epoch": 1.8937644341801385, "grad_norm": 1.793808102607727, "learning_rate": 3.166007939751599e-05, "loss": 0.281, "step": 4100 }, { "epoch": 1.8983833718244805, "grad_norm": 1.3977165222167969, "learning_rate": 3.1562897612513506e-05, "loss": 0.2702, "step": 4110 }, { "epoch": 1.9030023094688222, "grad_norm": 2.07999849319458, "learning_rate": 3.1465609198843184e-05, "loss": 0.2916, "step": 4120 }, { "epoch": 1.907621247113164, "grad_norm": 1.8861336708068848, "learning_rate": 3.136821573716872e-05, "loss": 0.2685, "step": 4130 }, { "epoch": 1.912240184757506, "grad_norm": 1.0516432523727417, "learning_rate": 3.1270718809860535e-05, "loss": 0.308, "step": 4140 }, { "epoch": 1.9168591224018474, "grad_norm": 2.8229928016662598, "learning_rate": 3.117312000097008e-05, "loss": 0.2892, "step": 4150 }, { "epoch": 1.9214780600461894, "grad_norm": 1.7853163480758667, "learning_rate": 3.10754208962041e-05, "loss": 0.2836, "step": 4160 }, { "epoch": 1.9260969976905313, "grad_norm": 3.0247716903686523, "learning_rate": 3.0977623082898846e-05, "loss": 0.2884, "step": 4170 }, { "epoch": 1.9307159353348728, "grad_norm": 1.726279616355896, "learning_rate": 3.0879728149994334e-05, "loss": 0.2983, "step": 4180 }, { "epoch": 1.9353348729792148, "grad_norm": 1.9192169904708862, "learning_rate": 3.0781737688008486e-05, "loss": 0.267, "step": 4190 }, { "epoch": 1.9399538106235565, "grad_norm": 1.7103173732757568, "learning_rate": 3.06836532890113e-05, "loss": 0.2871, "step": 4200 }, { "epoch": 1.9445727482678983, "grad_norm": 2.5731523036956787, "learning_rate": 3.0585476546599e-05, "loss": 0.2907, "step": 4210 }, { "epoch": 1.9491916859122402, "grad_norm": 1.847522258758545, "learning_rate": 3.0487209055868114e-05, "loss": 0.2915, "step": 4220 }, { "epoch": 1.953810623556582, "grad_norm": 2.150347948074341, "learning_rate": 3.0388852413389583e-05, "loss": 0.2675, "step": 4230 }, { "epoch": 1.9584295612009237, "grad_norm": 1.943219542503357, "learning_rate": 3.029040821718282e-05, "loss": 0.3287, "step": 4240 }, { "epoch": 1.9630484988452657, "grad_norm": 3.235292911529541, "learning_rate": 3.0191878066689744e-05, "loss": 0.2682, "step": 4250 }, { "epoch": 1.9676674364896074, "grad_norm": 1.4820986986160278, "learning_rate": 3.009326356274877e-05, "loss": 0.2625, "step": 4260 }, { "epoch": 1.9722863741339491, "grad_norm": 1.1359635591506958, "learning_rate": 2.9994566307568833e-05, "loss": 0.27, "step": 4270 }, { "epoch": 1.976905311778291, "grad_norm": 2.165886402130127, "learning_rate": 2.990566935090711e-05, "loss": 0.2638, "step": 4280 }, { "epoch": 1.9815242494226328, "grad_norm": 1.4407706260681152, "learning_rate": 2.9806819287253002e-05, "loss": 0.301, "step": 4290 }, { "epoch": 1.9861431870669746, "grad_norm": 2.1644721031188965, "learning_rate": 2.9707891126275522e-05, "loss": 0.2699, "step": 4300 }, { "epoch": 1.9907621247113165, "grad_norm": 1.8348596096038818, "learning_rate": 2.9608886475279652e-05, "loss": 0.2473, "step": 4310 }, { "epoch": 1.995381062355658, "grad_norm": 1.6486015319824219, "learning_rate": 2.950980694281312e-05, "loss": 0.2692, "step": 4320 }, { "epoch": 2.0, "grad_norm": 1.314980149269104, "learning_rate": 2.941065413864026e-05, "loss": 0.3164, "step": 4330 }, { "epoch": 2.004618937644342, "grad_norm": 1.0696333646774292, "learning_rate": 2.9311429673715886e-05, "loss": 0.1681, "step": 4340 }, { "epoch": 2.0092378752886835, "grad_norm": 1.729023814201355, "learning_rate": 2.921213516015907e-05, "loss": 0.1849, "step": 4350 }, { "epoch": 2.0138568129330254, "grad_norm": 1.4104517698287964, "learning_rate": 2.9112772211227012e-05, "loss": 0.1715, "step": 4360 }, { "epoch": 2.0184757505773674, "grad_norm": 0.8736545443534851, "learning_rate": 2.901334244128876e-05, "loss": 0.1882, "step": 4370 }, { "epoch": 2.023094688221709, "grad_norm": 0.8108435273170471, "learning_rate": 2.8913847465799033e-05, "loss": 0.1627, "step": 4380 }, { "epoch": 2.027713625866051, "grad_norm": 1.6901952028274536, "learning_rate": 2.8814288901271967e-05, "loss": 0.1782, "step": 4390 }, { "epoch": 2.032332563510393, "grad_norm": 1.809261441230774, "learning_rate": 2.8714668365254827e-05, "loss": 0.1914, "step": 4400 }, { "epoch": 2.0369515011547343, "grad_norm": 2.1029443740844727, "learning_rate": 2.861498747630174e-05, "loss": 0.1664, "step": 4410 }, { "epoch": 2.0415704387990763, "grad_norm": 1.186228632926941, "learning_rate": 2.8515247853947402e-05, "loss": 0.174, "step": 4420 }, { "epoch": 2.046189376443418, "grad_norm": 1.2275420427322388, "learning_rate": 2.841545111868077e-05, "loss": 0.1837, "step": 4430 }, { "epoch": 2.0508083140877598, "grad_norm": 1.1749093532562256, "learning_rate": 2.8315598891918716e-05, "loss": 0.1852, "step": 4440 }, { "epoch": 2.0554272517321017, "grad_norm": 1.4249165058135986, "learning_rate": 2.8215692795979686e-05, "loss": 0.1798, "step": 4450 }, { "epoch": 2.0600461893764432, "grad_norm": 0.9593158960342407, "learning_rate": 2.8115734454057374e-05, "loss": 0.1734, "step": 4460 }, { "epoch": 2.064665127020785, "grad_norm": 1.475946068763733, "learning_rate": 2.8015725490194306e-05, "loss": 0.1751, "step": 4470 }, { "epoch": 2.069284064665127, "grad_norm": 1.298519492149353, "learning_rate": 2.7915667529255463e-05, "loss": 0.183, "step": 4480 }, { "epoch": 2.0739030023094687, "grad_norm": 1.2408766746520996, "learning_rate": 2.7815562196901924e-05, "loss": 0.1933, "step": 4490 }, { "epoch": 2.0785219399538106, "grad_norm": 1.46627938747406, "learning_rate": 2.7715411119564378e-05, "loss": 0.1869, "step": 4500 }, { "epoch": 2.0831408775981526, "grad_norm": 1.2481168508529663, "learning_rate": 2.7615215924416777e-05, "loss": 0.1931, "step": 4510 }, { "epoch": 2.087759815242494, "grad_norm": 1.3887110948562622, "learning_rate": 2.7514978239349854e-05, "loss": 0.1948, "step": 4520 }, { "epoch": 2.092378752886836, "grad_norm": 3.4649949073791504, "learning_rate": 2.741469969294467e-05, "loss": 0.1729, "step": 4530 }, { "epoch": 2.096997690531178, "grad_norm": 1.056360125541687, "learning_rate": 2.731438191444619e-05, "loss": 0.1975, "step": 4540 }, { "epoch": 2.1016166281755195, "grad_norm": 1.524235725402832, "learning_rate": 2.7214026533736763e-05, "loss": 0.1843, "step": 4550 }, { "epoch": 2.1062355658198615, "grad_norm": 1.278883695602417, "learning_rate": 2.7113635181309693e-05, "loss": 0.1774, "step": 4560 }, { "epoch": 2.1108545034642034, "grad_norm": 1.465677261352539, "learning_rate": 2.701320948824272e-05, "loss": 0.1966, "step": 4570 }, { "epoch": 2.115473441108545, "grad_norm": 1.2884267568588257, "learning_rate": 2.6912751086171518e-05, "loss": 0.1761, "step": 4580 }, { "epoch": 2.120092378752887, "grad_norm": 1.1672310829162598, "learning_rate": 2.681226160726318e-05, "loss": 0.1759, "step": 4590 }, { "epoch": 2.1247113163972284, "grad_norm": 1.2387830018997192, "learning_rate": 2.6711742684189723e-05, "loss": 0.1648, "step": 4600 }, { "epoch": 2.1293302540415704, "grad_norm": 0.9033365845680237, "learning_rate": 2.6611195950101546e-05, "loss": 0.1601, "step": 4610 }, { "epoch": 2.1339491916859123, "grad_norm": 1.491166591644287, "learning_rate": 2.6510623038600914e-05, "loss": 0.1752, "step": 4620 }, { "epoch": 2.138568129330254, "grad_norm": 1.0755062103271484, "learning_rate": 2.6410025583715366e-05, "loss": 0.1629, "step": 4630 }, { "epoch": 2.143187066974596, "grad_norm": 1.2499966621398926, "learning_rate": 2.6309405219871237e-05, "loss": 0.1802, "step": 4640 }, { "epoch": 2.147806004618938, "grad_norm": 1.5925968885421753, "learning_rate": 2.620876358186705e-05, "loss": 0.1939, "step": 4650 }, { "epoch": 2.1524249422632793, "grad_norm": 1.5254716873168945, "learning_rate": 2.6108102304846982e-05, "loss": 0.1653, "step": 4660 }, { "epoch": 2.1570438799076213, "grad_norm": 1.819468379020691, "learning_rate": 2.6007423024274284e-05, "loss": 0.1677, "step": 4670 }, { "epoch": 2.161662817551963, "grad_norm": 1.1221483945846558, "learning_rate": 2.5906727375904705e-05, "loss": 0.1899, "step": 4680 }, { "epoch": 2.1662817551963047, "grad_norm": 1.1592941284179688, "learning_rate": 2.5806016995759946e-05, "loss": 0.1774, "step": 4690 }, { "epoch": 2.1709006928406467, "grad_norm": 1.8313263654708862, "learning_rate": 2.570529352010105e-05, "loss": 0.1959, "step": 4700 }, { "epoch": 2.1755196304849886, "grad_norm": 1.7101258039474487, "learning_rate": 2.560455858540181e-05, "loss": 0.1721, "step": 4710 }, { "epoch": 2.18013856812933, "grad_norm": 1.081250786781311, "learning_rate": 2.5503813828322217e-05, "loss": 0.1679, "step": 4720 }, { "epoch": 2.184757505773672, "grad_norm": 1.0335906744003296, "learning_rate": 2.5403060885681836e-05, "loss": 0.1641, "step": 4730 }, { "epoch": 2.1893764434180136, "grad_norm": 1.797884225845337, "learning_rate": 2.530230139443322e-05, "loss": 0.1896, "step": 4740 }, { "epoch": 2.1939953810623556, "grad_norm": 1.0984156131744385, "learning_rate": 2.5201536991635343e-05, "loss": 0.1748, "step": 4750 }, { "epoch": 2.1986143187066975, "grad_norm": 1.2501264810562134, "learning_rate": 2.5100769314426952e-05, "loss": 0.1806, "step": 4760 }, { "epoch": 2.203233256351039, "grad_norm": 1.6789705753326416, "learning_rate": 2.5e-05, "loss": 0.1755, "step": 4770 }, { "epoch": 2.207852193995381, "grad_norm": 1.7335457801818848, "learning_rate": 2.489923068557306e-05, "loss": 0.1786, "step": 4780 }, { "epoch": 2.212471131639723, "grad_norm": 2.127920627593994, "learning_rate": 2.4798463008364662e-05, "loss": 0.1707, "step": 4790 }, { "epoch": 2.2170900692840645, "grad_norm": 0.9540851712226868, "learning_rate": 2.4697698605566782e-05, "loss": 0.1687, "step": 4800 }, { "epoch": 2.2217090069284064, "grad_norm": 1.4594472646713257, "learning_rate": 2.4596939114318174e-05, "loss": 0.1486, "step": 4810 }, { "epoch": 2.2263279445727484, "grad_norm": 1.913409948348999, "learning_rate": 2.449618617167779e-05, "loss": 0.173, "step": 4820 }, { "epoch": 2.23094688221709, "grad_norm": 1.4003239870071411, "learning_rate": 2.4395441414598198e-05, "loss": 0.1829, "step": 4830 }, { "epoch": 2.235565819861432, "grad_norm": 1.4671051502227783, "learning_rate": 2.4294706479898954e-05, "loss": 0.1826, "step": 4840 }, { "epoch": 2.240184757505774, "grad_norm": 1.9703898429870605, "learning_rate": 2.419398300424006e-05, "loss": 0.1683, "step": 4850 }, { "epoch": 2.2448036951501154, "grad_norm": 2.080009698867798, "learning_rate": 2.40932726240953e-05, "loss": 0.1762, "step": 4860 }, { "epoch": 2.2494226327944573, "grad_norm": 1.1113896369934082, "learning_rate": 2.3992576975725718e-05, "loss": 0.1518, "step": 4870 }, { "epoch": 2.2540415704387993, "grad_norm": 1.418796420097351, "learning_rate": 2.3891897695153024e-05, "loss": 0.194, "step": 4880 }, { "epoch": 2.258660508083141, "grad_norm": 1.3478052616119385, "learning_rate": 2.3791236418132948e-05, "loss": 0.1696, "step": 4890 }, { "epoch": 2.2632794457274827, "grad_norm": 1.2730656862258911, "learning_rate": 2.369059478012877e-05, "loss": 0.1801, "step": 4900 }, { "epoch": 2.2678983833718247, "grad_norm": 2.4409050941467285, "learning_rate": 2.3589974416284647e-05, "loss": 0.1936, "step": 4910 }, { "epoch": 2.272517321016166, "grad_norm": 2.189056158065796, "learning_rate": 2.3489376961399095e-05, "loss": 0.1859, "step": 4920 }, { "epoch": 2.277136258660508, "grad_norm": 2.0005669593811035, "learning_rate": 2.3388804049898457e-05, "loss": 0.1906, "step": 4930 }, { "epoch": 2.28175519630485, "grad_norm": 1.375228762626648, "learning_rate": 2.328825731581028e-05, "loss": 0.1682, "step": 4940 }, { "epoch": 2.2863741339491916, "grad_norm": 1.6800892353057861, "learning_rate": 2.318773839273683e-05, "loss": 0.1819, "step": 4950 }, { "epoch": 2.2909930715935336, "grad_norm": 1.2466888427734375, "learning_rate": 2.3087248913828498e-05, "loss": 0.1767, "step": 4960 }, { "epoch": 2.295612009237875, "grad_norm": 1.337326169013977, "learning_rate": 2.298679051175728e-05, "loss": 0.1965, "step": 4970 }, { "epoch": 2.300230946882217, "grad_norm": 1.4556164741516113, "learning_rate": 2.2886364818690313e-05, "loss": 0.1801, "step": 4980 }, { "epoch": 2.304849884526559, "grad_norm": 1.534479022026062, "learning_rate": 2.2785973466263246e-05, "loss": 0.1625, "step": 4990 }, { "epoch": 2.3094688221709005, "grad_norm": 1.1505579948425293, "learning_rate": 2.268561808555382e-05, "loss": 0.1913, "step": 5000 }, { "epoch": 2.3140877598152425, "grad_norm": 2.6239471435546875, "learning_rate": 2.2585300307055337e-05, "loss": 0.1758, "step": 5010 }, { "epoch": 2.3187066974595845, "grad_norm": 1.6162166595458984, "learning_rate": 2.248502176065015e-05, "loss": 0.1712, "step": 5020 }, { "epoch": 2.323325635103926, "grad_norm": 1.3041070699691772, "learning_rate": 2.2384784075583226e-05, "loss": 0.1672, "step": 5030 }, { "epoch": 2.327944572748268, "grad_norm": 1.069870948791504, "learning_rate": 2.2284588880435628e-05, "loss": 0.1692, "step": 5040 }, { "epoch": 2.3325635103926095, "grad_norm": 2.735729694366455, "learning_rate": 2.2184437803098082e-05, "loss": 0.1899, "step": 5050 }, { "epoch": 2.3371824480369514, "grad_norm": 2.4670474529266357, "learning_rate": 2.208433247074454e-05, "loss": 0.1817, "step": 5060 }, { "epoch": 2.3418013856812934, "grad_norm": 1.4816709756851196, "learning_rate": 2.1984274509805696e-05, "loss": 0.1878, "step": 5070 }, { "epoch": 2.346420323325635, "grad_norm": 1.6306219100952148, "learning_rate": 2.188426554594263e-05, "loss": 0.1712, "step": 5080 }, { "epoch": 2.351039260969977, "grad_norm": 1.3102030754089355, "learning_rate": 2.178430720402032e-05, "loss": 0.1732, "step": 5090 }, { "epoch": 2.355658198614319, "grad_norm": 1.4050968885421753, "learning_rate": 2.1684401108081294e-05, "loss": 0.1753, "step": 5100 }, { "epoch": 2.3602771362586603, "grad_norm": 1.079567551612854, "learning_rate": 2.1584548881319236e-05, "loss": 0.1745, "step": 5110 }, { "epoch": 2.3648960739030023, "grad_norm": 1.821539044380188, "learning_rate": 2.1484752146052607e-05, "loss": 0.191, "step": 5120 }, { "epoch": 2.3695150115473442, "grad_norm": 1.4591292142868042, "learning_rate": 2.1385012523698268e-05, "loss": 0.1744, "step": 5130 }, { "epoch": 2.3741339491916857, "grad_norm": 1.778499960899353, "learning_rate": 2.1285331634745182e-05, "loss": 0.154, "step": 5140 }, { "epoch": 2.3787528868360277, "grad_norm": 1.7727078199386597, "learning_rate": 2.1185711098728036e-05, "loss": 0.1655, "step": 5150 }, { "epoch": 2.3833718244803697, "grad_norm": 1.6329830884933472, "learning_rate": 2.1086152534200973e-05, "loss": 0.2027, "step": 5160 }, { "epoch": 2.387990762124711, "grad_norm": 1.0071061849594116, "learning_rate": 2.098665755871125e-05, "loss": 0.1696, "step": 5170 }, { "epoch": 2.392609699769053, "grad_norm": 1.3714135885238647, "learning_rate": 2.0887227788772994e-05, "loss": 0.1713, "step": 5180 }, { "epoch": 2.397228637413395, "grad_norm": 2.148244857788086, "learning_rate": 2.0787864839840936e-05, "loss": 0.1718, "step": 5190 }, { "epoch": 2.4018475750577366, "grad_norm": 1.1608026027679443, "learning_rate": 2.0688570326284117e-05, "loss": 0.173, "step": 5200 }, { "epoch": 2.4064665127020786, "grad_norm": 0.9221383333206177, "learning_rate": 2.0589345861359743e-05, "loss": 0.1871, "step": 5210 }, { "epoch": 2.4110854503464205, "grad_norm": 1.5555343627929688, "learning_rate": 2.0490193057186887e-05, "loss": 0.1927, "step": 5220 }, { "epoch": 2.415704387990762, "grad_norm": 1.48004150390625, "learning_rate": 2.039111352472035e-05, "loss": 0.1945, "step": 5230 }, { "epoch": 2.420323325635104, "grad_norm": 1.2729753255844116, "learning_rate": 2.029210887372448e-05, "loss": 0.179, "step": 5240 }, { "epoch": 2.424942263279446, "grad_norm": 1.5722098350524902, "learning_rate": 2.0193180712747007e-05, "loss": 0.1688, "step": 5250 }, { "epoch": 2.4295612009237875, "grad_norm": 1.3203589916229248, "learning_rate": 2.0094330649092895e-05, "loss": 0.1745, "step": 5260 }, { "epoch": 2.4341801385681294, "grad_norm": 1.1365830898284912, "learning_rate": 1.9995560288798285e-05, "loss": 0.1618, "step": 5270 }, { "epoch": 2.438799076212471, "grad_norm": 1.3933275938034058, "learning_rate": 1.9896871236604315e-05, "loss": 0.1619, "step": 5280 }, { "epoch": 2.443418013856813, "grad_norm": 1.4565578699111938, "learning_rate": 1.9798265095931137e-05, "loss": 0.1758, "step": 5290 }, { "epoch": 2.448036951501155, "grad_norm": 1.2321804761886597, "learning_rate": 1.9699743468851803e-05, "loss": 0.1741, "step": 5300 }, { "epoch": 2.4526558891454964, "grad_norm": 1.397826910018921, "learning_rate": 1.960130795606624e-05, "loss": 0.1714, "step": 5310 }, { "epoch": 2.4572748267898383, "grad_norm": 1.3442643880844116, "learning_rate": 1.9502960156875306e-05, "loss": 0.1753, "step": 5320 }, { "epoch": 2.4618937644341803, "grad_norm": 1.2074075937271118, "learning_rate": 1.9404701669154704e-05, "loss": 0.181, "step": 5330 }, { "epoch": 2.466512702078522, "grad_norm": 1.942808985710144, "learning_rate": 1.9306534089329123e-05, "loss": 0.1834, "step": 5340 }, { "epoch": 2.4711316397228638, "grad_norm": 1.6311671733856201, "learning_rate": 1.920845901234622e-05, "loss": 0.1689, "step": 5350 }, { "epoch": 2.4757505773672057, "grad_norm": 1.7184101343154907, "learning_rate": 1.9110478031650754e-05, "loss": 0.1763, "step": 5360 }, { "epoch": 2.4803695150115472, "grad_norm": 1.3892570734024048, "learning_rate": 1.9012592739158703e-05, "loss": 0.1674, "step": 5370 }, { "epoch": 2.484988452655889, "grad_norm": 2.845325469970703, "learning_rate": 1.8914804725231346e-05, "loss": 0.1691, "step": 5380 }, { "epoch": 2.4896073903002307, "grad_norm": 1.4539880752563477, "learning_rate": 1.881711557864946e-05, "loss": 0.1683, "step": 5390 }, { "epoch": 2.4942263279445727, "grad_norm": 1.1928037405014038, "learning_rate": 1.8719526886587547e-05, "loss": 0.1617, "step": 5400 }, { "epoch": 2.4988452655889146, "grad_norm": 1.1701024770736694, "learning_rate": 1.862204023458795e-05, "loss": 0.1724, "step": 5410 }, { "epoch": 2.503464203233256, "grad_norm": 1.09821617603302, "learning_rate": 1.8524657206535193e-05, "loss": 0.1799, "step": 5420 }, { "epoch": 2.508083140877598, "grad_norm": 1.388356328010559, "learning_rate": 1.842737938463018e-05, "loss": 0.1736, "step": 5430 }, { "epoch": 2.51270207852194, "grad_norm": 1.0940160751342773, "learning_rate": 1.833020834936449e-05, "loss": 0.1767, "step": 5440 }, { "epoch": 2.5173210161662816, "grad_norm": 1.4465347528457642, "learning_rate": 1.823314567949478e-05, "loss": 0.1697, "step": 5450 }, { "epoch": 2.5219399538106235, "grad_norm": 3.767317295074463, "learning_rate": 1.8136192952016995e-05, "loss": 0.1747, "step": 5460 }, { "epoch": 2.5265588914549655, "grad_norm": 1.304645299911499, "learning_rate": 1.8039351742140895e-05, "loss": 0.1887, "step": 5470 }, { "epoch": 2.531177829099307, "grad_norm": 1.7054566144943237, "learning_rate": 1.7942623623264355e-05, "loss": 0.1779, "step": 5480 }, { "epoch": 2.535796766743649, "grad_norm": 0.9667575359344482, "learning_rate": 1.7846010166947825e-05, "loss": 0.1539, "step": 5490 }, { "epoch": 2.540415704387991, "grad_norm": 2.093245267868042, "learning_rate": 1.7749512942888857e-05, "loss": 0.1768, "step": 5500 } ], "logging_steps": 10, "max_steps": 8660, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.512819164006318e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }