{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9600156801254411, "eval_steps": 2000, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 12.522311210632324, "learning_rate": 5.0000000000000004e-08, "loss": 1.3062, "step": 10 }, { "epoch": 0.0, "grad_norm": 12.255845069885254, "learning_rate": 1.0000000000000001e-07, "loss": 1.3002, "step": 20 }, { "epoch": 0.0, "grad_norm": 11.711555480957031, "learning_rate": 1.5000000000000002e-07, "loss": 1.2878, "step": 30 }, { "epoch": 0.0, "grad_norm": 11.784612655639648, "learning_rate": 2.0000000000000002e-07, "loss": 1.2743, "step": 40 }, { "epoch": 0.0, "grad_norm": 11.679885864257812, "learning_rate": 2.5000000000000004e-07, "loss": 1.2406, "step": 50 }, { "epoch": 0.01, "grad_norm": 12.24442195892334, "learning_rate": 3.0000000000000004e-07, "loss": 1.2251, "step": 60 }, { "epoch": 0.01, "grad_norm": 10.110812187194824, "learning_rate": 3.5000000000000004e-07, "loss": 1.1775, "step": 70 }, { "epoch": 0.01, "grad_norm": 10.462944030761719, "learning_rate": 4.0000000000000003e-07, "loss": 1.163, "step": 80 }, { "epoch": 0.01, "grad_norm": 9.963655471801758, "learning_rate": 4.5000000000000003e-07, "loss": 1.1576, "step": 90 }, { "epoch": 0.01, "grad_norm": 9.657032012939453, "learning_rate": 5.000000000000001e-07, "loss": 1.1238, "step": 100 }, { "epoch": 0.01, "grad_norm": 9.575132369995117, "learning_rate": 5.5e-07, "loss": 1.1223, "step": 110 }, { "epoch": 0.01, "grad_norm": 10.12923812866211, "learning_rate": 6.000000000000001e-07, "loss": 1.1058, "step": 120 }, { "epoch": 0.01, "grad_norm": 9.709489822387695, "learning_rate": 6.5e-07, "loss": 1.1188, "step": 130 }, { "epoch": 0.01, "grad_norm": 9.579288482666016, "learning_rate": 7.000000000000001e-07, "loss": 1.1072, "step": 140 }, { "epoch": 0.01, "grad_norm": 9.753271102905273, "learning_rate": 7.5e-07, "loss": 1.0949, "step": 150 }, { "epoch": 0.02, "grad_norm": 8.8035249710083, "learning_rate": 8.000000000000001e-07, "loss": 1.0845, "step": 160 }, { "epoch": 0.02, "grad_norm": 8.80864143371582, "learning_rate": 8.500000000000001e-07, "loss": 1.0626, "step": 170 }, { "epoch": 0.02, "grad_norm": 9.8696928024292, "learning_rate": 9.000000000000001e-07, "loss": 1.0701, "step": 180 }, { "epoch": 0.02, "grad_norm": 10.316693305969238, "learning_rate": 9.500000000000001e-07, "loss": 1.0462, "step": 190 }, { "epoch": 0.02, "grad_norm": 10.428446769714355, "learning_rate": 1.0000000000000002e-06, "loss": 1.0489, "step": 200 }, { "epoch": 0.02, "grad_norm": 9.067750930786133, "learning_rate": 1.0500000000000001e-06, "loss": 1.0684, "step": 210 }, { "epoch": 0.02, "grad_norm": 8.946035385131836, "learning_rate": 1.1e-06, "loss": 1.0563, "step": 220 }, { "epoch": 0.02, "grad_norm": 9.240349769592285, "learning_rate": 1.1500000000000002e-06, "loss": 1.057, "step": 230 }, { "epoch": 0.02, "grad_norm": 9.197181701660156, "learning_rate": 1.2000000000000002e-06, "loss": 1.0714, "step": 240 }, { "epoch": 0.02, "grad_norm": 9.713706970214844, "learning_rate": 1.25e-06, "loss": 1.0558, "step": 250 }, { "epoch": 0.03, "grad_norm": 10.378229141235352, "learning_rate": 1.3e-06, "loss": 1.0454, "step": 260 }, { "epoch": 0.03, "grad_norm": 8.882588386535645, "learning_rate": 1.3500000000000002e-06, "loss": 1.06, "step": 270 }, { "epoch": 0.03, "grad_norm": 9.052627563476562, "learning_rate": 1.4000000000000001e-06, "loss": 1.0572, "step": 280 }, { "epoch": 0.03, "grad_norm": 9.072053909301758, "learning_rate": 1.45e-06, "loss": 1.0383, "step": 290 }, { "epoch": 0.03, "grad_norm": 9.55165958404541, "learning_rate": 1.5e-06, "loss": 1.0455, "step": 300 }, { "epoch": 0.03, "grad_norm": 9.083109855651855, "learning_rate": 1.5500000000000002e-06, "loss": 1.0424, "step": 310 }, { "epoch": 0.03, "grad_norm": 8.25512409210205, "learning_rate": 1.6000000000000001e-06, "loss": 1.037, "step": 320 }, { "epoch": 0.03, "grad_norm": 9.08288288116455, "learning_rate": 1.6500000000000003e-06, "loss": 1.0354, "step": 330 }, { "epoch": 0.03, "grad_norm": 9.528592109680176, "learning_rate": 1.7000000000000002e-06, "loss": 1.0445, "step": 340 }, { "epoch": 0.03, "grad_norm": 9.586995124816895, "learning_rate": 1.75e-06, "loss": 1.016, "step": 350 }, { "epoch": 0.04, "grad_norm": 8.11673641204834, "learning_rate": 1.8000000000000001e-06, "loss": 1.0263, "step": 360 }, { "epoch": 0.04, "grad_norm": 9.928458213806152, "learning_rate": 1.85e-06, "loss": 1.0276, "step": 370 }, { "epoch": 0.04, "grad_norm": 10.672739028930664, "learning_rate": 1.9000000000000002e-06, "loss": 1.0289, "step": 380 }, { "epoch": 0.04, "grad_norm": 9.744338035583496, "learning_rate": 1.9500000000000004e-06, "loss": 1.0139, "step": 390 }, { "epoch": 0.04, "grad_norm": 10.300024032592773, "learning_rate": 2.0000000000000003e-06, "loss": 1.0247, "step": 400 }, { "epoch": 0.04, "grad_norm": 9.249749183654785, "learning_rate": 2.05e-06, "loss": 1.0292, "step": 410 }, { "epoch": 0.04, "grad_norm": 9.111607551574707, "learning_rate": 2.1000000000000002e-06, "loss": 1.0462, "step": 420 }, { "epoch": 0.04, "grad_norm": 7.269195079803467, "learning_rate": 2.15e-06, "loss": 1.0606, "step": 430 }, { "epoch": 0.04, "grad_norm": 9.545472145080566, "learning_rate": 2.2e-06, "loss": 1.03, "step": 440 }, { "epoch": 0.04, "grad_norm": 9.686765670776367, "learning_rate": 2.25e-06, "loss": 1.0344, "step": 450 }, { "epoch": 0.05, "grad_norm": 8.875536918640137, "learning_rate": 2.3000000000000004e-06, "loss": 1.0197, "step": 460 }, { "epoch": 0.05, "grad_norm": 8.848780632019043, "learning_rate": 2.35e-06, "loss": 1.0108, "step": 470 }, { "epoch": 0.05, "grad_norm": 9.135149002075195, "learning_rate": 2.4000000000000003e-06, "loss": 1.0185, "step": 480 }, { "epoch": 0.05, "grad_norm": 9.894800186157227, "learning_rate": 2.4500000000000003e-06, "loss": 1.0339, "step": 490 }, { "epoch": 0.05, "grad_norm": 8.37612247467041, "learning_rate": 2.5e-06, "loss": 1.037, "step": 500 }, { "epoch": 0.05, "grad_norm": 10.130329132080078, "learning_rate": 2.55e-06, "loss": 1.0343, "step": 510 }, { "epoch": 0.05, "grad_norm": 8.802632331848145, "learning_rate": 2.6e-06, "loss": 1.031, "step": 520 }, { "epoch": 0.05, "grad_norm": 9.689617156982422, "learning_rate": 2.6500000000000005e-06, "loss": 1.0311, "step": 530 }, { "epoch": 0.05, "grad_norm": 8.675555229187012, "learning_rate": 2.7000000000000004e-06, "loss": 1.0218, "step": 540 }, { "epoch": 0.05, "grad_norm": 8.797578811645508, "learning_rate": 2.7500000000000004e-06, "loss": 1.0331, "step": 550 }, { "epoch": 0.05, "grad_norm": 8.753621101379395, "learning_rate": 2.8000000000000003e-06, "loss": 1.021, "step": 560 }, { "epoch": 0.06, "grad_norm": 9.217890739440918, "learning_rate": 2.85e-06, "loss": 1.0209, "step": 570 }, { "epoch": 0.06, "grad_norm": 9.160318374633789, "learning_rate": 2.9e-06, "loss": 1.0523, "step": 580 }, { "epoch": 0.06, "grad_norm": 9.489140510559082, "learning_rate": 2.95e-06, "loss": 1.0141, "step": 590 }, { "epoch": 0.06, "grad_norm": 8.357597351074219, "learning_rate": 3e-06, "loss": 1.0072, "step": 600 }, { "epoch": 0.06, "grad_norm": 9.924766540527344, "learning_rate": 3.05e-06, "loss": 1.0252, "step": 610 }, { "epoch": 0.06, "grad_norm": 9.269017219543457, "learning_rate": 3.1000000000000004e-06, "loss": 1.024, "step": 620 }, { "epoch": 0.06, "grad_norm": 7.910837650299072, "learning_rate": 3.1500000000000003e-06, "loss": 1.0279, "step": 630 }, { "epoch": 0.06, "grad_norm": 8.93685531616211, "learning_rate": 3.2000000000000003e-06, "loss": 0.9883, "step": 640 }, { "epoch": 0.06, "grad_norm": 8.706589698791504, "learning_rate": 3.2500000000000002e-06, "loss": 1.0258, "step": 650 }, { "epoch": 0.06, "grad_norm": 8.293551445007324, "learning_rate": 3.3000000000000006e-06, "loss": 1.006, "step": 660 }, { "epoch": 0.07, "grad_norm": 9.94530963897705, "learning_rate": 3.3500000000000005e-06, "loss": 0.9923, "step": 670 }, { "epoch": 0.07, "grad_norm": 9.74544620513916, "learning_rate": 3.4000000000000005e-06, "loss": 1.0059, "step": 680 }, { "epoch": 0.07, "grad_norm": 8.988492965698242, "learning_rate": 3.45e-06, "loss": 1.001, "step": 690 }, { "epoch": 0.07, "grad_norm": 7.93975830078125, "learning_rate": 3.5e-06, "loss": 1.0035, "step": 700 }, { "epoch": 0.07, "grad_norm": 7.531262397766113, "learning_rate": 3.5500000000000003e-06, "loss": 1.012, "step": 710 }, { "epoch": 0.07, "grad_norm": 8.432524681091309, "learning_rate": 3.6000000000000003e-06, "loss": 1.0136, "step": 720 }, { "epoch": 0.07, "grad_norm": 9.21248722076416, "learning_rate": 3.65e-06, "loss": 1.0151, "step": 730 }, { "epoch": 0.07, "grad_norm": 10.034098625183105, "learning_rate": 3.7e-06, "loss": 1.0189, "step": 740 }, { "epoch": 0.07, "grad_norm": 7.43057918548584, "learning_rate": 3.7500000000000005e-06, "loss": 1.0421, "step": 750 }, { "epoch": 0.07, "grad_norm": 8.48992919921875, "learning_rate": 3.8000000000000005e-06, "loss": 1.0228, "step": 760 }, { "epoch": 0.08, "grad_norm": 7.472344875335693, "learning_rate": 3.85e-06, "loss": 1.0208, "step": 770 }, { "epoch": 0.08, "grad_norm": 8.405960083007812, "learning_rate": 3.900000000000001e-06, "loss": 1.0283, "step": 780 }, { "epoch": 0.08, "grad_norm": 7.958547115325928, "learning_rate": 3.95e-06, "loss": 1.0215, "step": 790 }, { "epoch": 0.08, "grad_norm": 8.395108222961426, "learning_rate": 4.000000000000001e-06, "loss": 1.0107, "step": 800 }, { "epoch": 0.08, "grad_norm": 8.254814147949219, "learning_rate": 4.05e-06, "loss": 1.0099, "step": 810 }, { "epoch": 0.08, "grad_norm": 8.988401412963867, "learning_rate": 4.1e-06, "loss": 1.0074, "step": 820 }, { "epoch": 0.08, "grad_norm": 9.366766929626465, "learning_rate": 4.15e-06, "loss": 1.0068, "step": 830 }, { "epoch": 0.08, "grad_norm": 9.443115234375, "learning_rate": 4.2000000000000004e-06, "loss": 1.0083, "step": 840 }, { "epoch": 0.08, "grad_norm": 8.349184036254883, "learning_rate": 4.25e-06, "loss": 1.0083, "step": 850 }, { "epoch": 0.08, "grad_norm": 8.714761734008789, "learning_rate": 4.3e-06, "loss": 1.0244, "step": 860 }, { "epoch": 0.09, "grad_norm": 8.16286849975586, "learning_rate": 4.350000000000001e-06, "loss": 1.0219, "step": 870 }, { "epoch": 0.09, "grad_norm": 9.875529289245605, "learning_rate": 4.4e-06, "loss": 1.0139, "step": 880 }, { "epoch": 0.09, "grad_norm": 8.1596040725708, "learning_rate": 4.450000000000001e-06, "loss": 1.0092, "step": 890 }, { "epoch": 0.09, "grad_norm": 8.961359977722168, "learning_rate": 4.5e-06, "loss": 1.0187, "step": 900 }, { "epoch": 0.09, "grad_norm": 9.80009937286377, "learning_rate": 4.5500000000000005e-06, "loss": 0.9997, "step": 910 }, { "epoch": 0.09, "grad_norm": 9.042901992797852, "learning_rate": 4.600000000000001e-06, "loss": 1.013, "step": 920 }, { "epoch": 0.09, "grad_norm": 6.883285999298096, "learning_rate": 4.65e-06, "loss": 1.0249, "step": 930 }, { "epoch": 0.09, "grad_norm": 8.24597454071045, "learning_rate": 4.7e-06, "loss": 1.0207, "step": 940 }, { "epoch": 0.09, "grad_norm": 8.265912055969238, "learning_rate": 4.75e-06, "loss": 1.0196, "step": 950 }, { "epoch": 0.09, "grad_norm": 8.990100860595703, "learning_rate": 4.800000000000001e-06, "loss": 0.9873, "step": 960 }, { "epoch": 0.1, "grad_norm": 8.091612815856934, "learning_rate": 4.85e-06, "loss": 1.0149, "step": 970 }, { "epoch": 0.1, "grad_norm": 7.061271667480469, "learning_rate": 4.9000000000000005e-06, "loss": 1.0504, "step": 980 }, { "epoch": 0.1, "grad_norm": 10.124354362487793, "learning_rate": 4.95e-06, "loss": 1.0274, "step": 990 }, { "epoch": 0.1, "grad_norm": 8.14189338684082, "learning_rate": 5e-06, "loss": 1.0056, "step": 1000 }, { "epoch": 0.1, "grad_norm": 10.074453353881836, "learning_rate": 5.050000000000001e-06, "loss": 1.0166, "step": 1010 }, { "epoch": 0.1, "grad_norm": 8.120701789855957, "learning_rate": 5.1e-06, "loss": 1.0096, "step": 1020 }, { "epoch": 0.1, "grad_norm": 8.863253593444824, "learning_rate": 5.150000000000001e-06, "loss": 1.0285, "step": 1030 }, { "epoch": 0.1, "grad_norm": 7.489706039428711, "learning_rate": 5.2e-06, "loss": 1.0071, "step": 1040 }, { "epoch": 0.1, "grad_norm": 10.268345832824707, "learning_rate": 5.2500000000000006e-06, "loss": 1.0018, "step": 1050 }, { "epoch": 0.1, "grad_norm": 8.715660095214844, "learning_rate": 5.300000000000001e-06, "loss": 1.0169, "step": 1060 }, { "epoch": 0.1, "grad_norm": 7.419419288635254, "learning_rate": 5.3500000000000004e-06, "loss": 1.0196, "step": 1070 }, { "epoch": 0.11, "grad_norm": 9.169241905212402, "learning_rate": 5.400000000000001e-06, "loss": 1.0153, "step": 1080 }, { "epoch": 0.11, "grad_norm": 8.258715629577637, "learning_rate": 5.450000000000001e-06, "loss": 1.029, "step": 1090 }, { "epoch": 0.11, "grad_norm": 7.860774040222168, "learning_rate": 5.500000000000001e-06, "loss": 1.0165, "step": 1100 }, { "epoch": 0.11, "grad_norm": 7.660974979400635, "learning_rate": 5.550000000000001e-06, "loss": 1.0318, "step": 1110 }, { "epoch": 0.11, "grad_norm": 9.192011833190918, "learning_rate": 5.600000000000001e-06, "loss": 0.9942, "step": 1120 }, { "epoch": 0.11, "grad_norm": 7.8968682289123535, "learning_rate": 5.65e-06, "loss": 1.0466, "step": 1130 }, { "epoch": 0.11, "grad_norm": 8.00294017791748, "learning_rate": 5.7e-06, "loss": 1.0199, "step": 1140 }, { "epoch": 0.11, "grad_norm": 8.527581214904785, "learning_rate": 5.75e-06, "loss": 1.0074, "step": 1150 }, { "epoch": 0.11, "grad_norm": 8.593584060668945, "learning_rate": 5.8e-06, "loss": 1.0233, "step": 1160 }, { "epoch": 0.11, "grad_norm": 7.406680583953857, "learning_rate": 5.85e-06, "loss": 1.0095, "step": 1170 }, { "epoch": 0.12, "grad_norm": 7.880866527557373, "learning_rate": 5.9e-06, "loss": 1.0101, "step": 1180 }, { "epoch": 0.12, "grad_norm": 9.04269027709961, "learning_rate": 5.950000000000001e-06, "loss": 0.9932, "step": 1190 }, { "epoch": 0.12, "grad_norm": 8.32613754272461, "learning_rate": 6e-06, "loss": 1.0136, "step": 1200 }, { "epoch": 0.12, "grad_norm": 8.852860450744629, "learning_rate": 6.0500000000000005e-06, "loss": 1.0079, "step": 1210 }, { "epoch": 0.12, "grad_norm": 9.952341079711914, "learning_rate": 6.1e-06, "loss": 1.0195, "step": 1220 }, { "epoch": 0.12, "grad_norm": 8.150203704833984, "learning_rate": 6.15e-06, "loss": 1.0325, "step": 1230 }, { "epoch": 0.12, "grad_norm": 8.488512992858887, "learning_rate": 6.200000000000001e-06, "loss": 1.0328, "step": 1240 }, { "epoch": 0.12, "grad_norm": 9.25046443939209, "learning_rate": 6.25e-06, "loss": 1.0337, "step": 1250 }, { "epoch": 0.12, "grad_norm": 7.550364017486572, "learning_rate": 6.300000000000001e-06, "loss": 1.0036, "step": 1260 }, { "epoch": 0.12, "grad_norm": 8.465344429016113, "learning_rate": 6.35e-06, "loss": 0.9879, "step": 1270 }, { "epoch": 0.13, "grad_norm": 7.861695289611816, "learning_rate": 6.4000000000000006e-06, "loss": 1.0286, "step": 1280 }, { "epoch": 0.13, "grad_norm": 7.540966510772705, "learning_rate": 6.450000000000001e-06, "loss": 1.0092, "step": 1290 }, { "epoch": 0.13, "grad_norm": 6.81091833114624, "learning_rate": 6.5000000000000004e-06, "loss": 1.0253, "step": 1300 }, { "epoch": 0.13, "grad_norm": 7.465524196624756, "learning_rate": 6.550000000000001e-06, "loss": 1.0052, "step": 1310 }, { "epoch": 0.13, "grad_norm": 9.11961841583252, "learning_rate": 6.600000000000001e-06, "loss": 1.009, "step": 1320 }, { "epoch": 0.13, "grad_norm": 8.388483047485352, "learning_rate": 6.650000000000001e-06, "loss": 1.0233, "step": 1330 }, { "epoch": 0.13, "grad_norm": 8.0752592086792, "learning_rate": 6.700000000000001e-06, "loss": 1.005, "step": 1340 }, { "epoch": 0.13, "grad_norm": 9.166247367858887, "learning_rate": 6.750000000000001e-06, "loss": 1.0059, "step": 1350 }, { "epoch": 0.13, "grad_norm": 8.830463409423828, "learning_rate": 6.800000000000001e-06, "loss": 1.0531, "step": 1360 }, { "epoch": 0.13, "grad_norm": 8.8436279296875, "learning_rate": 6.850000000000001e-06, "loss": 1.0251, "step": 1370 }, { "epoch": 0.14, "grad_norm": 8.408062934875488, "learning_rate": 6.9e-06, "loss": 1.0032, "step": 1380 }, { "epoch": 0.14, "grad_norm": 7.358797550201416, "learning_rate": 6.95e-06, "loss": 1.0303, "step": 1390 }, { "epoch": 0.14, "grad_norm": 7.57014799118042, "learning_rate": 7e-06, "loss": 1.0392, "step": 1400 }, { "epoch": 0.14, "grad_norm": 7.598930835723877, "learning_rate": 7.05e-06, "loss": 1.0227, "step": 1410 }, { "epoch": 0.14, "grad_norm": 7.534142971038818, "learning_rate": 7.100000000000001e-06, "loss": 1.0232, "step": 1420 }, { "epoch": 0.14, "grad_norm": 8.777505874633789, "learning_rate": 7.15e-06, "loss": 1.0289, "step": 1430 }, { "epoch": 0.14, "grad_norm": 7.477867126464844, "learning_rate": 7.2000000000000005e-06, "loss": 1.002, "step": 1440 }, { "epoch": 0.14, "grad_norm": 7.299503803253174, "learning_rate": 7.25e-06, "loss": 1.0387, "step": 1450 }, { "epoch": 0.14, "grad_norm": 8.296052932739258, "learning_rate": 7.3e-06, "loss": 1.0008, "step": 1460 }, { "epoch": 0.14, "grad_norm": 7.58913516998291, "learning_rate": 7.350000000000001e-06, "loss": 1.0297, "step": 1470 }, { "epoch": 0.15, "grad_norm": 7.368912220001221, "learning_rate": 7.4e-06, "loss": 1.0179, "step": 1480 }, { "epoch": 0.15, "grad_norm": 7.106993198394775, "learning_rate": 7.450000000000001e-06, "loss": 1.0168, "step": 1490 }, { "epoch": 0.15, "grad_norm": 8.242332458496094, "learning_rate": 7.500000000000001e-06, "loss": 1.0184, "step": 1500 }, { "epoch": 0.15, "grad_norm": 7.458347320556641, "learning_rate": 7.5500000000000006e-06, "loss": 1.0102, "step": 1510 }, { "epoch": 0.15, "grad_norm": 13.338580131530762, "learning_rate": 7.600000000000001e-06, "loss": 1.0061, "step": 1520 }, { "epoch": 0.15, "grad_norm": 8.80572509765625, "learning_rate": 7.650000000000001e-06, "loss": 1.0426, "step": 1530 }, { "epoch": 0.15, "grad_norm": 8.227895736694336, "learning_rate": 7.7e-06, "loss": 1.0239, "step": 1540 }, { "epoch": 0.15, "grad_norm": 8.583667755126953, "learning_rate": 7.75e-06, "loss": 1.0222, "step": 1550 }, { "epoch": 0.15, "grad_norm": 8.334319114685059, "learning_rate": 7.800000000000002e-06, "loss": 0.9859, "step": 1560 }, { "epoch": 0.15, "grad_norm": 7.69195032119751, "learning_rate": 7.850000000000001e-06, "loss": 1.0157, "step": 1570 }, { "epoch": 0.15, "grad_norm": 8.934256553649902, "learning_rate": 7.9e-06, "loss": 1.0305, "step": 1580 }, { "epoch": 0.16, "grad_norm": 8.637744903564453, "learning_rate": 7.950000000000002e-06, "loss": 1.0345, "step": 1590 }, { "epoch": 0.16, "grad_norm": 7.4502787590026855, "learning_rate": 8.000000000000001e-06, "loss": 1.0105, "step": 1600 }, { "epoch": 0.16, "grad_norm": 7.652475833892822, "learning_rate": 8.050000000000001e-06, "loss": 1.0315, "step": 1610 }, { "epoch": 0.16, "grad_norm": 7.198962211608887, "learning_rate": 8.1e-06, "loss": 1.0213, "step": 1620 }, { "epoch": 0.16, "grad_norm": 9.004481315612793, "learning_rate": 8.15e-06, "loss": 1.0336, "step": 1630 }, { "epoch": 0.16, "grad_norm": 8.093441009521484, "learning_rate": 8.2e-06, "loss": 1.0263, "step": 1640 }, { "epoch": 0.16, "grad_norm": 7.775898456573486, "learning_rate": 8.25e-06, "loss": 1.0028, "step": 1650 }, { "epoch": 0.16, "grad_norm": 7.741401672363281, "learning_rate": 8.3e-06, "loss": 0.9973, "step": 1660 }, { "epoch": 0.16, "grad_norm": 8.203429222106934, "learning_rate": 8.35e-06, "loss": 1.0372, "step": 1670 }, { "epoch": 0.16, "grad_norm": 8.2229642868042, "learning_rate": 8.400000000000001e-06, "loss": 1.0308, "step": 1680 }, { "epoch": 0.17, "grad_norm": 7.984185695648193, "learning_rate": 8.45e-06, "loss": 1.0187, "step": 1690 }, { "epoch": 0.17, "grad_norm": 7.575095176696777, "learning_rate": 8.5e-06, "loss": 1.0283, "step": 1700 }, { "epoch": 0.17, "grad_norm": 7.937477111816406, "learning_rate": 8.550000000000001e-06, "loss": 0.9848, "step": 1710 }, { "epoch": 0.17, "grad_norm": 7.414380073547363, "learning_rate": 8.6e-06, "loss": 0.9992, "step": 1720 }, { "epoch": 0.17, "grad_norm": 7.995779037475586, "learning_rate": 8.65e-06, "loss": 1.0512, "step": 1730 }, { "epoch": 0.17, "grad_norm": 7.2947492599487305, "learning_rate": 8.700000000000001e-06, "loss": 1.0218, "step": 1740 }, { "epoch": 0.17, "grad_norm": 6.746693134307861, "learning_rate": 8.750000000000001e-06, "loss": 1.0248, "step": 1750 }, { "epoch": 0.17, "grad_norm": 8.890190124511719, "learning_rate": 8.8e-06, "loss": 1.0318, "step": 1760 }, { "epoch": 0.17, "grad_norm": 8.689223289489746, "learning_rate": 8.85e-06, "loss": 1.0131, "step": 1770 }, { "epoch": 0.17, "grad_norm": 7.382048606872559, "learning_rate": 8.900000000000001e-06, "loss": 1.0371, "step": 1780 }, { "epoch": 0.18, "grad_norm": 8.549667358398438, "learning_rate": 8.95e-06, "loss": 1.0021, "step": 1790 }, { "epoch": 0.18, "grad_norm": 8.157551765441895, "learning_rate": 9e-06, "loss": 1.0355, "step": 1800 }, { "epoch": 0.18, "grad_norm": 7.85901403427124, "learning_rate": 9.050000000000001e-06, "loss": 1.0185, "step": 1810 }, { "epoch": 0.18, "grad_norm": 7.107308387756348, "learning_rate": 9.100000000000001e-06, "loss": 1.0254, "step": 1820 }, { "epoch": 0.18, "grad_norm": 7.245466709136963, "learning_rate": 9.15e-06, "loss": 1.0095, "step": 1830 }, { "epoch": 0.18, "grad_norm": 6.624356269836426, "learning_rate": 9.200000000000002e-06, "loss": 1.0432, "step": 1840 }, { "epoch": 0.18, "grad_norm": 7.3254899978637695, "learning_rate": 9.250000000000001e-06, "loss": 1.0328, "step": 1850 }, { "epoch": 0.18, "grad_norm": 7.241863250732422, "learning_rate": 9.3e-06, "loss": 1.0346, "step": 1860 }, { "epoch": 0.18, "grad_norm": 7.163956642150879, "learning_rate": 9.350000000000002e-06, "loss": 1.0272, "step": 1870 }, { "epoch": 0.18, "grad_norm": 7.742920398712158, "learning_rate": 9.4e-06, "loss": 1.0399, "step": 1880 }, { "epoch": 0.19, "grad_norm": 8.182421684265137, "learning_rate": 9.450000000000001e-06, "loss": 1.0282, "step": 1890 }, { "epoch": 0.19, "grad_norm": 8.270386695861816, "learning_rate": 9.5e-06, "loss": 0.9903, "step": 1900 }, { "epoch": 0.19, "grad_norm": 8.560492515563965, "learning_rate": 9.55e-06, "loss": 0.9963, "step": 1910 }, { "epoch": 0.19, "grad_norm": 6.796733379364014, "learning_rate": 9.600000000000001e-06, "loss": 1.0279, "step": 1920 }, { "epoch": 0.19, "grad_norm": 7.293542861938477, "learning_rate": 9.65e-06, "loss": 1.0246, "step": 1930 }, { "epoch": 0.19, "grad_norm": 7.502761363983154, "learning_rate": 9.7e-06, "loss": 1.0257, "step": 1940 }, { "epoch": 0.19, "grad_norm": 7.434638977050781, "learning_rate": 9.75e-06, "loss": 1.018, "step": 1950 }, { "epoch": 0.19, "grad_norm": 8.350656509399414, "learning_rate": 9.800000000000001e-06, "loss": 1.0279, "step": 1960 }, { "epoch": 0.19, "grad_norm": 7.466753959655762, "learning_rate": 9.85e-06, "loss": 1.0192, "step": 1970 }, { "epoch": 0.19, "grad_norm": 7.639096736907959, "learning_rate": 9.9e-06, "loss": 1.0316, "step": 1980 }, { "epoch": 0.2, "grad_norm": 7.572539329528809, "learning_rate": 9.950000000000001e-06, "loss": 1.0287, "step": 1990 }, { "epoch": 0.2, "grad_norm": 7.9195122718811035, "learning_rate": 1e-05, "loss": 1.0375, "step": 2000 }, { "epoch": 0.2, "eval_loss": 1.0288177728652954, "eval_runtime": 25.0896, "eval_samples_per_second": 26.067, "eval_steps_per_second": 3.268, "step": 2000 }, { "epoch": 0.2, "grad_norm": 7.2744035720825195, "learning_rate": 9.994444444444446e-06, "loss": 1.0513, "step": 2010 }, { "epoch": 0.2, "grad_norm": 7.848827838897705, "learning_rate": 9.98888888888889e-06, "loss": 1.0335, "step": 2020 }, { "epoch": 0.2, "grad_norm": 6.891033172607422, "learning_rate": 9.983333333333333e-06, "loss": 1.0542, "step": 2030 }, { "epoch": 0.2, "grad_norm": 7.121194362640381, "learning_rate": 9.977777777777778e-06, "loss": 1.0225, "step": 2040 }, { "epoch": 0.2, "grad_norm": 7.5911335945129395, "learning_rate": 9.972222222222224e-06, "loss": 1.0333, "step": 2050 }, { "epoch": 0.2, "grad_norm": 7.545977592468262, "learning_rate": 9.966666666666667e-06, "loss": 1.0115, "step": 2060 }, { "epoch": 0.2, "grad_norm": 7.341533660888672, "learning_rate": 9.96111111111111e-06, "loss": 1.0328, "step": 2070 }, { "epoch": 0.2, "grad_norm": 8.573892593383789, "learning_rate": 9.955555555555556e-06, "loss": 1.0182, "step": 2080 }, { "epoch": 0.2, "grad_norm": 8.867987632751465, "learning_rate": 9.950000000000001e-06, "loss": 1.0279, "step": 2090 }, { "epoch": 0.21, "grad_norm": 7.3706889152526855, "learning_rate": 9.944444444444445e-06, "loss": 1.0273, "step": 2100 }, { "epoch": 0.21, "grad_norm": 8.370817184448242, "learning_rate": 9.93888888888889e-06, "loss": 1.0307, "step": 2110 }, { "epoch": 0.21, "grad_norm": 8.138284683227539, "learning_rate": 9.933333333333334e-06, "loss": 1.0131, "step": 2120 }, { "epoch": 0.21, "grad_norm": 6.65316915512085, "learning_rate": 9.927777777777779e-06, "loss": 1.0351, "step": 2130 }, { "epoch": 0.21, "grad_norm": 7.6826982498168945, "learning_rate": 9.922222222222222e-06, "loss": 1.0226, "step": 2140 }, { "epoch": 0.21, "grad_norm": 7.195889472961426, "learning_rate": 9.916666666666668e-06, "loss": 1.0259, "step": 2150 }, { "epoch": 0.21, "grad_norm": 9.045339584350586, "learning_rate": 9.911111111111113e-06, "loss": 1.0227, "step": 2160 }, { "epoch": 0.21, "grad_norm": 8.845091819763184, "learning_rate": 9.905555555555557e-06, "loss": 1.0473, "step": 2170 }, { "epoch": 0.21, "grad_norm": 7.309664726257324, "learning_rate": 9.9e-06, "loss": 1.014, "step": 2180 }, { "epoch": 0.21, "grad_norm": 6.420015335083008, "learning_rate": 9.894444444444445e-06, "loss": 1.0005, "step": 2190 }, { "epoch": 0.22, "grad_norm": 8.499987602233887, "learning_rate": 9.88888888888889e-06, "loss": 1.0304, "step": 2200 }, { "epoch": 0.22, "grad_norm": 7.747542381286621, "learning_rate": 9.883333333333334e-06, "loss": 1.0044, "step": 2210 }, { "epoch": 0.22, "grad_norm": 9.178853988647461, "learning_rate": 9.877777777777778e-06, "loss": 1.0356, "step": 2220 }, { "epoch": 0.22, "grad_norm": 6.947020053863525, "learning_rate": 9.872222222222223e-06, "loss": 1.018, "step": 2230 }, { "epoch": 0.22, "grad_norm": 5.955436706542969, "learning_rate": 9.866666666666668e-06, "loss": 1.0038, "step": 2240 }, { "epoch": 0.22, "grad_norm": 7.272679805755615, "learning_rate": 9.861111111111112e-06, "loss": 1.0158, "step": 2250 }, { "epoch": 0.22, "grad_norm": 7.22878885269165, "learning_rate": 9.855555555555555e-06, "loss": 1.0177, "step": 2260 }, { "epoch": 0.22, "grad_norm": 6.601935863494873, "learning_rate": 9.85e-06, "loss": 1.0485, "step": 2270 }, { "epoch": 0.22, "grad_norm": 7.103997707366943, "learning_rate": 9.844444444444446e-06, "loss": 1.0287, "step": 2280 }, { "epoch": 0.22, "grad_norm": 7.7743821144104, "learning_rate": 9.83888888888889e-06, "loss": 1.0352, "step": 2290 }, { "epoch": 0.23, "grad_norm": 7.64756441116333, "learning_rate": 9.833333333333333e-06, "loss": 1.0081, "step": 2300 }, { "epoch": 0.23, "grad_norm": 7.442409038543701, "learning_rate": 9.827777777777778e-06, "loss": 1.0023, "step": 2310 }, { "epoch": 0.23, "grad_norm": 7.010642051696777, "learning_rate": 9.822222222222223e-06, "loss": 1.0449, "step": 2320 }, { "epoch": 0.23, "grad_norm": 7.551766395568848, "learning_rate": 9.816666666666667e-06, "loss": 1.0434, "step": 2330 }, { "epoch": 0.23, "grad_norm": 7.410819053649902, "learning_rate": 9.811111111111112e-06, "loss": 0.9956, "step": 2340 }, { "epoch": 0.23, "grad_norm": 6.79862642288208, "learning_rate": 9.805555555555556e-06, "loss": 1.064, "step": 2350 }, { "epoch": 0.23, "grad_norm": 8.079883575439453, "learning_rate": 9.800000000000001e-06, "loss": 1.0368, "step": 2360 }, { "epoch": 0.23, "grad_norm": 8.448101043701172, "learning_rate": 9.794444444444445e-06, "loss": 1.0334, "step": 2370 }, { "epoch": 0.23, "grad_norm": 8.567143440246582, "learning_rate": 9.78888888888889e-06, "loss": 1.0112, "step": 2380 }, { "epoch": 0.23, "grad_norm": 7.6639404296875, "learning_rate": 9.783333333333335e-06, "loss": 1.0225, "step": 2390 }, { "epoch": 0.24, "grad_norm": 7.357000827789307, "learning_rate": 9.777777777777779e-06, "loss": 1.0302, "step": 2400 }, { "epoch": 0.24, "grad_norm": 7.313828468322754, "learning_rate": 9.772222222222222e-06, "loss": 1.0294, "step": 2410 }, { "epoch": 0.24, "grad_norm": 6.831286430358887, "learning_rate": 9.766666666666667e-06, "loss": 1.0155, "step": 2420 }, { "epoch": 0.24, "grad_norm": 7.242568016052246, "learning_rate": 9.761111111111113e-06, "loss": 1.0328, "step": 2430 }, { "epoch": 0.24, "grad_norm": 7.36611270904541, "learning_rate": 9.755555555555556e-06, "loss": 1.0395, "step": 2440 }, { "epoch": 0.24, "grad_norm": 8.87148380279541, "learning_rate": 9.75e-06, "loss": 1.0328, "step": 2450 }, { "epoch": 0.24, "grad_norm": 7.56249475479126, "learning_rate": 9.744444444444445e-06, "loss": 1.005, "step": 2460 }, { "epoch": 0.24, "grad_norm": 6.673750877380371, "learning_rate": 9.73888888888889e-06, "loss": 1.0135, "step": 2470 }, { "epoch": 0.24, "grad_norm": 7.21262788772583, "learning_rate": 9.733333333333334e-06, "loss": 1.0035, "step": 2480 }, { "epoch": 0.24, "grad_norm": 6.4624552726745605, "learning_rate": 9.727777777777777e-06, "loss": 1.0226, "step": 2490 }, { "epoch": 0.25, "grad_norm": 7.45573616027832, "learning_rate": 9.722222222222223e-06, "loss": 1.0289, "step": 2500 }, { "epoch": 0.25, "grad_norm": 7.268604278564453, "learning_rate": 9.716666666666668e-06, "loss": 1.023, "step": 2510 }, { "epoch": 0.25, "grad_norm": 6.674758434295654, "learning_rate": 9.711111111111111e-06, "loss": 1.0079, "step": 2520 }, { "epoch": 0.25, "grad_norm": 7.931172847747803, "learning_rate": 9.705555555555555e-06, "loss": 1.0265, "step": 2530 }, { "epoch": 0.25, "grad_norm": 7.007352352142334, "learning_rate": 9.7e-06, "loss": 1.0084, "step": 2540 }, { "epoch": 0.25, "grad_norm": 7.204163551330566, "learning_rate": 9.694444444444446e-06, "loss": 1.0261, "step": 2550 }, { "epoch": 0.25, "grad_norm": 7.260873317718506, "learning_rate": 9.688888888888889e-06, "loss": 1.0301, "step": 2560 }, { "epoch": 0.25, "grad_norm": 7.039237022399902, "learning_rate": 9.683333333333334e-06, "loss": 1.0127, "step": 2570 }, { "epoch": 0.25, "grad_norm": 6.593019962310791, "learning_rate": 9.677777777777778e-06, "loss": 1.0188, "step": 2580 }, { "epoch": 0.25, "grad_norm": 6.8481035232543945, "learning_rate": 9.672222222222223e-06, "loss": 1.0274, "step": 2590 }, { "epoch": 0.25, "grad_norm": 8.501296997070312, "learning_rate": 9.666666666666667e-06, "loss": 1.0148, "step": 2600 }, { "epoch": 0.26, "grad_norm": 7.963475704193115, "learning_rate": 9.661111111111112e-06, "loss": 1.0148, "step": 2610 }, { "epoch": 0.26, "grad_norm": 6.3807692527771, "learning_rate": 9.655555555555556e-06, "loss": 1.0057, "step": 2620 }, { "epoch": 0.26, "grad_norm": 6.403753757476807, "learning_rate": 9.65e-06, "loss": 1.0144, "step": 2630 }, { "epoch": 0.26, "grad_norm": 7.47929048538208, "learning_rate": 9.644444444444444e-06, "loss": 1.0249, "step": 2640 }, { "epoch": 0.26, "grad_norm": 7.441832542419434, "learning_rate": 9.63888888888889e-06, "loss": 1.013, "step": 2650 }, { "epoch": 0.26, "grad_norm": 6.977060794830322, "learning_rate": 9.633333333333335e-06, "loss": 1.0102, "step": 2660 }, { "epoch": 0.26, "grad_norm": 6.741335868835449, "learning_rate": 9.627777777777778e-06, "loss": 1.0231, "step": 2670 }, { "epoch": 0.26, "grad_norm": 7.60915470123291, "learning_rate": 9.622222222222222e-06, "loss": 1.0063, "step": 2680 }, { "epoch": 0.26, "grad_norm": 6.490938663482666, "learning_rate": 9.616666666666667e-06, "loss": 1.0236, "step": 2690 }, { "epoch": 0.26, "grad_norm": 7.14434289932251, "learning_rate": 9.611111111111112e-06, "loss": 1.0025, "step": 2700 }, { "epoch": 0.27, "grad_norm": 6.9825825691223145, "learning_rate": 9.605555555555556e-06, "loss": 1.0366, "step": 2710 }, { "epoch": 0.27, "grad_norm": 7.743397235870361, "learning_rate": 9.600000000000001e-06, "loss": 1.0157, "step": 2720 }, { "epoch": 0.27, "grad_norm": 6.508712291717529, "learning_rate": 9.594444444444445e-06, "loss": 1.0113, "step": 2730 }, { "epoch": 0.27, "grad_norm": 7.540585517883301, "learning_rate": 9.58888888888889e-06, "loss": 0.9941, "step": 2740 }, { "epoch": 0.27, "grad_norm": 7.058248996734619, "learning_rate": 9.583333333333335e-06, "loss": 1.0444, "step": 2750 }, { "epoch": 0.27, "grad_norm": 6.87162446975708, "learning_rate": 9.577777777777779e-06, "loss": 1.0201, "step": 2760 }, { "epoch": 0.27, "grad_norm": 7.648634910583496, "learning_rate": 9.572222222222222e-06, "loss": 1.0169, "step": 2770 }, { "epoch": 0.27, "grad_norm": 7.422387599945068, "learning_rate": 9.566666666666668e-06, "loss": 1.0228, "step": 2780 }, { "epoch": 0.27, "grad_norm": 7.140253067016602, "learning_rate": 9.561111111111113e-06, "loss": 1.009, "step": 2790 }, { "epoch": 0.27, "grad_norm": 6.5219316482543945, "learning_rate": 9.555555555555556e-06, "loss": 1.0093, "step": 2800 }, { "epoch": 0.28, "grad_norm": 6.9567437171936035, "learning_rate": 9.55e-06, "loss": 1.0016, "step": 2810 }, { "epoch": 0.28, "grad_norm": 7.445886611938477, "learning_rate": 9.544444444444445e-06, "loss": 1.0106, "step": 2820 }, { "epoch": 0.28, "grad_norm": 7.71561336517334, "learning_rate": 9.53888888888889e-06, "loss": 1.0182, "step": 2830 }, { "epoch": 0.28, "grad_norm": 7.0434346199035645, "learning_rate": 9.533333333333334e-06, "loss": 1.021, "step": 2840 }, { "epoch": 0.28, "grad_norm": 7.767464637756348, "learning_rate": 9.527777777777778e-06, "loss": 0.9849, "step": 2850 }, { "epoch": 0.28, "grad_norm": 7.035460948944092, "learning_rate": 9.522222222222223e-06, "loss": 0.9987, "step": 2860 }, { "epoch": 0.28, "grad_norm": 6.219535827636719, "learning_rate": 9.516666666666668e-06, "loss": 1.0252, "step": 2870 }, { "epoch": 0.28, "grad_norm": 7.309386253356934, "learning_rate": 9.511111111111112e-06, "loss": 1.0193, "step": 2880 }, { "epoch": 0.28, "grad_norm": 5.582414150238037, "learning_rate": 9.505555555555557e-06, "loss": 0.9947, "step": 2890 }, { "epoch": 0.28, "grad_norm": 7.555544853210449, "learning_rate": 9.5e-06, "loss": 1.0048, "step": 2900 }, { "epoch": 0.29, "grad_norm": 6.8697896003723145, "learning_rate": 9.494444444444446e-06, "loss": 1.02, "step": 2910 }, { "epoch": 0.29, "grad_norm": 7.6480889320373535, "learning_rate": 9.48888888888889e-06, "loss": 1.0141, "step": 2920 }, { "epoch": 0.29, "grad_norm": 6.709159851074219, "learning_rate": 9.483333333333335e-06, "loss": 1.0138, "step": 2930 }, { "epoch": 0.29, "grad_norm": 6.754703521728516, "learning_rate": 9.47777777777778e-06, "loss": 1.015, "step": 2940 }, { "epoch": 0.29, "grad_norm": 6.532148361206055, "learning_rate": 9.472222222222223e-06, "loss": 1.0166, "step": 2950 }, { "epoch": 0.29, "grad_norm": 7.5026397705078125, "learning_rate": 9.466666666666667e-06, "loss": 1.0142, "step": 2960 }, { "epoch": 0.29, "grad_norm": 8.21164608001709, "learning_rate": 9.461111111111112e-06, "loss": 1.0041, "step": 2970 }, { "epoch": 0.29, "grad_norm": 6.537563323974609, "learning_rate": 9.455555555555557e-06, "loss": 0.9942, "step": 2980 }, { "epoch": 0.29, "grad_norm": 6.31074857711792, "learning_rate": 9.450000000000001e-06, "loss": 0.9897, "step": 2990 }, { "epoch": 0.29, "grad_norm": 7.485910415649414, "learning_rate": 9.444444444444445e-06, "loss": 0.9829, "step": 3000 }, { "epoch": 0.29, "grad_norm": 6.8783979415893555, "learning_rate": 9.43888888888889e-06, "loss": 1.0128, "step": 3010 }, { "epoch": 0.3, "grad_norm": 7.49146842956543, "learning_rate": 9.433333333333335e-06, "loss": 0.9928, "step": 3020 }, { "epoch": 0.3, "grad_norm": 7.311631202697754, "learning_rate": 9.427777777777779e-06, "loss": 1.017, "step": 3030 }, { "epoch": 0.3, "grad_norm": 6.036073207855225, "learning_rate": 9.422222222222222e-06, "loss": 0.9856, "step": 3040 }, { "epoch": 0.3, "grad_norm": 6.988036632537842, "learning_rate": 9.416666666666667e-06, "loss": 1.0062, "step": 3050 }, { "epoch": 0.3, "grad_norm": 7.399910926818848, "learning_rate": 9.411111111111113e-06, "loss": 1.0302, "step": 3060 }, { "epoch": 0.3, "grad_norm": 7.4218430519104, "learning_rate": 9.405555555555556e-06, "loss": 1.0073, "step": 3070 }, { "epoch": 0.3, "grad_norm": 6.425349235534668, "learning_rate": 9.4e-06, "loss": 1.0107, "step": 3080 }, { "epoch": 0.3, "grad_norm": 7.233841419219971, "learning_rate": 9.394444444444445e-06, "loss": 1.0057, "step": 3090 }, { "epoch": 0.3, "grad_norm": 7.819307327270508, "learning_rate": 9.38888888888889e-06, "loss": 1.0128, "step": 3100 }, { "epoch": 0.3, "grad_norm": 7.580729961395264, "learning_rate": 9.383333333333334e-06, "loss": 1.0005, "step": 3110 }, { "epoch": 0.31, "grad_norm": 7.48482608795166, "learning_rate": 9.377777777777779e-06, "loss": 0.9822, "step": 3120 }, { "epoch": 0.31, "grad_norm": 5.902833938598633, "learning_rate": 9.372222222222223e-06, "loss": 1.0161, "step": 3130 }, { "epoch": 0.31, "grad_norm": 6.607807636260986, "learning_rate": 9.366666666666668e-06, "loss": 0.9767, "step": 3140 }, { "epoch": 0.31, "grad_norm": 8.062936782836914, "learning_rate": 9.361111111111111e-06, "loss": 1.0046, "step": 3150 }, { "epoch": 0.31, "grad_norm": 6.158736228942871, "learning_rate": 9.355555555555557e-06, "loss": 1.0159, "step": 3160 }, { "epoch": 0.31, "grad_norm": 6.21358585357666, "learning_rate": 9.350000000000002e-06, "loss": 0.9965, "step": 3170 }, { "epoch": 0.31, "grad_norm": 7.558384895324707, "learning_rate": 9.344444444444446e-06, "loss": 1.0311, "step": 3180 }, { "epoch": 0.31, "grad_norm": 7.404246807098389, "learning_rate": 9.338888888888889e-06, "loss": 0.994, "step": 3190 }, { "epoch": 0.31, "grad_norm": 6.217035293579102, "learning_rate": 9.333333333333334e-06, "loss": 0.9766, "step": 3200 }, { "epoch": 0.31, "grad_norm": 7.356032371520996, "learning_rate": 9.32777777777778e-06, "loss": 1.0167, "step": 3210 }, { "epoch": 0.32, "grad_norm": 7.191532611846924, "learning_rate": 9.322222222222223e-06, "loss": 0.9917, "step": 3220 }, { "epoch": 0.32, "grad_norm": 7.2428364753723145, "learning_rate": 9.316666666666667e-06, "loss": 0.978, "step": 3230 }, { "epoch": 0.32, "grad_norm": 7.48451566696167, "learning_rate": 9.311111111111112e-06, "loss": 1.0159, "step": 3240 }, { "epoch": 0.32, "grad_norm": 7.097259998321533, "learning_rate": 9.305555555555557e-06, "loss": 0.9875, "step": 3250 }, { "epoch": 0.32, "grad_norm": 7.401674270629883, "learning_rate": 9.3e-06, "loss": 1.0034, "step": 3260 }, { "epoch": 0.32, "grad_norm": 5.9528961181640625, "learning_rate": 9.294444444444444e-06, "loss": 0.9854, "step": 3270 }, { "epoch": 0.32, "grad_norm": 6.213513374328613, "learning_rate": 9.28888888888889e-06, "loss": 0.9746, "step": 3280 }, { "epoch": 0.32, "grad_norm": 6.535775184631348, "learning_rate": 9.283333333333335e-06, "loss": 1.007, "step": 3290 }, { "epoch": 0.32, "grad_norm": 7.0399556159973145, "learning_rate": 9.277777777777778e-06, "loss": 1.0104, "step": 3300 }, { "epoch": 0.32, "grad_norm": 7.379040718078613, "learning_rate": 9.272222222222222e-06, "loss": 0.9997, "step": 3310 }, { "epoch": 0.33, "grad_norm": 7.252180576324463, "learning_rate": 9.266666666666667e-06, "loss": 1.0004, "step": 3320 }, { "epoch": 0.33, "grad_norm": 7.710741996765137, "learning_rate": 9.261111111111112e-06, "loss": 0.9893, "step": 3330 }, { "epoch": 0.33, "grad_norm": 7.532569885253906, "learning_rate": 9.255555555555556e-06, "loss": 0.997, "step": 3340 }, { "epoch": 0.33, "grad_norm": 7.086602687835693, "learning_rate": 9.250000000000001e-06, "loss": 0.9759, "step": 3350 }, { "epoch": 0.33, "grad_norm": 5.734565734863281, "learning_rate": 9.244444444444445e-06, "loss": 1.0044, "step": 3360 }, { "epoch": 0.33, "grad_norm": 6.726978778839111, "learning_rate": 9.23888888888889e-06, "loss": 1.0014, "step": 3370 }, { "epoch": 0.33, "grad_norm": 6.218667030334473, "learning_rate": 9.233333333333334e-06, "loss": 0.9949, "step": 3380 }, { "epoch": 0.33, "grad_norm": 7.2244415283203125, "learning_rate": 9.227777777777779e-06, "loss": 0.994, "step": 3390 }, { "epoch": 0.33, "grad_norm": 7.357363224029541, "learning_rate": 9.222222222222224e-06, "loss": 1.0028, "step": 3400 }, { "epoch": 0.33, "grad_norm": 6.267972946166992, "learning_rate": 9.216666666666668e-06, "loss": 0.9984, "step": 3410 }, { "epoch": 0.34, "grad_norm": 7.361604690551758, "learning_rate": 9.211111111111111e-06, "loss": 1.0056, "step": 3420 }, { "epoch": 0.34, "grad_norm": 7.466953754425049, "learning_rate": 9.205555555555556e-06, "loss": 0.9835, "step": 3430 }, { "epoch": 0.34, "grad_norm": 6.407705783843994, "learning_rate": 9.200000000000002e-06, "loss": 0.969, "step": 3440 }, { "epoch": 0.34, "grad_norm": 6.575141429901123, "learning_rate": 9.194444444444445e-06, "loss": 0.9944, "step": 3450 }, { "epoch": 0.34, "grad_norm": 7.027390003204346, "learning_rate": 9.188888888888889e-06, "loss": 0.9608, "step": 3460 }, { "epoch": 0.34, "grad_norm": 7.603301048278809, "learning_rate": 9.183333333333334e-06, "loss": 0.9903, "step": 3470 }, { "epoch": 0.34, "grad_norm": 6.897797107696533, "learning_rate": 9.17777777777778e-06, "loss": 0.982, "step": 3480 }, { "epoch": 0.34, "grad_norm": 7.157762050628662, "learning_rate": 9.172222222222223e-06, "loss": 1.0067, "step": 3490 }, { "epoch": 0.34, "grad_norm": 6.13525390625, "learning_rate": 9.166666666666666e-06, "loss": 0.994, "step": 3500 }, { "epoch": 0.34, "grad_norm": 6.5132927894592285, "learning_rate": 9.161111111111112e-06, "loss": 1.0145, "step": 3510 }, { "epoch": 0.34, "grad_norm": 7.745060443878174, "learning_rate": 9.155555555555557e-06, "loss": 0.9952, "step": 3520 }, { "epoch": 0.35, "grad_norm": 6.890546798706055, "learning_rate": 9.15e-06, "loss": 0.9958, "step": 3530 }, { "epoch": 0.35, "grad_norm": 6.966720104217529, "learning_rate": 9.144444444444444e-06, "loss": 0.971, "step": 3540 }, { "epoch": 0.35, "grad_norm": 7.677045822143555, "learning_rate": 9.13888888888889e-06, "loss": 1.0033, "step": 3550 }, { "epoch": 0.35, "grad_norm": 7.402923583984375, "learning_rate": 9.133333333333335e-06, "loss": 0.9707, "step": 3560 }, { "epoch": 0.35, "grad_norm": 6.710414886474609, "learning_rate": 9.127777777777778e-06, "loss": 0.9827, "step": 3570 }, { "epoch": 0.35, "grad_norm": 6.4494123458862305, "learning_rate": 9.122222222222223e-06, "loss": 0.9933, "step": 3580 }, { "epoch": 0.35, "grad_norm": 6.759268760681152, "learning_rate": 9.116666666666667e-06, "loss": 0.9748, "step": 3590 }, { "epoch": 0.35, "grad_norm": 6.140712738037109, "learning_rate": 9.111111111111112e-06, "loss": 0.9936, "step": 3600 }, { "epoch": 0.35, "grad_norm": 6.955813407897949, "learning_rate": 9.105555555555556e-06, "loss": 0.9936, "step": 3610 }, { "epoch": 0.35, "grad_norm": 6.751615047454834, "learning_rate": 9.100000000000001e-06, "loss": 0.9885, "step": 3620 }, { "epoch": 0.36, "grad_norm": 6.414419174194336, "learning_rate": 9.094444444444446e-06, "loss": 0.9802, "step": 3630 }, { "epoch": 0.36, "grad_norm": 8.30736255645752, "learning_rate": 9.08888888888889e-06, "loss": 0.9917, "step": 3640 }, { "epoch": 0.36, "grad_norm": 6.489970684051514, "learning_rate": 9.083333333333333e-06, "loss": 1.0025, "step": 3650 }, { "epoch": 0.36, "grad_norm": 5.9044294357299805, "learning_rate": 9.077777777777779e-06, "loss": 0.9822, "step": 3660 }, { "epoch": 0.36, "grad_norm": 6.012246131896973, "learning_rate": 9.072222222222224e-06, "loss": 1.0146, "step": 3670 }, { "epoch": 0.36, "grad_norm": 5.992971897125244, "learning_rate": 9.066666666666667e-06, "loss": 1.0046, "step": 3680 }, { "epoch": 0.36, "grad_norm": 6.792891979217529, "learning_rate": 9.061111111111111e-06, "loss": 0.9911, "step": 3690 }, { "epoch": 0.36, "grad_norm": 6.368929386138916, "learning_rate": 9.055555555555556e-06, "loss": 0.9744, "step": 3700 }, { "epoch": 0.36, "grad_norm": 6.073873043060303, "learning_rate": 9.050000000000001e-06, "loss": 0.9797, "step": 3710 }, { "epoch": 0.36, "grad_norm": 7.107660293579102, "learning_rate": 9.044444444444445e-06, "loss": 0.9877, "step": 3720 }, { "epoch": 0.37, "grad_norm": 6.877776622772217, "learning_rate": 9.038888888888889e-06, "loss": 0.9805, "step": 3730 }, { "epoch": 0.37, "grad_norm": 6.973341464996338, "learning_rate": 9.033333333333334e-06, "loss": 0.9514, "step": 3740 }, { "epoch": 0.37, "grad_norm": 6.831413745880127, "learning_rate": 9.027777777777779e-06, "loss": 0.9914, "step": 3750 }, { "epoch": 0.37, "grad_norm": 7.162606716156006, "learning_rate": 9.022222222222223e-06, "loss": 0.9922, "step": 3760 }, { "epoch": 0.37, "grad_norm": 7.019341468811035, "learning_rate": 9.016666666666666e-06, "loss": 1.0, "step": 3770 }, { "epoch": 0.37, "grad_norm": 6.746554374694824, "learning_rate": 9.011111111111111e-06, "loss": 0.9917, "step": 3780 }, { "epoch": 0.37, "grad_norm": 6.074807167053223, "learning_rate": 9.005555555555557e-06, "loss": 0.9812, "step": 3790 }, { "epoch": 0.37, "grad_norm": 6.36068868637085, "learning_rate": 9e-06, "loss": 0.9899, "step": 3800 }, { "epoch": 0.37, "grad_norm": 7.354294776916504, "learning_rate": 8.994444444444445e-06, "loss": 0.9872, "step": 3810 }, { "epoch": 0.37, "grad_norm": 6.703911781311035, "learning_rate": 8.988888888888889e-06, "loss": 0.9825, "step": 3820 }, { "epoch": 0.38, "grad_norm": 7.337576389312744, "learning_rate": 8.983333333333334e-06, "loss": 1.0058, "step": 3830 }, { "epoch": 0.38, "grad_norm": 6.3277201652526855, "learning_rate": 8.977777777777778e-06, "loss": 0.9867, "step": 3840 }, { "epoch": 0.38, "grad_norm": 6.2475714683532715, "learning_rate": 8.972222222222223e-06, "loss": 0.9851, "step": 3850 }, { "epoch": 0.38, "grad_norm": 7.100306987762451, "learning_rate": 8.966666666666667e-06, "loss": 0.9686, "step": 3860 }, { "epoch": 0.38, "grad_norm": 6.611792087554932, "learning_rate": 8.961111111111112e-06, "loss": 0.9778, "step": 3870 }, { "epoch": 0.38, "grad_norm": 7.2821478843688965, "learning_rate": 8.955555555555555e-06, "loss": 0.9703, "step": 3880 }, { "epoch": 0.38, "grad_norm": 7.183440685272217, "learning_rate": 8.95e-06, "loss": 1.0035, "step": 3890 }, { "epoch": 0.38, "grad_norm": 6.618818759918213, "learning_rate": 8.944444444444446e-06, "loss": 1.0126, "step": 3900 }, { "epoch": 0.38, "grad_norm": 5.802262783050537, "learning_rate": 8.93888888888889e-06, "loss": 0.9803, "step": 3910 }, { "epoch": 0.38, "grad_norm": 6.774290084838867, "learning_rate": 8.933333333333333e-06, "loss": 0.9898, "step": 3920 }, { "epoch": 0.39, "grad_norm": 6.797236442565918, "learning_rate": 8.927777777777778e-06, "loss": 1.0055, "step": 3930 }, { "epoch": 0.39, "grad_norm": 6.226317882537842, "learning_rate": 8.922222222222224e-06, "loss": 0.9902, "step": 3940 }, { "epoch": 0.39, "grad_norm": 6.301506519317627, "learning_rate": 8.916666666666667e-06, "loss": 0.9751, "step": 3950 }, { "epoch": 0.39, "grad_norm": 6.825466156005859, "learning_rate": 8.91111111111111e-06, "loss": 0.987, "step": 3960 }, { "epoch": 0.39, "grad_norm": 7.073983192443848, "learning_rate": 8.905555555555556e-06, "loss": 1.0156, "step": 3970 }, { "epoch": 0.39, "grad_norm": 6.598026275634766, "learning_rate": 8.900000000000001e-06, "loss": 0.9787, "step": 3980 }, { "epoch": 0.39, "grad_norm": 7.465191841125488, "learning_rate": 8.894444444444445e-06, "loss": 0.9686, "step": 3990 }, { "epoch": 0.39, "grad_norm": 7.519344806671143, "learning_rate": 8.888888888888888e-06, "loss": 0.9769, "step": 4000 }, { "epoch": 0.39, "eval_loss": 0.9706473350524902, "eval_runtime": 25.0223, "eval_samples_per_second": 26.137, "eval_steps_per_second": 3.277, "step": 4000 }, { "epoch": 0.39, "grad_norm": 7.698840141296387, "learning_rate": 8.883333333333334e-06, "loss": 0.9815, "step": 4010 }, { "epoch": 0.39, "grad_norm": 6.7525177001953125, "learning_rate": 8.877777777777779e-06, "loss": 0.9945, "step": 4020 }, { "epoch": 0.39, "grad_norm": 7.281030178070068, "learning_rate": 8.872222222222222e-06, "loss": 1.0048, "step": 4030 }, { "epoch": 0.4, "grad_norm": 6.921372413635254, "learning_rate": 8.866666666666668e-06, "loss": 0.9639, "step": 4040 }, { "epoch": 0.4, "grad_norm": 5.818887233734131, "learning_rate": 8.861111111111111e-06, "loss": 0.9925, "step": 4050 }, { "epoch": 0.4, "grad_norm": 5.9734673500061035, "learning_rate": 8.855555555555556e-06, "loss": 0.9649, "step": 4060 }, { "epoch": 0.4, "grad_norm": 7.516678810119629, "learning_rate": 8.85e-06, "loss": 1.003, "step": 4070 }, { "epoch": 0.4, "grad_norm": 6.732432842254639, "learning_rate": 8.844444444444445e-06, "loss": 0.9675, "step": 4080 }, { "epoch": 0.4, "grad_norm": 6.887324810028076, "learning_rate": 8.838888888888889e-06, "loss": 0.9835, "step": 4090 }, { "epoch": 0.4, "grad_norm": 6.822541236877441, "learning_rate": 8.833333333333334e-06, "loss": 0.9794, "step": 4100 }, { "epoch": 0.4, "grad_norm": 6.283742904663086, "learning_rate": 8.82777777777778e-06, "loss": 0.9908, "step": 4110 }, { "epoch": 0.4, "grad_norm": 5.754868984222412, "learning_rate": 8.822222222222223e-06, "loss": 0.9801, "step": 4120 }, { "epoch": 0.4, "grad_norm": 8.865314483642578, "learning_rate": 8.816666666666668e-06, "loss": 0.9931, "step": 4130 }, { "epoch": 0.41, "grad_norm": 6.626471042633057, "learning_rate": 8.811111111111112e-06, "loss": 0.9827, "step": 4140 }, { "epoch": 0.41, "grad_norm": 7.39995002746582, "learning_rate": 8.805555555555557e-06, "loss": 0.9755, "step": 4150 }, { "epoch": 0.41, "grad_norm": 6.4092116355896, "learning_rate": 8.8e-06, "loss": 0.9603, "step": 4160 }, { "epoch": 0.41, "grad_norm": 7.112963676452637, "learning_rate": 8.794444444444446e-06, "loss": 0.989, "step": 4170 }, { "epoch": 0.41, "grad_norm": 7.569598197937012, "learning_rate": 8.788888888888891e-06, "loss": 0.9931, "step": 4180 }, { "epoch": 0.41, "grad_norm": 7.269093990325928, "learning_rate": 8.783333333333335e-06, "loss": 0.9787, "step": 4190 }, { "epoch": 0.41, "grad_norm": 7.388169288635254, "learning_rate": 8.777777777777778e-06, "loss": 0.9809, "step": 4200 }, { "epoch": 0.41, "grad_norm": 6.631279945373535, "learning_rate": 8.772222222222223e-06, "loss": 0.9889, "step": 4210 }, { "epoch": 0.41, "grad_norm": 6.825128078460693, "learning_rate": 8.766666666666669e-06, "loss": 0.9869, "step": 4220 }, { "epoch": 0.41, "grad_norm": 5.928747177124023, "learning_rate": 8.761111111111112e-06, "loss": 0.9774, "step": 4230 }, { "epoch": 0.42, "grad_norm": 6.170531272888184, "learning_rate": 8.755555555555556e-06, "loss": 0.9845, "step": 4240 }, { "epoch": 0.42, "grad_norm": 6.33007287979126, "learning_rate": 8.750000000000001e-06, "loss": 0.9709, "step": 4250 }, { "epoch": 0.42, "grad_norm": 7.216691493988037, "learning_rate": 8.744444444444446e-06, "loss": 0.9652, "step": 4260 }, { "epoch": 0.42, "grad_norm": 7.132676601409912, "learning_rate": 8.73888888888889e-06, "loss": 0.9842, "step": 4270 }, { "epoch": 0.42, "grad_norm": 7.249996662139893, "learning_rate": 8.733333333333333e-06, "loss": 0.9792, "step": 4280 }, { "epoch": 0.42, "grad_norm": 6.802064418792725, "learning_rate": 8.727777777777779e-06, "loss": 0.9945, "step": 4290 }, { "epoch": 0.42, "grad_norm": 6.876787185668945, "learning_rate": 8.722222222222224e-06, "loss": 0.9665, "step": 4300 }, { "epoch": 0.42, "grad_norm": 6.107343673706055, "learning_rate": 8.716666666666667e-06, "loss": 0.9815, "step": 4310 }, { "epoch": 0.42, "grad_norm": 6.867188930511475, "learning_rate": 8.711111111111111e-06, "loss": 0.9482, "step": 4320 }, { "epoch": 0.42, "grad_norm": 6.189272403717041, "learning_rate": 8.705555555555556e-06, "loss": 0.9832, "step": 4330 }, { "epoch": 0.43, "grad_norm": 7.103214263916016, "learning_rate": 8.700000000000001e-06, "loss": 0.9505, "step": 4340 }, { "epoch": 0.43, "grad_norm": 6.826525688171387, "learning_rate": 8.694444444444445e-06, "loss": 0.961, "step": 4350 }, { "epoch": 0.43, "grad_norm": 6.759648323059082, "learning_rate": 8.68888888888889e-06, "loss": 0.9554, "step": 4360 }, { "epoch": 0.43, "grad_norm": 5.611583709716797, "learning_rate": 8.683333333333334e-06, "loss": 0.9764, "step": 4370 }, { "epoch": 0.43, "grad_norm": 6.199190616607666, "learning_rate": 8.677777777777779e-06, "loss": 0.9563, "step": 4380 }, { "epoch": 0.43, "grad_norm": 6.6653289794921875, "learning_rate": 8.672222222222223e-06, "loss": 0.9659, "step": 4390 }, { "epoch": 0.43, "grad_norm": 6.69498872756958, "learning_rate": 8.666666666666668e-06, "loss": 0.9634, "step": 4400 }, { "epoch": 0.43, "grad_norm": 6.445537567138672, "learning_rate": 8.661111111111113e-06, "loss": 0.9894, "step": 4410 }, { "epoch": 0.43, "grad_norm": 6.721536159515381, "learning_rate": 8.655555555555557e-06, "loss": 1.0027, "step": 4420 }, { "epoch": 0.43, "grad_norm": 6.932891845703125, "learning_rate": 8.65e-06, "loss": 0.9793, "step": 4430 }, { "epoch": 0.44, "grad_norm": 6.356424331665039, "learning_rate": 8.644444444444445e-06, "loss": 0.9678, "step": 4440 }, { "epoch": 0.44, "grad_norm": 6.7934346199035645, "learning_rate": 8.63888888888889e-06, "loss": 1.0048, "step": 4450 }, { "epoch": 0.44, "grad_norm": 6.368439674377441, "learning_rate": 8.633333333333334e-06, "loss": 0.9556, "step": 4460 }, { "epoch": 0.44, "grad_norm": 6.740757465362549, "learning_rate": 8.627777777777778e-06, "loss": 0.9774, "step": 4470 }, { "epoch": 0.44, "grad_norm": 6.4069108963012695, "learning_rate": 8.622222222222223e-06, "loss": 0.9711, "step": 4480 }, { "epoch": 0.44, "grad_norm": 7.043665885925293, "learning_rate": 8.616666666666668e-06, "loss": 0.9787, "step": 4490 }, { "epoch": 0.44, "grad_norm": 7.1120405197143555, "learning_rate": 8.611111111111112e-06, "loss": 0.9579, "step": 4500 }, { "epoch": 0.44, "grad_norm": 8.486714363098145, "learning_rate": 8.605555555555555e-06, "loss": 0.9886, "step": 4510 }, { "epoch": 0.44, "grad_norm": 6.67174768447876, "learning_rate": 8.6e-06, "loss": 0.9525, "step": 4520 }, { "epoch": 0.44, "grad_norm": 6.464170455932617, "learning_rate": 8.594444444444446e-06, "loss": 0.9833, "step": 4530 }, { "epoch": 0.44, "grad_norm": 6.826434135437012, "learning_rate": 8.58888888888889e-06, "loss": 0.9907, "step": 4540 }, { "epoch": 0.45, "grad_norm": 6.193078994750977, "learning_rate": 8.583333333333333e-06, "loss": 0.9997, "step": 4550 }, { "epoch": 0.45, "grad_norm": 6.022221565246582, "learning_rate": 8.577777777777778e-06, "loss": 0.9488, "step": 4560 }, { "epoch": 0.45, "grad_norm": 6.674755096435547, "learning_rate": 8.572222222222224e-06, "loss": 0.9746, "step": 4570 }, { "epoch": 0.45, "grad_norm": 7.147762298583984, "learning_rate": 8.566666666666667e-06, "loss": 0.9538, "step": 4580 }, { "epoch": 0.45, "grad_norm": 6.487443447113037, "learning_rate": 8.561111111111112e-06, "loss": 0.9488, "step": 4590 }, { "epoch": 0.45, "grad_norm": 6.660772800445557, "learning_rate": 8.555555555555556e-06, "loss": 0.9548, "step": 4600 }, { "epoch": 0.45, "grad_norm": 6.74433708190918, "learning_rate": 8.550000000000001e-06, "loss": 0.9483, "step": 4610 }, { "epoch": 0.45, "grad_norm": 7.539987087249756, "learning_rate": 8.544444444444445e-06, "loss": 0.9859, "step": 4620 }, { "epoch": 0.45, "grad_norm": 5.9039788246154785, "learning_rate": 8.53888888888889e-06, "loss": 0.9821, "step": 4630 }, { "epoch": 0.45, "grad_norm": 6.730224132537842, "learning_rate": 8.533333333333335e-06, "loss": 0.9431, "step": 4640 }, { "epoch": 0.46, "grad_norm": 6.265594482421875, "learning_rate": 8.527777777777779e-06, "loss": 0.9718, "step": 4650 }, { "epoch": 0.46, "grad_norm": 7.609050750732422, "learning_rate": 8.522222222222222e-06, "loss": 0.944, "step": 4660 }, { "epoch": 0.46, "grad_norm": 6.801077842712402, "learning_rate": 8.516666666666668e-06, "loss": 0.9454, "step": 4670 }, { "epoch": 0.46, "grad_norm": 6.386157512664795, "learning_rate": 8.511111111111113e-06, "loss": 0.9573, "step": 4680 }, { "epoch": 0.46, "grad_norm": 6.361467361450195, "learning_rate": 8.505555555555556e-06, "loss": 0.9575, "step": 4690 }, { "epoch": 0.46, "grad_norm": 6.71563720703125, "learning_rate": 8.5e-06, "loss": 0.9294, "step": 4700 }, { "epoch": 0.46, "grad_norm": 6.786048889160156, "learning_rate": 8.494444444444445e-06, "loss": 0.9557, "step": 4710 }, { "epoch": 0.46, "grad_norm": 6.923149585723877, "learning_rate": 8.48888888888889e-06, "loss": 0.9748, "step": 4720 }, { "epoch": 0.46, "grad_norm": 7.03141450881958, "learning_rate": 8.483333333333334e-06, "loss": 0.9741, "step": 4730 }, { "epoch": 0.46, "grad_norm": 6.623723983764648, "learning_rate": 8.477777777777778e-06, "loss": 0.96, "step": 4740 }, { "epoch": 0.47, "grad_norm": 6.4162821769714355, "learning_rate": 8.472222222222223e-06, "loss": 0.9803, "step": 4750 }, { "epoch": 0.47, "grad_norm": 6.626855373382568, "learning_rate": 8.466666666666668e-06, "loss": 0.9335, "step": 4760 }, { "epoch": 0.47, "grad_norm": 7.205950736999512, "learning_rate": 8.461111111111112e-06, "loss": 0.9514, "step": 4770 }, { "epoch": 0.47, "grad_norm": 6.924039363861084, "learning_rate": 8.455555555555555e-06, "loss": 0.9891, "step": 4780 }, { "epoch": 0.47, "grad_norm": 5.932241439819336, "learning_rate": 8.45e-06, "loss": 0.9616, "step": 4790 }, { "epoch": 0.47, "grad_norm": 6.703830242156982, "learning_rate": 8.444444444444446e-06, "loss": 0.9556, "step": 4800 }, { "epoch": 0.47, "grad_norm": 6.727988243103027, "learning_rate": 8.43888888888889e-06, "loss": 0.9583, "step": 4810 }, { "epoch": 0.47, "grad_norm": 6.906303882598877, "learning_rate": 8.433333333333334e-06, "loss": 0.976, "step": 4820 }, { "epoch": 0.47, "grad_norm": 6.180042266845703, "learning_rate": 8.427777777777778e-06, "loss": 0.9357, "step": 4830 }, { "epoch": 0.47, "grad_norm": 6.648016452789307, "learning_rate": 8.422222222222223e-06, "loss": 0.9484, "step": 4840 }, { "epoch": 0.48, "grad_norm": 7.258022308349609, "learning_rate": 8.416666666666667e-06, "loss": 0.9585, "step": 4850 }, { "epoch": 0.48, "grad_norm": 7.872158527374268, "learning_rate": 8.411111111111112e-06, "loss": 0.9878, "step": 4860 }, { "epoch": 0.48, "grad_norm": 6.172829627990723, "learning_rate": 8.405555555555556e-06, "loss": 0.948, "step": 4870 }, { "epoch": 0.48, "grad_norm": 6.47758150100708, "learning_rate": 8.400000000000001e-06, "loss": 0.9613, "step": 4880 }, { "epoch": 0.48, "grad_norm": 7.681306838989258, "learning_rate": 8.394444444444444e-06, "loss": 0.9434, "step": 4890 }, { "epoch": 0.48, "grad_norm": 7.308563709259033, "learning_rate": 8.38888888888889e-06, "loss": 0.9454, "step": 4900 }, { "epoch": 0.48, "grad_norm": 8.226972579956055, "learning_rate": 8.383333333333335e-06, "loss": 0.9798, "step": 4910 }, { "epoch": 0.48, "grad_norm": 6.787251949310303, "learning_rate": 8.377777777777779e-06, "loss": 0.9642, "step": 4920 }, { "epoch": 0.48, "grad_norm": 6.76749324798584, "learning_rate": 8.372222222222222e-06, "loss": 0.9512, "step": 4930 }, { "epoch": 0.48, "grad_norm": 5.852420806884766, "learning_rate": 8.366666666666667e-06, "loss": 0.9573, "step": 4940 }, { "epoch": 0.49, "grad_norm": 6.435279369354248, "learning_rate": 8.361111111111113e-06, "loss": 0.9473, "step": 4950 }, { "epoch": 0.49, "grad_norm": 6.477540016174316, "learning_rate": 8.355555555555556e-06, "loss": 0.9733, "step": 4960 }, { "epoch": 0.49, "grad_norm": 7.102156639099121, "learning_rate": 8.35e-06, "loss": 0.9471, "step": 4970 }, { "epoch": 0.49, "grad_norm": 6.519305229187012, "learning_rate": 8.344444444444445e-06, "loss": 0.9606, "step": 4980 }, { "epoch": 0.49, "grad_norm": 6.12736177444458, "learning_rate": 8.33888888888889e-06, "loss": 0.9293, "step": 4990 }, { "epoch": 0.49, "grad_norm": 6.436068058013916, "learning_rate": 8.333333333333334e-06, "loss": 0.9492, "step": 5000 }, { "epoch": 0.49, "grad_norm": 6.453101634979248, "learning_rate": 8.327777777777777e-06, "loss": 0.9594, "step": 5010 }, { "epoch": 0.49, "grad_norm": 6.643795013427734, "learning_rate": 8.322222222222223e-06, "loss": 0.9494, "step": 5020 }, { "epoch": 0.49, "grad_norm": 7.121280670166016, "learning_rate": 8.316666666666668e-06, "loss": 0.9595, "step": 5030 }, { "epoch": 0.49, "grad_norm": 6.966318130493164, "learning_rate": 8.311111111111111e-06, "loss": 0.9447, "step": 5040 }, { "epoch": 0.49, "grad_norm": 6.52246618270874, "learning_rate": 8.305555555555557e-06, "loss": 0.9666, "step": 5050 }, { "epoch": 0.5, "grad_norm": 6.333890914916992, "learning_rate": 8.3e-06, "loss": 0.965, "step": 5060 }, { "epoch": 0.5, "grad_norm": 6.743497848510742, "learning_rate": 8.294444444444445e-06, "loss": 0.9646, "step": 5070 }, { "epoch": 0.5, "grad_norm": 6.774175643920898, "learning_rate": 8.288888888888889e-06, "loss": 0.9571, "step": 5080 }, { "epoch": 0.5, "grad_norm": 6.474306583404541, "learning_rate": 8.283333333333334e-06, "loss": 0.9764, "step": 5090 }, { "epoch": 0.5, "grad_norm": 7.056856632232666, "learning_rate": 8.277777777777778e-06, "loss": 0.9505, "step": 5100 }, { "epoch": 0.5, "grad_norm": 7.353367328643799, "learning_rate": 8.272222222222223e-06, "loss": 0.9457, "step": 5110 }, { "epoch": 0.5, "grad_norm": 7.850890636444092, "learning_rate": 8.266666666666667e-06, "loss": 0.9496, "step": 5120 }, { "epoch": 0.5, "grad_norm": 6.819700241088867, "learning_rate": 8.261111111111112e-06, "loss": 0.9527, "step": 5130 }, { "epoch": 0.5, "grad_norm": 6.376556873321533, "learning_rate": 8.255555555555557e-06, "loss": 0.9247, "step": 5140 }, { "epoch": 0.5, "grad_norm": 6.3290510177612305, "learning_rate": 8.25e-06, "loss": 0.9584, "step": 5150 }, { "epoch": 0.51, "grad_norm": 6.998957633972168, "learning_rate": 8.244444444444444e-06, "loss": 0.9799, "step": 5160 }, { "epoch": 0.51, "grad_norm": 7.610361099243164, "learning_rate": 8.23888888888889e-06, "loss": 0.968, "step": 5170 }, { "epoch": 0.51, "grad_norm": 6.03651237487793, "learning_rate": 8.233333333333335e-06, "loss": 0.9393, "step": 5180 }, { "epoch": 0.51, "grad_norm": 6.198638916015625, "learning_rate": 8.227777777777778e-06, "loss": 0.9525, "step": 5190 }, { "epoch": 0.51, "grad_norm": 6.9260711669921875, "learning_rate": 8.222222222222222e-06, "loss": 0.9409, "step": 5200 }, { "epoch": 0.51, "grad_norm": 6.872012138366699, "learning_rate": 8.216666666666667e-06, "loss": 0.9576, "step": 5210 }, { "epoch": 0.51, "grad_norm": 6.457302093505859, "learning_rate": 8.211111111111112e-06, "loss": 0.9473, "step": 5220 }, { "epoch": 0.51, "grad_norm": 6.2500691413879395, "learning_rate": 8.205555555555556e-06, "loss": 0.9581, "step": 5230 }, { "epoch": 0.51, "grad_norm": 6.4951934814453125, "learning_rate": 8.2e-06, "loss": 0.9361, "step": 5240 }, { "epoch": 0.51, "grad_norm": 6.518310546875, "learning_rate": 8.194444444444445e-06, "loss": 0.9788, "step": 5250 }, { "epoch": 0.52, "grad_norm": 7.069422245025635, "learning_rate": 8.18888888888889e-06, "loss": 0.9632, "step": 5260 }, { "epoch": 0.52, "grad_norm": 6.948794841766357, "learning_rate": 8.183333333333333e-06, "loss": 0.9532, "step": 5270 }, { "epoch": 0.52, "grad_norm": 6.65959358215332, "learning_rate": 8.177777777777779e-06, "loss": 0.9715, "step": 5280 }, { "epoch": 0.52, "grad_norm": 6.53006649017334, "learning_rate": 8.172222222222222e-06, "loss": 0.9419, "step": 5290 }, { "epoch": 0.52, "grad_norm": 6.356775760650635, "learning_rate": 8.166666666666668e-06, "loss": 0.942, "step": 5300 }, { "epoch": 0.52, "grad_norm": 6.82747220993042, "learning_rate": 8.161111111111111e-06, "loss": 0.97, "step": 5310 }, { "epoch": 0.52, "grad_norm": 7.198683738708496, "learning_rate": 8.155555555555556e-06, "loss": 0.9674, "step": 5320 }, { "epoch": 0.52, "grad_norm": 6.466299057006836, "learning_rate": 8.15e-06, "loss": 0.9515, "step": 5330 }, { "epoch": 0.52, "grad_norm": 6.114297389984131, "learning_rate": 8.144444444444445e-06, "loss": 0.9507, "step": 5340 }, { "epoch": 0.52, "grad_norm": 7.608702182769775, "learning_rate": 8.138888888888889e-06, "loss": 0.9315, "step": 5350 }, { "epoch": 0.53, "grad_norm": 6.639521598815918, "learning_rate": 8.133333333333334e-06, "loss": 0.9739, "step": 5360 }, { "epoch": 0.53, "grad_norm": 6.743986129760742, "learning_rate": 8.12777777777778e-06, "loss": 0.9355, "step": 5370 }, { "epoch": 0.53, "grad_norm": 6.91831636428833, "learning_rate": 8.122222222222223e-06, "loss": 0.9311, "step": 5380 }, { "epoch": 0.53, "grad_norm": 5.80686092376709, "learning_rate": 8.116666666666666e-06, "loss": 0.9757, "step": 5390 }, { "epoch": 0.53, "grad_norm": 7.458464622497559, "learning_rate": 8.111111111111112e-06, "loss": 0.9389, "step": 5400 }, { "epoch": 0.53, "grad_norm": 6.649440765380859, "learning_rate": 8.105555555555557e-06, "loss": 0.9666, "step": 5410 }, { "epoch": 0.53, "grad_norm": 6.171680927276611, "learning_rate": 8.1e-06, "loss": 0.9412, "step": 5420 }, { "epoch": 0.53, "grad_norm": 7.853817939758301, "learning_rate": 8.094444444444444e-06, "loss": 0.9758, "step": 5430 }, { "epoch": 0.53, "grad_norm": 7.073939800262451, "learning_rate": 8.08888888888889e-06, "loss": 0.9528, "step": 5440 }, { "epoch": 0.53, "grad_norm": 6.701811790466309, "learning_rate": 8.083333333333334e-06, "loss": 0.9096, "step": 5450 }, { "epoch": 0.54, "grad_norm": 6.306342601776123, "learning_rate": 8.077777777777778e-06, "loss": 0.956, "step": 5460 }, { "epoch": 0.54, "grad_norm": 6.861067295074463, "learning_rate": 8.072222222222223e-06, "loss": 0.9392, "step": 5470 }, { "epoch": 0.54, "grad_norm": 7.274597644805908, "learning_rate": 8.066666666666667e-06, "loss": 0.9647, "step": 5480 }, { "epoch": 0.54, "grad_norm": 9.951617240905762, "learning_rate": 8.061111111111112e-06, "loss": 0.9313, "step": 5490 }, { "epoch": 0.54, "grad_norm": 7.4188642501831055, "learning_rate": 8.055555555555557e-06, "loss": 0.9512, "step": 5500 }, { "epoch": 0.54, "grad_norm": 7.226146221160889, "learning_rate": 8.050000000000001e-06, "loss": 0.9632, "step": 5510 }, { "epoch": 0.54, "grad_norm": 6.792093276977539, "learning_rate": 8.044444444444444e-06, "loss": 0.9553, "step": 5520 }, { "epoch": 0.54, "grad_norm": 7.422070026397705, "learning_rate": 8.03888888888889e-06, "loss": 0.964, "step": 5530 }, { "epoch": 0.54, "grad_norm": 6.375083923339844, "learning_rate": 8.033333333333335e-06, "loss": 0.9472, "step": 5540 }, { "epoch": 0.54, "grad_norm": 7.314244270324707, "learning_rate": 8.027777777777778e-06, "loss": 0.9567, "step": 5550 }, { "epoch": 0.54, "grad_norm": 5.61725378036499, "learning_rate": 8.022222222222222e-06, "loss": 0.9525, "step": 5560 }, { "epoch": 0.55, "grad_norm": 5.94066858291626, "learning_rate": 8.016666666666667e-06, "loss": 0.9419, "step": 5570 }, { "epoch": 0.55, "grad_norm": 6.774581432342529, "learning_rate": 8.011111111111113e-06, "loss": 0.9514, "step": 5580 }, { "epoch": 0.55, "grad_norm": 6.012673854827881, "learning_rate": 8.005555555555556e-06, "loss": 0.9551, "step": 5590 }, { "epoch": 0.55, "grad_norm": 6.662960529327393, "learning_rate": 8.000000000000001e-06, "loss": 0.95, "step": 5600 }, { "epoch": 0.55, "grad_norm": 6.887300491333008, "learning_rate": 7.994444444444445e-06, "loss": 0.9601, "step": 5610 }, { "epoch": 0.55, "grad_norm": 6.807535171508789, "learning_rate": 7.98888888888889e-06, "loss": 0.9353, "step": 5620 }, { "epoch": 0.55, "grad_norm": 5.879833698272705, "learning_rate": 7.983333333333334e-06, "loss": 0.9424, "step": 5630 }, { "epoch": 0.55, "grad_norm": 5.445749759674072, "learning_rate": 7.977777777777779e-06, "loss": 0.9279, "step": 5640 }, { "epoch": 0.55, "grad_norm": 7.1856489181518555, "learning_rate": 7.972222222222224e-06, "loss": 0.9495, "step": 5650 }, { "epoch": 0.55, "grad_norm": 10.796788215637207, "learning_rate": 7.966666666666668e-06, "loss": 0.9511, "step": 5660 }, { "epoch": 0.56, "grad_norm": 6.877364635467529, "learning_rate": 7.961111111111111e-06, "loss": 0.9288, "step": 5670 }, { "epoch": 0.56, "grad_norm": 7.3206706047058105, "learning_rate": 7.955555555555557e-06, "loss": 0.9173, "step": 5680 }, { "epoch": 0.56, "grad_norm": 7.0967230796813965, "learning_rate": 7.950000000000002e-06, "loss": 0.9425, "step": 5690 }, { "epoch": 0.56, "grad_norm": 7.142643451690674, "learning_rate": 7.944444444444445e-06, "loss": 0.9601, "step": 5700 }, { "epoch": 0.56, "grad_norm": 7.465871334075928, "learning_rate": 7.938888888888889e-06, "loss": 0.9481, "step": 5710 }, { "epoch": 0.56, "grad_norm": 6.544083595275879, "learning_rate": 7.933333333333334e-06, "loss": 0.9538, "step": 5720 }, { "epoch": 0.56, "grad_norm": 7.09990119934082, "learning_rate": 7.92777777777778e-06, "loss": 0.9493, "step": 5730 }, { "epoch": 0.56, "grad_norm": 6.8810343742370605, "learning_rate": 7.922222222222223e-06, "loss": 0.929, "step": 5740 }, { "epoch": 0.56, "grad_norm": 5.756643295288086, "learning_rate": 7.916666666666667e-06, "loss": 0.9242, "step": 5750 }, { "epoch": 0.56, "grad_norm": 6.920449256896973, "learning_rate": 7.911111111111112e-06, "loss": 0.9316, "step": 5760 }, { "epoch": 0.57, "grad_norm": 6.3990397453308105, "learning_rate": 7.905555555555557e-06, "loss": 0.9495, "step": 5770 }, { "epoch": 0.57, "grad_norm": 6.080567359924316, "learning_rate": 7.9e-06, "loss": 0.9601, "step": 5780 }, { "epoch": 0.57, "grad_norm": 7.239852428436279, "learning_rate": 7.894444444444444e-06, "loss": 0.939, "step": 5790 }, { "epoch": 0.57, "grad_norm": 40.539276123046875, "learning_rate": 7.88888888888889e-06, "loss": 0.9413, "step": 5800 }, { "epoch": 0.57, "grad_norm": 6.196249008178711, "learning_rate": 7.883333333333335e-06, "loss": 0.9474, "step": 5810 }, { "epoch": 0.57, "grad_norm": 6.26318359375, "learning_rate": 7.877777777777778e-06, "loss": 0.931, "step": 5820 }, { "epoch": 0.57, "grad_norm": 5.942195415496826, "learning_rate": 7.872222222222223e-06, "loss": 0.9512, "step": 5830 }, { "epoch": 0.57, "grad_norm": 7.007154941558838, "learning_rate": 7.866666666666667e-06, "loss": 0.9607, "step": 5840 }, { "epoch": 0.57, "grad_norm": 7.148138046264648, "learning_rate": 7.861111111111112e-06, "loss": 0.9436, "step": 5850 }, { "epoch": 0.57, "grad_norm": 6.078429222106934, "learning_rate": 7.855555555555556e-06, "loss": 0.9379, "step": 5860 }, { "epoch": 0.58, "grad_norm": 7.125260829925537, "learning_rate": 7.850000000000001e-06, "loss": 0.969, "step": 5870 }, { "epoch": 0.58, "grad_norm": 7.715258598327637, "learning_rate": 7.844444444444446e-06, "loss": 0.9558, "step": 5880 }, { "epoch": 0.58, "grad_norm": 7.450591564178467, "learning_rate": 7.83888888888889e-06, "loss": 0.9344, "step": 5890 }, { "epoch": 0.58, "grad_norm": 6.3936991691589355, "learning_rate": 7.833333333333333e-06, "loss": 0.9111, "step": 5900 }, { "epoch": 0.58, "grad_norm": 7.4788498878479, "learning_rate": 7.827777777777779e-06, "loss": 0.9245, "step": 5910 }, { "epoch": 0.58, "grad_norm": 6.922938823699951, "learning_rate": 7.822222222222224e-06, "loss": 0.9267, "step": 5920 }, { "epoch": 0.58, "grad_norm": 6.613411903381348, "learning_rate": 7.816666666666667e-06, "loss": 0.9514, "step": 5930 }, { "epoch": 0.58, "grad_norm": 6.157261848449707, "learning_rate": 7.811111111111111e-06, "loss": 0.9128, "step": 5940 }, { "epoch": 0.58, "grad_norm": 6.349913597106934, "learning_rate": 7.805555555555556e-06, "loss": 0.9383, "step": 5950 }, { "epoch": 0.58, "grad_norm": 8.247715950012207, "learning_rate": 7.800000000000002e-06, "loss": 0.9452, "step": 5960 }, { "epoch": 0.59, "grad_norm": 6.12860631942749, "learning_rate": 7.794444444444445e-06, "loss": 0.9334, "step": 5970 }, { "epoch": 0.59, "grad_norm": 7.1938018798828125, "learning_rate": 7.788888888888889e-06, "loss": 0.9237, "step": 5980 }, { "epoch": 0.59, "grad_norm": 6.232997417449951, "learning_rate": 7.783333333333334e-06, "loss": 0.9175, "step": 5990 }, { "epoch": 0.59, "grad_norm": 8.434849739074707, "learning_rate": 7.77777777777778e-06, "loss": 0.9542, "step": 6000 }, { "epoch": 0.59, "eval_loss": 0.9280689358711243, "eval_runtime": 25.0759, "eval_samples_per_second": 26.081, "eval_steps_per_second": 3.27, "step": 6000 }, { "epoch": 0.59, "grad_norm": 5.511234283447266, "learning_rate": 7.772222222222223e-06, "loss": 0.9465, "step": 6010 }, { "epoch": 0.59, "grad_norm": 6.701432228088379, "learning_rate": 7.766666666666666e-06, "loss": 0.9288, "step": 6020 }, { "epoch": 0.59, "grad_norm": 6.576080799102783, "learning_rate": 7.761111111111112e-06, "loss": 0.9271, "step": 6030 }, { "epoch": 0.59, "grad_norm": 7.5846076011657715, "learning_rate": 7.755555555555557e-06, "loss": 0.9526, "step": 6040 }, { "epoch": 0.59, "grad_norm": 6.835690498352051, "learning_rate": 7.75e-06, "loss": 0.9342, "step": 6050 }, { "epoch": 0.59, "grad_norm": 7.263296604156494, "learning_rate": 7.744444444444446e-06, "loss": 0.9513, "step": 6060 }, { "epoch": 0.59, "grad_norm": 6.679274082183838, "learning_rate": 7.738888888888889e-06, "loss": 0.9173, "step": 6070 }, { "epoch": 0.6, "grad_norm": 7.330471515655518, "learning_rate": 7.733333333333334e-06, "loss": 0.9424, "step": 6080 }, { "epoch": 0.6, "grad_norm": 6.464056968688965, "learning_rate": 7.727777777777778e-06, "loss": 0.9463, "step": 6090 }, { "epoch": 0.6, "grad_norm": 6.972497940063477, "learning_rate": 7.722222222222223e-06, "loss": 0.9309, "step": 6100 }, { "epoch": 0.6, "grad_norm": 6.497080326080322, "learning_rate": 7.716666666666667e-06, "loss": 0.9319, "step": 6110 }, { "epoch": 0.6, "grad_norm": 6.947892665863037, "learning_rate": 7.711111111111112e-06, "loss": 0.9375, "step": 6120 }, { "epoch": 0.6, "grad_norm": 7.431771278381348, "learning_rate": 7.705555555555556e-06, "loss": 0.9328, "step": 6130 }, { "epoch": 0.6, "grad_norm": 6.458249092102051, "learning_rate": 7.7e-06, "loss": 0.9129, "step": 6140 }, { "epoch": 0.6, "grad_norm": 6.600308895111084, "learning_rate": 7.694444444444446e-06, "loss": 0.9387, "step": 6150 }, { "epoch": 0.6, "grad_norm": 7.697118282318115, "learning_rate": 7.68888888888889e-06, "loss": 0.9466, "step": 6160 }, { "epoch": 0.6, "grad_norm": 6.152834415435791, "learning_rate": 7.683333333333333e-06, "loss": 0.9577, "step": 6170 }, { "epoch": 0.61, "grad_norm": 6.656818866729736, "learning_rate": 7.677777777777778e-06, "loss": 0.9296, "step": 6180 }, { "epoch": 0.61, "grad_norm": 7.15354061126709, "learning_rate": 7.672222222222224e-06, "loss": 0.9166, "step": 6190 }, { "epoch": 0.61, "grad_norm": 5.422146797180176, "learning_rate": 7.666666666666667e-06, "loss": 0.9216, "step": 6200 }, { "epoch": 0.61, "grad_norm": 6.80369758605957, "learning_rate": 7.66111111111111e-06, "loss": 0.9161, "step": 6210 }, { "epoch": 0.61, "grad_norm": 6.492003917694092, "learning_rate": 7.655555555555556e-06, "loss": 0.9074, "step": 6220 }, { "epoch": 0.61, "grad_norm": 6.364880561828613, "learning_rate": 7.650000000000001e-06, "loss": 0.9191, "step": 6230 }, { "epoch": 0.61, "grad_norm": 6.899060249328613, "learning_rate": 7.644444444444445e-06, "loss": 0.9527, "step": 6240 }, { "epoch": 0.61, "grad_norm": 6.892459392547607, "learning_rate": 7.638888888888888e-06, "loss": 0.9182, "step": 6250 }, { "epoch": 0.61, "grad_norm": 6.262991428375244, "learning_rate": 7.633333333333334e-06, "loss": 0.9241, "step": 6260 }, { "epoch": 0.61, "grad_norm": 6.4501543045043945, "learning_rate": 7.627777777777778e-06, "loss": 0.9168, "step": 6270 }, { "epoch": 0.62, "grad_norm": 6.077027797698975, "learning_rate": 7.622222222222223e-06, "loss": 0.9388, "step": 6280 }, { "epoch": 0.62, "grad_norm": 8.005155563354492, "learning_rate": 7.616666666666668e-06, "loss": 0.9466, "step": 6290 }, { "epoch": 0.62, "grad_norm": 7.830831050872803, "learning_rate": 7.611111111111111e-06, "loss": 0.9506, "step": 6300 }, { "epoch": 0.62, "grad_norm": 6.98394775390625, "learning_rate": 7.605555555555556e-06, "loss": 0.9421, "step": 6310 }, { "epoch": 0.62, "grad_norm": 5.755524635314941, "learning_rate": 7.600000000000001e-06, "loss": 0.931, "step": 6320 }, { "epoch": 0.62, "grad_norm": 5.994245529174805, "learning_rate": 7.594444444444445e-06, "loss": 0.9331, "step": 6330 }, { "epoch": 0.62, "grad_norm": 5.818086624145508, "learning_rate": 7.588888888888889e-06, "loss": 0.943, "step": 6340 }, { "epoch": 0.62, "grad_norm": 5.371914386749268, "learning_rate": 7.583333333333333e-06, "loss": 0.9583, "step": 6350 }, { "epoch": 0.62, "grad_norm": 6.878881454467773, "learning_rate": 7.5777777777777785e-06, "loss": 0.9431, "step": 6360 }, { "epoch": 0.62, "grad_norm": 7.436302661895752, "learning_rate": 7.572222222222223e-06, "loss": 0.9288, "step": 6370 }, { "epoch": 0.63, "grad_norm": 7.161762237548828, "learning_rate": 7.566666666666667e-06, "loss": 0.9483, "step": 6380 }, { "epoch": 0.63, "grad_norm": 7.268876075744629, "learning_rate": 7.561111111111111e-06, "loss": 0.925, "step": 6390 }, { "epoch": 0.63, "grad_norm": 6.818887233734131, "learning_rate": 7.555555555555556e-06, "loss": 0.9185, "step": 6400 }, { "epoch": 0.63, "grad_norm": 7.29956579208374, "learning_rate": 7.5500000000000006e-06, "loss": 0.9336, "step": 6410 }, { "epoch": 0.63, "grad_norm": 6.289857864379883, "learning_rate": 7.544444444444445e-06, "loss": 0.9614, "step": 6420 }, { "epoch": 0.63, "grad_norm": 6.578990936279297, "learning_rate": 7.53888888888889e-06, "loss": 0.9562, "step": 6430 }, { "epoch": 0.63, "grad_norm": 7.141304969787598, "learning_rate": 7.533333333333334e-06, "loss": 0.9226, "step": 6440 }, { "epoch": 0.63, "grad_norm": 5.9252519607543945, "learning_rate": 7.527777777777778e-06, "loss": 0.9375, "step": 6450 }, { "epoch": 0.63, "grad_norm": 7.16292142868042, "learning_rate": 7.5222222222222226e-06, "loss": 0.9097, "step": 6460 }, { "epoch": 0.63, "grad_norm": 7.047999858856201, "learning_rate": 7.516666666666668e-06, "loss": 0.9051, "step": 6470 }, { "epoch": 0.64, "grad_norm": 7.577090740203857, "learning_rate": 7.511111111111111e-06, "loss": 0.9397, "step": 6480 }, { "epoch": 0.64, "grad_norm": 6.7867865562438965, "learning_rate": 7.505555555555556e-06, "loss": 0.9127, "step": 6490 }, { "epoch": 0.64, "grad_norm": 6.417243003845215, "learning_rate": 7.500000000000001e-06, "loss": 0.9545, "step": 6500 }, { "epoch": 0.64, "grad_norm": 7.113065719604492, "learning_rate": 7.4944444444444454e-06, "loss": 0.9239, "step": 6510 }, { "epoch": 0.64, "grad_norm": 5.6416401863098145, "learning_rate": 7.48888888888889e-06, "loss": 0.9207, "step": 6520 }, { "epoch": 0.64, "grad_norm": 6.47081184387207, "learning_rate": 7.483333333333333e-06, "loss": 0.9188, "step": 6530 }, { "epoch": 0.64, "grad_norm": 5.9870123863220215, "learning_rate": 7.477777777777779e-06, "loss": 0.9177, "step": 6540 }, { "epoch": 0.64, "grad_norm": 6.798870086669922, "learning_rate": 7.472222222222223e-06, "loss": 0.932, "step": 6550 }, { "epoch": 0.64, "grad_norm": 6.849503040313721, "learning_rate": 7.4666666666666675e-06, "loss": 0.9253, "step": 6560 }, { "epoch": 0.64, "grad_norm": 7.435683250427246, "learning_rate": 7.461111111111111e-06, "loss": 0.9195, "step": 6570 }, { "epoch": 0.64, "grad_norm": 6.82542610168457, "learning_rate": 7.455555555555556e-06, "loss": 0.9225, "step": 6580 }, { "epoch": 0.65, "grad_norm": 6.897842884063721, "learning_rate": 7.450000000000001e-06, "loss": 0.9308, "step": 6590 }, { "epoch": 0.65, "grad_norm": 7.483247756958008, "learning_rate": 7.444444444444445e-06, "loss": 0.9464, "step": 6600 }, { "epoch": 0.65, "grad_norm": 7.201714038848877, "learning_rate": 7.43888888888889e-06, "loss": 0.9275, "step": 6610 }, { "epoch": 0.65, "grad_norm": 7.582096576690674, "learning_rate": 7.433333333333334e-06, "loss": 0.9073, "step": 6620 }, { "epoch": 0.65, "grad_norm": 6.894292831420898, "learning_rate": 7.427777777777778e-06, "loss": 0.905, "step": 6630 }, { "epoch": 0.65, "grad_norm": 6.390003681182861, "learning_rate": 7.422222222222223e-06, "loss": 0.9051, "step": 6640 }, { "epoch": 0.65, "grad_norm": 6.3859968185424805, "learning_rate": 7.416666666666668e-06, "loss": 0.9076, "step": 6650 }, { "epoch": 0.65, "grad_norm": 6.695901393890381, "learning_rate": 7.411111111111112e-06, "loss": 0.9375, "step": 6660 }, { "epoch": 0.65, "grad_norm": 8.029141426086426, "learning_rate": 7.405555555555556e-06, "loss": 0.9131, "step": 6670 }, { "epoch": 0.65, "grad_norm": 7.5984978675842285, "learning_rate": 7.4e-06, "loss": 0.9476, "step": 6680 }, { "epoch": 0.66, "grad_norm": 8.214203834533691, "learning_rate": 7.3944444444444456e-06, "loss": 0.9541, "step": 6690 }, { "epoch": 0.66, "grad_norm": 5.453817367553711, "learning_rate": 7.38888888888889e-06, "loss": 0.9125, "step": 6700 }, { "epoch": 0.66, "grad_norm": 7.748270034790039, "learning_rate": 7.3833333333333335e-06, "loss": 0.9274, "step": 6710 }, { "epoch": 0.66, "grad_norm": 6.21115779876709, "learning_rate": 7.377777777777778e-06, "loss": 0.9337, "step": 6720 }, { "epoch": 0.66, "grad_norm": 6.447852611541748, "learning_rate": 7.372222222222223e-06, "loss": 0.9242, "step": 6730 }, { "epoch": 0.66, "grad_norm": 5.615070343017578, "learning_rate": 7.3666666666666676e-06, "loss": 0.9103, "step": 6740 }, { "epoch": 0.66, "grad_norm": 6.468333721160889, "learning_rate": 7.361111111111112e-06, "loss": 0.8996, "step": 6750 }, { "epoch": 0.66, "grad_norm": 6.770966053009033, "learning_rate": 7.3555555555555555e-06, "loss": 0.9197, "step": 6760 }, { "epoch": 0.66, "grad_norm": 7.449739456176758, "learning_rate": 7.350000000000001e-06, "loss": 0.9105, "step": 6770 }, { "epoch": 0.66, "grad_norm": 7.388159275054932, "learning_rate": 7.344444444444445e-06, "loss": 0.895, "step": 6780 }, { "epoch": 0.67, "grad_norm": 6.2874650955200195, "learning_rate": 7.33888888888889e-06, "loss": 0.9282, "step": 6790 }, { "epoch": 0.67, "grad_norm": 5.6335673332214355, "learning_rate": 7.333333333333333e-06, "loss": 0.9181, "step": 6800 }, { "epoch": 0.67, "grad_norm": 6.593587398529053, "learning_rate": 7.327777777777778e-06, "loss": 0.9459, "step": 6810 }, { "epoch": 0.67, "grad_norm": 6.855390548706055, "learning_rate": 7.322222222222223e-06, "loss": 0.914, "step": 6820 }, { "epoch": 0.67, "grad_norm": 7.27256441116333, "learning_rate": 7.316666666666667e-06, "loss": 0.9179, "step": 6830 }, { "epoch": 0.67, "grad_norm": 6.774872779846191, "learning_rate": 7.3111111111111125e-06, "loss": 0.9162, "step": 6840 }, { "epoch": 0.67, "grad_norm": 7.510383129119873, "learning_rate": 7.305555555555556e-06, "loss": 0.921, "step": 6850 }, { "epoch": 0.67, "grad_norm": 7.852698802947998, "learning_rate": 7.3e-06, "loss": 0.9338, "step": 6860 }, { "epoch": 0.67, "grad_norm": 6.857759952545166, "learning_rate": 7.294444444444445e-06, "loss": 0.9186, "step": 6870 }, { "epoch": 0.67, "grad_norm": 7.018872261047363, "learning_rate": 7.28888888888889e-06, "loss": 0.934, "step": 6880 }, { "epoch": 0.68, "grad_norm": 5.917805194854736, "learning_rate": 7.2833333333333345e-06, "loss": 0.9408, "step": 6890 }, { "epoch": 0.68, "grad_norm": 7.299219131469727, "learning_rate": 7.277777777777778e-06, "loss": 0.9353, "step": 6900 }, { "epoch": 0.68, "grad_norm": 8.337811470031738, "learning_rate": 7.2722222222222224e-06, "loss": 0.8996, "step": 6910 }, { "epoch": 0.68, "grad_norm": 5.5630364418029785, "learning_rate": 7.266666666666668e-06, "loss": 0.9299, "step": 6920 }, { "epoch": 0.68, "grad_norm": 7.0643415451049805, "learning_rate": 7.261111111111112e-06, "loss": 0.9188, "step": 6930 }, { "epoch": 0.68, "grad_norm": 6.577130317687988, "learning_rate": 7.255555555555556e-06, "loss": 0.9055, "step": 6940 }, { "epoch": 0.68, "grad_norm": 7.097878932952881, "learning_rate": 7.25e-06, "loss": 0.9129, "step": 6950 }, { "epoch": 0.68, "grad_norm": 6.687019348144531, "learning_rate": 7.244444444444445e-06, "loss": 0.9166, "step": 6960 }, { "epoch": 0.68, "grad_norm": 6.685943603515625, "learning_rate": 7.23888888888889e-06, "loss": 0.93, "step": 6970 }, { "epoch": 0.68, "grad_norm": 6.5908589363098145, "learning_rate": 7.233333333333334e-06, "loss": 0.917, "step": 6980 }, { "epoch": 0.69, "grad_norm": 7.181557655334473, "learning_rate": 7.227777777777778e-06, "loss": 0.9159, "step": 6990 }, { "epoch": 0.69, "grad_norm": 7.442741394042969, "learning_rate": 7.222222222222223e-06, "loss": 0.9367, "step": 7000 }, { "epoch": 0.69, "grad_norm": 7.4278950691223145, "learning_rate": 7.216666666666667e-06, "loss": 0.9143, "step": 7010 }, { "epoch": 0.69, "grad_norm": 6.39820671081543, "learning_rate": 7.211111111111112e-06, "loss": 0.9117, "step": 7020 }, { "epoch": 0.69, "grad_norm": 7.562165260314941, "learning_rate": 7.205555555555555e-06, "loss": 0.9415, "step": 7030 }, { "epoch": 0.69, "grad_norm": 7.379988670349121, "learning_rate": 7.2000000000000005e-06, "loss": 0.9171, "step": 7040 }, { "epoch": 0.69, "grad_norm": 7.480957984924316, "learning_rate": 7.194444444444445e-06, "loss": 0.9165, "step": 7050 }, { "epoch": 0.69, "grad_norm": 7.023442268371582, "learning_rate": 7.188888888888889e-06, "loss": 0.8952, "step": 7060 }, { "epoch": 0.69, "grad_norm": 7.537672996520996, "learning_rate": 7.183333333333335e-06, "loss": 0.9274, "step": 7070 }, { "epoch": 0.69, "grad_norm": 7.334287643432617, "learning_rate": 7.177777777777778e-06, "loss": 0.9026, "step": 7080 }, { "epoch": 0.69, "grad_norm": 7.228966236114502, "learning_rate": 7.1722222222222225e-06, "loss": 0.8923, "step": 7090 }, { "epoch": 0.7, "grad_norm": 6.3140997886657715, "learning_rate": 7.166666666666667e-06, "loss": 0.9543, "step": 7100 }, { "epoch": 0.7, "grad_norm": 7.3922505378723145, "learning_rate": 7.161111111111112e-06, "loss": 0.9031, "step": 7110 }, { "epoch": 0.7, "grad_norm": 6.066789150238037, "learning_rate": 7.155555555555556e-06, "loss": 0.906, "step": 7120 }, { "epoch": 0.7, "grad_norm": 6.609281063079834, "learning_rate": 7.15e-06, "loss": 0.9198, "step": 7130 }, { "epoch": 0.7, "grad_norm": 7.062366485595703, "learning_rate": 7.1444444444444446e-06, "loss": 0.8958, "step": 7140 }, { "epoch": 0.7, "grad_norm": 6.791074752807617, "learning_rate": 7.13888888888889e-06, "loss": 0.8997, "step": 7150 }, { "epoch": 0.7, "grad_norm": 6.277819633483887, "learning_rate": 7.133333333333334e-06, "loss": 0.9067, "step": 7160 }, { "epoch": 0.7, "grad_norm": 7.178339004516602, "learning_rate": 7.127777777777778e-06, "loss": 0.8923, "step": 7170 }, { "epoch": 0.7, "grad_norm": 7.200781345367432, "learning_rate": 7.122222222222222e-06, "loss": 0.9128, "step": 7180 }, { "epoch": 0.7, "grad_norm": 7.161320686340332, "learning_rate": 7.116666666666667e-06, "loss": 0.9089, "step": 7190 }, { "epoch": 0.71, "grad_norm": 6.944462299346924, "learning_rate": 7.111111111111112e-06, "loss": 0.9234, "step": 7200 }, { "epoch": 0.71, "grad_norm": 7.177565574645996, "learning_rate": 7.105555555555556e-06, "loss": 0.9143, "step": 7210 }, { "epoch": 0.71, "grad_norm": 7.205848693847656, "learning_rate": 7.100000000000001e-06, "loss": 0.916, "step": 7220 }, { "epoch": 0.71, "grad_norm": 6.959636688232422, "learning_rate": 7.094444444444445e-06, "loss": 0.8998, "step": 7230 }, { "epoch": 0.71, "grad_norm": 7.397844314575195, "learning_rate": 7.0888888888888894e-06, "loss": 0.9221, "step": 7240 }, { "epoch": 0.71, "grad_norm": 7.143486022949219, "learning_rate": 7.083333333333335e-06, "loss": 0.9151, "step": 7250 }, { "epoch": 0.71, "grad_norm": 6.996533393859863, "learning_rate": 7.077777777777778e-06, "loss": 0.9345, "step": 7260 }, { "epoch": 0.71, "grad_norm": 6.514726638793945, "learning_rate": 7.072222222222223e-06, "loss": 0.9217, "step": 7270 }, { "epoch": 0.71, "grad_norm": 6.596681118011475, "learning_rate": 7.066666666666667e-06, "loss": 0.9121, "step": 7280 }, { "epoch": 0.71, "grad_norm": 7.167609691619873, "learning_rate": 7.061111111111112e-06, "loss": 0.886, "step": 7290 }, { "epoch": 0.72, "grad_norm": 7.826645851135254, "learning_rate": 7.055555555555557e-06, "loss": 0.9046, "step": 7300 }, { "epoch": 0.72, "grad_norm": 7.795003414154053, "learning_rate": 7.05e-06, "loss": 0.916, "step": 7310 }, { "epoch": 0.72, "grad_norm": 6.620437145233154, "learning_rate": 7.044444444444445e-06, "loss": 0.9203, "step": 7320 }, { "epoch": 0.72, "grad_norm": 6.450239658355713, "learning_rate": 7.03888888888889e-06, "loss": 0.911, "step": 7330 }, { "epoch": 0.72, "grad_norm": 7.532473087310791, "learning_rate": 7.033333333333334e-06, "loss": 0.9207, "step": 7340 }, { "epoch": 0.72, "grad_norm": 6.834466934204102, "learning_rate": 7.027777777777778e-06, "loss": 0.8928, "step": 7350 }, { "epoch": 0.72, "grad_norm": 7.2004218101501465, "learning_rate": 7.022222222222222e-06, "loss": 0.9513, "step": 7360 }, { "epoch": 0.72, "grad_norm": 7.1573967933654785, "learning_rate": 7.0166666666666675e-06, "loss": 0.9213, "step": 7370 }, { "epoch": 0.72, "grad_norm": 6.033583641052246, "learning_rate": 7.011111111111112e-06, "loss": 0.896, "step": 7380 }, { "epoch": 0.72, "grad_norm": 7.1822428703308105, "learning_rate": 7.005555555555556e-06, "loss": 0.9255, "step": 7390 }, { "epoch": 0.73, "grad_norm": 6.001760959625244, "learning_rate": 7e-06, "loss": 0.9467, "step": 7400 }, { "epoch": 0.73, "grad_norm": 7.443753242492676, "learning_rate": 6.994444444444445e-06, "loss": 0.906, "step": 7410 }, { "epoch": 0.73, "grad_norm": 7.562281131744385, "learning_rate": 6.9888888888888895e-06, "loss": 0.896, "step": 7420 }, { "epoch": 0.73, "grad_norm": 7.213113307952881, "learning_rate": 6.983333333333334e-06, "loss": 0.9083, "step": 7430 }, { "epoch": 0.73, "grad_norm": 6.070459365844727, "learning_rate": 6.977777777777779e-06, "loss": 0.9211, "step": 7440 }, { "epoch": 0.73, "grad_norm": 6.004364013671875, "learning_rate": 6.972222222222223e-06, "loss": 0.8686, "step": 7450 }, { "epoch": 0.73, "grad_norm": 7.216383457183838, "learning_rate": 6.966666666666667e-06, "loss": 0.92, "step": 7460 }, { "epoch": 0.73, "grad_norm": 7.999691963195801, "learning_rate": 6.9611111111111116e-06, "loss": 0.9025, "step": 7470 }, { "epoch": 0.73, "grad_norm": 6.500344276428223, "learning_rate": 6.955555555555557e-06, "loss": 0.8938, "step": 7480 }, { "epoch": 0.73, "grad_norm": 7.130675315856934, "learning_rate": 6.95e-06, "loss": 0.9217, "step": 7490 }, { "epoch": 0.74, "grad_norm": 8.138983726501465, "learning_rate": 6.944444444444445e-06, "loss": 0.9318, "step": 7500 }, { "epoch": 0.74, "grad_norm": 6.690699100494385, "learning_rate": 6.938888888888889e-06, "loss": 0.9, "step": 7510 }, { "epoch": 0.74, "grad_norm": 7.136821746826172, "learning_rate": 6.9333333333333344e-06, "loss": 0.9107, "step": 7520 }, { "epoch": 0.74, "grad_norm": 9.44869613647461, "learning_rate": 6.927777777777779e-06, "loss": 0.909, "step": 7530 }, { "epoch": 0.74, "grad_norm": 6.300996780395508, "learning_rate": 6.922222222222222e-06, "loss": 0.9213, "step": 7540 }, { "epoch": 0.74, "grad_norm": 7.29595422744751, "learning_rate": 6.916666666666667e-06, "loss": 0.8956, "step": 7550 }, { "epoch": 0.74, "grad_norm": 7.200852394104004, "learning_rate": 6.911111111111112e-06, "loss": 0.8795, "step": 7560 }, { "epoch": 0.74, "grad_norm": 7.068015098571777, "learning_rate": 6.9055555555555564e-06, "loss": 0.9354, "step": 7570 }, { "epoch": 0.74, "grad_norm": 5.922299385070801, "learning_rate": 6.9e-06, "loss": 0.9324, "step": 7580 }, { "epoch": 0.74, "grad_norm": 7.6257734298706055, "learning_rate": 6.894444444444444e-06, "loss": 0.9059, "step": 7590 }, { "epoch": 0.74, "grad_norm": 6.600005626678467, "learning_rate": 6.88888888888889e-06, "loss": 0.916, "step": 7600 }, { "epoch": 0.75, "grad_norm": 5.874154090881348, "learning_rate": 6.883333333333334e-06, "loss": 0.902, "step": 7610 }, { "epoch": 0.75, "grad_norm": 6.354024410247803, "learning_rate": 6.8777777777777785e-06, "loss": 0.9105, "step": 7620 }, { "epoch": 0.75, "grad_norm": 7.046152114868164, "learning_rate": 6.872222222222222e-06, "loss": 0.8935, "step": 7630 }, { "epoch": 0.75, "grad_norm": 7.621962547302246, "learning_rate": 6.866666666666667e-06, "loss": 0.9197, "step": 7640 }, { "epoch": 0.75, "grad_norm": 6.728060722351074, "learning_rate": 6.861111111111112e-06, "loss": 0.8917, "step": 7650 }, { "epoch": 0.75, "grad_norm": 7.954779148101807, "learning_rate": 6.855555555555556e-06, "loss": 0.8732, "step": 7660 }, { "epoch": 0.75, "grad_norm": 7.51579475402832, "learning_rate": 6.850000000000001e-06, "loss": 0.8983, "step": 7670 }, { "epoch": 0.75, "grad_norm": 7.349095821380615, "learning_rate": 6.844444444444445e-06, "loss": 0.9049, "step": 7680 }, { "epoch": 0.75, "grad_norm": 6.657243728637695, "learning_rate": 6.838888888888889e-06, "loss": 0.893, "step": 7690 }, { "epoch": 0.75, "grad_norm": 7.094632625579834, "learning_rate": 6.833333333333334e-06, "loss": 0.8951, "step": 7700 }, { "epoch": 0.76, "grad_norm": 6.151246547698975, "learning_rate": 6.827777777777779e-06, "loss": 0.8855, "step": 7710 }, { "epoch": 0.76, "grad_norm": 7.50338077545166, "learning_rate": 6.8222222222222225e-06, "loss": 0.903, "step": 7720 }, { "epoch": 0.76, "grad_norm": 7.592001438140869, "learning_rate": 6.816666666666667e-06, "loss": 0.8968, "step": 7730 }, { "epoch": 0.76, "grad_norm": 7.889358997344971, "learning_rate": 6.811111111111111e-06, "loss": 0.8923, "step": 7740 }, { "epoch": 0.76, "grad_norm": 7.130982875823975, "learning_rate": 6.8055555555555566e-06, "loss": 0.8987, "step": 7750 }, { "epoch": 0.76, "grad_norm": 7.185723781585693, "learning_rate": 6.800000000000001e-06, "loss": 0.8988, "step": 7760 }, { "epoch": 0.76, "grad_norm": 7.381372451782227, "learning_rate": 6.7944444444444445e-06, "loss": 0.9176, "step": 7770 }, { "epoch": 0.76, "grad_norm": 6.879724025726318, "learning_rate": 6.788888888888889e-06, "loss": 0.9171, "step": 7780 }, { "epoch": 0.76, "grad_norm": 7.617702007293701, "learning_rate": 6.783333333333334e-06, "loss": 0.9016, "step": 7790 }, { "epoch": 0.76, "grad_norm": 7.214888572692871, "learning_rate": 6.777777777777779e-06, "loss": 0.907, "step": 7800 }, { "epoch": 0.77, "grad_norm": 7.6054606437683105, "learning_rate": 6.772222222222222e-06, "loss": 0.9042, "step": 7810 }, { "epoch": 0.77, "grad_norm": 6.148036479949951, "learning_rate": 6.7666666666666665e-06, "loss": 0.8981, "step": 7820 }, { "epoch": 0.77, "grad_norm": 7.380016326904297, "learning_rate": 6.761111111111112e-06, "loss": 0.8871, "step": 7830 }, { "epoch": 0.77, "grad_norm": 6.931553840637207, "learning_rate": 6.755555555555556e-06, "loss": 0.8944, "step": 7840 }, { "epoch": 0.77, "grad_norm": 6.014115333557129, "learning_rate": 6.750000000000001e-06, "loss": 0.8933, "step": 7850 }, { "epoch": 0.77, "grad_norm": 7.075136661529541, "learning_rate": 6.744444444444444e-06, "loss": 0.8914, "step": 7860 }, { "epoch": 0.77, "grad_norm": 8.026590347290039, "learning_rate": 6.738888888888889e-06, "loss": 0.892, "step": 7870 }, { "epoch": 0.77, "grad_norm": 5.462163925170898, "learning_rate": 6.733333333333334e-06, "loss": 0.892, "step": 7880 }, { "epoch": 0.77, "grad_norm": 6.8757195472717285, "learning_rate": 6.727777777777778e-06, "loss": 0.882, "step": 7890 }, { "epoch": 0.77, "grad_norm": 7.298379898071289, "learning_rate": 6.7222222222222235e-06, "loss": 0.9091, "step": 7900 }, { "epoch": 0.78, "grad_norm": 6.536348342895508, "learning_rate": 6.716666666666667e-06, "loss": 0.8737, "step": 7910 }, { "epoch": 0.78, "grad_norm": 7.426848888397217, "learning_rate": 6.711111111111111e-06, "loss": 0.8834, "step": 7920 }, { "epoch": 0.78, "grad_norm": 7.8451361656188965, "learning_rate": 6.705555555555557e-06, "loss": 0.9073, "step": 7930 }, { "epoch": 0.78, "grad_norm": 6.914769649505615, "learning_rate": 6.700000000000001e-06, "loss": 0.9074, "step": 7940 }, { "epoch": 0.78, "grad_norm": 6.386804580688477, "learning_rate": 6.694444444444445e-06, "loss": 0.9041, "step": 7950 }, { "epoch": 0.78, "grad_norm": 7.7358903884887695, "learning_rate": 6.688888888888889e-06, "loss": 0.8933, "step": 7960 }, { "epoch": 0.78, "grad_norm": 6.443516731262207, "learning_rate": 6.683333333333334e-06, "loss": 0.8952, "step": 7970 }, { "epoch": 0.78, "grad_norm": 6.301036357879639, "learning_rate": 6.677777777777779e-06, "loss": 0.9098, "step": 7980 }, { "epoch": 0.78, "grad_norm": 5.590214252471924, "learning_rate": 6.672222222222223e-06, "loss": 0.8966, "step": 7990 }, { "epoch": 0.78, "grad_norm": 7.103362560272217, "learning_rate": 6.666666666666667e-06, "loss": 0.8946, "step": 8000 }, { "epoch": 0.78, "eval_loss": 0.883616030216217, "eval_runtime": 25.0514, "eval_samples_per_second": 26.106, "eval_steps_per_second": 3.273, "step": 8000 }, { "epoch": 0.78, "grad_norm": 7.307333469390869, "learning_rate": 6.661111111111112e-06, "loss": 0.8997, "step": 8010 }, { "epoch": 0.79, "grad_norm": 6.578701019287109, "learning_rate": 6.655555555555556e-06, "loss": 0.8667, "step": 8020 }, { "epoch": 0.79, "grad_norm": 7.42984676361084, "learning_rate": 6.650000000000001e-06, "loss": 0.8903, "step": 8030 }, { "epoch": 0.79, "grad_norm": 6.889919757843018, "learning_rate": 6.644444444444444e-06, "loss": 0.8878, "step": 8040 }, { "epoch": 0.79, "grad_norm": 7.341306686401367, "learning_rate": 6.6388888888888895e-06, "loss": 0.8879, "step": 8050 }, { "epoch": 0.79, "grad_norm": 8.312880516052246, "learning_rate": 6.633333333333334e-06, "loss": 0.9022, "step": 8060 }, { "epoch": 0.79, "grad_norm": 6.954687118530273, "learning_rate": 6.627777777777778e-06, "loss": 0.8947, "step": 8070 }, { "epoch": 0.79, "grad_norm": 7.56857442855835, "learning_rate": 6.6222222222222236e-06, "loss": 0.9152, "step": 8080 }, { "epoch": 0.79, "grad_norm": 6.4846601486206055, "learning_rate": 6.616666666666667e-06, "loss": 0.8901, "step": 8090 }, { "epoch": 0.79, "grad_norm": 6.888131618499756, "learning_rate": 6.6111111111111115e-06, "loss": 0.8851, "step": 8100 }, { "epoch": 0.79, "grad_norm": 6.602269649505615, "learning_rate": 6.605555555555556e-06, "loss": 0.886, "step": 8110 }, { "epoch": 0.8, "grad_norm": 7.381767272949219, "learning_rate": 6.600000000000001e-06, "loss": 0.8852, "step": 8120 }, { "epoch": 0.8, "grad_norm": 6.873960494995117, "learning_rate": 6.594444444444446e-06, "loss": 0.8924, "step": 8130 }, { "epoch": 0.8, "grad_norm": 6.781832218170166, "learning_rate": 6.588888888888889e-06, "loss": 0.9066, "step": 8140 }, { "epoch": 0.8, "grad_norm": 7.549422740936279, "learning_rate": 6.5833333333333335e-06, "loss": 0.903, "step": 8150 }, { "epoch": 0.8, "grad_norm": 7.031944274902344, "learning_rate": 6.577777777777779e-06, "loss": 0.8878, "step": 8160 }, { "epoch": 0.8, "grad_norm": 6.963977336883545, "learning_rate": 6.572222222222223e-06, "loss": 0.8887, "step": 8170 }, { "epoch": 0.8, "grad_norm": 6.608301162719727, "learning_rate": 6.566666666666667e-06, "loss": 0.9121, "step": 8180 }, { "epoch": 0.8, "grad_norm": 5.947847843170166, "learning_rate": 6.561111111111111e-06, "loss": 0.9074, "step": 8190 }, { "epoch": 0.8, "grad_norm": 6.546743869781494, "learning_rate": 6.555555555555556e-06, "loss": 0.8943, "step": 8200 }, { "epoch": 0.8, "grad_norm": 6.005590915679932, "learning_rate": 6.550000000000001e-06, "loss": 0.8818, "step": 8210 }, { "epoch": 0.81, "grad_norm": 6.621908187866211, "learning_rate": 6.544444444444445e-06, "loss": 0.8936, "step": 8220 }, { "epoch": 0.81, "grad_norm": 7.243603706359863, "learning_rate": 6.538888888888889e-06, "loss": 0.8895, "step": 8230 }, { "epoch": 0.81, "grad_norm": 7.072266578674316, "learning_rate": 6.533333333333334e-06, "loss": 0.9111, "step": 8240 }, { "epoch": 0.81, "grad_norm": 6.280644416809082, "learning_rate": 6.5277777777777784e-06, "loss": 0.9099, "step": 8250 }, { "epoch": 0.81, "grad_norm": 7.026856422424316, "learning_rate": 6.522222222222223e-06, "loss": 0.8751, "step": 8260 }, { "epoch": 0.81, "grad_norm": 6.7686662673950195, "learning_rate": 6.516666666666666e-06, "loss": 0.8824, "step": 8270 }, { "epoch": 0.81, "grad_norm": 6.93698263168335, "learning_rate": 6.511111111111112e-06, "loss": 0.8833, "step": 8280 }, { "epoch": 0.81, "grad_norm": 5.809185981750488, "learning_rate": 6.505555555555556e-06, "loss": 0.9165, "step": 8290 }, { "epoch": 0.81, "grad_norm": 6.815030574798584, "learning_rate": 6.5000000000000004e-06, "loss": 0.8971, "step": 8300 }, { "epoch": 0.81, "grad_norm": 7.039297580718994, "learning_rate": 6.494444444444446e-06, "loss": 0.8984, "step": 8310 }, { "epoch": 0.82, "grad_norm": 6.6432647705078125, "learning_rate": 6.488888888888889e-06, "loss": 0.8778, "step": 8320 }, { "epoch": 0.82, "grad_norm": 6.757240295410156, "learning_rate": 6.483333333333334e-06, "loss": 0.8691, "step": 8330 }, { "epoch": 0.82, "grad_norm": 7.119251728057861, "learning_rate": 6.477777777777778e-06, "loss": 0.8968, "step": 8340 }, { "epoch": 0.82, "grad_norm": 7.613892078399658, "learning_rate": 6.472222222222223e-06, "loss": 0.8869, "step": 8350 }, { "epoch": 0.82, "grad_norm": 6.328048229217529, "learning_rate": 6.466666666666667e-06, "loss": 0.8692, "step": 8360 }, { "epoch": 0.82, "grad_norm": 7.0231781005859375, "learning_rate": 6.461111111111111e-06, "loss": 0.8933, "step": 8370 }, { "epoch": 0.82, "grad_norm": 7.306582450866699, "learning_rate": 6.455555555555556e-06, "loss": 0.9003, "step": 8380 }, { "epoch": 0.82, "grad_norm": 6.899701118469238, "learning_rate": 6.450000000000001e-06, "loss": 0.9077, "step": 8390 }, { "epoch": 0.82, "grad_norm": 6.884790897369385, "learning_rate": 6.444444444444445e-06, "loss": 0.8872, "step": 8400 }, { "epoch": 0.82, "grad_norm": 6.220856666564941, "learning_rate": 6.438888888888889e-06, "loss": 0.9021, "step": 8410 }, { "epoch": 0.83, "grad_norm": 7.12216329574585, "learning_rate": 6.433333333333333e-06, "loss": 0.8828, "step": 8420 }, { "epoch": 0.83, "grad_norm": 7.8202972412109375, "learning_rate": 6.4277777777777785e-06, "loss": 0.9118, "step": 8430 }, { "epoch": 0.83, "grad_norm": 6.592990398406982, "learning_rate": 6.422222222222223e-06, "loss": 0.8976, "step": 8440 }, { "epoch": 0.83, "grad_norm": 6.4377360343933105, "learning_rate": 6.416666666666667e-06, "loss": 0.875, "step": 8450 }, { "epoch": 0.83, "grad_norm": 6.305859565734863, "learning_rate": 6.411111111111111e-06, "loss": 0.9059, "step": 8460 }, { "epoch": 0.83, "grad_norm": 6.851753234863281, "learning_rate": 6.405555555555556e-06, "loss": 0.887, "step": 8470 }, { "epoch": 0.83, "grad_norm": 7.4267964363098145, "learning_rate": 6.4000000000000006e-06, "loss": 0.8851, "step": 8480 }, { "epoch": 0.83, "grad_norm": 7.084948539733887, "learning_rate": 6.394444444444445e-06, "loss": 0.8995, "step": 8490 }, { "epoch": 0.83, "grad_norm": 6.8262763023376465, "learning_rate": 6.3888888888888885e-06, "loss": 0.8894, "step": 8500 }, { "epoch": 0.83, "grad_norm": 6.955739498138428, "learning_rate": 6.383333333333334e-06, "loss": 0.872, "step": 8510 }, { "epoch": 0.83, "grad_norm": 6.6519389152526855, "learning_rate": 6.377777777777778e-06, "loss": 0.8846, "step": 8520 }, { "epoch": 0.84, "grad_norm": 6.0418314933776855, "learning_rate": 6.3722222222222226e-06, "loss": 0.9045, "step": 8530 }, { "epoch": 0.84, "grad_norm": 7.054715633392334, "learning_rate": 6.366666666666668e-06, "loss": 0.8738, "step": 8540 }, { "epoch": 0.84, "grad_norm": 7.057066917419434, "learning_rate": 6.361111111111111e-06, "loss": 0.8737, "step": 8550 }, { "epoch": 0.84, "grad_norm": 6.640417575836182, "learning_rate": 6.355555555555556e-06, "loss": 0.8757, "step": 8560 }, { "epoch": 0.84, "grad_norm": 6.397624969482422, "learning_rate": 6.35e-06, "loss": 0.8896, "step": 8570 }, { "epoch": 0.84, "grad_norm": 5.5577850341796875, "learning_rate": 6.3444444444444454e-06, "loss": 0.8888, "step": 8580 }, { "epoch": 0.84, "grad_norm": 6.212761878967285, "learning_rate": 6.338888888888889e-06, "loss": 0.8983, "step": 8590 }, { "epoch": 0.84, "grad_norm": 7.039618968963623, "learning_rate": 6.333333333333333e-06, "loss": 0.8774, "step": 8600 }, { "epoch": 0.84, "grad_norm": 7.137359619140625, "learning_rate": 6.327777777777779e-06, "loss": 0.8798, "step": 8610 }, { "epoch": 0.84, "grad_norm": 7.312909126281738, "learning_rate": 6.322222222222223e-06, "loss": 0.8687, "step": 8620 }, { "epoch": 0.85, "grad_norm": 6.781071186065674, "learning_rate": 6.3166666666666675e-06, "loss": 0.8784, "step": 8630 }, { "epoch": 0.85, "grad_norm": 7.213908672332764, "learning_rate": 6.311111111111111e-06, "loss": 0.8778, "step": 8640 }, { "epoch": 0.85, "grad_norm": 7.662302017211914, "learning_rate": 6.305555555555556e-06, "loss": 0.9064, "step": 8650 }, { "epoch": 0.85, "grad_norm": 6.778668403625488, "learning_rate": 6.300000000000001e-06, "loss": 0.8893, "step": 8660 }, { "epoch": 0.85, "grad_norm": 6.653033256530762, "learning_rate": 6.294444444444445e-06, "loss": 0.8667, "step": 8670 }, { "epoch": 0.85, "grad_norm": 6.839854717254639, "learning_rate": 6.28888888888889e-06, "loss": 0.8925, "step": 8680 }, { "epoch": 0.85, "grad_norm": 7.295853137969971, "learning_rate": 6.283333333333334e-06, "loss": 0.8832, "step": 8690 }, { "epoch": 0.85, "grad_norm": 6.4520978927612305, "learning_rate": 6.277777777777778e-06, "loss": 0.8656, "step": 8700 }, { "epoch": 0.85, "grad_norm": 6.035552024841309, "learning_rate": 6.272222222222223e-06, "loss": 0.8575, "step": 8710 }, { "epoch": 0.85, "grad_norm": 7.363282203674316, "learning_rate": 6.266666666666668e-06, "loss": 0.8697, "step": 8720 }, { "epoch": 0.86, "grad_norm": 7.587259292602539, "learning_rate": 6.2611111111111115e-06, "loss": 0.8869, "step": 8730 }, { "epoch": 0.86, "grad_norm": 7.281359672546387, "learning_rate": 6.255555555555556e-06, "loss": 0.8825, "step": 8740 }, { "epoch": 0.86, "grad_norm": 7.208422660827637, "learning_rate": 6.25e-06, "loss": 0.8791, "step": 8750 }, { "epoch": 0.86, "grad_norm": 7.07909631729126, "learning_rate": 6.2444444444444456e-06, "loss": 0.8865, "step": 8760 }, { "epoch": 0.86, "grad_norm": 6.3391218185424805, "learning_rate": 6.23888888888889e-06, "loss": 0.8841, "step": 8770 }, { "epoch": 0.86, "grad_norm": 8.006622314453125, "learning_rate": 6.2333333333333335e-06, "loss": 0.8779, "step": 8780 }, { "epoch": 0.86, "grad_norm": 6.903020858764648, "learning_rate": 6.227777777777778e-06, "loss": 0.8838, "step": 8790 }, { "epoch": 0.86, "grad_norm": 6.86175012588501, "learning_rate": 6.222222222222223e-06, "loss": 0.8657, "step": 8800 }, { "epoch": 0.86, "grad_norm": 8.133430480957031, "learning_rate": 6.2166666666666676e-06, "loss": 0.8983, "step": 8810 }, { "epoch": 0.86, "grad_norm": 7.037128925323486, "learning_rate": 6.211111111111111e-06, "loss": 0.8803, "step": 8820 }, { "epoch": 0.87, "grad_norm": 6.0375471115112305, "learning_rate": 6.2055555555555555e-06, "loss": 0.8741, "step": 8830 }, { "epoch": 0.87, "grad_norm": 7.056241035461426, "learning_rate": 6.200000000000001e-06, "loss": 0.8717, "step": 8840 }, { "epoch": 0.87, "grad_norm": 7.239374160766602, "learning_rate": 6.194444444444445e-06, "loss": 0.8835, "step": 8850 }, { "epoch": 0.87, "grad_norm": 7.4393086433410645, "learning_rate": 6.18888888888889e-06, "loss": 0.8853, "step": 8860 }, { "epoch": 0.87, "grad_norm": 7.676321506500244, "learning_rate": 6.183333333333333e-06, "loss": 0.8897, "step": 8870 }, { "epoch": 0.87, "grad_norm": 7.6380510330200195, "learning_rate": 6.177777777777778e-06, "loss": 0.8895, "step": 8880 }, { "epoch": 0.87, "grad_norm": 7.9057297706604, "learning_rate": 6.172222222222223e-06, "loss": 0.8698, "step": 8890 }, { "epoch": 0.87, "grad_norm": 6.453971862792969, "learning_rate": 6.166666666666667e-06, "loss": 0.8778, "step": 8900 }, { "epoch": 0.87, "grad_norm": 7.497171878814697, "learning_rate": 6.1611111111111124e-06, "loss": 0.8876, "step": 8910 }, { "epoch": 0.87, "grad_norm": 8.176643371582031, "learning_rate": 6.155555555555556e-06, "loss": 0.8806, "step": 8920 }, { "epoch": 0.88, "grad_norm": 7.073744773864746, "learning_rate": 6.15e-06, "loss": 0.8896, "step": 8930 }, { "epoch": 0.88, "grad_norm": 6.9904351234436035, "learning_rate": 6.144444444444445e-06, "loss": 0.8914, "step": 8940 }, { "epoch": 0.88, "grad_norm": 6.550717353820801, "learning_rate": 6.13888888888889e-06, "loss": 0.8947, "step": 8950 }, { "epoch": 0.88, "grad_norm": 7.141576766967773, "learning_rate": 6.133333333333334e-06, "loss": 0.8984, "step": 8960 }, { "epoch": 0.88, "grad_norm": 7.0152153968811035, "learning_rate": 6.127777777777778e-06, "loss": 0.8808, "step": 8970 }, { "epoch": 0.88, "grad_norm": 6.518797397613525, "learning_rate": 6.1222222222222224e-06, "loss": 0.8792, "step": 8980 }, { "epoch": 0.88, "grad_norm": 7.444545269012451, "learning_rate": 6.116666666666668e-06, "loss": 0.8714, "step": 8990 }, { "epoch": 0.88, "grad_norm": 6.313089370727539, "learning_rate": 6.111111111111112e-06, "loss": 0.9085, "step": 9000 }, { "epoch": 0.88, "grad_norm": 6.948700428009033, "learning_rate": 6.105555555555556e-06, "loss": 0.889, "step": 9010 }, { "epoch": 0.88, "grad_norm": 6.369102954864502, "learning_rate": 6.1e-06, "loss": 0.8759, "step": 9020 }, { "epoch": 0.88, "grad_norm": 6.854885101318359, "learning_rate": 6.094444444444445e-06, "loss": 0.8622, "step": 9030 }, { "epoch": 0.89, "grad_norm": 7.499849796295166, "learning_rate": 6.08888888888889e-06, "loss": 0.887, "step": 9040 }, { "epoch": 0.89, "grad_norm": 7.458281517028809, "learning_rate": 6.083333333333333e-06, "loss": 0.8907, "step": 9050 }, { "epoch": 0.89, "grad_norm": 8.356446266174316, "learning_rate": 6.077777777777778e-06, "loss": 0.8594, "step": 9060 }, { "epoch": 0.89, "grad_norm": 6.314296722412109, "learning_rate": 6.072222222222223e-06, "loss": 0.8887, "step": 9070 }, { "epoch": 0.89, "grad_norm": 6.681983947753906, "learning_rate": 6.066666666666667e-06, "loss": 0.8539, "step": 9080 }, { "epoch": 0.89, "grad_norm": 6.1515212059021, "learning_rate": 6.061111111111112e-06, "loss": 0.9017, "step": 9090 }, { "epoch": 0.89, "grad_norm": 6.6081156730651855, "learning_rate": 6.055555555555555e-06, "loss": 0.875, "step": 9100 }, { "epoch": 0.89, "grad_norm": 8.859786033630371, "learning_rate": 6.0500000000000005e-06, "loss": 0.8866, "step": 9110 }, { "epoch": 0.89, "grad_norm": 6.386359691619873, "learning_rate": 6.044444444444445e-06, "loss": 0.8869, "step": 9120 }, { "epoch": 0.89, "grad_norm": 7.896292686462402, "learning_rate": 6.038888888888889e-06, "loss": 0.876, "step": 9130 }, { "epoch": 0.9, "grad_norm": 6.483770847320557, "learning_rate": 6.033333333333335e-06, "loss": 0.85, "step": 9140 }, { "epoch": 0.9, "grad_norm": 7.183856010437012, "learning_rate": 6.027777777777778e-06, "loss": 0.8605, "step": 9150 }, { "epoch": 0.9, "grad_norm": 5.264705181121826, "learning_rate": 6.0222222222222225e-06, "loss": 0.8808, "step": 9160 }, { "epoch": 0.9, "grad_norm": 6.453924179077148, "learning_rate": 6.016666666666667e-06, "loss": 0.8979, "step": 9170 }, { "epoch": 0.9, "grad_norm": 6.932863235473633, "learning_rate": 6.011111111111112e-06, "loss": 0.8781, "step": 9180 }, { "epoch": 0.9, "grad_norm": 7.382064342498779, "learning_rate": 6.005555555555556e-06, "loss": 0.8886, "step": 9190 }, { "epoch": 0.9, "grad_norm": 7.988288402557373, "learning_rate": 6e-06, "loss": 0.8731, "step": 9200 }, { "epoch": 0.9, "grad_norm": 6.656442165374756, "learning_rate": 5.9944444444444446e-06, "loss": 0.8548, "step": 9210 }, { "epoch": 0.9, "grad_norm": 6.33673095703125, "learning_rate": 5.98888888888889e-06, "loss": 0.8852, "step": 9220 }, { "epoch": 0.9, "grad_norm": 7.264461517333984, "learning_rate": 5.983333333333334e-06, "loss": 0.8743, "step": 9230 }, { "epoch": 0.91, "grad_norm": 6.480602741241455, "learning_rate": 5.977777777777778e-06, "loss": 0.8634, "step": 9240 }, { "epoch": 0.91, "grad_norm": 7.590120315551758, "learning_rate": 5.972222222222222e-06, "loss": 0.883, "step": 9250 }, { "epoch": 0.91, "grad_norm": 7.197909355163574, "learning_rate": 5.966666666666667e-06, "loss": 0.8717, "step": 9260 }, { "epoch": 0.91, "grad_norm": 7.05042839050293, "learning_rate": 5.961111111111112e-06, "loss": 0.8638, "step": 9270 }, { "epoch": 0.91, "grad_norm": 5.97763729095459, "learning_rate": 5.955555555555555e-06, "loss": 0.8459, "step": 9280 }, { "epoch": 0.91, "grad_norm": 7.28564977645874, "learning_rate": 5.950000000000001e-06, "loss": 0.8538, "step": 9290 }, { "epoch": 0.91, "grad_norm": 6.655655860900879, "learning_rate": 5.944444444444445e-06, "loss": 0.8824, "step": 9300 }, { "epoch": 0.91, "grad_norm": 5.312273025512695, "learning_rate": 5.9388888888888894e-06, "loss": 0.8758, "step": 9310 }, { "epoch": 0.91, "grad_norm": 5.882920265197754, "learning_rate": 5.933333333333335e-06, "loss": 0.8712, "step": 9320 }, { "epoch": 0.91, "grad_norm": 6.464552402496338, "learning_rate": 5.927777777777778e-06, "loss": 0.8697, "step": 9330 }, { "epoch": 0.92, "grad_norm": 6.8139448165893555, "learning_rate": 5.922222222222223e-06, "loss": 0.8667, "step": 9340 }, { "epoch": 0.92, "grad_norm": 7.046186447143555, "learning_rate": 5.916666666666667e-06, "loss": 0.8712, "step": 9350 }, { "epoch": 0.92, "grad_norm": 5.462280750274658, "learning_rate": 5.911111111111112e-06, "loss": 0.8769, "step": 9360 }, { "epoch": 0.92, "grad_norm": 6.669032096862793, "learning_rate": 5.905555555555556e-06, "loss": 0.8834, "step": 9370 }, { "epoch": 0.92, "grad_norm": 7.375804901123047, "learning_rate": 5.9e-06, "loss": 0.8676, "step": 9380 }, { "epoch": 0.92, "grad_norm": 6.915449142456055, "learning_rate": 5.894444444444445e-06, "loss": 0.837, "step": 9390 }, { "epoch": 0.92, "grad_norm": 7.145249366760254, "learning_rate": 5.88888888888889e-06, "loss": 0.8609, "step": 9400 }, { "epoch": 0.92, "grad_norm": 6.838450908660889, "learning_rate": 5.883333333333334e-06, "loss": 0.8717, "step": 9410 }, { "epoch": 0.92, "grad_norm": 6.5778679847717285, "learning_rate": 5.877777777777778e-06, "loss": 0.8756, "step": 9420 }, { "epoch": 0.92, "grad_norm": 7.0362982749938965, "learning_rate": 5.872222222222222e-06, "loss": 0.8578, "step": 9430 }, { "epoch": 0.93, "grad_norm": 7.113833904266357, "learning_rate": 5.8666666666666675e-06, "loss": 0.8717, "step": 9440 }, { "epoch": 0.93, "grad_norm": 7.0388617515563965, "learning_rate": 5.861111111111112e-06, "loss": 0.8604, "step": 9450 }, { "epoch": 0.93, "grad_norm": 6.346746444702148, "learning_rate": 5.855555555555556e-06, "loss": 0.8552, "step": 9460 }, { "epoch": 0.93, "grad_norm": 6.798004627227783, "learning_rate": 5.85e-06, "loss": 0.8652, "step": 9470 }, { "epoch": 0.93, "grad_norm": 6.887036323547363, "learning_rate": 5.844444444444445e-06, "loss": 0.8683, "step": 9480 }, { "epoch": 0.93, "grad_norm": 8.138835906982422, "learning_rate": 5.8388888888888895e-06, "loss": 0.8685, "step": 9490 }, { "epoch": 0.93, "grad_norm": 7.43895959854126, "learning_rate": 5.833333333333334e-06, "loss": 0.8622, "step": 9500 }, { "epoch": 0.93, "grad_norm": 5.998297214508057, "learning_rate": 5.8277777777777775e-06, "loss": 0.8868, "step": 9510 }, { "epoch": 0.93, "grad_norm": 6.144559383392334, "learning_rate": 5.822222222222223e-06, "loss": 0.8651, "step": 9520 }, { "epoch": 0.93, "grad_norm": 7.2114949226379395, "learning_rate": 5.816666666666667e-06, "loss": 0.8721, "step": 9530 }, { "epoch": 0.93, "grad_norm": 7.576504707336426, "learning_rate": 5.8111111111111116e-06, "loss": 0.8711, "step": 9540 }, { "epoch": 0.94, "grad_norm": 7.064696311950684, "learning_rate": 5.805555555555557e-06, "loss": 0.8701, "step": 9550 }, { "epoch": 0.94, "grad_norm": 6.365035533905029, "learning_rate": 5.8e-06, "loss": 0.8671, "step": 9560 }, { "epoch": 0.94, "grad_norm": 6.618658065795898, "learning_rate": 5.794444444444445e-06, "loss": 0.8583, "step": 9570 }, { "epoch": 0.94, "grad_norm": 7.324855327606201, "learning_rate": 5.788888888888889e-06, "loss": 0.8818, "step": 9580 }, { "epoch": 0.94, "grad_norm": 6.58635139465332, "learning_rate": 5.7833333333333344e-06, "loss": 0.8746, "step": 9590 }, { "epoch": 0.94, "grad_norm": 7.235201835632324, "learning_rate": 5.777777777777778e-06, "loss": 0.8736, "step": 9600 }, { "epoch": 0.94, "grad_norm": 7.773404121398926, "learning_rate": 5.772222222222222e-06, "loss": 0.854, "step": 9610 }, { "epoch": 0.94, "grad_norm": 7.819270610809326, "learning_rate": 5.766666666666667e-06, "loss": 0.8495, "step": 9620 }, { "epoch": 0.94, "grad_norm": 6.8440842628479, "learning_rate": 5.761111111111112e-06, "loss": 0.8636, "step": 9630 }, { "epoch": 0.94, "grad_norm": 7.480376243591309, "learning_rate": 5.7555555555555564e-06, "loss": 0.8465, "step": 9640 }, { "epoch": 0.95, "grad_norm": 7.8194098472595215, "learning_rate": 5.75e-06, "loss": 0.8915, "step": 9650 }, { "epoch": 0.95, "grad_norm": 7.215272426605225, "learning_rate": 5.744444444444444e-06, "loss": 0.8598, "step": 9660 }, { "epoch": 0.95, "grad_norm": 6.709381103515625, "learning_rate": 5.73888888888889e-06, "loss": 0.8734, "step": 9670 }, { "epoch": 0.95, "grad_norm": 6.557707786560059, "learning_rate": 5.733333333333334e-06, "loss": 0.8729, "step": 9680 }, { "epoch": 0.95, "grad_norm": 7.274158477783203, "learning_rate": 5.7277777777777785e-06, "loss": 0.851, "step": 9690 }, { "epoch": 0.95, "grad_norm": 6.466496467590332, "learning_rate": 5.722222222222222e-06, "loss": 0.8489, "step": 9700 }, { "epoch": 0.95, "grad_norm": 6.673398971557617, "learning_rate": 5.716666666666667e-06, "loss": 0.8643, "step": 9710 }, { "epoch": 0.95, "grad_norm": 6.518968105316162, "learning_rate": 5.711111111111112e-06, "loss": 0.8534, "step": 9720 }, { "epoch": 0.95, "grad_norm": 7.913751602172852, "learning_rate": 5.705555555555556e-06, "loss": 0.8683, "step": 9730 }, { "epoch": 0.95, "grad_norm": 6.435301303863525, "learning_rate": 5.7e-06, "loss": 0.8687, "step": 9740 }, { "epoch": 0.96, "grad_norm": 7.524555206298828, "learning_rate": 5.694444444444445e-06, "loss": 0.8721, "step": 9750 }, { "epoch": 0.96, "grad_norm": 6.4265851974487305, "learning_rate": 5.688888888888889e-06, "loss": 0.8546, "step": 9760 }, { "epoch": 0.96, "grad_norm": 6.514756202697754, "learning_rate": 5.683333333333334e-06, "loss": 0.8736, "step": 9770 }, { "epoch": 0.96, "grad_norm": 6.738321781158447, "learning_rate": 5.677777777777779e-06, "loss": 0.8577, "step": 9780 }, { "epoch": 0.96, "grad_norm": 6.784518718719482, "learning_rate": 5.6722222222222225e-06, "loss": 0.8738, "step": 9790 }, { "epoch": 0.96, "grad_norm": 6.944060802459717, "learning_rate": 5.666666666666667e-06, "loss": 0.8568, "step": 9800 }, { "epoch": 0.96, "grad_norm": 6.219351291656494, "learning_rate": 5.661111111111111e-06, "loss": 0.8713, "step": 9810 }, { "epoch": 0.96, "grad_norm": 6.350907325744629, "learning_rate": 5.6555555555555566e-06, "loss": 0.8827, "step": 9820 }, { "epoch": 0.96, "grad_norm": 6.970325469970703, "learning_rate": 5.65e-06, "loss": 0.8765, "step": 9830 }, { "epoch": 0.96, "grad_norm": 8.448198318481445, "learning_rate": 5.6444444444444445e-06, "loss": 0.8459, "step": 9840 }, { "epoch": 0.97, "grad_norm": 5.720845699310303, "learning_rate": 5.638888888888889e-06, "loss": 0.8689, "step": 9850 }, { "epoch": 0.97, "grad_norm": 6.733059406280518, "learning_rate": 5.633333333333334e-06, "loss": 0.8417, "step": 9860 }, { "epoch": 0.97, "grad_norm": 6.162039279937744, "learning_rate": 5.6277777777777786e-06, "loss": 0.8552, "step": 9870 }, { "epoch": 0.97, "grad_norm": 7.286827564239502, "learning_rate": 5.622222222222222e-06, "loss": 0.8482, "step": 9880 }, { "epoch": 0.97, "grad_norm": 6.680145740509033, "learning_rate": 5.6166666666666665e-06, "loss": 0.8363, "step": 9890 }, { "epoch": 0.97, "grad_norm": 8.603917121887207, "learning_rate": 5.611111111111112e-06, "loss": 0.8738, "step": 9900 }, { "epoch": 0.97, "grad_norm": 6.840752124786377, "learning_rate": 5.605555555555556e-06, "loss": 0.8564, "step": 9910 }, { "epoch": 0.97, "grad_norm": 7.862891674041748, "learning_rate": 5.600000000000001e-06, "loss": 0.8398, "step": 9920 }, { "epoch": 0.97, "grad_norm": 7.299100875854492, "learning_rate": 5.594444444444444e-06, "loss": 0.853, "step": 9930 }, { "epoch": 0.97, "grad_norm": 7.499954700469971, "learning_rate": 5.588888888888889e-06, "loss": 0.8693, "step": 9940 }, { "epoch": 0.98, "grad_norm": 7.074316501617432, "learning_rate": 5.583333333333334e-06, "loss": 0.8465, "step": 9950 }, { "epoch": 0.98, "grad_norm": 6.285116672515869, "learning_rate": 5.577777777777778e-06, "loss": 0.857, "step": 9960 }, { "epoch": 0.98, "grad_norm": 5.868269443511963, "learning_rate": 5.572222222222223e-06, "loss": 0.8762, "step": 9970 }, { "epoch": 0.98, "grad_norm": 7.850697994232178, "learning_rate": 5.566666666666667e-06, "loss": 0.8503, "step": 9980 }, { "epoch": 0.98, "grad_norm": 6.032191276550293, "learning_rate": 5.561111111111111e-06, "loss": 0.8761, "step": 9990 }, { "epoch": 0.98, "grad_norm": 6.6063079833984375, "learning_rate": 5.555555555555557e-06, "loss": 0.8662, "step": 10000 }, { "epoch": 0.98, "eval_loss": 0.8441851139068604, "eval_runtime": 24.8939, "eval_samples_per_second": 26.272, "eval_steps_per_second": 3.294, "step": 10000 }, { "epoch": 0.98, "grad_norm": 7.31265115737915, "learning_rate": 5.550000000000001e-06, "loss": 0.8543, "step": 10010 }, { "epoch": 0.98, "grad_norm": 6.747231960296631, "learning_rate": 5.544444444444445e-06, "loss": 0.865, "step": 10020 }, { "epoch": 0.98, "grad_norm": 8.0430326461792, "learning_rate": 5.538888888888889e-06, "loss": 0.8643, "step": 10030 }, { "epoch": 0.98, "grad_norm": 7.47281551361084, "learning_rate": 5.533333333333334e-06, "loss": 0.8702, "step": 10040 }, { "epoch": 0.98, "grad_norm": 7.648735523223877, "learning_rate": 5.527777777777779e-06, "loss": 0.8462, "step": 10050 }, { "epoch": 0.99, "grad_norm": 7.100777626037598, "learning_rate": 5.522222222222222e-06, "loss": 0.8697, "step": 10060 }, { "epoch": 0.99, "grad_norm": 40.33247375488281, "learning_rate": 5.516666666666667e-06, "loss": 0.8548, "step": 10070 }, { "epoch": 0.99, "grad_norm": 7.459417819976807, "learning_rate": 5.511111111111112e-06, "loss": 0.9143, "step": 10080 }, { "epoch": 0.99, "grad_norm": 7.384781837463379, "learning_rate": 5.505555555555556e-06, "loss": 0.8751, "step": 10090 }, { "epoch": 0.99, "grad_norm": 7.588937282562256, "learning_rate": 5.500000000000001e-06, "loss": 0.8519, "step": 10100 }, { "epoch": 0.99, "grad_norm": 7.06969690322876, "learning_rate": 5.494444444444444e-06, "loss": 0.8719, "step": 10110 }, { "epoch": 0.99, "grad_norm": 7.099348545074463, "learning_rate": 5.4888888888888895e-06, "loss": 0.8658, "step": 10120 }, { "epoch": 0.99, "grad_norm": 8.15184211730957, "learning_rate": 5.483333333333334e-06, "loss": 0.8773, "step": 10130 }, { "epoch": 0.99, "grad_norm": 7.414036750793457, "learning_rate": 5.477777777777778e-06, "loss": 0.8478, "step": 10140 }, { "epoch": 0.99, "grad_norm": 9.085332870483398, "learning_rate": 5.4722222222222236e-06, "loss": 0.8387, "step": 10150 }, { "epoch": 1.0, "grad_norm": 6.485350131988525, "learning_rate": 5.466666666666667e-06, "loss": 0.8682, "step": 10160 }, { "epoch": 1.0, "grad_norm": 7.1299052238464355, "learning_rate": 5.4611111111111115e-06, "loss": 0.8795, "step": 10170 }, { "epoch": 1.0, "grad_norm": 6.058491230010986, "learning_rate": 5.455555555555556e-06, "loss": 0.8448, "step": 10180 }, { "epoch": 1.0, "grad_norm": 6.84428596496582, "learning_rate": 5.450000000000001e-06, "loss": 0.8775, "step": 10190 }, { "epoch": 1.0, "grad_norm": 6.108795166015625, "learning_rate": 5.444444444444445e-06, "loss": 0.8624, "step": 10200 }, { "epoch": 1.0, "grad_norm": 7.1825995445251465, "learning_rate": 5.438888888888889e-06, "loss": 0.8338, "step": 10210 }, { "epoch": 1.0, "grad_norm": 6.524367809295654, "learning_rate": 5.4333333333333335e-06, "loss": 0.7737, "step": 10220 }, { "epoch": 1.0, "grad_norm": 6.790142059326172, "learning_rate": 5.427777777777779e-06, "loss": 0.7657, "step": 10230 }, { "epoch": 1.0, "grad_norm": 6.787601947784424, "learning_rate": 5.422222222222223e-06, "loss": 0.8039, "step": 10240 }, { "epoch": 1.0, "grad_norm": 7.303506374359131, "learning_rate": 5.416666666666667e-06, "loss": 0.7919, "step": 10250 }, { "epoch": 1.01, "grad_norm": 6.302385330200195, "learning_rate": 5.411111111111111e-06, "loss": 0.7823, "step": 10260 }, { "epoch": 1.01, "grad_norm": 8.548937797546387, "learning_rate": 5.405555555555556e-06, "loss": 0.7795, "step": 10270 }, { "epoch": 1.01, "grad_norm": 7.061395168304443, "learning_rate": 5.400000000000001e-06, "loss": 0.7736, "step": 10280 }, { "epoch": 1.01, "grad_norm": 7.917107105255127, "learning_rate": 5.394444444444444e-06, "loss": 0.78, "step": 10290 }, { "epoch": 1.01, "grad_norm": 6.9600677490234375, "learning_rate": 5.388888888888889e-06, "loss": 0.7608, "step": 10300 }, { "epoch": 1.01, "grad_norm": 7.7025651931762695, "learning_rate": 5.383333333333334e-06, "loss": 0.7875, "step": 10310 }, { "epoch": 1.01, "grad_norm": 7.105093955993652, "learning_rate": 5.3777777777777784e-06, "loss": 0.7695, "step": 10320 }, { "epoch": 1.01, "grad_norm": 6.752792835235596, "learning_rate": 5.372222222222223e-06, "loss": 0.7746, "step": 10330 }, { "epoch": 1.01, "grad_norm": 7.004038333892822, "learning_rate": 5.366666666666666e-06, "loss": 0.7779, "step": 10340 }, { "epoch": 1.01, "grad_norm": 6.771589279174805, "learning_rate": 5.361111111111112e-06, "loss": 0.7639, "step": 10350 }, { "epoch": 1.02, "grad_norm": 6.449252605438232, "learning_rate": 5.355555555555556e-06, "loss": 0.7846, "step": 10360 }, { "epoch": 1.02, "grad_norm": 7.27652645111084, "learning_rate": 5.3500000000000004e-06, "loss": 0.7807, "step": 10370 }, { "epoch": 1.02, "grad_norm": 7.641276836395264, "learning_rate": 5.344444444444446e-06, "loss": 0.7705, "step": 10380 }, { "epoch": 1.02, "grad_norm": 7.427809715270996, "learning_rate": 5.338888888888889e-06, "loss": 0.7627, "step": 10390 }, { "epoch": 1.02, "grad_norm": 6.4789838790893555, "learning_rate": 5.333333333333334e-06, "loss": 0.785, "step": 10400 }, { "epoch": 1.02, "grad_norm": 7.497314453125, "learning_rate": 5.327777777777778e-06, "loss": 0.7841, "step": 10410 }, { "epoch": 1.02, "grad_norm": 6.524948596954346, "learning_rate": 5.322222222222223e-06, "loss": 0.7677, "step": 10420 }, { "epoch": 1.02, "grad_norm": 8.56777286529541, "learning_rate": 5.316666666666667e-06, "loss": 0.7718, "step": 10430 }, { "epoch": 1.02, "grad_norm": 8.81805419921875, "learning_rate": 5.311111111111111e-06, "loss": 0.7798, "step": 10440 }, { "epoch": 1.02, "grad_norm": 6.876734256744385, "learning_rate": 5.305555555555556e-06, "loss": 0.7947, "step": 10450 }, { "epoch": 1.03, "grad_norm": 6.171253681182861, "learning_rate": 5.300000000000001e-06, "loss": 0.7895, "step": 10460 }, { "epoch": 1.03, "grad_norm": 7.5997819900512695, "learning_rate": 5.294444444444445e-06, "loss": 0.7588, "step": 10470 }, { "epoch": 1.03, "grad_norm": 8.281477928161621, "learning_rate": 5.288888888888889e-06, "loss": 0.7756, "step": 10480 }, { "epoch": 1.03, "grad_norm": 7.262159824371338, "learning_rate": 5.283333333333333e-06, "loss": 0.7709, "step": 10490 }, { "epoch": 1.03, "grad_norm": 7.292929172515869, "learning_rate": 5.2777777777777785e-06, "loss": 0.7876, "step": 10500 }, { "epoch": 1.03, "grad_norm": 7.4358601570129395, "learning_rate": 5.272222222222223e-06, "loss": 0.7738, "step": 10510 }, { "epoch": 1.03, "grad_norm": 7.284677028656006, "learning_rate": 5.2666666666666665e-06, "loss": 0.7722, "step": 10520 }, { "epoch": 1.03, "grad_norm": 16.528291702270508, "learning_rate": 5.261111111111111e-06, "loss": 0.7769, "step": 10530 }, { "epoch": 1.03, "grad_norm": 7.4214677810668945, "learning_rate": 5.255555555555556e-06, "loss": 0.7806, "step": 10540 }, { "epoch": 1.03, "grad_norm": 8.047987937927246, "learning_rate": 5.2500000000000006e-06, "loss": 0.7705, "step": 10550 }, { "epoch": 1.03, "grad_norm": 7.030718803405762, "learning_rate": 5.244444444444445e-06, "loss": 0.7848, "step": 10560 }, { "epoch": 1.04, "grad_norm": 7.704085350036621, "learning_rate": 5.2388888888888885e-06, "loss": 0.7791, "step": 10570 }, { "epoch": 1.04, "grad_norm": 7.204519748687744, "learning_rate": 5.233333333333334e-06, "loss": 0.7643, "step": 10580 }, { "epoch": 1.04, "grad_norm": 6.923788070678711, "learning_rate": 5.227777777777778e-06, "loss": 0.7577, "step": 10590 }, { "epoch": 1.04, "grad_norm": 7.939523696899414, "learning_rate": 5.2222222222222226e-06, "loss": 0.7657, "step": 10600 }, { "epoch": 1.04, "grad_norm": 6.574915885925293, "learning_rate": 5.216666666666666e-06, "loss": 0.7779, "step": 10610 }, { "epoch": 1.04, "grad_norm": 6.890714645385742, "learning_rate": 5.211111111111111e-06, "loss": 0.7671, "step": 10620 }, { "epoch": 1.04, "grad_norm": 7.500261306762695, "learning_rate": 5.205555555555556e-06, "loss": 0.7858, "step": 10630 }, { "epoch": 1.04, "grad_norm": 7.938897609710693, "learning_rate": 5.2e-06, "loss": 0.7808, "step": 10640 }, { "epoch": 1.04, "grad_norm": 7.144112586975098, "learning_rate": 5.1944444444444454e-06, "loss": 0.7901, "step": 10650 }, { "epoch": 1.04, "grad_norm": 7.783050060272217, "learning_rate": 5.188888888888889e-06, "loss": 0.7677, "step": 10660 }, { "epoch": 1.05, "grad_norm": 8.227736473083496, "learning_rate": 5.183333333333333e-06, "loss": 0.7613, "step": 10670 }, { "epoch": 1.05, "grad_norm": 7.241130352020264, "learning_rate": 5.177777777777779e-06, "loss": 0.7602, "step": 10680 }, { "epoch": 1.05, "grad_norm": 6.7576212882995605, "learning_rate": 5.172222222222223e-06, "loss": 0.7708, "step": 10690 }, { "epoch": 1.05, "grad_norm": 6.645476341247559, "learning_rate": 5.1666666666666675e-06, "loss": 0.7697, "step": 10700 }, { "epoch": 1.05, "grad_norm": 7.1985955238342285, "learning_rate": 5.161111111111111e-06, "loss": 0.7815, "step": 10710 }, { "epoch": 1.05, "grad_norm": 6.331073760986328, "learning_rate": 5.155555555555556e-06, "loss": 0.7908, "step": 10720 }, { "epoch": 1.05, "grad_norm": 8.193612098693848, "learning_rate": 5.150000000000001e-06, "loss": 0.7425, "step": 10730 }, { "epoch": 1.05, "grad_norm": 6.711711883544922, "learning_rate": 5.144444444444445e-06, "loss": 0.7782, "step": 10740 }, { "epoch": 1.05, "grad_norm": 7.859786033630371, "learning_rate": 5.138888888888889e-06, "loss": 0.7686, "step": 10750 }, { "epoch": 1.05, "grad_norm": 7.887181758880615, "learning_rate": 5.133333333333334e-06, "loss": 0.7746, "step": 10760 }, { "epoch": 1.06, "grad_norm": 7.689380645751953, "learning_rate": 5.127777777777778e-06, "loss": 0.7514, "step": 10770 }, { "epoch": 1.06, "grad_norm": 6.9431047439575195, "learning_rate": 5.122222222222223e-06, "loss": 0.7676, "step": 10780 }, { "epoch": 1.06, "grad_norm": 7.557960510253906, "learning_rate": 5.116666666666668e-06, "loss": 0.7619, "step": 10790 }, { "epoch": 1.06, "grad_norm": 7.4696455001831055, "learning_rate": 5.1111111111111115e-06, "loss": 0.7602, "step": 10800 }, { "epoch": 1.06, "grad_norm": 7.264859676361084, "learning_rate": 5.105555555555556e-06, "loss": 0.7673, "step": 10810 }, { "epoch": 1.06, "grad_norm": 6.598574161529541, "learning_rate": 5.1e-06, "loss": 0.7737, "step": 10820 }, { "epoch": 1.06, "grad_norm": 7.837955474853516, "learning_rate": 5.0944444444444455e-06, "loss": 0.7505, "step": 10830 }, { "epoch": 1.06, "grad_norm": 7.541133880615234, "learning_rate": 5.088888888888889e-06, "loss": 0.7639, "step": 10840 }, { "epoch": 1.06, "grad_norm": 7.9933342933654785, "learning_rate": 5.0833333333333335e-06, "loss": 0.7678, "step": 10850 }, { "epoch": 1.06, "grad_norm": 8.484573364257812, "learning_rate": 5.077777777777778e-06, "loss": 0.7506, "step": 10860 }, { "epoch": 1.07, "grad_norm": 8.10472297668457, "learning_rate": 5.072222222222223e-06, "loss": 0.7567, "step": 10870 }, { "epoch": 1.07, "grad_norm": 8.194365501403809, "learning_rate": 5.0666666666666676e-06, "loss": 0.768, "step": 10880 }, { "epoch": 1.07, "grad_norm": 6.342767715454102, "learning_rate": 5.061111111111111e-06, "loss": 0.7743, "step": 10890 }, { "epoch": 1.07, "grad_norm": 8.106664657592773, "learning_rate": 5.0555555555555555e-06, "loss": 0.7854, "step": 10900 }, { "epoch": 1.07, "grad_norm": 8.772774696350098, "learning_rate": 5.050000000000001e-06, "loss": 0.7943, "step": 10910 }, { "epoch": 1.07, "grad_norm": 8.443886756896973, "learning_rate": 5.044444444444445e-06, "loss": 0.7833, "step": 10920 }, { "epoch": 1.07, "grad_norm": 7.84366512298584, "learning_rate": 5.03888888888889e-06, "loss": 0.7492, "step": 10930 }, { "epoch": 1.07, "grad_norm": 7.727038383483887, "learning_rate": 5.033333333333333e-06, "loss": 0.784, "step": 10940 }, { "epoch": 1.07, "grad_norm": 7.438961029052734, "learning_rate": 5.027777777777778e-06, "loss": 0.7807, "step": 10950 }, { "epoch": 1.07, "grad_norm": 7.321585178375244, "learning_rate": 5.022222222222223e-06, "loss": 0.7752, "step": 10960 }, { "epoch": 1.08, "grad_norm": 6.543097496032715, "learning_rate": 5.016666666666667e-06, "loss": 0.7579, "step": 10970 }, { "epoch": 1.08, "grad_norm": 6.780231952667236, "learning_rate": 5.011111111111111e-06, "loss": 0.7426, "step": 10980 }, { "epoch": 1.08, "grad_norm": 7.468822956085205, "learning_rate": 5.005555555555556e-06, "loss": 0.779, "step": 10990 }, { "epoch": 1.08, "grad_norm": 8.029332160949707, "learning_rate": 5e-06, "loss": 0.7953, "step": 11000 }, { "epoch": 1.08, "grad_norm": 6.612665176391602, "learning_rate": 4.994444444444445e-06, "loss": 0.7362, "step": 11010 }, { "epoch": 1.08, "grad_norm": 7.356622695922852, "learning_rate": 4.988888888888889e-06, "loss": 0.7486, "step": 11020 }, { "epoch": 1.08, "grad_norm": 6.145163059234619, "learning_rate": 4.983333333333334e-06, "loss": 0.7713, "step": 11030 }, { "epoch": 1.08, "grad_norm": 8.78792667388916, "learning_rate": 4.977777777777778e-06, "loss": 0.7872, "step": 11040 }, { "epoch": 1.08, "grad_norm": 7.8841233253479, "learning_rate": 4.9722222222222224e-06, "loss": 0.7615, "step": 11050 }, { "epoch": 1.08, "grad_norm": 7.7549943923950195, "learning_rate": 4.966666666666667e-06, "loss": 0.7699, "step": 11060 }, { "epoch": 1.08, "grad_norm": 7.059272766113281, "learning_rate": 4.961111111111111e-06, "loss": 0.7645, "step": 11070 }, { "epoch": 1.09, "grad_norm": 7.266576766967773, "learning_rate": 4.9555555555555565e-06, "loss": 0.7657, "step": 11080 }, { "epoch": 1.09, "grad_norm": 8.116548538208008, "learning_rate": 4.95e-06, "loss": 0.7411, "step": 11090 }, { "epoch": 1.09, "grad_norm": 7.295284271240234, "learning_rate": 4.944444444444445e-06, "loss": 0.7711, "step": 11100 }, { "epoch": 1.09, "grad_norm": 7.4410929679870605, "learning_rate": 4.938888888888889e-06, "loss": 0.7753, "step": 11110 }, { "epoch": 1.09, "grad_norm": 8.007226943969727, "learning_rate": 4.933333333333334e-06, "loss": 0.7622, "step": 11120 }, { "epoch": 1.09, "grad_norm": 6.156768798828125, "learning_rate": 4.927777777777778e-06, "loss": 0.7683, "step": 11130 }, { "epoch": 1.09, "grad_norm": 6.832637786865234, "learning_rate": 4.922222222222223e-06, "loss": 0.7698, "step": 11140 }, { "epoch": 1.09, "grad_norm": 6.576035499572754, "learning_rate": 4.9166666666666665e-06, "loss": 0.7771, "step": 11150 }, { "epoch": 1.09, "grad_norm": 6.393606185913086, "learning_rate": 4.911111111111112e-06, "loss": 0.7657, "step": 11160 }, { "epoch": 1.09, "grad_norm": 6.723941802978516, "learning_rate": 4.905555555555556e-06, "loss": 0.7651, "step": 11170 }, { "epoch": 1.1, "grad_norm": 7.113588809967041, "learning_rate": 4.9000000000000005e-06, "loss": 0.7602, "step": 11180 }, { "epoch": 1.1, "grad_norm": 8.784859657287598, "learning_rate": 4.894444444444445e-06, "loss": 0.7653, "step": 11190 }, { "epoch": 1.1, "grad_norm": 7.776210308074951, "learning_rate": 4.888888888888889e-06, "loss": 0.756, "step": 11200 }, { "epoch": 1.1, "grad_norm": 7.726292133331299, "learning_rate": 4.883333333333334e-06, "loss": 0.7635, "step": 11210 }, { "epoch": 1.1, "grad_norm": 7.038413047790527, "learning_rate": 4.877777777777778e-06, "loss": 0.7628, "step": 11220 }, { "epoch": 1.1, "grad_norm": 7.515929698944092, "learning_rate": 4.8722222222222225e-06, "loss": 0.7582, "step": 11230 }, { "epoch": 1.1, "grad_norm": 8.43639087677002, "learning_rate": 4.866666666666667e-06, "loss": 0.7645, "step": 11240 }, { "epoch": 1.1, "grad_norm": 21.605331420898438, "learning_rate": 4.861111111111111e-06, "loss": 0.7699, "step": 11250 }, { "epoch": 1.1, "grad_norm": 6.188365459442139, "learning_rate": 4.855555555555556e-06, "loss": 0.7833, "step": 11260 }, { "epoch": 1.1, "grad_norm": 8.0693359375, "learning_rate": 4.85e-06, "loss": 0.7576, "step": 11270 }, { "epoch": 1.11, "grad_norm": 7.56965970993042, "learning_rate": 4.8444444444444446e-06, "loss": 0.7506, "step": 11280 }, { "epoch": 1.11, "grad_norm": 7.482113838195801, "learning_rate": 4.838888888888889e-06, "loss": 0.7646, "step": 11290 }, { "epoch": 1.11, "grad_norm": 5.895560264587402, "learning_rate": 4.833333333333333e-06, "loss": 0.7617, "step": 11300 }, { "epoch": 1.11, "grad_norm": 6.85775089263916, "learning_rate": 4.827777777777778e-06, "loss": 0.7704, "step": 11310 }, { "epoch": 1.11, "grad_norm": 7.422086715698242, "learning_rate": 4.822222222222222e-06, "loss": 0.7487, "step": 11320 }, { "epoch": 1.11, "grad_norm": 8.014841079711914, "learning_rate": 4.816666666666667e-06, "loss": 0.7519, "step": 11330 }, { "epoch": 1.11, "grad_norm": 6.577732563018799, "learning_rate": 4.811111111111111e-06, "loss": 0.7568, "step": 11340 }, { "epoch": 1.11, "grad_norm": 7.3951311111450195, "learning_rate": 4.805555555555556e-06, "loss": 0.7624, "step": 11350 }, { "epoch": 1.11, "grad_norm": 7.7715067863464355, "learning_rate": 4.800000000000001e-06, "loss": 0.7757, "step": 11360 }, { "epoch": 1.11, "grad_norm": 7.8936004638671875, "learning_rate": 4.794444444444445e-06, "loss": 0.7599, "step": 11370 }, { "epoch": 1.12, "grad_norm": 7.399590492248535, "learning_rate": 4.7888888888888894e-06, "loss": 0.7861, "step": 11380 }, { "epoch": 1.12, "grad_norm": 7.190341949462891, "learning_rate": 4.783333333333334e-06, "loss": 0.7584, "step": 11390 }, { "epoch": 1.12, "grad_norm": 7.423425674438477, "learning_rate": 4.777777777777778e-06, "loss": 0.7724, "step": 11400 }, { "epoch": 1.12, "grad_norm": 7.391603469848633, "learning_rate": 4.772222222222223e-06, "loss": 0.7588, "step": 11410 }, { "epoch": 1.12, "grad_norm": 7.759237289428711, "learning_rate": 4.766666666666667e-06, "loss": 0.7545, "step": 11420 }, { "epoch": 1.12, "grad_norm": 7.493537902832031, "learning_rate": 4.7611111111111115e-06, "loss": 0.7589, "step": 11430 }, { "epoch": 1.12, "grad_norm": 7.718510627746582, "learning_rate": 4.755555555555556e-06, "loss": 0.766, "step": 11440 }, { "epoch": 1.12, "grad_norm": 7.5454559326171875, "learning_rate": 4.75e-06, "loss": 0.7574, "step": 11450 }, { "epoch": 1.12, "grad_norm": 7.340122222900391, "learning_rate": 4.744444444444445e-06, "loss": 0.7622, "step": 11460 }, { "epoch": 1.12, "grad_norm": 7.229974269866943, "learning_rate": 4.73888888888889e-06, "loss": 0.739, "step": 11470 }, { "epoch": 1.13, "grad_norm": 7.852952003479004, "learning_rate": 4.7333333333333335e-06, "loss": 0.7439, "step": 11480 }, { "epoch": 1.13, "grad_norm": 7.017978668212891, "learning_rate": 4.727777777777779e-06, "loss": 0.7682, "step": 11490 }, { "epoch": 1.13, "grad_norm": 6.152987957000732, "learning_rate": 4.722222222222222e-06, "loss": 0.7414, "step": 11500 }, { "epoch": 1.13, "grad_norm": 6.93749475479126, "learning_rate": 4.7166666666666675e-06, "loss": 0.7523, "step": 11510 }, { "epoch": 1.13, "grad_norm": 7.598892688751221, "learning_rate": 4.711111111111111e-06, "loss": 0.7663, "step": 11520 }, { "epoch": 1.13, "grad_norm": 9.433706283569336, "learning_rate": 4.705555555555556e-06, "loss": 0.7441, "step": 11530 }, { "epoch": 1.13, "grad_norm": 9.015213966369629, "learning_rate": 4.7e-06, "loss": 0.7583, "step": 11540 }, { "epoch": 1.13, "grad_norm": 5.430954933166504, "learning_rate": 4.694444444444445e-06, "loss": 0.7163, "step": 11550 }, { "epoch": 1.13, "grad_norm": 8.170196533203125, "learning_rate": 4.6888888888888895e-06, "loss": 0.7561, "step": 11560 }, { "epoch": 1.13, "grad_norm": 7.6856794357299805, "learning_rate": 4.683333333333334e-06, "loss": 0.7742, "step": 11570 }, { "epoch": 1.13, "grad_norm": 6.598860740661621, "learning_rate": 4.677777777777778e-06, "loss": 0.75, "step": 11580 }, { "epoch": 1.14, "grad_norm": 7.5726542472839355, "learning_rate": 4.672222222222223e-06, "loss": 0.765, "step": 11590 }, { "epoch": 1.14, "grad_norm": 7.516388893127441, "learning_rate": 4.666666666666667e-06, "loss": 0.7587, "step": 11600 }, { "epoch": 1.14, "grad_norm": 6.6947808265686035, "learning_rate": 4.6611111111111116e-06, "loss": 0.7586, "step": 11610 }, { "epoch": 1.14, "grad_norm": 7.406017780303955, "learning_rate": 4.655555555555556e-06, "loss": 0.749, "step": 11620 }, { "epoch": 1.14, "grad_norm": 5.526077747344971, "learning_rate": 4.65e-06, "loss": 0.7701, "step": 11630 }, { "epoch": 1.14, "grad_norm": 7.722912311553955, "learning_rate": 4.644444444444445e-06, "loss": 0.7492, "step": 11640 }, { "epoch": 1.14, "grad_norm": 8.361542701721191, "learning_rate": 4.638888888888889e-06, "loss": 0.7549, "step": 11650 }, { "epoch": 1.14, "grad_norm": 9.13498592376709, "learning_rate": 4.633333333333334e-06, "loss": 0.7807, "step": 11660 }, { "epoch": 1.14, "grad_norm": 6.876678466796875, "learning_rate": 4.627777777777778e-06, "loss": 0.7583, "step": 11670 }, { "epoch": 1.14, "grad_norm": 7.744997024536133, "learning_rate": 4.622222222222222e-06, "loss": 0.75, "step": 11680 }, { "epoch": 1.15, "grad_norm": 8.711613655090332, "learning_rate": 4.616666666666667e-06, "loss": 0.7636, "step": 11690 }, { "epoch": 1.15, "grad_norm": 6.195899486541748, "learning_rate": 4.611111111111112e-06, "loss": 0.7653, "step": 11700 }, { "epoch": 1.15, "grad_norm": 9.37248420715332, "learning_rate": 4.605555555555556e-06, "loss": 0.7628, "step": 11710 }, { "epoch": 1.15, "grad_norm": 9.166922569274902, "learning_rate": 4.600000000000001e-06, "loss": 0.7659, "step": 11720 }, { "epoch": 1.15, "grad_norm": 7.789681911468506, "learning_rate": 4.594444444444444e-06, "loss": 0.7471, "step": 11730 }, { "epoch": 1.15, "grad_norm": 8.198651313781738, "learning_rate": 4.58888888888889e-06, "loss": 0.7437, "step": 11740 }, { "epoch": 1.15, "grad_norm": 6.831643104553223, "learning_rate": 4.583333333333333e-06, "loss": 0.7645, "step": 11750 }, { "epoch": 1.15, "grad_norm": 7.890697479248047, "learning_rate": 4.5777777777777785e-06, "loss": 0.7567, "step": 11760 }, { "epoch": 1.15, "grad_norm": 6.630425930023193, "learning_rate": 4.572222222222222e-06, "loss": 0.7666, "step": 11770 }, { "epoch": 1.15, "grad_norm": 7.3900675773620605, "learning_rate": 4.566666666666667e-06, "loss": 0.773, "step": 11780 }, { "epoch": 1.16, "grad_norm": 6.876664161682129, "learning_rate": 4.561111111111112e-06, "loss": 0.7694, "step": 11790 }, { "epoch": 1.16, "grad_norm": 9.724068641662598, "learning_rate": 4.555555555555556e-06, "loss": 0.7517, "step": 11800 }, { "epoch": 1.16, "grad_norm": 7.131928443908691, "learning_rate": 4.5500000000000005e-06, "loss": 0.7552, "step": 11810 }, { "epoch": 1.16, "grad_norm": 7.77938985824585, "learning_rate": 4.544444444444445e-06, "loss": 0.7571, "step": 11820 }, { "epoch": 1.16, "grad_norm": 7.912930011749268, "learning_rate": 4.538888888888889e-06, "loss": 0.7567, "step": 11830 }, { "epoch": 1.16, "grad_norm": 7.534212589263916, "learning_rate": 4.533333333333334e-06, "loss": 0.7421, "step": 11840 }, { "epoch": 1.16, "grad_norm": 7.509570598602295, "learning_rate": 4.527777777777778e-06, "loss": 0.7485, "step": 11850 }, { "epoch": 1.16, "grad_norm": 8.72992992401123, "learning_rate": 4.5222222222222225e-06, "loss": 0.7508, "step": 11860 }, { "epoch": 1.16, "grad_norm": 7.030019760131836, "learning_rate": 4.516666666666667e-06, "loss": 0.769, "step": 11870 }, { "epoch": 1.16, "grad_norm": 7.5460076332092285, "learning_rate": 4.511111111111111e-06, "loss": 0.7541, "step": 11880 }, { "epoch": 1.17, "grad_norm": 7.343547821044922, "learning_rate": 4.505555555555556e-06, "loss": 0.7382, "step": 11890 }, { "epoch": 1.17, "grad_norm": 7.487570762634277, "learning_rate": 4.5e-06, "loss": 0.7278, "step": 11900 }, { "epoch": 1.17, "grad_norm": 7.479506015777588, "learning_rate": 4.4944444444444445e-06, "loss": 0.7516, "step": 11910 }, { "epoch": 1.17, "grad_norm": 7.458956718444824, "learning_rate": 4.488888888888889e-06, "loss": 0.7645, "step": 11920 }, { "epoch": 1.17, "grad_norm": 7.698587417602539, "learning_rate": 4.483333333333333e-06, "loss": 0.7499, "step": 11930 }, { "epoch": 1.17, "grad_norm": 8.359041213989258, "learning_rate": 4.477777777777778e-06, "loss": 0.7498, "step": 11940 }, { "epoch": 1.17, "grad_norm": 7.825484752655029, "learning_rate": 4.472222222222223e-06, "loss": 0.7516, "step": 11950 }, { "epoch": 1.17, "grad_norm": 6.909610271453857, "learning_rate": 4.4666666666666665e-06, "loss": 0.7714, "step": 11960 }, { "epoch": 1.17, "grad_norm": 9.600847244262695, "learning_rate": 4.461111111111112e-06, "loss": 0.7351, "step": 11970 }, { "epoch": 1.17, "grad_norm": 7.424592018127441, "learning_rate": 4.455555555555555e-06, "loss": 0.7651, "step": 11980 }, { "epoch": 1.18, "grad_norm": 8.575028419494629, "learning_rate": 4.450000000000001e-06, "loss": 0.7548, "step": 11990 }, { "epoch": 1.18, "grad_norm": 9.268083572387695, "learning_rate": 4.444444444444444e-06, "loss": 0.7569, "step": 12000 }, { "epoch": 1.18, "eval_loss": 0.814923882484436, "eval_runtime": 24.8674, "eval_samples_per_second": 26.3, "eval_steps_per_second": 3.297, "step": 12000 }, { "epoch": 1.18, "grad_norm": 7.420086860656738, "learning_rate": 4.438888888888889e-06, "loss": 0.7541, "step": 12010 }, { "epoch": 1.18, "grad_norm": 7.5985941886901855, "learning_rate": 4.433333333333334e-06, "loss": 0.7473, "step": 12020 }, { "epoch": 1.18, "grad_norm": 7.508575439453125, "learning_rate": 4.427777777777778e-06, "loss": 0.7542, "step": 12030 }, { "epoch": 1.18, "grad_norm": 7.207181930541992, "learning_rate": 4.422222222222223e-06, "loss": 0.755, "step": 12040 }, { "epoch": 1.18, "grad_norm": 7.2031073570251465, "learning_rate": 4.416666666666667e-06, "loss": 0.7569, "step": 12050 }, { "epoch": 1.18, "grad_norm": 8.81457805633545, "learning_rate": 4.411111111111111e-06, "loss": 0.7454, "step": 12060 }, { "epoch": 1.18, "grad_norm": 7.493142127990723, "learning_rate": 4.405555555555556e-06, "loss": 0.7423, "step": 12070 }, { "epoch": 1.18, "grad_norm": 8.065194129943848, "learning_rate": 4.4e-06, "loss": 0.7541, "step": 12080 }, { "epoch": 1.18, "grad_norm": 6.5527544021606445, "learning_rate": 4.3944444444444455e-06, "loss": 0.7426, "step": 12090 }, { "epoch": 1.19, "grad_norm": 8.562774658203125, "learning_rate": 4.388888888888889e-06, "loss": 0.7497, "step": 12100 }, { "epoch": 1.19, "grad_norm": 7.721785545349121, "learning_rate": 4.383333333333334e-06, "loss": 0.7341, "step": 12110 }, { "epoch": 1.19, "grad_norm": 7.356851100921631, "learning_rate": 4.377777777777778e-06, "loss": 0.7481, "step": 12120 }, { "epoch": 1.19, "grad_norm": 7.928640842437744, "learning_rate": 4.372222222222223e-06, "loss": 0.7275, "step": 12130 }, { "epoch": 1.19, "grad_norm": 8.326993942260742, "learning_rate": 4.366666666666667e-06, "loss": 0.7477, "step": 12140 }, { "epoch": 1.19, "grad_norm": 7.219991683959961, "learning_rate": 4.361111111111112e-06, "loss": 0.7475, "step": 12150 }, { "epoch": 1.19, "grad_norm": 7.6060709953308105, "learning_rate": 4.3555555555555555e-06, "loss": 0.7257, "step": 12160 }, { "epoch": 1.19, "grad_norm": 7.237653732299805, "learning_rate": 4.350000000000001e-06, "loss": 0.7566, "step": 12170 }, { "epoch": 1.19, "grad_norm": 7.0059003829956055, "learning_rate": 4.344444444444445e-06, "loss": 0.7573, "step": 12180 }, { "epoch": 1.19, "grad_norm": 6.628710746765137, "learning_rate": 4.3388888888888895e-06, "loss": 0.7337, "step": 12190 }, { "epoch": 1.2, "grad_norm": 8.169317245483398, "learning_rate": 4.333333333333334e-06, "loss": 0.7578, "step": 12200 }, { "epoch": 1.2, "grad_norm": 7.880176544189453, "learning_rate": 4.327777777777778e-06, "loss": 0.7429, "step": 12210 }, { "epoch": 1.2, "grad_norm": 6.478607654571533, "learning_rate": 4.322222222222223e-06, "loss": 0.756, "step": 12220 }, { "epoch": 1.2, "grad_norm": 7.812920093536377, "learning_rate": 4.316666666666667e-06, "loss": 0.7292, "step": 12230 }, { "epoch": 1.2, "grad_norm": 7.692933559417725, "learning_rate": 4.3111111111111115e-06, "loss": 0.7377, "step": 12240 }, { "epoch": 1.2, "grad_norm": 7.9746599197387695, "learning_rate": 4.305555555555556e-06, "loss": 0.7236, "step": 12250 }, { "epoch": 1.2, "grad_norm": 6.8361310958862305, "learning_rate": 4.3e-06, "loss": 0.741, "step": 12260 }, { "epoch": 1.2, "grad_norm": 6.825582981109619, "learning_rate": 4.294444444444445e-06, "loss": 0.748, "step": 12270 }, { "epoch": 1.2, "grad_norm": 7.545291423797607, "learning_rate": 4.288888888888889e-06, "loss": 0.7345, "step": 12280 }, { "epoch": 1.2, "grad_norm": 7.18121337890625, "learning_rate": 4.2833333333333335e-06, "loss": 0.7647, "step": 12290 }, { "epoch": 1.21, "grad_norm": 8.358675003051758, "learning_rate": 4.277777777777778e-06, "loss": 0.7604, "step": 12300 }, { "epoch": 1.21, "grad_norm": 7.752681732177734, "learning_rate": 4.272222222222222e-06, "loss": 0.7349, "step": 12310 }, { "epoch": 1.21, "grad_norm": 7.838076114654541, "learning_rate": 4.266666666666668e-06, "loss": 0.7432, "step": 12320 }, { "epoch": 1.21, "grad_norm": 7.600425720214844, "learning_rate": 4.261111111111111e-06, "loss": 0.7585, "step": 12330 }, { "epoch": 1.21, "grad_norm": 7.591167449951172, "learning_rate": 4.255555555555556e-06, "loss": 0.7514, "step": 12340 }, { "epoch": 1.21, "grad_norm": 8.151092529296875, "learning_rate": 4.25e-06, "loss": 0.7347, "step": 12350 }, { "epoch": 1.21, "grad_norm": 6.637580871582031, "learning_rate": 4.244444444444445e-06, "loss": 0.756, "step": 12360 }, { "epoch": 1.21, "grad_norm": 6.91485595703125, "learning_rate": 4.238888888888889e-06, "loss": 0.7495, "step": 12370 }, { "epoch": 1.21, "grad_norm": 6.892335414886475, "learning_rate": 4.233333333333334e-06, "loss": 0.7468, "step": 12380 }, { "epoch": 1.21, "grad_norm": 7.128693580627441, "learning_rate": 4.227777777777778e-06, "loss": 0.7526, "step": 12390 }, { "epoch": 1.22, "grad_norm": 7.09757137298584, "learning_rate": 4.222222222222223e-06, "loss": 0.7386, "step": 12400 }, { "epoch": 1.22, "grad_norm": 7.46950626373291, "learning_rate": 4.216666666666667e-06, "loss": 0.7529, "step": 12410 }, { "epoch": 1.22, "grad_norm": 7.822963714599609, "learning_rate": 4.211111111111112e-06, "loss": 0.7542, "step": 12420 }, { "epoch": 1.22, "grad_norm": 7.751564979553223, "learning_rate": 4.205555555555556e-06, "loss": 0.7699, "step": 12430 }, { "epoch": 1.22, "grad_norm": 8.372063636779785, "learning_rate": 4.2000000000000004e-06, "loss": 0.7392, "step": 12440 }, { "epoch": 1.22, "grad_norm": 7.708549499511719, "learning_rate": 4.194444444444445e-06, "loss": 0.7347, "step": 12450 }, { "epoch": 1.22, "grad_norm": 7.172439098358154, "learning_rate": 4.188888888888889e-06, "loss": 0.7485, "step": 12460 }, { "epoch": 1.22, "grad_norm": 7.454358100891113, "learning_rate": 4.183333333333334e-06, "loss": 0.7269, "step": 12470 }, { "epoch": 1.22, "grad_norm": 8.666786193847656, "learning_rate": 4.177777777777778e-06, "loss": 0.7454, "step": 12480 }, { "epoch": 1.22, "grad_norm": 8.271525382995605, "learning_rate": 4.1722222222222225e-06, "loss": 0.7526, "step": 12490 }, { "epoch": 1.23, "grad_norm": 6.362132549285889, "learning_rate": 4.166666666666667e-06, "loss": 0.7355, "step": 12500 }, { "epoch": 1.23, "grad_norm": 7.817122936248779, "learning_rate": 4.161111111111111e-06, "loss": 0.7235, "step": 12510 }, { "epoch": 1.23, "grad_norm": 6.638051986694336, "learning_rate": 4.155555555555556e-06, "loss": 0.7549, "step": 12520 }, { "epoch": 1.23, "grad_norm": 6.889431476593018, "learning_rate": 4.15e-06, "loss": 0.7304, "step": 12530 }, { "epoch": 1.23, "grad_norm": 8.96534538269043, "learning_rate": 4.1444444444444445e-06, "loss": 0.7287, "step": 12540 }, { "epoch": 1.23, "grad_norm": 10.080416679382324, "learning_rate": 4.138888888888889e-06, "loss": 0.7337, "step": 12550 }, { "epoch": 1.23, "grad_norm": 7.488158702850342, "learning_rate": 4.133333333333333e-06, "loss": 0.7391, "step": 12560 }, { "epoch": 1.23, "grad_norm": 6.709292888641357, "learning_rate": 4.1277777777777785e-06, "loss": 0.7296, "step": 12570 }, { "epoch": 1.23, "grad_norm": 9.03792953491211, "learning_rate": 4.122222222222222e-06, "loss": 0.7432, "step": 12580 }, { "epoch": 1.23, "grad_norm": 7.2098388671875, "learning_rate": 4.116666666666667e-06, "loss": 0.75, "step": 12590 }, { "epoch": 1.23, "grad_norm": 7.301801681518555, "learning_rate": 4.111111111111111e-06, "loss": 0.7231, "step": 12600 }, { "epoch": 1.24, "grad_norm": 7.947643756866455, "learning_rate": 4.105555555555556e-06, "loss": 0.7512, "step": 12610 }, { "epoch": 1.24, "grad_norm": 7.413721561431885, "learning_rate": 4.1e-06, "loss": 0.7479, "step": 12620 }, { "epoch": 1.24, "grad_norm": 7.5264997482299805, "learning_rate": 4.094444444444445e-06, "loss": 0.7398, "step": 12630 }, { "epoch": 1.24, "grad_norm": 7.645998954772949, "learning_rate": 4.088888888888889e-06, "loss": 0.7492, "step": 12640 }, { "epoch": 1.24, "grad_norm": 7.774545669555664, "learning_rate": 4.083333333333334e-06, "loss": 0.7397, "step": 12650 }, { "epoch": 1.24, "grad_norm": 7.49631404876709, "learning_rate": 4.077777777777778e-06, "loss": 0.7477, "step": 12660 }, { "epoch": 1.24, "grad_norm": 7.461100101470947, "learning_rate": 4.0722222222222226e-06, "loss": 0.7549, "step": 12670 }, { "epoch": 1.24, "grad_norm": 7.210879325866699, "learning_rate": 4.066666666666667e-06, "loss": 0.737, "step": 12680 }, { "epoch": 1.24, "grad_norm": 7.678642749786377, "learning_rate": 4.061111111111111e-06, "loss": 0.7397, "step": 12690 }, { "epoch": 1.24, "grad_norm": 9.065237998962402, "learning_rate": 4.055555555555556e-06, "loss": 0.7163, "step": 12700 }, { "epoch": 1.25, "grad_norm": 8.956584930419922, "learning_rate": 4.05e-06, "loss": 0.7264, "step": 12710 }, { "epoch": 1.25, "grad_norm": 6.252303600311279, "learning_rate": 4.044444444444445e-06, "loss": 0.7434, "step": 12720 }, { "epoch": 1.25, "grad_norm": 7.706328868865967, "learning_rate": 4.038888888888889e-06, "loss": 0.758, "step": 12730 }, { "epoch": 1.25, "grad_norm": 7.414504051208496, "learning_rate": 4.033333333333333e-06, "loss": 0.7593, "step": 12740 }, { "epoch": 1.25, "grad_norm": 6.570133209228516, "learning_rate": 4.027777777777779e-06, "loss": 0.7392, "step": 12750 }, { "epoch": 1.25, "grad_norm": 8.785941123962402, "learning_rate": 4.022222222222222e-06, "loss": 0.7452, "step": 12760 }, { "epoch": 1.25, "grad_norm": 7.725743293762207, "learning_rate": 4.0166666666666675e-06, "loss": 0.7413, "step": 12770 }, { "epoch": 1.25, "grad_norm": 8.243194580078125, "learning_rate": 4.011111111111111e-06, "loss": 0.758, "step": 12780 }, { "epoch": 1.25, "grad_norm": 8.696666717529297, "learning_rate": 4.005555555555556e-06, "loss": 0.7412, "step": 12790 }, { "epoch": 1.25, "grad_norm": 8.810306549072266, "learning_rate": 4.000000000000001e-06, "loss": 0.7318, "step": 12800 }, { "epoch": 1.26, "grad_norm": 8.18591594696045, "learning_rate": 3.994444444444445e-06, "loss": 0.7471, "step": 12810 }, { "epoch": 1.26, "grad_norm": 9.145063400268555, "learning_rate": 3.9888888888888895e-06, "loss": 0.7256, "step": 12820 }, { "epoch": 1.26, "grad_norm": 8.055453300476074, "learning_rate": 3.983333333333334e-06, "loss": 0.7688, "step": 12830 }, { "epoch": 1.26, "grad_norm": 8.157913208007812, "learning_rate": 3.977777777777778e-06, "loss": 0.7436, "step": 12840 }, { "epoch": 1.26, "grad_norm": 7.680800914764404, "learning_rate": 3.972222222222223e-06, "loss": 0.7366, "step": 12850 }, { "epoch": 1.26, "grad_norm": 8.371052742004395, "learning_rate": 3.966666666666667e-06, "loss": 0.7326, "step": 12860 }, { "epoch": 1.26, "grad_norm": 8.189726829528809, "learning_rate": 3.9611111111111115e-06, "loss": 0.7399, "step": 12870 }, { "epoch": 1.26, "grad_norm": 11.572422981262207, "learning_rate": 3.955555555555556e-06, "loss": 0.7478, "step": 12880 }, { "epoch": 1.26, "grad_norm": 8.338741302490234, "learning_rate": 3.95e-06, "loss": 0.7483, "step": 12890 }, { "epoch": 1.26, "grad_norm": 6.87857723236084, "learning_rate": 3.944444444444445e-06, "loss": 0.7392, "step": 12900 }, { "epoch": 1.27, "grad_norm": 7.253726482391357, "learning_rate": 3.938888888888889e-06, "loss": 0.7453, "step": 12910 }, { "epoch": 1.27, "grad_norm": 8.087032318115234, "learning_rate": 3.9333333333333335e-06, "loss": 0.7326, "step": 12920 }, { "epoch": 1.27, "grad_norm": 7.3596510887146, "learning_rate": 3.927777777777778e-06, "loss": 0.7502, "step": 12930 }, { "epoch": 1.27, "grad_norm": 7.622371673583984, "learning_rate": 3.922222222222223e-06, "loss": 0.7154, "step": 12940 }, { "epoch": 1.27, "grad_norm": 8.481427192687988, "learning_rate": 3.916666666666667e-06, "loss": 0.7484, "step": 12950 }, { "epoch": 1.27, "grad_norm": 49.327781677246094, "learning_rate": 3.911111111111112e-06, "loss": 0.7372, "step": 12960 }, { "epoch": 1.27, "grad_norm": 6.995118141174316, "learning_rate": 3.9055555555555555e-06, "loss": 0.7427, "step": 12970 }, { "epoch": 1.27, "grad_norm": 7.287441730499268, "learning_rate": 3.900000000000001e-06, "loss": 0.7441, "step": 12980 }, { "epoch": 1.27, "grad_norm": 7.666792392730713, "learning_rate": 3.894444444444444e-06, "loss": 0.7269, "step": 12990 }, { "epoch": 1.27, "grad_norm": 8.61199951171875, "learning_rate": 3.88888888888889e-06, "loss": 0.73, "step": 13000 }, { "epoch": 1.27, "grad_norm": 9.109180450439453, "learning_rate": 3.883333333333333e-06, "loss": 0.7092, "step": 13010 }, { "epoch": 1.28, "grad_norm": 8.31975269317627, "learning_rate": 3.877777777777778e-06, "loss": 0.736, "step": 13020 }, { "epoch": 1.28, "grad_norm": 8.103517532348633, "learning_rate": 3.872222222222223e-06, "loss": 0.7529, "step": 13030 }, { "epoch": 1.28, "grad_norm": 7.7198991775512695, "learning_rate": 3.866666666666667e-06, "loss": 0.727, "step": 13040 }, { "epoch": 1.28, "grad_norm": 7.725632190704346, "learning_rate": 3.861111111111112e-06, "loss": 0.7345, "step": 13050 }, { "epoch": 1.28, "grad_norm": 7.058694362640381, "learning_rate": 3.855555555555556e-06, "loss": 0.7152, "step": 13060 }, { "epoch": 1.28, "grad_norm": 8.383444786071777, "learning_rate": 3.85e-06, "loss": 0.7415, "step": 13070 }, { "epoch": 1.28, "grad_norm": 9.496871948242188, "learning_rate": 3.844444444444445e-06, "loss": 0.7468, "step": 13080 }, { "epoch": 1.28, "grad_norm": 7.497513294219971, "learning_rate": 3.838888888888889e-06, "loss": 0.7387, "step": 13090 }, { "epoch": 1.28, "grad_norm": 6.643813133239746, "learning_rate": 3.833333333333334e-06, "loss": 0.7346, "step": 13100 }, { "epoch": 1.28, "grad_norm": 7.250507354736328, "learning_rate": 3.827777777777778e-06, "loss": 0.7396, "step": 13110 }, { "epoch": 1.29, "grad_norm": 7.755252838134766, "learning_rate": 3.8222222222222224e-06, "loss": 0.7283, "step": 13120 }, { "epoch": 1.29, "grad_norm": 6.675044059753418, "learning_rate": 3.816666666666667e-06, "loss": 0.7478, "step": 13130 }, { "epoch": 1.29, "grad_norm": 6.9016242027282715, "learning_rate": 3.8111111111111117e-06, "loss": 0.7424, "step": 13140 }, { "epoch": 1.29, "grad_norm": 6.991751194000244, "learning_rate": 3.8055555555555556e-06, "loss": 0.7309, "step": 13150 }, { "epoch": 1.29, "grad_norm": 7.467940807342529, "learning_rate": 3.8000000000000005e-06, "loss": 0.7293, "step": 13160 }, { "epoch": 1.29, "grad_norm": 8.119007110595703, "learning_rate": 3.7944444444444444e-06, "loss": 0.7234, "step": 13170 }, { "epoch": 1.29, "grad_norm": 7.165918827056885, "learning_rate": 3.7888888888888893e-06, "loss": 0.7282, "step": 13180 }, { "epoch": 1.29, "grad_norm": 6.5443220138549805, "learning_rate": 3.7833333333333337e-06, "loss": 0.7417, "step": 13190 }, { "epoch": 1.29, "grad_norm": 8.107033729553223, "learning_rate": 3.777777777777778e-06, "loss": 0.7153, "step": 13200 }, { "epoch": 1.29, "grad_norm": 6.75696325302124, "learning_rate": 3.7722222222222225e-06, "loss": 0.7304, "step": 13210 }, { "epoch": 1.3, "grad_norm": 6.502655506134033, "learning_rate": 3.766666666666667e-06, "loss": 0.7352, "step": 13220 }, { "epoch": 1.3, "grad_norm": 7.507099151611328, "learning_rate": 3.7611111111111113e-06, "loss": 0.7515, "step": 13230 }, { "epoch": 1.3, "grad_norm": 8.283516883850098, "learning_rate": 3.7555555555555557e-06, "loss": 0.7347, "step": 13240 }, { "epoch": 1.3, "grad_norm": 7.471385478973389, "learning_rate": 3.7500000000000005e-06, "loss": 0.7397, "step": 13250 }, { "epoch": 1.3, "grad_norm": 7.856815338134766, "learning_rate": 3.744444444444445e-06, "loss": 0.7134, "step": 13260 }, { "epoch": 1.3, "grad_norm": 6.8537092208862305, "learning_rate": 3.7388888888888893e-06, "loss": 0.7189, "step": 13270 }, { "epoch": 1.3, "grad_norm": 7.271474838256836, "learning_rate": 3.7333333333333337e-06, "loss": 0.7686, "step": 13280 }, { "epoch": 1.3, "grad_norm": 8.50648307800293, "learning_rate": 3.727777777777778e-06, "loss": 0.7065, "step": 13290 }, { "epoch": 1.3, "grad_norm": 8.052899360656738, "learning_rate": 3.7222222222222225e-06, "loss": 0.7408, "step": 13300 }, { "epoch": 1.3, "grad_norm": 8.009530067443848, "learning_rate": 3.716666666666667e-06, "loss": 0.7202, "step": 13310 }, { "epoch": 1.31, "grad_norm": 9.511129379272461, "learning_rate": 3.7111111111111113e-06, "loss": 0.7481, "step": 13320 }, { "epoch": 1.31, "grad_norm": 8.695572853088379, "learning_rate": 3.705555555555556e-06, "loss": 0.743, "step": 13330 }, { "epoch": 1.31, "grad_norm": 8.781993865966797, "learning_rate": 3.7e-06, "loss": 0.7209, "step": 13340 }, { "epoch": 1.31, "grad_norm": 6.763775825500488, "learning_rate": 3.694444444444445e-06, "loss": 0.7438, "step": 13350 }, { "epoch": 1.31, "grad_norm": 8.048567771911621, "learning_rate": 3.688888888888889e-06, "loss": 0.7169, "step": 13360 }, { "epoch": 1.31, "grad_norm": 6.9250969886779785, "learning_rate": 3.6833333333333338e-06, "loss": 0.7045, "step": 13370 }, { "epoch": 1.31, "grad_norm": 9.201193809509277, "learning_rate": 3.6777777777777778e-06, "loss": 0.7361, "step": 13380 }, { "epoch": 1.31, "grad_norm": 7.982819080352783, "learning_rate": 3.6722222222222226e-06, "loss": 0.7251, "step": 13390 }, { "epoch": 1.31, "grad_norm": 7.63870096206665, "learning_rate": 3.6666666666666666e-06, "loss": 0.7337, "step": 13400 }, { "epoch": 1.31, "grad_norm": 7.4017333984375, "learning_rate": 3.6611111111111114e-06, "loss": 0.7323, "step": 13410 }, { "epoch": 1.32, "grad_norm": 8.209833145141602, "learning_rate": 3.6555555555555562e-06, "loss": 0.7356, "step": 13420 }, { "epoch": 1.32, "grad_norm": 7.37398624420166, "learning_rate": 3.65e-06, "loss": 0.7217, "step": 13430 }, { "epoch": 1.32, "grad_norm": 6.759220600128174, "learning_rate": 3.644444444444445e-06, "loss": 0.7434, "step": 13440 }, { "epoch": 1.32, "grad_norm": 7.181795120239258, "learning_rate": 3.638888888888889e-06, "loss": 0.7246, "step": 13450 }, { "epoch": 1.32, "grad_norm": 7.8988037109375, "learning_rate": 3.633333333333334e-06, "loss": 0.7284, "step": 13460 }, { "epoch": 1.32, "grad_norm": 7.325245380401611, "learning_rate": 3.627777777777778e-06, "loss": 0.7124, "step": 13470 }, { "epoch": 1.32, "grad_norm": 7.752197265625, "learning_rate": 3.6222222222222226e-06, "loss": 0.7398, "step": 13480 }, { "epoch": 1.32, "grad_norm": 7.202948570251465, "learning_rate": 3.616666666666667e-06, "loss": 0.736, "step": 13490 }, { "epoch": 1.32, "grad_norm": 7.683246612548828, "learning_rate": 3.6111111111111115e-06, "loss": 0.7273, "step": 13500 }, { "epoch": 1.32, "grad_norm": 8.050540924072266, "learning_rate": 3.605555555555556e-06, "loss": 0.7188, "step": 13510 }, { "epoch": 1.32, "grad_norm": 6.728517055511475, "learning_rate": 3.6000000000000003e-06, "loss": 0.6956, "step": 13520 }, { "epoch": 1.33, "grad_norm": 7.7550811767578125, "learning_rate": 3.5944444444444447e-06, "loss": 0.7573, "step": 13530 }, { "epoch": 1.33, "grad_norm": 7.722121715545654, "learning_rate": 3.588888888888889e-06, "loss": 0.7381, "step": 13540 }, { "epoch": 1.33, "grad_norm": 21.2546443939209, "learning_rate": 3.5833333333333335e-06, "loss": 0.7232, "step": 13550 }, { "epoch": 1.33, "grad_norm": 9.008973121643066, "learning_rate": 3.577777777777778e-06, "loss": 0.7361, "step": 13560 }, { "epoch": 1.33, "grad_norm": 9.022512435913086, "learning_rate": 3.5722222222222223e-06, "loss": 0.734, "step": 13570 }, { "epoch": 1.33, "grad_norm": 7.778449058532715, "learning_rate": 3.566666666666667e-06, "loss": 0.7232, "step": 13580 }, { "epoch": 1.33, "grad_norm": 7.739919662475586, "learning_rate": 3.561111111111111e-06, "loss": 0.7272, "step": 13590 }, { "epoch": 1.33, "grad_norm": 9.186176300048828, "learning_rate": 3.555555555555556e-06, "loss": 0.7368, "step": 13600 }, { "epoch": 1.33, "grad_norm": 7.061432838439941, "learning_rate": 3.5500000000000003e-06, "loss": 0.7308, "step": 13610 }, { "epoch": 1.33, "grad_norm": 7.202503681182861, "learning_rate": 3.5444444444444447e-06, "loss": 0.7379, "step": 13620 }, { "epoch": 1.34, "grad_norm": 6.515293121337891, "learning_rate": 3.538888888888889e-06, "loss": 0.7377, "step": 13630 }, { "epoch": 1.34, "grad_norm": 7.428168296813965, "learning_rate": 3.5333333333333335e-06, "loss": 0.7253, "step": 13640 }, { "epoch": 1.34, "grad_norm": 8.037849426269531, "learning_rate": 3.5277777777777784e-06, "loss": 0.7225, "step": 13650 }, { "epoch": 1.34, "grad_norm": 7.264064788818359, "learning_rate": 3.5222222222222223e-06, "loss": 0.7294, "step": 13660 }, { "epoch": 1.34, "grad_norm": 7.782707691192627, "learning_rate": 3.516666666666667e-06, "loss": 0.7217, "step": 13670 }, { "epoch": 1.34, "grad_norm": 7.600493907928467, "learning_rate": 3.511111111111111e-06, "loss": 0.7263, "step": 13680 }, { "epoch": 1.34, "grad_norm": 7.154909610748291, "learning_rate": 3.505555555555556e-06, "loss": 0.7389, "step": 13690 }, { "epoch": 1.34, "grad_norm": 7.191118240356445, "learning_rate": 3.5e-06, "loss": 0.732, "step": 13700 }, { "epoch": 1.34, "grad_norm": 8.621413230895996, "learning_rate": 3.4944444444444448e-06, "loss": 0.7293, "step": 13710 }, { "epoch": 1.34, "grad_norm": 8.319443702697754, "learning_rate": 3.4888888888888896e-06, "loss": 0.7302, "step": 13720 }, { "epoch": 1.35, "grad_norm": 7.00616979598999, "learning_rate": 3.4833333333333336e-06, "loss": 0.7272, "step": 13730 }, { "epoch": 1.35, "grad_norm": 7.167968273162842, "learning_rate": 3.4777777777777784e-06, "loss": 0.7158, "step": 13740 }, { "epoch": 1.35, "grad_norm": 8.248542785644531, "learning_rate": 3.4722222222222224e-06, "loss": 0.7358, "step": 13750 }, { "epoch": 1.35, "grad_norm": 8.579115867614746, "learning_rate": 3.4666666666666672e-06, "loss": 0.7312, "step": 13760 }, { "epoch": 1.35, "grad_norm": 7.287515163421631, "learning_rate": 3.461111111111111e-06, "loss": 0.7311, "step": 13770 }, { "epoch": 1.35, "grad_norm": 8.434518814086914, "learning_rate": 3.455555555555556e-06, "loss": 0.7418, "step": 13780 }, { "epoch": 1.35, "grad_norm": 7.713160037994385, "learning_rate": 3.45e-06, "loss": 0.7206, "step": 13790 }, { "epoch": 1.35, "grad_norm": 6.587376117706299, "learning_rate": 3.444444444444445e-06, "loss": 0.731, "step": 13800 }, { "epoch": 1.35, "grad_norm": 7.342635631561279, "learning_rate": 3.4388888888888892e-06, "loss": 0.7423, "step": 13810 }, { "epoch": 1.35, "grad_norm": 7.522426605224609, "learning_rate": 3.4333333333333336e-06, "loss": 0.7202, "step": 13820 }, { "epoch": 1.36, "grad_norm": 7.583451271057129, "learning_rate": 3.427777777777778e-06, "loss": 0.7259, "step": 13830 }, { "epoch": 1.36, "grad_norm": 8.353509902954102, "learning_rate": 3.4222222222222224e-06, "loss": 0.7445, "step": 13840 }, { "epoch": 1.36, "grad_norm": 7.3174357414245605, "learning_rate": 3.416666666666667e-06, "loss": 0.7322, "step": 13850 }, { "epoch": 1.36, "grad_norm": 7.909167766571045, "learning_rate": 3.4111111111111113e-06, "loss": 0.7092, "step": 13860 }, { "epoch": 1.36, "grad_norm": 9.0552339553833, "learning_rate": 3.4055555555555557e-06, "loss": 0.7381, "step": 13870 }, { "epoch": 1.36, "grad_norm": 7.913680553436279, "learning_rate": 3.4000000000000005e-06, "loss": 0.7276, "step": 13880 }, { "epoch": 1.36, "grad_norm": 7.699253082275391, "learning_rate": 3.3944444444444445e-06, "loss": 0.7071, "step": 13890 }, { "epoch": 1.36, "grad_norm": 8.405613899230957, "learning_rate": 3.3888888888888893e-06, "loss": 0.7354, "step": 13900 }, { "epoch": 1.36, "grad_norm": 7.840478420257568, "learning_rate": 3.3833333333333333e-06, "loss": 0.7294, "step": 13910 }, { "epoch": 1.36, "grad_norm": 8.074596405029297, "learning_rate": 3.377777777777778e-06, "loss": 0.7056, "step": 13920 }, { "epoch": 1.37, "grad_norm": 8.03394889831543, "learning_rate": 3.372222222222222e-06, "loss": 0.727, "step": 13930 }, { "epoch": 1.37, "grad_norm": 8.011146545410156, "learning_rate": 3.366666666666667e-06, "loss": 0.7256, "step": 13940 }, { "epoch": 1.37, "grad_norm": 7.315608501434326, "learning_rate": 3.3611111111111117e-06, "loss": 0.7468, "step": 13950 }, { "epoch": 1.37, "grad_norm": 7.826065540313721, "learning_rate": 3.3555555555555557e-06, "loss": 0.7239, "step": 13960 }, { "epoch": 1.37, "grad_norm": 8.032958984375, "learning_rate": 3.3500000000000005e-06, "loss": 0.7251, "step": 13970 }, { "epoch": 1.37, "grad_norm": 7.3150410652160645, "learning_rate": 3.3444444444444445e-06, "loss": 0.7255, "step": 13980 }, { "epoch": 1.37, "grad_norm": 8.84343147277832, "learning_rate": 3.3388888888888893e-06, "loss": 0.7036, "step": 13990 }, { "epoch": 1.37, "grad_norm": 7.606759548187256, "learning_rate": 3.3333333333333333e-06, "loss": 0.7367, "step": 14000 }, { "epoch": 1.37, "eval_loss": 0.7810459733009338, "eval_runtime": 24.9641, "eval_samples_per_second": 26.198, "eval_steps_per_second": 3.285, "step": 14000 }, { "epoch": 1.37, "grad_norm": 7.466320514678955, "learning_rate": 3.327777777777778e-06, "loss": 0.715, "step": 14010 }, { "epoch": 1.37, "grad_norm": 7.212748050689697, "learning_rate": 3.322222222222222e-06, "loss": 0.728, "step": 14020 }, { "epoch": 1.37, "grad_norm": 7.793807029724121, "learning_rate": 3.316666666666667e-06, "loss": 0.7223, "step": 14030 }, { "epoch": 1.38, "grad_norm": 8.107499122619629, "learning_rate": 3.3111111111111118e-06, "loss": 0.7179, "step": 14040 }, { "epoch": 1.38, "grad_norm": 8.639795303344727, "learning_rate": 3.3055555555555558e-06, "loss": 0.7105, "step": 14050 }, { "epoch": 1.38, "grad_norm": 7.531116962432861, "learning_rate": 3.3000000000000006e-06, "loss": 0.732, "step": 14060 }, { "epoch": 1.38, "grad_norm": 9.058149337768555, "learning_rate": 3.2944444444444446e-06, "loss": 0.7341, "step": 14070 }, { "epoch": 1.38, "grad_norm": 8.69173812866211, "learning_rate": 3.2888888888888894e-06, "loss": 0.7216, "step": 14080 }, { "epoch": 1.38, "grad_norm": 8.526257514953613, "learning_rate": 3.2833333333333334e-06, "loss": 0.7254, "step": 14090 }, { "epoch": 1.38, "grad_norm": 8.797303199768066, "learning_rate": 3.277777777777778e-06, "loss": 0.7241, "step": 14100 }, { "epoch": 1.38, "grad_norm": 8.553472518920898, "learning_rate": 3.2722222222222226e-06, "loss": 0.7142, "step": 14110 }, { "epoch": 1.38, "grad_norm": 7.387404441833496, "learning_rate": 3.266666666666667e-06, "loss": 0.7419, "step": 14120 }, { "epoch": 1.38, "grad_norm": 6.533252239227295, "learning_rate": 3.2611111111111114e-06, "loss": 0.717, "step": 14130 }, { "epoch": 1.39, "grad_norm": 7.305422782897949, "learning_rate": 3.255555555555556e-06, "loss": 0.7042, "step": 14140 }, { "epoch": 1.39, "grad_norm": 8.66522216796875, "learning_rate": 3.2500000000000002e-06, "loss": 0.7223, "step": 14150 }, { "epoch": 1.39, "grad_norm": 7.750004291534424, "learning_rate": 3.2444444444444446e-06, "loss": 0.7154, "step": 14160 }, { "epoch": 1.39, "grad_norm": 7.397431373596191, "learning_rate": 3.238888888888889e-06, "loss": 0.7107, "step": 14170 }, { "epoch": 1.39, "grad_norm": 6.74337100982666, "learning_rate": 3.2333333333333334e-06, "loss": 0.7124, "step": 14180 }, { "epoch": 1.39, "grad_norm": 7.597622871398926, "learning_rate": 3.227777777777778e-06, "loss": 0.73, "step": 14190 }, { "epoch": 1.39, "grad_norm": 8.62228012084961, "learning_rate": 3.2222222222222227e-06, "loss": 0.7103, "step": 14200 }, { "epoch": 1.39, "grad_norm": 6.56667947769165, "learning_rate": 3.2166666666666666e-06, "loss": 0.722, "step": 14210 }, { "epoch": 1.39, "grad_norm": 7.34277868270874, "learning_rate": 3.2111111111111115e-06, "loss": 0.7183, "step": 14220 }, { "epoch": 1.39, "grad_norm": 8.27258586883545, "learning_rate": 3.2055555555555555e-06, "loss": 0.7295, "step": 14230 }, { "epoch": 1.4, "grad_norm": 7.461408615112305, "learning_rate": 3.2000000000000003e-06, "loss": 0.6994, "step": 14240 }, { "epoch": 1.4, "grad_norm": 8.621198654174805, "learning_rate": 3.1944444444444443e-06, "loss": 0.7146, "step": 14250 }, { "epoch": 1.4, "grad_norm": 7.430715560913086, "learning_rate": 3.188888888888889e-06, "loss": 0.7136, "step": 14260 }, { "epoch": 1.4, "grad_norm": 7.9262895584106445, "learning_rate": 3.183333333333334e-06, "loss": 0.7314, "step": 14270 }, { "epoch": 1.4, "grad_norm": 8.947686195373535, "learning_rate": 3.177777777777778e-06, "loss": 0.7156, "step": 14280 }, { "epoch": 1.4, "grad_norm": 8.07798957824707, "learning_rate": 3.1722222222222227e-06, "loss": 0.7308, "step": 14290 }, { "epoch": 1.4, "grad_norm": 9.081107139587402, "learning_rate": 3.1666666666666667e-06, "loss": 0.7337, "step": 14300 }, { "epoch": 1.4, "grad_norm": 7.645173072814941, "learning_rate": 3.1611111111111115e-06, "loss": 0.7189, "step": 14310 }, { "epoch": 1.4, "grad_norm": 8.009171485900879, "learning_rate": 3.1555555555555555e-06, "loss": 0.7168, "step": 14320 }, { "epoch": 1.4, "grad_norm": 7.321778774261475, "learning_rate": 3.1500000000000003e-06, "loss": 0.7233, "step": 14330 }, { "epoch": 1.41, "grad_norm": 7.579931735992432, "learning_rate": 3.144444444444445e-06, "loss": 0.7182, "step": 14340 }, { "epoch": 1.41, "grad_norm": 6.317110538482666, "learning_rate": 3.138888888888889e-06, "loss": 0.7037, "step": 14350 }, { "epoch": 1.41, "grad_norm": 7.110856533050537, "learning_rate": 3.133333333333334e-06, "loss": 0.7446, "step": 14360 }, { "epoch": 1.41, "grad_norm": 8.166312217712402, "learning_rate": 3.127777777777778e-06, "loss": 0.725, "step": 14370 }, { "epoch": 1.41, "grad_norm": 8.884700775146484, "learning_rate": 3.1222222222222228e-06, "loss": 0.707, "step": 14380 }, { "epoch": 1.41, "grad_norm": 9.269293785095215, "learning_rate": 3.1166666666666668e-06, "loss": 0.7199, "step": 14390 }, { "epoch": 1.41, "grad_norm": 8.078415870666504, "learning_rate": 3.1111111111111116e-06, "loss": 0.6881, "step": 14400 }, { "epoch": 1.41, "grad_norm": 7.898222923278809, "learning_rate": 3.1055555555555556e-06, "loss": 0.7164, "step": 14410 }, { "epoch": 1.41, "grad_norm": 7.919857978820801, "learning_rate": 3.1000000000000004e-06, "loss": 0.7267, "step": 14420 }, { "epoch": 1.41, "grad_norm": 7.353516578674316, "learning_rate": 3.094444444444445e-06, "loss": 0.7189, "step": 14430 }, { "epoch": 1.42, "grad_norm": 7.784292697906494, "learning_rate": 3.088888888888889e-06, "loss": 0.7134, "step": 14440 }, { "epoch": 1.42, "grad_norm": 8.525325775146484, "learning_rate": 3.0833333333333336e-06, "loss": 0.7162, "step": 14450 }, { "epoch": 1.42, "grad_norm": 8.482037544250488, "learning_rate": 3.077777777777778e-06, "loss": 0.7353, "step": 14460 }, { "epoch": 1.42, "grad_norm": 7.856532096862793, "learning_rate": 3.0722222222222224e-06, "loss": 0.7233, "step": 14470 }, { "epoch": 1.42, "grad_norm": 7.264198303222656, "learning_rate": 3.066666666666667e-06, "loss": 0.7261, "step": 14480 }, { "epoch": 1.42, "grad_norm": 9.740427017211914, "learning_rate": 3.0611111111111112e-06, "loss": 0.7143, "step": 14490 }, { "epoch": 1.42, "grad_norm": 7.818991184234619, "learning_rate": 3.055555555555556e-06, "loss": 0.7058, "step": 14500 }, { "epoch": 1.42, "grad_norm": 7.176247596740723, "learning_rate": 3.05e-06, "loss": 0.7142, "step": 14510 }, { "epoch": 1.42, "grad_norm": 7.583724021911621, "learning_rate": 3.044444444444445e-06, "loss": 0.689, "step": 14520 }, { "epoch": 1.42, "grad_norm": 7.005664348602295, "learning_rate": 3.038888888888889e-06, "loss": 0.6922, "step": 14530 }, { "epoch": 1.42, "grad_norm": 8.321796417236328, "learning_rate": 3.0333333333333337e-06, "loss": 0.7314, "step": 14540 }, { "epoch": 1.43, "grad_norm": 8.5574369430542, "learning_rate": 3.0277777777777776e-06, "loss": 0.7191, "step": 14550 }, { "epoch": 1.43, "grad_norm": 8.456605911254883, "learning_rate": 3.0222222222222225e-06, "loss": 0.719, "step": 14560 }, { "epoch": 1.43, "grad_norm": 7.057939052581787, "learning_rate": 3.0166666666666673e-06, "loss": 0.701, "step": 14570 }, { "epoch": 1.43, "grad_norm": 9.707427978515625, "learning_rate": 3.0111111111111113e-06, "loss": 0.7135, "step": 14580 }, { "epoch": 1.43, "grad_norm": 7.764471530914307, "learning_rate": 3.005555555555556e-06, "loss": 0.7083, "step": 14590 }, { "epoch": 1.43, "grad_norm": 7.195455074310303, "learning_rate": 3e-06, "loss": 0.7133, "step": 14600 }, { "epoch": 1.43, "grad_norm": 7.306232929229736, "learning_rate": 2.994444444444445e-06, "loss": 0.6958, "step": 14610 }, { "epoch": 1.43, "grad_norm": 8.516317367553711, "learning_rate": 2.988888888888889e-06, "loss": 0.7065, "step": 14620 }, { "epoch": 1.43, "grad_norm": 8.048177719116211, "learning_rate": 2.9833333333333337e-06, "loss": 0.7053, "step": 14630 }, { "epoch": 1.43, "grad_norm": 7.34199857711792, "learning_rate": 2.9777777777777777e-06, "loss": 0.7, "step": 14640 }, { "epoch": 1.44, "grad_norm": 6.635432243347168, "learning_rate": 2.9722222222222225e-06, "loss": 0.6961, "step": 14650 }, { "epoch": 1.44, "grad_norm": 6.724674701690674, "learning_rate": 2.9666666666666673e-06, "loss": 0.6919, "step": 14660 }, { "epoch": 1.44, "grad_norm": 6.242835521697998, "learning_rate": 2.9611111111111113e-06, "loss": 0.7181, "step": 14670 }, { "epoch": 1.44, "grad_norm": 7.623150825500488, "learning_rate": 2.955555555555556e-06, "loss": 0.71, "step": 14680 }, { "epoch": 1.44, "grad_norm": 8.368824005126953, "learning_rate": 2.95e-06, "loss": 0.7123, "step": 14690 }, { "epoch": 1.44, "grad_norm": 6.852259159088135, "learning_rate": 2.944444444444445e-06, "loss": 0.7191, "step": 14700 }, { "epoch": 1.44, "grad_norm": 7.83400821685791, "learning_rate": 2.938888888888889e-06, "loss": 0.6993, "step": 14710 }, { "epoch": 1.44, "grad_norm": 8.644157409667969, "learning_rate": 2.9333333333333338e-06, "loss": 0.6965, "step": 14720 }, { "epoch": 1.44, "grad_norm": 7.181631565093994, "learning_rate": 2.927777777777778e-06, "loss": 0.7049, "step": 14730 }, { "epoch": 1.44, "grad_norm": 7.693680763244629, "learning_rate": 2.9222222222222226e-06, "loss": 0.7106, "step": 14740 }, { "epoch": 1.45, "grad_norm": 7.933276176452637, "learning_rate": 2.916666666666667e-06, "loss": 0.6972, "step": 14750 }, { "epoch": 1.45, "grad_norm": 7.639453411102295, "learning_rate": 2.9111111111111114e-06, "loss": 0.6962, "step": 14760 }, { "epoch": 1.45, "grad_norm": 7.72624397277832, "learning_rate": 2.9055555555555558e-06, "loss": 0.7224, "step": 14770 }, { "epoch": 1.45, "grad_norm": 6.959993362426758, "learning_rate": 2.9e-06, "loss": 0.7221, "step": 14780 }, { "epoch": 1.45, "grad_norm": 7.171219348907471, "learning_rate": 2.8944444444444446e-06, "loss": 0.6999, "step": 14790 }, { "epoch": 1.45, "grad_norm": 8.354853630065918, "learning_rate": 2.888888888888889e-06, "loss": 0.695, "step": 14800 }, { "epoch": 1.45, "grad_norm": 7.327788352966309, "learning_rate": 2.8833333333333334e-06, "loss": 0.7194, "step": 14810 }, { "epoch": 1.45, "grad_norm": 8.805681228637695, "learning_rate": 2.8777777777777782e-06, "loss": 0.7274, "step": 14820 }, { "epoch": 1.45, "grad_norm": 8.534201622009277, "learning_rate": 2.872222222222222e-06, "loss": 0.7202, "step": 14830 }, { "epoch": 1.45, "grad_norm": 7.35629415512085, "learning_rate": 2.866666666666667e-06, "loss": 0.6984, "step": 14840 }, { "epoch": 1.46, "grad_norm": 8.805977821350098, "learning_rate": 2.861111111111111e-06, "loss": 0.7006, "step": 14850 }, { "epoch": 1.46, "grad_norm": 7.6526947021484375, "learning_rate": 2.855555555555556e-06, "loss": 0.7062, "step": 14860 }, { "epoch": 1.46, "grad_norm": 6.682794570922852, "learning_rate": 2.85e-06, "loss": 0.6998, "step": 14870 }, { "epoch": 1.46, "grad_norm": 8.100156784057617, "learning_rate": 2.8444444444444446e-06, "loss": 0.7154, "step": 14880 }, { "epoch": 1.46, "grad_norm": 7.650350570678711, "learning_rate": 2.8388888888888895e-06, "loss": 0.7071, "step": 14890 }, { "epoch": 1.46, "grad_norm": 7.270593166351318, "learning_rate": 2.8333333333333335e-06, "loss": 0.7066, "step": 14900 }, { "epoch": 1.46, "grad_norm": 7.250268459320068, "learning_rate": 2.8277777777777783e-06, "loss": 0.6847, "step": 14910 }, { "epoch": 1.46, "grad_norm": 6.922353267669678, "learning_rate": 2.8222222222222223e-06, "loss": 0.7189, "step": 14920 }, { "epoch": 1.46, "grad_norm": 10.094352722167969, "learning_rate": 2.816666666666667e-06, "loss": 0.7169, "step": 14930 }, { "epoch": 1.46, "grad_norm": 8.646256446838379, "learning_rate": 2.811111111111111e-06, "loss": 0.7151, "step": 14940 }, { "epoch": 1.47, "grad_norm": 7.776257038116455, "learning_rate": 2.805555555555556e-06, "loss": 0.7041, "step": 14950 }, { "epoch": 1.47, "grad_norm": 9.100178718566895, "learning_rate": 2.8000000000000003e-06, "loss": 0.7172, "step": 14960 }, { "epoch": 1.47, "grad_norm": 7.141878128051758, "learning_rate": 2.7944444444444447e-06, "loss": 0.7105, "step": 14970 }, { "epoch": 1.47, "grad_norm": 7.545955657958984, "learning_rate": 2.788888888888889e-06, "loss": 0.7254, "step": 14980 }, { "epoch": 1.47, "grad_norm": 9.661377906799316, "learning_rate": 2.7833333333333335e-06, "loss": 0.7208, "step": 14990 }, { "epoch": 1.47, "grad_norm": 8.577885627746582, "learning_rate": 2.7777777777777783e-06, "loss": 0.6937, "step": 15000 }, { "epoch": 1.47, "grad_norm": 8.890149116516113, "learning_rate": 2.7722222222222223e-06, "loss": 0.7049, "step": 15010 }, { "epoch": 1.47, "grad_norm": 7.192319393157959, "learning_rate": 2.766666666666667e-06, "loss": 0.6859, "step": 15020 }, { "epoch": 1.47, "grad_norm": 8.102797508239746, "learning_rate": 2.761111111111111e-06, "loss": 0.6933, "step": 15030 }, { "epoch": 1.47, "grad_norm": 7.478940963745117, "learning_rate": 2.755555555555556e-06, "loss": 0.6969, "step": 15040 }, { "epoch": 1.47, "grad_norm": 9.836512565612793, "learning_rate": 2.7500000000000004e-06, "loss": 0.6886, "step": 15050 }, { "epoch": 1.48, "grad_norm": 7.40115213394165, "learning_rate": 2.7444444444444448e-06, "loss": 0.7064, "step": 15060 }, { "epoch": 1.48, "grad_norm": 7.1161699295043945, "learning_rate": 2.738888888888889e-06, "loss": 0.6922, "step": 15070 }, { "epoch": 1.48, "grad_norm": 8.211380958557129, "learning_rate": 2.7333333333333336e-06, "loss": 0.7164, "step": 15080 }, { "epoch": 1.48, "grad_norm": 7.3103532791137695, "learning_rate": 2.727777777777778e-06, "loss": 0.6854, "step": 15090 }, { "epoch": 1.48, "grad_norm": 8.491891860961914, "learning_rate": 2.7222222222222224e-06, "loss": 0.6822, "step": 15100 }, { "epoch": 1.48, "grad_norm": 7.636681079864502, "learning_rate": 2.7166666666666668e-06, "loss": 0.6961, "step": 15110 }, { "epoch": 1.48, "grad_norm": 8.949017524719238, "learning_rate": 2.7111111111111116e-06, "loss": 0.7098, "step": 15120 }, { "epoch": 1.48, "grad_norm": 8.600934028625488, "learning_rate": 2.7055555555555556e-06, "loss": 0.7023, "step": 15130 }, { "epoch": 1.48, "grad_norm": 7.90761137008667, "learning_rate": 2.7000000000000004e-06, "loss": 0.6857, "step": 15140 }, { "epoch": 1.48, "grad_norm": 7.623889446258545, "learning_rate": 2.6944444444444444e-06, "loss": 0.7164, "step": 15150 }, { "epoch": 1.49, "grad_norm": 9.048887252807617, "learning_rate": 2.6888888888888892e-06, "loss": 0.681, "step": 15160 }, { "epoch": 1.49, "grad_norm": 8.551778793334961, "learning_rate": 2.683333333333333e-06, "loss": 0.7101, "step": 15170 }, { "epoch": 1.49, "grad_norm": 9.208138465881348, "learning_rate": 2.677777777777778e-06, "loss": 0.6965, "step": 15180 }, { "epoch": 1.49, "grad_norm": 8.863828659057617, "learning_rate": 2.672222222222223e-06, "loss": 0.6925, "step": 15190 }, { "epoch": 1.49, "grad_norm": 7.545036315917969, "learning_rate": 2.666666666666667e-06, "loss": 0.6977, "step": 15200 }, { "epoch": 1.49, "grad_norm": 8.355212211608887, "learning_rate": 2.6611111111111117e-06, "loss": 0.6865, "step": 15210 }, { "epoch": 1.49, "grad_norm": 8.240880012512207, "learning_rate": 2.6555555555555556e-06, "loss": 0.6918, "step": 15220 }, { "epoch": 1.49, "grad_norm": 7.768767833709717, "learning_rate": 2.6500000000000005e-06, "loss": 0.7034, "step": 15230 }, { "epoch": 1.49, "grad_norm": 7.450006484985352, "learning_rate": 2.6444444444444444e-06, "loss": 0.6942, "step": 15240 }, { "epoch": 1.49, "grad_norm": 6.358761787414551, "learning_rate": 2.6388888888888893e-06, "loss": 0.6849, "step": 15250 }, { "epoch": 1.5, "grad_norm": 8.804125785827637, "learning_rate": 2.6333333333333332e-06, "loss": 0.6736, "step": 15260 }, { "epoch": 1.5, "grad_norm": 7.710080146789551, "learning_rate": 2.627777777777778e-06, "loss": 0.6954, "step": 15270 }, { "epoch": 1.5, "grad_norm": 7.249362468719482, "learning_rate": 2.6222222222222225e-06, "loss": 0.7072, "step": 15280 }, { "epoch": 1.5, "grad_norm": 8.31942367553711, "learning_rate": 2.616666666666667e-06, "loss": 0.6939, "step": 15290 }, { "epoch": 1.5, "grad_norm": 6.8723063468933105, "learning_rate": 2.6111111111111113e-06, "loss": 0.7033, "step": 15300 }, { "epoch": 1.5, "grad_norm": 7.74310302734375, "learning_rate": 2.6055555555555557e-06, "loss": 0.6986, "step": 15310 }, { "epoch": 1.5, "grad_norm": 9.32869815826416, "learning_rate": 2.6e-06, "loss": 0.7025, "step": 15320 }, { "epoch": 1.5, "grad_norm": 8.256912231445312, "learning_rate": 2.5944444444444445e-06, "loss": 0.7118, "step": 15330 }, { "epoch": 1.5, "grad_norm": 7.0270233154296875, "learning_rate": 2.5888888888888893e-06, "loss": 0.7074, "step": 15340 }, { "epoch": 1.5, "grad_norm": 6.899842739105225, "learning_rate": 2.5833333333333337e-06, "loss": 0.7138, "step": 15350 }, { "epoch": 1.51, "grad_norm": 6.879330635070801, "learning_rate": 2.577777777777778e-06, "loss": 0.6811, "step": 15360 }, { "epoch": 1.51, "grad_norm": 9.720717430114746, "learning_rate": 2.5722222222222225e-06, "loss": 0.7161, "step": 15370 }, { "epoch": 1.51, "grad_norm": 9.052262306213379, "learning_rate": 2.566666666666667e-06, "loss": 0.704, "step": 15380 }, { "epoch": 1.51, "grad_norm": 7.492184638977051, "learning_rate": 2.5611111111111113e-06, "loss": 0.7134, "step": 15390 }, { "epoch": 1.51, "grad_norm": 7.19498872756958, "learning_rate": 2.5555555555555557e-06, "loss": 0.6895, "step": 15400 }, { "epoch": 1.51, "grad_norm": 8.55247974395752, "learning_rate": 2.55e-06, "loss": 0.7106, "step": 15410 }, { "epoch": 1.51, "grad_norm": 6.797913074493408, "learning_rate": 2.5444444444444446e-06, "loss": 0.7106, "step": 15420 }, { "epoch": 1.51, "grad_norm": 7.924322128295898, "learning_rate": 2.538888888888889e-06, "loss": 0.6616, "step": 15430 }, { "epoch": 1.51, "grad_norm": 6.8249335289001465, "learning_rate": 2.5333333333333338e-06, "loss": 0.678, "step": 15440 }, { "epoch": 1.51, "grad_norm": 8.250901222229004, "learning_rate": 2.5277777777777778e-06, "loss": 0.7024, "step": 15450 }, { "epoch": 1.52, "grad_norm": 6.8705525398254395, "learning_rate": 2.5222222222222226e-06, "loss": 0.6848, "step": 15460 }, { "epoch": 1.52, "grad_norm": 7.558277606964111, "learning_rate": 2.5166666666666666e-06, "loss": 0.6995, "step": 15470 }, { "epoch": 1.52, "grad_norm": 7.9494218826293945, "learning_rate": 2.5111111111111114e-06, "loss": 0.7106, "step": 15480 }, { "epoch": 1.52, "grad_norm": 9.530181884765625, "learning_rate": 2.5055555555555554e-06, "loss": 0.696, "step": 15490 }, { "epoch": 1.52, "grad_norm": 8.237500190734863, "learning_rate": 2.5e-06, "loss": 0.6769, "step": 15500 }, { "epoch": 1.52, "grad_norm": 8.244582176208496, "learning_rate": 2.4944444444444446e-06, "loss": 0.7005, "step": 15510 }, { "epoch": 1.52, "grad_norm": 7.270286560058594, "learning_rate": 2.488888888888889e-06, "loss": 0.6873, "step": 15520 }, { "epoch": 1.52, "grad_norm": 7.327165126800537, "learning_rate": 2.4833333333333334e-06, "loss": 0.7178, "step": 15530 }, { "epoch": 1.52, "grad_norm": 7.925502300262451, "learning_rate": 2.4777777777777782e-06, "loss": 0.6839, "step": 15540 }, { "epoch": 1.52, "grad_norm": 8.256528854370117, "learning_rate": 2.4722222222222226e-06, "loss": 0.6916, "step": 15550 }, { "epoch": 1.52, "grad_norm": 7.866616725921631, "learning_rate": 2.466666666666667e-06, "loss": 0.6978, "step": 15560 }, { "epoch": 1.53, "grad_norm": 7.1968793869018555, "learning_rate": 2.4611111111111115e-06, "loss": 0.7021, "step": 15570 }, { "epoch": 1.53, "grad_norm": 8.610920906066895, "learning_rate": 2.455555555555556e-06, "loss": 0.7036, "step": 15580 }, { "epoch": 1.53, "grad_norm": 8.102506637573242, "learning_rate": 2.4500000000000003e-06, "loss": 0.6982, "step": 15590 }, { "epoch": 1.53, "grad_norm": 8.764822959899902, "learning_rate": 2.4444444444444447e-06, "loss": 0.7046, "step": 15600 }, { "epoch": 1.53, "grad_norm": 8.262327194213867, "learning_rate": 2.438888888888889e-06, "loss": 0.6798, "step": 15610 }, { "epoch": 1.53, "grad_norm": 8.54809284210205, "learning_rate": 2.4333333333333335e-06, "loss": 0.7189, "step": 15620 }, { "epoch": 1.53, "grad_norm": 8.357661247253418, "learning_rate": 2.427777777777778e-06, "loss": 0.7052, "step": 15630 }, { "epoch": 1.53, "grad_norm": 9.22020435333252, "learning_rate": 2.4222222222222223e-06, "loss": 0.6991, "step": 15640 }, { "epoch": 1.53, "grad_norm": 7.182758808135986, "learning_rate": 2.4166666666666667e-06, "loss": 0.6964, "step": 15650 }, { "epoch": 1.53, "grad_norm": 7.722668170928955, "learning_rate": 2.411111111111111e-06, "loss": 0.6896, "step": 15660 }, { "epoch": 1.54, "grad_norm": 9.535320281982422, "learning_rate": 2.4055555555555555e-06, "loss": 0.6975, "step": 15670 }, { "epoch": 1.54, "grad_norm": 9.065465927124023, "learning_rate": 2.4000000000000003e-06, "loss": 0.6858, "step": 15680 }, { "epoch": 1.54, "grad_norm": 8.863288879394531, "learning_rate": 2.3944444444444447e-06, "loss": 0.7104, "step": 15690 }, { "epoch": 1.54, "grad_norm": 7.992947101593018, "learning_rate": 2.388888888888889e-06, "loss": 0.6936, "step": 15700 }, { "epoch": 1.54, "grad_norm": 8.600565910339355, "learning_rate": 2.3833333333333335e-06, "loss": 0.6772, "step": 15710 }, { "epoch": 1.54, "grad_norm": 7.972917079925537, "learning_rate": 2.377777777777778e-06, "loss": 0.6832, "step": 15720 }, { "epoch": 1.54, "grad_norm": 8.99276065826416, "learning_rate": 2.3722222222222223e-06, "loss": 0.6988, "step": 15730 }, { "epoch": 1.54, "grad_norm": 9.35024642944336, "learning_rate": 2.3666666666666667e-06, "loss": 0.6992, "step": 15740 }, { "epoch": 1.54, "grad_norm": 8.231270790100098, "learning_rate": 2.361111111111111e-06, "loss": 0.6922, "step": 15750 }, { "epoch": 1.54, "grad_norm": 7.735280990600586, "learning_rate": 2.3555555555555555e-06, "loss": 0.6913, "step": 15760 }, { "epoch": 1.55, "grad_norm": 8.103928565979004, "learning_rate": 2.35e-06, "loss": 0.7008, "step": 15770 }, { "epoch": 1.55, "grad_norm": 7.807718753814697, "learning_rate": 2.3444444444444448e-06, "loss": 0.7148, "step": 15780 }, { "epoch": 1.55, "grad_norm": 7.432494640350342, "learning_rate": 2.338888888888889e-06, "loss": 0.6876, "step": 15790 }, { "epoch": 1.55, "grad_norm": 9.734456062316895, "learning_rate": 2.3333333333333336e-06, "loss": 0.6836, "step": 15800 }, { "epoch": 1.55, "grad_norm": 6.299215316772461, "learning_rate": 2.327777777777778e-06, "loss": 0.6865, "step": 15810 }, { "epoch": 1.55, "grad_norm": 8.709647178649902, "learning_rate": 2.3222222222222224e-06, "loss": 0.6916, "step": 15820 }, { "epoch": 1.55, "grad_norm": 8.96340274810791, "learning_rate": 2.316666666666667e-06, "loss": 0.7136, "step": 15830 }, { "epoch": 1.55, "grad_norm": 9.868083953857422, "learning_rate": 2.311111111111111e-06, "loss": 0.6815, "step": 15840 }, { "epoch": 1.55, "grad_norm": 8.279577255249023, "learning_rate": 2.305555555555556e-06, "loss": 0.6793, "step": 15850 }, { "epoch": 1.55, "grad_norm": 7.479567527770996, "learning_rate": 2.3000000000000004e-06, "loss": 0.7038, "step": 15860 }, { "epoch": 1.56, "grad_norm": 7.596058368682861, "learning_rate": 2.294444444444445e-06, "loss": 0.6904, "step": 15870 }, { "epoch": 1.56, "grad_norm": 7.97269868850708, "learning_rate": 2.2888888888888892e-06, "loss": 0.6952, "step": 15880 }, { "epoch": 1.56, "grad_norm": 8.536275863647461, "learning_rate": 2.2833333333333336e-06, "loss": 0.7005, "step": 15890 }, { "epoch": 1.56, "grad_norm": 7.484653949737549, "learning_rate": 2.277777777777778e-06, "loss": 0.6812, "step": 15900 }, { "epoch": 1.56, "grad_norm": 7.451747417449951, "learning_rate": 2.2722222222222224e-06, "loss": 0.6718, "step": 15910 }, { "epoch": 1.56, "grad_norm": 7.6047234535217285, "learning_rate": 2.266666666666667e-06, "loss": 0.6949, "step": 15920 }, { "epoch": 1.56, "grad_norm": 8.607637405395508, "learning_rate": 2.2611111111111112e-06, "loss": 0.692, "step": 15930 }, { "epoch": 1.56, "grad_norm": 7.243441104888916, "learning_rate": 2.2555555555555557e-06, "loss": 0.6707, "step": 15940 }, { "epoch": 1.56, "grad_norm": 7.475924968719482, "learning_rate": 2.25e-06, "loss": 0.6868, "step": 15950 }, { "epoch": 1.56, "grad_norm": 6.407505512237549, "learning_rate": 2.2444444444444445e-06, "loss": 0.7057, "step": 15960 }, { "epoch": 1.57, "grad_norm": 8.183919906616211, "learning_rate": 2.238888888888889e-06, "loss": 0.7117, "step": 15970 }, { "epoch": 1.57, "grad_norm": 6.761192321777344, "learning_rate": 2.2333333333333333e-06, "loss": 0.6974, "step": 15980 }, { "epoch": 1.57, "grad_norm": 7.724715709686279, "learning_rate": 2.2277777777777777e-06, "loss": 0.7019, "step": 15990 }, { "epoch": 1.57, "grad_norm": 8.485870361328125, "learning_rate": 2.222222222222222e-06, "loss": 0.6839, "step": 16000 }, { "epoch": 1.57, "eval_loss": 0.7433316111564636, "eval_runtime": 25.0393, "eval_samples_per_second": 26.119, "eval_steps_per_second": 3.275, "step": 16000 }, { "epoch": 1.57, "grad_norm": 8.15938663482666, "learning_rate": 2.216666666666667e-06, "loss": 0.6914, "step": 16010 }, { "epoch": 1.57, "grad_norm": 8.71955394744873, "learning_rate": 2.2111111111111113e-06, "loss": 0.7001, "step": 16020 }, { "epoch": 1.57, "grad_norm": 7.9860711097717285, "learning_rate": 2.2055555555555557e-06, "loss": 0.6786, "step": 16030 }, { "epoch": 1.57, "grad_norm": 9.189857482910156, "learning_rate": 2.2e-06, "loss": 0.6889, "step": 16040 }, { "epoch": 1.57, "grad_norm": 7.3093976974487305, "learning_rate": 2.1944444444444445e-06, "loss": 0.7091, "step": 16050 }, { "epoch": 1.57, "grad_norm": 7.384908676147461, "learning_rate": 2.188888888888889e-06, "loss": 0.6963, "step": 16060 }, { "epoch": 1.57, "grad_norm": 7.253487586975098, "learning_rate": 2.1833333333333333e-06, "loss": 0.6833, "step": 16070 }, { "epoch": 1.58, "grad_norm": 8.575284004211426, "learning_rate": 2.1777777777777777e-06, "loss": 0.6816, "step": 16080 }, { "epoch": 1.58, "grad_norm": 7.2930707931518555, "learning_rate": 2.1722222222222226e-06, "loss": 0.6854, "step": 16090 }, { "epoch": 1.58, "grad_norm": 10.121221542358398, "learning_rate": 2.166666666666667e-06, "loss": 0.6821, "step": 16100 }, { "epoch": 1.58, "grad_norm": 8.335285186767578, "learning_rate": 2.1611111111111114e-06, "loss": 0.6977, "step": 16110 }, { "epoch": 1.58, "grad_norm": 6.553925514221191, "learning_rate": 2.1555555555555558e-06, "loss": 0.6874, "step": 16120 }, { "epoch": 1.58, "grad_norm": 8.642473220825195, "learning_rate": 2.15e-06, "loss": 0.6938, "step": 16130 }, { "epoch": 1.58, "grad_norm": 8.634443283081055, "learning_rate": 2.1444444444444446e-06, "loss": 0.6874, "step": 16140 }, { "epoch": 1.58, "grad_norm": 7.858991622924805, "learning_rate": 2.138888888888889e-06, "loss": 0.6763, "step": 16150 }, { "epoch": 1.58, "grad_norm": 8.437248229980469, "learning_rate": 2.133333333333334e-06, "loss": 0.6831, "step": 16160 }, { "epoch": 1.58, "grad_norm": 8.241456985473633, "learning_rate": 2.127777777777778e-06, "loss": 0.6783, "step": 16170 }, { "epoch": 1.59, "grad_norm": 7.698614120483398, "learning_rate": 2.1222222222222226e-06, "loss": 0.6764, "step": 16180 }, { "epoch": 1.59, "grad_norm": 9.09626293182373, "learning_rate": 2.116666666666667e-06, "loss": 0.6751, "step": 16190 }, { "epoch": 1.59, "grad_norm": 7.77241325378418, "learning_rate": 2.1111111111111114e-06, "loss": 0.6781, "step": 16200 }, { "epoch": 1.59, "grad_norm": 8.654509544372559, "learning_rate": 2.105555555555556e-06, "loss": 0.6969, "step": 16210 }, { "epoch": 1.59, "grad_norm": 7.994089126586914, "learning_rate": 2.1000000000000002e-06, "loss": 0.6849, "step": 16220 }, { "epoch": 1.59, "grad_norm": 8.711403846740723, "learning_rate": 2.0944444444444446e-06, "loss": 0.6961, "step": 16230 }, { "epoch": 1.59, "grad_norm": 7.664073467254639, "learning_rate": 2.088888888888889e-06, "loss": 0.6741, "step": 16240 }, { "epoch": 1.59, "grad_norm": 7.845112323760986, "learning_rate": 2.0833333333333334e-06, "loss": 0.6939, "step": 16250 }, { "epoch": 1.59, "grad_norm": 6.89766788482666, "learning_rate": 2.077777777777778e-06, "loss": 0.7008, "step": 16260 }, { "epoch": 1.59, "grad_norm": 6.681533336639404, "learning_rate": 2.0722222222222222e-06, "loss": 0.6857, "step": 16270 }, { "epoch": 1.6, "grad_norm": 8.535284996032715, "learning_rate": 2.0666666666666666e-06, "loss": 0.6608, "step": 16280 }, { "epoch": 1.6, "grad_norm": 8.343973159790039, "learning_rate": 2.061111111111111e-06, "loss": 0.6729, "step": 16290 }, { "epoch": 1.6, "grad_norm": 7.632655620574951, "learning_rate": 2.0555555555555555e-06, "loss": 0.6565, "step": 16300 }, { "epoch": 1.6, "grad_norm": 7.182013034820557, "learning_rate": 2.05e-06, "loss": 0.6788, "step": 16310 }, { "epoch": 1.6, "grad_norm": 6.371190071105957, "learning_rate": 2.0444444444444447e-06, "loss": 0.6792, "step": 16320 }, { "epoch": 1.6, "grad_norm": 7.764781475067139, "learning_rate": 2.038888888888889e-06, "loss": 0.684, "step": 16330 }, { "epoch": 1.6, "grad_norm": 8.308398246765137, "learning_rate": 2.0333333333333335e-06, "loss": 0.6869, "step": 16340 }, { "epoch": 1.6, "grad_norm": 8.159722328186035, "learning_rate": 2.027777777777778e-06, "loss": 0.698, "step": 16350 }, { "epoch": 1.6, "grad_norm": 8.627520561218262, "learning_rate": 2.0222222222222223e-06, "loss": 0.6817, "step": 16360 }, { "epoch": 1.6, "grad_norm": 5.9063944816589355, "learning_rate": 2.0166666666666667e-06, "loss": 0.6809, "step": 16370 }, { "epoch": 1.61, "grad_norm": 7.794890880584717, "learning_rate": 2.011111111111111e-06, "loss": 0.6831, "step": 16380 }, { "epoch": 1.61, "grad_norm": 9.579720497131348, "learning_rate": 2.0055555555555555e-06, "loss": 0.6763, "step": 16390 }, { "epoch": 1.61, "grad_norm": 8.79791259765625, "learning_rate": 2.0000000000000003e-06, "loss": 0.678, "step": 16400 }, { "epoch": 1.61, "grad_norm": 7.430709362030029, "learning_rate": 1.9944444444444447e-06, "loss": 0.6864, "step": 16410 }, { "epoch": 1.61, "grad_norm": 12.191941261291504, "learning_rate": 1.988888888888889e-06, "loss": 0.6875, "step": 16420 }, { "epoch": 1.61, "grad_norm": 8.991518020629883, "learning_rate": 1.9833333333333335e-06, "loss": 0.6865, "step": 16430 }, { "epoch": 1.61, "grad_norm": 8.0477933883667, "learning_rate": 1.977777777777778e-06, "loss": 0.6694, "step": 16440 }, { "epoch": 1.61, "grad_norm": 9.10106372833252, "learning_rate": 1.9722222222222224e-06, "loss": 0.6692, "step": 16450 }, { "epoch": 1.61, "grad_norm": 7.223536968231201, "learning_rate": 1.9666666666666668e-06, "loss": 0.6776, "step": 16460 }, { "epoch": 1.61, "grad_norm": 7.779234409332275, "learning_rate": 1.9611111111111116e-06, "loss": 0.6751, "step": 16470 }, { "epoch": 1.62, "grad_norm": 9.36398696899414, "learning_rate": 1.955555555555556e-06, "loss": 0.6722, "step": 16480 }, { "epoch": 1.62, "grad_norm": 7.933370590209961, "learning_rate": 1.9500000000000004e-06, "loss": 0.7001, "step": 16490 }, { "epoch": 1.62, "grad_norm": 9.299789428710938, "learning_rate": 1.944444444444445e-06, "loss": 0.6792, "step": 16500 }, { "epoch": 1.62, "grad_norm": 8.29805850982666, "learning_rate": 1.938888888888889e-06, "loss": 0.7031, "step": 16510 }, { "epoch": 1.62, "grad_norm": 8.542588233947754, "learning_rate": 1.9333333333333336e-06, "loss": 0.6772, "step": 16520 }, { "epoch": 1.62, "grad_norm": 8.613343238830566, "learning_rate": 1.927777777777778e-06, "loss": 0.6828, "step": 16530 }, { "epoch": 1.62, "grad_norm": 7.384303092956543, "learning_rate": 1.9222222222222224e-06, "loss": 0.6934, "step": 16540 }, { "epoch": 1.62, "grad_norm": 8.128457069396973, "learning_rate": 1.916666666666667e-06, "loss": 0.6669, "step": 16550 }, { "epoch": 1.62, "grad_norm": 7.111478328704834, "learning_rate": 1.9111111111111112e-06, "loss": 0.6627, "step": 16560 }, { "epoch": 1.62, "grad_norm": 8.547881126403809, "learning_rate": 1.9055555555555558e-06, "loss": 0.7089, "step": 16570 }, { "epoch": 1.62, "grad_norm": 8.97326374053955, "learning_rate": 1.9000000000000002e-06, "loss": 0.7019, "step": 16580 }, { "epoch": 1.63, "grad_norm": 7.707952976226807, "learning_rate": 1.8944444444444446e-06, "loss": 0.6812, "step": 16590 }, { "epoch": 1.63, "grad_norm": 8.509544372558594, "learning_rate": 1.888888888888889e-06, "loss": 0.6873, "step": 16600 }, { "epoch": 1.63, "grad_norm": 15.570175170898438, "learning_rate": 1.8833333333333334e-06, "loss": 0.7021, "step": 16610 }, { "epoch": 1.63, "grad_norm": 8.381664276123047, "learning_rate": 1.8777777777777778e-06, "loss": 0.6743, "step": 16620 }, { "epoch": 1.63, "grad_norm": 9.59155559539795, "learning_rate": 1.8722222222222225e-06, "loss": 0.6687, "step": 16630 }, { "epoch": 1.63, "grad_norm": 7.101619243621826, "learning_rate": 1.8666666666666669e-06, "loss": 0.6697, "step": 16640 }, { "epoch": 1.63, "grad_norm": 9.513158798217773, "learning_rate": 1.8611111111111113e-06, "loss": 0.6874, "step": 16650 }, { "epoch": 1.63, "grad_norm": 8.232749938964844, "learning_rate": 1.8555555555555557e-06, "loss": 0.681, "step": 16660 }, { "epoch": 1.63, "grad_norm": 8.431181907653809, "learning_rate": 1.85e-06, "loss": 0.6865, "step": 16670 }, { "epoch": 1.63, "grad_norm": 8.771259307861328, "learning_rate": 1.8444444444444445e-06, "loss": 0.6644, "step": 16680 }, { "epoch": 1.64, "grad_norm": 8.837810516357422, "learning_rate": 1.8388888888888889e-06, "loss": 0.674, "step": 16690 }, { "epoch": 1.64, "grad_norm": 7.990067481994629, "learning_rate": 1.8333333333333333e-06, "loss": 0.6769, "step": 16700 }, { "epoch": 1.64, "grad_norm": 8.130684852600098, "learning_rate": 1.8277777777777781e-06, "loss": 0.6889, "step": 16710 }, { "epoch": 1.64, "grad_norm": 9.670692443847656, "learning_rate": 1.8222222222222225e-06, "loss": 0.677, "step": 16720 }, { "epoch": 1.64, "grad_norm": 8.954739570617676, "learning_rate": 1.816666666666667e-06, "loss": 0.6734, "step": 16730 }, { "epoch": 1.64, "grad_norm": 8.378406524658203, "learning_rate": 1.8111111111111113e-06, "loss": 0.6568, "step": 16740 }, { "epoch": 1.64, "grad_norm": 8.486390113830566, "learning_rate": 1.8055555555555557e-06, "loss": 0.6923, "step": 16750 }, { "epoch": 1.64, "grad_norm": 8.827221870422363, "learning_rate": 1.8000000000000001e-06, "loss": 0.6881, "step": 16760 }, { "epoch": 1.64, "grad_norm": 10.179788589477539, "learning_rate": 1.7944444444444445e-06, "loss": 0.6577, "step": 16770 }, { "epoch": 1.64, "grad_norm": 7.6715874671936035, "learning_rate": 1.788888888888889e-06, "loss": 0.6757, "step": 16780 }, { "epoch": 1.65, "grad_norm": 8.573400497436523, "learning_rate": 1.7833333333333336e-06, "loss": 0.6807, "step": 16790 }, { "epoch": 1.65, "grad_norm": 8.332742691040039, "learning_rate": 1.777777777777778e-06, "loss": 0.6941, "step": 16800 }, { "epoch": 1.65, "grad_norm": 8.943684577941895, "learning_rate": 1.7722222222222224e-06, "loss": 0.6707, "step": 16810 }, { "epoch": 1.65, "grad_norm": 7.561721324920654, "learning_rate": 1.7666666666666668e-06, "loss": 0.6688, "step": 16820 }, { "epoch": 1.65, "grad_norm": 6.723324298858643, "learning_rate": 1.7611111111111112e-06, "loss": 0.692, "step": 16830 }, { "epoch": 1.65, "grad_norm": 8.959000587463379, "learning_rate": 1.7555555555555556e-06, "loss": 0.6804, "step": 16840 }, { "epoch": 1.65, "grad_norm": 8.555145263671875, "learning_rate": 1.75e-06, "loss": 0.6871, "step": 16850 }, { "epoch": 1.65, "grad_norm": 8.957594871520996, "learning_rate": 1.7444444444444448e-06, "loss": 0.674, "step": 16860 }, { "epoch": 1.65, "grad_norm": 8.278112411499023, "learning_rate": 1.7388888888888892e-06, "loss": 0.6654, "step": 16870 }, { "epoch": 1.65, "grad_norm": 8.588724136352539, "learning_rate": 1.7333333333333336e-06, "loss": 0.6581, "step": 16880 }, { "epoch": 1.66, "grad_norm": 7.745607376098633, "learning_rate": 1.727777777777778e-06, "loss": 0.696, "step": 16890 }, { "epoch": 1.66, "grad_norm": 7.972159385681152, "learning_rate": 1.7222222222222224e-06, "loss": 0.6829, "step": 16900 }, { "epoch": 1.66, "grad_norm": 8.071720123291016, "learning_rate": 1.7166666666666668e-06, "loss": 0.6854, "step": 16910 }, { "epoch": 1.66, "grad_norm": 7.857290267944336, "learning_rate": 1.7111111111111112e-06, "loss": 0.681, "step": 16920 }, { "epoch": 1.66, "grad_norm": 10.596278190612793, "learning_rate": 1.7055555555555556e-06, "loss": 0.6634, "step": 16930 }, { "epoch": 1.66, "grad_norm": 8.513778686523438, "learning_rate": 1.7000000000000002e-06, "loss": 0.6923, "step": 16940 }, { "epoch": 1.66, "grad_norm": 8.409786224365234, "learning_rate": 1.6944444444444446e-06, "loss": 0.6814, "step": 16950 }, { "epoch": 1.66, "grad_norm": 8.084321975708008, "learning_rate": 1.688888888888889e-06, "loss": 0.6747, "step": 16960 }, { "epoch": 1.66, "grad_norm": 9.001971244812012, "learning_rate": 1.6833333333333335e-06, "loss": 0.669, "step": 16970 }, { "epoch": 1.66, "grad_norm": 8.309261322021484, "learning_rate": 1.6777777777777779e-06, "loss": 0.6755, "step": 16980 }, { "epoch": 1.67, "grad_norm": 7.28376579284668, "learning_rate": 1.6722222222222223e-06, "loss": 0.6706, "step": 16990 }, { "epoch": 1.67, "grad_norm": 7.564328670501709, "learning_rate": 1.6666666666666667e-06, "loss": 0.6749, "step": 17000 }, { "epoch": 1.67, "grad_norm": 8.553927421569824, "learning_rate": 1.661111111111111e-06, "loss": 0.6508, "step": 17010 }, { "epoch": 1.67, "grad_norm": 8.787723541259766, "learning_rate": 1.6555555555555559e-06, "loss": 0.6652, "step": 17020 }, { "epoch": 1.67, "grad_norm": 8.24637222290039, "learning_rate": 1.6500000000000003e-06, "loss": 0.6873, "step": 17030 }, { "epoch": 1.67, "grad_norm": 7.692873954772949, "learning_rate": 1.6444444444444447e-06, "loss": 0.6749, "step": 17040 }, { "epoch": 1.67, "grad_norm": 7.1952619552612305, "learning_rate": 1.638888888888889e-06, "loss": 0.6807, "step": 17050 }, { "epoch": 1.67, "grad_norm": 7.89901065826416, "learning_rate": 1.6333333333333335e-06, "loss": 0.675, "step": 17060 }, { "epoch": 1.67, "grad_norm": 8.810429573059082, "learning_rate": 1.627777777777778e-06, "loss": 0.6767, "step": 17070 }, { "epoch": 1.67, "grad_norm": 9.383179664611816, "learning_rate": 1.6222222222222223e-06, "loss": 0.6787, "step": 17080 }, { "epoch": 1.67, "grad_norm": 8.387530326843262, "learning_rate": 1.6166666666666667e-06, "loss": 0.67, "step": 17090 }, { "epoch": 1.68, "grad_norm": 8.123832702636719, "learning_rate": 1.6111111111111113e-06, "loss": 0.6783, "step": 17100 }, { "epoch": 1.68, "grad_norm": 7.707393646240234, "learning_rate": 1.6055555555555557e-06, "loss": 0.6563, "step": 17110 }, { "epoch": 1.68, "grad_norm": 7.7289018630981445, "learning_rate": 1.6000000000000001e-06, "loss": 0.678, "step": 17120 }, { "epoch": 1.68, "grad_norm": 9.00395393371582, "learning_rate": 1.5944444444444445e-06, "loss": 0.6585, "step": 17130 }, { "epoch": 1.68, "grad_norm": 6.342504978179932, "learning_rate": 1.588888888888889e-06, "loss": 0.6652, "step": 17140 }, { "epoch": 1.68, "grad_norm": 9.37197494506836, "learning_rate": 1.5833333333333333e-06, "loss": 0.6593, "step": 17150 }, { "epoch": 1.68, "grad_norm": 7.6431097984313965, "learning_rate": 1.5777777777777778e-06, "loss": 0.6663, "step": 17160 }, { "epoch": 1.68, "grad_norm": 8.029258728027344, "learning_rate": 1.5722222222222226e-06, "loss": 0.6743, "step": 17170 }, { "epoch": 1.68, "grad_norm": 7.301062107086182, "learning_rate": 1.566666666666667e-06, "loss": 0.6786, "step": 17180 }, { "epoch": 1.68, "grad_norm": 8.0167236328125, "learning_rate": 1.5611111111111114e-06, "loss": 0.6567, "step": 17190 }, { "epoch": 1.69, "grad_norm": 6.612271308898926, "learning_rate": 1.5555555555555558e-06, "loss": 0.7031, "step": 17200 }, { "epoch": 1.69, "grad_norm": 9.50379753112793, "learning_rate": 1.5500000000000002e-06, "loss": 0.6758, "step": 17210 }, { "epoch": 1.69, "grad_norm": 7.628124713897705, "learning_rate": 1.5444444444444446e-06, "loss": 0.6867, "step": 17220 }, { "epoch": 1.69, "grad_norm": 8.261235237121582, "learning_rate": 1.538888888888889e-06, "loss": 0.6783, "step": 17230 }, { "epoch": 1.69, "grad_norm": 7.9098310470581055, "learning_rate": 1.5333333333333334e-06, "loss": 0.6697, "step": 17240 }, { "epoch": 1.69, "grad_norm": 7.7388014793396, "learning_rate": 1.527777777777778e-06, "loss": 0.6581, "step": 17250 }, { "epoch": 1.69, "grad_norm": 8.349539756774902, "learning_rate": 1.5222222222222224e-06, "loss": 0.6645, "step": 17260 }, { "epoch": 1.69, "grad_norm": 8.44169807434082, "learning_rate": 1.5166666666666668e-06, "loss": 0.6755, "step": 17270 }, { "epoch": 1.69, "grad_norm": 9.241918563842773, "learning_rate": 1.5111111111111112e-06, "loss": 0.6726, "step": 17280 }, { "epoch": 1.69, "grad_norm": 8.220808982849121, "learning_rate": 1.5055555555555556e-06, "loss": 0.6782, "step": 17290 }, { "epoch": 1.7, "grad_norm": 8.259944915771484, "learning_rate": 1.5e-06, "loss": 0.6905, "step": 17300 }, { "epoch": 1.7, "grad_norm": 8.442787170410156, "learning_rate": 1.4944444444444444e-06, "loss": 0.6735, "step": 17310 }, { "epoch": 1.7, "grad_norm": 6.550194263458252, "learning_rate": 1.4888888888888888e-06, "loss": 0.6804, "step": 17320 }, { "epoch": 1.7, "grad_norm": 7.951900482177734, "learning_rate": 1.4833333333333337e-06, "loss": 0.6793, "step": 17330 }, { "epoch": 1.7, "grad_norm": 7.283532619476318, "learning_rate": 1.477777777777778e-06, "loss": 0.6482, "step": 17340 }, { "epoch": 1.7, "grad_norm": 7.303703308105469, "learning_rate": 1.4722222222222225e-06, "loss": 0.6775, "step": 17350 }, { "epoch": 1.7, "grad_norm": 8.561707496643066, "learning_rate": 1.4666666666666669e-06, "loss": 0.6745, "step": 17360 }, { "epoch": 1.7, "grad_norm": 7.2460036277771, "learning_rate": 1.4611111111111113e-06, "loss": 0.6831, "step": 17370 }, { "epoch": 1.7, "grad_norm": 8.444445610046387, "learning_rate": 1.4555555555555557e-06, "loss": 0.6568, "step": 17380 }, { "epoch": 1.7, "grad_norm": 8.560053825378418, "learning_rate": 1.45e-06, "loss": 0.6694, "step": 17390 }, { "epoch": 1.71, "grad_norm": 8.545026779174805, "learning_rate": 1.4444444444444445e-06, "loss": 0.6585, "step": 17400 }, { "epoch": 1.71, "grad_norm": 7.855259895324707, "learning_rate": 1.4388888888888891e-06, "loss": 0.6716, "step": 17410 }, { "epoch": 1.71, "grad_norm": 8.551973342895508, "learning_rate": 1.4333333333333335e-06, "loss": 0.6805, "step": 17420 }, { "epoch": 1.71, "grad_norm": 8.271525382995605, "learning_rate": 1.427777777777778e-06, "loss": 0.6978, "step": 17430 }, { "epoch": 1.71, "grad_norm": 7.42857551574707, "learning_rate": 1.4222222222222223e-06, "loss": 0.6809, "step": 17440 }, { "epoch": 1.71, "grad_norm": 10.231410026550293, "learning_rate": 1.4166666666666667e-06, "loss": 0.6843, "step": 17450 }, { "epoch": 1.71, "grad_norm": 9.00693130493164, "learning_rate": 1.4111111111111111e-06, "loss": 0.6632, "step": 17460 }, { "epoch": 1.71, "grad_norm": 8.15339183807373, "learning_rate": 1.4055555555555555e-06, "loss": 0.6807, "step": 17470 }, { "epoch": 1.71, "grad_norm": 8.130613327026367, "learning_rate": 1.4000000000000001e-06, "loss": 0.6522, "step": 17480 }, { "epoch": 1.71, "grad_norm": 6.92191743850708, "learning_rate": 1.3944444444444446e-06, "loss": 0.6556, "step": 17490 }, { "epoch": 1.72, "grad_norm": 8.498601913452148, "learning_rate": 1.3888888888888892e-06, "loss": 0.6745, "step": 17500 }, { "epoch": 1.72, "grad_norm": 8.496179580688477, "learning_rate": 1.3833333333333336e-06, "loss": 0.665, "step": 17510 }, { "epoch": 1.72, "grad_norm": 7.871016025543213, "learning_rate": 1.377777777777778e-06, "loss": 0.6785, "step": 17520 }, { "epoch": 1.72, "grad_norm": 8.465128898620605, "learning_rate": 1.3722222222222224e-06, "loss": 0.6803, "step": 17530 }, { "epoch": 1.72, "grad_norm": 8.244367599487305, "learning_rate": 1.3666666666666668e-06, "loss": 0.6776, "step": 17540 }, { "epoch": 1.72, "grad_norm": 7.3993024826049805, "learning_rate": 1.3611111111111112e-06, "loss": 0.6573, "step": 17550 }, { "epoch": 1.72, "grad_norm": 7.680909156799316, "learning_rate": 1.3555555555555558e-06, "loss": 0.6524, "step": 17560 }, { "epoch": 1.72, "grad_norm": 7.973646640777588, "learning_rate": 1.3500000000000002e-06, "loss": 0.6695, "step": 17570 }, { "epoch": 1.72, "grad_norm": 8.354438781738281, "learning_rate": 1.3444444444444446e-06, "loss": 0.6767, "step": 17580 }, { "epoch": 1.72, "grad_norm": 7.140261650085449, "learning_rate": 1.338888888888889e-06, "loss": 0.6435, "step": 17590 }, { "epoch": 1.72, "grad_norm": 8.397757530212402, "learning_rate": 1.3333333333333334e-06, "loss": 0.6643, "step": 17600 }, { "epoch": 1.73, "grad_norm": 9.530304908752441, "learning_rate": 1.3277777777777778e-06, "loss": 0.6625, "step": 17610 }, { "epoch": 1.73, "grad_norm": 7.113949775695801, "learning_rate": 1.3222222222222222e-06, "loss": 0.6779, "step": 17620 }, { "epoch": 1.73, "grad_norm": 8.61610221862793, "learning_rate": 1.3166666666666666e-06, "loss": 0.6542, "step": 17630 }, { "epoch": 1.73, "grad_norm": 8.433843612670898, "learning_rate": 1.3111111111111112e-06, "loss": 0.6894, "step": 17640 }, { "epoch": 1.73, "grad_norm": 8.218851089477539, "learning_rate": 1.3055555555555556e-06, "loss": 0.6679, "step": 17650 }, { "epoch": 1.73, "grad_norm": 7.89171838760376, "learning_rate": 1.3e-06, "loss": 0.6802, "step": 17660 }, { "epoch": 1.73, "grad_norm": 8.941899299621582, "learning_rate": 1.2944444444444447e-06, "loss": 0.6751, "step": 17670 }, { "epoch": 1.73, "grad_norm": 6.714540958404541, "learning_rate": 1.288888888888889e-06, "loss": 0.6857, "step": 17680 }, { "epoch": 1.73, "grad_norm": 8.140737533569336, "learning_rate": 1.2833333333333335e-06, "loss": 0.6698, "step": 17690 }, { "epoch": 1.73, "grad_norm": 9.72225284576416, "learning_rate": 1.2777777777777779e-06, "loss": 0.6843, "step": 17700 }, { "epoch": 1.74, "grad_norm": 7.738189697265625, "learning_rate": 1.2722222222222223e-06, "loss": 0.6654, "step": 17710 }, { "epoch": 1.74, "grad_norm": 9.279894828796387, "learning_rate": 1.2666666666666669e-06, "loss": 0.6645, "step": 17720 }, { "epoch": 1.74, "grad_norm": 8.040486335754395, "learning_rate": 1.2611111111111113e-06, "loss": 0.6651, "step": 17730 }, { "epoch": 1.74, "grad_norm": 7.415245056152344, "learning_rate": 1.2555555555555557e-06, "loss": 0.6897, "step": 17740 }, { "epoch": 1.74, "grad_norm": 7.240573883056641, "learning_rate": 1.25e-06, "loss": 0.6797, "step": 17750 }, { "epoch": 1.74, "grad_norm": 7.835941314697266, "learning_rate": 1.2444444444444445e-06, "loss": 0.6618, "step": 17760 }, { "epoch": 1.74, "grad_norm": 8.135210990905762, "learning_rate": 1.2388888888888891e-06, "loss": 0.6744, "step": 17770 }, { "epoch": 1.74, "grad_norm": 7.172480583190918, "learning_rate": 1.2333333333333335e-06, "loss": 0.6606, "step": 17780 }, { "epoch": 1.74, "grad_norm": 8.14366340637207, "learning_rate": 1.227777777777778e-06, "loss": 0.6361, "step": 17790 }, { "epoch": 1.74, "grad_norm": 8.758234977722168, "learning_rate": 1.2222222222222223e-06, "loss": 0.6481, "step": 17800 }, { "epoch": 1.75, "grad_norm": 10.209847450256348, "learning_rate": 1.2166666666666667e-06, "loss": 0.7025, "step": 17810 }, { "epoch": 1.75, "grad_norm": 8.536412239074707, "learning_rate": 1.2111111111111111e-06, "loss": 0.6612, "step": 17820 }, { "epoch": 1.75, "grad_norm": 8.121233940124512, "learning_rate": 1.2055555555555555e-06, "loss": 0.6432, "step": 17830 }, { "epoch": 1.75, "grad_norm": 9.053804397583008, "learning_rate": 1.2000000000000002e-06, "loss": 0.6743, "step": 17840 }, { "epoch": 1.75, "grad_norm": 6.7864484786987305, "learning_rate": 1.1944444444444446e-06, "loss": 0.6768, "step": 17850 }, { "epoch": 1.75, "grad_norm": 7.80435037612915, "learning_rate": 1.188888888888889e-06, "loss": 0.6547, "step": 17860 }, { "epoch": 1.75, "grad_norm": 9.068230628967285, "learning_rate": 1.1833333333333334e-06, "loss": 0.657, "step": 17870 }, { "epoch": 1.75, "grad_norm": 8.736069679260254, "learning_rate": 1.1777777777777778e-06, "loss": 0.6542, "step": 17880 }, { "epoch": 1.75, "grad_norm": 7.827920913696289, "learning_rate": 1.1722222222222224e-06, "loss": 0.6493, "step": 17890 }, { "epoch": 1.75, "grad_norm": 8.577750205993652, "learning_rate": 1.1666666666666668e-06, "loss": 0.6892, "step": 17900 }, { "epoch": 1.76, "grad_norm": 8.393552780151367, "learning_rate": 1.1611111111111112e-06, "loss": 0.672, "step": 17910 }, { "epoch": 1.76, "grad_norm": 7.642767429351807, "learning_rate": 1.1555555555555556e-06, "loss": 0.6611, "step": 17920 }, { "epoch": 1.76, "grad_norm": 8.363187789916992, "learning_rate": 1.1500000000000002e-06, "loss": 0.6541, "step": 17930 }, { "epoch": 1.76, "grad_norm": 8.427385330200195, "learning_rate": 1.1444444444444446e-06, "loss": 0.6635, "step": 17940 }, { "epoch": 1.76, "grad_norm": 6.838406562805176, "learning_rate": 1.138888888888889e-06, "loss": 0.659, "step": 17950 }, { "epoch": 1.76, "grad_norm": 7.995869159698486, "learning_rate": 1.1333333333333334e-06, "loss": 0.6752, "step": 17960 }, { "epoch": 1.76, "grad_norm": 9.559296607971191, "learning_rate": 1.1277777777777778e-06, "loss": 0.6546, "step": 17970 }, { "epoch": 1.76, "grad_norm": 7.05047607421875, "learning_rate": 1.1222222222222222e-06, "loss": 0.6515, "step": 17980 }, { "epoch": 1.76, "grad_norm": 9.214991569519043, "learning_rate": 1.1166666666666666e-06, "loss": 0.6564, "step": 17990 }, { "epoch": 1.76, "grad_norm": 8.954300880432129, "learning_rate": 1.111111111111111e-06, "loss": 0.6536, "step": 18000 }, { "epoch": 1.76, "eval_loss": 0.7127931714057922, "eval_runtime": 25.0427, "eval_samples_per_second": 26.115, "eval_steps_per_second": 3.274, "step": 18000 }, { "epoch": 1.76, "grad_norm": 8.115190505981445, "learning_rate": 1.1055555555555557e-06, "loss": 0.6832, "step": 18010 }, { "epoch": 1.77, "grad_norm": 8.640135765075684, "learning_rate": 1.1e-06, "loss": 0.6813, "step": 18020 }, { "epoch": 1.77, "grad_norm": 7.598577499389648, "learning_rate": 1.0944444444444445e-06, "loss": 0.6704, "step": 18030 }, { "epoch": 1.77, "grad_norm": 8.381420135498047, "learning_rate": 1.0888888888888889e-06, "loss": 0.6618, "step": 18040 }, { "epoch": 1.77, "grad_norm": 8.975688934326172, "learning_rate": 1.0833333333333335e-06, "loss": 0.6701, "step": 18050 }, { "epoch": 1.77, "grad_norm": 6.753466606140137, "learning_rate": 1.0777777777777779e-06, "loss": 0.6652, "step": 18060 }, { "epoch": 1.77, "grad_norm": 8.410473823547363, "learning_rate": 1.0722222222222223e-06, "loss": 0.6756, "step": 18070 }, { "epoch": 1.77, "grad_norm": 9.152502059936523, "learning_rate": 1.066666666666667e-06, "loss": 0.666, "step": 18080 }, { "epoch": 1.77, "grad_norm": 8.959446907043457, "learning_rate": 1.0611111111111113e-06, "loss": 0.6711, "step": 18090 }, { "epoch": 1.77, "grad_norm": 8.306169509887695, "learning_rate": 1.0555555555555557e-06, "loss": 0.6569, "step": 18100 }, { "epoch": 1.77, "grad_norm": 7.52501916885376, "learning_rate": 1.0500000000000001e-06, "loss": 0.6606, "step": 18110 }, { "epoch": 1.78, "grad_norm": 10.036842346191406, "learning_rate": 1.0444444444444445e-06, "loss": 0.6463, "step": 18120 }, { "epoch": 1.78, "grad_norm": 7.371220588684082, "learning_rate": 1.038888888888889e-06, "loss": 0.684, "step": 18130 }, { "epoch": 1.78, "grad_norm": 7.5346150398254395, "learning_rate": 1.0333333333333333e-06, "loss": 0.6391, "step": 18140 }, { "epoch": 1.78, "grad_norm": 7.054285526275635, "learning_rate": 1.0277777777777777e-06, "loss": 0.6843, "step": 18150 }, { "epoch": 1.78, "grad_norm": 9.114176750183105, "learning_rate": 1.0222222222222223e-06, "loss": 0.6447, "step": 18160 }, { "epoch": 1.78, "grad_norm": 6.937114715576172, "learning_rate": 1.0166666666666667e-06, "loss": 0.6583, "step": 18170 }, { "epoch": 1.78, "grad_norm": 8.191717147827148, "learning_rate": 1.0111111111111111e-06, "loss": 0.6452, "step": 18180 }, { "epoch": 1.78, "grad_norm": 9.889886856079102, "learning_rate": 1.0055555555555556e-06, "loss": 0.6498, "step": 18190 }, { "epoch": 1.78, "grad_norm": 8.173121452331543, "learning_rate": 1.0000000000000002e-06, "loss": 0.6644, "step": 18200 }, { "epoch": 1.78, "grad_norm": 8.336097717285156, "learning_rate": 9.944444444444446e-07, "loss": 0.6462, "step": 18210 }, { "epoch": 1.79, "grad_norm": 8.586885452270508, "learning_rate": 9.88888888888889e-07, "loss": 0.6459, "step": 18220 }, { "epoch": 1.79, "grad_norm": 9.936676025390625, "learning_rate": 9.833333333333334e-07, "loss": 0.629, "step": 18230 }, { "epoch": 1.79, "grad_norm": 6.890833854675293, "learning_rate": 9.77777777777778e-07, "loss": 0.6761, "step": 18240 }, { "epoch": 1.79, "grad_norm": 10.50243091583252, "learning_rate": 9.722222222222224e-07, "loss": 0.6541, "step": 18250 }, { "epoch": 1.79, "grad_norm": 10.062907218933105, "learning_rate": 9.666666666666668e-07, "loss": 0.6515, "step": 18260 }, { "epoch": 1.79, "grad_norm": 10.538150787353516, "learning_rate": 9.611111111111112e-07, "loss": 0.6701, "step": 18270 }, { "epoch": 1.79, "grad_norm": 9.058283805847168, "learning_rate": 9.555555555555556e-07, "loss": 0.6621, "step": 18280 }, { "epoch": 1.79, "grad_norm": 6.798927307128906, "learning_rate": 9.500000000000001e-07, "loss": 0.6821, "step": 18290 }, { "epoch": 1.79, "grad_norm": 7.449670314788818, "learning_rate": 9.444444444444445e-07, "loss": 0.6583, "step": 18300 }, { "epoch": 1.79, "grad_norm": 9.92177963256836, "learning_rate": 9.388888888888889e-07, "loss": 0.6541, "step": 18310 }, { "epoch": 1.8, "grad_norm": 8.684669494628906, "learning_rate": 9.333333333333334e-07, "loss": 0.6512, "step": 18320 }, { "epoch": 1.8, "grad_norm": 7.827058792114258, "learning_rate": 9.277777777777778e-07, "loss": 0.6549, "step": 18330 }, { "epoch": 1.8, "grad_norm": 8.48637580871582, "learning_rate": 9.222222222222222e-07, "loss": 0.6532, "step": 18340 }, { "epoch": 1.8, "grad_norm": 8.333334922790527, "learning_rate": 9.166666666666666e-07, "loss": 0.6394, "step": 18350 }, { "epoch": 1.8, "grad_norm": 9.906614303588867, "learning_rate": 9.111111111111113e-07, "loss": 0.6373, "step": 18360 }, { "epoch": 1.8, "grad_norm": 7.90922212600708, "learning_rate": 9.055555555555557e-07, "loss": 0.6585, "step": 18370 }, { "epoch": 1.8, "grad_norm": 7.814781665802002, "learning_rate": 9.000000000000001e-07, "loss": 0.6616, "step": 18380 }, { "epoch": 1.8, "grad_norm": 7.313722133636475, "learning_rate": 8.944444444444445e-07, "loss": 0.6645, "step": 18390 }, { "epoch": 1.8, "grad_norm": 8.72338581085205, "learning_rate": 8.88888888888889e-07, "loss": 0.6482, "step": 18400 }, { "epoch": 1.8, "grad_norm": 10.157296180725098, "learning_rate": 8.833333333333334e-07, "loss": 0.656, "step": 18410 }, { "epoch": 1.81, "grad_norm": 9.318395614624023, "learning_rate": 8.777777777777778e-07, "loss": 0.6583, "step": 18420 }, { "epoch": 1.81, "grad_norm": 8.36423397064209, "learning_rate": 8.722222222222224e-07, "loss": 0.6599, "step": 18430 }, { "epoch": 1.81, "grad_norm": 8.469670295715332, "learning_rate": 8.666666666666668e-07, "loss": 0.6699, "step": 18440 }, { "epoch": 1.81, "grad_norm": 8.974568367004395, "learning_rate": 8.611111111111112e-07, "loss": 0.661, "step": 18450 }, { "epoch": 1.81, "grad_norm": 7.746393203735352, "learning_rate": 8.555555555555556e-07, "loss": 0.6641, "step": 18460 }, { "epoch": 1.81, "grad_norm": 7.701857089996338, "learning_rate": 8.500000000000001e-07, "loss": 0.6545, "step": 18470 }, { "epoch": 1.81, "grad_norm": 7.929514408111572, "learning_rate": 8.444444444444445e-07, "loss": 0.6445, "step": 18480 }, { "epoch": 1.81, "grad_norm": 8.786981582641602, "learning_rate": 8.388888888888889e-07, "loss": 0.6624, "step": 18490 }, { "epoch": 1.81, "grad_norm": 9.13001823425293, "learning_rate": 8.333333333333333e-07, "loss": 0.6576, "step": 18500 }, { "epoch": 1.81, "grad_norm": 8.059815406799316, "learning_rate": 8.277777777777779e-07, "loss": 0.6543, "step": 18510 }, { "epoch": 1.81, "grad_norm": 8.815535545349121, "learning_rate": 8.222222222222223e-07, "loss": 0.6403, "step": 18520 }, { "epoch": 1.82, "grad_norm": 8.182717323303223, "learning_rate": 8.166666666666668e-07, "loss": 0.6589, "step": 18530 }, { "epoch": 1.82, "grad_norm": 8.724108695983887, "learning_rate": 8.111111111111112e-07, "loss": 0.6626, "step": 18540 }, { "epoch": 1.82, "grad_norm": 9.29687786102295, "learning_rate": 8.055555555555557e-07, "loss": 0.6477, "step": 18550 }, { "epoch": 1.82, "grad_norm": 8.760370254516602, "learning_rate": 8.000000000000001e-07, "loss": 0.6366, "step": 18560 }, { "epoch": 1.82, "grad_norm": 7.839179039001465, "learning_rate": 7.944444444444445e-07, "loss": 0.6865, "step": 18570 }, { "epoch": 1.82, "grad_norm": 9.337043762207031, "learning_rate": 7.888888888888889e-07, "loss": 0.6603, "step": 18580 }, { "epoch": 1.82, "grad_norm": 10.087823867797852, "learning_rate": 7.833333333333335e-07, "loss": 0.6755, "step": 18590 }, { "epoch": 1.82, "grad_norm": 9.58806324005127, "learning_rate": 7.777777777777779e-07, "loss": 0.6738, "step": 18600 }, { "epoch": 1.82, "grad_norm": 8.919584274291992, "learning_rate": 7.722222222222223e-07, "loss": 0.6667, "step": 18610 }, { "epoch": 1.82, "grad_norm": 8.638771057128906, "learning_rate": 7.666666666666667e-07, "loss": 0.634, "step": 18620 }, { "epoch": 1.83, "grad_norm": 9.505468368530273, "learning_rate": 7.611111111111112e-07, "loss": 0.6581, "step": 18630 }, { "epoch": 1.83, "grad_norm": 8.852348327636719, "learning_rate": 7.555555555555556e-07, "loss": 0.6514, "step": 18640 }, { "epoch": 1.83, "grad_norm": 8.56619644165039, "learning_rate": 7.5e-07, "loss": 0.6309, "step": 18650 }, { "epoch": 1.83, "grad_norm": 9.371169090270996, "learning_rate": 7.444444444444444e-07, "loss": 0.6564, "step": 18660 }, { "epoch": 1.83, "grad_norm": 9.205166816711426, "learning_rate": 7.38888888888889e-07, "loss": 0.6498, "step": 18670 }, { "epoch": 1.83, "grad_norm": 9.55856990814209, "learning_rate": 7.333333333333334e-07, "loss": 0.6589, "step": 18680 }, { "epoch": 1.83, "grad_norm": 7.039092540740967, "learning_rate": 7.277777777777778e-07, "loss": 0.6528, "step": 18690 }, { "epoch": 1.83, "grad_norm": 8.943285942077637, "learning_rate": 7.222222222222222e-07, "loss": 0.6596, "step": 18700 }, { "epoch": 1.83, "grad_norm": 7.541864395141602, "learning_rate": 7.166666666666668e-07, "loss": 0.6282, "step": 18710 }, { "epoch": 1.83, "grad_norm": 8.540939331054688, "learning_rate": 7.111111111111112e-07, "loss": 0.6494, "step": 18720 }, { "epoch": 1.84, "grad_norm": 7.6741509437561035, "learning_rate": 7.055555555555556e-07, "loss": 0.6574, "step": 18730 }, { "epoch": 1.84, "grad_norm": 6.1836323738098145, "learning_rate": 7.000000000000001e-07, "loss": 0.6421, "step": 18740 }, { "epoch": 1.84, "grad_norm": 9.809643745422363, "learning_rate": 6.944444444444446e-07, "loss": 0.6568, "step": 18750 }, { "epoch": 1.84, "grad_norm": 8.765548706054688, "learning_rate": 6.88888888888889e-07, "loss": 0.6399, "step": 18760 }, { "epoch": 1.84, "grad_norm": 9.214972496032715, "learning_rate": 6.833333333333334e-07, "loss": 0.6562, "step": 18770 }, { "epoch": 1.84, "grad_norm": 7.295991897583008, "learning_rate": 6.777777777777779e-07, "loss": 0.6619, "step": 18780 }, { "epoch": 1.84, "grad_norm": 7.566501617431641, "learning_rate": 6.722222222222223e-07, "loss": 0.641, "step": 18790 }, { "epoch": 1.84, "grad_norm": 10.237143516540527, "learning_rate": 6.666666666666667e-07, "loss": 0.6578, "step": 18800 }, { "epoch": 1.84, "grad_norm": 10.003223419189453, "learning_rate": 6.611111111111111e-07, "loss": 0.6572, "step": 18810 }, { "epoch": 1.84, "grad_norm": 10.348155975341797, "learning_rate": 6.555555555555556e-07, "loss": 0.6561, "step": 18820 }, { "epoch": 1.85, "grad_norm": 7.612809181213379, "learning_rate": 6.5e-07, "loss": 0.6447, "step": 18830 }, { "epoch": 1.85, "grad_norm": 8.928174018859863, "learning_rate": 6.444444444444445e-07, "loss": 0.6312, "step": 18840 }, { "epoch": 1.85, "grad_norm": 8.383500099182129, "learning_rate": 6.388888888888889e-07, "loss": 0.6438, "step": 18850 }, { "epoch": 1.85, "grad_norm": 9.134782791137695, "learning_rate": 6.333333333333334e-07, "loss": 0.6371, "step": 18860 }, { "epoch": 1.85, "grad_norm": 8.415270805358887, "learning_rate": 6.277777777777778e-07, "loss": 0.6583, "step": 18870 }, { "epoch": 1.85, "grad_norm": 8.320952415466309, "learning_rate": 6.222222222222223e-07, "loss": 0.6395, "step": 18880 }, { "epoch": 1.85, "grad_norm": 8.688267707824707, "learning_rate": 6.166666666666668e-07, "loss": 0.6677, "step": 18890 }, { "epoch": 1.85, "grad_norm": 8.908681869506836, "learning_rate": 6.111111111111112e-07, "loss": 0.6223, "step": 18900 }, { "epoch": 1.85, "grad_norm": 9.066765785217285, "learning_rate": 6.055555555555556e-07, "loss": 0.663, "step": 18910 }, { "epoch": 1.85, "grad_norm": 7.663233280181885, "learning_rate": 6.000000000000001e-07, "loss": 0.6525, "step": 18920 }, { "epoch": 1.86, "grad_norm": 9.141779899597168, "learning_rate": 5.944444444444445e-07, "loss": 0.6565, "step": 18930 }, { "epoch": 1.86, "grad_norm": 9.113740921020508, "learning_rate": 5.888888888888889e-07, "loss": 0.655, "step": 18940 }, { "epoch": 1.86, "grad_norm": 10.090544700622559, "learning_rate": 5.833333333333334e-07, "loss": 0.6674, "step": 18950 }, { "epoch": 1.86, "grad_norm": 10.163101196289062, "learning_rate": 5.777777777777778e-07, "loss": 0.6354, "step": 18960 }, { "epoch": 1.86, "grad_norm": 7.253233432769775, "learning_rate": 5.722222222222223e-07, "loss": 0.6624, "step": 18970 }, { "epoch": 1.86, "grad_norm": 10.549760818481445, "learning_rate": 5.666666666666667e-07, "loss": 0.6467, "step": 18980 }, { "epoch": 1.86, "grad_norm": 8.816473960876465, "learning_rate": 5.611111111111111e-07, "loss": 0.6339, "step": 18990 }, { "epoch": 1.86, "grad_norm": 7.9804277420043945, "learning_rate": 5.555555555555555e-07, "loss": 0.6494, "step": 19000 }, { "epoch": 1.86, "grad_norm": 8.418985366821289, "learning_rate": 5.5e-07, "loss": 0.6617, "step": 19010 }, { "epoch": 1.86, "grad_norm": 8.473084449768066, "learning_rate": 5.444444444444444e-07, "loss": 0.6407, "step": 19020 }, { "epoch": 1.86, "grad_norm": 8.026798248291016, "learning_rate": 5.388888888888889e-07, "loss": 0.66, "step": 19030 }, { "epoch": 1.87, "grad_norm": 6.1661376953125, "learning_rate": 5.333333333333335e-07, "loss": 0.622, "step": 19040 }, { "epoch": 1.87, "grad_norm": 8.03042221069336, "learning_rate": 5.277777777777779e-07, "loss": 0.6529, "step": 19050 }, { "epoch": 1.87, "grad_norm": 9.176237106323242, "learning_rate": 5.222222222222223e-07, "loss": 0.6633, "step": 19060 }, { "epoch": 1.87, "grad_norm": 10.600275039672852, "learning_rate": 5.166666666666667e-07, "loss": 0.6485, "step": 19070 }, { "epoch": 1.87, "grad_norm": 7.930903911590576, "learning_rate": 5.111111111111112e-07, "loss": 0.6538, "step": 19080 }, { "epoch": 1.87, "grad_norm": 8.785584449768066, "learning_rate": 5.055555555555556e-07, "loss": 0.6631, "step": 19090 }, { "epoch": 1.87, "grad_norm": 7.33726692199707, "learning_rate": 5.000000000000001e-07, "loss": 0.6357, "step": 19100 }, { "epoch": 1.87, "grad_norm": 7.6543965339660645, "learning_rate": 4.944444444444445e-07, "loss": 0.6489, "step": 19110 }, { "epoch": 1.87, "grad_norm": 7.727292537689209, "learning_rate": 4.88888888888889e-07, "loss": 0.6459, "step": 19120 }, { "epoch": 1.87, "grad_norm": 8.907550811767578, "learning_rate": 4.833333333333334e-07, "loss": 0.6618, "step": 19130 }, { "epoch": 1.88, "grad_norm": 7.5037970542907715, "learning_rate": 4.777777777777778e-07, "loss": 0.6451, "step": 19140 }, { "epoch": 1.88, "grad_norm": 9.037306785583496, "learning_rate": 4.7222222222222226e-07, "loss": 0.6603, "step": 19150 }, { "epoch": 1.88, "grad_norm": 8.772835731506348, "learning_rate": 4.666666666666667e-07, "loss": 0.6391, "step": 19160 }, { "epoch": 1.88, "grad_norm": 7.629842281341553, "learning_rate": 4.611111111111111e-07, "loss": 0.6492, "step": 19170 }, { "epoch": 1.88, "grad_norm": 9.215010643005371, "learning_rate": 4.5555555555555563e-07, "loss": 0.6229, "step": 19180 }, { "epoch": 1.88, "grad_norm": 8.465002059936523, "learning_rate": 4.5000000000000003e-07, "loss": 0.6323, "step": 19190 }, { "epoch": 1.88, "grad_norm": 8.497922897338867, "learning_rate": 4.444444444444445e-07, "loss": 0.6498, "step": 19200 }, { "epoch": 1.88, "grad_norm": 9.152692794799805, "learning_rate": 4.388888888888889e-07, "loss": 0.6577, "step": 19210 }, { "epoch": 1.88, "grad_norm": 9.2376070022583, "learning_rate": 4.333333333333334e-07, "loss": 0.6375, "step": 19220 }, { "epoch": 1.88, "grad_norm": 8.524798393249512, "learning_rate": 4.277777777777778e-07, "loss": 0.6391, "step": 19230 }, { "epoch": 1.89, "grad_norm": 8.709053993225098, "learning_rate": 4.2222222222222226e-07, "loss": 0.6583, "step": 19240 }, { "epoch": 1.89, "grad_norm": 8.390191078186035, "learning_rate": 4.1666666666666667e-07, "loss": 0.632, "step": 19250 }, { "epoch": 1.89, "grad_norm": 8.530904769897461, "learning_rate": 4.111111111111112e-07, "loss": 0.6642, "step": 19260 }, { "epoch": 1.89, "grad_norm": 10.57852840423584, "learning_rate": 4.055555555555556e-07, "loss": 0.6472, "step": 19270 }, { "epoch": 1.89, "grad_norm": 8.655448913574219, "learning_rate": 4.0000000000000003e-07, "loss": 0.642, "step": 19280 }, { "epoch": 1.89, "grad_norm": 11.114307403564453, "learning_rate": 3.9444444444444444e-07, "loss": 0.6738, "step": 19290 }, { "epoch": 1.89, "grad_norm": 7.406481742858887, "learning_rate": 3.8888888888888895e-07, "loss": 0.6623, "step": 19300 }, { "epoch": 1.89, "grad_norm": 8.099161148071289, "learning_rate": 3.8333333333333335e-07, "loss": 0.6409, "step": 19310 }, { "epoch": 1.89, "grad_norm": 7.960798263549805, "learning_rate": 3.777777777777778e-07, "loss": 0.6532, "step": 19320 }, { "epoch": 1.89, "grad_norm": 8.398820877075195, "learning_rate": 3.722222222222222e-07, "loss": 0.6638, "step": 19330 }, { "epoch": 1.9, "grad_norm": 6.729916572570801, "learning_rate": 3.666666666666667e-07, "loss": 0.6522, "step": 19340 }, { "epoch": 1.9, "grad_norm": 8.169163703918457, "learning_rate": 3.611111111111111e-07, "loss": 0.656, "step": 19350 }, { "epoch": 1.9, "grad_norm": 9.184800148010254, "learning_rate": 3.555555555555556e-07, "loss": 0.6549, "step": 19360 }, { "epoch": 1.9, "grad_norm": 7.309620380401611, "learning_rate": 3.5000000000000004e-07, "loss": 0.6615, "step": 19370 }, { "epoch": 1.9, "grad_norm": 8.672090530395508, "learning_rate": 3.444444444444445e-07, "loss": 0.6652, "step": 19380 }, { "epoch": 1.9, "grad_norm": 6.958488941192627, "learning_rate": 3.3888888888888895e-07, "loss": 0.6375, "step": 19390 }, { "epoch": 1.9, "grad_norm": 9.871621131896973, "learning_rate": 3.3333333333333335e-07, "loss": 0.6593, "step": 19400 }, { "epoch": 1.9, "grad_norm": 9.769947052001953, "learning_rate": 3.277777777777778e-07, "loss": 0.6377, "step": 19410 }, { "epoch": 1.9, "grad_norm": 8.695141792297363, "learning_rate": 3.2222222222222227e-07, "loss": 0.6607, "step": 19420 }, { "epoch": 1.9, "grad_norm": 9.121576309204102, "learning_rate": 3.166666666666667e-07, "loss": 0.6461, "step": 19430 }, { "epoch": 1.91, "grad_norm": 10.443702697753906, "learning_rate": 3.111111111111111e-07, "loss": 0.6626, "step": 19440 }, { "epoch": 1.91, "grad_norm": 6.589414596557617, "learning_rate": 3.055555555555556e-07, "loss": 0.6501, "step": 19450 }, { "epoch": 1.91, "grad_norm": 8.429988861083984, "learning_rate": 3.0000000000000004e-07, "loss": 0.6362, "step": 19460 }, { "epoch": 1.91, "grad_norm": 9.502488136291504, "learning_rate": 2.9444444444444444e-07, "loss": 0.65, "step": 19470 }, { "epoch": 1.91, "grad_norm": 8.772028923034668, "learning_rate": 2.888888888888889e-07, "loss": 0.6513, "step": 19480 }, { "epoch": 1.91, "grad_norm": 8.207404136657715, "learning_rate": 2.8333333333333336e-07, "loss": 0.653, "step": 19490 }, { "epoch": 1.91, "grad_norm": 8.614123344421387, "learning_rate": 2.7777777777777776e-07, "loss": 0.6604, "step": 19500 }, { "epoch": 1.91, "grad_norm": 9.29067611694336, "learning_rate": 2.722222222222222e-07, "loss": 0.6601, "step": 19510 }, { "epoch": 1.91, "grad_norm": 8.018574714660645, "learning_rate": 2.666666666666667e-07, "loss": 0.6484, "step": 19520 }, { "epoch": 1.91, "grad_norm": 9.233197212219238, "learning_rate": 2.6111111111111113e-07, "loss": 0.6479, "step": 19530 }, { "epoch": 1.91, "grad_norm": 7.148528575897217, "learning_rate": 2.555555555555556e-07, "loss": 0.6494, "step": 19540 }, { "epoch": 1.92, "grad_norm": 9.95731258392334, "learning_rate": 2.5000000000000004e-07, "loss": 0.6689, "step": 19550 }, { "epoch": 1.92, "grad_norm": 6.882303237915039, "learning_rate": 2.444444444444445e-07, "loss": 0.6597, "step": 19560 }, { "epoch": 1.92, "grad_norm": 8.601186752319336, "learning_rate": 2.388888888888889e-07, "loss": 0.6527, "step": 19570 }, { "epoch": 1.92, "grad_norm": 8.610597610473633, "learning_rate": 2.3333333333333336e-07, "loss": 0.6412, "step": 19580 }, { "epoch": 1.92, "grad_norm": 7.233694553375244, "learning_rate": 2.2777777777777781e-07, "loss": 0.6561, "step": 19590 }, { "epoch": 1.92, "grad_norm": 7.422303199768066, "learning_rate": 2.2222222222222224e-07, "loss": 0.6515, "step": 19600 }, { "epoch": 1.92, "grad_norm": 8.440985679626465, "learning_rate": 2.166666666666667e-07, "loss": 0.6512, "step": 19610 }, { "epoch": 1.92, "grad_norm": 8.715599060058594, "learning_rate": 2.1111111111111113e-07, "loss": 0.6431, "step": 19620 }, { "epoch": 1.92, "grad_norm": 9.305037498474121, "learning_rate": 2.055555555555556e-07, "loss": 0.6419, "step": 19630 }, { "epoch": 1.92, "grad_norm": 7.697106838226318, "learning_rate": 2.0000000000000002e-07, "loss": 0.6494, "step": 19640 }, { "epoch": 1.93, "grad_norm": 10.639257431030273, "learning_rate": 1.9444444444444447e-07, "loss": 0.6605, "step": 19650 }, { "epoch": 1.93, "grad_norm": 7.247119903564453, "learning_rate": 1.888888888888889e-07, "loss": 0.6682, "step": 19660 }, { "epoch": 1.93, "grad_norm": 8.35676097869873, "learning_rate": 1.8333333333333336e-07, "loss": 0.6525, "step": 19670 }, { "epoch": 1.93, "grad_norm": 8.366414070129395, "learning_rate": 1.777777777777778e-07, "loss": 0.6582, "step": 19680 }, { "epoch": 1.93, "grad_norm": 8.281257629394531, "learning_rate": 1.7222222222222225e-07, "loss": 0.6616, "step": 19690 }, { "epoch": 1.93, "grad_norm": 7.995990753173828, "learning_rate": 1.6666666666666668e-07, "loss": 0.6291, "step": 19700 }, { "epoch": 1.93, "grad_norm": 7.391417026519775, "learning_rate": 1.6111111111111113e-07, "loss": 0.6517, "step": 19710 }, { "epoch": 1.93, "grad_norm": 8.39581298828125, "learning_rate": 1.5555555555555556e-07, "loss": 0.6476, "step": 19720 }, { "epoch": 1.93, "grad_norm": 10.042542457580566, "learning_rate": 1.5000000000000002e-07, "loss": 0.6558, "step": 19730 }, { "epoch": 1.93, "grad_norm": 9.45720100402832, "learning_rate": 1.4444444444444445e-07, "loss": 0.6315, "step": 19740 }, { "epoch": 1.94, "grad_norm": 9.80663013458252, "learning_rate": 1.3888888888888888e-07, "loss": 0.6465, "step": 19750 }, { "epoch": 1.94, "grad_norm": 8.370614051818848, "learning_rate": 1.3333333333333336e-07, "loss": 0.6392, "step": 19760 }, { "epoch": 1.94, "grad_norm": 10.43559741973877, "learning_rate": 1.277777777777778e-07, "loss": 0.6353, "step": 19770 }, { "epoch": 1.94, "grad_norm": 7.869793891906738, "learning_rate": 1.2222222222222225e-07, "loss": 0.6352, "step": 19780 }, { "epoch": 1.94, "grad_norm": 9.977088928222656, "learning_rate": 1.1666666666666668e-07, "loss": 0.6548, "step": 19790 }, { "epoch": 1.94, "grad_norm": 8.949152946472168, "learning_rate": 1.1111111111111112e-07, "loss": 0.6333, "step": 19800 }, { "epoch": 1.94, "grad_norm": 8.809663772583008, "learning_rate": 1.0555555555555557e-07, "loss": 0.6391, "step": 19810 }, { "epoch": 1.94, "grad_norm": 10.625847816467285, "learning_rate": 1.0000000000000001e-07, "loss": 0.6467, "step": 19820 }, { "epoch": 1.94, "grad_norm": 8.383655548095703, "learning_rate": 9.444444444444445e-08, "loss": 0.6277, "step": 19830 }, { "epoch": 1.94, "grad_norm": 7.44985818862915, "learning_rate": 8.88888888888889e-08, "loss": 0.6442, "step": 19840 }, { "epoch": 1.95, "grad_norm": 7.30179500579834, "learning_rate": 8.333333333333334e-08, "loss": 0.6377, "step": 19850 }, { "epoch": 1.95, "grad_norm": 10.289073944091797, "learning_rate": 7.777777777777778e-08, "loss": 0.6545, "step": 19860 }, { "epoch": 1.95, "grad_norm": 8.414495468139648, "learning_rate": 7.222222222222222e-08, "loss": 0.6483, "step": 19870 }, { "epoch": 1.95, "grad_norm": 7.419222831726074, "learning_rate": 6.666666666666668e-08, "loss": 0.6423, "step": 19880 }, { "epoch": 1.95, "grad_norm": 8.271702766418457, "learning_rate": 6.111111111111112e-08, "loss": 0.6376, "step": 19890 }, { "epoch": 1.95, "grad_norm": 9.014019966125488, "learning_rate": 5.555555555555556e-08, "loss": 0.6423, "step": 19900 }, { "epoch": 1.95, "grad_norm": 7.186811923980713, "learning_rate": 5.0000000000000004e-08, "loss": 0.63, "step": 19910 }, { "epoch": 1.95, "grad_norm": 10.329126358032227, "learning_rate": 4.444444444444445e-08, "loss": 0.6514, "step": 19920 }, { "epoch": 1.95, "grad_norm": 7.666452884674072, "learning_rate": 3.888888888888889e-08, "loss": 0.6533, "step": 19930 }, { "epoch": 1.95, "grad_norm": 8.645721435546875, "learning_rate": 3.333333333333334e-08, "loss": 0.6585, "step": 19940 }, { "epoch": 1.96, "grad_norm": 8.800760269165039, "learning_rate": 2.777777777777778e-08, "loss": 0.6536, "step": 19950 }, { "epoch": 1.96, "grad_norm": 8.414071083068848, "learning_rate": 2.2222222222222224e-08, "loss": 0.6384, "step": 19960 }, { "epoch": 1.96, "grad_norm": 8.279068946838379, "learning_rate": 1.666666666666667e-08, "loss": 0.6619, "step": 19970 }, { "epoch": 1.96, "grad_norm": 8.45858097076416, "learning_rate": 1.1111111111111112e-08, "loss": 0.6296, "step": 19980 }, { "epoch": 1.96, "grad_norm": 8.840774536132812, "learning_rate": 5.555555555555556e-09, "loss": 0.6411, "step": 19990 }, { "epoch": 1.96, "grad_norm": 8.836251258850098, "learning_rate": 0.0, "loss": 0.6223, "step": 20000 }, { "epoch": 1.96, "eval_loss": 0.697283923625946, "eval_runtime": 25.0963, "eval_samples_per_second": 26.06, "eval_steps_per_second": 3.267, "step": 20000 } ], "logging_steps": 10, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1, "total_flos": 5.077201303159964e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }