{ "best_metric": 1.0089582204818726, "best_model_checkpoint": "./output/checkpoint-4950", "epoch": 0.17911419887103777, "eval_steps": 150, "global_step": 4950, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003618468664061369, "grad_norm": 1.2045749425888062, "learning_rate": 5.500000000000001e-06, "loss": 1.144, "step": 10 }, { "epoch": 0.0007236937328122738, "grad_norm": 1.50728178024292, "learning_rate": 1.1000000000000001e-05, "loss": 1.1809, "step": 20 }, { "epoch": 0.0010855405992184109, "grad_norm": 0.9494473934173584, "learning_rate": 1.65e-05, "loss": 1.0738, "step": 30 }, { "epoch": 0.0014473874656245477, "grad_norm": 0.957133948802948, "learning_rate": 2.2000000000000003e-05, "loss": 0.9573, "step": 40 }, { "epoch": 0.0018092343320306847, "grad_norm": 1.7268428802490234, "learning_rate": 2.75e-05, "loss": 1.0361, "step": 50 }, { "epoch": 0.0021710811984368217, "grad_norm": 1.1843866109848022, "learning_rate": 3.3e-05, "loss": 1.0351, "step": 60 }, { "epoch": 0.0025329280648429585, "grad_norm": 1.5817480087280273, "learning_rate": 3.85e-05, "loss": 1.1654, "step": 70 }, { "epoch": 0.0028947749312490953, "grad_norm": 0.8221575617790222, "learning_rate": 4.4000000000000006e-05, "loss": 1.1031, "step": 80 }, { "epoch": 0.003256621797655232, "grad_norm": 1.0210144519805908, "learning_rate": 4.9500000000000004e-05, "loss": 1.2083, "step": 90 }, { "epoch": 0.0036184686640613694, "grad_norm": 1.6523082256317139, "learning_rate": 5.5e-05, "loss": 1.1551, "step": 100 }, { "epoch": 0.003980315530467506, "grad_norm": 1.3959214687347412, "learning_rate": 5.4999434791355066e-05, "loss": 1.2088, "step": 110 }, { "epoch": 0.004342162396873643, "grad_norm": 1.7850854396820068, "learning_rate": 5.4997739188653784e-05, "loss": 1.0394, "step": 120 }, { "epoch": 0.00470400926327978, "grad_norm": 1.707861304283142, "learning_rate": 5.4994913261595724e-05, "loss": 1.0406, "step": 130 }, { "epoch": 0.005065856129685917, "grad_norm": 1.622674584388733, "learning_rate": 5.49909571263437e-05, "loss": 1.0781, "step": 140 }, { "epoch": 0.005427702996092054, "grad_norm": 1.135132908821106, "learning_rate": 5.498587094551892e-05, "loss": 1.2658, "step": 150 }, { "epoch": 0.005427702996092054, "eval_loss": 1.0342717170715332, "eval_runtime": 68.3584, "eval_samples_per_second": 7.314, "eval_steps_per_second": 7.314, "step": 150 }, { "epoch": 0.005789549862498191, "grad_norm": 1.2902684211730957, "learning_rate": 5.497965492819436e-05, "loss": 1.1822, "step": 160 }, { "epoch": 0.0061513967289043275, "grad_norm": 1.518955111503601, "learning_rate": 5.4972309329886156e-05, "loss": 1.1241, "step": 170 }, { "epoch": 0.006513243595310464, "grad_norm": 1.0765914916992188, "learning_rate": 5.496383445254307e-05, "loss": 1.1326, "step": 180 }, { "epoch": 0.006875090461716602, "grad_norm": 1.3479045629501343, "learning_rate": 5.495423064453413e-05, "loss": 1.097, "step": 190 }, { "epoch": 0.007236937328122739, "grad_norm": 1.528658151626587, "learning_rate": 5.4943498300634254e-05, "loss": 0.9511, "step": 200 }, { "epoch": 0.007598784194528876, "grad_norm": 1.3864845037460327, "learning_rate": 5.493163786200807e-05, "loss": 1.0936, "step": 210 }, { "epoch": 0.007960631060935012, "grad_norm": 1.2771916389465332, "learning_rate": 5.491864981619175e-05, "loss": 1.1032, "step": 220 }, { "epoch": 0.00832247792734115, "grad_norm": 1.3746259212493896, "learning_rate": 5.4904534697073e-05, "loss": 1.0904, "step": 230 }, { "epoch": 0.008684324793747287, "grad_norm": 1.0093128681182861, "learning_rate": 5.488929308486908e-05, "loss": 1.1204, "step": 240 }, { "epoch": 0.009046171660153424, "grad_norm": 1.7553023099899292, "learning_rate": 5.487292560610295e-05, "loss": 1.1739, "step": 250 }, { "epoch": 0.00940801852655956, "grad_norm": 1.1970763206481934, "learning_rate": 5.485543293357758e-05, "loss": 1.0918, "step": 260 }, { "epoch": 0.009769865392965697, "grad_norm": 2.0827841758728027, "learning_rate": 5.483681578634821e-05, "loss": 1.2, "step": 270 }, { "epoch": 0.010131712259371834, "grad_norm": 1.5536803007125854, "learning_rate": 5.481707492969285e-05, "loss": 1.1228, "step": 280 }, { "epoch": 0.010493559125777971, "grad_norm": 0.9219818115234375, "learning_rate": 5.479621117508079e-05, "loss": 1.0677, "step": 290 }, { "epoch": 0.010855405992184108, "grad_norm": 2.04948353767395, "learning_rate": 5.477422538013927e-05, "loss": 1.0464, "step": 300 }, { "epoch": 0.010855405992184108, "eval_loss": 1.0371991395950317, "eval_runtime": 68.6028, "eval_samples_per_second": 7.288, "eval_steps_per_second": 7.288, "step": 300 }, { "epoch": 0.011217252858590245, "grad_norm": 1.1368346214294434, "learning_rate": 5.475111844861821e-05, "loss": 1.0531, "step": 310 }, { "epoch": 0.011579099724996381, "grad_norm": 1.423888087272644, "learning_rate": 5.4726891330353056e-05, "loss": 1.0611, "step": 320 }, { "epoch": 0.011940946591402518, "grad_norm": 1.2623565196990967, "learning_rate": 5.4701545021225746e-05, "loss": 1.0878, "step": 330 }, { "epoch": 0.012302793457808655, "grad_norm": 5.527337551116943, "learning_rate": 5.4675080563123786e-05, "loss": 1.1356, "step": 340 }, { "epoch": 0.012664640324214792, "grad_norm": 2.037787914276123, "learning_rate": 5.4647499043897386e-05, "loss": 0.9942, "step": 350 }, { "epoch": 0.013026487190620929, "grad_norm": 1.0420938730239868, "learning_rate": 5.461880159731476e-05, "loss": 1.1172, "step": 360 }, { "epoch": 0.013388334057027065, "grad_norm": 1.4465577602386475, "learning_rate": 5.4588989403015564e-05, "loss": 1.1869, "step": 370 }, { "epoch": 0.013750180923433204, "grad_norm": 1.2765371799468994, "learning_rate": 5.4558063686462315e-05, "loss": 1.2266, "step": 380 }, { "epoch": 0.01411202778983934, "grad_norm": 1.6594829559326172, "learning_rate": 5.4526025718890104e-05, "loss": 1.1412, "step": 390 }, { "epoch": 0.014473874656245478, "grad_norm": 3.317193031311035, "learning_rate": 5.44928768172543e-05, "loss": 1.2663, "step": 400 }, { "epoch": 0.014835721522651614, "grad_norm": 1.1468137502670288, "learning_rate": 5.44586183441764e-05, "loss": 0.9972, "step": 410 }, { "epoch": 0.015197568389057751, "grad_norm": 0.899064838886261, "learning_rate": 5.442325170788806e-05, "loss": 1.0367, "step": 420 }, { "epoch": 0.015559415255463888, "grad_norm": 1.5718448162078857, "learning_rate": 5.438677836217317e-05, "loss": 0.8761, "step": 430 }, { "epoch": 0.015921262121870023, "grad_norm": 1.5378243923187256, "learning_rate": 5.434919980630811e-05, "loss": 1.0723, "step": 440 }, { "epoch": 0.01628310898827616, "grad_norm": 1.8993847370147705, "learning_rate": 5.431051758500015e-05, "loss": 1.2253, "step": 450 }, { "epoch": 0.01628310898827616, "eval_loss": 1.032397985458374, "eval_runtime": 68.5965, "eval_samples_per_second": 7.289, "eval_steps_per_second": 7.289, "step": 450 }, { "epoch": 0.0166449558546823, "grad_norm": 1.047999620437622, "learning_rate": 5.427073328832388e-05, "loss": 1.1485, "step": 460 }, { "epoch": 0.017006802721088437, "grad_norm": 1.5674751996994019, "learning_rate": 5.422984855165592e-05, "loss": 1.1283, "step": 470 }, { "epoch": 0.017368649587494574, "grad_norm": 1.691994309425354, "learning_rate": 5.418786505560766e-05, "loss": 1.1811, "step": 480 }, { "epoch": 0.01773049645390071, "grad_norm": 0.521550178527832, "learning_rate": 5.414478452595617e-05, "loss": 1.0966, "step": 490 }, { "epoch": 0.018092343320306847, "grad_norm": 1.1992415189743042, "learning_rate": 5.4100608733573315e-05, "loss": 1.1177, "step": 500 }, { "epoch": 0.018454190186712984, "grad_norm": 1.4018158912658691, "learning_rate": 5.4055339494352874e-05, "loss": 1.1203, "step": 510 }, { "epoch": 0.01881603705311912, "grad_norm": 1.0247960090637207, "learning_rate": 5.400897866913597e-05, "loss": 1.3419, "step": 520 }, { "epoch": 0.019177883919525258, "grad_norm": 1.0322446823120117, "learning_rate": 5.3961528163634546e-05, "loss": 0.9993, "step": 530 }, { "epoch": 0.019539730785931395, "grad_norm": 1.6083568334579468, "learning_rate": 5.391298992835303e-05, "loss": 0.9918, "step": 540 }, { "epoch": 0.01990157765233753, "grad_norm": 1.3827457427978516, "learning_rate": 5.386336595850817e-05, "loss": 1.1648, "step": 550 }, { "epoch": 0.020263424518743668, "grad_norm": 1.080629587173462, "learning_rate": 5.3812658293946995e-05, "loss": 1.0942, "step": 560 }, { "epoch": 0.020625271385149805, "grad_norm": 1.469224214553833, "learning_rate": 5.376086901906299e-05, "loss": 1.133, "step": 570 }, { "epoch": 0.020987118251555942, "grad_norm": 1.598305106163025, "learning_rate": 5.37080002627104e-05, "loss": 1.1817, "step": 580 }, { "epoch": 0.02134896511796208, "grad_norm": 1.6718581914901733, "learning_rate": 5.365405419811673e-05, "loss": 1.2593, "step": 590 }, { "epoch": 0.021710811984368215, "grad_norm": 1.4570552110671997, "learning_rate": 5.359903304279339e-05, "loss": 1.0208, "step": 600 }, { "epoch": 0.021710811984368215, "eval_loss": 1.0313901901245117, "eval_runtime": 67.8847, "eval_samples_per_second": 7.365, "eval_steps_per_second": 7.365, "step": 600 }, { "epoch": 0.022072658850774352, "grad_norm": 2.346705198287964, "learning_rate": 5.354293905844459e-05, "loss": 0.985, "step": 610 }, { "epoch": 0.02243450571718049, "grad_norm": 0.9685454368591309, "learning_rate": 5.3485774550874306e-05, "loss": 1.0461, "step": 620 }, { "epoch": 0.022796352583586626, "grad_norm": 1.9338666200637817, "learning_rate": 5.3427541869891556e-05, "loss": 1.042, "step": 630 }, { "epoch": 0.023158199449992763, "grad_norm": 1.9683053493499756, "learning_rate": 5.336824340921377e-05, "loss": 1.2535, "step": 640 }, { "epoch": 0.0235200463163989, "grad_norm": 2.317091941833496, "learning_rate": 5.330788160636841e-05, "loss": 1.1503, "step": 650 }, { "epoch": 0.023881893182805036, "grad_norm": 1.8341249227523804, "learning_rate": 5.3246458942592776e-05, "loss": 1.0744, "step": 660 }, { "epoch": 0.024243740049211173, "grad_norm": 1.035079836845398, "learning_rate": 5.318397794273199e-05, "loss": 1.1281, "step": 670 }, { "epoch": 0.02460558691561731, "grad_norm": 1.7849830389022827, "learning_rate": 5.312044117513524e-05, "loss": 1.0792, "step": 680 }, { "epoch": 0.024967433782023447, "grad_norm": 1.472554087638855, "learning_rate": 5.305585125155018e-05, "loss": 1.1503, "step": 690 }, { "epoch": 0.025329280648429583, "grad_norm": 1.3164336681365967, "learning_rate": 5.29902108270156e-05, "loss": 1.1399, "step": 700 }, { "epoch": 0.02569112751483572, "grad_norm": 1.5491093397140503, "learning_rate": 5.2923522599752245e-05, "loss": 1.1174, "step": 710 }, { "epoch": 0.026052974381241857, "grad_norm": 1.188999891281128, "learning_rate": 5.2855789311051945e-05, "loss": 1.1546, "step": 720 }, { "epoch": 0.026414821247647994, "grad_norm": 1.0815441608428955, "learning_rate": 5.27870137451649e-05, "loss": 1.057, "step": 730 }, { "epoch": 0.02677666811405413, "grad_norm": 1.3092780113220215, "learning_rate": 5.2717198729185245e-05, "loss": 1.1296, "step": 740 }, { "epoch": 0.027138514980460268, "grad_norm": 1.8252133131027222, "learning_rate": 5.264634713293485e-05, "loss": 1.1683, "step": 750 }, { "epoch": 0.027138514980460268, "eval_loss": 1.0285245180130005, "eval_runtime": 68.0631, "eval_samples_per_second": 7.346, "eval_steps_per_second": 7.346, "step": 750 }, { "epoch": 0.027500361846866408, "grad_norm": 1.1759158372879028, "learning_rate": 5.2574461868845316e-05, "loss": 1.0939, "step": 760 }, { "epoch": 0.027862208713272545, "grad_norm": 1.3588383197784424, "learning_rate": 5.2501545891838315e-05, "loss": 1.1117, "step": 770 }, { "epoch": 0.02822405557967868, "grad_norm": 0.8850705027580261, "learning_rate": 5.242760219920405e-05, "loss": 0.9594, "step": 780 }, { "epoch": 0.028585902446084818, "grad_norm": 1.5548638105392456, "learning_rate": 5.235263383047812e-05, "loss": 1.0963, "step": 790 }, { "epoch": 0.028947749312490955, "grad_norm": 1.242234468460083, "learning_rate": 5.2276643867316525e-05, "loss": 0.915, "step": 800 }, { "epoch": 0.029309596178897092, "grad_norm": 1.5917716026306152, "learning_rate": 5.219963543336902e-05, "loss": 1.1261, "step": 810 }, { "epoch": 0.02967144304530323, "grad_norm": 1.3814702033996582, "learning_rate": 5.212161169415071e-05, "loss": 1.1657, "step": 820 }, { "epoch": 0.030033289911709365, "grad_norm": 2.3621139526367188, "learning_rate": 5.204257585691191e-05, "loss": 1.121, "step": 830 }, { "epoch": 0.030395136778115502, "grad_norm": 1.4163732528686523, "learning_rate": 5.196253117050633e-05, "loss": 1.0952, "step": 840 }, { "epoch": 0.03075698364452164, "grad_norm": 1.7806810140609741, "learning_rate": 5.188148092525751e-05, "loss": 1.079, "step": 850 }, { "epoch": 0.031118830510927776, "grad_norm": 1.4245884418487549, "learning_rate": 5.179942845282357e-05, "loss": 1.0976, "step": 860 }, { "epoch": 0.03148067737733391, "grad_norm": 1.1371049880981445, "learning_rate": 5.17163771260603e-05, "loss": 1.2228, "step": 870 }, { "epoch": 0.031842524243740046, "grad_norm": 1.6769368648529053, "learning_rate": 5.163233035888244e-05, "loss": 1.0914, "step": 880 }, { "epoch": 0.032204371110146186, "grad_norm": 1.042022466659546, "learning_rate": 5.154729160612338e-05, "loss": 1.044, "step": 890 }, { "epoch": 0.03256621797655232, "grad_norm": 5.278915882110596, "learning_rate": 5.146126436339321e-05, "loss": 1.0234, "step": 900 }, { "epoch": 0.03256621797655232, "eval_loss": 1.0301809310913086, "eval_runtime": 67.3202, "eval_samples_per_second": 7.427, "eval_steps_per_second": 7.427, "step": 900 }, { "epoch": 0.03292806484295846, "grad_norm": 1.3499010801315308, "learning_rate": 5.137425216693491e-05, "loss": 1.1798, "step": 910 }, { "epoch": 0.0332899117093646, "grad_norm": 1.4902702569961548, "learning_rate": 5.128625859347907e-05, "loss": 1.2082, "step": 920 }, { "epoch": 0.033651758575770734, "grad_norm": 0.9549224376678467, "learning_rate": 5.1197287260096865e-05, "loss": 1.0619, "step": 930 }, { "epoch": 0.034013605442176874, "grad_norm": 1.7096573114395142, "learning_rate": 5.110734182405132e-05, "loss": 1.0151, "step": 940 }, { "epoch": 0.03437545230858301, "grad_norm": 1.4213861227035522, "learning_rate": 5.1016425982647025e-05, "loss": 1.1281, "step": 950 }, { "epoch": 0.03473729917498915, "grad_norm": 0.9421877861022949, "learning_rate": 5.092454347307812e-05, "loss": 1.1774, "step": 960 }, { "epoch": 0.03509914604139528, "grad_norm": 2.414092779159546, "learning_rate": 5.08316980722747e-05, "loss": 1.2083, "step": 970 }, { "epoch": 0.03546099290780142, "grad_norm": 2.7780957221984863, "learning_rate": 5.0737893596747534e-05, "loss": 1.0388, "step": 980 }, { "epoch": 0.035822839774207554, "grad_norm": 1.1338950395584106, "learning_rate": 5.064313390243121e-05, "loss": 1.21, "step": 990 }, { "epoch": 0.036184686640613695, "grad_norm": 1.8430678844451904, "learning_rate": 5.054742288452562e-05, "loss": 1.1968, "step": 1000 }, { "epoch": 0.03654653350701983, "grad_norm": 1.3380565643310547, "learning_rate": 5.0450764477335825e-05, "loss": 1.0445, "step": 1010 }, { "epoch": 0.03690838037342597, "grad_norm": 1.1650495529174805, "learning_rate": 5.035316265411036e-05, "loss": 1.1366, "step": 1020 }, { "epoch": 0.0372702272398321, "grad_norm": 1.743957757949829, "learning_rate": 5.02546214268779e-05, "loss": 1.1349, "step": 1030 }, { "epoch": 0.03763207410623824, "grad_norm": 1.059372067451477, "learning_rate": 5.0155144846282345e-05, "loss": 1.0034, "step": 1040 }, { "epoch": 0.037993920972644375, "grad_norm": 1.9560377597808838, "learning_rate": 5.005473700141629e-05, "loss": 1.1981, "step": 1050 }, { "epoch": 0.037993920972644375, "eval_loss": 1.0264325141906738, "eval_runtime": 68.1791, "eval_samples_per_second": 7.334, "eval_steps_per_second": 7.334, "step": 1050 }, { "epoch": 0.038355767839050516, "grad_norm": 4.450094223022461, "learning_rate": 4.995340201965296e-05, "loss": 1.2232, "step": 1060 }, { "epoch": 0.03871761470545665, "grad_norm": 1.3787989616394043, "learning_rate": 4.985114406647658e-05, "loss": 1.1268, "step": 1070 }, { "epoch": 0.03907946157186279, "grad_norm": 1.3675071001052856, "learning_rate": 4.9747967345311055e-05, "loss": 1.0921, "step": 1080 }, { "epoch": 0.03944130843826892, "grad_norm": 1.2795029878616333, "learning_rate": 4.9643876097347296e-05, "loss": 1.0545, "step": 1090 }, { "epoch": 0.03980315530467506, "grad_norm": 3.8761634826660156, "learning_rate": 4.953887460136881e-05, "loss": 1.1652, "step": 1100 }, { "epoch": 0.040165002171081196, "grad_norm": 1.2283992767333984, "learning_rate": 4.943296717357583e-05, "loss": 1.1534, "step": 1110 }, { "epoch": 0.040526849037487336, "grad_norm": 1.2567368745803833, "learning_rate": 4.93261581674079e-05, "loss": 0.9526, "step": 1120 }, { "epoch": 0.04088869590389347, "grad_norm": 1.1028484106063843, "learning_rate": 4.921845197336491e-05, "loss": 1.1153, "step": 1130 }, { "epoch": 0.04125054277029961, "grad_norm": 0.965897262096405, "learning_rate": 4.910985301882667e-05, "loss": 1.2386, "step": 1140 }, { "epoch": 0.04161238963670574, "grad_norm": 1.3109748363494873, "learning_rate": 4.9000365767870824e-05, "loss": 1.163, "step": 1150 }, { "epoch": 0.041974236503111884, "grad_norm": 1.360034704208374, "learning_rate": 4.8889994721089426e-05, "loss": 1.0163, "step": 1160 }, { "epoch": 0.04233608336951802, "grad_norm": 1.422423005104065, "learning_rate": 4.877874441540394e-05, "loss": 1.1239, "step": 1170 }, { "epoch": 0.04269793023592416, "grad_norm": 0.9702832698822021, "learning_rate": 4.866661942387867e-05, "loss": 1.3376, "step": 1180 }, { "epoch": 0.04305977710233029, "grad_norm": 1.6079213619232178, "learning_rate": 4.855362435553285e-05, "loss": 1.1713, "step": 1190 }, { "epoch": 0.04342162396873643, "grad_norm": 0.9138990640640259, "learning_rate": 4.84397638551512e-05, "loss": 1.0748, "step": 1200 }, { "epoch": 0.04342162396873643, "eval_loss": 1.0269697904586792, "eval_runtime": 68.4525, "eval_samples_per_second": 7.304, "eval_steps_per_second": 7.304, "step": 1200 }, { "epoch": 0.04378347083514257, "grad_norm": 1.3866156339645386, "learning_rate": 4.83250426030929e-05, "loss": 1.12, "step": 1210 }, { "epoch": 0.044145317701548704, "grad_norm": 1.9904563426971436, "learning_rate": 4.82094653150993e-05, "loss": 1.3514, "step": 1220 }, { "epoch": 0.044507164567954845, "grad_norm": 1.5623399019241333, "learning_rate": 4.8093036742100026e-05, "loss": 1.0571, "step": 1230 }, { "epoch": 0.04486901143436098, "grad_norm": 0.9663859009742737, "learning_rate": 4.79757616700177e-05, "loss": 1.1396, "step": 1240 }, { "epoch": 0.04523085830076712, "grad_norm": 0.9798938632011414, "learning_rate": 4.7857644919571176e-05, "loss": 1.0056, "step": 1250 }, { "epoch": 0.04559270516717325, "grad_norm": 1.2511754035949707, "learning_rate": 4.773869134607747e-05, "loss": 1.0136, "step": 1260 }, { "epoch": 0.04595455203357939, "grad_norm": 2.900221347808838, "learning_rate": 4.761890583925204e-05, "loss": 1.0976, "step": 1270 }, { "epoch": 0.046316398899985525, "grad_norm": 2.022226333618164, "learning_rate": 4.749829332300792e-05, "loss": 1.1334, "step": 1280 }, { "epoch": 0.046678245766391666, "grad_norm": 1.6037700176239014, "learning_rate": 4.737685875525327e-05, "loss": 1.0719, "step": 1290 }, { "epoch": 0.0470400926327978, "grad_norm": 0.7749394178390503, "learning_rate": 4.725460712768751e-05, "loss": 1.0659, "step": 1300 }, { "epoch": 0.04740193949920394, "grad_norm": 2.769139289855957, "learning_rate": 4.7131543465596236e-05, "loss": 1.1173, "step": 1310 }, { "epoch": 0.04776378636561007, "grad_norm": 0.9987087249755859, "learning_rate": 4.700767282764459e-05, "loss": 1.0688, "step": 1320 }, { "epoch": 0.04812563323201621, "grad_norm": 1.5522490739822388, "learning_rate": 4.688300030566933e-05, "loss": 0.9504, "step": 1330 }, { "epoch": 0.048487480098422346, "grad_norm": 1.4506887197494507, "learning_rate": 4.6757531024469514e-05, "loss": 1.061, "step": 1340 }, { "epoch": 0.048849326964828486, "grad_norm": 1.1611425876617432, "learning_rate": 4.663127014159588e-05, "loss": 1.0629, "step": 1350 }, { "epoch": 0.048849326964828486, "eval_loss": 1.0257459878921509, "eval_runtime": 68.1868, "eval_samples_per_second": 7.333, "eval_steps_per_second": 7.333, "step": 1350 }, { "epoch": 0.04921117383123462, "grad_norm": 1.5671613216400146, "learning_rate": 4.650422284713878e-05, "loss": 1.1071, "step": 1360 }, { "epoch": 0.04957302069764076, "grad_norm": 1.1590784788131714, "learning_rate": 4.637639436351489e-05, "loss": 1.1151, "step": 1370 }, { "epoch": 0.04993486756404689, "grad_norm": 14.556044578552246, "learning_rate": 4.624778994525249e-05, "loss": 0.9887, "step": 1380 }, { "epoch": 0.050296714430453034, "grad_norm": 1.6293423175811768, "learning_rate": 4.6118414878775514e-05, "loss": 1.0136, "step": 1390 }, { "epoch": 0.05065856129685917, "grad_norm": 1.2317134141921997, "learning_rate": 4.5988274482186214e-05, "loss": 1.0885, "step": 1400 }, { "epoch": 0.05102040816326531, "grad_norm": 1.2565045356750488, "learning_rate": 4.5857374105046574e-05, "loss": 0.9929, "step": 1410 }, { "epoch": 0.05138225502967144, "grad_norm": 2.7197911739349365, "learning_rate": 4.572571912815838e-05, "loss": 1.0843, "step": 1420 }, { "epoch": 0.05174410189607758, "grad_norm": 1.3743138313293457, "learning_rate": 4.55933149633421e-05, "loss": 1.0817, "step": 1430 }, { "epoch": 0.052105948762483714, "grad_norm": 1.5609480142593384, "learning_rate": 4.5460167053214335e-05, "loss": 1.0412, "step": 1440 }, { "epoch": 0.052467795628889854, "grad_norm": 1.5621366500854492, "learning_rate": 4.532628087096419e-05, "loss": 1.238, "step": 1450 }, { "epoch": 0.05282964249529599, "grad_norm": 1.0901319980621338, "learning_rate": 4.5191661920128194e-05, "loss": 1.0242, "step": 1460 }, { "epoch": 0.05319148936170213, "grad_norm": 1.1015669107437134, "learning_rate": 4.5056315734364154e-05, "loss": 1.1004, "step": 1470 }, { "epoch": 0.05355333622810826, "grad_norm": 1.2542566061019897, "learning_rate": 4.492024787722368e-05, "loss": 1.0685, "step": 1480 }, { "epoch": 0.0539151830945144, "grad_norm": 1.645579218864441, "learning_rate": 4.47834639419234e-05, "loss": 1.2279, "step": 1490 }, { "epoch": 0.054277029960920535, "grad_norm": 0.945669412612915, "learning_rate": 4.464596955111518e-05, "loss": 1.0911, "step": 1500 }, { "epoch": 0.054277029960920535, "eval_loss": 1.0207065343856812, "eval_runtime": 70.4281, "eval_samples_per_second": 7.099, "eval_steps_per_second": 7.099, "step": 1500 }, { "epoch": 0.054638876827326675, "grad_norm": 1.6988177299499512, "learning_rate": 4.450777035665487e-05, "loss": 1.0155, "step": 1510 }, { "epoch": 0.055000723693732816, "grad_norm": 1.5770177841186523, "learning_rate": 4.436887203937009e-05, "loss": 0.9457, "step": 1520 }, { "epoch": 0.05536257056013895, "grad_norm": 1.2684322595596313, "learning_rate": 4.422928030882661e-05, "loss": 1.1031, "step": 1530 }, { "epoch": 0.05572441742654509, "grad_norm": 1.9332138299942017, "learning_rate": 4.4089000903093746e-05, "loss": 1.1228, "step": 1540 }, { "epoch": 0.05608626429295122, "grad_norm": 1.6723462343215942, "learning_rate": 4.394803958850844e-05, "loss": 1.1539, "step": 1550 }, { "epoch": 0.05644811115935736, "grad_norm": 2.3764090538024902, "learning_rate": 4.380640215943821e-05, "loss": 0.9692, "step": 1560 }, { "epoch": 0.056809958025763496, "grad_norm": 1.5812585353851318, "learning_rate": 4.366409443804301e-05, "loss": 1.2003, "step": 1570 }, { "epoch": 0.057171804892169636, "grad_norm": 1.1422808170318604, "learning_rate": 4.352112227403589e-05, "loss": 1.0949, "step": 1580 }, { "epoch": 0.05753365175857577, "grad_norm": 1.386093020439148, "learning_rate": 4.337749154444254e-05, "loss": 0.9529, "step": 1590 }, { "epoch": 0.05789549862498191, "grad_norm": 1.3573766946792603, "learning_rate": 4.3233208153359665e-05, "loss": 1.1159, "step": 1600 }, { "epoch": 0.05825734549138804, "grad_norm": 0.9206913113594055, "learning_rate": 4.308827803171238e-05, "loss": 1.1379, "step": 1610 }, { "epoch": 0.058619192357794184, "grad_norm": 0.970383882522583, "learning_rate": 4.294270713701031e-05, "loss": 1.0876, "step": 1620 }, { "epoch": 0.05898103922420032, "grad_norm": 1.290249228477478, "learning_rate": 4.2796501453102784e-05, "loss": 1.2368, "step": 1630 }, { "epoch": 0.05934288609060646, "grad_norm": 1.5415809154510498, "learning_rate": 4.264966698993282e-05, "loss": 1.2861, "step": 1640 }, { "epoch": 0.05970473295701259, "grad_norm": 1.1242855787277222, "learning_rate": 4.2502209783290085e-05, "loss": 1.1262, "step": 1650 }, { "epoch": 0.05970473295701259, "eval_loss": 1.0186049938201904, "eval_runtime": 68.1883, "eval_samples_per_second": 7.333, "eval_steps_per_second": 7.333, "step": 1650 }, { "epoch": 0.06006657982341873, "grad_norm": 2.1882779598236084, "learning_rate": 4.235413589456281e-05, "loss": 1.1456, "step": 1660 }, { "epoch": 0.060428426689824864, "grad_norm": 1.3615387678146362, "learning_rate": 4.2205451410488565e-05, "loss": 1.0501, "step": 1670 }, { "epoch": 0.060790273556231005, "grad_norm": 1.084717869758606, "learning_rate": 4.205616244290416e-05, "loss": 1.2036, "step": 1680 }, { "epoch": 0.06115212042263714, "grad_norm": 3.1126012802124023, "learning_rate": 4.1906275128494296e-05, "loss": 0.9203, "step": 1690 }, { "epoch": 0.06151396728904328, "grad_norm": 0.9188119173049927, "learning_rate": 4.175579562853945e-05, "loss": 1.0536, "step": 1700 }, { "epoch": 0.06187581415544941, "grad_norm": 1.5308047533035278, "learning_rate": 4.160473012866242e-05, "loss": 1.0237, "step": 1710 }, { "epoch": 0.06223766102185555, "grad_norm": 2.6539103984832764, "learning_rate": 4.145308483857426e-05, "loss": 1.0854, "step": 1720 }, { "epoch": 0.06259950788826169, "grad_norm": 1.3063393831253052, "learning_rate": 4.1300865991818885e-05, "loss": 1.1074, "step": 1730 }, { "epoch": 0.06296135475466783, "grad_norm": 1.2517176866531372, "learning_rate": 4.114807984551688e-05, "loss": 1.2658, "step": 1740 }, { "epoch": 0.06332320162107397, "grad_norm": 1.6254620552062988, "learning_rate": 4.0994732680108296e-05, "loss": 1.0614, "step": 1750 }, { "epoch": 0.06368504848748009, "grad_norm": 2.9168360233306885, "learning_rate": 4.084083079909448e-05, "loss": 1.1139, "step": 1760 }, { "epoch": 0.06404689535388623, "grad_norm": 1.35561203956604, "learning_rate": 4.068638052877899e-05, "loss": 1.203, "step": 1770 }, { "epoch": 0.06440874222029237, "grad_norm": 1.4280883073806763, "learning_rate": 4.0531388218007466e-05, "loss": 1.1808, "step": 1780 }, { "epoch": 0.06477058908669851, "grad_norm": 1.5916036367416382, "learning_rate": 4.037586023790676e-05, "loss": 1.1019, "step": 1790 }, { "epoch": 0.06513243595310464, "grad_norm": 2.5409014225006104, "learning_rate": 4.0219802981622975e-05, "loss": 1.0124, "step": 1800 }, { "epoch": 0.06513243595310464, "eval_loss": 1.0174832344055176, "eval_runtime": 68.409, "eval_samples_per_second": 7.309, "eval_steps_per_second": 7.309, "step": 1800 }, { "epoch": 0.06549428281951078, "grad_norm": 1.073611855506897, "learning_rate": 4.006322286405867e-05, "loss": 1.1544, "step": 1810 }, { "epoch": 0.06585612968591692, "grad_norm": 1.1647859811782837, "learning_rate": 3.99061263216092e-05, "loss": 0.9499, "step": 1820 }, { "epoch": 0.06621797655232306, "grad_norm": 1.0000501871109009, "learning_rate": 3.974851981189813e-05, "loss": 1.1405, "step": 1830 }, { "epoch": 0.0665798234187292, "grad_norm": 1.062009572982788, "learning_rate": 3.9590409813511765e-05, "loss": 1.1108, "step": 1840 }, { "epoch": 0.06694167028513533, "grad_norm": 1.331750512123108, "learning_rate": 3.943180282573285e-05, "loss": 1.0102, "step": 1850 }, { "epoch": 0.06730351715154147, "grad_norm": 1.2916260957717896, "learning_rate": 3.927270536827346e-05, "loss": 1.228, "step": 1860 }, { "epoch": 0.06766536401794761, "grad_norm": 2.2148149013519287, "learning_rate": 3.91131239810069e-05, "loss": 1.1242, "step": 1870 }, { "epoch": 0.06802721088435375, "grad_norm": 2.7903716564178467, "learning_rate": 3.895306522369898e-05, "loss": 1.0128, "step": 1880 }, { "epoch": 0.06838905775075987, "grad_norm": 1.241196632385254, "learning_rate": 3.87925356757383e-05, "loss": 1.1023, "step": 1890 }, { "epoch": 0.06875090461716601, "grad_norm": 1.2638871669769287, "learning_rate": 3.863154193586583e-05, "loss": 1.1439, "step": 1900 }, { "epoch": 0.06911275148357215, "grad_norm": 5.679568290710449, "learning_rate": 3.847009062190365e-05, "loss": 1.1571, "step": 1910 }, { "epoch": 0.0694745983499783, "grad_norm": 1.760812520980835, "learning_rate": 3.83081883704829e-05, "loss": 1.2032, "step": 1920 }, { "epoch": 0.06983644521638442, "grad_norm": 1.1166123151779175, "learning_rate": 3.814584183677102e-05, "loss": 0.9787, "step": 1930 }, { "epoch": 0.07019829208279056, "grad_norm": 1.0514557361602783, "learning_rate": 3.7983057694198145e-05, "loss": 1.0041, "step": 1940 }, { "epoch": 0.0705601389491967, "grad_norm": 1.1916346549987793, "learning_rate": 3.781984263418279e-05, "loss": 1.189, "step": 1950 }, { "epoch": 0.0705601389491967, "eval_loss": 1.020028829574585, "eval_runtime": 68.056, "eval_samples_per_second": 7.347, "eval_steps_per_second": 7.347, "step": 1950 }, { "epoch": 0.07092198581560284, "grad_norm": 1.2160232067108154, "learning_rate": 3.76562033658568e-05, "loss": 1.0226, "step": 1960 }, { "epoch": 0.07128383268200897, "grad_norm": 1.3149514198303223, "learning_rate": 3.749214661578957e-05, "loss": 1.221, "step": 1970 }, { "epoch": 0.07164567954841511, "grad_norm": 0.8975876569747925, "learning_rate": 3.732767912771153e-05, "loss": 1.0484, "step": 1980 }, { "epoch": 0.07200752641482125, "grad_norm": 1.7094556093215942, "learning_rate": 3.716280766223693e-05, "loss": 1.1154, "step": 1990 }, { "epoch": 0.07236937328122739, "grad_norm": 1.4048560857772827, "learning_rate": 3.699753899658596e-05, "loss": 1.1227, "step": 2000 }, { "epoch": 0.07273122014763352, "grad_norm": 1.6184380054473877, "learning_rate": 3.683187992430616e-05, "loss": 1.1265, "step": 2010 }, { "epoch": 0.07309306701403966, "grad_norm": 1.3908544778823853, "learning_rate": 3.666583725499315e-05, "loss": 1.0651, "step": 2020 }, { "epoch": 0.0734549138804458, "grad_norm": 1.3871246576309204, "learning_rate": 3.6499417814010715e-05, "loss": 1.1756, "step": 2030 }, { "epoch": 0.07381676074685194, "grad_norm": 1.1331907510757446, "learning_rate": 3.6332628442210255e-05, "loss": 1.0245, "step": 2040 }, { "epoch": 0.07417860761325806, "grad_norm": 1.8790123462677002, "learning_rate": 3.616547599564958e-05, "loss": 1.2065, "step": 2050 }, { "epoch": 0.0745404544796642, "grad_norm": 1.9032275676727295, "learning_rate": 3.599796734531105e-05, "loss": 1.014, "step": 2060 }, { "epoch": 0.07490230134607034, "grad_norm": 2.346637487411499, "learning_rate": 3.5830109376819235e-05, "loss": 1.3065, "step": 2070 }, { "epoch": 0.07526414821247648, "grad_norm": 1.2389724254608154, "learning_rate": 3.566190899015774e-05, "loss": 1.1081, "step": 2080 }, { "epoch": 0.07562599507888261, "grad_norm": 0.932698130607605, "learning_rate": 3.5493373099385677e-05, "loss": 1.1203, "step": 2090 }, { "epoch": 0.07598784194528875, "grad_norm": 1.388152003288269, "learning_rate": 3.5324508632353394e-05, "loss": 1.1712, "step": 2100 }, { "epoch": 0.07598784194528875, "eval_loss": 1.0178335905075073, "eval_runtime": 67.9026, "eval_samples_per_second": 7.363, "eval_steps_per_second": 7.363, "step": 2100 }, { "epoch": 0.07634968881169489, "grad_norm": 1.2164424657821655, "learning_rate": 3.515532253041774e-05, "loss": 1.2461, "step": 2110 }, { "epoch": 0.07671153567810103, "grad_norm": 1.2548147439956665, "learning_rate": 3.498582174815671e-05, "loss": 0.9243, "step": 2120 }, { "epoch": 0.07707338254450717, "grad_norm": 0.979133665561676, "learning_rate": 3.481601325308357e-05, "loss": 1.0301, "step": 2130 }, { "epoch": 0.0774352294109133, "grad_norm": 2.0382702350616455, "learning_rate": 3.4645904025360455e-05, "loss": 1.0064, "step": 2140 }, { "epoch": 0.07779707627731944, "grad_norm": 1.809077501296997, "learning_rate": 3.447550105751145e-05, "loss": 1.4113, "step": 2150 }, { "epoch": 0.07815892314372558, "grad_norm": 1.6284586191177368, "learning_rate": 3.4304811354135145e-05, "loss": 1.2318, "step": 2160 }, { "epoch": 0.07852077001013172, "grad_norm": 1.2782353162765503, "learning_rate": 3.4133841931616696e-05, "loss": 1.1572, "step": 2170 }, { "epoch": 0.07888261687653784, "grad_norm": 1.2072981595993042, "learning_rate": 3.396259981783942e-05, "loss": 1.1379, "step": 2180 }, { "epoch": 0.07924446374294399, "grad_norm": 1.3825560808181763, "learning_rate": 3.37910920518959e-05, "loss": 0.9664, "step": 2190 }, { "epoch": 0.07960631060935013, "grad_norm": 3.7854928970336914, "learning_rate": 3.3619325683798646e-05, "loss": 1.0347, "step": 2200 }, { "epoch": 0.07996815747575627, "grad_norm": 1.3101471662521362, "learning_rate": 3.3447307774190296e-05, "loss": 1.1279, "step": 2210 }, { "epoch": 0.08033000434216239, "grad_norm": 1.956546425819397, "learning_rate": 3.327504539405335e-05, "loss": 1.0542, "step": 2220 }, { "epoch": 0.08069185120856853, "grad_norm": 3.031519651412964, "learning_rate": 3.3102545624419583e-05, "loss": 1.1012, "step": 2230 }, { "epoch": 0.08105369807497467, "grad_norm": 1.459944725036621, "learning_rate": 3.292981555607884e-05, "loss": 0.9772, "step": 2240 }, { "epoch": 0.08141554494138081, "grad_norm": 1.2718662023544312, "learning_rate": 3.2756862289287746e-05, "loss": 1.1809, "step": 2250 }, { "epoch": 0.08141554494138081, "eval_loss": 1.0176585912704468, "eval_runtime": 67.5983, "eval_samples_per_second": 7.397, "eval_steps_per_second": 7.397, "step": 2250 }, { "epoch": 0.08177739180778694, "grad_norm": 0.9579117298126221, "learning_rate": 3.258369293347764e-05, "loss": 1.1362, "step": 2260 }, { "epoch": 0.08213923867419308, "grad_norm": 1.5445349216461182, "learning_rate": 3.241031460696251e-05, "loss": 1.0691, "step": 2270 }, { "epoch": 0.08250108554059922, "grad_norm": 1.1376118659973145, "learning_rate": 3.223673443664627e-05, "loss": 1.1405, "step": 2280 }, { "epoch": 0.08286293240700536, "grad_norm": 0.8457480669021606, "learning_rate": 3.206295955772987e-05, "loss": 1.1196, "step": 2290 }, { "epoch": 0.08322477927341149, "grad_norm": 0.8977763652801514, "learning_rate": 3.188899711341793e-05, "loss": 1.0519, "step": 2300 }, { "epoch": 0.08358662613981763, "grad_norm": 0.9885033965110779, "learning_rate": 3.171485425462518e-05, "loss": 1.1251, "step": 2310 }, { "epoch": 0.08394847300622377, "grad_norm": 2.3595972061157227, "learning_rate": 3.15405381396825e-05, "loss": 1.15, "step": 2320 }, { "epoch": 0.08431031987262991, "grad_norm": 1.94205641746521, "learning_rate": 3.136605593404258e-05, "loss": 1.1824, "step": 2330 }, { "epoch": 0.08467216673903603, "grad_norm": 1.0730341672897339, "learning_rate": 3.119141480998553e-05, "loss": 0.9284, "step": 2340 }, { "epoch": 0.08503401360544217, "grad_norm": 1.1775847673416138, "learning_rate": 3.101662194632392e-05, "loss": 1.1064, "step": 2350 }, { "epoch": 0.08539586047184831, "grad_norm": 1.1937066316604614, "learning_rate": 3.0841684528107766e-05, "loss": 1.2703, "step": 2360 }, { "epoch": 0.08575770733825445, "grad_norm": 3.568132162094116, "learning_rate": 3.066660974632914e-05, "loss": 1.0436, "step": 2370 }, { "epoch": 0.08611955420466058, "grad_norm": 0.9505758285522461, "learning_rate": 3.0491404797626605e-05, "loss": 1.1975, "step": 2380 }, { "epoch": 0.08648140107106672, "grad_norm": 1.142575740814209, "learning_rate": 3.031607688398936e-05, "loss": 1.0677, "step": 2390 }, { "epoch": 0.08684324793747286, "grad_norm": 1.8652774095535278, "learning_rate": 3.0140633212461248e-05, "loss": 1.0845, "step": 2400 }, { "epoch": 0.08684324793747286, "eval_loss": 1.0178141593933105, "eval_runtime": 68.0706, "eval_samples_per_second": 7.345, "eval_steps_per_second": 7.345, "step": 2400 }, { "epoch": 0.087205094803879, "grad_norm": 1.2081557512283325, "learning_rate": 2.9965080994844422e-05, "loss": 1.0088, "step": 2410 }, { "epoch": 0.08756694167028514, "grad_norm": 1.3424208164215088, "learning_rate": 2.978942744740296e-05, "loss": 1.0655, "step": 2420 }, { "epoch": 0.08792878853669127, "grad_norm": 1.820688247680664, "learning_rate": 2.961367979056621e-05, "loss": 1.0138, "step": 2430 }, { "epoch": 0.08829063540309741, "grad_norm": 1.937402367591858, "learning_rate": 2.9437845248631984e-05, "loss": 1.2169, "step": 2440 }, { "epoch": 0.08865248226950355, "grad_norm": 1.6876219511032104, "learning_rate": 2.926193104946961e-05, "loss": 1.1082, "step": 2450 }, { "epoch": 0.08901432913590969, "grad_norm": 2.0569660663604736, "learning_rate": 2.90859444242228e-05, "loss": 0.8941, "step": 2460 }, { "epoch": 0.08937617600231582, "grad_norm": 1.5469706058502197, "learning_rate": 2.8909892607012427e-05, "loss": 1.0006, "step": 2470 }, { "epoch": 0.08973802286872196, "grad_norm": 0.9952434301376343, "learning_rate": 2.8733782834639165e-05, "loss": 0.9643, "step": 2480 }, { "epoch": 0.0900998697351281, "grad_norm": 1.4197076559066772, "learning_rate": 2.8557622346285957e-05, "loss": 0.9211, "step": 2490 }, { "epoch": 0.09046171660153424, "grad_norm": 1.6205191612243652, "learning_rate": 2.8381418383220526e-05, "loss": 1.0245, "step": 2500 }, { "epoch": 0.09082356346794036, "grad_norm": 1.6856911182403564, "learning_rate": 2.8205178188497627e-05, "loss": 0.9578, "step": 2510 }, { "epoch": 0.0911854103343465, "grad_norm": 1.2006592750549316, "learning_rate": 2.8028909006661396e-05, "loss": 1.0817, "step": 2520 }, { "epoch": 0.09154725720075264, "grad_norm": 1.5140140056610107, "learning_rate": 2.78526180834475e-05, "loss": 1.1583, "step": 2530 }, { "epoch": 0.09190910406715878, "grad_norm": 1.0385756492614746, "learning_rate": 2.7676312665485307e-05, "loss": 1.1505, "step": 2540 }, { "epoch": 0.09227095093356491, "grad_norm": 1.3027018308639526, "learning_rate": 2.75e-05, "loss": 1.072, "step": 2550 }, { "epoch": 0.09227095093356491, "eval_loss": 1.0139663219451904, "eval_runtime": 68.3939, "eval_samples_per_second": 7.311, "eval_steps_per_second": 7.311, "step": 2550 }, { "epoch": 0.09263279779997105, "grad_norm": 1.0844429731369019, "learning_rate": 2.7323687334514695e-05, "loss": 1.092, "step": 2560 }, { "epoch": 0.09299464466637719, "grad_norm": 1.1637288331985474, "learning_rate": 2.71473819165525e-05, "loss": 1.1578, "step": 2570 }, { "epoch": 0.09335649153278333, "grad_norm": 1.1265349388122559, "learning_rate": 2.6971090993338606e-05, "loss": 1.2328, "step": 2580 }, { "epoch": 0.09371833839918946, "grad_norm": 1.6455796957015991, "learning_rate": 2.679482181150238e-05, "loss": 1.1561, "step": 2590 }, { "epoch": 0.0940801852655956, "grad_norm": 1.497781753540039, "learning_rate": 2.6618581616779483e-05, "loss": 1.2405, "step": 2600 }, { "epoch": 0.09444203213200174, "grad_norm": 1.4378868341445923, "learning_rate": 2.644237765371404e-05, "loss": 0.9899, "step": 2610 }, { "epoch": 0.09480387899840788, "grad_norm": 1.2380419969558716, "learning_rate": 2.626621716536085e-05, "loss": 1.1065, "step": 2620 }, { "epoch": 0.095165725864814, "grad_norm": 1.0482791662216187, "learning_rate": 2.6090107392987575e-05, "loss": 1.1852, "step": 2630 }, { "epoch": 0.09552757273122015, "grad_norm": 1.1260417699813843, "learning_rate": 2.591405557577721e-05, "loss": 0.9224, "step": 2640 }, { "epoch": 0.09588941959762629, "grad_norm": 1.0875478982925415, "learning_rate": 2.5738068950530398e-05, "loss": 1.1763, "step": 2650 }, { "epoch": 0.09625126646403243, "grad_norm": 1.7674652338027954, "learning_rate": 2.5562154751368014e-05, "loss": 1.1492, "step": 2660 }, { "epoch": 0.09661311333043855, "grad_norm": 1.3366526365280151, "learning_rate": 2.5386320209433798e-05, "loss": 1.1322, "step": 2670 }, { "epoch": 0.09697496019684469, "grad_norm": 1.9675370454788208, "learning_rate": 2.5210572552597046e-05, "loss": 1.0435, "step": 2680 }, { "epoch": 0.09733680706325083, "grad_norm": 1.9823009967803955, "learning_rate": 2.5034919005155583e-05, "loss": 0.9524, "step": 2690 }, { "epoch": 0.09769865392965697, "grad_norm": 1.4506902694702148, "learning_rate": 2.4859366787538754e-05, "loss": 1.0546, "step": 2700 }, { "epoch": 0.09769865392965697, "eval_loss": 1.0141299962997437, "eval_runtime": 68.604, "eval_samples_per_second": 7.288, "eval_steps_per_second": 7.288, "step": 2700 }, { "epoch": 0.0980605007960631, "grad_norm": 1.5831595659255981, "learning_rate": 2.468392311601064e-05, "loss": 0.9996, "step": 2710 }, { "epoch": 0.09842234766246924, "grad_norm": 1.5436463356018066, "learning_rate": 2.4508595202373404e-05, "loss": 0.9825, "step": 2720 }, { "epoch": 0.09878419452887538, "grad_norm": 1.350256085395813, "learning_rate": 2.433339025367087e-05, "loss": 1.0197, "step": 2730 }, { "epoch": 0.09914604139528152, "grad_norm": 2.5600554943084717, "learning_rate": 2.415831547189224e-05, "loss": 1.1426, "step": 2740 }, { "epoch": 0.09950788826168766, "grad_norm": 1.3251597881317139, "learning_rate": 2.3983378053676083e-05, "loss": 0.9772, "step": 2750 }, { "epoch": 0.09986973512809379, "grad_norm": 1.9492292404174805, "learning_rate": 2.3808585190014484e-05, "loss": 0.9923, "step": 2760 }, { "epoch": 0.10023158199449993, "grad_norm": 4.378963947296143, "learning_rate": 2.3633944065957427e-05, "loss": 1.0051, "step": 2770 }, { "epoch": 0.10059342886090607, "grad_norm": 1.2288966178894043, "learning_rate": 2.345946186031751e-05, "loss": 1.0537, "step": 2780 }, { "epoch": 0.10095527572731221, "grad_norm": 0.9232672452926636, "learning_rate": 2.328514574537481e-05, "loss": 1.1619, "step": 2790 }, { "epoch": 0.10131712259371833, "grad_norm": 1.7456896305084229, "learning_rate": 2.311100288658208e-05, "loss": 1.1998, "step": 2800 }, { "epoch": 0.10167896946012447, "grad_norm": 2.0725297927856445, "learning_rate": 2.2937040442270142e-05, "loss": 1.0535, "step": 2810 }, { "epoch": 0.10204081632653061, "grad_norm": 1.4991570711135864, "learning_rate": 2.2763265563353733e-05, "loss": 1.1204, "step": 2820 }, { "epoch": 0.10240266319293675, "grad_norm": 1.201222538948059, "learning_rate": 2.2589685393037495e-05, "loss": 1.0227, "step": 2830 }, { "epoch": 0.10276451005934288, "grad_norm": 11.434679985046387, "learning_rate": 2.241630706652236e-05, "loss": 1.2553, "step": 2840 }, { "epoch": 0.10312635692574902, "grad_norm": 0.8312958478927612, "learning_rate": 2.2243137710712266e-05, "loss": 1.2161, "step": 2850 }, { "epoch": 0.10312635692574902, "eval_loss": 1.0155481100082397, "eval_runtime": 68.0244, "eval_samples_per_second": 7.35, "eval_steps_per_second": 7.35, "step": 2850 }, { "epoch": 0.10348820379215516, "grad_norm": 1.6741582155227661, "learning_rate": 2.2070184443921156e-05, "loss": 1.2261, "step": 2860 }, { "epoch": 0.1038500506585613, "grad_norm": 1.7291451692581177, "learning_rate": 2.1897454375580425e-05, "loss": 1.1574, "step": 2870 }, { "epoch": 0.10421189752496743, "grad_norm": 4.206969261169434, "learning_rate": 2.1724954605946642e-05, "loss": 1.1456, "step": 2880 }, { "epoch": 0.10457374439137357, "grad_norm": 1.6589950323104858, "learning_rate": 2.1552692225809706e-05, "loss": 1.1267, "step": 2890 }, { "epoch": 0.10493559125777971, "grad_norm": 1.2547482252120972, "learning_rate": 2.1380674316201356e-05, "loss": 0.9718, "step": 2900 }, { "epoch": 0.10529743812418585, "grad_norm": 1.0732470750808716, "learning_rate": 2.1208907948104105e-05, "loss": 1.0404, "step": 2910 }, { "epoch": 0.10565928499059198, "grad_norm": 1.315737009048462, "learning_rate": 2.1037400182160584e-05, "loss": 1.0946, "step": 2920 }, { "epoch": 0.10602113185699812, "grad_norm": 1.0078600645065308, "learning_rate": 2.0866158068383306e-05, "loss": 1.082, "step": 2930 }, { "epoch": 0.10638297872340426, "grad_norm": 0.8248572945594788, "learning_rate": 2.069518864586486e-05, "loss": 1.12, "step": 2940 }, { "epoch": 0.1067448255898104, "grad_norm": 1.3113569021224976, "learning_rate": 2.052449894248855e-05, "loss": 1.1006, "step": 2950 }, { "epoch": 0.10710667245621652, "grad_norm": 1.4050854444503784, "learning_rate": 2.035409597463955e-05, "loss": 1.0876, "step": 2960 }, { "epoch": 0.10746851932262266, "grad_norm": 2.4245057106018066, "learning_rate": 2.0183986746916438e-05, "loss": 1.0999, "step": 2970 }, { "epoch": 0.1078303661890288, "grad_norm": 1.2002867460250854, "learning_rate": 2.0014178251843294e-05, "loss": 1.1002, "step": 2980 }, { "epoch": 0.10819221305543494, "grad_norm": 1.3750348091125488, "learning_rate": 1.9844677469582266e-05, "loss": 1.0426, "step": 2990 }, { "epoch": 0.10855405992184107, "grad_norm": 1.6099352836608887, "learning_rate": 1.967549136764661e-05, "loss": 1.0865, "step": 3000 }, { "epoch": 0.10855405992184107, "eval_loss": 1.0141035318374634, "eval_runtime": 68.0329, "eval_samples_per_second": 7.349, "eval_steps_per_second": 7.349, "step": 3000 }, { "epoch": 0.10891590678824721, "grad_norm": 1.0684062242507935, "learning_rate": 1.950662690061433e-05, "loss": 1.0239, "step": 3010 }, { "epoch": 0.10927775365465335, "grad_norm": 5.7095232009887695, "learning_rate": 1.9338091009842258e-05, "loss": 1.0762, "step": 3020 }, { "epoch": 0.10963960052105949, "grad_norm": 1.2373403310775757, "learning_rate": 1.916989062318077e-05, "loss": 1.0276, "step": 3030 }, { "epoch": 0.11000144738746563, "grad_norm": 0.8737964034080505, "learning_rate": 1.900203265468895e-05, "loss": 1.029, "step": 3040 }, { "epoch": 0.11036329425387176, "grad_norm": 1.7825664281845093, "learning_rate": 1.8834524004350432e-05, "loss": 1.204, "step": 3050 }, { "epoch": 0.1107251411202779, "grad_norm": 2.1149163246154785, "learning_rate": 1.8667371557789747e-05, "loss": 1.0395, "step": 3060 }, { "epoch": 0.11108698798668404, "grad_norm": 1.4019514322280884, "learning_rate": 1.8500582185989287e-05, "loss": 0.9915, "step": 3070 }, { "epoch": 0.11144883485309018, "grad_norm": 0.8376224040985107, "learning_rate": 1.8334162745006857e-05, "loss": 1.0396, "step": 3080 }, { "epoch": 0.1118106817194963, "grad_norm": 1.4035556316375732, "learning_rate": 1.8168120075693843e-05, "loss": 1.074, "step": 3090 }, { "epoch": 0.11217252858590245, "grad_norm": 1.9495182037353516, "learning_rate": 1.8002461003414043e-05, "loss": 1.0015, "step": 3100 }, { "epoch": 0.11253437545230859, "grad_norm": 1.8730123043060303, "learning_rate": 1.7837192337763072e-05, "loss": 1.0545, "step": 3110 }, { "epoch": 0.11289622231871473, "grad_norm": 1.1504132747650146, "learning_rate": 1.7672320872288483e-05, "loss": 1.0284, "step": 3120 }, { "epoch": 0.11325806918512085, "grad_norm": 2.172187328338623, "learning_rate": 1.750785338421044e-05, "loss": 0.985, "step": 3130 }, { "epoch": 0.11361991605152699, "grad_norm": 1.6594125032424927, "learning_rate": 1.7343796634143204e-05, "loss": 1.1224, "step": 3140 }, { "epoch": 0.11398176291793313, "grad_norm": 1.0405821800231934, "learning_rate": 1.7180157365817214e-05, "loss": 1.123, "step": 3150 }, { "epoch": 0.11398176291793313, "eval_loss": 1.0120776891708374, "eval_runtime": 69.1058, "eval_samples_per_second": 7.235, "eval_steps_per_second": 7.235, "step": 3150 }, { "epoch": 0.11434360978433927, "grad_norm": 1.7492406368255615, "learning_rate": 1.7016942305801853e-05, "loss": 1.1078, "step": 3160 }, { "epoch": 0.1147054566507454, "grad_norm": 1.5439279079437256, "learning_rate": 1.6854158163228982e-05, "loss": 1.1343, "step": 3170 }, { "epoch": 0.11506730351715154, "grad_norm": 1.453338384628296, "learning_rate": 1.6691811629517104e-05, "loss": 1.0727, "step": 3180 }, { "epoch": 0.11542915038355768, "grad_norm": 1.3687138557434082, "learning_rate": 1.6529909378096355e-05, "loss": 1.1866, "step": 3190 }, { "epoch": 0.11579099724996382, "grad_norm": 1.1038978099822998, "learning_rate": 1.636845806413417e-05, "loss": 1.0286, "step": 3200 }, { "epoch": 0.11615284411636995, "grad_norm": 0.9542801380157471, "learning_rate": 1.6207464324261707e-05, "loss": 0.889, "step": 3210 }, { "epoch": 0.11651469098277609, "grad_norm": 1.8660609722137451, "learning_rate": 1.6046934776301034e-05, "loss": 1.1525, "step": 3220 }, { "epoch": 0.11687653784918223, "grad_norm": 1.022739291191101, "learning_rate": 1.588687601899311e-05, "loss": 1.1592, "step": 3230 }, { "epoch": 0.11723838471558837, "grad_norm": 1.157504916191101, "learning_rate": 1.5727294631726555e-05, "loss": 1.1434, "step": 3240 }, { "epoch": 0.1176002315819945, "grad_norm": 0.8439706563949585, "learning_rate": 1.5568197174267155e-05, "loss": 1.0556, "step": 3250 }, { "epoch": 0.11796207844840063, "grad_norm": 1.4141348600387573, "learning_rate": 1.5409590186488247e-05, "loss": 1.1489, "step": 3260 }, { "epoch": 0.11832392531480677, "grad_norm": 1.321804165840149, "learning_rate": 1.5251480188101872e-05, "loss": 1.1439, "step": 3270 }, { "epoch": 0.11868577218121291, "grad_norm": 1.4006226062774658, "learning_rate": 1.5093873678390796e-05, "loss": 1.1881, "step": 3280 }, { "epoch": 0.11904761904761904, "grad_norm": 1.4769755601882935, "learning_rate": 1.4936777135941329e-05, "loss": 1.1255, "step": 3290 }, { "epoch": 0.11940946591402518, "grad_norm": 1.5317277908325195, "learning_rate": 1.4780197018377037e-05, "loss": 0.9975, "step": 3300 }, { "epoch": 0.11940946591402518, "eval_loss": 1.0121560096740723, "eval_runtime": 67.8243, "eval_samples_per_second": 7.372, "eval_steps_per_second": 7.372, "step": 3300 }, { "epoch": 0.11977131278043132, "grad_norm": 2.143234968185425, "learning_rate": 1.4624139762093247e-05, "loss": 1.0912, "step": 3310 }, { "epoch": 0.12013315964683746, "grad_norm": 1.0862255096435547, "learning_rate": 1.4468611781992537e-05, "loss": 1.0349, "step": 3320 }, { "epoch": 0.1204950065132436, "grad_norm": 1.2058424949645996, "learning_rate": 1.4313619471221022e-05, "loss": 1.0743, "step": 3330 }, { "epoch": 0.12085685337964973, "grad_norm": 1.6770751476287842, "learning_rate": 1.4159169200905515e-05, "loss": 1.0926, "step": 3340 }, { "epoch": 0.12121870024605587, "grad_norm": 3.1638665199279785, "learning_rate": 1.4005267319891719e-05, "loss": 1.1392, "step": 3350 }, { "epoch": 0.12158054711246201, "grad_norm": 1.6747909784317017, "learning_rate": 1.3851920154483133e-05, "loss": 1.02, "step": 3360 }, { "epoch": 0.12194239397886815, "grad_norm": 1.1009001731872559, "learning_rate": 1.3699134008181126e-05, "loss": 1.1063, "step": 3370 }, { "epoch": 0.12230424084527428, "grad_norm": 1.591524362564087, "learning_rate": 1.3546915161425745e-05, "loss": 0.9342, "step": 3380 }, { "epoch": 0.12266608771168042, "grad_norm": 2.062162160873413, "learning_rate": 1.3395269871337586e-05, "loss": 1.1618, "step": 3390 }, { "epoch": 0.12302793457808656, "grad_norm": 1.3654874563217163, "learning_rate": 1.3244204371460562e-05, "loss": 1.223, "step": 3400 }, { "epoch": 0.1233897814444927, "grad_norm": 1.500443935394287, "learning_rate": 1.3093724871505698e-05, "loss": 1.1976, "step": 3410 }, { "epoch": 0.12375162831089882, "grad_norm": 1.17684006690979, "learning_rate": 1.2943837557095845e-05, "loss": 1.0071, "step": 3420 }, { "epoch": 0.12411347517730496, "grad_norm": 1.6402300596237183, "learning_rate": 1.2794548589511433e-05, "loss": 1.0819, "step": 3430 }, { "epoch": 0.1244753220437111, "grad_norm": 1.4662994146347046, "learning_rate": 1.2645864105437201e-05, "loss": 1.1352, "step": 3440 }, { "epoch": 0.12483716891011724, "grad_norm": 0.8377330899238586, "learning_rate": 1.2497790216709914e-05, "loss": 1.1601, "step": 3450 }, { "epoch": 0.12483716891011724, "eval_loss": 1.012189507484436, "eval_runtime": 69.0114, "eval_samples_per_second": 7.245, "eval_steps_per_second": 7.245, "step": 3450 }, { "epoch": 0.12519901577652337, "grad_norm": 1.4229249954223633, "learning_rate": 1.2350333010067184e-05, "loss": 1.0838, "step": 3460 }, { "epoch": 0.1255608626429295, "grad_norm": 1.6762115955352783, "learning_rate": 1.2203498546897221e-05, "loss": 1.2404, "step": 3470 }, { "epoch": 0.12592270950933565, "grad_norm": 1.5850247144699097, "learning_rate": 1.2057292862989693e-05, "loss": 1.1793, "step": 3480 }, { "epoch": 0.1262845563757418, "grad_norm": 1.1561609506607056, "learning_rate": 1.1911721968287635e-05, "loss": 1.0021, "step": 3490 }, { "epoch": 0.12664640324214793, "grad_norm": 2.5090935230255127, "learning_rate": 1.176679184664034e-05, "loss": 0.9957, "step": 3500 }, { "epoch": 0.12700825010855407, "grad_norm": 0.9981018900871277, "learning_rate": 1.1622508455557471e-05, "loss": 1.1019, "step": 3510 }, { "epoch": 0.12737009697496018, "grad_norm": 1.4055578708648682, "learning_rate": 1.1478877725964109e-05, "loss": 1.1383, "step": 3520 }, { "epoch": 0.12773194384136632, "grad_norm": 1.629893183708191, "learning_rate": 1.1335905561956992e-05, "loss": 1.0497, "step": 3530 }, { "epoch": 0.12809379070777246, "grad_norm": 1.537331223487854, "learning_rate": 1.1193597840561793e-05, "loss": 1.1707, "step": 3540 }, { "epoch": 0.1284556375741786, "grad_norm": 1.1500210762023926, "learning_rate": 1.1051960411491561e-05, "loss": 0.9467, "step": 3550 }, { "epoch": 0.12881748444058475, "grad_norm": 1.6494172811508179, "learning_rate": 1.0910999096906248e-05, "loss": 1.2392, "step": 3560 }, { "epoch": 0.12917933130699089, "grad_norm": 1.4481260776519775, "learning_rate": 1.0770719691173388e-05, "loss": 0.9995, "step": 3570 }, { "epoch": 0.12954117817339703, "grad_norm": 1.3375111818313599, "learning_rate": 1.0631127960629924e-05, "loss": 1.1152, "step": 3580 }, { "epoch": 0.12990302503980317, "grad_norm": 2.4674742221832275, "learning_rate": 1.0492229643345136e-05, "loss": 1.1453, "step": 3590 }, { "epoch": 0.13026487190620928, "grad_norm": 1.7152785062789917, "learning_rate": 1.0354030448884829e-05, "loss": 1.1212, "step": 3600 }, { "epoch": 0.13026487190620928, "eval_loss": 1.0104962587356567, "eval_runtime": 70.1532, "eval_samples_per_second": 7.127, "eval_steps_per_second": 7.127, "step": 3600 }, { "epoch": 0.13062671877261542, "grad_norm": 1.0393935441970825, "learning_rate": 1.02165360580766e-05, "loss": 1.0769, "step": 3610 }, { "epoch": 0.13098856563902156, "grad_norm": 1.1804404258728027, "learning_rate": 1.0079752122776338e-05, "loss": 1.2117, "step": 3620 }, { "epoch": 0.1313504125054277, "grad_norm": 2.4288265705108643, "learning_rate": 9.94368426563585e-06, "loss": 1.0263, "step": 3630 }, { "epoch": 0.13171225937183384, "grad_norm": 1.774293303489685, "learning_rate": 9.80833807987182e-06, "loss": 0.9905, "step": 3640 }, { "epoch": 0.13207410623823998, "grad_norm": 1.4655253887176514, "learning_rate": 9.673719129035826e-06, "loss": 1.0889, "step": 3650 }, { "epoch": 0.13243595310464612, "grad_norm": 1.2071261405944824, "learning_rate": 9.53983294678566e-06, "loss": 1.1737, "step": 3660 }, { "epoch": 0.13279779997105226, "grad_norm": 1.2400939464569092, "learning_rate": 9.406685036657904e-06, "loss": 0.9934, "step": 3670 }, { "epoch": 0.1331596468374584, "grad_norm": 1.1579022407531738, "learning_rate": 9.27428087184162e-06, "loss": 1.1242, "step": 3680 }, { "epoch": 0.1335214937038645, "grad_norm": 1.5240107774734497, "learning_rate": 9.142625894953431e-06, "loss": 0.9896, "step": 3690 }, { "epoch": 0.13388334057027065, "grad_norm": 1.670318365097046, "learning_rate": 9.011725517813786e-06, "loss": 1.0314, "step": 3700 }, { "epoch": 0.1342451874366768, "grad_norm": 1.226494312286377, "learning_rate": 8.881585121224496e-06, "loss": 1.1641, "step": 3710 }, { "epoch": 0.13460703430308293, "grad_norm": 1.2739923000335693, "learning_rate": 8.752210054747517e-06, "loss": 1.2567, "step": 3720 }, { "epoch": 0.13496888116948907, "grad_norm": 1.7752466201782227, "learning_rate": 8.623605636485119e-06, "loss": 1.1038, "step": 3730 }, { "epoch": 0.13533072803589521, "grad_norm": 1.6884175539016724, "learning_rate": 8.495777152861222e-06, "loss": 1.0835, "step": 3740 }, { "epoch": 0.13569257490230135, "grad_norm": 1.4947561025619507, "learning_rate": 8.368729858404125e-06, "loss": 1.0129, "step": 3750 }, { "epoch": 0.13569257490230135, "eval_loss": 1.0097154378890991, "eval_runtime": 68.9446, "eval_samples_per_second": 7.252, "eval_steps_per_second": 7.252, "step": 3750 }, { "epoch": 0.1360544217687075, "grad_norm": 1.044838547706604, "learning_rate": 8.242468975530497e-06, "loss": 1.0651, "step": 3760 }, { "epoch": 0.1364162686351136, "grad_norm": 1.2362924814224243, "learning_rate": 8.116999694330684e-06, "loss": 1.2429, "step": 3770 }, { "epoch": 0.13677811550151975, "grad_norm": 1.666659951210022, "learning_rate": 7.99232717235541e-06, "loss": 1.119, "step": 3780 }, { "epoch": 0.1371399623679259, "grad_norm": 1.5528266429901123, "learning_rate": 7.86845653440376e-06, "loss": 1.1454, "step": 3790 }, { "epoch": 0.13750180923433203, "grad_norm": 1.15411376953125, "learning_rate": 7.745392872312495e-06, "loss": 1.2258, "step": 3800 }, { "epoch": 0.13786365610073817, "grad_norm": 1.49830961227417, "learning_rate": 7.623141244746736e-06, "loss": 1.089, "step": 3810 }, { "epoch": 0.1382255029671443, "grad_norm": 1.9670263528823853, "learning_rate": 7.5017066769920735e-06, "loss": 1.0233, "step": 3820 }, { "epoch": 0.13858734983355045, "grad_norm": 1.298459529876709, "learning_rate": 7.381094160747963e-06, "loss": 1.1055, "step": 3830 }, { "epoch": 0.1389491966999566, "grad_norm": 1.5746151208877563, "learning_rate": 7.261308653922539e-06, "loss": 1.0377, "step": 3840 }, { "epoch": 0.1393110435663627, "grad_norm": 1.7434015274047852, "learning_rate": 7.1423550804288275e-06, "loss": 1.1685, "step": 3850 }, { "epoch": 0.13967289043276884, "grad_norm": 0.8890132904052734, "learning_rate": 7.024238329982311e-06, "loss": 1.0089, "step": 3860 }, { "epoch": 0.14003473729917498, "grad_norm": 1.4679300785064697, "learning_rate": 6.906963257899975e-06, "loss": 1.1897, "step": 3870 }, { "epoch": 0.14039658416558112, "grad_norm": 1.2193775177001953, "learning_rate": 6.7905346849007014e-06, "loss": 1.0448, "step": 3880 }, { "epoch": 0.14075843103198726, "grad_norm": 1.275647759437561, "learning_rate": 6.674957396907109e-06, "loss": 1.0907, "step": 3890 }, { "epoch": 0.1411202778983934, "grad_norm": 1.4282574653625488, "learning_rate": 6.560236144848803e-06, "loss": 1.001, "step": 3900 }, { "epoch": 0.1411202778983934, "eval_loss": 1.0095244646072388, "eval_runtime": 69.9976, "eval_samples_per_second": 7.143, "eval_steps_per_second": 7.143, "step": 3900 }, { "epoch": 0.14148212476479954, "grad_norm": 1.5553085803985596, "learning_rate": 6.4463756444671446e-06, "loss": 1.1813, "step": 3910 }, { "epoch": 0.14184397163120568, "grad_norm": 1.0424302816390991, "learning_rate": 6.333380576121334e-06, "loss": 1.0278, "step": 3920 }, { "epoch": 0.14220581849761182, "grad_norm": 1.413281798362732, "learning_rate": 6.221255584596061e-06, "loss": 1.1354, "step": 3930 }, { "epoch": 0.14256766536401794, "grad_norm": 1.1732711791992188, "learning_rate": 6.110005278910572e-06, "loss": 1.1993, "step": 3940 }, { "epoch": 0.14292951223042408, "grad_norm": 1.0241459608078003, "learning_rate": 5.999634232129181e-06, "loss": 0.9585, "step": 3950 }, { "epoch": 0.14329135909683022, "grad_norm": 1.531690239906311, "learning_rate": 5.890146981173336e-06, "loss": 1.0821, "step": 3960 }, { "epoch": 0.14365320596323636, "grad_norm": 1.4708391427993774, "learning_rate": 5.781548026635087e-06, "loss": 1.0692, "step": 3970 }, { "epoch": 0.1440150528296425, "grad_norm": 1.1054376363754272, "learning_rate": 5.673841832592114e-06, "loss": 0.9396, "step": 3980 }, { "epoch": 0.14437689969604864, "grad_norm": 1.9526485204696655, "learning_rate": 5.56703282642418e-06, "loss": 1.2189, "step": 3990 }, { "epoch": 0.14473874656245478, "grad_norm": 3.9769842624664307, "learning_rate": 5.461125398631196e-06, "loss": 1.0525, "step": 4000 }, { "epoch": 0.14510059342886092, "grad_norm": 1.3415507078170776, "learning_rate": 5.356123902652707e-06, "loss": 1.1193, "step": 4010 }, { "epoch": 0.14546244029526703, "grad_norm": 1.0923255681991577, "learning_rate": 5.252032654688949e-06, "loss": 1.1096, "step": 4020 }, { "epoch": 0.14582428716167317, "grad_norm": 1.1630265712738037, "learning_rate": 5.148855933523428e-06, "loss": 1.0534, "step": 4030 }, { "epoch": 0.1461861340280793, "grad_norm": 1.2223676443099976, "learning_rate": 5.046597980347035e-06, "loss": 1.1897, "step": 4040 }, { "epoch": 0.14654798089448545, "grad_norm": 0.836165726184845, "learning_rate": 4.945262998583711e-06, "loss": 1.045, "step": 4050 }, { "epoch": 0.14654798089448545, "eval_loss": 1.0101861953735352, "eval_runtime": 69.2573, "eval_samples_per_second": 7.219, "eval_steps_per_second": 7.219, "step": 4050 }, { "epoch": 0.1469098277608916, "grad_norm": 1.161815881729126, "learning_rate": 4.844855153717654e-06, "loss": 1.0237, "step": 4060 }, { "epoch": 0.14727167462729773, "grad_norm": 1.7267327308654785, "learning_rate": 4.745378573122101e-06, "loss": 1.0998, "step": 4070 }, { "epoch": 0.14763352149370387, "grad_norm": 1.8059628009796143, "learning_rate": 4.646837345889642e-06, "loss": 1.1101, "step": 4080 }, { "epoch": 0.14799536836011, "grad_norm": 1.074566125869751, "learning_rate": 4.5492355226641775e-06, "loss": 1.0435, "step": 4090 }, { "epoch": 0.14835721522651613, "grad_norm": 1.1857138872146606, "learning_rate": 4.452577115474384e-06, "loss": 1.0054, "step": 4100 }, { "epoch": 0.14871906209292227, "grad_norm": 1.183436393737793, "learning_rate": 4.3568660975687884e-06, "loss": 1.089, "step": 4110 }, { "epoch": 0.1490809089593284, "grad_norm": 1.2938114404678345, "learning_rate": 4.262106403252474e-06, "loss": 1.02, "step": 4120 }, { "epoch": 0.14944275582573455, "grad_norm": 1.0263599157333374, "learning_rate": 4.168301927725312e-06, "loss": 1.0283, "step": 4130 }, { "epoch": 0.1498046026921407, "grad_norm": 1.2773736715316772, "learning_rate": 4.075456526921887e-06, "loss": 1.12, "step": 4140 }, { "epoch": 0.15016644955854683, "grad_norm": 1.073967456817627, "learning_rate": 3.983574017352983e-06, "loss": 1.0542, "step": 4150 }, { "epoch": 0.15052829642495297, "grad_norm": 1.1125755310058594, "learning_rate": 3.8926581759486824e-06, "loss": 1.0694, "step": 4160 }, { "epoch": 0.1508901432913591, "grad_norm": 1.8294768333435059, "learning_rate": 3.8027127399031364e-06, "loss": 1.1087, "step": 4170 }, { "epoch": 0.15125199015776522, "grad_norm": 1.3126678466796875, "learning_rate": 3.7137414065209284e-06, "loss": 1.1202, "step": 4180 }, { "epoch": 0.15161383702417136, "grad_norm": 1.0770281553268433, "learning_rate": 3.6257478330650916e-06, "loss": 1.1615, "step": 4190 }, { "epoch": 0.1519756838905775, "grad_norm": 0.8374683856964111, "learning_rate": 3.5387356366067913e-06, "loss": 1.0404, "step": 4200 }, { "epoch": 0.1519756838905775, "eval_loss": 1.0101358890533447, "eval_runtime": 68.4101, "eval_samples_per_second": 7.309, "eval_steps_per_second": 7.309, "step": 4200 }, { "epoch": 0.15233753075698364, "grad_norm": 1.4065223932266235, "learning_rate": 3.45270839387662e-06, "loss": 0.9871, "step": 4210 }, { "epoch": 0.15269937762338978, "grad_norm": 0.9496563076972961, "learning_rate": 3.3676696411175727e-06, "loss": 1.0336, "step": 4220 }, { "epoch": 0.15306122448979592, "grad_norm": 1.5397430658340454, "learning_rate": 3.283622873939705e-06, "loss": 1.1096, "step": 4230 }, { "epoch": 0.15342307135620206, "grad_norm": 1.4058085680007935, "learning_rate": 3.2005715471764303e-06, "loss": 1.0993, "step": 4240 }, { "epoch": 0.1537849182226082, "grad_norm": 1.152244210243225, "learning_rate": 3.118519074742497e-06, "loss": 1.0168, "step": 4250 }, { "epoch": 0.15414676508901434, "grad_norm": 1.0403828620910645, "learning_rate": 3.037468829493679e-06, "loss": 1.1511, "step": 4260 }, { "epoch": 0.15450861195542046, "grad_norm": 2.3750505447387695, "learning_rate": 2.9574241430880926e-06, "loss": 1.0394, "step": 4270 }, { "epoch": 0.1548704588218266, "grad_norm": 1.343712568283081, "learning_rate": 2.878388305849292e-06, "loss": 1.0915, "step": 4280 }, { "epoch": 0.15523230568823274, "grad_norm": 1.6926482915878296, "learning_rate": 2.8003645666309768e-06, "loss": 1.1844, "step": 4290 }, { "epoch": 0.15559415255463888, "grad_norm": 1.7009713649749756, "learning_rate": 2.7233561326834765e-06, "loss": 1.1362, "step": 4300 }, { "epoch": 0.15595599942104502, "grad_norm": 1.9161940813064575, "learning_rate": 2.647366169521881e-06, "loss": 0.8494, "step": 4310 }, { "epoch": 0.15631784628745116, "grad_norm": 1.526503086090088, "learning_rate": 2.5723978007959507e-06, "loss": 1.0053, "step": 4320 }, { "epoch": 0.1566796931538573, "grad_norm": 1.46095871925354, "learning_rate": 2.4984541081616895e-06, "loss": 1.0156, "step": 4330 }, { "epoch": 0.15704154002026344, "grad_norm": 1.0103462934494019, "learning_rate": 2.4255381311546833e-06, "loss": 1.0062, "step": 4340 }, { "epoch": 0.15740338688666955, "grad_norm": 3.1792240142822266, "learning_rate": 2.3536528670651595e-06, "loss": 1.1572, "step": 4350 }, { "epoch": 0.15740338688666955, "eval_loss": 1.0096508264541626, "eval_runtime": 69.6507, "eval_samples_per_second": 7.179, "eval_steps_per_second": 7.179, "step": 4350 }, { "epoch": 0.1577652337530757, "grad_norm": 1.453674077987671, "learning_rate": 2.2828012708147603e-06, "loss": 1.1574, "step": 4360 }, { "epoch": 0.15812708061948183, "grad_norm": 1.140619158744812, "learning_rate": 2.2129862548351094e-06, "loss": 1.1826, "step": 4370 }, { "epoch": 0.15848892748588797, "grad_norm": 2.512465715408325, "learning_rate": 2.1442106889480615e-06, "loss": 1.0294, "step": 4380 }, { "epoch": 0.1588507743522941, "grad_norm": 1.5785260200500488, "learning_rate": 2.0764774002477615e-06, "loss": 1.1146, "step": 4390 }, { "epoch": 0.15921262121870025, "grad_norm": 0.9974178671836853, "learning_rate": 2.009789172984405e-06, "loss": 1.0514, "step": 4400 }, { "epoch": 0.1595744680851064, "grad_norm": 0.9857182502746582, "learning_rate": 1.9441487484498223e-06, "loss": 0.9823, "step": 4410 }, { "epoch": 0.15993631495151253, "grad_norm": 1.258324384689331, "learning_rate": 1.8795588248647634e-06, "loss": 1.004, "step": 4420 }, { "epoch": 0.16029816181791864, "grad_norm": 0.8922631144523621, "learning_rate": 1.8160220572680145e-06, "loss": 1.0236, "step": 4430 }, { "epoch": 0.16066000868432478, "grad_norm": 1.073943853378296, "learning_rate": 1.753541057407227e-06, "loss": 1.1361, "step": 4440 }, { "epoch": 0.16102185555073092, "grad_norm": 0.8987687826156616, "learning_rate": 1.692118393631588e-06, "loss": 1.1352, "step": 4450 }, { "epoch": 0.16138370241713706, "grad_norm": 1.6021450757980347, "learning_rate": 1.6317565907862317e-06, "loss": 1.0586, "step": 4460 }, { "epoch": 0.1617455492835432, "grad_norm": 1.2443656921386719, "learning_rate": 1.5724581301084432e-06, "loss": 1.0626, "step": 4470 }, { "epoch": 0.16210739614994935, "grad_norm": 0.954608678817749, "learning_rate": 1.5142254491256988e-06, "loss": 1.1989, "step": 4480 }, { "epoch": 0.16246924301635549, "grad_norm": 3.4262044429779053, "learning_rate": 1.4570609415554178e-06, "loss": 1.2245, "step": 4490 }, { "epoch": 0.16283108988276163, "grad_norm": 1.2041702270507812, "learning_rate": 1.4009669572066124e-06, "loss": 1.0698, "step": 4500 }, { "epoch": 0.16283108988276163, "eval_loss": 1.0091307163238525, "eval_runtime": 67.7843, "eval_samples_per_second": 7.376, "eval_steps_per_second": 7.376, "step": 4500 }, { "epoch": 0.16319293674916774, "grad_norm": 1.188515067100525, "learning_rate": 1.345945801883278e-06, "loss": 1.1011, "step": 4510 }, { "epoch": 0.16355478361557388, "grad_norm": 1.1092623472213745, "learning_rate": 1.2919997372896026e-06, "loss": 1.1452, "step": 4520 }, { "epoch": 0.16391663048198002, "grad_norm": 1.1018388271331787, "learning_rate": 1.2391309809370159e-06, "loss": 1.047, "step": 4530 }, { "epoch": 0.16427847734838616, "grad_norm": 3.1872828006744385, "learning_rate": 1.18734170605301e-06, "loss": 0.9842, "step": 4540 }, { "epoch": 0.1646403242147923, "grad_norm": 1.7638747692108154, "learning_rate": 1.136634041491834e-06, "loss": 1.0505, "step": 4550 }, { "epoch": 0.16500217108119844, "grad_norm": 1.1123310327529907, "learning_rate": 1.0870100716469694e-06, "loss": 1.0477, "step": 4560 }, { "epoch": 0.16536401794760458, "grad_norm": 1.0227220058441162, "learning_rate": 1.0384718363654598e-06, "loss": 0.9642, "step": 4570 }, { "epoch": 0.16572586481401072, "grad_norm": 1.2616156339645386, "learning_rate": 9.910213308640359e-07, "loss": 0.9172, "step": 4580 }, { "epoch": 0.16608771168041686, "grad_norm": 1.3117682933807373, "learning_rate": 9.446605056471311e-07, "loss": 1.0213, "step": 4590 }, { "epoch": 0.16644955854682297, "grad_norm": 2.8420636653900146, "learning_rate": 8.993912664266901e-07, "loss": 0.9927, "step": 4600 }, { "epoch": 0.1668114054132291, "grad_norm": 1.8774317502975464, "learning_rate": 8.5521547404383e-07, "loss": 1.0472, "step": 4610 }, { "epoch": 0.16717325227963525, "grad_norm": 0.9148290753364563, "learning_rate": 8.121349443923473e-07, "loss": 1.1518, "step": 4620 }, { "epoch": 0.1675350991460414, "grad_norm": 1.0895439386367798, "learning_rate": 7.701514483440844e-07, "loss": 1.1432, "step": 4630 }, { "epoch": 0.16789694601244753, "grad_norm": 1.0801299810409546, "learning_rate": 7.292667116761223e-07, "loss": 1.0563, "step": 4640 }, { "epoch": 0.16825879287885367, "grad_norm": 1.8083324432373047, "learning_rate": 6.894824149998505e-07, "loss": 0.9847, "step": 4650 }, { "epoch": 0.16825879287885367, "eval_loss": 1.00955331325531, "eval_runtime": 70.6613, "eval_samples_per_second": 7.076, "eval_steps_per_second": 7.076, "step": 4650 }, { "epoch": 0.16862063974525981, "grad_norm": 1.1133240461349487, "learning_rate": 6.508001936918873e-07, "loss": 1.1913, "step": 4660 }, { "epoch": 0.16898248661166596, "grad_norm": 1.0921710729599, "learning_rate": 6.132216378268379e-07, "loss": 1.1986, "step": 4670 }, { "epoch": 0.16934433347807207, "grad_norm": 1.8579936027526855, "learning_rate": 5.767482921119461e-07, "loss": 1.1056, "step": 4680 }, { "epoch": 0.1697061803444782, "grad_norm": 1.5773149728775024, "learning_rate": 5.413816558236007e-07, "loss": 1.0688, "step": 4690 }, { "epoch": 0.17006802721088435, "grad_norm": 3.157620906829834, "learning_rate": 5.071231827457004e-07, "loss": 1.1053, "step": 4700 }, { "epoch": 0.1704298740772905, "grad_norm": 1.8579378128051758, "learning_rate": 4.739742811098946e-07, "loss": 1.2058, "step": 4710 }, { "epoch": 0.17079172094369663, "grad_norm": 2.6028099060058594, "learning_rate": 4.4193631353768414e-07, "loss": 1.1532, "step": 4720 }, { "epoch": 0.17115356781010277, "grad_norm": 1.9248805046081543, "learning_rate": 4.1101059698443965e-07, "loss": 1.2536, "step": 4730 }, { "epoch": 0.1715154146765089, "grad_norm": 1.356221079826355, "learning_rate": 3.8119840268523914e-07, "loss": 1.0921, "step": 4740 }, { "epoch": 0.17187726154291505, "grad_norm": 2.5284345149993896, "learning_rate": 3.525009561026202e-07, "loss": 1.0666, "step": 4750 }, { "epoch": 0.17223910840932116, "grad_norm": 0.9027464389801025, "learning_rate": 3.2491943687621873e-07, "loss": 1.2425, "step": 4760 }, { "epoch": 0.1726009552757273, "grad_norm": 3.553765296936035, "learning_rate": 2.984549787742552e-07, "loss": 1.1428, "step": 4770 }, { "epoch": 0.17296280214213344, "grad_norm": 1.0256719589233398, "learning_rate": 2.731086696469501e-07, "loss": 1.1501, "step": 4780 }, { "epoch": 0.17332464900853958, "grad_norm": 0.8957388997077942, "learning_rate": 2.4888155138179576e-07, "loss": 1.1053, "step": 4790 }, { "epoch": 0.17368649587494572, "grad_norm": 1.2737832069396973, "learning_rate": 2.2577461986073356e-07, "loss": 0.9279, "step": 4800 }, { "epoch": 0.17368649587494572, "eval_loss": 1.0090162754058838, "eval_runtime": 68.1211, "eval_samples_per_second": 7.34, "eval_steps_per_second": 7.34, "step": 4800 }, { "epoch": 0.17404834274135186, "grad_norm": 3.6224241256713867, "learning_rate": 2.0378882491921159e-07, "loss": 1.0616, "step": 4810 }, { "epoch": 0.174410189607758, "grad_norm": 1.605273723602295, "learning_rate": 1.8292507030715362e-07, "loss": 1.1681, "step": 4820 }, { "epoch": 0.17477203647416414, "grad_norm": 1.325554609298706, "learning_rate": 1.6318421365179055e-07, "loss": 1.0721, "step": 4830 }, { "epoch": 0.17513388334057028, "grad_norm": 1.8882009983062744, "learning_rate": 1.4456706642242134e-07, "loss": 1.1359, "step": 4840 }, { "epoch": 0.1754957302069764, "grad_norm": 1.310950517654419, "learning_rate": 1.2707439389704867e-07, "loss": 1.0071, "step": 4850 }, { "epoch": 0.17585757707338254, "grad_norm": 1.9121229648590088, "learning_rate": 1.1070691513092563e-07, "loss": 1.1355, "step": 4860 }, { "epoch": 0.17621942393978868, "grad_norm": 1.402378797531128, "learning_rate": 9.546530292699863e-08, "loss": 1.1337, "step": 4870 }, { "epoch": 0.17658127080619482, "grad_norm": 1.7570936679840088, "learning_rate": 8.135018380824921e-08, "loss": 1.2101, "step": 4880 }, { "epoch": 0.17694311767260096, "grad_norm": 2.1342835426330566, "learning_rate": 6.836213799193497e-08, "loss": 0.9914, "step": 4890 }, { "epoch": 0.1773049645390071, "grad_norm": 1.0870988368988037, "learning_rate": 5.6501699365750784e-08, "loss": 1.1218, "step": 4900 }, { "epoch": 0.17766681140541324, "grad_norm": 1.0682313442230225, "learning_rate": 4.5769355465876964e-08, "loss": 1.0238, "step": 4910 }, { "epoch": 0.17802865827181938, "grad_norm": 1.6820406913757324, "learning_rate": 3.616554745692946e-08, "loss": 1.2016, "step": 4920 }, { "epoch": 0.1783905051382255, "grad_norm": 1.4748146533966064, "learning_rate": 2.7690670113848792e-08, "loss": 1.0566, "step": 4930 }, { "epoch": 0.17875235200463163, "grad_norm": 1.8944076299667358, "learning_rate": 2.034507180563916e-08, "loss": 1.1868, "step": 4940 }, { "epoch": 0.17911419887103777, "grad_norm": 1.0298928022384644, "learning_rate": 1.4129054481082926e-08, "loss": 0.9945, "step": 4950 }, { "epoch": 0.17911419887103777, "eval_loss": 1.0089582204818726, "eval_runtime": 68.6655, "eval_samples_per_second": 7.282, "eval_steps_per_second": 7.282, "step": 4950 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.430239058215649e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }