{ "best_metric": 0.8714, "best_model_checkpoint": "checkpoint/swin-tiny/checkpoint-30969", "epoch": 93.0, "eval_steps": 500, "global_step": 30969, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 5.0367608070373535, "learning_rate": 9.998998998999e-06, "loss": 4.6422, "step": 10 }, { "epoch": 0.06, "grad_norm": 4.677215099334717, "learning_rate": 9.997997997998e-06, "loss": 4.5999, "step": 20 }, { "epoch": 0.09, "grad_norm": 4.010698318481445, "learning_rate": 9.996996996996998e-06, "loss": 4.5735, "step": 30 }, { "epoch": 0.12, "grad_norm": 4.1278815269470215, "learning_rate": 9.995995995995997e-06, "loss": 4.5379, "step": 40 }, { "epoch": 0.15, "grad_norm": 4.357997894287109, "learning_rate": 9.994994994994995e-06, "loss": 4.5355, "step": 50 }, { "epoch": 0.18, "grad_norm": 4.672063827514648, "learning_rate": 9.993993993993994e-06, "loss": 4.5144, "step": 60 }, { "epoch": 0.21, "grad_norm": 4.977757930755615, "learning_rate": 9.992992992992994e-06, "loss": 4.4731, "step": 70 }, { "epoch": 0.24, "grad_norm": 4.93468713760376, "learning_rate": 9.991991991991993e-06, "loss": 4.4178, "step": 80 }, { "epoch": 0.27, "grad_norm": 5.7341413497924805, "learning_rate": 9.990990990990992e-06, "loss": 4.4057, "step": 90 }, { "epoch": 0.3, "grad_norm": 6.587381362915039, "learning_rate": 9.989989989989992e-06, "loss": 4.3499, "step": 100 }, { "epoch": 0.33, "grad_norm": 8.376825332641602, "learning_rate": 9.98898898898899e-06, "loss": 4.3059, "step": 110 }, { "epoch": 0.36, "grad_norm": 8.455249786376953, "learning_rate": 9.987987987987989e-06, "loss": 4.262, "step": 120 }, { "epoch": 0.39, "grad_norm": 10.391611099243164, "learning_rate": 9.986986986986988e-06, "loss": 4.1585, "step": 130 }, { "epoch": 0.42, "grad_norm": 9.762007713317871, "learning_rate": 9.985985985985986e-06, "loss": 4.0906, "step": 140 }, { "epoch": 0.45, "grad_norm": 14.533121109008789, "learning_rate": 9.984984984984985e-06, "loss": 4.0148, "step": 150 }, { "epoch": 0.48, "grad_norm": 10.88967514038086, "learning_rate": 9.983983983983985e-06, "loss": 3.9155, "step": 160 }, { "epoch": 0.51, "grad_norm": 12.256155967712402, "learning_rate": 9.982982982982984e-06, "loss": 3.8407, "step": 170 }, { "epoch": 0.54, "grad_norm": 11.568601608276367, "learning_rate": 9.981981981981982e-06, "loss": 3.7481, "step": 180 }, { "epoch": 0.57, "grad_norm": 13.319816589355469, "learning_rate": 9.980980980980983e-06, "loss": 3.6948, "step": 190 }, { "epoch": 0.6, "grad_norm": 13.403288841247559, "learning_rate": 9.979979979979981e-06, "loss": 3.6057, "step": 200 }, { "epoch": 0.63, "grad_norm": 14.109684944152832, "learning_rate": 9.97897897897898e-06, "loss": 3.5081, "step": 210 }, { "epoch": 0.66, "grad_norm": 14.084592819213867, "learning_rate": 9.977977977977978e-06, "loss": 3.4799, "step": 220 }, { "epoch": 0.69, "grad_norm": 11.704780578613281, "learning_rate": 9.976976976976977e-06, "loss": 3.4245, "step": 230 }, { "epoch": 0.72, "grad_norm": 14.129402160644531, "learning_rate": 9.975975975975977e-06, "loss": 3.3685, "step": 240 }, { "epoch": 0.75, "grad_norm": 20.837770462036133, "learning_rate": 9.974974974974976e-06, "loss": 3.2695, "step": 250 }, { "epoch": 0.78, "grad_norm": 20.406055450439453, "learning_rate": 9.973973973973974e-06, "loss": 3.2005, "step": 260 }, { "epoch": 0.81, "grad_norm": 17.758235931396484, "learning_rate": 9.972972972972975e-06, "loss": 3.1784, "step": 270 }, { "epoch": 0.84, "grad_norm": 12.266830444335938, "learning_rate": 9.971971971971973e-06, "loss": 3.1205, "step": 280 }, { "epoch": 0.87, "grad_norm": 15.541472434997559, "learning_rate": 9.970970970970972e-06, "loss": 3.0157, "step": 290 }, { "epoch": 0.9, "grad_norm": 13.052526473999023, "learning_rate": 9.96996996996997e-06, "loss": 2.9835, "step": 300 }, { "epoch": 0.93, "grad_norm": 15.945847511291504, "learning_rate": 9.968968968968969e-06, "loss": 2.9316, "step": 310 }, { "epoch": 0.96, "grad_norm": 14.683821678161621, "learning_rate": 9.96796796796797e-06, "loss": 2.9266, "step": 320 }, { "epoch": 0.99, "grad_norm": 16.834957122802734, "learning_rate": 9.966966966966968e-06, "loss": 2.8188, "step": 330 }, { "epoch": 1.0, "eval_accuracy": 0.4372, "eval_loss": 2.4231927394866943, "eval_runtime": 15.1879, "eval_samples_per_second": 658.419, "eval_steps_per_second": 2.634, "step": 333 }, { "epoch": 1.02, "grad_norm": 14.244836807250977, "learning_rate": 9.965965965965967e-06, "loss": 2.7959, "step": 340 }, { "epoch": 1.05, "grad_norm": 14.228090286254883, "learning_rate": 9.964964964964965e-06, "loss": 2.7708, "step": 350 }, { "epoch": 1.08, "grad_norm": 15.202799797058105, "learning_rate": 9.963963963963965e-06, "loss": 2.7653, "step": 360 }, { "epoch": 1.11, "grad_norm": 24.322519302368164, "learning_rate": 9.962962962962964e-06, "loss": 2.7451, "step": 370 }, { "epoch": 1.14, "grad_norm": 13.843741416931152, "learning_rate": 9.961961961961963e-06, "loss": 2.6411, "step": 380 }, { "epoch": 1.17, "grad_norm": 14.800859451293945, "learning_rate": 9.960960960960961e-06, "loss": 2.6024, "step": 390 }, { "epoch": 1.2, "grad_norm": 11.714691162109375, "learning_rate": 9.95995995995996e-06, "loss": 2.5992, "step": 400 }, { "epoch": 1.23, "grad_norm": 12.235062599182129, "learning_rate": 9.95895895895896e-06, "loss": 2.5759, "step": 410 }, { "epoch": 1.26, "grad_norm": 16.454328536987305, "learning_rate": 9.957957957957959e-06, "loss": 2.5312, "step": 420 }, { "epoch": 1.29, "grad_norm": 13.53082275390625, "learning_rate": 9.956956956956957e-06, "loss": 2.4982, "step": 430 }, { "epoch": 1.32, "grad_norm": 15.531620979309082, "learning_rate": 9.955955955955958e-06, "loss": 2.4553, "step": 440 }, { "epoch": 1.35, "grad_norm": 18.312143325805664, "learning_rate": 9.954954954954956e-06, "loss": 2.4303, "step": 450 }, { "epoch": 1.38, "grad_norm": 14.15007209777832, "learning_rate": 9.953953953953955e-06, "loss": 2.4106, "step": 460 }, { "epoch": 1.41, "grad_norm": 19.596813201904297, "learning_rate": 9.952952952952953e-06, "loss": 2.3459, "step": 470 }, { "epoch": 1.44, "grad_norm": 14.045530319213867, "learning_rate": 9.951951951951952e-06, "loss": 2.3336, "step": 480 }, { "epoch": 1.47, "grad_norm": 17.40843391418457, "learning_rate": 9.950950950950952e-06, "loss": 2.2681, "step": 490 }, { "epoch": 1.5, "grad_norm": 17.60472297668457, "learning_rate": 9.949949949949951e-06, "loss": 2.2444, "step": 500 }, { "epoch": 1.53, "grad_norm": 12.754247665405273, "learning_rate": 9.94894894894895e-06, "loss": 2.2344, "step": 510 }, { "epoch": 1.56, "grad_norm": 16.31540870666504, "learning_rate": 9.94794794794795e-06, "loss": 2.3322, "step": 520 }, { "epoch": 1.59, "grad_norm": 19.430391311645508, "learning_rate": 9.946946946946948e-06, "loss": 2.1679, "step": 530 }, { "epoch": 1.62, "grad_norm": 15.180327415466309, "learning_rate": 9.945945945945947e-06, "loss": 2.2569, "step": 540 }, { "epoch": 1.65, "grad_norm": 16.49898338317871, "learning_rate": 9.944944944944946e-06, "loss": 2.1861, "step": 550 }, { "epoch": 1.68, "grad_norm": 15.13155460357666, "learning_rate": 9.943943943943944e-06, "loss": 2.136, "step": 560 }, { "epoch": 1.71, "grad_norm": 15.498371124267578, "learning_rate": 9.942942942942944e-06, "loss": 2.195, "step": 570 }, { "epoch": 1.74, "grad_norm": 29.57242202758789, "learning_rate": 9.941941941941943e-06, "loss": 2.1519, "step": 580 }, { "epoch": 1.77, "grad_norm": 18.424434661865234, "learning_rate": 9.940940940940942e-06, "loss": 2.1399, "step": 590 }, { "epoch": 1.8, "grad_norm": 17.38271713256836, "learning_rate": 9.93993993993994e-06, "loss": 2.1748, "step": 600 }, { "epoch": 1.83, "grad_norm": 15.109564781188965, "learning_rate": 9.93893893893894e-06, "loss": 2.101, "step": 610 }, { "epoch": 1.86, "grad_norm": 19.079050064086914, "learning_rate": 9.937937937937939e-06, "loss": 2.0935, "step": 620 }, { "epoch": 1.89, "grad_norm": 15.897712707519531, "learning_rate": 9.936936936936938e-06, "loss": 2.0537, "step": 630 }, { "epoch": 1.92, "grad_norm": 15.689188003540039, "learning_rate": 9.935935935935936e-06, "loss": 2.0217, "step": 640 }, { "epoch": 1.95, "grad_norm": 12.665223121643066, "learning_rate": 9.934934934934935e-06, "loss": 2.0426, "step": 650 }, { "epoch": 1.98, "grad_norm": 18.258056640625, "learning_rate": 9.933933933933935e-06, "loss": 2.0411, "step": 660 }, { "epoch": 2.0, "eval_accuracy": 0.6269, "eval_loss": 1.4235050678253174, "eval_runtime": 12.7355, "eval_samples_per_second": 785.208, "eval_steps_per_second": 3.141, "step": 666 }, { "epoch": 2.01, "grad_norm": 16.366247177124023, "learning_rate": 9.932932932932934e-06, "loss": 1.9882, "step": 670 }, { "epoch": 2.04, "grad_norm": 18.89084815979004, "learning_rate": 9.931931931931932e-06, "loss": 1.9636, "step": 680 }, { "epoch": 2.07, "grad_norm": 15.520702362060547, "learning_rate": 9.930930930930933e-06, "loss": 1.909, "step": 690 }, { "epoch": 2.1, "grad_norm": 16.264108657836914, "learning_rate": 9.929929929929931e-06, "loss": 1.9156, "step": 700 }, { "epoch": 2.13, "grad_norm": 12.999436378479004, "learning_rate": 9.92892892892893e-06, "loss": 1.9296, "step": 710 }, { "epoch": 2.16, "grad_norm": 13.601841926574707, "learning_rate": 9.927927927927928e-06, "loss": 1.916, "step": 720 }, { "epoch": 2.19, "grad_norm": 16.44266128540039, "learning_rate": 9.926926926926927e-06, "loss": 1.9489, "step": 730 }, { "epoch": 2.22, "grad_norm": 16.974905014038086, "learning_rate": 9.925925925925927e-06, "loss": 1.8826, "step": 740 }, { "epoch": 2.25, "grad_norm": 13.924214363098145, "learning_rate": 9.924924924924926e-06, "loss": 1.8674, "step": 750 }, { "epoch": 2.28, "grad_norm": 13.93298053741455, "learning_rate": 9.923923923923925e-06, "loss": 1.8513, "step": 760 }, { "epoch": 2.31, "grad_norm": 15.00749397277832, "learning_rate": 9.922922922922925e-06, "loss": 1.8428, "step": 770 }, { "epoch": 2.34, "grad_norm": 17.950767517089844, "learning_rate": 9.921921921921923e-06, "loss": 1.7832, "step": 780 }, { "epoch": 2.37, "grad_norm": 15.769499778747559, "learning_rate": 9.920920920920922e-06, "loss": 1.7948, "step": 790 }, { "epoch": 2.4, "grad_norm": 14.434747695922852, "learning_rate": 9.91991991991992e-06, "loss": 1.8534, "step": 800 }, { "epoch": 2.43, "grad_norm": 17.389707565307617, "learning_rate": 9.91891891891892e-06, "loss": 1.8856, "step": 810 }, { "epoch": 2.46, "grad_norm": 19.550718307495117, "learning_rate": 9.917917917917918e-06, "loss": 1.8047, "step": 820 }, { "epoch": 2.49, "grad_norm": 16.427532196044922, "learning_rate": 9.916916916916918e-06, "loss": 1.7848, "step": 830 }, { "epoch": 2.52, "grad_norm": 13.922759056091309, "learning_rate": 9.915915915915917e-06, "loss": 1.8626, "step": 840 }, { "epoch": 2.55, "grad_norm": 16.29351234436035, "learning_rate": 9.914914914914915e-06, "loss": 1.7767, "step": 850 }, { "epoch": 2.58, "grad_norm": 16.79395866394043, "learning_rate": 9.913913913913916e-06, "loss": 1.665, "step": 860 }, { "epoch": 2.61, "grad_norm": 15.282559394836426, "learning_rate": 9.912912912912914e-06, "loss": 1.7964, "step": 870 }, { "epoch": 2.64, "grad_norm": 14.09691333770752, "learning_rate": 9.911911911911913e-06, "loss": 1.8192, "step": 880 }, { "epoch": 2.67, "grad_norm": 12.11885929107666, "learning_rate": 9.910910910910911e-06, "loss": 1.7374, "step": 890 }, { "epoch": 2.7, "grad_norm": 20.295997619628906, "learning_rate": 9.90990990990991e-06, "loss": 1.7322, "step": 900 }, { "epoch": 2.73, "grad_norm": 15.611212730407715, "learning_rate": 9.90890890890891e-06, "loss": 1.8141, "step": 910 }, { "epoch": 2.76, "grad_norm": 15.991599082946777, "learning_rate": 9.907907907907909e-06, "loss": 1.6688, "step": 920 }, { "epoch": 2.79, "grad_norm": 15.549467086791992, "learning_rate": 9.906906906906907e-06, "loss": 1.7381, "step": 930 }, { "epoch": 2.82, "grad_norm": 14.182323455810547, "learning_rate": 9.905905905905908e-06, "loss": 1.6536, "step": 940 }, { "epoch": 2.85, "grad_norm": 14.629528045654297, "learning_rate": 9.904904904904906e-06, "loss": 1.6791, "step": 950 }, { "epoch": 2.88, "grad_norm": 17.533554077148438, "learning_rate": 9.903903903903905e-06, "loss": 1.6827, "step": 960 }, { "epoch": 2.91, "grad_norm": 15.777318954467773, "learning_rate": 9.902902902902903e-06, "loss": 1.6448, "step": 970 }, { "epoch": 2.94, "grad_norm": 18.05567741394043, "learning_rate": 9.901901901901902e-06, "loss": 1.6161, "step": 980 }, { "epoch": 2.97, "grad_norm": 19.055246353149414, "learning_rate": 9.900900900900902e-06, "loss": 1.7069, "step": 990 }, { "epoch": 3.0, "eval_accuracy": 0.7102, "eval_loss": 1.055800437927246, "eval_runtime": 12.7654, "eval_samples_per_second": 783.37, "eval_steps_per_second": 3.133, "step": 999 }, { "epoch": 3.0, "grad_norm": 17.5743408203125, "learning_rate": 9.899899899899901e-06, "loss": 1.5756, "step": 1000 }, { "epoch": 3.03, "grad_norm": 15.102668762207031, "learning_rate": 9.8988988988989e-06, "loss": 1.5809, "step": 1010 }, { "epoch": 3.06, "grad_norm": 20.911672592163086, "learning_rate": 9.8978978978979e-06, "loss": 1.6458, "step": 1020 }, { "epoch": 3.09, "grad_norm": 13.850802421569824, "learning_rate": 9.896896896896898e-06, "loss": 1.602, "step": 1030 }, { "epoch": 3.12, "grad_norm": 15.126977920532227, "learning_rate": 9.895895895895895e-06, "loss": 1.5504, "step": 1040 }, { "epoch": 3.15, "grad_norm": 16.3144474029541, "learning_rate": 9.894894894894896e-06, "loss": 1.6521, "step": 1050 }, { "epoch": 3.18, "grad_norm": 14.613848686218262, "learning_rate": 9.893893893893894e-06, "loss": 1.5786, "step": 1060 }, { "epoch": 3.21, "grad_norm": 15.141481399536133, "learning_rate": 9.892892892892893e-06, "loss": 1.6004, "step": 1070 }, { "epoch": 3.24, "grad_norm": 18.73663902282715, "learning_rate": 9.891891891891893e-06, "loss": 1.6059, "step": 1080 }, { "epoch": 3.27, "grad_norm": 15.25769329071045, "learning_rate": 9.890890890890892e-06, "loss": 1.634, "step": 1090 }, { "epoch": 3.3, "grad_norm": 16.038311004638672, "learning_rate": 9.88988988988989e-06, "loss": 1.5777, "step": 1100 }, { "epoch": 3.33, "grad_norm": 16.076766967773438, "learning_rate": 9.88888888888889e-06, "loss": 1.6154, "step": 1110 }, { "epoch": 3.36, "grad_norm": 21.815214157104492, "learning_rate": 9.88788788788789e-06, "loss": 1.5889, "step": 1120 }, { "epoch": 3.39, "grad_norm": 19.274152755737305, "learning_rate": 9.886886886886888e-06, "loss": 1.5948, "step": 1130 }, { "epoch": 3.42, "grad_norm": 14.293699264526367, "learning_rate": 9.885885885885886e-06, "loss": 1.5951, "step": 1140 }, { "epoch": 3.45, "grad_norm": 19.594980239868164, "learning_rate": 9.884884884884885e-06, "loss": 1.6085, "step": 1150 }, { "epoch": 3.48, "grad_norm": 19.125072479248047, "learning_rate": 9.883883883883885e-06, "loss": 1.6067, "step": 1160 }, { "epoch": 3.51, "grad_norm": 19.758527755737305, "learning_rate": 9.882882882882884e-06, "loss": 1.6021, "step": 1170 }, { "epoch": 3.54, "grad_norm": 17.54904556274414, "learning_rate": 9.881881881881882e-06, "loss": 1.6093, "step": 1180 }, { "epoch": 3.57, "grad_norm": 15.328018188476562, "learning_rate": 9.880880880880883e-06, "loss": 1.4568, "step": 1190 }, { "epoch": 3.6, "grad_norm": 15.049988746643066, "learning_rate": 9.879879879879881e-06, "loss": 1.4653, "step": 1200 }, { "epoch": 3.63, "grad_norm": 15.316150665283203, "learning_rate": 9.87887887887888e-06, "loss": 1.5626, "step": 1210 }, { "epoch": 3.66, "grad_norm": 15.885323524475098, "learning_rate": 9.877877877877879e-06, "loss": 1.5439, "step": 1220 }, { "epoch": 3.69, "grad_norm": 14.563307762145996, "learning_rate": 9.876876876876877e-06, "loss": 1.5633, "step": 1230 }, { "epoch": 3.72, "grad_norm": 12.845205307006836, "learning_rate": 9.875875875875877e-06, "loss": 1.5821, "step": 1240 }, { "epoch": 3.75, "grad_norm": 15.530064582824707, "learning_rate": 9.874874874874876e-06, "loss": 1.4554, "step": 1250 }, { "epoch": 3.78, "grad_norm": 14.50391674041748, "learning_rate": 9.873873873873875e-06, "loss": 1.5637, "step": 1260 }, { "epoch": 3.81, "grad_norm": 14.745131492614746, "learning_rate": 9.872872872872873e-06, "loss": 1.5024, "step": 1270 }, { "epoch": 3.84, "grad_norm": 15.122355461120605, "learning_rate": 9.871871871871873e-06, "loss": 1.4905, "step": 1280 }, { "epoch": 3.87, "grad_norm": 15.616936683654785, "learning_rate": 9.87087087087087e-06, "loss": 1.5544, "step": 1290 }, { "epoch": 3.9, "grad_norm": 13.670624732971191, "learning_rate": 9.86986986986987e-06, "loss": 1.4526, "step": 1300 }, { "epoch": 3.93, "grad_norm": 12.70984172821045, "learning_rate": 9.86886886886887e-06, "loss": 1.4987, "step": 1310 }, { "epoch": 3.96, "grad_norm": 15.029037475585938, "learning_rate": 9.867867867867868e-06, "loss": 1.5155, "step": 1320 }, { "epoch": 3.99, "grad_norm": 15.3666410446167, "learning_rate": 9.866866866866868e-06, "loss": 1.5722, "step": 1330 }, { "epoch": 4.0, "eval_accuracy": 0.7504, "eval_loss": 0.8656909465789795, "eval_runtime": 12.8593, "eval_samples_per_second": 777.649, "eval_steps_per_second": 3.111, "step": 1332 }, { "epoch": 4.02, "grad_norm": 20.454769134521484, "learning_rate": 9.865865865865867e-06, "loss": 1.5871, "step": 1340 }, { "epoch": 4.05, "grad_norm": 14.027545928955078, "learning_rate": 9.864864864864865e-06, "loss": 1.4042, "step": 1350 }, { "epoch": 4.08, "grad_norm": 25.39883804321289, "learning_rate": 9.863863863863866e-06, "loss": 1.4751, "step": 1360 }, { "epoch": 4.11, "grad_norm": 16.596513748168945, "learning_rate": 9.862862862862864e-06, "loss": 1.4249, "step": 1370 }, { "epoch": 4.14, "grad_norm": 14.449197769165039, "learning_rate": 9.861861861861863e-06, "loss": 1.492, "step": 1380 }, { "epoch": 4.17, "grad_norm": 18.086870193481445, "learning_rate": 9.860860860860861e-06, "loss": 1.4249, "step": 1390 }, { "epoch": 4.2, "grad_norm": 18.966970443725586, "learning_rate": 9.85985985985986e-06, "loss": 1.4132, "step": 1400 }, { "epoch": 4.23, "grad_norm": 19.73224639892578, "learning_rate": 9.85885885885886e-06, "loss": 1.5375, "step": 1410 }, { "epoch": 4.26, "grad_norm": 20.53548812866211, "learning_rate": 9.857857857857859e-06, "loss": 1.4464, "step": 1420 }, { "epoch": 4.29, "grad_norm": 16.314666748046875, "learning_rate": 9.856856856856857e-06, "loss": 1.4919, "step": 1430 }, { "epoch": 4.32, "grad_norm": 14.961431503295898, "learning_rate": 9.855855855855858e-06, "loss": 1.3864, "step": 1440 }, { "epoch": 4.35, "grad_norm": 19.140153884887695, "learning_rate": 9.854854854854856e-06, "loss": 1.3799, "step": 1450 }, { "epoch": 4.38, "grad_norm": 15.397107124328613, "learning_rate": 9.853853853853855e-06, "loss": 1.4314, "step": 1460 }, { "epoch": 4.41, "grad_norm": 17.417640686035156, "learning_rate": 9.852852852852854e-06, "loss": 1.4163, "step": 1470 }, { "epoch": 4.44, "grad_norm": 17.78976058959961, "learning_rate": 9.851851851851852e-06, "loss": 1.3148, "step": 1480 }, { "epoch": 4.47, "grad_norm": 15.137016296386719, "learning_rate": 9.85085085085085e-06, "loss": 1.3684, "step": 1490 }, { "epoch": 4.5, "grad_norm": 18.988313674926758, "learning_rate": 9.849849849849851e-06, "loss": 1.4638, "step": 1500 }, { "epoch": 4.53, "grad_norm": 21.507238388061523, "learning_rate": 9.84884884884885e-06, "loss": 1.4034, "step": 1510 }, { "epoch": 4.56, "grad_norm": 14.309231758117676, "learning_rate": 9.847847847847848e-06, "loss": 1.2893, "step": 1520 }, { "epoch": 4.59, "grad_norm": 17.174428939819336, "learning_rate": 9.846846846846849e-06, "loss": 1.4635, "step": 1530 }, { "epoch": 4.62, "grad_norm": 18.358692169189453, "learning_rate": 9.845845845845845e-06, "loss": 1.3765, "step": 1540 }, { "epoch": 4.65, "grad_norm": 17.51491928100586, "learning_rate": 9.844844844844846e-06, "loss": 1.4361, "step": 1550 }, { "epoch": 4.68, "grad_norm": 15.559100151062012, "learning_rate": 9.843843843843844e-06, "loss": 1.299, "step": 1560 }, { "epoch": 4.71, "grad_norm": 17.001848220825195, "learning_rate": 9.842842842842843e-06, "loss": 1.3858, "step": 1570 }, { "epoch": 4.74, "grad_norm": 17.293275833129883, "learning_rate": 9.841841841841843e-06, "loss": 1.3953, "step": 1580 }, { "epoch": 4.77, "grad_norm": 20.24597930908203, "learning_rate": 9.840840840840842e-06, "loss": 1.4582, "step": 1590 }, { "epoch": 4.8, "grad_norm": 15.464946746826172, "learning_rate": 9.83983983983984e-06, "loss": 1.4351, "step": 1600 }, { "epoch": 4.83, "grad_norm": 20.658306121826172, "learning_rate": 9.83883883883884e-06, "loss": 1.3934, "step": 1610 }, { "epoch": 4.86, "grad_norm": 14.093180656433105, "learning_rate": 9.83783783783784e-06, "loss": 1.4409, "step": 1620 }, { "epoch": 4.89, "grad_norm": 17.276779174804688, "learning_rate": 9.836836836836838e-06, "loss": 1.405, "step": 1630 }, { "epoch": 4.92, "grad_norm": 20.552637100219727, "learning_rate": 9.835835835835836e-06, "loss": 1.3978, "step": 1640 }, { "epoch": 4.95, "grad_norm": 13.823814392089844, "learning_rate": 9.834834834834835e-06, "loss": 1.4429, "step": 1650 }, { "epoch": 4.98, "grad_norm": 18.30078125, "learning_rate": 9.833833833833835e-06, "loss": 1.346, "step": 1660 }, { "epoch": 5.0, "eval_accuracy": 0.7721, "eval_loss": 0.7773870825767517, "eval_runtime": 12.5993, "eval_samples_per_second": 793.697, "eval_steps_per_second": 3.175, "step": 1665 }, { "epoch": 5.02, "grad_norm": 13.90429973602295, "learning_rate": 9.832832832832834e-06, "loss": 1.1999, "step": 1670 }, { "epoch": 5.05, "grad_norm": 17.547555923461914, "learning_rate": 9.831831831831833e-06, "loss": 1.3139, "step": 1680 }, { "epoch": 5.08, "grad_norm": 17.695215225219727, "learning_rate": 9.830830830830833e-06, "loss": 1.3693, "step": 1690 }, { "epoch": 5.11, "grad_norm": 13.60401439666748, "learning_rate": 9.829829829829831e-06, "loss": 1.3242, "step": 1700 }, { "epoch": 5.14, "grad_norm": 17.991899490356445, "learning_rate": 9.82882882882883e-06, "loss": 1.3636, "step": 1710 }, { "epoch": 5.17, "grad_norm": 14.60073471069336, "learning_rate": 9.827827827827829e-06, "loss": 1.2807, "step": 1720 }, { "epoch": 5.2, "grad_norm": 16.130075454711914, "learning_rate": 9.826826826826827e-06, "loss": 1.4117, "step": 1730 }, { "epoch": 5.23, "grad_norm": 24.69490623474121, "learning_rate": 9.825825825825826e-06, "loss": 1.3732, "step": 1740 }, { "epoch": 5.26, "grad_norm": 14.710121154785156, "learning_rate": 9.824824824824826e-06, "loss": 1.2513, "step": 1750 }, { "epoch": 5.29, "grad_norm": 18.907339096069336, "learning_rate": 9.823823823823825e-06, "loss": 1.3643, "step": 1760 }, { "epoch": 5.32, "grad_norm": 14.353303909301758, "learning_rate": 9.822822822822823e-06, "loss": 1.3138, "step": 1770 }, { "epoch": 5.35, "grad_norm": 17.80875015258789, "learning_rate": 9.821821821821824e-06, "loss": 1.259, "step": 1780 }, { "epoch": 5.38, "grad_norm": 22.915904998779297, "learning_rate": 9.82082082082082e-06, "loss": 1.3137, "step": 1790 }, { "epoch": 5.41, "grad_norm": 15.092241287231445, "learning_rate": 9.81981981981982e-06, "loss": 1.3528, "step": 1800 }, { "epoch": 5.44, "grad_norm": 19.2230281829834, "learning_rate": 9.81881881881882e-06, "loss": 1.2926, "step": 1810 }, { "epoch": 5.47, "grad_norm": 15.056241989135742, "learning_rate": 9.817817817817818e-06, "loss": 1.3463, "step": 1820 }, { "epoch": 5.5, "grad_norm": 15.7086181640625, "learning_rate": 9.816816816816818e-06, "loss": 1.3135, "step": 1830 }, { "epoch": 5.53, "grad_norm": 21.66438102722168, "learning_rate": 9.815815815815817e-06, "loss": 1.3012, "step": 1840 }, { "epoch": 5.56, "grad_norm": 17.32779884338379, "learning_rate": 9.814814814814815e-06, "loss": 1.3644, "step": 1850 }, { "epoch": 5.59, "grad_norm": 15.512614250183105, "learning_rate": 9.813813813813816e-06, "loss": 1.2463, "step": 1860 }, { "epoch": 5.62, "grad_norm": 16.07647705078125, "learning_rate": 9.812812812812814e-06, "loss": 1.3773, "step": 1870 }, { "epoch": 5.65, "grad_norm": 13.52031421661377, "learning_rate": 9.811811811811813e-06, "loss": 1.3064, "step": 1880 }, { "epoch": 5.68, "grad_norm": 14.820934295654297, "learning_rate": 9.810810810810811e-06, "loss": 1.2048, "step": 1890 }, { "epoch": 5.71, "grad_norm": 17.276151657104492, "learning_rate": 9.80980980980981e-06, "loss": 1.3738, "step": 1900 }, { "epoch": 5.74, "grad_norm": 13.631850242614746, "learning_rate": 9.80880880880881e-06, "loss": 1.3378, "step": 1910 }, { "epoch": 5.77, "grad_norm": 19.11239242553711, "learning_rate": 9.807807807807809e-06, "loss": 1.2402, "step": 1920 }, { "epoch": 5.8, "grad_norm": 14.28607177734375, "learning_rate": 9.806806806806808e-06, "loss": 1.3308, "step": 1930 }, { "epoch": 5.83, "grad_norm": 13.145687103271484, "learning_rate": 9.805805805805808e-06, "loss": 1.324, "step": 1940 }, { "epoch": 5.86, "grad_norm": 15.426340103149414, "learning_rate": 9.804804804804806e-06, "loss": 1.3066, "step": 1950 }, { "epoch": 5.89, "grad_norm": 11.67961597442627, "learning_rate": 9.803803803803803e-06, "loss": 1.3375, "step": 1960 }, { "epoch": 5.92, "grad_norm": 15.971795082092285, "learning_rate": 9.802802802802804e-06, "loss": 1.2993, "step": 1970 }, { "epoch": 5.95, "grad_norm": 16.692211151123047, "learning_rate": 9.801801801801802e-06, "loss": 1.2503, "step": 1980 }, { "epoch": 5.98, "grad_norm": 14.695980072021484, "learning_rate": 9.8008008008008e-06, "loss": 1.303, "step": 1990 }, { "epoch": 6.0, "eval_accuracy": 0.7874, "eval_loss": 0.7137871980667114, "eval_runtime": 12.7077, "eval_samples_per_second": 786.925, "eval_steps_per_second": 3.148, "step": 1998 }, { "epoch": 6.01, "grad_norm": 13.036826133728027, "learning_rate": 9.799799799799801e-06, "loss": 1.5263, "step": 2000 }, { "epoch": 6.04, "grad_norm": 20.32438087463379, "learning_rate": 9.7987987987988e-06, "loss": 1.3226, "step": 2010 }, { "epoch": 6.07, "grad_norm": 17.317169189453125, "learning_rate": 9.797797797797798e-06, "loss": 1.2233, "step": 2020 }, { "epoch": 6.1, "grad_norm": 17.734073638916016, "learning_rate": 9.796796796796799e-06, "loss": 1.1739, "step": 2030 }, { "epoch": 6.13, "grad_norm": 15.11571216583252, "learning_rate": 9.795795795795795e-06, "loss": 1.2039, "step": 2040 }, { "epoch": 6.16, "grad_norm": 17.901887893676758, "learning_rate": 9.794794794794796e-06, "loss": 1.2837, "step": 2050 }, { "epoch": 6.19, "grad_norm": 16.518293380737305, "learning_rate": 9.793793793793794e-06, "loss": 1.2622, "step": 2060 }, { "epoch": 6.22, "grad_norm": 16.371145248413086, "learning_rate": 9.792792792792793e-06, "loss": 1.2561, "step": 2070 }, { "epoch": 6.25, "grad_norm": 13.058977127075195, "learning_rate": 9.791791791791793e-06, "loss": 1.2545, "step": 2080 }, { "epoch": 6.28, "grad_norm": 16.877670288085938, "learning_rate": 9.790790790790792e-06, "loss": 1.1889, "step": 2090 }, { "epoch": 6.31, "grad_norm": 19.347734451293945, "learning_rate": 9.78978978978979e-06, "loss": 1.2674, "step": 2100 }, { "epoch": 6.34, "grad_norm": 16.47791290283203, "learning_rate": 9.78878878878879e-06, "loss": 1.254, "step": 2110 }, { "epoch": 6.37, "grad_norm": 24.682647705078125, "learning_rate": 9.787787787787788e-06, "loss": 1.2575, "step": 2120 }, { "epoch": 6.4, "grad_norm": 31.932409286499023, "learning_rate": 9.786786786786788e-06, "loss": 1.3132, "step": 2130 }, { "epoch": 6.43, "grad_norm": 12.494312286376953, "learning_rate": 9.785785785785787e-06, "loss": 1.1676, "step": 2140 }, { "epoch": 6.46, "grad_norm": 24.20763397216797, "learning_rate": 9.784784784784785e-06, "loss": 1.3191, "step": 2150 }, { "epoch": 6.49, "grad_norm": 14.357993125915527, "learning_rate": 9.783783783783785e-06, "loss": 1.2434, "step": 2160 }, { "epoch": 6.52, "grad_norm": 17.260360717773438, "learning_rate": 9.782782782782784e-06, "loss": 1.2659, "step": 2170 }, { "epoch": 6.55, "grad_norm": 17.088218688964844, "learning_rate": 9.781781781781783e-06, "loss": 1.2073, "step": 2180 }, { "epoch": 6.58, "grad_norm": 17.340211868286133, "learning_rate": 9.780780780780781e-06, "loss": 1.2804, "step": 2190 }, { "epoch": 6.61, "grad_norm": 19.425783157348633, "learning_rate": 9.779779779779781e-06, "loss": 1.2694, "step": 2200 }, { "epoch": 6.64, "grad_norm": 14.971159934997559, "learning_rate": 9.778778778778778e-06, "loss": 1.2434, "step": 2210 }, { "epoch": 6.67, "grad_norm": 15.645931243896484, "learning_rate": 9.777777777777779e-06, "loss": 1.309, "step": 2220 }, { "epoch": 6.7, "grad_norm": 14.082205772399902, "learning_rate": 9.776776776776777e-06, "loss": 1.2739, "step": 2230 }, { "epoch": 6.73, "grad_norm": 14.896344184875488, "learning_rate": 9.775775775775776e-06, "loss": 1.2321, "step": 2240 }, { "epoch": 6.76, "grad_norm": 14.076533317565918, "learning_rate": 9.774774774774776e-06, "loss": 1.0925, "step": 2250 }, { "epoch": 6.79, "grad_norm": 17.046342849731445, "learning_rate": 9.773773773773775e-06, "loss": 1.2189, "step": 2260 }, { "epoch": 6.82, "grad_norm": 23.03464126586914, "learning_rate": 9.772772772772773e-06, "loss": 1.1931, "step": 2270 }, { "epoch": 6.85, "grad_norm": 20.006450653076172, "learning_rate": 9.771771771771774e-06, "loss": 1.2965, "step": 2280 }, { "epoch": 6.88, "grad_norm": 45.20359420776367, "learning_rate": 9.77077077077077e-06, "loss": 1.2089, "step": 2290 }, { "epoch": 6.91, "grad_norm": 24.090991973876953, "learning_rate": 9.76976976976977e-06, "loss": 1.1629, "step": 2300 }, { "epoch": 6.94, "grad_norm": 21.395421981811523, "learning_rate": 9.76876876876877e-06, "loss": 1.2583, "step": 2310 }, { "epoch": 6.97, "grad_norm": 17.435306549072266, "learning_rate": 9.767767767767768e-06, "loss": 1.2224, "step": 2320 }, { "epoch": 7.0, "grad_norm": 14.937963485717773, "learning_rate": 9.766766766766768e-06, "loss": 1.2045, "step": 2330 }, { "epoch": 7.0, "eval_accuracy": 0.7986, "eval_loss": 0.6616324186325073, "eval_runtime": 12.886, "eval_samples_per_second": 776.034, "eval_steps_per_second": 3.104, "step": 2331 }, { "epoch": 7.03, "grad_norm": 17.74285125732422, "learning_rate": 9.765765765765767e-06, "loss": 1.213, "step": 2340 }, { "epoch": 7.06, "grad_norm": 18.209003448486328, "learning_rate": 9.764764764764765e-06, "loss": 1.2171, "step": 2350 }, { "epoch": 7.09, "grad_norm": 16.93661117553711, "learning_rate": 9.763763763763766e-06, "loss": 1.2575, "step": 2360 }, { "epoch": 7.12, "grad_norm": 13.612496376037598, "learning_rate": 9.762762762762763e-06, "loss": 1.1219, "step": 2370 }, { "epoch": 7.15, "grad_norm": 15.661642074584961, "learning_rate": 9.761761761761763e-06, "loss": 1.1897, "step": 2380 }, { "epoch": 7.18, "grad_norm": 14.000067710876465, "learning_rate": 9.760760760760762e-06, "loss": 1.1881, "step": 2390 }, { "epoch": 7.21, "grad_norm": 15.062593460083008, "learning_rate": 9.75975975975976e-06, "loss": 1.2742, "step": 2400 }, { "epoch": 7.24, "grad_norm": 16.873779296875, "learning_rate": 9.758758758758759e-06, "loss": 1.1432, "step": 2410 }, { "epoch": 7.27, "grad_norm": 14.98442554473877, "learning_rate": 9.757757757757759e-06, "loss": 1.1578, "step": 2420 }, { "epoch": 7.3, "grad_norm": 20.095666885375977, "learning_rate": 9.756756756756758e-06, "loss": 1.1644, "step": 2430 }, { "epoch": 7.33, "grad_norm": 17.67222023010254, "learning_rate": 9.755755755755756e-06, "loss": 1.1308, "step": 2440 }, { "epoch": 7.36, "grad_norm": 12.987893104553223, "learning_rate": 9.754754754754756e-06, "loss": 1.218, "step": 2450 }, { "epoch": 7.39, "grad_norm": 14.277360916137695, "learning_rate": 9.753753753753753e-06, "loss": 1.1472, "step": 2460 }, { "epoch": 7.42, "grad_norm": 20.748971939086914, "learning_rate": 9.752752752752754e-06, "loss": 1.2294, "step": 2470 }, { "epoch": 7.45, "grad_norm": 16.136808395385742, "learning_rate": 9.751751751751752e-06, "loss": 1.1492, "step": 2480 }, { "epoch": 7.48, "grad_norm": 13.233691215515137, "learning_rate": 9.750750750750751e-06, "loss": 1.186, "step": 2490 }, { "epoch": 7.51, "grad_norm": 18.350692749023438, "learning_rate": 9.749749749749751e-06, "loss": 1.2337, "step": 2500 }, { "epoch": 7.54, "grad_norm": 18.468725204467773, "learning_rate": 9.74874874874875e-06, "loss": 1.2043, "step": 2510 }, { "epoch": 7.57, "grad_norm": 15.029038429260254, "learning_rate": 9.747747747747748e-06, "loss": 1.2252, "step": 2520 }, { "epoch": 7.6, "grad_norm": 15.551007270812988, "learning_rate": 9.746746746746749e-06, "loss": 1.1802, "step": 2530 }, { "epoch": 7.63, "grad_norm": 13.695267677307129, "learning_rate": 9.745745745745746e-06, "loss": 1.1828, "step": 2540 }, { "epoch": 7.66, "grad_norm": 22.22896385192871, "learning_rate": 9.744744744744746e-06, "loss": 1.1379, "step": 2550 }, { "epoch": 7.69, "grad_norm": 14.21728229522705, "learning_rate": 9.743743743743744e-06, "loss": 1.1172, "step": 2560 }, { "epoch": 7.72, "grad_norm": 19.173032760620117, "learning_rate": 9.742742742742743e-06, "loss": 1.2143, "step": 2570 }, { "epoch": 7.75, "grad_norm": 24.064266204833984, "learning_rate": 9.741741741741743e-06, "loss": 1.1707, "step": 2580 }, { "epoch": 7.78, "grad_norm": 19.597206115722656, "learning_rate": 9.740740740740742e-06, "loss": 1.1723, "step": 2590 }, { "epoch": 7.81, "grad_norm": 15.806962966918945, "learning_rate": 9.73973973973974e-06, "loss": 1.2468, "step": 2600 }, { "epoch": 7.84, "grad_norm": 19.199220657348633, "learning_rate": 9.73873873873874e-06, "loss": 1.2025, "step": 2610 }, { "epoch": 7.87, "grad_norm": 19.19084358215332, "learning_rate": 9.737737737737738e-06, "loss": 1.1438, "step": 2620 }, { "epoch": 7.9, "grad_norm": 14.284010887145996, "learning_rate": 9.736736736736738e-06, "loss": 1.2175, "step": 2630 }, { "epoch": 7.93, "grad_norm": 19.574195861816406, "learning_rate": 9.735735735735737e-06, "loss": 1.0988, "step": 2640 }, { "epoch": 7.96, "grad_norm": 13.940256118774414, "learning_rate": 9.734734734734735e-06, "loss": 1.1101, "step": 2650 }, { "epoch": 7.99, "grad_norm": 20.282869338989258, "learning_rate": 9.733733733733734e-06, "loss": 1.2482, "step": 2660 }, { "epoch": 8.0, "eval_accuracy": 0.8128, "eval_loss": 0.6209900379180908, "eval_runtime": 12.6026, "eval_samples_per_second": 793.49, "eval_steps_per_second": 3.174, "step": 2664 }, { "epoch": 8.02, "grad_norm": 16.045846939086914, "learning_rate": 9.732732732732734e-06, "loss": 1.1542, "step": 2670 }, { "epoch": 8.05, "grad_norm": 14.654650688171387, "learning_rate": 9.731731731731733e-06, "loss": 1.2251, "step": 2680 }, { "epoch": 8.08, "grad_norm": 19.446109771728516, "learning_rate": 9.730730730730731e-06, "loss": 1.1321, "step": 2690 }, { "epoch": 8.11, "grad_norm": 14.171721458435059, "learning_rate": 9.729729729729732e-06, "loss": 1.1187, "step": 2700 }, { "epoch": 8.14, "grad_norm": 18.1235408782959, "learning_rate": 9.728728728728728e-06, "loss": 1.1489, "step": 2710 }, { "epoch": 8.17, "grad_norm": 21.12098503112793, "learning_rate": 9.727727727727729e-06, "loss": 1.1572, "step": 2720 }, { "epoch": 8.2, "grad_norm": 13.129083633422852, "learning_rate": 9.726726726726727e-06, "loss": 1.173, "step": 2730 }, { "epoch": 8.23, "grad_norm": 14.0629301071167, "learning_rate": 9.725725725725726e-06, "loss": 1.157, "step": 2740 }, { "epoch": 8.26, "grad_norm": 21.699491500854492, "learning_rate": 9.724724724724726e-06, "loss": 1.1807, "step": 2750 }, { "epoch": 8.29, "grad_norm": 15.412301063537598, "learning_rate": 9.723723723723725e-06, "loss": 1.1862, "step": 2760 }, { "epoch": 8.32, "grad_norm": 21.340255737304688, "learning_rate": 9.722722722722723e-06, "loss": 1.1649, "step": 2770 }, { "epoch": 8.35, "grad_norm": 14.753629684448242, "learning_rate": 9.721721721721724e-06, "loss": 1.156, "step": 2780 }, { "epoch": 8.38, "grad_norm": 18.49135398864746, "learning_rate": 9.72072072072072e-06, "loss": 1.1557, "step": 2790 }, { "epoch": 8.41, "grad_norm": 16.890783309936523, "learning_rate": 9.719719719719721e-06, "loss": 1.1386, "step": 2800 }, { "epoch": 8.44, "grad_norm": 16.26944351196289, "learning_rate": 9.71871871871872e-06, "loss": 1.1508, "step": 2810 }, { "epoch": 8.47, "grad_norm": 13.584962844848633, "learning_rate": 9.717717717717718e-06, "loss": 1.0747, "step": 2820 }, { "epoch": 8.5, "grad_norm": 19.696990966796875, "learning_rate": 9.716716716716718e-06, "loss": 1.1413, "step": 2830 }, { "epoch": 8.53, "grad_norm": 19.515819549560547, "learning_rate": 9.715715715715717e-06, "loss": 1.129, "step": 2840 }, { "epoch": 8.56, "grad_norm": 26.190208435058594, "learning_rate": 9.714714714714716e-06, "loss": 1.1476, "step": 2850 }, { "epoch": 8.59, "grad_norm": 16.262331008911133, "learning_rate": 9.713713713713714e-06, "loss": 1.1215, "step": 2860 }, { "epoch": 8.62, "grad_norm": 14.385309219360352, "learning_rate": 9.712712712712713e-06, "loss": 1.093, "step": 2870 }, { "epoch": 8.65, "grad_norm": 18.503665924072266, "learning_rate": 9.711711711711711e-06, "loss": 1.0973, "step": 2880 }, { "epoch": 8.68, "grad_norm": 17.09222984313965, "learning_rate": 9.710710710710712e-06, "loss": 1.1493, "step": 2890 }, { "epoch": 8.71, "grad_norm": 17.18866729736328, "learning_rate": 9.70970970970971e-06, "loss": 1.1105, "step": 2900 }, { "epoch": 8.74, "grad_norm": 12.714863777160645, "learning_rate": 9.708708708708709e-06, "loss": 1.1722, "step": 2910 }, { "epoch": 8.77, "grad_norm": 18.195629119873047, "learning_rate": 9.707707707707709e-06, "loss": 1.1124, "step": 2920 }, { "epoch": 8.8, "grad_norm": 16.43483543395996, "learning_rate": 9.706706706706708e-06, "loss": 1.0822, "step": 2930 }, { "epoch": 8.83, "grad_norm": 20.604055404663086, "learning_rate": 9.705705705705706e-06, "loss": 1.1663, "step": 2940 }, { "epoch": 8.86, "grad_norm": 16.31377410888672, "learning_rate": 9.704704704704707e-06, "loss": 1.0176, "step": 2950 }, { "epoch": 8.89, "grad_norm": 17.643062591552734, "learning_rate": 9.703703703703703e-06, "loss": 1.208, "step": 2960 }, { "epoch": 8.92, "grad_norm": 14.060117721557617, "learning_rate": 9.702702702702704e-06, "loss": 1.1233, "step": 2970 }, { "epoch": 8.95, "grad_norm": 19.547208786010742, "learning_rate": 9.701701701701702e-06, "loss": 1.0834, "step": 2980 }, { "epoch": 8.98, "grad_norm": 16.685522079467773, "learning_rate": 9.700700700700701e-06, "loss": 1.1202, "step": 2990 }, { "epoch": 9.0, "eval_accuracy": 0.8185, "eval_loss": 0.5924676060676575, "eval_runtime": 12.456, "eval_samples_per_second": 802.828, "eval_steps_per_second": 3.211, "step": 2997 }, { "epoch": 9.01, "grad_norm": 12.231082916259766, "learning_rate": 9.699699699699701e-06, "loss": 1.2633, "step": 3000 }, { "epoch": 9.04, "grad_norm": 15.255853652954102, "learning_rate": 9.6986986986987e-06, "loss": 1.0957, "step": 3010 }, { "epoch": 9.07, "grad_norm": 14.946885108947754, "learning_rate": 9.697697697697698e-06, "loss": 1.1606, "step": 3020 }, { "epoch": 9.1, "grad_norm": 15.070345878601074, "learning_rate": 9.696696696696699e-06, "loss": 1.0868, "step": 3030 }, { "epoch": 9.13, "grad_norm": 17.080293655395508, "learning_rate": 9.695695695695696e-06, "loss": 1.0463, "step": 3040 }, { "epoch": 9.16, "grad_norm": 17.268754959106445, "learning_rate": 9.694694694694696e-06, "loss": 1.1091, "step": 3050 }, { "epoch": 9.19, "grad_norm": 23.904386520385742, "learning_rate": 9.693693693693694e-06, "loss": 1.1861, "step": 3060 }, { "epoch": 9.22, "grad_norm": 21.123323440551758, "learning_rate": 9.692692692692693e-06, "loss": 1.0999, "step": 3070 }, { "epoch": 9.25, "grad_norm": 13.559005737304688, "learning_rate": 9.691691691691693e-06, "loss": 1.0932, "step": 3080 }, { "epoch": 9.28, "grad_norm": 24.972814559936523, "learning_rate": 9.690690690690692e-06, "loss": 0.9839, "step": 3090 }, { "epoch": 9.31, "grad_norm": 14.8424654006958, "learning_rate": 9.68968968968969e-06, "loss": 1.1106, "step": 3100 }, { "epoch": 9.34, "grad_norm": 15.940447807312012, "learning_rate": 9.68868868868869e-06, "loss": 1.1486, "step": 3110 }, { "epoch": 9.37, "grad_norm": 12.488078117370605, "learning_rate": 9.687687687687688e-06, "loss": 1.1388, "step": 3120 }, { "epoch": 9.4, "grad_norm": 18.363967895507812, "learning_rate": 9.686686686686686e-06, "loss": 1.1205, "step": 3130 }, { "epoch": 9.43, "grad_norm": 15.243896484375, "learning_rate": 9.685685685685687e-06, "loss": 1.1399, "step": 3140 }, { "epoch": 9.46, "grad_norm": 15.083489418029785, "learning_rate": 9.684684684684685e-06, "loss": 1.0967, "step": 3150 }, { "epoch": 9.49, "grad_norm": 17.603118896484375, "learning_rate": 9.683683683683684e-06, "loss": 1.0997, "step": 3160 }, { "epoch": 9.52, "grad_norm": 13.779284477233887, "learning_rate": 9.682682682682684e-06, "loss": 1.0508, "step": 3170 }, { "epoch": 9.55, "grad_norm": 17.325523376464844, "learning_rate": 9.681681681681683e-06, "loss": 1.2167, "step": 3180 }, { "epoch": 9.58, "grad_norm": 20.735912322998047, "learning_rate": 9.680680680680681e-06, "loss": 1.0901, "step": 3190 }, { "epoch": 9.61, "grad_norm": 15.422182083129883, "learning_rate": 9.67967967967968e-06, "loss": 1.0652, "step": 3200 }, { "epoch": 9.64, "grad_norm": 14.05704402923584, "learning_rate": 9.678678678678679e-06, "loss": 1.0333, "step": 3210 }, { "epoch": 9.67, "grad_norm": 18.3939208984375, "learning_rate": 9.677677677677679e-06, "loss": 1.1638, "step": 3220 }, { "epoch": 9.7, "grad_norm": 17.519819259643555, "learning_rate": 9.676676676676677e-06, "loss": 1.0882, "step": 3230 }, { "epoch": 9.73, "grad_norm": 14.839508056640625, "learning_rate": 9.675675675675676e-06, "loss": 1.0706, "step": 3240 }, { "epoch": 9.76, "grad_norm": 16.574199676513672, "learning_rate": 9.674674674674676e-06, "loss": 1.1276, "step": 3250 }, { "epoch": 9.79, "grad_norm": 14.425287246704102, "learning_rate": 9.673673673673675e-06, "loss": 1.0369, "step": 3260 }, { "epoch": 9.82, "grad_norm": 11.453363418579102, "learning_rate": 9.672672672672673e-06, "loss": 1.13, "step": 3270 }, { "epoch": 9.85, "grad_norm": 15.991726875305176, "learning_rate": 9.671671671671674e-06, "loss": 1.1164, "step": 3280 }, { "epoch": 9.88, "grad_norm": 19.66295623779297, "learning_rate": 9.67067067067067e-06, "loss": 1.0496, "step": 3290 }, { "epoch": 9.91, "grad_norm": 17.811168670654297, "learning_rate": 9.669669669669671e-06, "loss": 1.0741, "step": 3300 }, { "epoch": 9.94, "grad_norm": 16.96430015563965, "learning_rate": 9.66866866866867e-06, "loss": 1.1444, "step": 3310 }, { "epoch": 9.97, "grad_norm": 17.067520141601562, "learning_rate": 9.667667667667668e-06, "loss": 1.0883, "step": 3320 }, { "epoch": 10.0, "grad_norm": 27.854724884033203, "learning_rate": 9.666666666666667e-06, "loss": 1.0021, "step": 3330 }, { "epoch": 10.0, "eval_accuracy": 0.8235, "eval_loss": 0.5728312730789185, "eval_runtime": 12.787, "eval_samples_per_second": 782.045, "eval_steps_per_second": 3.128, "step": 3330 }, { "epoch": 10.03, "grad_norm": 15.601628303527832, "learning_rate": 9.665665665665667e-06, "loss": 1.0575, "step": 3340 }, { "epoch": 10.06, "grad_norm": 16.880401611328125, "learning_rate": 9.664664664664666e-06, "loss": 1.0543, "step": 3350 }, { "epoch": 10.09, "grad_norm": 16.332054138183594, "learning_rate": 9.663663663663664e-06, "loss": 1.0403, "step": 3360 }, { "epoch": 10.12, "grad_norm": 16.27848243713379, "learning_rate": 9.662662662662663e-06, "loss": 1.0386, "step": 3370 }, { "epoch": 10.15, "grad_norm": 15.288803100585938, "learning_rate": 9.661661661661661e-06, "loss": 1.1157, "step": 3380 }, { "epoch": 10.18, "grad_norm": 19.522502899169922, "learning_rate": 9.660660660660662e-06, "loss": 1.0648, "step": 3390 }, { "epoch": 10.21, "grad_norm": 17.908447265625, "learning_rate": 9.65965965965966e-06, "loss": 1.1023, "step": 3400 }, { "epoch": 10.24, "grad_norm": 13.076386451721191, "learning_rate": 9.658658658658659e-06, "loss": 1.032, "step": 3410 }, { "epoch": 10.27, "grad_norm": 15.524860382080078, "learning_rate": 9.65765765765766e-06, "loss": 1.0363, "step": 3420 }, { "epoch": 10.3, "grad_norm": 14.770012855529785, "learning_rate": 9.656656656656658e-06, "loss": 1.0339, "step": 3430 }, { "epoch": 10.33, "grad_norm": 19.564733505249023, "learning_rate": 9.655655655655656e-06, "loss": 1.0874, "step": 3440 }, { "epoch": 10.36, "grad_norm": 16.416458129882812, "learning_rate": 9.654654654654655e-06, "loss": 1.0911, "step": 3450 }, { "epoch": 10.39, "grad_norm": 18.273672103881836, "learning_rate": 9.653653653653654e-06, "loss": 1.0919, "step": 3460 }, { "epoch": 10.42, "grad_norm": 15.215808868408203, "learning_rate": 9.652652652652654e-06, "loss": 1.0825, "step": 3470 }, { "epoch": 10.45, "grad_norm": 15.298432350158691, "learning_rate": 9.651651651651652e-06, "loss": 1.086, "step": 3480 }, { "epoch": 10.48, "grad_norm": 12.385087966918945, "learning_rate": 9.650650650650651e-06, "loss": 1.1275, "step": 3490 }, { "epoch": 10.51, "grad_norm": 16.057527542114258, "learning_rate": 9.649649649649651e-06, "loss": 1.0923, "step": 3500 }, { "epoch": 10.54, "grad_norm": 15.67593765258789, "learning_rate": 9.64864864864865e-06, "loss": 1.0532, "step": 3510 }, { "epoch": 10.57, "grad_norm": 15.688575744628906, "learning_rate": 9.647647647647648e-06, "loss": 1.0284, "step": 3520 }, { "epoch": 10.6, "grad_norm": 14.33437442779541, "learning_rate": 9.646646646646649e-06, "loss": 1.057, "step": 3530 }, { "epoch": 10.63, "grad_norm": 17.05129623413086, "learning_rate": 9.645645645645646e-06, "loss": 1.0499, "step": 3540 }, { "epoch": 10.66, "grad_norm": 17.67121696472168, "learning_rate": 9.644644644644644e-06, "loss": 1.0749, "step": 3550 }, { "epoch": 10.69, "grad_norm": 16.625755310058594, "learning_rate": 9.643643643643645e-06, "loss": 1.0579, "step": 3560 }, { "epoch": 10.72, "grad_norm": 19.9012508392334, "learning_rate": 9.642642642642643e-06, "loss": 1.0356, "step": 3570 }, { "epoch": 10.75, "grad_norm": 21.382139205932617, "learning_rate": 9.641641641641642e-06, "loss": 1.0528, "step": 3580 }, { "epoch": 10.78, "grad_norm": 18.916330337524414, "learning_rate": 9.640640640640642e-06, "loss": 0.9962, "step": 3590 }, { "epoch": 10.81, "grad_norm": 18.414865493774414, "learning_rate": 9.63963963963964e-06, "loss": 1.0563, "step": 3600 }, { "epoch": 10.84, "grad_norm": 13.265142440795898, "learning_rate": 9.63863863863864e-06, "loss": 0.9864, "step": 3610 }, { "epoch": 10.87, "grad_norm": 14.302437782287598, "learning_rate": 9.637637637637638e-06, "loss": 1.0679, "step": 3620 }, { "epoch": 10.9, "grad_norm": 17.272510528564453, "learning_rate": 9.636636636636636e-06, "loss": 1.0747, "step": 3630 }, { "epoch": 10.93, "grad_norm": 17.251792907714844, "learning_rate": 9.635635635635637e-06, "loss": 1.0952, "step": 3640 }, { "epoch": 10.96, "grad_norm": 20.940811157226562, "learning_rate": 9.634634634634635e-06, "loss": 1.0542, "step": 3650 }, { "epoch": 10.99, "grad_norm": 18.766862869262695, "learning_rate": 9.633633633633634e-06, "loss": 1.0662, "step": 3660 }, { "epoch": 11.0, "eval_accuracy": 0.829, "eval_loss": 0.5636696815490723, "eval_runtime": 12.5849, "eval_samples_per_second": 794.602, "eval_steps_per_second": 3.178, "step": 3663 }, { "epoch": 11.02, "grad_norm": 14.225459098815918, "learning_rate": 9.632632632632634e-06, "loss": 1.2346, "step": 3670 }, { "epoch": 11.05, "grad_norm": 13.948782920837402, "learning_rate": 9.631631631631633e-06, "loss": 1.0052, "step": 3680 }, { "epoch": 11.08, "grad_norm": 17.608837127685547, "learning_rate": 9.630630630630631e-06, "loss": 1.0115, "step": 3690 }, { "epoch": 11.11, "grad_norm": 23.72847557067871, "learning_rate": 9.62962962962963e-06, "loss": 1.0427, "step": 3700 }, { "epoch": 11.14, "grad_norm": 16.49607276916504, "learning_rate": 9.628628628628629e-06, "loss": 1.0758, "step": 3710 }, { "epoch": 11.17, "grad_norm": 15.795092582702637, "learning_rate": 9.627627627627629e-06, "loss": 0.9999, "step": 3720 }, { "epoch": 11.2, "grad_norm": 15.525904655456543, "learning_rate": 9.626626626626627e-06, "loss": 1.0969, "step": 3730 }, { "epoch": 11.23, "grad_norm": 16.011144638061523, "learning_rate": 9.625625625625626e-06, "loss": 0.9972, "step": 3740 }, { "epoch": 11.26, "grad_norm": 17.082075119018555, "learning_rate": 9.624624624624626e-06, "loss": 1.023, "step": 3750 }, { "epoch": 11.29, "grad_norm": 17.181434631347656, "learning_rate": 9.623623623623625e-06, "loss": 1.0567, "step": 3760 }, { "epoch": 11.32, "grad_norm": 23.32236099243164, "learning_rate": 9.622622622622624e-06, "loss": 1.1356, "step": 3770 }, { "epoch": 11.35, "grad_norm": 14.4651460647583, "learning_rate": 9.621621621621622e-06, "loss": 1.1166, "step": 3780 }, { "epoch": 11.38, "grad_norm": 16.139591217041016, "learning_rate": 9.62062062062062e-06, "loss": 1.0839, "step": 3790 }, { "epoch": 11.41, "grad_norm": 12.324134826660156, "learning_rate": 9.61961961961962e-06, "loss": 1.0817, "step": 3800 }, { "epoch": 11.44, "grad_norm": 15.287452697753906, "learning_rate": 9.61861861861862e-06, "loss": 1.0017, "step": 3810 }, { "epoch": 11.47, "grad_norm": 18.197134017944336, "learning_rate": 9.617617617617618e-06, "loss": 1.0289, "step": 3820 }, { "epoch": 11.5, "grad_norm": 17.314950942993164, "learning_rate": 9.616616616616617e-06, "loss": 1.0783, "step": 3830 }, { "epoch": 11.53, "grad_norm": 16.530426025390625, "learning_rate": 9.615615615615617e-06, "loss": 1.0625, "step": 3840 }, { "epoch": 11.56, "grad_norm": 18.19109535217285, "learning_rate": 9.614614614614616e-06, "loss": 1.0336, "step": 3850 }, { "epoch": 11.59, "grad_norm": 17.461894989013672, "learning_rate": 9.613613613613614e-06, "loss": 1.078, "step": 3860 }, { "epoch": 11.62, "grad_norm": 15.058967590332031, "learning_rate": 9.612612612612613e-06, "loss": 1.0271, "step": 3870 }, { "epoch": 11.65, "grad_norm": 16.8382568359375, "learning_rate": 9.611611611611611e-06, "loss": 1.0783, "step": 3880 }, { "epoch": 11.68, "grad_norm": 16.57802391052246, "learning_rate": 9.610610610610612e-06, "loss": 1.0591, "step": 3890 }, { "epoch": 11.71, "grad_norm": 19.649625778198242, "learning_rate": 9.60960960960961e-06, "loss": 1.0703, "step": 3900 }, { "epoch": 11.74, "grad_norm": 23.350257873535156, "learning_rate": 9.608608608608609e-06, "loss": 1.1037, "step": 3910 }, { "epoch": 11.77, "grad_norm": 28.183189392089844, "learning_rate": 9.60760760760761e-06, "loss": 1.0253, "step": 3920 }, { "epoch": 11.8, "grad_norm": 21.10226058959961, "learning_rate": 9.606606606606608e-06, "loss": 1.0825, "step": 3930 }, { "epoch": 11.83, "grad_norm": 19.632354736328125, "learning_rate": 9.605605605605606e-06, "loss": 0.9878, "step": 3940 }, { "epoch": 11.86, "grad_norm": 17.59662437438965, "learning_rate": 9.604604604604605e-06, "loss": 1.0505, "step": 3950 }, { "epoch": 11.89, "grad_norm": 18.002777099609375, "learning_rate": 9.603603603603604e-06, "loss": 1.0303, "step": 3960 }, { "epoch": 11.92, "grad_norm": 19.330671310424805, "learning_rate": 9.602602602602604e-06, "loss": 1.0271, "step": 3970 }, { "epoch": 11.95, "grad_norm": 14.030105590820312, "learning_rate": 9.601601601601602e-06, "loss": 1.0634, "step": 3980 }, { "epoch": 11.98, "grad_norm": 27.249462127685547, "learning_rate": 9.600600600600601e-06, "loss": 1.0263, "step": 3990 }, { "epoch": 12.0, "eval_accuracy": 0.8303, "eval_loss": 0.5441622734069824, "eval_runtime": 12.8921, "eval_samples_per_second": 775.671, "eval_steps_per_second": 3.103, "step": 3996 }, { "epoch": 12.01, "grad_norm": 21.14763641357422, "learning_rate": 9.5995995995996e-06, "loss": 1.209, "step": 4000 }, { "epoch": 12.04, "grad_norm": 16.916152954101562, "learning_rate": 9.5985985985986e-06, "loss": 0.964, "step": 4010 }, { "epoch": 12.07, "grad_norm": 20.873559951782227, "learning_rate": 9.597597597597599e-06, "loss": 0.9949, "step": 4020 }, { "epoch": 12.1, "grad_norm": 14.839821815490723, "learning_rate": 9.596596596596597e-06, "loss": 0.9582, "step": 4030 }, { "epoch": 12.13, "grad_norm": 17.228927612304688, "learning_rate": 9.595595595595596e-06, "loss": 1.0007, "step": 4040 }, { "epoch": 12.16, "grad_norm": 27.08382797241211, "learning_rate": 9.594594594594594e-06, "loss": 1.0368, "step": 4050 }, { "epoch": 12.19, "grad_norm": 15.229191780090332, "learning_rate": 9.593593593593595e-06, "loss": 1.0135, "step": 4060 }, { "epoch": 12.22, "grad_norm": 14.073891639709473, "learning_rate": 9.592592592592593e-06, "loss": 1.0114, "step": 4070 }, { "epoch": 12.25, "grad_norm": 16.11541748046875, "learning_rate": 9.591591591591592e-06, "loss": 1.0698, "step": 4080 }, { "epoch": 12.28, "grad_norm": 16.108747482299805, "learning_rate": 9.590590590590592e-06, "loss": 1.0887, "step": 4090 }, { "epoch": 12.31, "grad_norm": 16.445880889892578, "learning_rate": 9.58958958958959e-06, "loss": 1.0269, "step": 4100 }, { "epoch": 12.34, "grad_norm": 13.307125091552734, "learning_rate": 9.58858858858859e-06, "loss": 1.0142, "step": 4110 }, { "epoch": 12.37, "grad_norm": 14.601872444152832, "learning_rate": 9.587587587587588e-06, "loss": 0.9902, "step": 4120 }, { "epoch": 12.4, "grad_norm": 18.373258590698242, "learning_rate": 9.586586586586586e-06, "loss": 1.0375, "step": 4130 }, { "epoch": 12.43, "grad_norm": 19.522417068481445, "learning_rate": 9.585585585585587e-06, "loss": 1.0825, "step": 4140 }, { "epoch": 12.46, "grad_norm": 18.246854782104492, "learning_rate": 9.584584584584585e-06, "loss": 0.9618, "step": 4150 }, { "epoch": 12.49, "grad_norm": 13.992356300354004, "learning_rate": 9.583583583583584e-06, "loss": 1.0104, "step": 4160 }, { "epoch": 12.52, "grad_norm": 12.092374801635742, "learning_rate": 9.582582582582584e-06, "loss": 1.0246, "step": 4170 }, { "epoch": 12.55, "grad_norm": 13.865076065063477, "learning_rate": 9.581581581581583e-06, "loss": 1.0414, "step": 4180 }, { "epoch": 12.58, "grad_norm": 15.241948127746582, "learning_rate": 9.580580580580581e-06, "loss": 0.9945, "step": 4190 }, { "epoch": 12.61, "grad_norm": 16.216447830200195, "learning_rate": 9.57957957957958e-06, "loss": 0.9745, "step": 4200 }, { "epoch": 12.64, "grad_norm": 15.302637100219727, "learning_rate": 9.578578578578579e-06, "loss": 0.987, "step": 4210 }, { "epoch": 12.67, "grad_norm": 18.61682891845703, "learning_rate": 9.577577577577579e-06, "loss": 0.9604, "step": 4220 }, { "epoch": 12.7, "grad_norm": 19.719160079956055, "learning_rate": 9.576576576576578e-06, "loss": 1.0032, "step": 4230 }, { "epoch": 12.73, "grad_norm": 18.480501174926758, "learning_rate": 9.575575575575576e-06, "loss": 1.0035, "step": 4240 }, { "epoch": 12.76, "grad_norm": 14.176997184753418, "learning_rate": 9.574574574574575e-06, "loss": 1.0165, "step": 4250 }, { "epoch": 12.79, "grad_norm": 20.608097076416016, "learning_rate": 9.573573573573575e-06, "loss": 0.9697, "step": 4260 }, { "epoch": 12.82, "grad_norm": 14.59773063659668, "learning_rate": 9.572572572572574e-06, "loss": 1.0698, "step": 4270 }, { "epoch": 12.85, "grad_norm": 12.92850112915039, "learning_rate": 9.571571571571572e-06, "loss": 1.0114, "step": 4280 }, { "epoch": 12.88, "grad_norm": 16.94991111755371, "learning_rate": 9.57057057057057e-06, "loss": 0.9624, "step": 4290 }, { "epoch": 12.91, "grad_norm": 14.561684608459473, "learning_rate": 9.56956956956957e-06, "loss": 1.0075, "step": 4300 }, { "epoch": 12.94, "grad_norm": 17.857587814331055, "learning_rate": 9.56856856856857e-06, "loss": 0.9379, "step": 4310 }, { "epoch": 12.97, "grad_norm": 22.47743034362793, "learning_rate": 9.567567567567568e-06, "loss": 1.0581, "step": 4320 }, { "epoch": 13.0, "eval_accuracy": 0.8379, "eval_loss": 0.5319282412528992, "eval_runtime": 12.9534, "eval_samples_per_second": 772.0, "eval_steps_per_second": 3.088, "step": 4329 }, { "epoch": 13.0, "grad_norm": 16.860095977783203, "learning_rate": 9.566566566566567e-06, "loss": 0.9651, "step": 4330 }, { "epoch": 13.03, "grad_norm": 14.949163436889648, "learning_rate": 9.565565565565567e-06, "loss": 0.933, "step": 4340 }, { "epoch": 13.06, "grad_norm": 18.923927307128906, "learning_rate": 9.564564564564566e-06, "loss": 1.0005, "step": 4350 }, { "epoch": 13.09, "grad_norm": 15.809991836547852, "learning_rate": 9.563563563563564e-06, "loss": 1.0425, "step": 4360 }, { "epoch": 13.12, "grad_norm": 18.985448837280273, "learning_rate": 9.562562562562563e-06, "loss": 1.0406, "step": 4370 }, { "epoch": 13.15, "grad_norm": 15.309414863586426, "learning_rate": 9.561561561561562e-06, "loss": 0.9674, "step": 4380 }, { "epoch": 13.18, "grad_norm": 18.867652893066406, "learning_rate": 9.560560560560562e-06, "loss": 1.0264, "step": 4390 }, { "epoch": 13.21, "grad_norm": 14.455560684204102, "learning_rate": 9.55955955955956e-06, "loss": 1.0368, "step": 4400 }, { "epoch": 13.24, "grad_norm": 19.578123092651367, "learning_rate": 9.558558558558559e-06, "loss": 0.9718, "step": 4410 }, { "epoch": 13.27, "grad_norm": 16.656957626342773, "learning_rate": 9.55755755755756e-06, "loss": 1.0459, "step": 4420 }, { "epoch": 13.3, "grad_norm": 28.463747024536133, "learning_rate": 9.556556556556558e-06, "loss": 1.006, "step": 4430 }, { "epoch": 13.33, "grad_norm": 20.07007598876953, "learning_rate": 9.555555555555556e-06, "loss": 0.9791, "step": 4440 }, { "epoch": 13.36, "grad_norm": 14.087189674377441, "learning_rate": 9.554554554554555e-06, "loss": 0.9722, "step": 4450 }, { "epoch": 13.39, "grad_norm": 18.301103591918945, "learning_rate": 9.553553553553554e-06, "loss": 1.0023, "step": 4460 }, { "epoch": 13.42, "grad_norm": 15.600085258483887, "learning_rate": 9.552552552552552e-06, "loss": 1.0245, "step": 4470 }, { "epoch": 13.45, "grad_norm": 11.453582763671875, "learning_rate": 9.551551551551553e-06, "loss": 0.9964, "step": 4480 }, { "epoch": 13.48, "grad_norm": 12.498774528503418, "learning_rate": 9.550550550550551e-06, "loss": 0.9411, "step": 4490 }, { "epoch": 13.51, "grad_norm": 14.276036262512207, "learning_rate": 9.54954954954955e-06, "loss": 0.9921, "step": 4500 }, { "epoch": 13.54, "grad_norm": 15.989923477172852, "learning_rate": 9.54854854854855e-06, "loss": 1.0208, "step": 4510 }, { "epoch": 13.57, "grad_norm": 12.61364459991455, "learning_rate": 9.547547547547549e-06, "loss": 0.9676, "step": 4520 }, { "epoch": 13.6, "grad_norm": 10.876855850219727, "learning_rate": 9.546546546546547e-06, "loss": 1.012, "step": 4530 }, { "epoch": 13.63, "grad_norm": 17.27198600769043, "learning_rate": 9.545545545545546e-06, "loss": 1.057, "step": 4540 }, { "epoch": 13.66, "grad_norm": 19.67862319946289, "learning_rate": 9.544544544544544e-06, "loss": 1.0207, "step": 4550 }, { "epoch": 13.69, "grad_norm": 15.878100395202637, "learning_rate": 9.543543543543545e-06, "loss": 1.0365, "step": 4560 }, { "epoch": 13.72, "grad_norm": 14.027839660644531, "learning_rate": 9.542542542542543e-06, "loss": 1.0017, "step": 4570 }, { "epoch": 13.75, "grad_norm": 16.443965911865234, "learning_rate": 9.541541541541542e-06, "loss": 0.9169, "step": 4580 }, { "epoch": 13.78, "grad_norm": 23.566343307495117, "learning_rate": 9.540540540540542e-06, "loss": 0.9557, "step": 4590 }, { "epoch": 13.81, "grad_norm": 14.526195526123047, "learning_rate": 9.53953953953954e-06, "loss": 0.9989, "step": 4600 }, { "epoch": 13.84, "grad_norm": 16.737215042114258, "learning_rate": 9.53853853853854e-06, "loss": 1.0153, "step": 4610 }, { "epoch": 13.87, "grad_norm": 14.807560920715332, "learning_rate": 9.537537537537538e-06, "loss": 0.8934, "step": 4620 }, { "epoch": 13.9, "grad_norm": 13.126487731933594, "learning_rate": 9.536536536536537e-06, "loss": 1.0443, "step": 4630 }, { "epoch": 13.93, "grad_norm": 20.58649253845215, "learning_rate": 9.535535535535537e-06, "loss": 1.0182, "step": 4640 }, { "epoch": 13.96, "grad_norm": 19.2255859375, "learning_rate": 9.534534534534535e-06, "loss": 0.93, "step": 4650 }, { "epoch": 13.99, "grad_norm": 13.996789932250977, "learning_rate": 9.533533533533534e-06, "loss": 0.9922, "step": 4660 }, { "epoch": 14.0, "eval_accuracy": 0.8388, "eval_loss": 0.5215397477149963, "eval_runtime": 12.6288, "eval_samples_per_second": 791.843, "eval_steps_per_second": 3.167, "step": 4662 }, { "epoch": 14.02, "grad_norm": 15.150822639465332, "learning_rate": 9.532532532532534e-06, "loss": 1.0928, "step": 4670 }, { "epoch": 14.05, "grad_norm": 17.099411010742188, "learning_rate": 9.531531531531533e-06, "loss": 0.9612, "step": 4680 }, { "epoch": 14.08, "grad_norm": 13.981841087341309, "learning_rate": 9.530530530530532e-06, "loss": 0.9182, "step": 4690 }, { "epoch": 14.11, "grad_norm": 20.836044311523438, "learning_rate": 9.52952952952953e-06, "loss": 0.953, "step": 4700 }, { "epoch": 14.14, "grad_norm": 18.01522445678711, "learning_rate": 9.528528528528529e-06, "loss": 0.9751, "step": 4710 }, { "epoch": 14.17, "grad_norm": 14.915099143981934, "learning_rate": 9.527527527527527e-06, "loss": 0.9972, "step": 4720 }, { "epoch": 14.2, "grad_norm": 19.267436981201172, "learning_rate": 9.526526526526528e-06, "loss": 1.0456, "step": 4730 }, { "epoch": 14.23, "grad_norm": 15.626789093017578, "learning_rate": 9.525525525525526e-06, "loss": 0.9834, "step": 4740 }, { "epoch": 14.26, "grad_norm": 17.810468673706055, "learning_rate": 9.524524524524525e-06, "loss": 0.9818, "step": 4750 }, { "epoch": 14.29, "grad_norm": 13.250941276550293, "learning_rate": 9.523523523523525e-06, "loss": 1.0015, "step": 4760 }, { "epoch": 14.32, "grad_norm": 19.812030792236328, "learning_rate": 9.522522522522524e-06, "loss": 0.9355, "step": 4770 }, { "epoch": 14.35, "grad_norm": 14.321195602416992, "learning_rate": 9.521521521521522e-06, "loss": 0.9828, "step": 4780 }, { "epoch": 14.38, "grad_norm": 18.426902770996094, "learning_rate": 9.520520520520521e-06, "loss": 0.9369, "step": 4790 }, { "epoch": 14.41, "grad_norm": 20.919023513793945, "learning_rate": 9.51951951951952e-06, "loss": 0.9663, "step": 4800 }, { "epoch": 14.44, "grad_norm": 15.742774963378906, "learning_rate": 9.51851851851852e-06, "loss": 1.0001, "step": 4810 }, { "epoch": 14.47, "grad_norm": 14.45275592803955, "learning_rate": 9.517517517517518e-06, "loss": 1.0518, "step": 4820 }, { "epoch": 14.5, "grad_norm": 21.9227294921875, "learning_rate": 9.516516516516517e-06, "loss": 1.0238, "step": 4830 }, { "epoch": 14.53, "grad_norm": 17.867456436157227, "learning_rate": 9.515515515515517e-06, "loss": 0.9458, "step": 4840 }, { "epoch": 14.56, "grad_norm": 19.42156219482422, "learning_rate": 9.514514514514516e-06, "loss": 0.9996, "step": 4850 }, { "epoch": 14.59, "grad_norm": 18.59282684326172, "learning_rate": 9.513513513513514e-06, "loss": 0.9686, "step": 4860 }, { "epoch": 14.62, "grad_norm": 15.362764358520508, "learning_rate": 9.512512512512513e-06, "loss": 0.9812, "step": 4870 }, { "epoch": 14.65, "grad_norm": 18.349945068359375, "learning_rate": 9.511511511511512e-06, "loss": 0.9779, "step": 4880 }, { "epoch": 14.68, "grad_norm": 13.918747901916504, "learning_rate": 9.510510510510512e-06, "loss": 0.9844, "step": 4890 }, { "epoch": 14.71, "grad_norm": 15.943765640258789, "learning_rate": 9.50950950950951e-06, "loss": 0.9398, "step": 4900 }, { "epoch": 14.74, "grad_norm": 11.988840103149414, "learning_rate": 9.508508508508509e-06, "loss": 0.9981, "step": 4910 }, { "epoch": 14.77, "grad_norm": 17.07158088684082, "learning_rate": 9.507507507507508e-06, "loss": 0.9832, "step": 4920 }, { "epoch": 14.8, "grad_norm": 15.438054084777832, "learning_rate": 9.506506506506508e-06, "loss": 0.9799, "step": 4930 }, { "epoch": 14.83, "grad_norm": 17.7360897064209, "learning_rate": 9.505505505505507e-06, "loss": 1.0055, "step": 4940 }, { "epoch": 14.86, "grad_norm": 14.747228622436523, "learning_rate": 9.504504504504505e-06, "loss": 1.0331, "step": 4950 }, { "epoch": 14.89, "grad_norm": 16.84065055847168, "learning_rate": 9.503503503503504e-06, "loss": 0.9135, "step": 4960 }, { "epoch": 14.92, "grad_norm": 18.45875358581543, "learning_rate": 9.502502502502502e-06, "loss": 1.0109, "step": 4970 }, { "epoch": 14.95, "grad_norm": 20.146686553955078, "learning_rate": 9.501501501501503e-06, "loss": 1.0193, "step": 4980 }, { "epoch": 14.98, "grad_norm": 15.921470642089844, "learning_rate": 9.500500500500501e-06, "loss": 0.9643, "step": 4990 }, { "epoch": 15.0, "eval_accuracy": 0.8399, "eval_loss": 0.5144081115722656, "eval_runtime": 12.647, "eval_samples_per_second": 790.699, "eval_steps_per_second": 3.163, "step": 4995 }, { "epoch": 15.02, "grad_norm": 14.36604118347168, "learning_rate": 9.4994994994995e-06, "loss": 0.8555, "step": 5000 }, { "epoch": 15.05, "grad_norm": 17.989551544189453, "learning_rate": 9.4984984984985e-06, "loss": 0.8889, "step": 5010 }, { "epoch": 15.08, "grad_norm": 15.000627517700195, "learning_rate": 9.497497497497499e-06, "loss": 0.9641, "step": 5020 }, { "epoch": 15.11, "grad_norm": 16.05202865600586, "learning_rate": 9.496496496496497e-06, "loss": 1.0126, "step": 5030 }, { "epoch": 15.14, "grad_norm": 16.95032501220703, "learning_rate": 9.495495495495496e-06, "loss": 0.9473, "step": 5040 }, { "epoch": 15.17, "grad_norm": 15.519562721252441, "learning_rate": 9.494494494494494e-06, "loss": 0.9128, "step": 5050 }, { "epoch": 15.2, "grad_norm": 20.99886131286621, "learning_rate": 9.493493493493495e-06, "loss": 0.9673, "step": 5060 }, { "epoch": 15.23, "grad_norm": 16.14808464050293, "learning_rate": 9.492492492492493e-06, "loss": 1.0206, "step": 5070 }, { "epoch": 15.26, "grad_norm": 14.916738510131836, "learning_rate": 9.491491491491492e-06, "loss": 0.9811, "step": 5080 }, { "epoch": 15.29, "grad_norm": 13.439791679382324, "learning_rate": 9.490490490490492e-06, "loss": 0.9779, "step": 5090 }, { "epoch": 15.32, "grad_norm": 15.340365409851074, "learning_rate": 9.489489489489491e-06, "loss": 0.9864, "step": 5100 }, { "epoch": 15.35, "grad_norm": 18.60448455810547, "learning_rate": 9.48848848848849e-06, "loss": 0.8913, "step": 5110 }, { "epoch": 15.38, "grad_norm": 11.516568183898926, "learning_rate": 9.487487487487488e-06, "loss": 0.9572, "step": 5120 }, { "epoch": 15.41, "grad_norm": 16.937572479248047, "learning_rate": 9.486486486486487e-06, "loss": 0.9264, "step": 5130 }, { "epoch": 15.44, "grad_norm": 13.213362693786621, "learning_rate": 9.485485485485487e-06, "loss": 0.8959, "step": 5140 }, { "epoch": 15.47, "grad_norm": 15.299351692199707, "learning_rate": 9.484484484484486e-06, "loss": 0.8851, "step": 5150 }, { "epoch": 15.5, "grad_norm": 14.688894271850586, "learning_rate": 9.483483483483484e-06, "loss": 0.9851, "step": 5160 }, { "epoch": 15.53, "grad_norm": 11.646239280700684, "learning_rate": 9.482482482482483e-06, "loss": 1.0083, "step": 5170 }, { "epoch": 15.56, "grad_norm": 18.091474533081055, "learning_rate": 9.481481481481483e-06, "loss": 0.8872, "step": 5180 }, { "epoch": 15.59, "grad_norm": 15.272651672363281, "learning_rate": 9.480480480480482e-06, "loss": 0.9658, "step": 5190 }, { "epoch": 15.62, "grad_norm": 21.172088623046875, "learning_rate": 9.47947947947948e-06, "loss": 0.9051, "step": 5200 }, { "epoch": 15.65, "grad_norm": 13.88657283782959, "learning_rate": 9.478478478478479e-06, "loss": 0.8966, "step": 5210 }, { "epoch": 15.68, "grad_norm": 14.709793090820312, "learning_rate": 9.477477477477477e-06, "loss": 0.8853, "step": 5220 }, { "epoch": 15.71, "grad_norm": 22.535512924194336, "learning_rate": 9.476476476476478e-06, "loss": 0.9747, "step": 5230 }, { "epoch": 15.74, "grad_norm": 20.59426498413086, "learning_rate": 9.475475475475476e-06, "loss": 0.9599, "step": 5240 }, { "epoch": 15.77, "grad_norm": 15.672638893127441, "learning_rate": 9.474474474474475e-06, "loss": 0.9429, "step": 5250 }, { "epoch": 15.8, "grad_norm": 21.314970016479492, "learning_rate": 9.473473473473475e-06, "loss": 0.9716, "step": 5260 }, { "epoch": 15.83, "grad_norm": 16.837343215942383, "learning_rate": 9.472472472472474e-06, "loss": 0.8394, "step": 5270 }, { "epoch": 15.86, "grad_norm": 15.07120132446289, "learning_rate": 9.471471471471472e-06, "loss": 0.9175, "step": 5280 }, { "epoch": 15.89, "grad_norm": 15.313618659973145, "learning_rate": 9.470470470470471e-06, "loss": 0.9212, "step": 5290 }, { "epoch": 15.92, "grad_norm": 21.591821670532227, "learning_rate": 9.46946946946947e-06, "loss": 0.9788, "step": 5300 }, { "epoch": 15.95, "grad_norm": 14.006278991699219, "learning_rate": 9.46846846846847e-06, "loss": 0.9606, "step": 5310 }, { "epoch": 15.98, "grad_norm": 13.330794334411621, "learning_rate": 9.467467467467468e-06, "loss": 0.9687, "step": 5320 }, { "epoch": 16.0, "eval_accuracy": 0.8413, "eval_loss": 0.5102618932723999, "eval_runtime": 12.8842, "eval_samples_per_second": 776.145, "eval_steps_per_second": 3.105, "step": 5328 }, { "epoch": 16.01, "grad_norm": 18.027389526367188, "learning_rate": 9.466466466466467e-06, "loss": 1.1231, "step": 5330 }, { "epoch": 16.04, "grad_norm": 16.487071990966797, "learning_rate": 9.465465465465467e-06, "loss": 0.9447, "step": 5340 }, { "epoch": 16.07, "grad_norm": 14.945123672485352, "learning_rate": 9.464464464464466e-06, "loss": 0.9224, "step": 5350 }, { "epoch": 16.1, "grad_norm": 23.017677307128906, "learning_rate": 9.463463463463464e-06, "loss": 0.9404, "step": 5360 }, { "epoch": 16.13, "grad_norm": 16.184133529663086, "learning_rate": 9.462462462462463e-06, "loss": 0.9412, "step": 5370 }, { "epoch": 16.16, "grad_norm": 15.728362083435059, "learning_rate": 9.461461461461462e-06, "loss": 0.9546, "step": 5380 }, { "epoch": 16.19, "grad_norm": 16.84214973449707, "learning_rate": 9.46046046046046e-06, "loss": 0.9157, "step": 5390 }, { "epoch": 16.22, "grad_norm": 15.392135620117188, "learning_rate": 9.45945945945946e-06, "loss": 0.9431, "step": 5400 }, { "epoch": 16.25, "grad_norm": 17.416915893554688, "learning_rate": 9.458458458458459e-06, "loss": 0.9596, "step": 5410 }, { "epoch": 16.28, "grad_norm": 14.695914268493652, "learning_rate": 9.457457457457458e-06, "loss": 0.9424, "step": 5420 }, { "epoch": 16.31, "grad_norm": 14.552547454833984, "learning_rate": 9.456456456456458e-06, "loss": 0.9052, "step": 5430 }, { "epoch": 16.34, "grad_norm": 21.200580596923828, "learning_rate": 9.455455455455457e-06, "loss": 0.9945, "step": 5440 }, { "epoch": 16.37, "grad_norm": 20.810646057128906, "learning_rate": 9.454454454454455e-06, "loss": 0.9052, "step": 5450 }, { "epoch": 16.4, "grad_norm": 21.23904037475586, "learning_rate": 9.453453453453454e-06, "loss": 0.98, "step": 5460 }, { "epoch": 16.43, "grad_norm": 14.602259635925293, "learning_rate": 9.452452452452452e-06, "loss": 0.9851, "step": 5470 }, { "epoch": 16.46, "grad_norm": 16.075517654418945, "learning_rate": 9.451451451451453e-06, "loss": 1.0038, "step": 5480 }, { "epoch": 16.49, "grad_norm": 13.073135375976562, "learning_rate": 9.450450450450451e-06, "loss": 0.9587, "step": 5490 }, { "epoch": 16.52, "grad_norm": 16.11018180847168, "learning_rate": 9.44944944944945e-06, "loss": 0.9649, "step": 5500 }, { "epoch": 16.55, "grad_norm": 17.22406578063965, "learning_rate": 9.44844844844845e-06, "loss": 0.8726, "step": 5510 }, { "epoch": 16.58, "grad_norm": 12.76385498046875, "learning_rate": 9.447447447447449e-06, "loss": 0.893, "step": 5520 }, { "epoch": 16.61, "grad_norm": 16.00253677368164, "learning_rate": 9.446446446446447e-06, "loss": 0.9512, "step": 5530 }, { "epoch": 16.64, "grad_norm": 15.32001781463623, "learning_rate": 9.445445445445446e-06, "loss": 0.9346, "step": 5540 }, { "epoch": 16.67, "grad_norm": 17.619962692260742, "learning_rate": 9.444444444444445e-06, "loss": 0.9255, "step": 5550 }, { "epoch": 16.7, "grad_norm": 18.094051361083984, "learning_rate": 9.443443443443445e-06, "loss": 0.942, "step": 5560 }, { "epoch": 16.73, "grad_norm": 14.032143592834473, "learning_rate": 9.442442442442443e-06, "loss": 0.8866, "step": 5570 }, { "epoch": 16.76, "grad_norm": 13.485750198364258, "learning_rate": 9.441441441441442e-06, "loss": 0.9054, "step": 5580 }, { "epoch": 16.79, "grad_norm": 18.393156051635742, "learning_rate": 9.440440440440442e-06, "loss": 0.9418, "step": 5590 }, { "epoch": 16.82, "grad_norm": 21.527530670166016, "learning_rate": 9.439439439439441e-06, "loss": 0.9645, "step": 5600 }, { "epoch": 16.85, "grad_norm": 16.544702529907227, "learning_rate": 9.43843843843844e-06, "loss": 0.9386, "step": 5610 }, { "epoch": 16.88, "grad_norm": 15.755117416381836, "learning_rate": 9.437437437437438e-06, "loss": 0.9343, "step": 5620 }, { "epoch": 16.91, "grad_norm": 19.206357955932617, "learning_rate": 9.436436436436437e-06, "loss": 0.9381, "step": 5630 }, { "epoch": 16.94, "grad_norm": 16.807981491088867, "learning_rate": 9.435435435435435e-06, "loss": 0.8847, "step": 5640 }, { "epoch": 16.97, "grad_norm": 14.00562572479248, "learning_rate": 9.434434434434436e-06, "loss": 1.0009, "step": 5650 }, { "epoch": 17.0, "grad_norm": 25.151762008666992, "learning_rate": 9.433433433433434e-06, "loss": 0.9464, "step": 5660 }, { "epoch": 17.0, "eval_accuracy": 0.8422, "eval_loss": 0.5020662546157837, "eval_runtime": 12.5537, "eval_samples_per_second": 796.58, "eval_steps_per_second": 3.186, "step": 5661 }, { "epoch": 17.03, "grad_norm": 15.472583770751953, "learning_rate": 9.432432432432433e-06, "loss": 1.0577, "step": 5670 }, { "epoch": 17.06, "grad_norm": 16.592697143554688, "learning_rate": 9.431431431431433e-06, "loss": 0.9467, "step": 5680 }, { "epoch": 17.09, "grad_norm": 22.673389434814453, "learning_rate": 9.430430430430432e-06, "loss": 0.9585, "step": 5690 }, { "epoch": 17.12, "grad_norm": 17.902254104614258, "learning_rate": 9.42942942942943e-06, "loss": 0.8856, "step": 5700 }, { "epoch": 17.15, "grad_norm": 20.870149612426758, "learning_rate": 9.428428428428429e-06, "loss": 0.9163, "step": 5710 }, { "epoch": 17.18, "grad_norm": 17.17050552368164, "learning_rate": 9.427427427427427e-06, "loss": 0.9319, "step": 5720 }, { "epoch": 17.21, "grad_norm": 15.78785514831543, "learning_rate": 9.426426426426428e-06, "loss": 0.8843, "step": 5730 }, { "epoch": 17.24, "grad_norm": 15.386320114135742, "learning_rate": 9.425425425425426e-06, "loss": 0.8933, "step": 5740 }, { "epoch": 17.27, "grad_norm": 19.489320755004883, "learning_rate": 9.424424424424425e-06, "loss": 0.8776, "step": 5750 }, { "epoch": 17.3, "grad_norm": 34.896873474121094, "learning_rate": 9.423423423423425e-06, "loss": 0.9789, "step": 5760 }, { "epoch": 17.33, "grad_norm": 11.27894401550293, "learning_rate": 9.422422422422424e-06, "loss": 0.9901, "step": 5770 }, { "epoch": 17.36, "grad_norm": 20.736225128173828, "learning_rate": 9.421421421421422e-06, "loss": 0.9602, "step": 5780 }, { "epoch": 17.39, "grad_norm": 14.630104064941406, "learning_rate": 9.420420420420421e-06, "loss": 0.8607, "step": 5790 }, { "epoch": 17.42, "grad_norm": 16.037254333496094, "learning_rate": 9.41941941941942e-06, "loss": 0.9457, "step": 5800 }, { "epoch": 17.45, "grad_norm": 18.13541603088379, "learning_rate": 9.41841841841842e-06, "loss": 0.9498, "step": 5810 }, { "epoch": 17.48, "grad_norm": 11.522095680236816, "learning_rate": 9.417417417417418e-06, "loss": 0.8653, "step": 5820 }, { "epoch": 17.51, "grad_norm": 16.0896053314209, "learning_rate": 9.416416416416417e-06, "loss": 0.9219, "step": 5830 }, { "epoch": 17.54, "grad_norm": 13.031556129455566, "learning_rate": 9.415415415415416e-06, "loss": 0.8919, "step": 5840 }, { "epoch": 17.57, "grad_norm": 13.607986450195312, "learning_rate": 9.414414414414416e-06, "loss": 0.9314, "step": 5850 }, { "epoch": 17.6, "grad_norm": 21.58922004699707, "learning_rate": 9.413413413413413e-06, "loss": 0.887, "step": 5860 }, { "epoch": 17.63, "grad_norm": 17.312057495117188, "learning_rate": 9.412412412412413e-06, "loss": 0.9299, "step": 5870 }, { "epoch": 17.66, "grad_norm": 17.167619705200195, "learning_rate": 9.411411411411412e-06, "loss": 0.9754, "step": 5880 }, { "epoch": 17.69, "grad_norm": 17.25926399230957, "learning_rate": 9.41041041041041e-06, "loss": 0.8431, "step": 5890 }, { "epoch": 17.72, "grad_norm": 24.359844207763672, "learning_rate": 9.40940940940941e-06, "loss": 0.9661, "step": 5900 }, { "epoch": 17.75, "grad_norm": 17.767169952392578, "learning_rate": 9.40840840840841e-06, "loss": 0.9478, "step": 5910 }, { "epoch": 17.78, "grad_norm": 18.630041122436523, "learning_rate": 9.407407407407408e-06, "loss": 0.8989, "step": 5920 }, { "epoch": 17.81, "grad_norm": 13.352246284484863, "learning_rate": 9.406406406406408e-06, "loss": 0.964, "step": 5930 }, { "epoch": 17.84, "grad_norm": 14.739523887634277, "learning_rate": 9.405405405405407e-06, "loss": 0.9195, "step": 5940 }, { "epoch": 17.87, "grad_norm": 18.194190979003906, "learning_rate": 9.404404404404405e-06, "loss": 0.9098, "step": 5950 }, { "epoch": 17.9, "grad_norm": 20.455427169799805, "learning_rate": 9.403403403403404e-06, "loss": 0.8647, "step": 5960 }, { "epoch": 17.93, "grad_norm": 17.527029037475586, "learning_rate": 9.402402402402402e-06, "loss": 0.967, "step": 5970 }, { "epoch": 17.96, "grad_norm": 16.669889450073242, "learning_rate": 9.401401401401403e-06, "loss": 0.9095, "step": 5980 }, { "epoch": 17.99, "grad_norm": 16.57728385925293, "learning_rate": 9.400400400400401e-06, "loss": 0.8651, "step": 5990 }, { "epoch": 18.0, "eval_accuracy": 0.8483, "eval_loss": 0.4867164194583893, "eval_runtime": 12.5556, "eval_samples_per_second": 796.458, "eval_steps_per_second": 3.186, "step": 5994 }, { "epoch": 18.02, "grad_norm": 16.830820083618164, "learning_rate": 9.3993993993994e-06, "loss": 0.8231, "step": 6000 }, { "epoch": 18.05, "grad_norm": 13.973091125488281, "learning_rate": 9.3983983983984e-06, "loss": 0.8579, "step": 6010 }, { "epoch": 18.08, "grad_norm": 26.990978240966797, "learning_rate": 9.397397397397399e-06, "loss": 0.9031, "step": 6020 }, { "epoch": 18.11, "grad_norm": 19.440040588378906, "learning_rate": 9.396396396396397e-06, "loss": 0.8558, "step": 6030 }, { "epoch": 18.14, "grad_norm": 16.811927795410156, "learning_rate": 9.395395395395396e-06, "loss": 0.8856, "step": 6040 }, { "epoch": 18.17, "grad_norm": 15.919425964355469, "learning_rate": 9.394394394394395e-06, "loss": 0.8952, "step": 6050 }, { "epoch": 18.2, "grad_norm": 14.041596412658691, "learning_rate": 9.393393393393393e-06, "loss": 0.8978, "step": 6060 }, { "epoch": 18.23, "grad_norm": 15.746431350708008, "learning_rate": 9.392392392392394e-06, "loss": 0.9087, "step": 6070 }, { "epoch": 18.26, "grad_norm": 11.961185455322266, "learning_rate": 9.391391391391392e-06, "loss": 0.9387, "step": 6080 }, { "epoch": 18.29, "grad_norm": 18.50293731689453, "learning_rate": 9.39039039039039e-06, "loss": 0.9178, "step": 6090 }, { "epoch": 18.32, "grad_norm": 16.186243057250977, "learning_rate": 9.389389389389391e-06, "loss": 0.9176, "step": 6100 }, { "epoch": 18.35, "grad_norm": 16.879858016967773, "learning_rate": 9.388388388388388e-06, "loss": 0.8795, "step": 6110 }, { "epoch": 18.38, "grad_norm": 14.703543663024902, "learning_rate": 9.387387387387388e-06, "loss": 0.8627, "step": 6120 }, { "epoch": 18.41, "grad_norm": 14.977249145507812, "learning_rate": 9.386386386386387e-06, "loss": 0.9203, "step": 6130 }, { "epoch": 18.44, "grad_norm": 23.646411895751953, "learning_rate": 9.385385385385385e-06, "loss": 0.8878, "step": 6140 }, { "epoch": 18.47, "grad_norm": 20.65953254699707, "learning_rate": 9.384384384384386e-06, "loss": 0.8367, "step": 6150 }, { "epoch": 18.5, "grad_norm": 15.61327838897705, "learning_rate": 9.383383383383384e-06, "loss": 0.8978, "step": 6160 }, { "epoch": 18.53, "grad_norm": 15.82312297821045, "learning_rate": 9.382382382382383e-06, "loss": 0.9158, "step": 6170 }, { "epoch": 18.56, "grad_norm": 15.94821548461914, "learning_rate": 9.381381381381383e-06, "loss": 0.9168, "step": 6180 }, { "epoch": 18.59, "grad_norm": 14.084745407104492, "learning_rate": 9.380380380380382e-06, "loss": 0.914, "step": 6190 }, { "epoch": 18.62, "grad_norm": 14.570650100708008, "learning_rate": 9.37937937937938e-06, "loss": 0.8604, "step": 6200 }, { "epoch": 18.65, "grad_norm": 14.09744930267334, "learning_rate": 9.378378378378379e-06, "loss": 0.8786, "step": 6210 }, { "epoch": 18.68, "grad_norm": 18.788267135620117, "learning_rate": 9.377377377377378e-06, "loss": 0.9293, "step": 6220 }, { "epoch": 18.71, "grad_norm": 15.376623153686523, "learning_rate": 9.376376376376378e-06, "loss": 0.8625, "step": 6230 }, { "epoch": 18.74, "grad_norm": 17.357419967651367, "learning_rate": 9.375375375375376e-06, "loss": 0.9037, "step": 6240 }, { "epoch": 18.77, "grad_norm": 13.640501022338867, "learning_rate": 9.374374374374375e-06, "loss": 0.9497, "step": 6250 }, { "epoch": 18.8, "grad_norm": 15.700154304504395, "learning_rate": 9.373373373373375e-06, "loss": 0.9401, "step": 6260 }, { "epoch": 18.83, "grad_norm": 17.42572784423828, "learning_rate": 9.372372372372374e-06, "loss": 0.9247, "step": 6270 }, { "epoch": 18.86, "grad_norm": 17.637113571166992, "learning_rate": 9.371371371371372e-06, "loss": 0.917, "step": 6280 }, { "epoch": 18.89, "grad_norm": 13.902384757995605, "learning_rate": 9.370370370370371e-06, "loss": 0.8463, "step": 6290 }, { "epoch": 18.92, "grad_norm": 16.326753616333008, "learning_rate": 9.36936936936937e-06, "loss": 0.8557, "step": 6300 }, { "epoch": 18.95, "grad_norm": 14.994547843933105, "learning_rate": 9.368368368368368e-06, "loss": 0.9127, "step": 6310 }, { "epoch": 18.98, "grad_norm": 23.689701080322266, "learning_rate": 9.367367367367369e-06, "loss": 0.8122, "step": 6320 }, { "epoch": 19.0, "eval_accuracy": 0.8457, "eval_loss": 0.48648661375045776, "eval_runtime": 12.9306, "eval_samples_per_second": 773.358, "eval_steps_per_second": 3.093, "step": 6327 }, { "epoch": 19.01, "grad_norm": 14.423419952392578, "learning_rate": 9.366366366366367e-06, "loss": 0.9854, "step": 6330 }, { "epoch": 19.04, "grad_norm": 18.28612518310547, "learning_rate": 9.365365365365366e-06, "loss": 0.9195, "step": 6340 }, { "epoch": 19.07, "grad_norm": 14.99592399597168, "learning_rate": 9.364364364364366e-06, "loss": 0.8935, "step": 6350 }, { "epoch": 19.1, "grad_norm": 16.575809478759766, "learning_rate": 9.363363363363363e-06, "loss": 0.9202, "step": 6360 }, { "epoch": 19.13, "grad_norm": 21.606903076171875, "learning_rate": 9.362362362362363e-06, "loss": 0.9039, "step": 6370 }, { "epoch": 19.16, "grad_norm": 14.512263298034668, "learning_rate": 9.361361361361362e-06, "loss": 0.8536, "step": 6380 }, { "epoch": 19.19, "grad_norm": 19.58755874633789, "learning_rate": 9.36036036036036e-06, "loss": 0.839, "step": 6390 }, { "epoch": 19.22, "grad_norm": 22.35240364074707, "learning_rate": 9.35935935935936e-06, "loss": 0.8474, "step": 6400 }, { "epoch": 19.25, "grad_norm": 12.306565284729004, "learning_rate": 9.35835835835836e-06, "loss": 0.927, "step": 6410 }, { "epoch": 19.28, "grad_norm": 19.176259994506836, "learning_rate": 9.357357357357358e-06, "loss": 0.9118, "step": 6420 }, { "epoch": 19.31, "grad_norm": 16.825773239135742, "learning_rate": 9.356356356356358e-06, "loss": 0.9009, "step": 6430 }, { "epoch": 19.34, "grad_norm": 23.165769577026367, "learning_rate": 9.355355355355357e-06, "loss": 0.8732, "step": 6440 }, { "epoch": 19.37, "grad_norm": 13.343769073486328, "learning_rate": 9.354354354354355e-06, "loss": 0.9431, "step": 6450 }, { "epoch": 19.4, "grad_norm": 13.997703552246094, "learning_rate": 9.353353353353354e-06, "loss": 0.8847, "step": 6460 }, { "epoch": 19.43, "grad_norm": 16.435840606689453, "learning_rate": 9.352352352352353e-06, "loss": 0.8549, "step": 6470 }, { "epoch": 19.46, "grad_norm": 15.106839179992676, "learning_rate": 9.351351351351353e-06, "loss": 0.8082, "step": 6480 }, { "epoch": 19.49, "grad_norm": 18.501834869384766, "learning_rate": 9.350350350350351e-06, "loss": 0.9178, "step": 6490 }, { "epoch": 19.52, "grad_norm": 16.704853057861328, "learning_rate": 9.34934934934935e-06, "loss": 0.8601, "step": 6500 }, { "epoch": 19.55, "grad_norm": 13.834967613220215, "learning_rate": 9.34834834834835e-06, "loss": 0.9497, "step": 6510 }, { "epoch": 19.58, "grad_norm": 14.41886043548584, "learning_rate": 9.347347347347349e-06, "loss": 0.8517, "step": 6520 }, { "epoch": 19.61, "grad_norm": 18.210779190063477, "learning_rate": 9.346346346346346e-06, "loss": 0.8575, "step": 6530 }, { "epoch": 19.64, "grad_norm": 13.764591217041016, "learning_rate": 9.345345345345346e-06, "loss": 0.9236, "step": 6540 }, { "epoch": 19.67, "grad_norm": 19.188827514648438, "learning_rate": 9.344344344344345e-06, "loss": 0.8965, "step": 6550 }, { "epoch": 19.7, "grad_norm": 16.712377548217773, "learning_rate": 9.343343343343343e-06, "loss": 0.8958, "step": 6560 }, { "epoch": 19.73, "grad_norm": 16.866111755371094, "learning_rate": 9.342342342342344e-06, "loss": 0.8984, "step": 6570 }, { "epoch": 19.76, "grad_norm": 16.404024124145508, "learning_rate": 9.341341341341342e-06, "loss": 0.8949, "step": 6580 }, { "epoch": 19.79, "grad_norm": 14.78878402709961, "learning_rate": 9.34034034034034e-06, "loss": 0.852, "step": 6590 }, { "epoch": 19.82, "grad_norm": 19.638397216796875, "learning_rate": 9.339339339339341e-06, "loss": 0.9375, "step": 6600 }, { "epoch": 19.85, "grad_norm": 15.815281867980957, "learning_rate": 9.338338338338338e-06, "loss": 0.922, "step": 6610 }, { "epoch": 19.88, "grad_norm": 12.827592849731445, "learning_rate": 9.337337337337338e-06, "loss": 0.8804, "step": 6620 }, { "epoch": 19.91, "grad_norm": 16.541942596435547, "learning_rate": 9.336336336336337e-06, "loss": 0.9358, "step": 6630 }, { "epoch": 19.94, "grad_norm": 17.47844123840332, "learning_rate": 9.335335335335335e-06, "loss": 0.9289, "step": 6640 }, { "epoch": 19.97, "grad_norm": 15.426770210266113, "learning_rate": 9.334334334334336e-06, "loss": 0.8539, "step": 6650 }, { "epoch": 20.0, "grad_norm": 25.500600814819336, "learning_rate": 9.333333333333334e-06, "loss": 0.7918, "step": 6660 }, { "epoch": 20.0, "eval_accuracy": 0.8486, "eval_loss": 0.4876756966114044, "eval_runtime": 12.6548, "eval_samples_per_second": 790.216, "eval_steps_per_second": 3.161, "step": 6660 }, { "epoch": 20.03, "grad_norm": 20.535430908203125, "learning_rate": 9.332332332332333e-06, "loss": 0.8509, "step": 6670 }, { "epoch": 20.06, "grad_norm": 15.317164421081543, "learning_rate": 9.331331331331333e-06, "loss": 0.8385, "step": 6680 }, { "epoch": 20.09, "grad_norm": 21.33066749572754, "learning_rate": 9.330330330330332e-06, "loss": 0.8901, "step": 6690 }, { "epoch": 20.12, "grad_norm": 13.255439758300781, "learning_rate": 9.32932932932933e-06, "loss": 0.8398, "step": 6700 }, { "epoch": 20.15, "grad_norm": 17.686439514160156, "learning_rate": 9.328328328328329e-06, "loss": 0.716, "step": 6710 }, { "epoch": 20.18, "grad_norm": 17.935535430908203, "learning_rate": 9.327327327327328e-06, "loss": 0.8567, "step": 6720 }, { "epoch": 20.21, "grad_norm": 12.795973777770996, "learning_rate": 9.326326326326328e-06, "loss": 0.876, "step": 6730 }, { "epoch": 20.24, "grad_norm": 12.963747024536133, "learning_rate": 9.325325325325326e-06, "loss": 0.8559, "step": 6740 }, { "epoch": 20.27, "grad_norm": 20.37497329711914, "learning_rate": 9.324324324324325e-06, "loss": 0.9035, "step": 6750 }, { "epoch": 20.3, "grad_norm": 15.318814277648926, "learning_rate": 9.323323323323324e-06, "loss": 0.8254, "step": 6760 }, { "epoch": 20.33, "grad_norm": 14.240460395812988, "learning_rate": 9.322322322322324e-06, "loss": 0.8645, "step": 6770 }, { "epoch": 20.36, "grad_norm": 17.571277618408203, "learning_rate": 9.321321321321321e-06, "loss": 0.9952, "step": 6780 }, { "epoch": 20.39, "grad_norm": 22.65481948852539, "learning_rate": 9.320320320320321e-06, "loss": 0.8574, "step": 6790 }, { "epoch": 20.42, "grad_norm": 20.18941879272461, "learning_rate": 9.31931931931932e-06, "loss": 0.9161, "step": 6800 }, { "epoch": 20.45, "grad_norm": 15.671890258789062, "learning_rate": 9.318318318318318e-06, "loss": 0.8537, "step": 6810 }, { "epoch": 20.48, "grad_norm": 15.43073844909668, "learning_rate": 9.317317317317319e-06, "loss": 0.9022, "step": 6820 }, { "epoch": 20.51, "grad_norm": 12.564041137695312, "learning_rate": 9.316316316316317e-06, "loss": 0.858, "step": 6830 }, { "epoch": 20.54, "grad_norm": 14.09218978881836, "learning_rate": 9.315315315315316e-06, "loss": 0.8307, "step": 6840 }, { "epoch": 20.57, "grad_norm": 17.242481231689453, "learning_rate": 9.314314314314316e-06, "loss": 0.8369, "step": 6850 }, { "epoch": 20.6, "grad_norm": 14.80042552947998, "learning_rate": 9.313313313313313e-06, "loss": 0.9467, "step": 6860 }, { "epoch": 20.63, "grad_norm": 14.18408203125, "learning_rate": 9.312312312312313e-06, "loss": 0.8499, "step": 6870 }, { "epoch": 20.66, "grad_norm": 14.217787742614746, "learning_rate": 9.311311311311312e-06, "loss": 0.8993, "step": 6880 }, { "epoch": 20.69, "grad_norm": 15.242293357849121, "learning_rate": 9.31031031031031e-06, "loss": 0.8853, "step": 6890 }, { "epoch": 20.72, "grad_norm": 12.323200225830078, "learning_rate": 9.30930930930931e-06, "loss": 0.8234, "step": 6900 }, { "epoch": 20.75, "grad_norm": 17.57102394104004, "learning_rate": 9.30830830830831e-06, "loss": 0.8901, "step": 6910 }, { "epoch": 20.78, "grad_norm": 10.80214786529541, "learning_rate": 9.307307307307308e-06, "loss": 0.8077, "step": 6920 }, { "epoch": 20.81, "grad_norm": 11.871294021606445, "learning_rate": 9.306306306306308e-06, "loss": 0.8732, "step": 6930 }, { "epoch": 20.84, "grad_norm": 14.237271308898926, "learning_rate": 9.305305305305305e-06, "loss": 0.8879, "step": 6940 }, { "epoch": 20.87, "grad_norm": 16.12568473815918, "learning_rate": 9.304304304304305e-06, "loss": 0.9051, "step": 6950 }, { "epoch": 20.9, "grad_norm": 16.860570907592773, "learning_rate": 9.303303303303304e-06, "loss": 0.9073, "step": 6960 }, { "epoch": 20.93, "grad_norm": 13.383695602416992, "learning_rate": 9.302302302302303e-06, "loss": 0.8684, "step": 6970 }, { "epoch": 20.96, "grad_norm": 16.58403205871582, "learning_rate": 9.301301301301301e-06, "loss": 0.8682, "step": 6980 }, { "epoch": 20.99, "grad_norm": 14.263192176818848, "learning_rate": 9.300300300300302e-06, "loss": 0.8994, "step": 6990 }, { "epoch": 21.0, "eval_accuracy": 0.8502, "eval_loss": 0.4836479425430298, "eval_runtime": 12.9867, "eval_samples_per_second": 770.017, "eval_steps_per_second": 3.08, "step": 6993 }, { "epoch": 21.02, "grad_norm": 14.633737564086914, "learning_rate": 9.2992992992993e-06, "loss": 0.9701, "step": 7000 }, { "epoch": 21.05, "grad_norm": 16.935035705566406, "learning_rate": 9.298298298298299e-06, "loss": 0.8863, "step": 7010 }, { "epoch": 21.08, "grad_norm": 14.481687545776367, "learning_rate": 9.297297297297299e-06, "loss": 0.84, "step": 7020 }, { "epoch": 21.11, "grad_norm": 11.950042724609375, "learning_rate": 9.296296296296296e-06, "loss": 0.794, "step": 7030 }, { "epoch": 21.14, "grad_norm": 18.95380210876465, "learning_rate": 9.295295295295296e-06, "loss": 0.8477, "step": 7040 }, { "epoch": 21.17, "grad_norm": 13.028655052185059, "learning_rate": 9.294294294294295e-06, "loss": 0.8056, "step": 7050 }, { "epoch": 21.2, "grad_norm": 16.111392974853516, "learning_rate": 9.293293293293293e-06, "loss": 0.8293, "step": 7060 }, { "epoch": 21.23, "grad_norm": 28.997385025024414, "learning_rate": 9.292292292292294e-06, "loss": 0.8456, "step": 7070 }, { "epoch": 21.26, "grad_norm": 16.135433197021484, "learning_rate": 9.291291291291292e-06, "loss": 0.8633, "step": 7080 }, { "epoch": 21.29, "grad_norm": 18.188039779663086, "learning_rate": 9.29029029029029e-06, "loss": 0.9532, "step": 7090 }, { "epoch": 21.32, "grad_norm": 18.010501861572266, "learning_rate": 9.289289289289291e-06, "loss": 0.7915, "step": 7100 }, { "epoch": 21.35, "grad_norm": 17.852554321289062, "learning_rate": 9.288288288288288e-06, "loss": 0.8928, "step": 7110 }, { "epoch": 21.38, "grad_norm": 15.39466667175293, "learning_rate": 9.287287287287288e-06, "loss": 0.7896, "step": 7120 }, { "epoch": 21.41, "grad_norm": 16.305221557617188, "learning_rate": 9.286286286286287e-06, "loss": 0.9138, "step": 7130 }, { "epoch": 21.44, "grad_norm": 14.550299644470215, "learning_rate": 9.285285285285286e-06, "loss": 0.916, "step": 7140 }, { "epoch": 21.47, "grad_norm": 18.119386672973633, "learning_rate": 9.284284284284286e-06, "loss": 0.8451, "step": 7150 }, { "epoch": 21.5, "grad_norm": 21.804479598999023, "learning_rate": 9.283283283283284e-06, "loss": 0.887, "step": 7160 }, { "epoch": 21.53, "grad_norm": 15.080596923828125, "learning_rate": 9.282282282282283e-06, "loss": 0.9324, "step": 7170 }, { "epoch": 21.56, "grad_norm": 15.653203010559082, "learning_rate": 9.281281281281283e-06, "loss": 0.8725, "step": 7180 }, { "epoch": 21.59, "grad_norm": 13.02376937866211, "learning_rate": 9.28028028028028e-06, "loss": 0.8557, "step": 7190 }, { "epoch": 21.62, "grad_norm": 16.726030349731445, "learning_rate": 9.27927927927928e-06, "loss": 0.804, "step": 7200 }, { "epoch": 21.65, "grad_norm": 24.801246643066406, "learning_rate": 9.278278278278279e-06, "loss": 0.895, "step": 7210 }, { "epoch": 21.68, "grad_norm": 18.446496963500977, "learning_rate": 9.277277277277278e-06, "loss": 0.9239, "step": 7220 }, { "epoch": 21.71, "grad_norm": 19.611867904663086, "learning_rate": 9.276276276276276e-06, "loss": 0.7734, "step": 7230 }, { "epoch": 21.74, "grad_norm": 17.296485900878906, "learning_rate": 9.275275275275277e-06, "loss": 0.8745, "step": 7240 }, { "epoch": 21.77, "grad_norm": 22.829599380493164, "learning_rate": 9.274274274274275e-06, "loss": 0.8779, "step": 7250 }, { "epoch": 21.8, "grad_norm": 17.0224552154541, "learning_rate": 9.273273273273274e-06, "loss": 0.8653, "step": 7260 }, { "epoch": 21.83, "grad_norm": 19.823801040649414, "learning_rate": 9.272272272272274e-06, "loss": 0.8803, "step": 7270 }, { "epoch": 21.86, "grad_norm": 13.497579574584961, "learning_rate": 9.271271271271271e-06, "loss": 0.8718, "step": 7280 }, { "epoch": 21.89, "grad_norm": 13.915096282958984, "learning_rate": 9.270270270270271e-06, "loss": 0.8566, "step": 7290 }, { "epoch": 21.92, "grad_norm": 19.492271423339844, "learning_rate": 9.26926926926927e-06, "loss": 0.8285, "step": 7300 }, { "epoch": 21.95, "grad_norm": 11.919459342956543, "learning_rate": 9.268268268268268e-06, "loss": 0.859, "step": 7310 }, { "epoch": 21.98, "grad_norm": 12.806656837463379, "learning_rate": 9.267267267267269e-06, "loss": 0.8661, "step": 7320 }, { "epoch": 22.0, "eval_accuracy": 0.8538, "eval_loss": 0.47356659173965454, "eval_runtime": 12.8169, "eval_samples_per_second": 780.217, "eval_steps_per_second": 3.121, "step": 7326 }, { "epoch": 22.01, "grad_norm": 13.644493103027344, "learning_rate": 9.266266266266267e-06, "loss": 0.9675, "step": 7330 }, { "epoch": 22.04, "grad_norm": 18.335735321044922, "learning_rate": 9.265265265265266e-06, "loss": 0.8491, "step": 7340 }, { "epoch": 22.07, "grad_norm": 15.356688499450684, "learning_rate": 9.264264264264266e-06, "loss": 0.85, "step": 7350 }, { "epoch": 22.1, "grad_norm": 16.97271156311035, "learning_rate": 9.263263263263263e-06, "loss": 0.8038, "step": 7360 }, { "epoch": 22.13, "grad_norm": 15.182245254516602, "learning_rate": 9.262262262262263e-06, "loss": 0.8134, "step": 7370 }, { "epoch": 22.16, "grad_norm": 15.863039016723633, "learning_rate": 9.261261261261262e-06, "loss": 0.7958, "step": 7380 }, { "epoch": 22.19, "grad_norm": 11.823266983032227, "learning_rate": 9.26026026026026e-06, "loss": 0.8699, "step": 7390 }, { "epoch": 22.22, "grad_norm": 14.815451622009277, "learning_rate": 9.25925925925926e-06, "loss": 0.8234, "step": 7400 }, { "epoch": 22.25, "grad_norm": 20.885053634643555, "learning_rate": 9.25825825825826e-06, "loss": 0.8443, "step": 7410 }, { "epoch": 22.28, "grad_norm": 14.285574913024902, "learning_rate": 9.257257257257258e-06, "loss": 0.845, "step": 7420 }, { "epoch": 22.31, "grad_norm": 17.085548400878906, "learning_rate": 9.256256256256257e-06, "loss": 0.8106, "step": 7430 }, { "epoch": 22.34, "grad_norm": 27.535512924194336, "learning_rate": 9.255255255255255e-06, "loss": 0.8112, "step": 7440 }, { "epoch": 22.37, "grad_norm": 15.75393295288086, "learning_rate": 9.254254254254254e-06, "loss": 0.8391, "step": 7450 }, { "epoch": 22.4, "grad_norm": 22.823278427124023, "learning_rate": 9.253253253253254e-06, "loss": 0.8324, "step": 7460 }, { "epoch": 22.43, "grad_norm": 20.358417510986328, "learning_rate": 9.252252252252253e-06, "loss": 0.8966, "step": 7470 }, { "epoch": 22.46, "grad_norm": 15.353719711303711, "learning_rate": 9.251251251251251e-06, "loss": 0.8763, "step": 7480 }, { "epoch": 22.49, "grad_norm": 16.366374969482422, "learning_rate": 9.250250250250252e-06, "loss": 0.8872, "step": 7490 }, { "epoch": 22.52, "grad_norm": 20.56578254699707, "learning_rate": 9.24924924924925e-06, "loss": 0.8348, "step": 7500 }, { "epoch": 22.55, "grad_norm": 14.54299259185791, "learning_rate": 9.248248248248249e-06, "loss": 0.8175, "step": 7510 }, { "epoch": 22.58, "grad_norm": 25.86806869506836, "learning_rate": 9.247247247247249e-06, "loss": 0.8123, "step": 7520 }, { "epoch": 22.61, "grad_norm": 14.958423614501953, "learning_rate": 9.246246246246246e-06, "loss": 0.8022, "step": 7530 }, { "epoch": 22.64, "grad_norm": 15.241181373596191, "learning_rate": 9.245245245245246e-06, "loss": 0.8186, "step": 7540 }, { "epoch": 22.67, "grad_norm": 13.112909317016602, "learning_rate": 9.244244244244245e-06, "loss": 0.8263, "step": 7550 }, { "epoch": 22.7, "grad_norm": 17.80851173400879, "learning_rate": 9.243243243243243e-06, "loss": 0.875, "step": 7560 }, { "epoch": 22.73, "grad_norm": 18.219003677368164, "learning_rate": 9.242242242242244e-06, "loss": 0.8504, "step": 7570 }, { "epoch": 22.76, "grad_norm": 15.969429969787598, "learning_rate": 9.241241241241242e-06, "loss": 0.9018, "step": 7580 }, { "epoch": 22.79, "grad_norm": 14.084677696228027, "learning_rate": 9.240240240240241e-06, "loss": 0.8464, "step": 7590 }, { "epoch": 22.82, "grad_norm": 12.595839500427246, "learning_rate": 9.239239239239241e-06, "loss": 0.8395, "step": 7600 }, { "epoch": 22.85, "grad_norm": 14.709906578063965, "learning_rate": 9.238238238238238e-06, "loss": 0.9069, "step": 7610 }, { "epoch": 22.88, "grad_norm": 15.314925193786621, "learning_rate": 9.237237237237238e-06, "loss": 0.8889, "step": 7620 }, { "epoch": 22.91, "grad_norm": 16.0838680267334, "learning_rate": 9.236236236236237e-06, "loss": 0.8378, "step": 7630 }, { "epoch": 22.94, "grad_norm": 22.73562240600586, "learning_rate": 9.235235235235236e-06, "loss": 0.8536, "step": 7640 }, { "epoch": 22.97, "grad_norm": 15.428640365600586, "learning_rate": 9.234234234234236e-06, "loss": 0.869, "step": 7650 }, { "epoch": 23.0, "eval_accuracy": 0.8528, "eval_loss": 0.4703277349472046, "eval_runtime": 12.7652, "eval_samples_per_second": 783.377, "eval_steps_per_second": 3.134, "step": 7659 }, { "epoch": 23.0, "grad_norm": 15.16240119934082, "learning_rate": 9.233233233233234e-06, "loss": 0.7846, "step": 7660 }, { "epoch": 23.03, "grad_norm": 19.369976043701172, "learning_rate": 9.232232232232233e-06, "loss": 0.8101, "step": 7670 }, { "epoch": 23.06, "grad_norm": 17.12249183654785, "learning_rate": 9.231231231231232e-06, "loss": 0.8754, "step": 7680 }, { "epoch": 23.09, "grad_norm": 15.322559356689453, "learning_rate": 9.23023023023023e-06, "loss": 0.8196, "step": 7690 }, { "epoch": 23.12, "grad_norm": 17.422138214111328, "learning_rate": 9.229229229229229e-06, "loss": 0.7734, "step": 7700 }, { "epoch": 23.15, "grad_norm": 14.549483299255371, "learning_rate": 9.228228228228229e-06, "loss": 0.8335, "step": 7710 }, { "epoch": 23.18, "grad_norm": 14.285603523254395, "learning_rate": 9.227227227227228e-06, "loss": 0.9079, "step": 7720 }, { "epoch": 23.21, "grad_norm": 15.609331130981445, "learning_rate": 9.226226226226226e-06, "loss": 0.7925, "step": 7730 }, { "epoch": 23.24, "grad_norm": 15.03654670715332, "learning_rate": 9.225225225225227e-06, "loss": 0.8232, "step": 7740 }, { "epoch": 23.27, "grad_norm": 15.817977905273438, "learning_rate": 9.224224224224225e-06, "loss": 0.8593, "step": 7750 }, { "epoch": 23.3, "grad_norm": 17.551942825317383, "learning_rate": 9.223223223223224e-06, "loss": 0.8224, "step": 7760 }, { "epoch": 23.33, "grad_norm": 15.3309907913208, "learning_rate": 9.222222222222224e-06, "loss": 0.8204, "step": 7770 }, { "epoch": 23.36, "grad_norm": 18.83182144165039, "learning_rate": 9.221221221221221e-06, "loss": 0.8573, "step": 7780 }, { "epoch": 23.39, "grad_norm": 18.249195098876953, "learning_rate": 9.220220220220221e-06, "loss": 0.7706, "step": 7790 }, { "epoch": 23.42, "grad_norm": 17.53611946105957, "learning_rate": 9.21921921921922e-06, "loss": 0.8571, "step": 7800 }, { "epoch": 23.45, "grad_norm": 18.392507553100586, "learning_rate": 9.218218218218218e-06, "loss": 0.7857, "step": 7810 }, { "epoch": 23.48, "grad_norm": 16.387056350708008, "learning_rate": 9.217217217217219e-06, "loss": 0.7913, "step": 7820 }, { "epoch": 23.51, "grad_norm": 14.469799995422363, "learning_rate": 9.216216216216217e-06, "loss": 0.8383, "step": 7830 }, { "epoch": 23.54, "grad_norm": 15.943617820739746, "learning_rate": 9.215215215215216e-06, "loss": 0.782, "step": 7840 }, { "epoch": 23.57, "grad_norm": 21.93720054626465, "learning_rate": 9.214214214214216e-06, "loss": 0.8644, "step": 7850 }, { "epoch": 23.6, "grad_norm": 16.22055435180664, "learning_rate": 9.213213213213213e-06, "loss": 0.7544, "step": 7860 }, { "epoch": 23.63, "grad_norm": 15.785012245178223, "learning_rate": 9.212212212212213e-06, "loss": 0.7947, "step": 7870 }, { "epoch": 23.66, "grad_norm": 14.264601707458496, "learning_rate": 9.211211211211212e-06, "loss": 0.8322, "step": 7880 }, { "epoch": 23.69, "grad_norm": 13.659516334533691, "learning_rate": 9.21021021021021e-06, "loss": 0.8537, "step": 7890 }, { "epoch": 23.72, "grad_norm": 21.310932159423828, "learning_rate": 9.20920920920921e-06, "loss": 0.8742, "step": 7900 }, { "epoch": 23.75, "grad_norm": 15.599264144897461, "learning_rate": 9.20820820820821e-06, "loss": 0.822, "step": 7910 }, { "epoch": 23.78, "grad_norm": 24.196393966674805, "learning_rate": 9.207207207207208e-06, "loss": 0.8751, "step": 7920 }, { "epoch": 23.81, "grad_norm": 15.778040885925293, "learning_rate": 9.206206206206207e-06, "loss": 0.8686, "step": 7930 }, { "epoch": 23.84, "grad_norm": 18.238977432250977, "learning_rate": 9.205205205205205e-06, "loss": 0.8613, "step": 7940 }, { "epoch": 23.87, "grad_norm": 14.943485260009766, "learning_rate": 9.204204204204204e-06, "loss": 0.8041, "step": 7950 }, { "epoch": 23.9, "grad_norm": 12.482244491577148, "learning_rate": 9.203203203203204e-06, "loss": 0.776, "step": 7960 }, { "epoch": 23.93, "grad_norm": 16.469688415527344, "learning_rate": 9.202202202202203e-06, "loss": 0.8753, "step": 7970 }, { "epoch": 23.96, "grad_norm": 15.951470375061035, "learning_rate": 9.201201201201201e-06, "loss": 0.8528, "step": 7980 }, { "epoch": 23.99, "grad_norm": 19.05841827392578, "learning_rate": 9.200200200200202e-06, "loss": 0.8681, "step": 7990 }, { "epoch": 24.0, "eval_accuracy": 0.8513, "eval_loss": 0.47983139753341675, "eval_runtime": 12.8651, "eval_samples_per_second": 777.298, "eval_steps_per_second": 3.109, "step": 7992 }, { "epoch": 24.02, "grad_norm": 13.12854290008545, "learning_rate": 9.1991991991992e-06, "loss": 0.7567, "step": 8000 }, { "epoch": 24.05, "grad_norm": 13.093955039978027, "learning_rate": 9.198198198198199e-06, "loss": 0.7827, "step": 8010 }, { "epoch": 24.08, "grad_norm": 17.031505584716797, "learning_rate": 9.197197197197199e-06, "loss": 0.8131, "step": 8020 }, { "epoch": 24.11, "grad_norm": 15.323765754699707, "learning_rate": 9.196196196196196e-06, "loss": 0.7802, "step": 8030 }, { "epoch": 24.14, "grad_norm": 14.65307903289795, "learning_rate": 9.195195195195196e-06, "loss": 0.8292, "step": 8040 }, { "epoch": 24.17, "grad_norm": 20.415098190307617, "learning_rate": 9.194194194194195e-06, "loss": 0.8012, "step": 8050 }, { "epoch": 24.2, "grad_norm": 16.29452133178711, "learning_rate": 9.193193193193194e-06, "loss": 0.7967, "step": 8060 }, { "epoch": 24.23, "grad_norm": 18.950679779052734, "learning_rate": 9.192192192192194e-06, "loss": 0.8356, "step": 8070 }, { "epoch": 24.26, "grad_norm": 18.14234161376953, "learning_rate": 9.191191191191192e-06, "loss": 0.8139, "step": 8080 }, { "epoch": 24.29, "grad_norm": 13.386857986450195, "learning_rate": 9.190190190190191e-06, "loss": 0.8111, "step": 8090 }, { "epoch": 24.32, "grad_norm": 16.188678741455078, "learning_rate": 9.189189189189191e-06, "loss": 0.8458, "step": 8100 }, { "epoch": 24.35, "grad_norm": 18.754507064819336, "learning_rate": 9.188188188188188e-06, "loss": 0.8107, "step": 8110 }, { "epoch": 24.38, "grad_norm": 15.322908401489258, "learning_rate": 9.187187187187187e-06, "loss": 0.8701, "step": 8120 }, { "epoch": 24.41, "grad_norm": 12.777388572692871, "learning_rate": 9.186186186186187e-06, "loss": 0.8625, "step": 8130 }, { "epoch": 24.44, "grad_norm": 15.782909393310547, "learning_rate": 9.185185185185186e-06, "loss": 0.8565, "step": 8140 }, { "epoch": 24.47, "grad_norm": 12.911823272705078, "learning_rate": 9.184184184184184e-06, "loss": 0.8595, "step": 8150 }, { "epoch": 24.5, "grad_norm": 15.34666633605957, "learning_rate": 9.183183183183185e-06, "loss": 0.8232, "step": 8160 }, { "epoch": 24.53, "grad_norm": 14.44251537322998, "learning_rate": 9.182182182182183e-06, "loss": 0.7846, "step": 8170 }, { "epoch": 24.56, "grad_norm": 14.62682819366455, "learning_rate": 9.181181181181182e-06, "loss": 0.8122, "step": 8180 }, { "epoch": 24.59, "grad_norm": 11.911892890930176, "learning_rate": 9.18018018018018e-06, "loss": 0.8487, "step": 8190 }, { "epoch": 24.62, "grad_norm": 17.382272720336914, "learning_rate": 9.179179179179179e-06, "loss": 0.7474, "step": 8200 }, { "epoch": 24.65, "grad_norm": 13.493112564086914, "learning_rate": 9.17817817817818e-06, "loss": 0.8379, "step": 8210 }, { "epoch": 24.68, "grad_norm": 13.13019847869873, "learning_rate": 9.177177177177178e-06, "loss": 0.788, "step": 8220 }, { "epoch": 24.71, "grad_norm": 17.745746612548828, "learning_rate": 9.176176176176176e-06, "loss": 0.8313, "step": 8230 }, { "epoch": 24.74, "grad_norm": 16.05807113647461, "learning_rate": 9.175175175175177e-06, "loss": 0.8015, "step": 8240 }, { "epoch": 24.77, "grad_norm": 17.024206161499023, "learning_rate": 9.174174174174175e-06, "loss": 0.8383, "step": 8250 }, { "epoch": 24.8, "grad_norm": 16.1260986328125, "learning_rate": 9.173173173173174e-06, "loss": 0.8483, "step": 8260 }, { "epoch": 24.83, "grad_norm": 16.126651763916016, "learning_rate": 9.172172172172172e-06, "loss": 0.8526, "step": 8270 }, { "epoch": 24.86, "grad_norm": 12.465509414672852, "learning_rate": 9.171171171171171e-06, "loss": 0.7497, "step": 8280 }, { "epoch": 24.89, "grad_norm": 14.012617111206055, "learning_rate": 9.170170170170171e-06, "loss": 0.8264, "step": 8290 }, { "epoch": 24.92, "grad_norm": 15.739836692810059, "learning_rate": 9.16916916916917e-06, "loss": 0.7831, "step": 8300 }, { "epoch": 24.95, "grad_norm": 16.17227554321289, "learning_rate": 9.168168168168169e-06, "loss": 0.8209, "step": 8310 }, { "epoch": 24.98, "grad_norm": 14.855598449707031, "learning_rate": 9.167167167167169e-06, "loss": 0.7693, "step": 8320 }, { "epoch": 25.0, "eval_accuracy": 0.8523, "eval_loss": 0.4679972529411316, "eval_runtime": 12.7295, "eval_samples_per_second": 785.579, "eval_steps_per_second": 3.142, "step": 8325 }, { "epoch": 25.02, "grad_norm": 19.4244384765625, "learning_rate": 9.166166166166167e-06, "loss": 0.8131, "step": 8330 }, { "epoch": 25.05, "grad_norm": 17.67390251159668, "learning_rate": 9.165165165165166e-06, "loss": 0.7675, "step": 8340 }, { "epoch": 25.08, "grad_norm": 16.680253982543945, "learning_rate": 9.164164164164165e-06, "loss": 0.8094, "step": 8350 }, { "epoch": 25.11, "grad_norm": 19.503707885742188, "learning_rate": 9.163163163163163e-06, "loss": 0.8312, "step": 8360 }, { "epoch": 25.14, "grad_norm": 19.490549087524414, "learning_rate": 9.162162162162162e-06, "loss": 0.8129, "step": 8370 }, { "epoch": 25.17, "grad_norm": 15.471096992492676, "learning_rate": 9.161161161161162e-06, "loss": 0.8767, "step": 8380 }, { "epoch": 25.2, "grad_norm": 16.655948638916016, "learning_rate": 9.16016016016016e-06, "loss": 0.8074, "step": 8390 }, { "epoch": 25.23, "grad_norm": 24.472150802612305, "learning_rate": 9.15915915915916e-06, "loss": 0.6905, "step": 8400 }, { "epoch": 25.26, "grad_norm": 12.948098182678223, "learning_rate": 9.15815815815816e-06, "loss": 0.7783, "step": 8410 }, { "epoch": 25.29, "grad_norm": 20.246353149414062, "learning_rate": 9.157157157157158e-06, "loss": 0.7865, "step": 8420 }, { "epoch": 25.32, "grad_norm": 16.572525024414062, "learning_rate": 9.156156156156157e-06, "loss": 0.8671, "step": 8430 }, { "epoch": 25.35, "grad_norm": 13.740578651428223, "learning_rate": 9.155155155155155e-06, "loss": 0.7599, "step": 8440 }, { "epoch": 25.38, "grad_norm": 19.841176986694336, "learning_rate": 9.154154154154154e-06, "loss": 0.8961, "step": 8450 }, { "epoch": 25.41, "grad_norm": 14.382325172424316, "learning_rate": 9.153153153153154e-06, "loss": 0.8425, "step": 8460 }, { "epoch": 25.44, "grad_norm": 16.1561336517334, "learning_rate": 9.152152152152153e-06, "loss": 0.8186, "step": 8470 }, { "epoch": 25.47, "grad_norm": 17.0059871673584, "learning_rate": 9.151151151151151e-06, "loss": 0.746, "step": 8480 }, { "epoch": 25.5, "grad_norm": 15.718467712402344, "learning_rate": 9.150150150150152e-06, "loss": 0.8708, "step": 8490 }, { "epoch": 25.53, "grad_norm": 16.485553741455078, "learning_rate": 9.14914914914915e-06, "loss": 0.8399, "step": 8500 }, { "epoch": 25.56, "grad_norm": 14.97546672821045, "learning_rate": 9.148148148148149e-06, "loss": 0.7853, "step": 8510 }, { "epoch": 25.59, "grad_norm": 15.825803756713867, "learning_rate": 9.147147147147147e-06, "loss": 0.7837, "step": 8520 }, { "epoch": 25.62, "grad_norm": 17.98794174194336, "learning_rate": 9.146146146146146e-06, "loss": 0.8777, "step": 8530 }, { "epoch": 25.65, "grad_norm": 16.677520751953125, "learning_rate": 9.145145145145146e-06, "loss": 0.8902, "step": 8540 }, { "epoch": 25.68, "grad_norm": 18.61178970336914, "learning_rate": 9.144144144144145e-06, "loss": 0.8479, "step": 8550 }, { "epoch": 25.71, "grad_norm": 15.579206466674805, "learning_rate": 9.143143143143144e-06, "loss": 0.8276, "step": 8560 }, { "epoch": 25.74, "grad_norm": 20.39088249206543, "learning_rate": 9.142142142142144e-06, "loss": 0.8162, "step": 8570 }, { "epoch": 25.77, "grad_norm": 17.708942413330078, "learning_rate": 9.141141141141142e-06, "loss": 0.8538, "step": 8580 }, { "epoch": 25.8, "grad_norm": 23.83163070678711, "learning_rate": 9.140140140140141e-06, "loss": 0.8433, "step": 8590 }, { "epoch": 25.83, "grad_norm": 17.273645401000977, "learning_rate": 9.13913913913914e-06, "loss": 0.8071, "step": 8600 }, { "epoch": 25.86, "grad_norm": 14.331090927124023, "learning_rate": 9.138138138138138e-06, "loss": 0.8062, "step": 8610 }, { "epoch": 25.89, "grad_norm": 13.921991348266602, "learning_rate": 9.137137137137137e-06, "loss": 0.8676, "step": 8620 }, { "epoch": 25.92, "grad_norm": 12.722929954528809, "learning_rate": 9.136136136136137e-06, "loss": 0.8033, "step": 8630 }, { "epoch": 25.95, "grad_norm": 20.208118438720703, "learning_rate": 9.135135135135136e-06, "loss": 0.8174, "step": 8640 }, { "epoch": 25.98, "grad_norm": 20.171621322631836, "learning_rate": 9.134134134134134e-06, "loss": 0.8693, "step": 8650 }, { "epoch": 26.0, "eval_accuracy": 0.8579, "eval_loss": 0.46457383036613464, "eval_runtime": 12.908, "eval_samples_per_second": 774.716, "eval_steps_per_second": 3.099, "step": 8658 }, { "epoch": 26.01, "grad_norm": 13.121548652648926, "learning_rate": 9.133133133133135e-06, "loss": 0.7968, "step": 8660 }, { "epoch": 26.04, "grad_norm": 14.341508865356445, "learning_rate": 9.132132132132133e-06, "loss": 0.797, "step": 8670 }, { "epoch": 26.07, "grad_norm": 15.286103248596191, "learning_rate": 9.131131131131132e-06, "loss": 0.7673, "step": 8680 }, { "epoch": 26.1, "grad_norm": 14.079110145568848, "learning_rate": 9.13013013013013e-06, "loss": 0.7776, "step": 8690 }, { "epoch": 26.13, "grad_norm": 15.827554702758789, "learning_rate": 9.129129129129129e-06, "loss": 0.7723, "step": 8700 }, { "epoch": 26.16, "grad_norm": 14.014419555664062, "learning_rate": 9.12812812812813e-06, "loss": 0.787, "step": 8710 }, { "epoch": 26.19, "grad_norm": 18.37980842590332, "learning_rate": 9.127127127127128e-06, "loss": 0.8254, "step": 8720 }, { "epoch": 26.22, "grad_norm": 15.558728218078613, "learning_rate": 9.126126126126126e-06, "loss": 0.7796, "step": 8730 }, { "epoch": 26.25, "grad_norm": 20.04802894592285, "learning_rate": 9.125125125125127e-06, "loss": 0.7613, "step": 8740 }, { "epoch": 26.28, "grad_norm": 19.22740936279297, "learning_rate": 9.124124124124125e-06, "loss": 0.7476, "step": 8750 }, { "epoch": 26.31, "grad_norm": 15.683467864990234, "learning_rate": 9.123123123123124e-06, "loss": 0.8745, "step": 8760 }, { "epoch": 26.34, "grad_norm": 15.375941276550293, "learning_rate": 9.122122122122123e-06, "loss": 0.7979, "step": 8770 }, { "epoch": 26.37, "grad_norm": 23.70912742614746, "learning_rate": 9.121121121121121e-06, "loss": 0.8757, "step": 8780 }, { "epoch": 26.4, "grad_norm": 16.594446182250977, "learning_rate": 9.120120120120121e-06, "loss": 0.8381, "step": 8790 }, { "epoch": 26.43, "grad_norm": 20.395078659057617, "learning_rate": 9.11911911911912e-06, "loss": 0.8804, "step": 8800 }, { "epoch": 26.46, "grad_norm": 15.868534088134766, "learning_rate": 9.118118118118119e-06, "loss": 0.8761, "step": 8810 }, { "epoch": 26.49, "grad_norm": 18.077486038208008, "learning_rate": 9.117117117117117e-06, "loss": 0.8425, "step": 8820 }, { "epoch": 26.52, "grad_norm": 15.2352294921875, "learning_rate": 9.116116116116117e-06, "loss": 0.7712, "step": 8830 }, { "epoch": 26.55, "grad_norm": 14.5189208984375, "learning_rate": 9.115115115115116e-06, "loss": 0.7817, "step": 8840 }, { "epoch": 26.58, "grad_norm": 15.7669038772583, "learning_rate": 9.114114114114115e-06, "loss": 0.8065, "step": 8850 }, { "epoch": 26.61, "grad_norm": 16.65861701965332, "learning_rate": 9.113113113113113e-06, "loss": 0.8495, "step": 8860 }, { "epoch": 26.64, "grad_norm": 14.653789520263672, "learning_rate": 9.112112112112112e-06, "loss": 0.8059, "step": 8870 }, { "epoch": 26.67, "grad_norm": 25.060998916625977, "learning_rate": 9.111111111111112e-06, "loss": 0.8203, "step": 8880 }, { "epoch": 26.7, "grad_norm": 10.723742485046387, "learning_rate": 9.11011011011011e-06, "loss": 0.7768, "step": 8890 }, { "epoch": 26.73, "grad_norm": 16.3524227142334, "learning_rate": 9.10910910910911e-06, "loss": 0.7948, "step": 8900 }, { "epoch": 26.76, "grad_norm": 13.840607643127441, "learning_rate": 9.10810810810811e-06, "loss": 0.7554, "step": 8910 }, { "epoch": 26.79, "grad_norm": 15.388121604919434, "learning_rate": 9.107107107107108e-06, "loss": 0.8108, "step": 8920 }, { "epoch": 26.82, "grad_norm": 15.419222831726074, "learning_rate": 9.106106106106107e-06, "loss": 0.8551, "step": 8930 }, { "epoch": 26.85, "grad_norm": 13.759719848632812, "learning_rate": 9.105105105105105e-06, "loss": 0.7758, "step": 8940 }, { "epoch": 26.88, "grad_norm": 20.2331600189209, "learning_rate": 9.104104104104104e-06, "loss": 0.7934, "step": 8950 }, { "epoch": 26.91, "grad_norm": 13.752047538757324, "learning_rate": 9.103103103103104e-06, "loss": 0.7795, "step": 8960 }, { "epoch": 26.94, "grad_norm": 20.42212677001953, "learning_rate": 9.102102102102103e-06, "loss": 0.842, "step": 8970 }, { "epoch": 26.97, "grad_norm": 15.064266204833984, "learning_rate": 9.101101101101101e-06, "loss": 0.8582, "step": 8980 }, { "epoch": 27.0, "grad_norm": 13.374460220336914, "learning_rate": 9.100100100100102e-06, "loss": 0.8041, "step": 8990 }, { "epoch": 27.0, "eval_accuracy": 0.8555, "eval_loss": 0.46856260299682617, "eval_runtime": 12.6948, "eval_samples_per_second": 787.727, "eval_steps_per_second": 3.151, "step": 8991 }, { "epoch": 27.03, "grad_norm": 15.646354675292969, "learning_rate": 9.0990990990991e-06, "loss": 0.7201, "step": 9000 }, { "epoch": 27.06, "grad_norm": 14.156930923461914, "learning_rate": 9.098098098098099e-06, "loss": 0.7586, "step": 9010 }, { "epoch": 27.09, "grad_norm": 18.541675567626953, "learning_rate": 9.097097097097098e-06, "loss": 0.8411, "step": 9020 }, { "epoch": 27.12, "grad_norm": 15.928442001342773, "learning_rate": 9.096096096096096e-06, "loss": 0.8087, "step": 9030 }, { "epoch": 27.15, "grad_norm": 23.12171745300293, "learning_rate": 9.095095095095095e-06, "loss": 0.8369, "step": 9040 }, { "epoch": 27.18, "grad_norm": 13.158681869506836, "learning_rate": 9.094094094094095e-06, "loss": 0.8279, "step": 9050 }, { "epoch": 27.21, "grad_norm": 12.238262176513672, "learning_rate": 9.093093093093094e-06, "loss": 0.7691, "step": 9060 }, { "epoch": 27.24, "grad_norm": 17.15790367126465, "learning_rate": 9.092092092092092e-06, "loss": 0.7774, "step": 9070 }, { "epoch": 27.27, "grad_norm": 12.419235229492188, "learning_rate": 9.091091091091093e-06, "loss": 0.7231, "step": 9080 }, { "epoch": 27.3, "grad_norm": 12.092683792114258, "learning_rate": 9.090090090090091e-06, "loss": 0.833, "step": 9090 }, { "epoch": 27.33, "grad_norm": 15.581094741821289, "learning_rate": 9.08908908908909e-06, "loss": 0.7504, "step": 9100 }, { "epoch": 27.36, "grad_norm": 18.712749481201172, "learning_rate": 9.088088088088088e-06, "loss": 0.8297, "step": 9110 }, { "epoch": 27.39, "grad_norm": 16.576900482177734, "learning_rate": 9.087087087087087e-06, "loss": 0.8084, "step": 9120 }, { "epoch": 27.42, "grad_norm": 28.18408966064453, "learning_rate": 9.086086086086087e-06, "loss": 0.8344, "step": 9130 }, { "epoch": 27.45, "grad_norm": 18.83150291442871, "learning_rate": 9.085085085085086e-06, "loss": 0.792, "step": 9140 }, { "epoch": 27.48, "grad_norm": 19.39872932434082, "learning_rate": 9.084084084084084e-06, "loss": 0.8539, "step": 9150 }, { "epoch": 27.51, "grad_norm": 15.377179145812988, "learning_rate": 9.083083083083085e-06, "loss": 0.8331, "step": 9160 }, { "epoch": 27.54, "grad_norm": 15.049494743347168, "learning_rate": 9.082082082082083e-06, "loss": 0.7915, "step": 9170 }, { "epoch": 27.57, "grad_norm": 14.656966209411621, "learning_rate": 9.081081081081082e-06, "loss": 0.7344, "step": 9180 }, { "epoch": 27.6, "grad_norm": 18.953798294067383, "learning_rate": 9.08008008008008e-06, "loss": 0.7675, "step": 9190 }, { "epoch": 27.63, "grad_norm": 15.632416725158691, "learning_rate": 9.079079079079079e-06, "loss": 0.8395, "step": 9200 }, { "epoch": 27.66, "grad_norm": 19.34852409362793, "learning_rate": 9.07807807807808e-06, "loss": 0.7695, "step": 9210 }, { "epoch": 27.69, "grad_norm": 16.209014892578125, "learning_rate": 9.077077077077078e-06, "loss": 0.7672, "step": 9220 }, { "epoch": 27.72, "grad_norm": 19.034046173095703, "learning_rate": 9.076076076076077e-06, "loss": 0.9088, "step": 9230 }, { "epoch": 27.75, "grad_norm": 17.229251861572266, "learning_rate": 9.075075075075077e-06, "loss": 0.8759, "step": 9240 }, { "epoch": 27.78, "grad_norm": 14.129185676574707, "learning_rate": 9.074074074074075e-06, "loss": 0.8082, "step": 9250 }, { "epoch": 27.81, "grad_norm": 13.324420928955078, "learning_rate": 9.073073073073074e-06, "loss": 0.777, "step": 9260 }, { "epoch": 27.84, "grad_norm": 16.472885131835938, "learning_rate": 9.072072072072073e-06, "loss": 0.809, "step": 9270 }, { "epoch": 27.87, "grad_norm": 14.68637466430664, "learning_rate": 9.071071071071071e-06, "loss": 0.7496, "step": 9280 }, { "epoch": 27.9, "grad_norm": 13.848377227783203, "learning_rate": 9.07007007007007e-06, "loss": 0.8411, "step": 9290 }, { "epoch": 27.93, "grad_norm": 17.72799301147461, "learning_rate": 9.06906906906907e-06, "loss": 0.7486, "step": 9300 }, { "epoch": 27.96, "grad_norm": 16.831594467163086, "learning_rate": 9.068068068068069e-06, "loss": 0.8027, "step": 9310 }, { "epoch": 27.99, "grad_norm": 30.738222122192383, "learning_rate": 9.067067067067067e-06, "loss": 0.8036, "step": 9320 }, { "epoch": 28.0, "eval_accuracy": 0.8578, "eval_loss": 0.46087294816970825, "eval_runtime": 12.7214, "eval_samples_per_second": 786.076, "eval_steps_per_second": 3.144, "step": 9324 }, { "epoch": 28.02, "grad_norm": 11.911028861999512, "learning_rate": 9.066066066066068e-06, "loss": 0.7672, "step": 9330 }, { "epoch": 28.05, "grad_norm": 14.490723609924316, "learning_rate": 9.065065065065066e-06, "loss": 0.7613, "step": 9340 }, { "epoch": 28.08, "grad_norm": 19.9595947265625, "learning_rate": 9.064064064064065e-06, "loss": 0.7991, "step": 9350 }, { "epoch": 28.11, "grad_norm": 17.42326545715332, "learning_rate": 9.063063063063063e-06, "loss": 0.8108, "step": 9360 }, { "epoch": 28.14, "grad_norm": 15.845438957214355, "learning_rate": 9.062062062062062e-06, "loss": 0.7008, "step": 9370 }, { "epoch": 28.17, "grad_norm": 20.148286819458008, "learning_rate": 9.061061061061062e-06, "loss": 0.7418, "step": 9380 }, { "epoch": 28.2, "grad_norm": 13.345690727233887, "learning_rate": 9.06006006006006e-06, "loss": 0.7152, "step": 9390 }, { "epoch": 28.23, "grad_norm": 12.969523429870605, "learning_rate": 9.05905905905906e-06, "loss": 0.8165, "step": 9400 }, { "epoch": 28.26, "grad_norm": 15.857499122619629, "learning_rate": 9.05805805805806e-06, "loss": 0.73, "step": 9410 }, { "epoch": 28.29, "grad_norm": 14.983546257019043, "learning_rate": 9.057057057057058e-06, "loss": 0.8108, "step": 9420 }, { "epoch": 28.32, "grad_norm": 16.10476303100586, "learning_rate": 9.056056056056057e-06, "loss": 0.7254, "step": 9430 }, { "epoch": 28.35, "grad_norm": 12.998758316040039, "learning_rate": 9.055055055055055e-06, "loss": 0.7881, "step": 9440 }, { "epoch": 28.38, "grad_norm": 17.046140670776367, "learning_rate": 9.054054054054054e-06, "loss": 0.8049, "step": 9450 }, { "epoch": 28.41, "grad_norm": 15.55833625793457, "learning_rate": 9.053053053053054e-06, "loss": 0.7728, "step": 9460 }, { "epoch": 28.44, "grad_norm": 16.298221588134766, "learning_rate": 9.052052052052053e-06, "loss": 0.794, "step": 9470 }, { "epoch": 28.47, "grad_norm": 16.310712814331055, "learning_rate": 9.051051051051052e-06, "loss": 0.835, "step": 9480 }, { "epoch": 28.5, "grad_norm": 15.452524185180664, "learning_rate": 9.05005005005005e-06, "loss": 0.8311, "step": 9490 }, { "epoch": 28.53, "grad_norm": 13.287662506103516, "learning_rate": 9.04904904904905e-06, "loss": 0.791, "step": 9500 }, { "epoch": 28.56, "grad_norm": 11.791207313537598, "learning_rate": 9.048048048048049e-06, "loss": 0.8211, "step": 9510 }, { "epoch": 28.59, "grad_norm": 14.900876998901367, "learning_rate": 9.047047047047048e-06, "loss": 0.7735, "step": 9520 }, { "epoch": 28.62, "grad_norm": 18.341766357421875, "learning_rate": 9.046046046046046e-06, "loss": 0.8503, "step": 9530 }, { "epoch": 28.65, "grad_norm": 18.77469253540039, "learning_rate": 9.045045045045045e-06, "loss": 0.7304, "step": 9540 }, { "epoch": 28.68, "grad_norm": 18.09326171875, "learning_rate": 9.044044044044045e-06, "loss": 0.7796, "step": 9550 }, { "epoch": 28.71, "grad_norm": 16.77538299560547, "learning_rate": 9.043043043043044e-06, "loss": 0.8585, "step": 9560 }, { "epoch": 28.74, "grad_norm": 14.218920707702637, "learning_rate": 9.042042042042042e-06, "loss": 0.8425, "step": 9570 }, { "epoch": 28.77, "grad_norm": 15.359567642211914, "learning_rate": 9.041041041041043e-06, "loss": 0.7995, "step": 9580 }, { "epoch": 28.8, "grad_norm": 17.3696231842041, "learning_rate": 9.040040040040041e-06, "loss": 0.8462, "step": 9590 }, { "epoch": 28.83, "grad_norm": 17.48063850402832, "learning_rate": 9.03903903903904e-06, "loss": 0.794, "step": 9600 }, { "epoch": 28.86, "grad_norm": 14.394133567810059, "learning_rate": 9.038038038038038e-06, "loss": 0.8378, "step": 9610 }, { "epoch": 28.89, "grad_norm": 22.86805534362793, "learning_rate": 9.037037037037037e-06, "loss": 0.7753, "step": 9620 }, { "epoch": 28.92, "grad_norm": 11.59033203125, "learning_rate": 9.036036036036037e-06, "loss": 0.7755, "step": 9630 }, { "epoch": 28.95, "grad_norm": 21.878067016601562, "learning_rate": 9.035035035035036e-06, "loss": 0.843, "step": 9640 }, { "epoch": 28.98, "grad_norm": 12.910211563110352, "learning_rate": 9.034034034034034e-06, "loss": 0.7571, "step": 9650 }, { "epoch": 29.0, "eval_accuracy": 0.8616, "eval_loss": 0.4597092270851135, "eval_runtime": 12.7071, "eval_samples_per_second": 786.965, "eval_steps_per_second": 3.148, "step": 9657 }, { "epoch": 29.01, "grad_norm": 22.672311782836914, "learning_rate": 9.033033033033035e-06, "loss": 0.7749, "step": 9660 }, { "epoch": 29.04, "grad_norm": 20.296001434326172, "learning_rate": 9.032032032032033e-06, "loss": 0.7856, "step": 9670 }, { "epoch": 29.07, "grad_norm": 15.42845344543457, "learning_rate": 9.031031031031032e-06, "loss": 0.7632, "step": 9680 }, { "epoch": 29.1, "grad_norm": 18.354835510253906, "learning_rate": 9.03003003003003e-06, "loss": 0.7764, "step": 9690 }, { "epoch": 29.13, "grad_norm": 20.938955307006836, "learning_rate": 9.029029029029029e-06, "loss": 0.7339, "step": 9700 }, { "epoch": 29.16, "grad_norm": 14.629878997802734, "learning_rate": 9.02802802802803e-06, "loss": 0.7615, "step": 9710 }, { "epoch": 29.19, "grad_norm": 13.274352073669434, "learning_rate": 9.027027027027028e-06, "loss": 0.7745, "step": 9720 }, { "epoch": 29.22, "grad_norm": 15.748404502868652, "learning_rate": 9.026026026026027e-06, "loss": 0.7845, "step": 9730 }, { "epoch": 29.25, "grad_norm": 15.212640762329102, "learning_rate": 9.025025025025025e-06, "loss": 0.7967, "step": 9740 }, { "epoch": 29.28, "grad_norm": 17.692638397216797, "learning_rate": 9.024024024024025e-06, "loss": 0.7447, "step": 9750 }, { "epoch": 29.31, "grad_norm": 15.170554161071777, "learning_rate": 9.023023023023024e-06, "loss": 0.7363, "step": 9760 }, { "epoch": 29.34, "grad_norm": 14.743155479431152, "learning_rate": 9.022022022022023e-06, "loss": 0.7615, "step": 9770 }, { "epoch": 29.37, "grad_norm": 18.1495361328125, "learning_rate": 9.021021021021021e-06, "loss": 0.7413, "step": 9780 }, { "epoch": 29.4, "grad_norm": 19.55647087097168, "learning_rate": 9.02002002002002e-06, "loss": 0.7874, "step": 9790 }, { "epoch": 29.43, "grad_norm": 13.923855781555176, "learning_rate": 9.01901901901902e-06, "loss": 0.7791, "step": 9800 }, { "epoch": 29.46, "grad_norm": 14.469110488891602, "learning_rate": 9.018018018018019e-06, "loss": 0.7539, "step": 9810 }, { "epoch": 29.49, "grad_norm": 18.734216690063477, "learning_rate": 9.017017017017017e-06, "loss": 0.7518, "step": 9820 }, { "epoch": 29.52, "grad_norm": 29.21765899658203, "learning_rate": 9.016016016016018e-06, "loss": 0.788, "step": 9830 }, { "epoch": 29.55, "grad_norm": 18.254535675048828, "learning_rate": 9.015015015015016e-06, "loss": 0.8137, "step": 9840 }, { "epoch": 29.58, "grad_norm": 24.3439998626709, "learning_rate": 9.014014014014015e-06, "loss": 0.7546, "step": 9850 }, { "epoch": 29.61, "grad_norm": 21.63543701171875, "learning_rate": 9.013013013013013e-06, "loss": 0.8033, "step": 9860 }, { "epoch": 29.64, "grad_norm": 20.718181610107422, "learning_rate": 9.012012012012012e-06, "loss": 0.7559, "step": 9870 }, { "epoch": 29.67, "grad_norm": 18.576004028320312, "learning_rate": 9.011011011011012e-06, "loss": 0.6953, "step": 9880 }, { "epoch": 29.7, "grad_norm": 13.842483520507812, "learning_rate": 9.010010010010011e-06, "loss": 0.8485, "step": 9890 }, { "epoch": 29.73, "grad_norm": 12.415961265563965, "learning_rate": 9.00900900900901e-06, "loss": 0.7197, "step": 9900 }, { "epoch": 29.76, "grad_norm": 21.572906494140625, "learning_rate": 9.00800800800801e-06, "loss": 0.7952, "step": 9910 }, { "epoch": 29.79, "grad_norm": 16.539274215698242, "learning_rate": 9.007007007007008e-06, "loss": 0.8305, "step": 9920 }, { "epoch": 29.82, "grad_norm": 24.245912551879883, "learning_rate": 9.006006006006007e-06, "loss": 0.7945, "step": 9930 }, { "epoch": 29.85, "grad_norm": 16.62843132019043, "learning_rate": 9.005005005005006e-06, "loss": 0.8332, "step": 9940 }, { "epoch": 29.88, "grad_norm": 17.65282440185547, "learning_rate": 9.004004004004004e-06, "loss": 0.7788, "step": 9950 }, { "epoch": 29.91, "grad_norm": 14.350862503051758, "learning_rate": 9.003003003003003e-06, "loss": 0.8164, "step": 9960 }, { "epoch": 29.94, "grad_norm": 14.50328540802002, "learning_rate": 9.002002002002003e-06, "loss": 0.7701, "step": 9970 }, { "epoch": 29.97, "grad_norm": 15.73355484008789, "learning_rate": 9.001001001001002e-06, "loss": 0.7745, "step": 9980 }, { "epoch": 30.0, "grad_norm": 62.948577880859375, "learning_rate": 9e-06, "loss": 0.7666, "step": 9990 }, { "epoch": 30.0, "eval_accuracy": 0.8606, "eval_loss": 0.45814234018325806, "eval_runtime": 12.6385, "eval_samples_per_second": 791.23, "eval_steps_per_second": 3.165, "step": 9990 }, { "epoch": 30.03, "grad_norm": 16.523704528808594, "learning_rate": 8.998998998999e-06, "loss": 0.8094, "step": 10000 }, { "epoch": 30.06, "grad_norm": 12.528546333312988, "learning_rate": 8.997997997997999e-06, "loss": 0.8251, "step": 10010 }, { "epoch": 30.09, "grad_norm": 16.390399932861328, "learning_rate": 8.996996996996998e-06, "loss": 0.7715, "step": 10020 }, { "epoch": 30.12, "grad_norm": 17.987390518188477, "learning_rate": 8.995995995995996e-06, "loss": 0.7799, "step": 10030 }, { "epoch": 30.15, "grad_norm": 17.51533317565918, "learning_rate": 8.994994994994995e-06, "loss": 0.7342, "step": 10040 }, { "epoch": 30.18, "grad_norm": 13.23087215423584, "learning_rate": 8.993993993993995e-06, "loss": 0.691, "step": 10050 }, { "epoch": 30.21, "grad_norm": 16.02936553955078, "learning_rate": 8.992992992992994e-06, "loss": 0.7646, "step": 10060 }, { "epoch": 30.24, "grad_norm": 13.150365829467773, "learning_rate": 8.991991991991992e-06, "loss": 0.6925, "step": 10070 }, { "epoch": 30.27, "grad_norm": 20.552528381347656, "learning_rate": 8.990990990990993e-06, "loss": 0.7867, "step": 10080 }, { "epoch": 30.3, "grad_norm": 14.539362907409668, "learning_rate": 8.989989989989991e-06, "loss": 0.691, "step": 10090 }, { "epoch": 30.33, "grad_norm": 18.682815551757812, "learning_rate": 8.98898898898899e-06, "loss": 0.7358, "step": 10100 }, { "epoch": 30.36, "grad_norm": 25.57851219177246, "learning_rate": 8.987987987987988e-06, "loss": 0.6709, "step": 10110 }, { "epoch": 30.39, "grad_norm": 23.194116592407227, "learning_rate": 8.986986986986987e-06, "loss": 0.7437, "step": 10120 }, { "epoch": 30.42, "grad_norm": 23.166088104248047, "learning_rate": 8.985985985985987e-06, "loss": 0.7844, "step": 10130 }, { "epoch": 30.45, "grad_norm": 15.125076293945312, "learning_rate": 8.984984984984986e-06, "loss": 0.7677, "step": 10140 }, { "epoch": 30.48, "grad_norm": 16.771602630615234, "learning_rate": 8.983983983983985e-06, "loss": 0.7418, "step": 10150 }, { "epoch": 30.51, "grad_norm": 15.16159725189209, "learning_rate": 8.982982982982985e-06, "loss": 0.7895, "step": 10160 }, { "epoch": 30.54, "grad_norm": 14.736344337463379, "learning_rate": 8.981981981981983e-06, "loss": 0.7441, "step": 10170 }, { "epoch": 30.57, "grad_norm": 17.855131149291992, "learning_rate": 8.980980980980982e-06, "loss": 0.6948, "step": 10180 }, { "epoch": 30.6, "grad_norm": 16.532987594604492, "learning_rate": 8.97997997997998e-06, "loss": 0.743, "step": 10190 }, { "epoch": 30.63, "grad_norm": 16.70667266845703, "learning_rate": 8.97897897897898e-06, "loss": 0.7552, "step": 10200 }, { "epoch": 30.66, "grad_norm": 15.976407051086426, "learning_rate": 8.977977977977978e-06, "loss": 0.709, "step": 10210 }, { "epoch": 30.69, "grad_norm": 24.74825096130371, "learning_rate": 8.976976976976978e-06, "loss": 0.7439, "step": 10220 }, { "epoch": 30.72, "grad_norm": 26.14236068725586, "learning_rate": 8.975975975975977e-06, "loss": 0.7512, "step": 10230 }, { "epoch": 30.75, "grad_norm": 15.58338737487793, "learning_rate": 8.974974974974975e-06, "loss": 0.7207, "step": 10240 }, { "epoch": 30.78, "grad_norm": 13.44525146484375, "learning_rate": 8.973973973973976e-06, "loss": 0.775, "step": 10250 }, { "epoch": 30.81, "grad_norm": 12.716086387634277, "learning_rate": 8.972972972972974e-06, "loss": 0.7855, "step": 10260 }, { "epoch": 30.84, "grad_norm": 17.73958396911621, "learning_rate": 8.971971971971973e-06, "loss": 0.7441, "step": 10270 }, { "epoch": 30.87, "grad_norm": 13.41761589050293, "learning_rate": 8.970970970970971e-06, "loss": 0.7244, "step": 10280 }, { "epoch": 30.9, "grad_norm": 15.482220649719238, "learning_rate": 8.96996996996997e-06, "loss": 0.7412, "step": 10290 }, { "epoch": 30.93, "grad_norm": 21.90937614440918, "learning_rate": 8.96896896896897e-06, "loss": 0.784, "step": 10300 }, { "epoch": 30.96, "grad_norm": 18.506118774414062, "learning_rate": 8.967967967967969e-06, "loss": 0.7994, "step": 10310 }, { "epoch": 30.99, "grad_norm": 16.833866119384766, "learning_rate": 8.966966966966967e-06, "loss": 0.7226, "step": 10320 }, { "epoch": 31.0, "eval_accuracy": 0.8601, "eval_loss": 0.45692509412765503, "eval_runtime": 12.6373, "eval_samples_per_second": 791.306, "eval_steps_per_second": 3.165, "step": 10323 }, { "epoch": 31.02, "grad_norm": 19.576566696166992, "learning_rate": 8.965965965965968e-06, "loss": 0.7177, "step": 10330 }, { "epoch": 31.05, "grad_norm": 15.05361557006836, "learning_rate": 8.964964964964966e-06, "loss": 0.8626, "step": 10340 }, { "epoch": 31.08, "grad_norm": 15.818574905395508, "learning_rate": 8.963963963963965e-06, "loss": 0.817, "step": 10350 }, { "epoch": 31.11, "grad_norm": 15.349098205566406, "learning_rate": 8.962962962962963e-06, "loss": 0.7074, "step": 10360 }, { "epoch": 31.14, "grad_norm": 17.57956314086914, "learning_rate": 8.961961961961962e-06, "loss": 0.6718, "step": 10370 }, { "epoch": 31.17, "grad_norm": 17.0306453704834, "learning_rate": 8.960960960960962e-06, "loss": 0.7001, "step": 10380 }, { "epoch": 31.2, "grad_norm": 19.56716537475586, "learning_rate": 8.959959959959961e-06, "loss": 0.7383, "step": 10390 }, { "epoch": 31.23, "grad_norm": 14.34196949005127, "learning_rate": 8.95895895895896e-06, "loss": 0.7525, "step": 10400 }, { "epoch": 31.26, "grad_norm": 18.495691299438477, "learning_rate": 8.957957957957958e-06, "loss": 0.7291, "step": 10410 }, { "epoch": 31.29, "grad_norm": 11.233891487121582, "learning_rate": 8.956956956956958e-06, "loss": 0.7199, "step": 10420 }, { "epoch": 31.32, "grad_norm": 11.309521675109863, "learning_rate": 8.955955955955957e-06, "loss": 0.7664, "step": 10430 }, { "epoch": 31.35, "grad_norm": 13.5640230178833, "learning_rate": 8.954954954954956e-06, "loss": 0.7566, "step": 10440 }, { "epoch": 31.38, "grad_norm": 17.966054916381836, "learning_rate": 8.953953953953954e-06, "loss": 0.7663, "step": 10450 }, { "epoch": 31.41, "grad_norm": 15.01924991607666, "learning_rate": 8.952952952952953e-06, "loss": 0.7395, "step": 10460 }, { "epoch": 31.44, "grad_norm": 15.482879638671875, "learning_rate": 8.951951951951953e-06, "loss": 0.6937, "step": 10470 }, { "epoch": 31.47, "grad_norm": 14.140779495239258, "learning_rate": 8.950950950950952e-06, "loss": 0.7769, "step": 10480 }, { "epoch": 31.5, "grad_norm": 15.519713401794434, "learning_rate": 8.94994994994995e-06, "loss": 0.7388, "step": 10490 }, { "epoch": 31.53, "grad_norm": 13.309924125671387, "learning_rate": 8.94894894894895e-06, "loss": 0.7968, "step": 10500 }, { "epoch": 31.56, "grad_norm": 17.812463760375977, "learning_rate": 8.94794794794795e-06, "loss": 0.7677, "step": 10510 }, { "epoch": 31.59, "grad_norm": 19.066875457763672, "learning_rate": 8.946946946946948e-06, "loss": 0.7271, "step": 10520 }, { "epoch": 31.62, "grad_norm": 18.764406204223633, "learning_rate": 8.945945945945946e-06, "loss": 0.7753, "step": 10530 }, { "epoch": 31.65, "grad_norm": 20.98076629638672, "learning_rate": 8.944944944944945e-06, "loss": 0.7288, "step": 10540 }, { "epoch": 31.68, "grad_norm": 19.42600440979004, "learning_rate": 8.943943943943945e-06, "loss": 0.7835, "step": 10550 }, { "epoch": 31.71, "grad_norm": 15.125018119812012, "learning_rate": 8.942942942942944e-06, "loss": 0.7244, "step": 10560 }, { "epoch": 31.74, "grad_norm": 13.277347564697266, "learning_rate": 8.941941941941942e-06, "loss": 0.8209, "step": 10570 }, { "epoch": 31.77, "grad_norm": 12.954193115234375, "learning_rate": 8.940940940940943e-06, "loss": 0.7931, "step": 10580 }, { "epoch": 31.8, "grad_norm": 19.218364715576172, "learning_rate": 8.939939939939941e-06, "loss": 0.8124, "step": 10590 }, { "epoch": 31.83, "grad_norm": 19.637117385864258, "learning_rate": 8.93893893893894e-06, "loss": 0.6632, "step": 10600 }, { "epoch": 31.86, "grad_norm": 16.105588912963867, "learning_rate": 8.937937937937939e-06, "loss": 0.762, "step": 10610 }, { "epoch": 31.89, "grad_norm": 15.068008422851562, "learning_rate": 8.936936936936937e-06, "loss": 0.7938, "step": 10620 }, { "epoch": 31.92, "grad_norm": 13.940908432006836, "learning_rate": 8.935935935935937e-06, "loss": 0.7368, "step": 10630 }, { "epoch": 31.95, "grad_norm": 13.348282814025879, "learning_rate": 8.934934934934936e-06, "loss": 0.7477, "step": 10640 }, { "epoch": 31.98, "grad_norm": 16.640409469604492, "learning_rate": 8.933933933933935e-06, "loss": 0.7179, "step": 10650 }, { "epoch": 32.0, "eval_accuracy": 0.8628, "eval_loss": 0.45731332898139954, "eval_runtime": 12.6917, "eval_samples_per_second": 787.917, "eval_steps_per_second": 3.152, "step": 10656 }, { "epoch": 32.01, "grad_norm": 12.540587425231934, "learning_rate": 8.932932932932933e-06, "loss": 0.633, "step": 10660 }, { "epoch": 32.04, "grad_norm": 14.436250686645508, "learning_rate": 8.931931931931933e-06, "loss": 0.7423, "step": 10670 }, { "epoch": 32.07, "grad_norm": 12.762358665466309, "learning_rate": 8.93093093093093e-06, "loss": 0.7614, "step": 10680 }, { "epoch": 32.1, "grad_norm": 12.973264694213867, "learning_rate": 8.92992992992993e-06, "loss": 0.7105, "step": 10690 }, { "epoch": 32.13, "grad_norm": 18.508798599243164, "learning_rate": 8.92892892892893e-06, "loss": 0.74, "step": 10700 }, { "epoch": 32.16, "grad_norm": 21.682180404663086, "learning_rate": 8.927927927927928e-06, "loss": 0.7725, "step": 10710 }, { "epoch": 32.19, "grad_norm": 16.946609497070312, "learning_rate": 8.926926926926928e-06, "loss": 0.7464, "step": 10720 }, { "epoch": 32.22, "grad_norm": 16.109943389892578, "learning_rate": 8.925925925925927e-06, "loss": 0.7526, "step": 10730 }, { "epoch": 32.25, "grad_norm": 26.805973052978516, "learning_rate": 8.924924924924925e-06, "loss": 0.8015, "step": 10740 }, { "epoch": 32.28, "grad_norm": 16.171995162963867, "learning_rate": 8.923923923923926e-06, "loss": 0.7825, "step": 10750 }, { "epoch": 32.31, "grad_norm": 13.513046264648438, "learning_rate": 8.922922922922924e-06, "loss": 0.7439, "step": 10760 }, { "epoch": 32.34, "grad_norm": 17.154273986816406, "learning_rate": 8.921921921921923e-06, "loss": 0.6719, "step": 10770 }, { "epoch": 32.37, "grad_norm": 12.631355285644531, "learning_rate": 8.920920920920921e-06, "loss": 0.7323, "step": 10780 }, { "epoch": 32.4, "grad_norm": 13.410599708557129, "learning_rate": 8.91991991991992e-06, "loss": 0.7047, "step": 10790 }, { "epoch": 32.43, "grad_norm": 20.0744686126709, "learning_rate": 8.91891891891892e-06, "loss": 0.7817, "step": 10800 }, { "epoch": 32.46, "grad_norm": 14.753862380981445, "learning_rate": 8.917917917917919e-06, "loss": 0.7503, "step": 10810 }, { "epoch": 32.49, "grad_norm": 17.139963150024414, "learning_rate": 8.916916916916917e-06, "loss": 0.632, "step": 10820 }, { "epoch": 32.52, "grad_norm": 22.070852279663086, "learning_rate": 8.915915915915918e-06, "loss": 0.759, "step": 10830 }, { "epoch": 32.55, "grad_norm": 14.3706693649292, "learning_rate": 8.914914914914916e-06, "loss": 0.7906, "step": 10840 }, { "epoch": 32.58, "grad_norm": 13.108540534973145, "learning_rate": 8.913913913913915e-06, "loss": 0.7442, "step": 10850 }, { "epoch": 32.61, "grad_norm": 14.135384559631348, "learning_rate": 8.912912912912914e-06, "loss": 0.765, "step": 10860 }, { "epoch": 32.64, "grad_norm": 16.804826736450195, "learning_rate": 8.911911911911912e-06, "loss": 0.7468, "step": 10870 }, { "epoch": 32.67, "grad_norm": 21.809114456176758, "learning_rate": 8.91091091091091e-06, "loss": 0.7202, "step": 10880 }, { "epoch": 32.7, "grad_norm": 13.352616310119629, "learning_rate": 8.909909909909911e-06, "loss": 0.7129, "step": 10890 }, { "epoch": 32.73, "grad_norm": 18.08527946472168, "learning_rate": 8.90890890890891e-06, "loss": 0.7206, "step": 10900 }, { "epoch": 32.76, "grad_norm": 13.541923522949219, "learning_rate": 8.907907907907908e-06, "loss": 0.7004, "step": 10910 }, { "epoch": 32.79, "grad_norm": 18.027904510498047, "learning_rate": 8.906906906906909e-06, "loss": 0.7492, "step": 10920 }, { "epoch": 32.82, "grad_norm": 14.436220169067383, "learning_rate": 8.905905905905905e-06, "loss": 0.7447, "step": 10930 }, { "epoch": 32.85, "grad_norm": 16.82992172241211, "learning_rate": 8.904904904904906e-06, "loss": 0.7365, "step": 10940 }, { "epoch": 32.88, "grad_norm": 15.015875816345215, "learning_rate": 8.903903903903904e-06, "loss": 0.7351, "step": 10950 }, { "epoch": 32.91, "grad_norm": 14.806081771850586, "learning_rate": 8.902902902902903e-06, "loss": 0.7421, "step": 10960 }, { "epoch": 32.94, "grad_norm": 21.481706619262695, "learning_rate": 8.901901901901903e-06, "loss": 0.7305, "step": 10970 }, { "epoch": 32.97, "grad_norm": 19.971282958984375, "learning_rate": 8.900900900900902e-06, "loss": 0.6866, "step": 10980 }, { "epoch": 33.0, "eval_accuracy": 0.8606, "eval_loss": 0.45673778653144836, "eval_runtime": 12.7087, "eval_samples_per_second": 786.86, "eval_steps_per_second": 3.147, "step": 10989 }, { "epoch": 33.0, "grad_norm": 12.800230979919434, "learning_rate": 8.8998998998999e-06, "loss": 0.646, "step": 10990 }, { "epoch": 33.03, "grad_norm": 17.8346004486084, "learning_rate": 8.8988988988989e-06, "loss": 0.8046, "step": 11000 }, { "epoch": 33.06, "grad_norm": 18.317232131958008, "learning_rate": 8.8978978978979e-06, "loss": 0.7373, "step": 11010 }, { "epoch": 33.09, "grad_norm": 15.877395629882812, "learning_rate": 8.896896896896898e-06, "loss": 0.6893, "step": 11020 }, { "epoch": 33.12, "grad_norm": 14.688508033752441, "learning_rate": 8.895895895895896e-06, "loss": 0.7672, "step": 11030 }, { "epoch": 33.15, "grad_norm": 13.723471641540527, "learning_rate": 8.894894894894895e-06, "loss": 0.7018, "step": 11040 }, { "epoch": 33.18, "grad_norm": 24.826631546020508, "learning_rate": 8.893893893893895e-06, "loss": 0.7106, "step": 11050 }, { "epoch": 33.21, "grad_norm": 21.92152976989746, "learning_rate": 8.892892892892894e-06, "loss": 0.8031, "step": 11060 }, { "epoch": 33.24, "grad_norm": 15.498889923095703, "learning_rate": 8.891891891891893e-06, "loss": 0.7772, "step": 11070 }, { "epoch": 33.27, "grad_norm": 17.40592384338379, "learning_rate": 8.890890890890893e-06, "loss": 0.6772, "step": 11080 }, { "epoch": 33.3, "grad_norm": 12.944879531860352, "learning_rate": 8.889889889889891e-06, "loss": 0.6895, "step": 11090 }, { "epoch": 33.33, "grad_norm": 18.65313148498535, "learning_rate": 8.888888888888888e-06, "loss": 0.751, "step": 11100 }, { "epoch": 33.36, "grad_norm": 15.92672348022461, "learning_rate": 8.887887887887889e-06, "loss": 0.7871, "step": 11110 }, { "epoch": 33.39, "grad_norm": 14.8170166015625, "learning_rate": 8.886886886886887e-06, "loss": 0.6701, "step": 11120 }, { "epoch": 33.42, "grad_norm": 15.525805473327637, "learning_rate": 8.885885885885886e-06, "loss": 0.7314, "step": 11130 }, { "epoch": 33.45, "grad_norm": 13.073753356933594, "learning_rate": 8.884884884884886e-06, "loss": 0.7163, "step": 11140 }, { "epoch": 33.48, "grad_norm": 18.035778045654297, "learning_rate": 8.883883883883885e-06, "loss": 0.6964, "step": 11150 }, { "epoch": 33.51, "grad_norm": 20.28458595275879, "learning_rate": 8.882882882882883e-06, "loss": 0.7479, "step": 11160 }, { "epoch": 33.54, "grad_norm": 15.934008598327637, "learning_rate": 8.881881881881884e-06, "loss": 0.7319, "step": 11170 }, { "epoch": 33.57, "grad_norm": 21.8447208404541, "learning_rate": 8.88088088088088e-06, "loss": 0.774, "step": 11180 }, { "epoch": 33.6, "grad_norm": 10.164825439453125, "learning_rate": 8.87987987987988e-06, "loss": 0.7126, "step": 11190 }, { "epoch": 33.63, "grad_norm": 14.525633811950684, "learning_rate": 8.87887887887888e-06, "loss": 0.6872, "step": 11200 }, { "epoch": 33.66, "grad_norm": 22.03354835510254, "learning_rate": 8.877877877877878e-06, "loss": 0.7237, "step": 11210 }, { "epoch": 33.69, "grad_norm": 13.250273704528809, "learning_rate": 8.876876876876878e-06, "loss": 0.8269, "step": 11220 }, { "epoch": 33.72, "grad_norm": 33.590572357177734, "learning_rate": 8.875875875875877e-06, "loss": 0.7296, "step": 11230 }, { "epoch": 33.75, "grad_norm": 15.577840805053711, "learning_rate": 8.874874874874875e-06, "loss": 0.7686, "step": 11240 }, { "epoch": 33.78, "grad_norm": 19.518146514892578, "learning_rate": 8.873873873873876e-06, "loss": 0.8281, "step": 11250 }, { "epoch": 33.81, "grad_norm": 15.385758399963379, "learning_rate": 8.872872872872874e-06, "loss": 0.7358, "step": 11260 }, { "epoch": 33.84, "grad_norm": 18.950332641601562, "learning_rate": 8.871871871871873e-06, "loss": 0.7626, "step": 11270 }, { "epoch": 33.87, "grad_norm": 18.48717498779297, "learning_rate": 8.870870870870871e-06, "loss": 0.7698, "step": 11280 }, { "epoch": 33.9, "grad_norm": 17.826658248901367, "learning_rate": 8.86986986986987e-06, "loss": 0.7064, "step": 11290 }, { "epoch": 33.93, "grad_norm": 13.667326927185059, "learning_rate": 8.86886886886887e-06, "loss": 0.7336, "step": 11300 }, { "epoch": 33.96, "grad_norm": 14.063284873962402, "learning_rate": 8.867867867867869e-06, "loss": 0.7627, "step": 11310 }, { "epoch": 33.99, "grad_norm": 10.707511901855469, "learning_rate": 8.866866866866868e-06, "loss": 0.7002, "step": 11320 }, { "epoch": 34.0, "eval_accuracy": 0.8576, "eval_loss": 0.4672236442565918, "eval_runtime": 12.5384, "eval_samples_per_second": 797.551, "eval_steps_per_second": 3.19, "step": 11322 }, { "epoch": 34.02, "grad_norm": 18.776241302490234, "learning_rate": 8.865865865865866e-06, "loss": 0.8133, "step": 11330 }, { "epoch": 34.05, "grad_norm": 16.304729461669922, "learning_rate": 8.864864864864866e-06, "loss": 0.7111, "step": 11340 }, { "epoch": 34.08, "grad_norm": 18.15487289428711, "learning_rate": 8.863863863863863e-06, "loss": 0.7658, "step": 11350 }, { "epoch": 34.11, "grad_norm": 20.110403060913086, "learning_rate": 8.862862862862864e-06, "loss": 0.7248, "step": 11360 }, { "epoch": 34.14, "grad_norm": 19.843303680419922, "learning_rate": 8.861861861861862e-06, "loss": 0.7505, "step": 11370 }, { "epoch": 34.17, "grad_norm": 14.428262710571289, "learning_rate": 8.86086086086086e-06, "loss": 0.7215, "step": 11380 }, { "epoch": 34.2, "grad_norm": 14.41454792022705, "learning_rate": 8.859859859859861e-06, "loss": 0.7939, "step": 11390 }, { "epoch": 34.23, "grad_norm": 17.413043975830078, "learning_rate": 8.85885885885886e-06, "loss": 0.7546, "step": 11400 }, { "epoch": 34.26, "grad_norm": 16.91671371459961, "learning_rate": 8.857857857857858e-06, "loss": 0.6689, "step": 11410 }, { "epoch": 34.29, "grad_norm": 15.457772254943848, "learning_rate": 8.856856856856859e-06, "loss": 0.7038, "step": 11420 }, { "epoch": 34.32, "grad_norm": 14.812891006469727, "learning_rate": 8.855855855855855e-06, "loss": 0.7306, "step": 11430 }, { "epoch": 34.35, "grad_norm": 13.014900207519531, "learning_rate": 8.854854854854856e-06, "loss": 0.683, "step": 11440 }, { "epoch": 34.38, "grad_norm": 15.84607982635498, "learning_rate": 8.853853853853854e-06, "loss": 0.7169, "step": 11450 }, { "epoch": 34.41, "grad_norm": 17.296056747436523, "learning_rate": 8.852852852852853e-06, "loss": 0.6834, "step": 11460 }, { "epoch": 34.44, "grad_norm": 19.145584106445312, "learning_rate": 8.851851851851853e-06, "loss": 0.7491, "step": 11470 }, { "epoch": 34.47, "grad_norm": 12.988202095031738, "learning_rate": 8.850850850850852e-06, "loss": 0.7727, "step": 11480 }, { "epoch": 34.5, "grad_norm": 15.30691909790039, "learning_rate": 8.84984984984985e-06, "loss": 0.6978, "step": 11490 }, { "epoch": 34.53, "grad_norm": 20.13016700744629, "learning_rate": 8.84884884884885e-06, "loss": 0.6725, "step": 11500 }, { "epoch": 34.56, "grad_norm": 14.221665382385254, "learning_rate": 8.84784784784785e-06, "loss": 0.7926, "step": 11510 }, { "epoch": 34.59, "grad_norm": 20.517425537109375, "learning_rate": 8.846846846846848e-06, "loss": 0.7829, "step": 11520 }, { "epoch": 34.62, "grad_norm": 16.56093978881836, "learning_rate": 8.845845845845847e-06, "loss": 0.7553, "step": 11530 }, { "epoch": 34.65, "grad_norm": 15.996752738952637, "learning_rate": 8.844844844844845e-06, "loss": 0.6587, "step": 11540 }, { "epoch": 34.68, "grad_norm": 19.360082626342773, "learning_rate": 8.843843843843844e-06, "loss": 0.7418, "step": 11550 }, { "epoch": 34.71, "grad_norm": 14.33364486694336, "learning_rate": 8.842842842842844e-06, "loss": 0.7477, "step": 11560 }, { "epoch": 34.74, "grad_norm": 18.078895568847656, "learning_rate": 8.841841841841843e-06, "loss": 0.7428, "step": 11570 }, { "epoch": 34.77, "grad_norm": 22.46721649169922, "learning_rate": 8.840840840840841e-06, "loss": 0.7594, "step": 11580 }, { "epoch": 34.8, "grad_norm": 19.704814910888672, "learning_rate": 8.839839839839841e-06, "loss": 0.7462, "step": 11590 }, { "epoch": 34.83, "grad_norm": 16.586517333984375, "learning_rate": 8.838838838838838e-06, "loss": 0.7261, "step": 11600 }, { "epoch": 34.86, "grad_norm": 14.008977890014648, "learning_rate": 8.837837837837839e-06, "loss": 0.7063, "step": 11610 }, { "epoch": 34.89, "grad_norm": 23.218713760375977, "learning_rate": 8.836836836836837e-06, "loss": 0.7072, "step": 11620 }, { "epoch": 34.92, "grad_norm": 12.442441940307617, "learning_rate": 8.835835835835836e-06, "loss": 0.7531, "step": 11630 }, { "epoch": 34.95, "grad_norm": 14.057740211486816, "learning_rate": 8.834834834834836e-06, "loss": 0.7379, "step": 11640 }, { "epoch": 34.98, "grad_norm": 15.848073959350586, "learning_rate": 8.833833833833835e-06, "loss": 0.7499, "step": 11650 }, { "epoch": 35.0, "eval_accuracy": 0.8611, "eval_loss": 0.46235302090644836, "eval_runtime": 12.973, "eval_samples_per_second": 770.832, "eval_steps_per_second": 3.083, "step": 11655 }, { "epoch": 35.02, "grad_norm": 15.122843742370605, "learning_rate": 8.832832832832833e-06, "loss": 0.6307, "step": 11660 }, { "epoch": 35.05, "grad_norm": 16.84096336364746, "learning_rate": 8.831831831831834e-06, "loss": 0.6986, "step": 11670 }, { "epoch": 35.08, "grad_norm": 11.740920066833496, "learning_rate": 8.83083083083083e-06, "loss": 0.7522, "step": 11680 }, { "epoch": 35.11, "grad_norm": 13.660318374633789, "learning_rate": 8.82982982982983e-06, "loss": 0.7223, "step": 11690 }, { "epoch": 35.14, "grad_norm": 18.145273208618164, "learning_rate": 8.82882882882883e-06, "loss": 0.7319, "step": 11700 }, { "epoch": 35.17, "grad_norm": 17.636823654174805, "learning_rate": 8.827827827827828e-06, "loss": 0.7589, "step": 11710 }, { "epoch": 35.2, "grad_norm": 11.691192626953125, "learning_rate": 8.826826826826828e-06, "loss": 0.7016, "step": 11720 }, { "epoch": 35.23, "grad_norm": 18.20246696472168, "learning_rate": 8.825825825825827e-06, "loss": 0.7366, "step": 11730 }, { "epoch": 35.26, "grad_norm": 13.31651782989502, "learning_rate": 8.824824824824825e-06, "loss": 0.766, "step": 11740 }, { "epoch": 35.29, "grad_norm": 17.355192184448242, "learning_rate": 8.823823823823826e-06, "loss": 0.6873, "step": 11750 }, { "epoch": 35.32, "grad_norm": 17.020751953125, "learning_rate": 8.822822822822824e-06, "loss": 0.7825, "step": 11760 }, { "epoch": 35.35, "grad_norm": 20.819473266601562, "learning_rate": 8.821821821821823e-06, "loss": 0.7451, "step": 11770 }, { "epoch": 35.38, "grad_norm": 15.498403549194336, "learning_rate": 8.820820820820822e-06, "loss": 0.7573, "step": 11780 }, { "epoch": 35.41, "grad_norm": 23.820362091064453, "learning_rate": 8.81981981981982e-06, "loss": 0.803, "step": 11790 }, { "epoch": 35.44, "grad_norm": 14.88689136505127, "learning_rate": 8.818818818818819e-06, "loss": 0.7374, "step": 11800 }, { "epoch": 35.47, "grad_norm": 20.722328186035156, "learning_rate": 8.817817817817819e-06, "loss": 0.748, "step": 11810 }, { "epoch": 35.5, "grad_norm": 17.91551971435547, "learning_rate": 8.816816816816818e-06, "loss": 0.7649, "step": 11820 }, { "epoch": 35.53, "grad_norm": 16.54551124572754, "learning_rate": 8.815815815815816e-06, "loss": 0.7282, "step": 11830 }, { "epoch": 35.56, "grad_norm": 17.026153564453125, "learning_rate": 8.814814814814817e-06, "loss": 0.7405, "step": 11840 }, { "epoch": 35.59, "grad_norm": 15.0831880569458, "learning_rate": 8.813813813813813e-06, "loss": 0.7831, "step": 11850 }, { "epoch": 35.62, "grad_norm": 12.430713653564453, "learning_rate": 8.812812812812814e-06, "loss": 0.6733, "step": 11860 }, { "epoch": 35.65, "grad_norm": 11.541810035705566, "learning_rate": 8.811811811811812e-06, "loss": 0.7605, "step": 11870 }, { "epoch": 35.68, "grad_norm": 18.88973617553711, "learning_rate": 8.810810810810811e-06, "loss": 0.7422, "step": 11880 }, { "epoch": 35.71, "grad_norm": 17.730871200561523, "learning_rate": 8.809809809809811e-06, "loss": 0.7544, "step": 11890 }, { "epoch": 35.74, "grad_norm": 18.9675235748291, "learning_rate": 8.80880880880881e-06, "loss": 0.7171, "step": 11900 }, { "epoch": 35.77, "grad_norm": 13.948833465576172, "learning_rate": 8.807807807807808e-06, "loss": 0.6782, "step": 11910 }, { "epoch": 35.8, "grad_norm": 18.81002426147461, "learning_rate": 8.806806806806809e-06, "loss": 0.7638, "step": 11920 }, { "epoch": 35.83, "grad_norm": 15.860320091247559, "learning_rate": 8.805805805805806e-06, "loss": 0.7025, "step": 11930 }, { "epoch": 35.86, "grad_norm": 17.77602195739746, "learning_rate": 8.804804804804806e-06, "loss": 0.7484, "step": 11940 }, { "epoch": 35.89, "grad_norm": 25.042753219604492, "learning_rate": 8.803803803803804e-06, "loss": 0.7014, "step": 11950 }, { "epoch": 35.92, "grad_norm": 13.907393455505371, "learning_rate": 8.802802802802803e-06, "loss": 0.7804, "step": 11960 }, { "epoch": 35.95, "grad_norm": 13.40766429901123, "learning_rate": 8.801801801801803e-06, "loss": 0.6443, "step": 11970 }, { "epoch": 35.98, "grad_norm": 11.657933235168457, "learning_rate": 8.800800800800802e-06, "loss": 0.7393, "step": 11980 }, { "epoch": 36.0, "eval_accuracy": 0.8604, "eval_loss": 0.45787227153778076, "eval_runtime": 12.6531, "eval_samples_per_second": 790.318, "eval_steps_per_second": 3.161, "step": 11988 }, { "epoch": 36.01, "grad_norm": 11.238343238830566, "learning_rate": 8.7997997997998e-06, "loss": 0.7178, "step": 11990 }, { "epoch": 36.04, "grad_norm": 12.2100191116333, "learning_rate": 8.798798798798799e-06, "loss": 0.6748, "step": 12000 }, { "epoch": 36.07, "grad_norm": 15.70413875579834, "learning_rate": 8.797797797797798e-06, "loss": 0.77, "step": 12010 }, { "epoch": 36.1, "grad_norm": 13.942218780517578, "learning_rate": 8.796796796796796e-06, "loss": 0.7306, "step": 12020 }, { "epoch": 36.13, "grad_norm": 15.168832778930664, "learning_rate": 8.795795795795797e-06, "loss": 0.746, "step": 12030 }, { "epoch": 36.16, "grad_norm": 13.083436012268066, "learning_rate": 8.794794794794795e-06, "loss": 0.7263, "step": 12040 }, { "epoch": 36.19, "grad_norm": 11.604787826538086, "learning_rate": 8.793793793793794e-06, "loss": 0.6678, "step": 12050 }, { "epoch": 36.22, "grad_norm": 12.220256805419922, "learning_rate": 8.792792792792794e-06, "loss": 0.701, "step": 12060 }, { "epoch": 36.25, "grad_norm": 15.767328262329102, "learning_rate": 8.791791791791793e-06, "loss": 0.6641, "step": 12070 }, { "epoch": 36.28, "grad_norm": 18.16026496887207, "learning_rate": 8.790790790790791e-06, "loss": 0.6757, "step": 12080 }, { "epoch": 36.31, "grad_norm": 15.411042213439941, "learning_rate": 8.789789789789792e-06, "loss": 0.6962, "step": 12090 }, { "epoch": 36.34, "grad_norm": 16.6982421875, "learning_rate": 8.788788788788788e-06, "loss": 0.655, "step": 12100 }, { "epoch": 36.37, "grad_norm": 17.151487350463867, "learning_rate": 8.787787787787789e-06, "loss": 0.7209, "step": 12110 }, { "epoch": 36.4, "grad_norm": 11.250988960266113, "learning_rate": 8.786786786786787e-06, "loss": 0.7736, "step": 12120 }, { "epoch": 36.43, "grad_norm": 15.72909927368164, "learning_rate": 8.785785785785786e-06, "loss": 0.727, "step": 12130 }, { "epoch": 36.46, "grad_norm": 15.478221893310547, "learning_rate": 8.784784784784786e-06, "loss": 0.6861, "step": 12140 }, { "epoch": 36.49, "grad_norm": 14.448071479797363, "learning_rate": 8.783783783783785e-06, "loss": 0.6771, "step": 12150 }, { "epoch": 36.52, "grad_norm": 18.77178192138672, "learning_rate": 8.782782782782783e-06, "loss": 0.6514, "step": 12160 }, { "epoch": 36.55, "grad_norm": 19.86296844482422, "learning_rate": 8.781781781781784e-06, "loss": 0.7216, "step": 12170 }, { "epoch": 36.58, "grad_norm": 14.47397518157959, "learning_rate": 8.78078078078078e-06, "loss": 0.7528, "step": 12180 }, { "epoch": 36.61, "grad_norm": 18.463701248168945, "learning_rate": 8.779779779779781e-06, "loss": 0.7306, "step": 12190 }, { "epoch": 36.64, "grad_norm": 22.52227783203125, "learning_rate": 8.77877877877878e-06, "loss": 0.7225, "step": 12200 }, { "epoch": 36.67, "grad_norm": 19.450387954711914, "learning_rate": 8.777777777777778e-06, "loss": 0.7652, "step": 12210 }, { "epoch": 36.7, "grad_norm": 16.372949600219727, "learning_rate": 8.776776776776778e-06, "loss": 0.6711, "step": 12220 }, { "epoch": 36.73, "grad_norm": 16.941741943359375, "learning_rate": 8.775775775775777e-06, "loss": 0.6673, "step": 12230 }, { "epoch": 36.76, "grad_norm": 17.055347442626953, "learning_rate": 8.774774774774776e-06, "loss": 0.6923, "step": 12240 }, { "epoch": 36.79, "grad_norm": 12.24468994140625, "learning_rate": 8.773773773773774e-06, "loss": 0.7552, "step": 12250 }, { "epoch": 36.82, "grad_norm": 15.246345520019531, "learning_rate": 8.772772772772773e-06, "loss": 0.6756, "step": 12260 }, { "epoch": 36.85, "grad_norm": 14.248589515686035, "learning_rate": 8.771771771771771e-06, "loss": 0.697, "step": 12270 }, { "epoch": 36.88, "grad_norm": 18.042526245117188, "learning_rate": 8.770770770770772e-06, "loss": 0.7176, "step": 12280 }, { "epoch": 36.91, "grad_norm": 17.553979873657227, "learning_rate": 8.76976976976977e-06, "loss": 0.7256, "step": 12290 }, { "epoch": 36.94, "grad_norm": 14.7088623046875, "learning_rate": 8.768768768768769e-06, "loss": 0.7202, "step": 12300 }, { "epoch": 36.97, "grad_norm": 13.636119842529297, "learning_rate": 8.767767767767769e-06, "loss": 0.6976, "step": 12310 }, { "epoch": 37.0, "grad_norm": 13.1280517578125, "learning_rate": 8.766766766766768e-06, "loss": 0.7393, "step": 12320 }, { "epoch": 37.0, "eval_accuracy": 0.8619, "eval_loss": 0.4559956192970276, "eval_runtime": 12.8598, "eval_samples_per_second": 777.618, "eval_steps_per_second": 3.11, "step": 12321 }, { "epoch": 37.03, "grad_norm": 19.153818130493164, "learning_rate": 8.765765765765766e-06, "loss": 0.7586, "step": 12330 }, { "epoch": 37.06, "grad_norm": 17.784103393554688, "learning_rate": 8.764764764764767e-06, "loss": 0.689, "step": 12340 }, { "epoch": 37.09, "grad_norm": 20.15745735168457, "learning_rate": 8.763763763763763e-06, "loss": 0.7198, "step": 12350 }, { "epoch": 37.12, "grad_norm": 19.974891662597656, "learning_rate": 8.762762762762764e-06, "loss": 0.7454, "step": 12360 }, { "epoch": 37.15, "grad_norm": 15.03307819366455, "learning_rate": 8.761761761761762e-06, "loss": 0.6477, "step": 12370 }, { "epoch": 37.18, "grad_norm": 18.829687118530273, "learning_rate": 8.760760760760761e-06, "loss": 0.7508, "step": 12380 }, { "epoch": 37.21, "grad_norm": 24.866214752197266, "learning_rate": 8.759759759759761e-06, "loss": 0.7404, "step": 12390 }, { "epoch": 37.24, "grad_norm": 12.424365043640137, "learning_rate": 8.75875875875876e-06, "loss": 0.6448, "step": 12400 }, { "epoch": 37.27, "grad_norm": 17.853824615478516, "learning_rate": 8.757757757757758e-06, "loss": 0.6773, "step": 12410 }, { "epoch": 37.3, "grad_norm": 15.64295768737793, "learning_rate": 8.756756756756759e-06, "loss": 0.6744, "step": 12420 }, { "epoch": 37.33, "grad_norm": 13.909880638122559, "learning_rate": 8.755755755755756e-06, "loss": 0.7238, "step": 12430 }, { "epoch": 37.36, "grad_norm": 12.074225425720215, "learning_rate": 8.754754754754756e-06, "loss": 0.7357, "step": 12440 }, { "epoch": 37.39, "grad_norm": 20.037227630615234, "learning_rate": 8.753753753753755e-06, "loss": 0.6693, "step": 12450 }, { "epoch": 37.42, "grad_norm": 14.504189491271973, "learning_rate": 8.752752752752753e-06, "loss": 0.6185, "step": 12460 }, { "epoch": 37.45, "grad_norm": 14.980822563171387, "learning_rate": 8.751751751751752e-06, "loss": 0.7665, "step": 12470 }, { "epoch": 37.48, "grad_norm": 14.017216682434082, "learning_rate": 8.750750750750752e-06, "loss": 0.7558, "step": 12480 }, { "epoch": 37.51, "grad_norm": 18.15026092529297, "learning_rate": 8.74974974974975e-06, "loss": 0.7032, "step": 12490 }, { "epoch": 37.54, "grad_norm": 21.63968849182129, "learning_rate": 8.74874874874875e-06, "loss": 0.7766, "step": 12500 }, { "epoch": 37.57, "grad_norm": 13.396917343139648, "learning_rate": 8.747747747747748e-06, "loss": 0.7048, "step": 12510 }, { "epoch": 37.6, "grad_norm": 12.383674621582031, "learning_rate": 8.746746746746746e-06, "loss": 0.7738, "step": 12520 }, { "epoch": 37.63, "grad_norm": 14.813749313354492, "learning_rate": 8.745745745745747e-06, "loss": 0.7799, "step": 12530 }, { "epoch": 37.66, "grad_norm": 18.46799659729004, "learning_rate": 8.744744744744745e-06, "loss": 0.7278, "step": 12540 }, { "epoch": 37.69, "grad_norm": 16.44928550720215, "learning_rate": 8.743743743743744e-06, "loss": 0.6939, "step": 12550 }, { "epoch": 37.72, "grad_norm": 16.80510902404785, "learning_rate": 8.742742742742744e-06, "loss": 0.6232, "step": 12560 }, { "epoch": 37.75, "grad_norm": 17.623991012573242, "learning_rate": 8.741741741741743e-06, "loss": 0.6502, "step": 12570 }, { "epoch": 37.78, "grad_norm": 22.835899353027344, "learning_rate": 8.740740740740741e-06, "loss": 0.6595, "step": 12580 }, { "epoch": 37.81, "grad_norm": 17.97762680053711, "learning_rate": 8.739739739739742e-06, "loss": 0.6918, "step": 12590 }, { "epoch": 37.84, "grad_norm": 16.164525985717773, "learning_rate": 8.738738738738739e-06, "loss": 0.7216, "step": 12600 }, { "epoch": 37.87, "grad_norm": 19.715900421142578, "learning_rate": 8.737737737737739e-06, "loss": 0.727, "step": 12610 }, { "epoch": 37.9, "grad_norm": 17.584728240966797, "learning_rate": 8.736736736736737e-06, "loss": 0.714, "step": 12620 }, { "epoch": 37.93, "grad_norm": 14.52590274810791, "learning_rate": 8.735735735735736e-06, "loss": 0.7141, "step": 12630 }, { "epoch": 37.96, "grad_norm": 11.749207496643066, "learning_rate": 8.734734734734736e-06, "loss": 0.7204, "step": 12640 }, { "epoch": 37.99, "grad_norm": 12.749289512634277, "learning_rate": 8.733733733733735e-06, "loss": 0.7599, "step": 12650 }, { "epoch": 38.0, "eval_accuracy": 0.8637, "eval_loss": 0.4503398835659027, "eval_runtime": 12.8021, "eval_samples_per_second": 781.122, "eval_steps_per_second": 3.124, "step": 12654 }, { "epoch": 38.02, "grad_norm": 12.482909202575684, "learning_rate": 8.732732732732733e-06, "loss": 0.6813, "step": 12660 }, { "epoch": 38.05, "grad_norm": 14.705399513244629, "learning_rate": 8.731731731731734e-06, "loss": 0.716, "step": 12670 }, { "epoch": 38.08, "grad_norm": 12.497817039489746, "learning_rate": 8.73073073073073e-06, "loss": 0.6871, "step": 12680 }, { "epoch": 38.11, "grad_norm": 18.887226104736328, "learning_rate": 8.72972972972973e-06, "loss": 0.6538, "step": 12690 }, { "epoch": 38.14, "grad_norm": 26.343616485595703, "learning_rate": 8.72872872872873e-06, "loss": 0.6999, "step": 12700 }, { "epoch": 38.17, "grad_norm": 17.42749786376953, "learning_rate": 8.727727727727728e-06, "loss": 0.6758, "step": 12710 }, { "epoch": 38.2, "grad_norm": 16.925424575805664, "learning_rate": 8.726726726726727e-06, "loss": 0.7202, "step": 12720 }, { "epoch": 38.23, "grad_norm": 19.516830444335938, "learning_rate": 8.725725725725727e-06, "loss": 0.6942, "step": 12730 }, { "epoch": 38.26, "grad_norm": 23.34222984313965, "learning_rate": 8.724724724724726e-06, "loss": 0.7196, "step": 12740 }, { "epoch": 38.29, "grad_norm": 14.984414100646973, "learning_rate": 8.723723723723724e-06, "loss": 0.7194, "step": 12750 }, { "epoch": 38.32, "grad_norm": 17.17688751220703, "learning_rate": 8.722722722722723e-06, "loss": 0.7218, "step": 12760 }, { "epoch": 38.35, "grad_norm": 13.632336616516113, "learning_rate": 8.721721721721721e-06, "loss": 0.686, "step": 12770 }, { "epoch": 38.38, "grad_norm": 12.847562789916992, "learning_rate": 8.720720720720722e-06, "loss": 0.6567, "step": 12780 }, { "epoch": 38.41, "grad_norm": 17.647014617919922, "learning_rate": 8.71971971971972e-06, "loss": 0.673, "step": 12790 }, { "epoch": 38.44, "grad_norm": 14.931132316589355, "learning_rate": 8.718718718718719e-06, "loss": 0.668, "step": 12800 }, { "epoch": 38.47, "grad_norm": 17.570600509643555, "learning_rate": 8.71771771771772e-06, "loss": 0.7211, "step": 12810 }, { "epoch": 38.5, "grad_norm": 19.52607536315918, "learning_rate": 8.716716716716718e-06, "loss": 0.7171, "step": 12820 }, { "epoch": 38.53, "grad_norm": 17.393821716308594, "learning_rate": 8.715715715715716e-06, "loss": 0.656, "step": 12830 }, { "epoch": 38.56, "grad_norm": 17.661428451538086, "learning_rate": 8.714714714714717e-06, "loss": 0.6668, "step": 12840 }, { "epoch": 38.59, "grad_norm": 19.984769821166992, "learning_rate": 8.713713713713714e-06, "loss": 0.6828, "step": 12850 }, { "epoch": 38.62, "grad_norm": 17.77810287475586, "learning_rate": 8.712712712712714e-06, "loss": 0.7671, "step": 12860 }, { "epoch": 38.65, "grad_norm": 14.88648509979248, "learning_rate": 8.711711711711712e-06, "loss": 0.6844, "step": 12870 }, { "epoch": 38.68, "grad_norm": 13.530986785888672, "learning_rate": 8.710710710710711e-06, "loss": 0.7069, "step": 12880 }, { "epoch": 38.71, "grad_norm": 16.827951431274414, "learning_rate": 8.709709709709711e-06, "loss": 0.7527, "step": 12890 }, { "epoch": 38.74, "grad_norm": 18.00787353515625, "learning_rate": 8.70870870870871e-06, "loss": 0.6821, "step": 12900 }, { "epoch": 38.77, "grad_norm": 11.284110069274902, "learning_rate": 8.707707707707708e-06, "loss": 0.642, "step": 12910 }, { "epoch": 38.8, "grad_norm": 14.642751693725586, "learning_rate": 8.706706706706707e-06, "loss": 0.7072, "step": 12920 }, { "epoch": 38.83, "grad_norm": 13.521146774291992, "learning_rate": 8.705705705705706e-06, "loss": 0.6929, "step": 12930 }, { "epoch": 38.86, "grad_norm": 15.691649436950684, "learning_rate": 8.704704704704704e-06, "loss": 0.6839, "step": 12940 }, { "epoch": 38.89, "grad_norm": 15.077134132385254, "learning_rate": 8.703703703703705e-06, "loss": 0.7288, "step": 12950 }, { "epoch": 38.92, "grad_norm": 18.418928146362305, "learning_rate": 8.702702702702703e-06, "loss": 0.7192, "step": 12960 }, { "epoch": 38.95, "grad_norm": 16.348697662353516, "learning_rate": 8.701701701701702e-06, "loss": 0.7042, "step": 12970 }, { "epoch": 38.98, "grad_norm": 13.3355073928833, "learning_rate": 8.700700700700702e-06, "loss": 0.6636, "step": 12980 }, { "epoch": 39.0, "eval_accuracy": 0.8636, "eval_loss": 0.4542168974876404, "eval_runtime": 12.7582, "eval_samples_per_second": 783.808, "eval_steps_per_second": 3.135, "step": 12987 }, { "epoch": 39.01, "grad_norm": 17.700868606567383, "learning_rate": 8.6996996996997e-06, "loss": 0.6941, "step": 12990 }, { "epoch": 39.04, "grad_norm": 22.615550994873047, "learning_rate": 8.6986986986987e-06, "loss": 0.6983, "step": 13000 }, { "epoch": 39.07, "grad_norm": 19.436965942382812, "learning_rate": 8.697697697697698e-06, "loss": 0.6828, "step": 13010 }, { "epoch": 39.1, "grad_norm": 12.391670227050781, "learning_rate": 8.696696696696696e-06, "loss": 0.7454, "step": 13020 }, { "epoch": 39.13, "grad_norm": 14.891891479492188, "learning_rate": 8.695695695695697e-06, "loss": 0.6627, "step": 13030 }, { "epoch": 39.16, "grad_norm": 13.117813110351562, "learning_rate": 8.694694694694695e-06, "loss": 0.6927, "step": 13040 }, { "epoch": 39.19, "grad_norm": 15.762950897216797, "learning_rate": 8.693693693693694e-06, "loss": 0.6707, "step": 13050 }, { "epoch": 39.22, "grad_norm": 16.58308219909668, "learning_rate": 8.692692692692694e-06, "loss": 0.6483, "step": 13060 }, { "epoch": 39.25, "grad_norm": 19.617237091064453, "learning_rate": 8.691691691691693e-06, "loss": 0.6582, "step": 13070 }, { "epoch": 39.28, "grad_norm": 13.417759895324707, "learning_rate": 8.690690690690691e-06, "loss": 0.6729, "step": 13080 }, { "epoch": 39.31, "grad_norm": 13.217031478881836, "learning_rate": 8.689689689689692e-06, "loss": 0.7673, "step": 13090 }, { "epoch": 39.34, "grad_norm": 18.13391876220703, "learning_rate": 8.688688688688689e-06, "loss": 0.7142, "step": 13100 }, { "epoch": 39.37, "grad_norm": 15.779520988464355, "learning_rate": 8.687687687687689e-06, "loss": 0.7541, "step": 13110 }, { "epoch": 39.4, "grad_norm": 14.55923080444336, "learning_rate": 8.686686686686687e-06, "loss": 0.7059, "step": 13120 }, { "epoch": 39.43, "grad_norm": 15.300395965576172, "learning_rate": 8.685685685685686e-06, "loss": 0.7028, "step": 13130 }, { "epoch": 39.46, "grad_norm": 12.668709754943848, "learning_rate": 8.684684684684686e-06, "loss": 0.7071, "step": 13140 }, { "epoch": 39.49, "grad_norm": 27.27690887451172, "learning_rate": 8.683683683683685e-06, "loss": 0.7162, "step": 13150 }, { "epoch": 39.52, "grad_norm": 24.240915298461914, "learning_rate": 8.682682682682684e-06, "loss": 0.6125, "step": 13160 }, { "epoch": 39.55, "grad_norm": 13.969755172729492, "learning_rate": 8.681681681681682e-06, "loss": 0.7409, "step": 13170 }, { "epoch": 39.58, "grad_norm": 17.5896053314209, "learning_rate": 8.68068068068068e-06, "loss": 0.755, "step": 13180 }, { "epoch": 39.61, "grad_norm": 15.106500625610352, "learning_rate": 8.67967967967968e-06, "loss": 0.7074, "step": 13190 }, { "epoch": 39.64, "grad_norm": 13.774001121520996, "learning_rate": 8.67867867867868e-06, "loss": 0.7052, "step": 13200 }, { "epoch": 39.67, "grad_norm": 17.810007095336914, "learning_rate": 8.677677677677678e-06, "loss": 0.692, "step": 13210 }, { "epoch": 39.7, "grad_norm": 12.47335147857666, "learning_rate": 8.676676676676677e-06, "loss": 0.7536, "step": 13220 }, { "epoch": 39.73, "grad_norm": 17.86687660217285, "learning_rate": 8.675675675675677e-06, "loss": 0.7456, "step": 13230 }, { "epoch": 39.76, "grad_norm": 13.085111618041992, "learning_rate": 8.674674674674676e-06, "loss": 0.7243, "step": 13240 }, { "epoch": 39.79, "grad_norm": 15.906510353088379, "learning_rate": 8.673673673673674e-06, "loss": 0.7421, "step": 13250 }, { "epoch": 39.82, "grad_norm": 17.313884735107422, "learning_rate": 8.672672672672673e-06, "loss": 0.7011, "step": 13260 }, { "epoch": 39.85, "grad_norm": 21.422687530517578, "learning_rate": 8.671671671671671e-06, "loss": 0.7689, "step": 13270 }, { "epoch": 39.88, "grad_norm": 13.853211402893066, "learning_rate": 8.670670670670672e-06, "loss": 0.6439, "step": 13280 }, { "epoch": 39.91, "grad_norm": 14.36677074432373, "learning_rate": 8.66966966966967e-06, "loss": 0.6398, "step": 13290 }, { "epoch": 39.94, "grad_norm": 14.41776180267334, "learning_rate": 8.668668668668669e-06, "loss": 0.6711, "step": 13300 }, { "epoch": 39.97, "grad_norm": 15.397546768188477, "learning_rate": 8.66766766766767e-06, "loss": 0.7268, "step": 13310 }, { "epoch": 40.0, "grad_norm": 26.0655460357666, "learning_rate": 8.666666666666668e-06, "loss": 0.6759, "step": 13320 }, { "epoch": 40.0, "eval_accuracy": 0.8631, "eval_loss": 0.4483490586280823, "eval_runtime": 12.7377, "eval_samples_per_second": 785.072, "eval_steps_per_second": 3.14, "step": 13320 }, { "epoch": 40.03, "grad_norm": 16.34037208557129, "learning_rate": 8.665665665665666e-06, "loss": 0.7081, "step": 13330 }, { "epoch": 40.06, "grad_norm": 22.965871810913086, "learning_rate": 8.664664664664665e-06, "loss": 0.7028, "step": 13340 }, { "epoch": 40.09, "grad_norm": 15.480595588684082, "learning_rate": 8.663663663663664e-06, "loss": 0.6835, "step": 13350 }, { "epoch": 40.12, "grad_norm": 20.957406997680664, "learning_rate": 8.662662662662664e-06, "loss": 0.7048, "step": 13360 }, { "epoch": 40.15, "grad_norm": 17.77429962158203, "learning_rate": 8.661661661661662e-06, "loss": 0.6815, "step": 13370 }, { "epoch": 40.18, "grad_norm": 15.408767700195312, "learning_rate": 8.660660660660661e-06, "loss": 0.7629, "step": 13380 }, { "epoch": 40.21, "grad_norm": 29.371509552001953, "learning_rate": 8.65965965965966e-06, "loss": 0.6937, "step": 13390 }, { "epoch": 40.24, "grad_norm": 18.90596580505371, "learning_rate": 8.65865865865866e-06, "loss": 0.7482, "step": 13400 }, { "epoch": 40.27, "grad_norm": 15.754220008850098, "learning_rate": 8.657657657657659e-06, "loss": 0.7235, "step": 13410 }, { "epoch": 40.3, "grad_norm": 14.71186637878418, "learning_rate": 8.656656656656657e-06, "loss": 0.641, "step": 13420 }, { "epoch": 40.33, "grad_norm": 13.372477531433105, "learning_rate": 8.655655655655656e-06, "loss": 0.7081, "step": 13430 }, { "epoch": 40.36, "grad_norm": 14.14633846282959, "learning_rate": 8.654654654654654e-06, "loss": 0.6236, "step": 13440 }, { "epoch": 40.39, "grad_norm": 19.37156105041504, "learning_rate": 8.653653653653655e-06, "loss": 0.7461, "step": 13450 }, { "epoch": 40.42, "grad_norm": 10.999704360961914, "learning_rate": 8.652652652652653e-06, "loss": 0.7108, "step": 13460 }, { "epoch": 40.45, "grad_norm": 13.646711349487305, "learning_rate": 8.651651651651652e-06, "loss": 0.6774, "step": 13470 }, { "epoch": 40.48, "grad_norm": 16.37213897705078, "learning_rate": 8.650650650650652e-06, "loss": 0.7388, "step": 13480 }, { "epoch": 40.51, "grad_norm": 13.935249328613281, "learning_rate": 8.64964964964965e-06, "loss": 0.6764, "step": 13490 }, { "epoch": 40.54, "grad_norm": 18.431129455566406, "learning_rate": 8.64864864864865e-06, "loss": 0.6639, "step": 13500 }, { "epoch": 40.57, "grad_norm": 17.029029846191406, "learning_rate": 8.647647647647648e-06, "loss": 0.7311, "step": 13510 }, { "epoch": 40.6, "grad_norm": 17.871564865112305, "learning_rate": 8.646646646646646e-06, "loss": 0.6722, "step": 13520 }, { "epoch": 40.63, "grad_norm": 21.63091278076172, "learning_rate": 8.645645645645647e-06, "loss": 0.6511, "step": 13530 }, { "epoch": 40.66, "grad_norm": 16.98485565185547, "learning_rate": 8.644644644644645e-06, "loss": 0.6225, "step": 13540 }, { "epoch": 40.69, "grad_norm": 21.113874435424805, "learning_rate": 8.643643643643644e-06, "loss": 0.6879, "step": 13550 }, { "epoch": 40.72, "grad_norm": 14.520386695861816, "learning_rate": 8.642642642642644e-06, "loss": 0.681, "step": 13560 }, { "epoch": 40.75, "grad_norm": 18.337249755859375, "learning_rate": 8.641641641641643e-06, "loss": 0.6918, "step": 13570 }, { "epoch": 40.78, "grad_norm": 17.143888473510742, "learning_rate": 8.640640640640641e-06, "loss": 0.6627, "step": 13580 }, { "epoch": 40.81, "grad_norm": 16.210668563842773, "learning_rate": 8.63963963963964e-06, "loss": 0.6589, "step": 13590 }, { "epoch": 40.84, "grad_norm": 12.302651405334473, "learning_rate": 8.638638638638639e-06, "loss": 0.7005, "step": 13600 }, { "epoch": 40.87, "grad_norm": 15.272514343261719, "learning_rate": 8.637637637637637e-06, "loss": 0.6784, "step": 13610 }, { "epoch": 40.9, "grad_norm": 16.107824325561523, "learning_rate": 8.636636636636638e-06, "loss": 0.6779, "step": 13620 }, { "epoch": 40.93, "grad_norm": 10.67597770690918, "learning_rate": 8.635635635635636e-06, "loss": 0.6982, "step": 13630 }, { "epoch": 40.96, "grad_norm": 16.74396514892578, "learning_rate": 8.634634634634635e-06, "loss": 0.7181, "step": 13640 }, { "epoch": 40.99, "grad_norm": 15.14286994934082, "learning_rate": 8.633633633633635e-06, "loss": 0.7266, "step": 13650 }, { "epoch": 41.0, "eval_accuracy": 0.8636, "eval_loss": 0.44839540123939514, "eval_runtime": 12.8276, "eval_samples_per_second": 779.568, "eval_steps_per_second": 3.118, "step": 13653 }, { "epoch": 41.02, "grad_norm": 11.654528617858887, "learning_rate": 8.632632632632634e-06, "loss": 0.6292, "step": 13660 }, { "epoch": 41.05, "grad_norm": 11.669693946838379, "learning_rate": 8.631631631631632e-06, "loss": 0.6264, "step": 13670 }, { "epoch": 41.08, "grad_norm": 24.02885627746582, "learning_rate": 8.63063063063063e-06, "loss": 0.6173, "step": 13680 }, { "epoch": 41.11, "grad_norm": 13.989689826965332, "learning_rate": 8.62962962962963e-06, "loss": 0.6185, "step": 13690 }, { "epoch": 41.14, "grad_norm": 14.894391059875488, "learning_rate": 8.62862862862863e-06, "loss": 0.6867, "step": 13700 }, { "epoch": 41.17, "grad_norm": 12.18368911743164, "learning_rate": 8.627627627627628e-06, "loss": 0.7027, "step": 13710 }, { "epoch": 41.2, "grad_norm": 10.826661109924316, "learning_rate": 8.626626626626627e-06, "loss": 0.5933, "step": 13720 }, { "epoch": 41.23, "grad_norm": 15.728558540344238, "learning_rate": 8.625625625625627e-06, "loss": 0.6609, "step": 13730 }, { "epoch": 41.26, "grad_norm": 12.73317813873291, "learning_rate": 8.624624624624626e-06, "loss": 0.6357, "step": 13740 }, { "epoch": 41.29, "grad_norm": 20.110111236572266, "learning_rate": 8.623623623623624e-06, "loss": 0.6726, "step": 13750 }, { "epoch": 41.32, "grad_norm": 11.981986999511719, "learning_rate": 8.622622622622623e-06, "loss": 0.6711, "step": 13760 }, { "epoch": 41.35, "grad_norm": 18.302762985229492, "learning_rate": 8.621621621621622e-06, "loss": 0.6916, "step": 13770 }, { "epoch": 41.38, "grad_norm": 14.573165893554688, "learning_rate": 8.620620620620622e-06, "loss": 0.7553, "step": 13780 }, { "epoch": 41.41, "grad_norm": 11.373902320861816, "learning_rate": 8.61961961961962e-06, "loss": 0.6533, "step": 13790 }, { "epoch": 41.44, "grad_norm": 19.43169593811035, "learning_rate": 8.618618618618619e-06, "loss": 0.6569, "step": 13800 }, { "epoch": 41.47, "grad_norm": 18.107942581176758, "learning_rate": 8.61761761761762e-06, "loss": 0.6466, "step": 13810 }, { "epoch": 41.5, "grad_norm": 15.347994804382324, "learning_rate": 8.616616616616618e-06, "loss": 0.6784, "step": 13820 }, { "epoch": 41.53, "grad_norm": 15.557960510253906, "learning_rate": 8.615615615615616e-06, "loss": 0.6951, "step": 13830 }, { "epoch": 41.56, "grad_norm": 13.998001098632812, "learning_rate": 8.614614614614615e-06, "loss": 0.7094, "step": 13840 }, { "epoch": 41.59, "grad_norm": 21.74806785583496, "learning_rate": 8.613613613613614e-06, "loss": 0.6718, "step": 13850 }, { "epoch": 41.62, "grad_norm": 17.380090713500977, "learning_rate": 8.612612612612612e-06, "loss": 0.6722, "step": 13860 }, { "epoch": 41.65, "grad_norm": 16.73099708557129, "learning_rate": 8.611611611611613e-06, "loss": 0.6847, "step": 13870 }, { "epoch": 41.68, "grad_norm": 14.040267944335938, "learning_rate": 8.610610610610611e-06, "loss": 0.6534, "step": 13880 }, { "epoch": 41.71, "grad_norm": 17.12603187561035, "learning_rate": 8.60960960960961e-06, "loss": 0.636, "step": 13890 }, { "epoch": 41.74, "grad_norm": 13.38772964477539, "learning_rate": 8.60860860860861e-06, "loss": 0.7212, "step": 13900 }, { "epoch": 41.77, "grad_norm": 13.89432430267334, "learning_rate": 8.607607607607609e-06, "loss": 0.7177, "step": 13910 }, { "epoch": 41.8, "grad_norm": 16.896560668945312, "learning_rate": 8.606606606606607e-06, "loss": 0.7, "step": 13920 }, { "epoch": 41.83, "grad_norm": 15.703804969787598, "learning_rate": 8.605605605605606e-06, "loss": 0.68, "step": 13930 }, { "epoch": 41.86, "grad_norm": 19.20931053161621, "learning_rate": 8.604604604604604e-06, "loss": 0.6466, "step": 13940 }, { "epoch": 41.89, "grad_norm": 15.076231002807617, "learning_rate": 8.603603603603605e-06, "loss": 0.6595, "step": 13950 }, { "epoch": 41.92, "grad_norm": 19.755826950073242, "learning_rate": 8.602602602602603e-06, "loss": 0.6285, "step": 13960 }, { "epoch": 41.95, "grad_norm": 16.200489044189453, "learning_rate": 8.601601601601602e-06, "loss": 0.7014, "step": 13970 }, { "epoch": 41.98, "grad_norm": 13.048054695129395, "learning_rate": 8.600600600600602e-06, "loss": 0.6819, "step": 13980 }, { "epoch": 42.0, "eval_accuracy": 0.8647, "eval_loss": 0.44531553983688354, "eval_runtime": 12.8015, "eval_samples_per_second": 781.157, "eval_steps_per_second": 3.125, "step": 13986 }, { "epoch": 42.01, "grad_norm": 18.29779624938965, "learning_rate": 8.5995995995996e-06, "loss": 0.6019, "step": 13990 }, { "epoch": 42.04, "grad_norm": 16.439821243286133, "learning_rate": 8.5985985985986e-06, "loss": 0.6732, "step": 14000 }, { "epoch": 42.07, "grad_norm": 16.085302352905273, "learning_rate": 8.597597597597598e-06, "loss": 0.724, "step": 14010 }, { "epoch": 42.1, "grad_norm": 14.533585548400879, "learning_rate": 8.596596596596597e-06, "loss": 0.645, "step": 14020 }, { "epoch": 42.13, "grad_norm": 21.371252059936523, "learning_rate": 8.595595595595597e-06, "loss": 0.6839, "step": 14030 }, { "epoch": 42.16, "grad_norm": 20.06671905517578, "learning_rate": 8.594594594594595e-06, "loss": 0.6694, "step": 14040 }, { "epoch": 42.19, "grad_norm": 12.474686622619629, "learning_rate": 8.593593593593594e-06, "loss": 0.7014, "step": 14050 }, { "epoch": 42.22, "grad_norm": 16.817657470703125, "learning_rate": 8.592592592592593e-06, "loss": 0.6726, "step": 14060 }, { "epoch": 42.25, "grad_norm": 15.75899600982666, "learning_rate": 8.591591591591593e-06, "loss": 0.7043, "step": 14070 }, { "epoch": 42.28, "grad_norm": 17.445838928222656, "learning_rate": 8.590590590590592e-06, "loss": 0.7285, "step": 14080 }, { "epoch": 42.31, "grad_norm": 15.50268840789795, "learning_rate": 8.58958958958959e-06, "loss": 0.6661, "step": 14090 }, { "epoch": 42.34, "grad_norm": 15.050628662109375, "learning_rate": 8.588588588588589e-06, "loss": 0.7308, "step": 14100 }, { "epoch": 42.37, "grad_norm": 17.504926681518555, "learning_rate": 8.587587587587587e-06, "loss": 0.6646, "step": 14110 }, { "epoch": 42.4, "grad_norm": 16.50434684753418, "learning_rate": 8.586586586586588e-06, "loss": 0.7288, "step": 14120 }, { "epoch": 42.43, "grad_norm": 17.286767959594727, "learning_rate": 8.585585585585586e-06, "loss": 0.6343, "step": 14130 }, { "epoch": 42.46, "grad_norm": 18.94367218017578, "learning_rate": 8.584584584584585e-06, "loss": 0.653, "step": 14140 }, { "epoch": 42.49, "grad_norm": 18.6459903717041, "learning_rate": 8.583583583583585e-06, "loss": 0.6394, "step": 14150 }, { "epoch": 42.52, "grad_norm": 15.302431106567383, "learning_rate": 8.582582582582584e-06, "loss": 0.7375, "step": 14160 }, { "epoch": 42.55, "grad_norm": 15.144591331481934, "learning_rate": 8.581581581581582e-06, "loss": 0.7067, "step": 14170 }, { "epoch": 42.58, "grad_norm": 17.692134857177734, "learning_rate": 8.580580580580581e-06, "loss": 0.7066, "step": 14180 }, { "epoch": 42.61, "grad_norm": 17.765499114990234, "learning_rate": 8.57957957957958e-06, "loss": 0.6299, "step": 14190 }, { "epoch": 42.64, "grad_norm": 11.533652305603027, "learning_rate": 8.57857857857858e-06, "loss": 0.6854, "step": 14200 }, { "epoch": 42.67, "grad_norm": 13.993724822998047, "learning_rate": 8.577577577577578e-06, "loss": 0.7785, "step": 14210 }, { "epoch": 42.7, "grad_norm": 17.138479232788086, "learning_rate": 8.576576576576577e-06, "loss": 0.6751, "step": 14220 }, { "epoch": 42.73, "grad_norm": 18.59140968322754, "learning_rate": 8.575575575575577e-06, "loss": 0.6469, "step": 14230 }, { "epoch": 42.76, "grad_norm": 17.046478271484375, "learning_rate": 8.574574574574576e-06, "loss": 0.6916, "step": 14240 }, { "epoch": 42.79, "grad_norm": 21.127248764038086, "learning_rate": 8.573573573573574e-06, "loss": 0.7079, "step": 14250 }, { "epoch": 42.82, "grad_norm": 12.968728065490723, "learning_rate": 8.572572572572573e-06, "loss": 0.6931, "step": 14260 }, { "epoch": 42.85, "grad_norm": 12.719165802001953, "learning_rate": 8.571571571571572e-06, "loss": 0.6553, "step": 14270 }, { "epoch": 42.88, "grad_norm": 16.34621810913086, "learning_rate": 8.570570570570572e-06, "loss": 0.6198, "step": 14280 }, { "epoch": 42.91, "grad_norm": 19.220603942871094, "learning_rate": 8.56956956956957e-06, "loss": 0.7594, "step": 14290 }, { "epoch": 42.94, "grad_norm": 17.396108627319336, "learning_rate": 8.568568568568569e-06, "loss": 0.6612, "step": 14300 }, { "epoch": 42.97, "grad_norm": 17.92281723022461, "learning_rate": 8.567567567567568e-06, "loss": 0.5912, "step": 14310 }, { "epoch": 43.0, "eval_accuracy": 0.864, "eval_loss": 0.44930997490882874, "eval_runtime": 12.9537, "eval_samples_per_second": 771.983, "eval_steps_per_second": 3.088, "step": 14319 }, { "epoch": 43.0, "grad_norm": 20.381256103515625, "learning_rate": 8.566566566566568e-06, "loss": 0.7875, "step": 14320 }, { "epoch": 43.03, "grad_norm": 15.10037899017334, "learning_rate": 8.565565565565567e-06, "loss": 0.6471, "step": 14330 }, { "epoch": 43.06, "grad_norm": 23.53544044494629, "learning_rate": 8.564564564564565e-06, "loss": 0.7124, "step": 14340 }, { "epoch": 43.09, "grad_norm": 12.924009323120117, "learning_rate": 8.563563563563564e-06, "loss": 0.7007, "step": 14350 }, { "epoch": 43.12, "grad_norm": 14.097970008850098, "learning_rate": 8.562562562562562e-06, "loss": 0.6386, "step": 14360 }, { "epoch": 43.15, "grad_norm": 13.898308753967285, "learning_rate": 8.561561561561563e-06, "loss": 0.7065, "step": 14370 }, { "epoch": 43.18, "grad_norm": 22.49960708618164, "learning_rate": 8.560560560560561e-06, "loss": 0.6298, "step": 14380 }, { "epoch": 43.21, "grad_norm": 14.752483367919922, "learning_rate": 8.55955955955956e-06, "loss": 0.6568, "step": 14390 }, { "epoch": 43.24, "grad_norm": 16.16034698486328, "learning_rate": 8.55855855855856e-06, "loss": 0.6402, "step": 14400 }, { "epoch": 43.27, "grad_norm": 32.22293472290039, "learning_rate": 8.557557557557559e-06, "loss": 0.7523, "step": 14410 }, { "epoch": 43.3, "grad_norm": 12.088333129882812, "learning_rate": 8.556556556556557e-06, "loss": 0.5934, "step": 14420 }, { "epoch": 43.33, "grad_norm": 21.806533813476562, "learning_rate": 8.555555555555556e-06, "loss": 0.6815, "step": 14430 }, { "epoch": 43.36, "grad_norm": 15.63686466217041, "learning_rate": 8.554554554554554e-06, "loss": 0.66, "step": 14440 }, { "epoch": 43.39, "grad_norm": 16.214784622192383, "learning_rate": 8.553553553553555e-06, "loss": 0.7082, "step": 14450 }, { "epoch": 43.42, "grad_norm": 15.74312973022461, "learning_rate": 8.552552552552553e-06, "loss": 0.685, "step": 14460 }, { "epoch": 43.45, "grad_norm": 16.908809661865234, "learning_rate": 8.551551551551552e-06, "loss": 0.6638, "step": 14470 }, { "epoch": 43.48, "grad_norm": 14.65440559387207, "learning_rate": 8.550550550550552e-06, "loss": 0.6949, "step": 14480 }, { "epoch": 43.51, "grad_norm": 17.863666534423828, "learning_rate": 8.549549549549551e-06, "loss": 0.6373, "step": 14490 }, { "epoch": 43.54, "grad_norm": 16.440378189086914, "learning_rate": 8.54854854854855e-06, "loss": 0.6862, "step": 14500 }, { "epoch": 43.57, "grad_norm": 20.176227569580078, "learning_rate": 8.547547547547548e-06, "loss": 0.6744, "step": 14510 }, { "epoch": 43.6, "grad_norm": 18.182384490966797, "learning_rate": 8.546546546546547e-06, "loss": 0.6158, "step": 14520 }, { "epoch": 43.63, "grad_norm": 13.061956405639648, "learning_rate": 8.545545545545545e-06, "loss": 0.6501, "step": 14530 }, { "epoch": 43.66, "grad_norm": 14.27139663696289, "learning_rate": 8.544544544544546e-06, "loss": 0.6133, "step": 14540 }, { "epoch": 43.69, "grad_norm": 15.125067710876465, "learning_rate": 8.543543543543544e-06, "loss": 0.686, "step": 14550 }, { "epoch": 43.72, "grad_norm": 22.890474319458008, "learning_rate": 8.542542542542543e-06, "loss": 0.5918, "step": 14560 }, { "epoch": 43.75, "grad_norm": 15.257393836975098, "learning_rate": 8.541541541541543e-06, "loss": 0.7166, "step": 14570 }, { "epoch": 43.78, "grad_norm": 16.940078735351562, "learning_rate": 8.540540540540542e-06, "loss": 0.6683, "step": 14580 }, { "epoch": 43.81, "grad_norm": 16.10072135925293, "learning_rate": 8.53953953953954e-06, "loss": 0.6698, "step": 14590 }, { "epoch": 43.84, "grad_norm": 15.821409225463867, "learning_rate": 8.538538538538539e-06, "loss": 0.7221, "step": 14600 }, { "epoch": 43.87, "grad_norm": 17.96417808532715, "learning_rate": 8.537537537537537e-06, "loss": 0.677, "step": 14610 }, { "epoch": 43.9, "grad_norm": 15.759181022644043, "learning_rate": 8.536536536536538e-06, "loss": 0.6919, "step": 14620 }, { "epoch": 43.93, "grad_norm": 16.6920166015625, "learning_rate": 8.535535535535536e-06, "loss": 0.6148, "step": 14630 }, { "epoch": 43.96, "grad_norm": 17.795061111450195, "learning_rate": 8.534534534534535e-06, "loss": 0.6573, "step": 14640 }, { "epoch": 43.99, "grad_norm": 14.420255661010742, "learning_rate": 8.533533533533535e-06, "loss": 0.6803, "step": 14650 }, { "epoch": 44.0, "eval_accuracy": 0.8646, "eval_loss": 0.44525375962257385, "eval_runtime": 13.0886, "eval_samples_per_second": 764.025, "eval_steps_per_second": 3.056, "step": 14652 }, { "epoch": 44.02, "grad_norm": 15.55947494506836, "learning_rate": 8.532532532532534e-06, "loss": 0.5994, "step": 14660 }, { "epoch": 44.05, "grad_norm": 17.376937866210938, "learning_rate": 8.531531531531532e-06, "loss": 0.6174, "step": 14670 }, { "epoch": 44.08, "grad_norm": 12.870214462280273, "learning_rate": 8.530530530530531e-06, "loss": 0.6422, "step": 14680 }, { "epoch": 44.11, "grad_norm": 13.59985065460205, "learning_rate": 8.52952952952953e-06, "loss": 0.6884, "step": 14690 }, { "epoch": 44.14, "grad_norm": 15.210004806518555, "learning_rate": 8.52852852852853e-06, "loss": 0.6767, "step": 14700 }, { "epoch": 44.17, "grad_norm": 13.726428031921387, "learning_rate": 8.527527527527528e-06, "loss": 0.6895, "step": 14710 }, { "epoch": 44.2, "grad_norm": 13.653573036193848, "learning_rate": 8.526526526526527e-06, "loss": 0.638, "step": 14720 }, { "epoch": 44.23, "grad_norm": 13.787467002868652, "learning_rate": 8.525525525525527e-06, "loss": 0.6513, "step": 14730 }, { "epoch": 44.26, "grad_norm": 15.06092643737793, "learning_rate": 8.524524524524526e-06, "loss": 0.667, "step": 14740 }, { "epoch": 44.29, "grad_norm": 11.093276977539062, "learning_rate": 8.523523523523524e-06, "loss": 0.6214, "step": 14750 }, { "epoch": 44.32, "grad_norm": 13.810016632080078, "learning_rate": 8.522522522522523e-06, "loss": 0.6762, "step": 14760 }, { "epoch": 44.35, "grad_norm": 15.720680236816406, "learning_rate": 8.521521521521522e-06, "loss": 0.6101, "step": 14770 }, { "epoch": 44.38, "grad_norm": 16.587196350097656, "learning_rate": 8.52052052052052e-06, "loss": 0.6182, "step": 14780 }, { "epoch": 44.41, "grad_norm": 13.699054718017578, "learning_rate": 8.51951951951952e-06, "loss": 0.6334, "step": 14790 }, { "epoch": 44.44, "grad_norm": 12.843461036682129, "learning_rate": 8.518518518518519e-06, "loss": 0.723, "step": 14800 }, { "epoch": 44.47, "grad_norm": 17.749874114990234, "learning_rate": 8.517517517517518e-06, "loss": 0.6909, "step": 14810 }, { "epoch": 44.5, "grad_norm": 18.269649505615234, "learning_rate": 8.516516516516518e-06, "loss": 0.6154, "step": 14820 }, { "epoch": 44.53, "grad_norm": 14.030564308166504, "learning_rate": 8.515515515515517e-06, "loss": 0.6779, "step": 14830 }, { "epoch": 44.56, "grad_norm": 14.909416198730469, "learning_rate": 8.514514514514515e-06, "loss": 0.6351, "step": 14840 }, { "epoch": 44.59, "grad_norm": 24.80760383605957, "learning_rate": 8.513513513513514e-06, "loss": 0.6858, "step": 14850 }, { "epoch": 44.62, "grad_norm": 12.409850120544434, "learning_rate": 8.512512512512512e-06, "loss": 0.593, "step": 14860 }, { "epoch": 44.65, "grad_norm": 15.177338600158691, "learning_rate": 8.511511511511513e-06, "loss": 0.5928, "step": 14870 }, { "epoch": 44.68, "grad_norm": 17.484256744384766, "learning_rate": 8.510510510510511e-06, "loss": 0.675, "step": 14880 }, { "epoch": 44.71, "grad_norm": 14.007301330566406, "learning_rate": 8.50950950950951e-06, "loss": 0.644, "step": 14890 }, { "epoch": 44.74, "grad_norm": 21.29224395751953, "learning_rate": 8.50850850850851e-06, "loss": 0.6996, "step": 14900 }, { "epoch": 44.77, "grad_norm": 18.76585578918457, "learning_rate": 8.507507507507509e-06, "loss": 0.6909, "step": 14910 }, { "epoch": 44.8, "grad_norm": 16.887725830078125, "learning_rate": 8.506506506506507e-06, "loss": 0.6274, "step": 14920 }, { "epoch": 44.83, "grad_norm": 16.269027709960938, "learning_rate": 8.505505505505506e-06, "loss": 0.6791, "step": 14930 }, { "epoch": 44.86, "grad_norm": 11.667265892028809, "learning_rate": 8.504504504504505e-06, "loss": 0.692, "step": 14940 }, { "epoch": 44.89, "grad_norm": 12.998801231384277, "learning_rate": 8.503503503503505e-06, "loss": 0.6251, "step": 14950 }, { "epoch": 44.92, "grad_norm": 20.578771591186523, "learning_rate": 8.502502502502503e-06, "loss": 0.6662, "step": 14960 }, { "epoch": 44.95, "grad_norm": 13.840142250061035, "learning_rate": 8.501501501501502e-06, "loss": 0.7111, "step": 14970 }, { "epoch": 44.98, "grad_norm": 14.712730407714844, "learning_rate": 8.5005005005005e-06, "loss": 0.6898, "step": 14980 }, { "epoch": 45.0, "eval_accuracy": 0.8628, "eval_loss": 0.44582709670066833, "eval_runtime": 12.3919, "eval_samples_per_second": 806.978, "eval_steps_per_second": 3.228, "step": 14985 }, { "epoch": 45.02, "grad_norm": 15.17032241821289, "learning_rate": 8.499499499499501e-06, "loss": 0.795, "step": 14990 }, { "epoch": 45.05, "grad_norm": 14.758530616760254, "learning_rate": 8.4984984984985e-06, "loss": 0.6719, "step": 15000 }, { "epoch": 45.08, "grad_norm": 14.033868789672852, "learning_rate": 8.497497497497498e-06, "loss": 0.6007, "step": 15010 }, { "epoch": 45.11, "grad_norm": 20.025487899780273, "learning_rate": 8.496496496496497e-06, "loss": 0.6915, "step": 15020 }, { "epoch": 45.14, "grad_norm": 18.14263153076172, "learning_rate": 8.495495495495495e-06, "loss": 0.6224, "step": 15030 }, { "epoch": 45.17, "grad_norm": 18.517101287841797, "learning_rate": 8.494494494494496e-06, "loss": 0.6209, "step": 15040 }, { "epoch": 45.2, "grad_norm": 11.368416786193848, "learning_rate": 8.493493493493494e-06, "loss": 0.6446, "step": 15050 }, { "epoch": 45.23, "grad_norm": 14.640449523925781, "learning_rate": 8.492492492492493e-06, "loss": 0.6376, "step": 15060 }, { "epoch": 45.26, "grad_norm": 20.73483657836914, "learning_rate": 8.491491491491493e-06, "loss": 0.6679, "step": 15070 }, { "epoch": 45.29, "grad_norm": 15.753366470336914, "learning_rate": 8.490490490490492e-06, "loss": 0.6786, "step": 15080 }, { "epoch": 45.32, "grad_norm": 12.244842529296875, "learning_rate": 8.48948948948949e-06, "loss": 0.6502, "step": 15090 }, { "epoch": 45.35, "grad_norm": 16.75644874572754, "learning_rate": 8.488488488488489e-06, "loss": 0.6085, "step": 15100 }, { "epoch": 45.38, "grad_norm": 12.139744758605957, "learning_rate": 8.487487487487487e-06, "loss": 0.7269, "step": 15110 }, { "epoch": 45.41, "grad_norm": 13.40955924987793, "learning_rate": 8.486486486486488e-06, "loss": 0.6944, "step": 15120 }, { "epoch": 45.44, "grad_norm": 22.504512786865234, "learning_rate": 8.485485485485486e-06, "loss": 0.6239, "step": 15130 }, { "epoch": 45.47, "grad_norm": 14.809057235717773, "learning_rate": 8.484484484484485e-06, "loss": 0.6635, "step": 15140 }, { "epoch": 45.5, "grad_norm": 20.3834285736084, "learning_rate": 8.483483483483485e-06, "loss": 0.672, "step": 15150 }, { "epoch": 45.53, "grad_norm": 13.717203140258789, "learning_rate": 8.482482482482484e-06, "loss": 0.6436, "step": 15160 }, { "epoch": 45.56, "grad_norm": 13.105717658996582, "learning_rate": 8.481481481481482e-06, "loss": 0.6458, "step": 15170 }, { "epoch": 45.59, "grad_norm": 16.13928985595703, "learning_rate": 8.480480480480481e-06, "loss": 0.5647, "step": 15180 }, { "epoch": 45.62, "grad_norm": 14.582718849182129, "learning_rate": 8.47947947947948e-06, "loss": 0.6806, "step": 15190 }, { "epoch": 45.65, "grad_norm": 12.979229927062988, "learning_rate": 8.47847847847848e-06, "loss": 0.5787, "step": 15200 }, { "epoch": 45.68, "grad_norm": 14.528862953186035, "learning_rate": 8.477477477477478e-06, "loss": 0.679, "step": 15210 }, { "epoch": 45.71, "grad_norm": 14.974054336547852, "learning_rate": 8.476476476476477e-06, "loss": 0.6121, "step": 15220 }, { "epoch": 45.74, "grad_norm": 19.096973419189453, "learning_rate": 8.475475475475476e-06, "loss": 0.5976, "step": 15230 }, { "epoch": 45.77, "grad_norm": 12.103023529052734, "learning_rate": 8.474474474474476e-06, "loss": 0.6534, "step": 15240 }, { "epoch": 45.8, "grad_norm": 15.813861846923828, "learning_rate": 8.473473473473475e-06, "loss": 0.652, "step": 15250 }, { "epoch": 45.83, "grad_norm": 18.952356338500977, "learning_rate": 8.472472472472473e-06, "loss": 0.636, "step": 15260 }, { "epoch": 45.86, "grad_norm": 21.465044021606445, "learning_rate": 8.471471471471472e-06, "loss": 0.619, "step": 15270 }, { "epoch": 45.89, "grad_norm": 16.716754913330078, "learning_rate": 8.47047047047047e-06, "loss": 0.6996, "step": 15280 }, { "epoch": 45.92, "grad_norm": 18.515457153320312, "learning_rate": 8.46946946946947e-06, "loss": 0.6817, "step": 15290 }, { "epoch": 45.95, "grad_norm": 13.04875373840332, "learning_rate": 8.46846846846847e-06, "loss": 0.6051, "step": 15300 }, { "epoch": 45.98, "grad_norm": 12.385298728942871, "learning_rate": 8.467467467467468e-06, "loss": 0.6312, "step": 15310 }, { "epoch": 46.0, "eval_accuracy": 0.8636, "eval_loss": 0.44993892312049866, "eval_runtime": 12.9287, "eval_samples_per_second": 773.476, "eval_steps_per_second": 3.094, "step": 15318 }, { "epoch": 46.01, "grad_norm": 17.510120391845703, "learning_rate": 8.466466466466468e-06, "loss": 0.6119, "step": 15320 }, { "epoch": 46.04, "grad_norm": 16.67719268798828, "learning_rate": 8.465465465465467e-06, "loss": 0.5838, "step": 15330 }, { "epoch": 46.07, "grad_norm": 17.144563674926758, "learning_rate": 8.464464464464465e-06, "loss": 0.5972, "step": 15340 }, { "epoch": 46.1, "grad_norm": 14.985879898071289, "learning_rate": 8.463463463463464e-06, "loss": 0.6636, "step": 15350 }, { "epoch": 46.13, "grad_norm": 12.297521591186523, "learning_rate": 8.462462462462462e-06, "loss": 0.6748, "step": 15360 }, { "epoch": 46.16, "grad_norm": 14.004356384277344, "learning_rate": 8.461461461461463e-06, "loss": 0.6887, "step": 15370 }, { "epoch": 46.19, "grad_norm": 14.065787315368652, "learning_rate": 8.460460460460461e-06, "loss": 0.6011, "step": 15380 }, { "epoch": 46.22, "grad_norm": 16.010286331176758, "learning_rate": 8.45945945945946e-06, "loss": 0.6151, "step": 15390 }, { "epoch": 46.25, "grad_norm": 20.490942001342773, "learning_rate": 8.45845845845846e-06, "loss": 0.6239, "step": 15400 }, { "epoch": 46.28, "grad_norm": 17.279037475585938, "learning_rate": 8.457457457457459e-06, "loss": 0.6526, "step": 15410 }, { "epoch": 46.31, "grad_norm": 16.7100830078125, "learning_rate": 8.456456456456457e-06, "loss": 0.6632, "step": 15420 }, { "epoch": 46.34, "grad_norm": 13.888665199279785, "learning_rate": 8.455455455455456e-06, "loss": 0.6362, "step": 15430 }, { "epoch": 46.37, "grad_norm": 13.176531791687012, "learning_rate": 8.454454454454455e-06, "loss": 0.6445, "step": 15440 }, { "epoch": 46.4, "grad_norm": 25.23253631591797, "learning_rate": 8.453453453453453e-06, "loss": 0.5765, "step": 15450 }, { "epoch": 46.43, "grad_norm": 14.532776832580566, "learning_rate": 8.452452452452454e-06, "loss": 0.6501, "step": 15460 }, { "epoch": 46.46, "grad_norm": 15.153827667236328, "learning_rate": 8.451451451451452e-06, "loss": 0.6621, "step": 15470 }, { "epoch": 46.49, "grad_norm": 14.688456535339355, "learning_rate": 8.45045045045045e-06, "loss": 0.6491, "step": 15480 }, { "epoch": 46.52, "grad_norm": 18.969226837158203, "learning_rate": 8.449449449449451e-06, "loss": 0.6258, "step": 15490 }, { "epoch": 46.55, "grad_norm": 16.73055076599121, "learning_rate": 8.44844844844845e-06, "loss": 0.6336, "step": 15500 }, { "epoch": 46.58, "grad_norm": 12.873644828796387, "learning_rate": 8.447447447447448e-06, "loss": 0.6116, "step": 15510 }, { "epoch": 46.61, "grad_norm": 15.362987518310547, "learning_rate": 8.446446446446447e-06, "loss": 0.6924, "step": 15520 }, { "epoch": 46.64, "grad_norm": 15.47192096710205, "learning_rate": 8.445445445445445e-06, "loss": 0.6528, "step": 15530 }, { "epoch": 46.67, "grad_norm": 20.89593505859375, "learning_rate": 8.444444444444446e-06, "loss": 0.6524, "step": 15540 }, { "epoch": 46.7, "grad_norm": 24.830190658569336, "learning_rate": 8.443443443443444e-06, "loss": 0.6957, "step": 15550 }, { "epoch": 46.73, "grad_norm": 17.25192642211914, "learning_rate": 8.442442442442443e-06, "loss": 0.6185, "step": 15560 }, { "epoch": 46.76, "grad_norm": 17.689939498901367, "learning_rate": 8.441441441441443e-06, "loss": 0.6935, "step": 15570 }, { "epoch": 46.79, "grad_norm": 13.986361503601074, "learning_rate": 8.440440440440442e-06, "loss": 0.6663, "step": 15580 }, { "epoch": 46.82, "grad_norm": 16.990419387817383, "learning_rate": 8.43943943943944e-06, "loss": 0.6245, "step": 15590 }, { "epoch": 46.85, "grad_norm": 15.792227745056152, "learning_rate": 8.438438438438439e-06, "loss": 0.7197, "step": 15600 }, { "epoch": 46.88, "grad_norm": 13.95806884765625, "learning_rate": 8.437437437437438e-06, "loss": 0.6644, "step": 15610 }, { "epoch": 46.91, "grad_norm": 17.619291305541992, "learning_rate": 8.436436436436438e-06, "loss": 0.6191, "step": 15620 }, { "epoch": 46.94, "grad_norm": 18.768346786499023, "learning_rate": 8.435435435435436e-06, "loss": 0.6689, "step": 15630 }, { "epoch": 46.97, "grad_norm": 15.373311042785645, "learning_rate": 8.434434434434435e-06, "loss": 0.6363, "step": 15640 }, { "epoch": 47.0, "grad_norm": 14.394283294677734, "learning_rate": 8.433433433433435e-06, "loss": 0.6972, "step": 15650 }, { "epoch": 47.0, "eval_accuracy": 0.8646, "eval_loss": 0.44944092631340027, "eval_runtime": 12.6296, "eval_samples_per_second": 791.789, "eval_steps_per_second": 3.167, "step": 15651 }, { "epoch": 47.03, "grad_norm": 20.598209381103516, "learning_rate": 8.432432432432434e-06, "loss": 0.7864, "step": 15660 }, { "epoch": 47.06, "grad_norm": 13.101500511169434, "learning_rate": 8.43143143143143e-06, "loss": 0.6511, "step": 15670 }, { "epoch": 47.09, "grad_norm": 14.161370277404785, "learning_rate": 8.430430430430431e-06, "loss": 0.6281, "step": 15680 }, { "epoch": 47.12, "grad_norm": 15.656643867492676, "learning_rate": 8.42942942942943e-06, "loss": 0.6166, "step": 15690 }, { "epoch": 47.15, "grad_norm": 14.171300888061523, "learning_rate": 8.428428428428428e-06, "loss": 0.7045, "step": 15700 }, { "epoch": 47.18, "grad_norm": 14.199458122253418, "learning_rate": 8.427427427427429e-06, "loss": 0.6232, "step": 15710 }, { "epoch": 47.21, "grad_norm": 15.500053405761719, "learning_rate": 8.426426426426427e-06, "loss": 0.6978, "step": 15720 }, { "epoch": 47.24, "grad_norm": 14.61178970336914, "learning_rate": 8.425425425425426e-06, "loss": 0.7303, "step": 15730 }, { "epoch": 47.27, "grad_norm": 20.73687171936035, "learning_rate": 8.424424424424426e-06, "loss": 0.6623, "step": 15740 }, { "epoch": 47.3, "grad_norm": 12.876822471618652, "learning_rate": 8.423423423423423e-06, "loss": 0.6474, "step": 15750 }, { "epoch": 47.33, "grad_norm": 11.815642356872559, "learning_rate": 8.422422422422423e-06, "loss": 0.6512, "step": 15760 }, { "epoch": 47.36, "grad_norm": 13.063693046569824, "learning_rate": 8.421421421421422e-06, "loss": 0.7295, "step": 15770 }, { "epoch": 47.39, "grad_norm": 10.594338417053223, "learning_rate": 8.42042042042042e-06, "loss": 0.6597, "step": 15780 }, { "epoch": 47.42, "grad_norm": 19.250690460205078, "learning_rate": 8.41941941941942e-06, "loss": 0.6416, "step": 15790 }, { "epoch": 47.45, "grad_norm": 13.238747596740723, "learning_rate": 8.41841841841842e-06, "loss": 0.6216, "step": 15800 }, { "epoch": 47.48, "grad_norm": 16.948776245117188, "learning_rate": 8.417417417417418e-06, "loss": 0.6794, "step": 15810 }, { "epoch": 47.51, "grad_norm": 17.4051570892334, "learning_rate": 8.416416416416418e-06, "loss": 0.6369, "step": 15820 }, { "epoch": 47.54, "grad_norm": 9.287778854370117, "learning_rate": 8.415415415415417e-06, "loss": 0.6962, "step": 15830 }, { "epoch": 47.57, "grad_norm": 17.206253051757812, "learning_rate": 8.414414414414415e-06, "loss": 0.6219, "step": 15840 }, { "epoch": 47.6, "grad_norm": 18.415864944458008, "learning_rate": 8.413413413413414e-06, "loss": 0.6386, "step": 15850 }, { "epoch": 47.63, "grad_norm": 18.562761306762695, "learning_rate": 8.412412412412413e-06, "loss": 0.6297, "step": 15860 }, { "epoch": 47.66, "grad_norm": 11.125750541687012, "learning_rate": 8.411411411411413e-06, "loss": 0.6703, "step": 15870 }, { "epoch": 47.69, "grad_norm": 12.335123062133789, "learning_rate": 8.410410410410411e-06, "loss": 0.6603, "step": 15880 }, { "epoch": 47.72, "grad_norm": 13.002345085144043, "learning_rate": 8.40940940940941e-06, "loss": 0.6128, "step": 15890 }, { "epoch": 47.75, "grad_norm": 24.356122970581055, "learning_rate": 8.408408408408409e-06, "loss": 0.5832, "step": 15900 }, { "epoch": 47.78, "grad_norm": 11.936380386352539, "learning_rate": 8.407407407407409e-06, "loss": 0.6503, "step": 15910 }, { "epoch": 47.81, "grad_norm": 16.61817741394043, "learning_rate": 8.406406406406406e-06, "loss": 0.6221, "step": 15920 }, { "epoch": 47.84, "grad_norm": 17.528955459594727, "learning_rate": 8.405405405405406e-06, "loss": 0.6718, "step": 15930 }, { "epoch": 47.87, "grad_norm": 14.016353607177734, "learning_rate": 8.404404404404405e-06, "loss": 0.627, "step": 15940 }, { "epoch": 47.9, "grad_norm": 9.69521427154541, "learning_rate": 8.403403403403403e-06, "loss": 0.6385, "step": 15950 }, { "epoch": 47.93, "grad_norm": 17.623855590820312, "learning_rate": 8.402402402402404e-06, "loss": 0.6143, "step": 15960 }, { "epoch": 47.96, "grad_norm": 12.067631721496582, "learning_rate": 8.401401401401402e-06, "loss": 0.6262, "step": 15970 }, { "epoch": 47.99, "grad_norm": 13.75415325164795, "learning_rate": 8.4004004004004e-06, "loss": 0.616, "step": 15980 }, { "epoch": 48.0, "eval_accuracy": 0.8674, "eval_loss": 0.45251065492630005, "eval_runtime": 12.7377, "eval_samples_per_second": 785.071, "eval_steps_per_second": 3.14, "step": 15984 }, { "epoch": 48.02, "grad_norm": 14.465485572814941, "learning_rate": 8.399399399399401e-06, "loss": 0.5542, "step": 15990 }, { "epoch": 48.05, "grad_norm": 14.680464744567871, "learning_rate": 8.398398398398398e-06, "loss": 0.7044, "step": 16000 }, { "epoch": 48.08, "grad_norm": 11.977607727050781, "learning_rate": 8.397397397397398e-06, "loss": 0.6151, "step": 16010 }, { "epoch": 48.11, "grad_norm": 17.933095932006836, "learning_rate": 8.396396396396397e-06, "loss": 0.6358, "step": 16020 }, { "epoch": 48.14, "grad_norm": 15.633028984069824, "learning_rate": 8.395395395395395e-06, "loss": 0.657, "step": 16030 }, { "epoch": 48.17, "grad_norm": 14.462199211120605, "learning_rate": 8.394394394394396e-06, "loss": 0.6752, "step": 16040 }, { "epoch": 48.2, "grad_norm": 12.743391990661621, "learning_rate": 8.393393393393394e-06, "loss": 0.6515, "step": 16050 }, { "epoch": 48.23, "grad_norm": 12.953666687011719, "learning_rate": 8.392392392392393e-06, "loss": 0.6412, "step": 16060 }, { "epoch": 48.26, "grad_norm": 14.461003303527832, "learning_rate": 8.391391391391393e-06, "loss": 0.6275, "step": 16070 }, { "epoch": 48.29, "grad_norm": 18.0173397064209, "learning_rate": 8.390390390390392e-06, "loss": 0.7052, "step": 16080 }, { "epoch": 48.32, "grad_norm": 13.393255233764648, "learning_rate": 8.38938938938939e-06, "loss": 0.6697, "step": 16090 }, { "epoch": 48.35, "grad_norm": 14.54150676727295, "learning_rate": 8.388388388388389e-06, "loss": 0.6551, "step": 16100 }, { "epoch": 48.38, "grad_norm": 16.1068115234375, "learning_rate": 8.387387387387388e-06, "loss": 0.6035, "step": 16110 }, { "epoch": 48.41, "grad_norm": 14.576680183410645, "learning_rate": 8.386386386386386e-06, "loss": 0.6353, "step": 16120 }, { "epoch": 48.44, "grad_norm": 20.142234802246094, "learning_rate": 8.385385385385386e-06, "loss": 0.6536, "step": 16130 }, { "epoch": 48.47, "grad_norm": 14.841194152832031, "learning_rate": 8.384384384384385e-06, "loss": 0.6272, "step": 16140 }, { "epoch": 48.5, "grad_norm": 15.862102508544922, "learning_rate": 8.383383383383384e-06, "loss": 0.5461, "step": 16150 }, { "epoch": 48.53, "grad_norm": 22.324954986572266, "learning_rate": 8.382382382382384e-06, "loss": 0.6802, "step": 16160 }, { "epoch": 48.56, "grad_norm": 29.10227394104004, "learning_rate": 8.381381381381381e-06, "loss": 0.6632, "step": 16170 }, { "epoch": 48.59, "grad_norm": 15.7824125289917, "learning_rate": 8.380380380380381e-06, "loss": 0.6967, "step": 16180 }, { "epoch": 48.62, "grad_norm": 17.80396842956543, "learning_rate": 8.37937937937938e-06, "loss": 0.6012, "step": 16190 }, { "epoch": 48.65, "grad_norm": 17.60359764099121, "learning_rate": 8.378378378378378e-06, "loss": 0.6098, "step": 16200 }, { "epoch": 48.68, "grad_norm": 13.17934799194336, "learning_rate": 8.377377377377379e-06, "loss": 0.6558, "step": 16210 }, { "epoch": 48.71, "grad_norm": 18.207441329956055, "learning_rate": 8.376376376376377e-06, "loss": 0.6079, "step": 16220 }, { "epoch": 48.74, "grad_norm": 13.500203132629395, "learning_rate": 8.375375375375376e-06, "loss": 0.6025, "step": 16230 }, { "epoch": 48.77, "grad_norm": 13.301795959472656, "learning_rate": 8.374374374374376e-06, "loss": 0.6264, "step": 16240 }, { "epoch": 48.8, "grad_norm": 14.00633430480957, "learning_rate": 8.373373373373373e-06, "loss": 0.7114, "step": 16250 }, { "epoch": 48.83, "grad_norm": 15.595272064208984, "learning_rate": 8.372372372372373e-06, "loss": 0.5517, "step": 16260 }, { "epoch": 48.86, "grad_norm": 16.70868492126465, "learning_rate": 8.371371371371372e-06, "loss": 0.6613, "step": 16270 }, { "epoch": 48.89, "grad_norm": 14.774352073669434, "learning_rate": 8.37037037037037e-06, "loss": 0.5916, "step": 16280 }, { "epoch": 48.92, "grad_norm": 13.337324142456055, "learning_rate": 8.36936936936937e-06, "loss": 0.6354, "step": 16290 }, { "epoch": 48.95, "grad_norm": 16.758169174194336, "learning_rate": 8.36836836836837e-06, "loss": 0.6615, "step": 16300 }, { "epoch": 48.98, "grad_norm": 17.271648406982422, "learning_rate": 8.367367367367368e-06, "loss": 0.6911, "step": 16310 }, { "epoch": 49.0, "eval_accuracy": 0.8637, "eval_loss": 0.4506002366542816, "eval_runtime": 12.6548, "eval_samples_per_second": 790.216, "eval_steps_per_second": 3.161, "step": 16317 }, { "epoch": 49.01, "grad_norm": 16.090843200683594, "learning_rate": 8.366366366366368e-06, "loss": 0.7275, "step": 16320 }, { "epoch": 49.04, "grad_norm": 15.7288179397583, "learning_rate": 8.365365365365367e-06, "loss": 0.6623, "step": 16330 }, { "epoch": 49.07, "grad_norm": 19.782133102416992, "learning_rate": 8.364364364364365e-06, "loss": 0.5901, "step": 16340 }, { "epoch": 49.1, "grad_norm": 19.402259826660156, "learning_rate": 8.363363363363364e-06, "loss": 0.5984, "step": 16350 }, { "epoch": 49.13, "grad_norm": 12.4588041305542, "learning_rate": 8.362362362362363e-06, "loss": 0.6633, "step": 16360 }, { "epoch": 49.16, "grad_norm": 13.56613826751709, "learning_rate": 8.361361361361361e-06, "loss": 0.6799, "step": 16370 }, { "epoch": 49.19, "grad_norm": 17.75737762451172, "learning_rate": 8.360360360360362e-06, "loss": 0.5668, "step": 16380 }, { "epoch": 49.22, "grad_norm": 20.319490432739258, "learning_rate": 8.35935935935936e-06, "loss": 0.6673, "step": 16390 }, { "epoch": 49.25, "grad_norm": 28.126279830932617, "learning_rate": 8.358358358358359e-06, "loss": 0.6171, "step": 16400 }, { "epoch": 49.28, "grad_norm": 17.8362979888916, "learning_rate": 8.357357357357359e-06, "loss": 0.619, "step": 16410 }, { "epoch": 49.31, "grad_norm": 15.377138137817383, "learning_rate": 8.356356356356356e-06, "loss": 0.6238, "step": 16420 }, { "epoch": 49.34, "grad_norm": 13.098530769348145, "learning_rate": 8.355355355355356e-06, "loss": 0.6247, "step": 16430 }, { "epoch": 49.37, "grad_norm": 25.9362850189209, "learning_rate": 8.354354354354355e-06, "loss": 0.6544, "step": 16440 }, { "epoch": 49.4, "grad_norm": 14.757615089416504, "learning_rate": 8.353353353353353e-06, "loss": 0.6629, "step": 16450 }, { "epoch": 49.43, "grad_norm": 16.951858520507812, "learning_rate": 8.352352352352354e-06, "loss": 0.6348, "step": 16460 }, { "epoch": 49.46, "grad_norm": 20.39686393737793, "learning_rate": 8.351351351351352e-06, "loss": 0.7222, "step": 16470 }, { "epoch": 49.49, "grad_norm": 11.490650177001953, "learning_rate": 8.35035035035035e-06, "loss": 0.5808, "step": 16480 }, { "epoch": 49.52, "grad_norm": 16.503400802612305, "learning_rate": 8.349349349349351e-06, "loss": 0.6684, "step": 16490 }, { "epoch": 49.55, "grad_norm": 10.76430606842041, "learning_rate": 8.348348348348348e-06, "loss": 0.6216, "step": 16500 }, { "epoch": 49.58, "grad_norm": 18.0540771484375, "learning_rate": 8.347347347347348e-06, "loss": 0.6222, "step": 16510 }, { "epoch": 49.61, "grad_norm": 26.5042781829834, "learning_rate": 8.346346346346347e-06, "loss": 0.6964, "step": 16520 }, { "epoch": 49.64, "grad_norm": 21.34775161743164, "learning_rate": 8.345345345345346e-06, "loss": 0.6164, "step": 16530 }, { "epoch": 49.67, "grad_norm": 14.51412582397461, "learning_rate": 8.344344344344346e-06, "loss": 0.6464, "step": 16540 }, { "epoch": 49.7, "grad_norm": 14.07699966430664, "learning_rate": 8.343343343343344e-06, "loss": 0.667, "step": 16550 }, { "epoch": 49.73, "grad_norm": 13.938491821289062, "learning_rate": 8.342342342342343e-06, "loss": 0.6875, "step": 16560 }, { "epoch": 49.76, "grad_norm": 12.678507804870605, "learning_rate": 8.341341341341343e-06, "loss": 0.5823, "step": 16570 }, { "epoch": 49.79, "grad_norm": 13.337613105773926, "learning_rate": 8.340340340340342e-06, "loss": 0.6212, "step": 16580 }, { "epoch": 49.82, "grad_norm": 17.60381317138672, "learning_rate": 8.339339339339339e-06, "loss": 0.6683, "step": 16590 }, { "epoch": 49.85, "grad_norm": 15.319842338562012, "learning_rate": 8.338338338338339e-06, "loss": 0.6332, "step": 16600 }, { "epoch": 49.88, "grad_norm": 21.950565338134766, "learning_rate": 8.337337337337338e-06, "loss": 0.5829, "step": 16610 }, { "epoch": 49.91, "grad_norm": 16.845312118530273, "learning_rate": 8.336336336336336e-06, "loss": 0.6298, "step": 16620 }, { "epoch": 49.94, "grad_norm": 16.28782081604004, "learning_rate": 8.335335335335337e-06, "loss": 0.6576, "step": 16630 }, { "epoch": 49.97, "grad_norm": 13.272744178771973, "learning_rate": 8.334334334334335e-06, "loss": 0.583, "step": 16640 }, { "epoch": 50.0, "grad_norm": 80.26773834228516, "learning_rate": 8.333333333333334e-06, "loss": 0.6737, "step": 16650 }, { "epoch": 50.0, "eval_accuracy": 0.8648, "eval_loss": 0.4503735899925232, "eval_runtime": 12.6697, "eval_samples_per_second": 789.285, "eval_steps_per_second": 3.157, "step": 16650 }, { "epoch": 50.03, "grad_norm": 16.50661849975586, "learning_rate": 8.332332332332334e-06, "loss": 0.6137, "step": 16660 }, { "epoch": 50.06, "grad_norm": 15.900172233581543, "learning_rate": 8.331331331331331e-06, "loss": 0.6835, "step": 16670 }, { "epoch": 50.09, "grad_norm": 18.15906524658203, "learning_rate": 8.330330330330331e-06, "loss": 0.6425, "step": 16680 }, { "epoch": 50.12, "grad_norm": 11.704545021057129, "learning_rate": 8.32932932932933e-06, "loss": 0.5861, "step": 16690 }, { "epoch": 50.15, "grad_norm": 11.482316970825195, "learning_rate": 8.328328328328328e-06, "loss": 0.6083, "step": 16700 }, { "epoch": 50.18, "grad_norm": 21.32619857788086, "learning_rate": 8.327327327327329e-06, "loss": 0.6729, "step": 16710 }, { "epoch": 50.21, "grad_norm": 15.120251655578613, "learning_rate": 8.326326326326327e-06, "loss": 0.6147, "step": 16720 }, { "epoch": 50.24, "grad_norm": 12.25693130493164, "learning_rate": 8.325325325325326e-06, "loss": 0.6118, "step": 16730 }, { "epoch": 50.27, "grad_norm": 14.365169525146484, "learning_rate": 8.324324324324326e-06, "loss": 0.575, "step": 16740 }, { "epoch": 50.3, "grad_norm": 16.102943420410156, "learning_rate": 8.323323323323323e-06, "loss": 0.5915, "step": 16750 }, { "epoch": 50.33, "grad_norm": 16.202205657958984, "learning_rate": 8.322322322322323e-06, "loss": 0.6382, "step": 16760 }, { "epoch": 50.36, "grad_norm": 15.920069694519043, "learning_rate": 8.321321321321322e-06, "loss": 0.6673, "step": 16770 }, { "epoch": 50.39, "grad_norm": 14.586955070495605, "learning_rate": 8.32032032032032e-06, "loss": 0.631, "step": 16780 }, { "epoch": 50.42, "grad_norm": 25.971364974975586, "learning_rate": 8.31931931931932e-06, "loss": 0.5962, "step": 16790 }, { "epoch": 50.45, "grad_norm": 21.708187103271484, "learning_rate": 8.31831831831832e-06, "loss": 0.5806, "step": 16800 }, { "epoch": 50.48, "grad_norm": 17.089094161987305, "learning_rate": 8.317317317317318e-06, "loss": 0.6518, "step": 16810 }, { "epoch": 50.51, "grad_norm": 12.237706184387207, "learning_rate": 8.316316316316317e-06, "loss": 0.6206, "step": 16820 }, { "epoch": 50.54, "grad_norm": 12.970582008361816, "learning_rate": 8.315315315315317e-06, "loss": 0.5751, "step": 16830 }, { "epoch": 50.57, "grad_norm": 16.444976806640625, "learning_rate": 8.314314314314314e-06, "loss": 0.5381, "step": 16840 }, { "epoch": 50.6, "grad_norm": 17.726957321166992, "learning_rate": 8.313313313313314e-06, "loss": 0.6321, "step": 16850 }, { "epoch": 50.63, "grad_norm": 14.764182090759277, "learning_rate": 8.312312312312313e-06, "loss": 0.6045, "step": 16860 }, { "epoch": 50.66, "grad_norm": 11.397544860839844, "learning_rate": 8.311311311311311e-06, "loss": 0.6195, "step": 16870 }, { "epoch": 50.69, "grad_norm": 18.760087966918945, "learning_rate": 8.310310310310312e-06, "loss": 0.6775, "step": 16880 }, { "epoch": 50.72, "grad_norm": 16.329402923583984, "learning_rate": 8.30930930930931e-06, "loss": 0.5952, "step": 16890 }, { "epoch": 50.75, "grad_norm": 17.519657135009766, "learning_rate": 8.308308308308309e-06, "loss": 0.6568, "step": 16900 }, { "epoch": 50.78, "grad_norm": 13.098520278930664, "learning_rate": 8.307307307307309e-06, "loss": 0.6262, "step": 16910 }, { "epoch": 50.81, "grad_norm": 18.580644607543945, "learning_rate": 8.306306306306306e-06, "loss": 0.6059, "step": 16920 }, { "epoch": 50.84, "grad_norm": 13.884407043457031, "learning_rate": 8.305305305305306e-06, "loss": 0.646, "step": 16930 }, { "epoch": 50.87, "grad_norm": 16.32083511352539, "learning_rate": 8.304304304304305e-06, "loss": 0.6617, "step": 16940 }, { "epoch": 50.9, "grad_norm": 17.042152404785156, "learning_rate": 8.303303303303303e-06, "loss": 0.6276, "step": 16950 }, { "epoch": 50.93, "grad_norm": 19.203786849975586, "learning_rate": 8.302302302302304e-06, "loss": 0.6461, "step": 16960 }, { "epoch": 50.96, "grad_norm": 17.717748641967773, "learning_rate": 8.301301301301302e-06, "loss": 0.6799, "step": 16970 }, { "epoch": 50.99, "grad_norm": 15.912696838378906, "learning_rate": 8.300300300300301e-06, "loss": 0.5573, "step": 16980 }, { "epoch": 51.0, "eval_accuracy": 0.8641, "eval_loss": 0.4541584551334381, "eval_runtime": 12.9427, "eval_samples_per_second": 772.638, "eval_steps_per_second": 3.091, "step": 16983 }, { "epoch": 51.02, "grad_norm": 16.83027458190918, "learning_rate": 8.299299299299301e-06, "loss": 0.5235, "step": 16990 }, { "epoch": 51.05, "grad_norm": 24.17321014404297, "learning_rate": 8.298298298298298e-06, "loss": 0.6485, "step": 17000 }, { "epoch": 51.08, "grad_norm": 15.718440055847168, "learning_rate": 8.297297297297298e-06, "loss": 0.5895, "step": 17010 }, { "epoch": 51.11, "grad_norm": 26.84842300415039, "learning_rate": 8.296296296296297e-06, "loss": 0.6912, "step": 17020 }, { "epoch": 51.14, "grad_norm": 12.315144538879395, "learning_rate": 8.295295295295296e-06, "loss": 0.6454, "step": 17030 }, { "epoch": 51.17, "grad_norm": 15.44035816192627, "learning_rate": 8.294294294294294e-06, "loss": 0.6382, "step": 17040 }, { "epoch": 51.2, "grad_norm": 19.862279891967773, "learning_rate": 8.293293293293294e-06, "loss": 0.6557, "step": 17050 }, { "epoch": 51.23, "grad_norm": 17.58277130126953, "learning_rate": 8.292292292292293e-06, "loss": 0.6643, "step": 17060 }, { "epoch": 51.26, "grad_norm": 15.316263198852539, "learning_rate": 8.291291291291292e-06, "loss": 0.6296, "step": 17070 }, { "epoch": 51.29, "grad_norm": 13.260090827941895, "learning_rate": 8.29029029029029e-06, "loss": 0.6293, "step": 17080 }, { "epoch": 51.32, "grad_norm": 18.472536087036133, "learning_rate": 8.289289289289289e-06, "loss": 0.6279, "step": 17090 }, { "epoch": 51.35, "grad_norm": 18.13410758972168, "learning_rate": 8.288288288288289e-06, "loss": 0.5917, "step": 17100 }, { "epoch": 51.38, "grad_norm": 12.870095252990723, "learning_rate": 8.287287287287288e-06, "loss": 0.5901, "step": 17110 }, { "epoch": 51.41, "grad_norm": 13.40429973602295, "learning_rate": 8.286286286286286e-06, "loss": 0.5984, "step": 17120 }, { "epoch": 51.44, "grad_norm": 19.12213134765625, "learning_rate": 8.285285285285287e-06, "loss": 0.6627, "step": 17130 }, { "epoch": 51.47, "grad_norm": 12.901103973388672, "learning_rate": 8.284284284284285e-06, "loss": 0.6143, "step": 17140 }, { "epoch": 51.5, "grad_norm": 18.621458053588867, "learning_rate": 8.283283283283284e-06, "loss": 0.5618, "step": 17150 }, { "epoch": 51.53, "grad_norm": 17.519861221313477, "learning_rate": 8.282282282282284e-06, "loss": 0.6703, "step": 17160 }, { "epoch": 51.56, "grad_norm": 23.81211280822754, "learning_rate": 8.281281281281281e-06, "loss": 0.6219, "step": 17170 }, { "epoch": 51.59, "grad_norm": 16.727157592773438, "learning_rate": 8.280280280280281e-06, "loss": 0.5719, "step": 17180 }, { "epoch": 51.62, "grad_norm": 15.704798698425293, "learning_rate": 8.27927927927928e-06, "loss": 0.6613, "step": 17190 }, { "epoch": 51.65, "grad_norm": 20.87688636779785, "learning_rate": 8.278278278278278e-06, "loss": 0.5979, "step": 17200 }, { "epoch": 51.68, "grad_norm": 14.883535385131836, "learning_rate": 8.277277277277279e-06, "loss": 0.6128, "step": 17210 }, { "epoch": 51.71, "grad_norm": 18.046480178833008, "learning_rate": 8.276276276276277e-06, "loss": 0.6295, "step": 17220 }, { "epoch": 51.74, "grad_norm": 14.467040061950684, "learning_rate": 8.275275275275276e-06, "loss": 0.6666, "step": 17230 }, { "epoch": 51.77, "grad_norm": 13.173455238342285, "learning_rate": 8.274274274274276e-06, "loss": 0.5738, "step": 17240 }, { "epoch": 51.8, "grad_norm": 15.290773391723633, "learning_rate": 8.273273273273273e-06, "loss": 0.6693, "step": 17250 }, { "epoch": 51.83, "grad_norm": 15.725456237792969, "learning_rate": 8.272272272272273e-06, "loss": 0.6455, "step": 17260 }, { "epoch": 51.86, "grad_norm": 19.449800491333008, "learning_rate": 8.271271271271272e-06, "loss": 0.6245, "step": 17270 }, { "epoch": 51.89, "grad_norm": 15.260242462158203, "learning_rate": 8.27027027027027e-06, "loss": 0.5877, "step": 17280 }, { "epoch": 51.92, "grad_norm": 16.413896560668945, "learning_rate": 8.26926926926927e-06, "loss": 0.6061, "step": 17290 }, { "epoch": 51.95, "grad_norm": 12.333784103393555, "learning_rate": 8.26826826826827e-06, "loss": 0.6318, "step": 17300 }, { "epoch": 51.98, "grad_norm": 17.20624351501465, "learning_rate": 8.267267267267268e-06, "loss": 0.6296, "step": 17310 }, { "epoch": 52.0, "eval_accuracy": 0.8626, "eval_loss": 0.45732319355010986, "eval_runtime": 13.213, "eval_samples_per_second": 756.831, "eval_steps_per_second": 3.027, "step": 17316 }, { "epoch": 52.01, "grad_norm": 15.85933780670166, "learning_rate": 8.266266266266267e-06, "loss": 0.6516, "step": 17320 }, { "epoch": 52.04, "grad_norm": 21.219789505004883, "learning_rate": 8.265265265265265e-06, "loss": 0.6241, "step": 17330 }, { "epoch": 52.07, "grad_norm": 16.367534637451172, "learning_rate": 8.264264264264264e-06, "loss": 0.6824, "step": 17340 }, { "epoch": 52.1, "grad_norm": 18.358396530151367, "learning_rate": 8.263263263263264e-06, "loss": 0.6016, "step": 17350 }, { "epoch": 52.13, "grad_norm": 13.449258804321289, "learning_rate": 8.262262262262263e-06, "loss": 0.6292, "step": 17360 }, { "epoch": 52.16, "grad_norm": 19.812196731567383, "learning_rate": 8.261261261261261e-06, "loss": 0.6062, "step": 17370 }, { "epoch": 52.19, "grad_norm": 16.552570343017578, "learning_rate": 8.260260260260262e-06, "loss": 0.6181, "step": 17380 }, { "epoch": 52.22, "grad_norm": 13.700467109680176, "learning_rate": 8.25925925925926e-06, "loss": 0.6256, "step": 17390 }, { "epoch": 52.25, "grad_norm": 16.087739944458008, "learning_rate": 8.258258258258259e-06, "loss": 0.624, "step": 17400 }, { "epoch": 52.28, "grad_norm": 13.194904327392578, "learning_rate": 8.257257257257259e-06, "loss": 0.6256, "step": 17410 }, { "epoch": 52.31, "grad_norm": 20.677940368652344, "learning_rate": 8.256256256256256e-06, "loss": 0.6342, "step": 17420 }, { "epoch": 52.34, "grad_norm": 17.496768951416016, "learning_rate": 8.255255255255256e-06, "loss": 0.6425, "step": 17430 }, { "epoch": 52.37, "grad_norm": 16.56483268737793, "learning_rate": 8.254254254254255e-06, "loss": 0.5831, "step": 17440 }, { "epoch": 52.4, "grad_norm": 20.45583152770996, "learning_rate": 8.253253253253254e-06, "loss": 0.6419, "step": 17450 }, { "epoch": 52.43, "grad_norm": 15.296815872192383, "learning_rate": 8.252252252252254e-06, "loss": 0.5942, "step": 17460 }, { "epoch": 52.46, "grad_norm": 13.59698486328125, "learning_rate": 8.251251251251252e-06, "loss": 0.5907, "step": 17470 }, { "epoch": 52.49, "grad_norm": 15.531094551086426, "learning_rate": 8.250250250250251e-06, "loss": 0.6174, "step": 17480 }, { "epoch": 52.52, "grad_norm": 11.718438148498535, "learning_rate": 8.24924924924925e-06, "loss": 0.6663, "step": 17490 }, { "epoch": 52.55, "grad_norm": 13.98644733428955, "learning_rate": 8.248248248248248e-06, "loss": 0.598, "step": 17500 }, { "epoch": 52.58, "grad_norm": 12.666622161865234, "learning_rate": 8.247247247247247e-06, "loss": 0.6163, "step": 17510 }, { "epoch": 52.61, "grad_norm": 18.385515213012695, "learning_rate": 8.246246246246247e-06, "loss": 0.6184, "step": 17520 }, { "epoch": 52.64, "grad_norm": 21.152790069580078, "learning_rate": 8.245245245245246e-06, "loss": 0.6664, "step": 17530 }, { "epoch": 52.67, "grad_norm": 14.228870391845703, "learning_rate": 8.244244244244244e-06, "loss": 0.6337, "step": 17540 }, { "epoch": 52.7, "grad_norm": 19.32399559020996, "learning_rate": 8.243243243243245e-06, "loss": 0.6542, "step": 17550 }, { "epoch": 52.73, "grad_norm": 14.791872024536133, "learning_rate": 8.242242242242243e-06, "loss": 0.6347, "step": 17560 }, { "epoch": 52.76, "grad_norm": 12.860326766967773, "learning_rate": 8.241241241241242e-06, "loss": 0.5534, "step": 17570 }, { "epoch": 52.79, "grad_norm": 13.235801696777344, "learning_rate": 8.24024024024024e-06, "loss": 0.6217, "step": 17580 }, { "epoch": 52.82, "grad_norm": 16.83515739440918, "learning_rate": 8.239239239239239e-06, "loss": 0.6346, "step": 17590 }, { "epoch": 52.85, "grad_norm": 13.064797401428223, "learning_rate": 8.23823823823824e-06, "loss": 0.6224, "step": 17600 }, { "epoch": 52.88, "grad_norm": 17.000850677490234, "learning_rate": 8.237237237237238e-06, "loss": 0.6116, "step": 17610 }, { "epoch": 52.91, "grad_norm": 13.369550704956055, "learning_rate": 8.236236236236236e-06, "loss": 0.6734, "step": 17620 }, { "epoch": 52.94, "grad_norm": 13.209344863891602, "learning_rate": 8.235235235235237e-06, "loss": 0.6274, "step": 17630 }, { "epoch": 52.97, "grad_norm": 17.668176651000977, "learning_rate": 8.234234234234235e-06, "loss": 0.6245, "step": 17640 }, { "epoch": 53.0, "eval_accuracy": 0.8647, "eval_loss": 0.45495444536209106, "eval_runtime": 13.0186, "eval_samples_per_second": 768.132, "eval_steps_per_second": 3.073, "step": 17649 }, { "epoch": 53.0, "grad_norm": 12.670238494873047, "learning_rate": 8.233233233233234e-06, "loss": 0.6691, "step": 17650 }, { "epoch": 53.03, "grad_norm": 17.616172790527344, "learning_rate": 8.232232232232234e-06, "loss": 0.6252, "step": 17660 }, { "epoch": 53.06, "grad_norm": 14.60269546508789, "learning_rate": 8.231231231231231e-06, "loss": 0.6346, "step": 17670 }, { "epoch": 53.09, "grad_norm": 15.650839805603027, "learning_rate": 8.230230230230231e-06, "loss": 0.6162, "step": 17680 }, { "epoch": 53.12, "grad_norm": 15.538573265075684, "learning_rate": 8.22922922922923e-06, "loss": 0.5013, "step": 17690 }, { "epoch": 53.15, "grad_norm": 13.48410701751709, "learning_rate": 8.228228228228229e-06, "loss": 0.6216, "step": 17700 }, { "epoch": 53.18, "grad_norm": 21.481840133666992, "learning_rate": 8.227227227227229e-06, "loss": 0.5999, "step": 17710 }, { "epoch": 53.21, "grad_norm": 14.913869857788086, "learning_rate": 8.226226226226227e-06, "loss": 0.5996, "step": 17720 }, { "epoch": 53.24, "grad_norm": 15.669997215270996, "learning_rate": 8.225225225225226e-06, "loss": 0.6066, "step": 17730 }, { "epoch": 53.27, "grad_norm": 12.006368637084961, "learning_rate": 8.224224224224225e-06, "loss": 0.5889, "step": 17740 }, { "epoch": 53.3, "grad_norm": 15.566818237304688, "learning_rate": 8.223223223223223e-06, "loss": 0.5884, "step": 17750 }, { "epoch": 53.33, "grad_norm": 21.139230728149414, "learning_rate": 8.222222222222222e-06, "loss": 0.6272, "step": 17760 }, { "epoch": 53.36, "grad_norm": 14.190361976623535, "learning_rate": 8.221221221221222e-06, "loss": 0.6257, "step": 17770 }, { "epoch": 53.39, "grad_norm": 12.169196128845215, "learning_rate": 8.22022022022022e-06, "loss": 0.6332, "step": 17780 }, { "epoch": 53.42, "grad_norm": 15.970104217529297, "learning_rate": 8.21921921921922e-06, "loss": 0.6271, "step": 17790 }, { "epoch": 53.45, "grad_norm": 16.380125045776367, "learning_rate": 8.21821821821822e-06, "loss": 0.6672, "step": 17800 }, { "epoch": 53.48, "grad_norm": 24.46523666381836, "learning_rate": 8.217217217217218e-06, "loss": 0.6545, "step": 17810 }, { "epoch": 53.51, "grad_norm": 21.113473892211914, "learning_rate": 8.216216216216217e-06, "loss": 0.6436, "step": 17820 }, { "epoch": 53.54, "grad_norm": 20.094146728515625, "learning_rate": 8.215215215215215e-06, "loss": 0.6114, "step": 17830 }, { "epoch": 53.57, "grad_norm": 21.91627311706543, "learning_rate": 8.214214214214214e-06, "loss": 0.6301, "step": 17840 }, { "epoch": 53.6, "grad_norm": 16.08312225341797, "learning_rate": 8.213213213213214e-06, "loss": 0.6304, "step": 17850 }, { "epoch": 53.63, "grad_norm": 16.935136795043945, "learning_rate": 8.212212212212213e-06, "loss": 0.6028, "step": 17860 }, { "epoch": 53.66, "grad_norm": 16.1990909576416, "learning_rate": 8.211211211211211e-06, "loss": 0.6504, "step": 17870 }, { "epoch": 53.69, "grad_norm": 22.533565521240234, "learning_rate": 8.210210210210212e-06, "loss": 0.6307, "step": 17880 }, { "epoch": 53.72, "grad_norm": 24.32695770263672, "learning_rate": 8.20920920920921e-06, "loss": 0.6518, "step": 17890 }, { "epoch": 53.75, "grad_norm": 17.2038516998291, "learning_rate": 8.208208208208209e-06, "loss": 0.5854, "step": 17900 }, { "epoch": 53.78, "grad_norm": 12.194356918334961, "learning_rate": 8.20720720720721e-06, "loss": 0.6106, "step": 17910 }, { "epoch": 53.81, "grad_norm": 18.936695098876953, "learning_rate": 8.206206206206206e-06, "loss": 0.5942, "step": 17920 }, { "epoch": 53.84, "grad_norm": 12.751850128173828, "learning_rate": 8.205205205205206e-06, "loss": 0.5833, "step": 17930 }, { "epoch": 53.87, "grad_norm": 13.7149019241333, "learning_rate": 8.204204204204205e-06, "loss": 0.6136, "step": 17940 }, { "epoch": 53.9, "grad_norm": 12.324130058288574, "learning_rate": 8.203203203203204e-06, "loss": 0.5969, "step": 17950 }, { "epoch": 53.93, "grad_norm": 14.895933151245117, "learning_rate": 8.202202202202202e-06, "loss": 0.6523, "step": 17960 }, { "epoch": 53.96, "grad_norm": 14.21236801147461, "learning_rate": 8.201201201201202e-06, "loss": 0.6516, "step": 17970 }, { "epoch": 53.99, "grad_norm": 11.186808586120605, "learning_rate": 8.200200200200201e-06, "loss": 0.6018, "step": 17980 }, { "epoch": 54.0, "eval_accuracy": 0.8668, "eval_loss": 0.45088598132133484, "eval_runtime": 13.0896, "eval_samples_per_second": 763.966, "eval_steps_per_second": 3.056, "step": 17982 }, { "epoch": 54.02, "grad_norm": 11.544270515441895, "learning_rate": 8.1991991991992e-06, "loss": 0.5316, "step": 17990 }, { "epoch": 54.05, "grad_norm": 18.69845199584961, "learning_rate": 8.198198198198198e-06, "loss": 0.6515, "step": 18000 }, { "epoch": 54.08, "grad_norm": 14.782732009887695, "learning_rate": 8.197197197197197e-06, "loss": 0.658, "step": 18010 }, { "epoch": 54.11, "grad_norm": 12.888195037841797, "learning_rate": 8.196196196196197e-06, "loss": 0.6079, "step": 18020 }, { "epoch": 54.14, "grad_norm": 16.469144821166992, "learning_rate": 8.195195195195196e-06, "loss": 0.5935, "step": 18030 }, { "epoch": 54.17, "grad_norm": 39.164031982421875, "learning_rate": 8.194194194194194e-06, "loss": 0.5693, "step": 18040 }, { "epoch": 54.2, "grad_norm": 13.540485382080078, "learning_rate": 8.193193193193195e-06, "loss": 0.597, "step": 18050 }, { "epoch": 54.23, "grad_norm": 18.07455062866211, "learning_rate": 8.192192192192193e-06, "loss": 0.6451, "step": 18060 }, { "epoch": 54.26, "grad_norm": 14.38621997833252, "learning_rate": 8.191191191191192e-06, "loss": 0.5821, "step": 18070 }, { "epoch": 54.29, "grad_norm": 18.677093505859375, "learning_rate": 8.19019019019019e-06, "loss": 0.622, "step": 18080 }, { "epoch": 54.32, "grad_norm": 22.3408203125, "learning_rate": 8.189189189189189e-06, "loss": 0.6261, "step": 18090 }, { "epoch": 54.35, "grad_norm": 11.709248542785645, "learning_rate": 8.18818818818819e-06, "loss": 0.6504, "step": 18100 }, { "epoch": 54.38, "grad_norm": 15.678053855895996, "learning_rate": 8.187187187187188e-06, "loss": 0.6171, "step": 18110 }, { "epoch": 54.41, "grad_norm": 13.658485412597656, "learning_rate": 8.186186186186186e-06, "loss": 0.562, "step": 18120 }, { "epoch": 54.44, "grad_norm": 14.877076148986816, "learning_rate": 8.185185185185187e-06, "loss": 0.6325, "step": 18130 }, { "epoch": 54.47, "grad_norm": 21.70797348022461, "learning_rate": 8.184184184184185e-06, "loss": 0.6752, "step": 18140 }, { "epoch": 54.5, "grad_norm": 15.471793174743652, "learning_rate": 8.183183183183184e-06, "loss": 0.6115, "step": 18150 }, { "epoch": 54.53, "grad_norm": 17.171215057373047, "learning_rate": 8.182182182182183e-06, "loss": 0.5683, "step": 18160 }, { "epoch": 54.56, "grad_norm": 11.893485069274902, "learning_rate": 8.181181181181181e-06, "loss": 0.5831, "step": 18170 }, { "epoch": 54.59, "grad_norm": 13.77318286895752, "learning_rate": 8.18018018018018e-06, "loss": 0.618, "step": 18180 }, { "epoch": 54.62, "grad_norm": 12.6817045211792, "learning_rate": 8.17917917917918e-06, "loss": 0.5864, "step": 18190 }, { "epoch": 54.65, "grad_norm": 12.999750137329102, "learning_rate": 8.178178178178179e-06, "loss": 0.5751, "step": 18200 }, { "epoch": 54.68, "grad_norm": 11.285355567932129, "learning_rate": 8.177177177177177e-06, "loss": 0.6755, "step": 18210 }, { "epoch": 54.71, "grad_norm": 24.20255470275879, "learning_rate": 8.176176176176177e-06, "loss": 0.621, "step": 18220 }, { "epoch": 54.74, "grad_norm": 17.335365295410156, "learning_rate": 8.175175175175176e-06, "loss": 0.6793, "step": 18230 }, { "epoch": 54.77, "grad_norm": 15.035238265991211, "learning_rate": 8.174174174174175e-06, "loss": 0.5939, "step": 18240 }, { "epoch": 54.8, "grad_norm": 11.79697322845459, "learning_rate": 8.173173173173173e-06, "loss": 0.5754, "step": 18250 }, { "epoch": 54.83, "grad_norm": 17.361902236938477, "learning_rate": 8.172172172172172e-06, "loss": 0.5622, "step": 18260 }, { "epoch": 54.86, "grad_norm": 22.03786277770996, "learning_rate": 8.171171171171172e-06, "loss": 0.5917, "step": 18270 }, { "epoch": 54.89, "grad_norm": 22.793529510498047, "learning_rate": 8.17017017017017e-06, "loss": 0.6358, "step": 18280 }, { "epoch": 54.92, "grad_norm": 17.468017578125, "learning_rate": 8.16916916916917e-06, "loss": 0.607, "step": 18290 }, { "epoch": 54.95, "grad_norm": 21.24018669128418, "learning_rate": 8.16816816816817e-06, "loss": 0.6604, "step": 18300 }, { "epoch": 54.98, "grad_norm": 14.710421562194824, "learning_rate": 8.167167167167168e-06, "loss": 0.6068, "step": 18310 }, { "epoch": 55.0, "eval_accuracy": 0.865, "eval_loss": 0.45606616139411926, "eval_runtime": 12.6186, "eval_samples_per_second": 792.48, "eval_steps_per_second": 3.17, "step": 18315 }, { "epoch": 55.02, "grad_norm": 15.025124549865723, "learning_rate": 8.166166166166167e-06, "loss": 0.5491, "step": 18320 }, { "epoch": 55.05, "grad_norm": 15.541799545288086, "learning_rate": 8.165165165165165e-06, "loss": 0.6588, "step": 18330 }, { "epoch": 55.08, "grad_norm": 15.385310173034668, "learning_rate": 8.164164164164164e-06, "loss": 0.6377, "step": 18340 }, { "epoch": 55.11, "grad_norm": 15.760037422180176, "learning_rate": 8.163163163163164e-06, "loss": 0.6153, "step": 18350 }, { "epoch": 55.14, "grad_norm": 19.843534469604492, "learning_rate": 8.162162162162163e-06, "loss": 0.5795, "step": 18360 }, { "epoch": 55.17, "grad_norm": 12.835470199584961, "learning_rate": 8.161161161161161e-06, "loss": 0.6025, "step": 18370 }, { "epoch": 55.2, "grad_norm": 16.53463363647461, "learning_rate": 8.160160160160162e-06, "loss": 0.4885, "step": 18380 }, { "epoch": 55.23, "grad_norm": 12.131781578063965, "learning_rate": 8.15915915915916e-06, "loss": 0.6007, "step": 18390 }, { "epoch": 55.26, "grad_norm": 27.071044921875, "learning_rate": 8.158158158158159e-06, "loss": 0.5671, "step": 18400 }, { "epoch": 55.29, "grad_norm": 17.2073974609375, "learning_rate": 8.157157157157158e-06, "loss": 0.5901, "step": 18410 }, { "epoch": 55.32, "grad_norm": 15.685284614562988, "learning_rate": 8.156156156156156e-06, "loss": 0.5979, "step": 18420 }, { "epoch": 55.35, "grad_norm": 11.316452980041504, "learning_rate": 8.155155155155155e-06, "loss": 0.5642, "step": 18430 }, { "epoch": 55.38, "grad_norm": 15.418350219726562, "learning_rate": 8.154154154154155e-06, "loss": 0.6188, "step": 18440 }, { "epoch": 55.41, "grad_norm": 15.311288833618164, "learning_rate": 8.153153153153154e-06, "loss": 0.6231, "step": 18450 }, { "epoch": 55.44, "grad_norm": 18.07636260986328, "learning_rate": 8.152152152152152e-06, "loss": 0.6114, "step": 18460 }, { "epoch": 55.47, "grad_norm": 15.564066886901855, "learning_rate": 8.151151151151153e-06, "loss": 0.5485, "step": 18470 }, { "epoch": 55.5, "grad_norm": 12.460175514221191, "learning_rate": 8.150150150150151e-06, "loss": 0.5263, "step": 18480 }, { "epoch": 55.53, "grad_norm": 18.801799774169922, "learning_rate": 8.14914914914915e-06, "loss": 0.6383, "step": 18490 }, { "epoch": 55.56, "grad_norm": 14.195170402526855, "learning_rate": 8.148148148148148e-06, "loss": 0.5956, "step": 18500 }, { "epoch": 55.59, "grad_norm": 16.936365127563477, "learning_rate": 8.147147147147147e-06, "loss": 0.5915, "step": 18510 }, { "epoch": 55.62, "grad_norm": 11.47391414642334, "learning_rate": 8.146146146146147e-06, "loss": 0.5321, "step": 18520 }, { "epoch": 55.65, "grad_norm": 18.33390235900879, "learning_rate": 8.145145145145146e-06, "loss": 0.6296, "step": 18530 }, { "epoch": 55.68, "grad_norm": 27.2676944732666, "learning_rate": 8.144144144144144e-06, "loss": 0.5628, "step": 18540 }, { "epoch": 55.71, "grad_norm": 16.5135555267334, "learning_rate": 8.143143143143145e-06, "loss": 0.5631, "step": 18550 }, { "epoch": 55.74, "grad_norm": 20.772871017456055, "learning_rate": 8.142142142142143e-06, "loss": 0.6028, "step": 18560 }, { "epoch": 55.77, "grad_norm": 18.499374389648438, "learning_rate": 8.141141141141142e-06, "loss": 0.6235, "step": 18570 }, { "epoch": 55.8, "grad_norm": 20.60066032409668, "learning_rate": 8.14014014014014e-06, "loss": 0.5681, "step": 18580 }, { "epoch": 55.83, "grad_norm": 16.534555435180664, "learning_rate": 8.139139139139139e-06, "loss": 0.6002, "step": 18590 }, { "epoch": 55.86, "grad_norm": 20.2315673828125, "learning_rate": 8.13813813813814e-06, "loss": 0.5633, "step": 18600 }, { "epoch": 55.89, "grad_norm": 18.77223014831543, "learning_rate": 8.137137137137138e-06, "loss": 0.6241, "step": 18610 }, { "epoch": 55.92, "grad_norm": 16.95858383178711, "learning_rate": 8.136136136136137e-06, "loss": 0.5785, "step": 18620 }, { "epoch": 55.95, "grad_norm": 22.44672393798828, "learning_rate": 8.135135135135137e-06, "loss": 0.6005, "step": 18630 }, { "epoch": 55.98, "grad_norm": 12.324645042419434, "learning_rate": 8.134134134134135e-06, "loss": 0.6368, "step": 18640 }, { "epoch": 56.0, "eval_accuracy": 0.8666, "eval_loss": 0.4533466696739197, "eval_runtime": 12.8866, "eval_samples_per_second": 776.001, "eval_steps_per_second": 3.104, "step": 18648 }, { "epoch": 56.01, "grad_norm": 18.5239200592041, "learning_rate": 8.133133133133134e-06, "loss": 0.6064, "step": 18650 }, { "epoch": 56.04, "grad_norm": 12.947190284729004, "learning_rate": 8.132132132132133e-06, "loss": 0.5789, "step": 18660 }, { "epoch": 56.07, "grad_norm": 15.244621276855469, "learning_rate": 8.131131131131131e-06, "loss": 0.5906, "step": 18670 }, { "epoch": 56.1, "grad_norm": 19.841922760009766, "learning_rate": 8.13013013013013e-06, "loss": 0.5527, "step": 18680 }, { "epoch": 56.13, "grad_norm": 16.695791244506836, "learning_rate": 8.12912912912913e-06, "loss": 0.5803, "step": 18690 }, { "epoch": 56.16, "grad_norm": 19.614633560180664, "learning_rate": 8.128128128128129e-06, "loss": 0.6265, "step": 18700 }, { "epoch": 56.19, "grad_norm": 11.918386459350586, "learning_rate": 8.127127127127127e-06, "loss": 0.5851, "step": 18710 }, { "epoch": 56.22, "grad_norm": 12.749957084655762, "learning_rate": 8.126126126126128e-06, "loss": 0.6859, "step": 18720 }, { "epoch": 56.25, "grad_norm": 17.98277473449707, "learning_rate": 8.125125125125126e-06, "loss": 0.5873, "step": 18730 }, { "epoch": 56.28, "grad_norm": 25.136402130126953, "learning_rate": 8.124124124124125e-06, "loss": 0.6042, "step": 18740 }, { "epoch": 56.31, "grad_norm": 17.568328857421875, "learning_rate": 8.123123123123123e-06, "loss": 0.5497, "step": 18750 }, { "epoch": 56.34, "grad_norm": 16.162120819091797, "learning_rate": 8.122122122122122e-06, "loss": 0.5819, "step": 18760 }, { "epoch": 56.37, "grad_norm": 15.463545799255371, "learning_rate": 8.121121121121122e-06, "loss": 0.5787, "step": 18770 }, { "epoch": 56.4, "grad_norm": 14.494797706604004, "learning_rate": 8.12012012012012e-06, "loss": 0.5883, "step": 18780 }, { "epoch": 56.43, "grad_norm": 16.26420021057129, "learning_rate": 8.11911911911912e-06, "loss": 0.5825, "step": 18790 }, { "epoch": 56.46, "grad_norm": 16.86106300354004, "learning_rate": 8.11811811811812e-06, "loss": 0.5885, "step": 18800 }, { "epoch": 56.49, "grad_norm": 17.785783767700195, "learning_rate": 8.117117117117118e-06, "loss": 0.5989, "step": 18810 }, { "epoch": 56.52, "grad_norm": 21.74614906311035, "learning_rate": 8.116116116116117e-06, "loss": 0.6554, "step": 18820 }, { "epoch": 56.55, "grad_norm": 15.670501708984375, "learning_rate": 8.115115115115115e-06, "loss": 0.5839, "step": 18830 }, { "epoch": 56.58, "grad_norm": 14.93682861328125, "learning_rate": 8.114114114114114e-06, "loss": 0.5679, "step": 18840 }, { "epoch": 56.61, "grad_norm": 12.969562530517578, "learning_rate": 8.113113113113114e-06, "loss": 0.6219, "step": 18850 }, { "epoch": 56.64, "grad_norm": 13.447518348693848, "learning_rate": 8.112112112112113e-06, "loss": 0.579, "step": 18860 }, { "epoch": 56.67, "grad_norm": 12.603790283203125, "learning_rate": 8.111111111111112e-06, "loss": 0.6207, "step": 18870 }, { "epoch": 56.7, "grad_norm": 13.396700859069824, "learning_rate": 8.11011011011011e-06, "loss": 0.5672, "step": 18880 }, { "epoch": 56.73, "grad_norm": 22.983783721923828, "learning_rate": 8.10910910910911e-06, "loss": 0.6262, "step": 18890 }, { "epoch": 56.76, "grad_norm": 22.194602966308594, "learning_rate": 8.108108108108109e-06, "loss": 0.5478, "step": 18900 }, { "epoch": 56.79, "grad_norm": 14.818936347961426, "learning_rate": 8.107107107107108e-06, "loss": 0.6087, "step": 18910 }, { "epoch": 56.82, "grad_norm": 14.03148365020752, "learning_rate": 8.106106106106106e-06, "loss": 0.5593, "step": 18920 }, { "epoch": 56.85, "grad_norm": 13.037586212158203, "learning_rate": 8.105105105105105e-06, "loss": 0.5773, "step": 18930 }, { "epoch": 56.88, "grad_norm": 19.797903060913086, "learning_rate": 8.104104104104105e-06, "loss": 0.6012, "step": 18940 }, { "epoch": 56.91, "grad_norm": 12.080364227294922, "learning_rate": 8.103103103103104e-06, "loss": 0.6236, "step": 18950 }, { "epoch": 56.94, "grad_norm": 14.649006843566895, "learning_rate": 8.102102102102102e-06, "loss": 0.646, "step": 18960 }, { "epoch": 56.97, "grad_norm": 15.998294830322266, "learning_rate": 8.101101101101103e-06, "loss": 0.6381, "step": 18970 }, { "epoch": 57.0, "grad_norm": 15.81445598602295, "learning_rate": 8.100100100100101e-06, "loss": 0.5945, "step": 18980 }, { "epoch": 57.0, "eval_accuracy": 0.8646, "eval_loss": 0.45373043417930603, "eval_runtime": 13.1449, "eval_samples_per_second": 760.752, "eval_steps_per_second": 3.043, "step": 18981 }, { "epoch": 57.03, "grad_norm": 10.915128707885742, "learning_rate": 8.0990990990991e-06, "loss": 0.6811, "step": 18990 }, { "epoch": 57.06, "grad_norm": 16.743343353271484, "learning_rate": 8.098098098098098e-06, "loss": 0.6148, "step": 19000 }, { "epoch": 57.09, "grad_norm": 11.98512077331543, "learning_rate": 8.097097097097097e-06, "loss": 0.5817, "step": 19010 }, { "epoch": 57.12, "grad_norm": 12.866323471069336, "learning_rate": 8.096096096096097e-06, "loss": 0.5966, "step": 19020 }, { "epoch": 57.15, "grad_norm": 13.117215156555176, "learning_rate": 8.095095095095096e-06, "loss": 0.5596, "step": 19030 }, { "epoch": 57.18, "grad_norm": 22.87441635131836, "learning_rate": 8.094094094094094e-06, "loss": 0.5684, "step": 19040 }, { "epoch": 57.21, "grad_norm": 10.847088813781738, "learning_rate": 8.093093093093095e-06, "loss": 0.5343, "step": 19050 }, { "epoch": 57.24, "grad_norm": 18.142459869384766, "learning_rate": 8.092092092092093e-06, "loss": 0.5796, "step": 19060 }, { "epoch": 57.27, "grad_norm": 19.32158660888672, "learning_rate": 8.091091091091092e-06, "loss": 0.5566, "step": 19070 }, { "epoch": 57.3, "grad_norm": 13.15195369720459, "learning_rate": 8.09009009009009e-06, "loss": 0.5322, "step": 19080 }, { "epoch": 57.33, "grad_norm": 14.730746269226074, "learning_rate": 8.089089089089089e-06, "loss": 0.6219, "step": 19090 }, { "epoch": 57.36, "grad_norm": 16.131155014038086, "learning_rate": 8.088088088088088e-06, "loss": 0.5773, "step": 19100 }, { "epoch": 57.39, "grad_norm": 18.815977096557617, "learning_rate": 8.087087087087088e-06, "loss": 0.5456, "step": 19110 }, { "epoch": 57.42, "grad_norm": 16.685556411743164, "learning_rate": 8.086086086086087e-06, "loss": 0.5711, "step": 19120 }, { "epoch": 57.45, "grad_norm": 17.469432830810547, "learning_rate": 8.085085085085085e-06, "loss": 0.5864, "step": 19130 }, { "epoch": 57.48, "grad_norm": 17.60506248474121, "learning_rate": 8.084084084084085e-06, "loss": 0.6233, "step": 19140 }, { "epoch": 57.51, "grad_norm": 14.348414421081543, "learning_rate": 8.083083083083084e-06, "loss": 0.5357, "step": 19150 }, { "epoch": 57.54, "grad_norm": 19.012470245361328, "learning_rate": 8.082082082082083e-06, "loss": 0.6338, "step": 19160 }, { "epoch": 57.57, "grad_norm": 13.458383560180664, "learning_rate": 8.081081081081081e-06, "loss": 0.5687, "step": 19170 }, { "epoch": 57.6, "grad_norm": 18.919784545898438, "learning_rate": 8.08008008008008e-06, "loss": 0.6034, "step": 19180 }, { "epoch": 57.63, "grad_norm": 16.603261947631836, "learning_rate": 8.07907907907908e-06, "loss": 0.6172, "step": 19190 }, { "epoch": 57.66, "grad_norm": 11.052780151367188, "learning_rate": 8.078078078078079e-06, "loss": 0.6061, "step": 19200 }, { "epoch": 57.69, "grad_norm": 12.503372192382812, "learning_rate": 8.077077077077077e-06, "loss": 0.6252, "step": 19210 }, { "epoch": 57.72, "grad_norm": 14.037562370300293, "learning_rate": 8.076076076076078e-06, "loss": 0.6156, "step": 19220 }, { "epoch": 57.75, "grad_norm": 20.213315963745117, "learning_rate": 8.075075075075076e-06, "loss": 0.6186, "step": 19230 }, { "epoch": 57.78, "grad_norm": 18.24369239807129, "learning_rate": 8.074074074074075e-06, "loss": 0.5665, "step": 19240 }, { "epoch": 57.81, "grad_norm": 15.676369667053223, "learning_rate": 8.073073073073073e-06, "loss": 0.6401, "step": 19250 }, { "epoch": 57.84, "grad_norm": 19.942174911499023, "learning_rate": 8.072072072072072e-06, "loss": 0.5869, "step": 19260 }, { "epoch": 57.87, "grad_norm": 18.580678939819336, "learning_rate": 8.071071071071072e-06, "loss": 0.594, "step": 19270 }, { "epoch": 57.9, "grad_norm": 14.152729988098145, "learning_rate": 8.070070070070071e-06, "loss": 0.6364, "step": 19280 }, { "epoch": 57.93, "grad_norm": 15.426253318786621, "learning_rate": 8.06906906906907e-06, "loss": 0.6143, "step": 19290 }, { "epoch": 57.96, "grad_norm": 12.899490356445312, "learning_rate": 8.06806806806807e-06, "loss": 0.567, "step": 19300 }, { "epoch": 57.99, "grad_norm": 13.860706329345703, "learning_rate": 8.067067067067068e-06, "loss": 0.5379, "step": 19310 }, { "epoch": 58.0, "eval_accuracy": 0.8644, "eval_loss": 0.4582826793193817, "eval_runtime": 13.2356, "eval_samples_per_second": 755.537, "eval_steps_per_second": 3.022, "step": 19314 }, { "epoch": 58.02, "grad_norm": 15.111919403076172, "learning_rate": 8.066066066066067e-06, "loss": 0.544, "step": 19320 }, { "epoch": 58.05, "grad_norm": 15.057886123657227, "learning_rate": 8.065065065065066e-06, "loss": 0.5784, "step": 19330 }, { "epoch": 58.08, "grad_norm": 19.320295333862305, "learning_rate": 8.064064064064064e-06, "loss": 0.6362, "step": 19340 }, { "epoch": 58.11, "grad_norm": 18.68480110168457, "learning_rate": 8.063063063063063e-06, "loss": 0.6141, "step": 19350 }, { "epoch": 58.14, "grad_norm": 16.06761360168457, "learning_rate": 8.062062062062063e-06, "loss": 0.5976, "step": 19360 }, { "epoch": 58.17, "grad_norm": 32.325294494628906, "learning_rate": 8.061061061061062e-06, "loss": 0.5454, "step": 19370 }, { "epoch": 58.2, "grad_norm": 19.5351619720459, "learning_rate": 8.06006006006006e-06, "loss": 0.6339, "step": 19380 }, { "epoch": 58.23, "grad_norm": 22.733434677124023, "learning_rate": 8.05905905905906e-06, "loss": 0.6224, "step": 19390 }, { "epoch": 58.26, "grad_norm": 16.71336555480957, "learning_rate": 8.058058058058059e-06, "loss": 0.6396, "step": 19400 }, { "epoch": 58.29, "grad_norm": 13.782090187072754, "learning_rate": 8.057057057057058e-06, "loss": 0.5882, "step": 19410 }, { "epoch": 58.32, "grad_norm": 15.715386390686035, "learning_rate": 8.056056056056056e-06, "loss": 0.5647, "step": 19420 }, { "epoch": 58.35, "grad_norm": 17.997074127197266, "learning_rate": 8.055055055055055e-06, "loss": 0.6363, "step": 19430 }, { "epoch": 58.38, "grad_norm": 14.587823867797852, "learning_rate": 8.054054054054055e-06, "loss": 0.5845, "step": 19440 }, { "epoch": 58.41, "grad_norm": 15.96159839630127, "learning_rate": 8.053053053053054e-06, "loss": 0.5909, "step": 19450 }, { "epoch": 58.44, "grad_norm": 12.311030387878418, "learning_rate": 8.052052052052052e-06, "loss": 0.5711, "step": 19460 }, { "epoch": 58.47, "grad_norm": 16.868986129760742, "learning_rate": 8.051051051051053e-06, "loss": 0.5818, "step": 19470 }, { "epoch": 58.5, "grad_norm": 18.937164306640625, "learning_rate": 8.050050050050051e-06, "loss": 0.6229, "step": 19480 }, { "epoch": 58.53, "grad_norm": 12.239059448242188, "learning_rate": 8.04904904904905e-06, "loss": 0.5801, "step": 19490 }, { "epoch": 58.56, "grad_norm": 15.623565673828125, "learning_rate": 8.048048048048048e-06, "loss": 0.6241, "step": 19500 }, { "epoch": 58.59, "grad_norm": 21.60228157043457, "learning_rate": 8.047047047047047e-06, "loss": 0.5768, "step": 19510 }, { "epoch": 58.62, "grad_norm": 13.131705284118652, "learning_rate": 8.046046046046047e-06, "loss": 0.5423, "step": 19520 }, { "epoch": 58.65, "grad_norm": 16.483760833740234, "learning_rate": 8.045045045045046e-06, "loss": 0.5831, "step": 19530 }, { "epoch": 58.68, "grad_norm": 14.74811840057373, "learning_rate": 8.044044044044045e-06, "loss": 0.5568, "step": 19540 }, { "epoch": 58.71, "grad_norm": 18.404155731201172, "learning_rate": 8.043043043043043e-06, "loss": 0.5936, "step": 19550 }, { "epoch": 58.74, "grad_norm": 14.674580574035645, "learning_rate": 8.042042042042043e-06, "loss": 0.5879, "step": 19560 }, { "epoch": 58.77, "grad_norm": 23.461597442626953, "learning_rate": 8.041041041041042e-06, "loss": 0.5931, "step": 19570 }, { "epoch": 58.8, "grad_norm": 24.857013702392578, "learning_rate": 8.04004004004004e-06, "loss": 0.6067, "step": 19580 }, { "epoch": 58.83, "grad_norm": 16.145484924316406, "learning_rate": 8.03903903903904e-06, "loss": 0.6382, "step": 19590 }, { "epoch": 58.86, "grad_norm": 19.54372787475586, "learning_rate": 8.038038038038038e-06, "loss": 0.6105, "step": 19600 }, { "epoch": 58.89, "grad_norm": 12.304158210754395, "learning_rate": 8.037037037037038e-06, "loss": 0.5826, "step": 19610 }, { "epoch": 58.92, "grad_norm": 13.34826374053955, "learning_rate": 8.036036036036037e-06, "loss": 0.5916, "step": 19620 }, { "epoch": 58.95, "grad_norm": 15.727002143859863, "learning_rate": 8.035035035035035e-06, "loss": 0.5416, "step": 19630 }, { "epoch": 58.98, "grad_norm": 20.862152099609375, "learning_rate": 8.034034034034036e-06, "loss": 0.6031, "step": 19640 }, { "epoch": 59.0, "eval_accuracy": 0.8647, "eval_loss": 0.4573589861392975, "eval_runtime": 12.7266, "eval_samples_per_second": 785.754, "eval_steps_per_second": 3.143, "step": 19647 }, { "epoch": 59.01, "grad_norm": 17.41853904724121, "learning_rate": 8.033033033033034e-06, "loss": 0.5721, "step": 19650 }, { "epoch": 59.04, "grad_norm": 14.630786895751953, "learning_rate": 8.032032032032033e-06, "loss": 0.5814, "step": 19660 }, { "epoch": 59.07, "grad_norm": 18.51511573791504, "learning_rate": 8.031031031031031e-06, "loss": 0.6257, "step": 19670 }, { "epoch": 59.1, "grad_norm": 16.176118850708008, "learning_rate": 8.03003003003003e-06, "loss": 0.5536, "step": 19680 }, { "epoch": 59.13, "grad_norm": 18.753887176513672, "learning_rate": 8.02902902902903e-06, "loss": 0.5978, "step": 19690 }, { "epoch": 59.16, "grad_norm": 14.25525188446045, "learning_rate": 8.028028028028029e-06, "loss": 0.535, "step": 19700 }, { "epoch": 59.19, "grad_norm": 23.966169357299805, "learning_rate": 8.027027027027027e-06, "loss": 0.5691, "step": 19710 }, { "epoch": 59.22, "grad_norm": 15.448729515075684, "learning_rate": 8.026026026026028e-06, "loss": 0.6087, "step": 19720 }, { "epoch": 59.25, "grad_norm": 14.6622314453125, "learning_rate": 8.025025025025026e-06, "loss": 0.5718, "step": 19730 }, { "epoch": 59.28, "grad_norm": 28.261085510253906, "learning_rate": 8.024024024024025e-06, "loss": 0.5416, "step": 19740 }, { "epoch": 59.31, "grad_norm": 19.39133071899414, "learning_rate": 8.023023023023023e-06, "loss": 0.5686, "step": 19750 }, { "epoch": 59.34, "grad_norm": 14.506977081298828, "learning_rate": 8.022022022022022e-06, "loss": 0.5647, "step": 19760 }, { "epoch": 59.37, "grad_norm": 11.523022651672363, "learning_rate": 8.021021021021022e-06, "loss": 0.5916, "step": 19770 }, { "epoch": 59.4, "grad_norm": 14.599289894104004, "learning_rate": 8.020020020020021e-06, "loss": 0.6119, "step": 19780 }, { "epoch": 59.43, "grad_norm": 16.842275619506836, "learning_rate": 8.01901901901902e-06, "loss": 0.6193, "step": 19790 }, { "epoch": 59.46, "grad_norm": 17.81415367126465, "learning_rate": 8.018018018018018e-06, "loss": 0.653, "step": 19800 }, { "epoch": 59.49, "grad_norm": 13.608844757080078, "learning_rate": 8.017017017017018e-06, "loss": 0.6132, "step": 19810 }, { "epoch": 59.52, "grad_norm": 17.850231170654297, "learning_rate": 8.016016016016017e-06, "loss": 0.5948, "step": 19820 }, { "epoch": 59.55, "grad_norm": 14.544975280761719, "learning_rate": 8.015015015015016e-06, "loss": 0.5357, "step": 19830 }, { "epoch": 59.58, "grad_norm": 22.349634170532227, "learning_rate": 8.014014014014014e-06, "loss": 0.5214, "step": 19840 }, { "epoch": 59.61, "grad_norm": 14.013883590698242, "learning_rate": 8.013013013013013e-06, "loss": 0.496, "step": 19850 }, { "epoch": 59.64, "grad_norm": 13.427343368530273, "learning_rate": 8.012012012012013e-06, "loss": 0.56, "step": 19860 }, { "epoch": 59.67, "grad_norm": 14.705164909362793, "learning_rate": 8.011011011011012e-06, "loss": 0.5356, "step": 19870 }, { "epoch": 59.7, "grad_norm": 20.24700164794922, "learning_rate": 8.01001001001001e-06, "loss": 0.6188, "step": 19880 }, { "epoch": 59.73, "grad_norm": 16.095613479614258, "learning_rate": 8.00900900900901e-06, "loss": 0.6366, "step": 19890 }, { "epoch": 59.76, "grad_norm": 14.603818893432617, "learning_rate": 8.00800800800801e-06, "loss": 0.5962, "step": 19900 }, { "epoch": 59.79, "grad_norm": 25.40558624267578, "learning_rate": 8.007007007007008e-06, "loss": 0.6345, "step": 19910 }, { "epoch": 59.82, "grad_norm": 15.939118385314941, "learning_rate": 8.006006006006006e-06, "loss": 0.5625, "step": 19920 }, { "epoch": 59.85, "grad_norm": 24.41084098815918, "learning_rate": 8.005005005005005e-06, "loss": 0.6134, "step": 19930 }, { "epoch": 59.88, "grad_norm": 12.29263687133789, "learning_rate": 8.004004004004005e-06, "loss": 0.5302, "step": 19940 }, { "epoch": 59.91, "grad_norm": 17.57611846923828, "learning_rate": 8.003003003003004e-06, "loss": 0.587, "step": 19950 }, { "epoch": 59.94, "grad_norm": 12.565730094909668, "learning_rate": 8.002002002002002e-06, "loss": 0.5219, "step": 19960 }, { "epoch": 59.97, "grad_norm": 20.622846603393555, "learning_rate": 8.001001001001003e-06, "loss": 0.6452, "step": 19970 }, { "epoch": 60.0, "grad_norm": 131.89938354492188, "learning_rate": 8.000000000000001e-06, "loss": 0.5445, "step": 19980 }, { "epoch": 60.0, "eval_accuracy": 0.8629, "eval_loss": 0.4607356786727905, "eval_runtime": 13.0912, "eval_samples_per_second": 763.874, "eval_steps_per_second": 3.055, "step": 19980 }, { "epoch": 60.03, "grad_norm": 14.892760276794434, "learning_rate": 7.998998998999e-06, "loss": 0.5442, "step": 19990 }, { "epoch": 60.06, "grad_norm": 15.569472312927246, "learning_rate": 7.997997997997999e-06, "loss": 0.6092, "step": 20000 }, { "epoch": 60.09, "grad_norm": 18.588777542114258, "learning_rate": 7.996996996996997e-06, "loss": 0.5771, "step": 20010 }, { "epoch": 60.12, "grad_norm": 21.79499626159668, "learning_rate": 7.995995995995996e-06, "loss": 0.6436, "step": 20020 }, { "epoch": 60.15, "grad_norm": 10.009160041809082, "learning_rate": 7.994994994994996e-06, "loss": 0.5526, "step": 20030 }, { "epoch": 60.18, "grad_norm": 14.873530387878418, "learning_rate": 7.993993993993995e-06, "loss": 0.5506, "step": 20040 }, { "epoch": 60.21, "grad_norm": 14.862759590148926, "learning_rate": 7.992992992992993e-06, "loss": 0.5676, "step": 20050 }, { "epoch": 60.24, "grad_norm": 14.91917896270752, "learning_rate": 7.991991991991993e-06, "loss": 0.5795, "step": 20060 }, { "epoch": 60.27, "grad_norm": 16.91362762451172, "learning_rate": 7.990990990990992e-06, "loss": 0.5699, "step": 20070 }, { "epoch": 60.3, "grad_norm": 22.361339569091797, "learning_rate": 7.98998998998999e-06, "loss": 0.5749, "step": 20080 }, { "epoch": 60.33, "grad_norm": 25.052778244018555, "learning_rate": 7.98898898898899e-06, "loss": 0.5692, "step": 20090 }, { "epoch": 60.36, "grad_norm": 19.123653411865234, "learning_rate": 7.987987987987988e-06, "loss": 0.5832, "step": 20100 }, { "epoch": 60.39, "grad_norm": 16.62225341796875, "learning_rate": 7.986986986986988e-06, "loss": 0.5846, "step": 20110 }, { "epoch": 60.42, "grad_norm": 16.223085403442383, "learning_rate": 7.985985985985987e-06, "loss": 0.5737, "step": 20120 }, { "epoch": 60.45, "grad_norm": 20.05699920654297, "learning_rate": 7.984984984984985e-06, "loss": 0.5762, "step": 20130 }, { "epoch": 60.48, "grad_norm": 16.24242401123047, "learning_rate": 7.983983983983986e-06, "loss": 0.553, "step": 20140 }, { "epoch": 60.51, "grad_norm": 12.968668937683105, "learning_rate": 7.982982982982984e-06, "loss": 0.5181, "step": 20150 }, { "epoch": 60.54, "grad_norm": 27.785924911499023, "learning_rate": 7.981981981981983e-06, "loss": 0.5528, "step": 20160 }, { "epoch": 60.57, "grad_norm": 14.472647666931152, "learning_rate": 7.980980980980981e-06, "loss": 0.54, "step": 20170 }, { "epoch": 60.6, "grad_norm": 12.095008850097656, "learning_rate": 7.97997997997998e-06, "loss": 0.5601, "step": 20180 }, { "epoch": 60.63, "grad_norm": 11.346393585205078, "learning_rate": 7.97897897897898e-06, "loss": 0.57, "step": 20190 }, { "epoch": 60.66, "grad_norm": 16.481164932250977, "learning_rate": 7.977977977977979e-06, "loss": 0.5786, "step": 20200 }, { "epoch": 60.69, "grad_norm": 12.198641777038574, "learning_rate": 7.976976976976977e-06, "loss": 0.5661, "step": 20210 }, { "epoch": 60.72, "grad_norm": 15.9042329788208, "learning_rate": 7.975975975975978e-06, "loss": 0.6127, "step": 20220 }, { "epoch": 60.75, "grad_norm": 16.836502075195312, "learning_rate": 7.974974974974976e-06, "loss": 0.5705, "step": 20230 }, { "epoch": 60.78, "grad_norm": 16.284013748168945, "learning_rate": 7.973973973973973e-06, "loss": 0.6247, "step": 20240 }, { "epoch": 60.81, "grad_norm": 13.169670104980469, "learning_rate": 7.972972972972974e-06, "loss": 0.5771, "step": 20250 }, { "epoch": 60.84, "grad_norm": 15.882311820983887, "learning_rate": 7.971971971971972e-06, "loss": 0.6058, "step": 20260 }, { "epoch": 60.87, "grad_norm": 18.854999542236328, "learning_rate": 7.97097097097097e-06, "loss": 0.5582, "step": 20270 }, { "epoch": 60.9, "grad_norm": 11.473094940185547, "learning_rate": 7.969969969969971e-06, "loss": 0.6164, "step": 20280 }, { "epoch": 60.93, "grad_norm": 17.09153175354004, "learning_rate": 7.96896896896897e-06, "loss": 0.5493, "step": 20290 }, { "epoch": 60.96, "grad_norm": 16.638715744018555, "learning_rate": 7.967967967967968e-06, "loss": 0.5766, "step": 20300 }, { "epoch": 60.99, "grad_norm": 17.09829330444336, "learning_rate": 7.966966966966969e-06, "loss": 0.5589, "step": 20310 }, { "epoch": 61.0, "eval_accuracy": 0.8649, "eval_loss": 0.4618603587150574, "eval_runtime": 12.9435, "eval_samples_per_second": 772.588, "eval_steps_per_second": 3.09, "step": 20313 }, { "epoch": 61.02, "grad_norm": 13.013632774353027, "learning_rate": 7.965965965965967e-06, "loss": 0.515, "step": 20320 }, { "epoch": 61.05, "grad_norm": 19.766704559326172, "learning_rate": 7.964964964964966e-06, "loss": 0.551, "step": 20330 }, { "epoch": 61.08, "grad_norm": 14.834807395935059, "learning_rate": 7.963963963963964e-06, "loss": 0.598, "step": 20340 }, { "epoch": 61.11, "grad_norm": 13.760293006896973, "learning_rate": 7.962962962962963e-06, "loss": 0.5825, "step": 20350 }, { "epoch": 61.14, "grad_norm": 17.49335479736328, "learning_rate": 7.961961961961963e-06, "loss": 0.5328, "step": 20360 }, { "epoch": 61.17, "grad_norm": 16.027816772460938, "learning_rate": 7.960960960960962e-06, "loss": 0.6012, "step": 20370 }, { "epoch": 61.2, "grad_norm": 23.449174880981445, "learning_rate": 7.95995995995996e-06, "loss": 0.5785, "step": 20380 }, { "epoch": 61.23, "grad_norm": 25.767745971679688, "learning_rate": 7.95895895895896e-06, "loss": 0.613, "step": 20390 }, { "epoch": 61.26, "grad_norm": 11.872919082641602, "learning_rate": 7.95795795795796e-06, "loss": 0.5442, "step": 20400 }, { "epoch": 61.29, "grad_norm": 14.535460472106934, "learning_rate": 7.956956956956958e-06, "loss": 0.599, "step": 20410 }, { "epoch": 61.32, "grad_norm": 14.450933456420898, "learning_rate": 7.955955955955956e-06, "loss": 0.5674, "step": 20420 }, { "epoch": 61.35, "grad_norm": 16.769847869873047, "learning_rate": 7.954954954954955e-06, "loss": 0.5948, "step": 20430 }, { "epoch": 61.38, "grad_norm": 21.674453735351562, "learning_rate": 7.953953953953955e-06, "loss": 0.5648, "step": 20440 }, { "epoch": 61.41, "grad_norm": 17.37084197998047, "learning_rate": 7.952952952952954e-06, "loss": 0.6078, "step": 20450 }, { "epoch": 61.44, "grad_norm": 14.884594917297363, "learning_rate": 7.951951951951953e-06, "loss": 0.5549, "step": 20460 }, { "epoch": 61.47, "grad_norm": 22.308752059936523, "learning_rate": 7.950950950950951e-06, "loss": 0.5316, "step": 20470 }, { "epoch": 61.5, "grad_norm": 15.277771949768066, "learning_rate": 7.949949949949951e-06, "loss": 0.6423, "step": 20480 }, { "epoch": 61.53, "grad_norm": 12.716387748718262, "learning_rate": 7.948948948948948e-06, "loss": 0.6142, "step": 20490 }, { "epoch": 61.56, "grad_norm": 21.089902877807617, "learning_rate": 7.947947947947949e-06, "loss": 0.564, "step": 20500 }, { "epoch": 61.59, "grad_norm": 19.012067794799805, "learning_rate": 7.946946946946947e-06, "loss": 0.5945, "step": 20510 }, { "epoch": 61.62, "grad_norm": 17.51711082458496, "learning_rate": 7.945945945945946e-06, "loss": 0.5803, "step": 20520 }, { "epoch": 61.65, "grad_norm": 14.719621658325195, "learning_rate": 7.944944944944946e-06, "loss": 0.5965, "step": 20530 }, { "epoch": 61.68, "grad_norm": 18.21731948852539, "learning_rate": 7.943943943943945e-06, "loss": 0.5845, "step": 20540 }, { "epoch": 61.71, "grad_norm": 15.174802780151367, "learning_rate": 7.942942942942943e-06, "loss": 0.5575, "step": 20550 }, { "epoch": 61.74, "grad_norm": 13.877257347106934, "learning_rate": 7.941941941941944e-06, "loss": 0.5519, "step": 20560 }, { "epoch": 61.77, "grad_norm": 17.353370666503906, "learning_rate": 7.940940940940942e-06, "loss": 0.563, "step": 20570 }, { "epoch": 61.8, "grad_norm": 10.737649917602539, "learning_rate": 7.93993993993994e-06, "loss": 0.5891, "step": 20580 }, { "epoch": 61.83, "grad_norm": 15.70091724395752, "learning_rate": 7.93893893893894e-06, "loss": 0.5924, "step": 20590 }, { "epoch": 61.86, "grad_norm": 18.718290328979492, "learning_rate": 7.937937937937938e-06, "loss": 0.6194, "step": 20600 }, { "epoch": 61.89, "grad_norm": 13.471919059753418, "learning_rate": 7.936936936936938e-06, "loss": 0.5749, "step": 20610 }, { "epoch": 61.92, "grad_norm": 17.65549659729004, "learning_rate": 7.935935935935937e-06, "loss": 0.5113, "step": 20620 }, { "epoch": 61.95, "grad_norm": 14.194541931152344, "learning_rate": 7.934934934934935e-06, "loss": 0.5658, "step": 20630 }, { "epoch": 61.98, "grad_norm": 13.701493263244629, "learning_rate": 7.933933933933936e-06, "loss": 0.5777, "step": 20640 }, { "epoch": 62.0, "eval_accuracy": 0.8626, "eval_loss": 0.4739953577518463, "eval_runtime": 12.593, "eval_samples_per_second": 794.095, "eval_steps_per_second": 3.176, "step": 20646 }, { "epoch": 62.01, "grad_norm": 16.66990852355957, "learning_rate": 7.932932932932934e-06, "loss": 0.5321, "step": 20650 }, { "epoch": 62.04, "grad_norm": 16.17997169494629, "learning_rate": 7.931931931931933e-06, "loss": 0.553, "step": 20660 }, { "epoch": 62.07, "grad_norm": 15.974793434143066, "learning_rate": 7.930930930930931e-06, "loss": 0.6005, "step": 20670 }, { "epoch": 62.1, "grad_norm": 22.15069580078125, "learning_rate": 7.92992992992993e-06, "loss": 0.5662, "step": 20680 }, { "epoch": 62.13, "grad_norm": 15.846395492553711, "learning_rate": 7.928928928928929e-06, "loss": 0.6265, "step": 20690 }, { "epoch": 62.16, "grad_norm": 15.250953674316406, "learning_rate": 7.927927927927929e-06, "loss": 0.5739, "step": 20700 }, { "epoch": 62.19, "grad_norm": 14.215643882751465, "learning_rate": 7.926926926926928e-06, "loss": 0.5919, "step": 20710 }, { "epoch": 62.22, "grad_norm": 26.54216194152832, "learning_rate": 7.925925925925926e-06, "loss": 0.5703, "step": 20720 }, { "epoch": 62.25, "grad_norm": 23.78998374938965, "learning_rate": 7.924924924924926e-06, "loss": 0.5604, "step": 20730 }, { "epoch": 62.28, "grad_norm": 20.236692428588867, "learning_rate": 7.923923923923923e-06, "loss": 0.5878, "step": 20740 }, { "epoch": 62.31, "grad_norm": 23.307592391967773, "learning_rate": 7.922922922922924e-06, "loss": 0.5362, "step": 20750 }, { "epoch": 62.34, "grad_norm": 21.87845802307129, "learning_rate": 7.921921921921922e-06, "loss": 0.5588, "step": 20760 }, { "epoch": 62.37, "grad_norm": 19.75901222229004, "learning_rate": 7.92092092092092e-06, "loss": 0.5118, "step": 20770 }, { "epoch": 62.4, "grad_norm": 15.26552963256836, "learning_rate": 7.919919919919921e-06, "loss": 0.5601, "step": 20780 }, { "epoch": 62.43, "grad_norm": 10.958950996398926, "learning_rate": 7.91891891891892e-06, "loss": 0.5518, "step": 20790 }, { "epoch": 62.46, "grad_norm": 14.27664852142334, "learning_rate": 7.917917917917918e-06, "loss": 0.5905, "step": 20800 }, { "epoch": 62.49, "grad_norm": 17.700889587402344, "learning_rate": 7.916916916916919e-06, "loss": 0.5595, "step": 20810 }, { "epoch": 62.52, "grad_norm": 16.927452087402344, "learning_rate": 7.915915915915915e-06, "loss": 0.5627, "step": 20820 }, { "epoch": 62.55, "grad_norm": 15.279191017150879, "learning_rate": 7.914914914914916e-06, "loss": 0.5965, "step": 20830 }, { "epoch": 62.58, "grad_norm": 14.534746170043945, "learning_rate": 7.913913913913914e-06, "loss": 0.6307, "step": 20840 }, { "epoch": 62.61, "grad_norm": 20.03127670288086, "learning_rate": 7.912912912912913e-06, "loss": 0.5817, "step": 20850 }, { "epoch": 62.64, "grad_norm": 14.393763542175293, "learning_rate": 7.911911911911913e-06, "loss": 0.5687, "step": 20860 }, { "epoch": 62.67, "grad_norm": 19.6622314453125, "learning_rate": 7.910910910910912e-06, "loss": 0.5802, "step": 20870 }, { "epoch": 62.7, "grad_norm": 12.865262031555176, "learning_rate": 7.90990990990991e-06, "loss": 0.5928, "step": 20880 }, { "epoch": 62.73, "grad_norm": 15.107696533203125, "learning_rate": 7.90890890890891e-06, "loss": 0.5349, "step": 20890 }, { "epoch": 62.76, "grad_norm": 18.19290542602539, "learning_rate": 7.90790790790791e-06, "loss": 0.578, "step": 20900 }, { "epoch": 62.79, "grad_norm": 12.389555931091309, "learning_rate": 7.906906906906908e-06, "loss": 0.6213, "step": 20910 }, { "epoch": 62.82, "grad_norm": 16.094959259033203, "learning_rate": 7.905905905905907e-06, "loss": 0.5919, "step": 20920 }, { "epoch": 62.85, "grad_norm": 20.8697509765625, "learning_rate": 7.904904904904905e-06, "loss": 0.6286, "step": 20930 }, { "epoch": 62.88, "grad_norm": 12.95927906036377, "learning_rate": 7.903903903903904e-06, "loss": 0.5543, "step": 20940 }, { "epoch": 62.91, "grad_norm": 12.200501441955566, "learning_rate": 7.902902902902904e-06, "loss": 0.5954, "step": 20950 }, { "epoch": 62.94, "grad_norm": 20.14626693725586, "learning_rate": 7.901901901901903e-06, "loss": 0.5558, "step": 20960 }, { "epoch": 62.97, "grad_norm": 15.596393585205078, "learning_rate": 7.900900900900901e-06, "loss": 0.5711, "step": 20970 }, { "epoch": 63.0, "eval_accuracy": 0.8659, "eval_loss": 0.4683513045310974, "eval_runtime": 12.7882, "eval_samples_per_second": 781.971, "eval_steps_per_second": 3.128, "step": 20979 }, { "epoch": 63.0, "grad_norm": 16.402956008911133, "learning_rate": 7.899899899899901e-06, "loss": 0.5426, "step": 20980 }, { "epoch": 63.03, "grad_norm": 16.609580993652344, "learning_rate": 7.898898898898898e-06, "loss": 0.5684, "step": 20990 }, { "epoch": 63.06, "grad_norm": 19.532304763793945, "learning_rate": 7.897897897897899e-06, "loss": 0.589, "step": 21000 }, { "epoch": 63.09, "grad_norm": 11.068476676940918, "learning_rate": 7.896896896896897e-06, "loss": 0.5477, "step": 21010 }, { "epoch": 63.12, "grad_norm": 12.566080093383789, "learning_rate": 7.895895895895896e-06, "loss": 0.5507, "step": 21020 }, { "epoch": 63.15, "grad_norm": 12.536846160888672, "learning_rate": 7.894894894894896e-06, "loss": 0.5802, "step": 21030 }, { "epoch": 63.18, "grad_norm": 20.08684539794922, "learning_rate": 7.893893893893895e-06, "loss": 0.5974, "step": 21040 }, { "epoch": 63.21, "grad_norm": 14.982975959777832, "learning_rate": 7.892892892892893e-06, "loss": 0.5331, "step": 21050 }, { "epoch": 63.24, "grad_norm": 15.32133674621582, "learning_rate": 7.891891891891894e-06, "loss": 0.5339, "step": 21060 }, { "epoch": 63.27, "grad_norm": 16.593841552734375, "learning_rate": 7.89089089089089e-06, "loss": 0.5521, "step": 21070 }, { "epoch": 63.3, "grad_norm": 11.591399192810059, "learning_rate": 7.88988988988989e-06, "loss": 0.5025, "step": 21080 }, { "epoch": 63.33, "grad_norm": 16.387269973754883, "learning_rate": 7.88888888888889e-06, "loss": 0.6144, "step": 21090 }, { "epoch": 63.36, "grad_norm": 14.72872543334961, "learning_rate": 7.887887887887888e-06, "loss": 0.5563, "step": 21100 }, { "epoch": 63.39, "grad_norm": 16.292652130126953, "learning_rate": 7.886886886886888e-06, "loss": 0.5957, "step": 21110 }, { "epoch": 63.42, "grad_norm": 11.861322402954102, "learning_rate": 7.885885885885887e-06, "loss": 0.6094, "step": 21120 }, { "epoch": 63.45, "grad_norm": 18.908823013305664, "learning_rate": 7.884884884884885e-06, "loss": 0.5862, "step": 21130 }, { "epoch": 63.48, "grad_norm": 15.238788604736328, "learning_rate": 7.883883883883886e-06, "loss": 0.6308, "step": 21140 }, { "epoch": 63.51, "grad_norm": 13.101716041564941, "learning_rate": 7.882882882882884e-06, "loss": 0.5427, "step": 21150 }, { "epoch": 63.54, "grad_norm": 14.60013198852539, "learning_rate": 7.881881881881881e-06, "loss": 0.5067, "step": 21160 }, { "epoch": 63.57, "grad_norm": 16.040077209472656, "learning_rate": 7.880880880880882e-06, "loss": 0.618, "step": 21170 }, { "epoch": 63.6, "grad_norm": 17.32454490661621, "learning_rate": 7.87987987987988e-06, "loss": 0.5619, "step": 21180 }, { "epoch": 63.63, "grad_norm": 13.533453941345215, "learning_rate": 7.878878878878879e-06, "loss": 0.5796, "step": 21190 }, { "epoch": 63.66, "grad_norm": 16.205324172973633, "learning_rate": 7.877877877877879e-06, "loss": 0.5986, "step": 21200 }, { "epoch": 63.69, "grad_norm": 14.960679054260254, "learning_rate": 7.876876876876878e-06, "loss": 0.5975, "step": 21210 }, { "epoch": 63.72, "grad_norm": 15.52830982208252, "learning_rate": 7.875875875875876e-06, "loss": 0.5616, "step": 21220 }, { "epoch": 63.75, "grad_norm": 15.617673873901367, "learning_rate": 7.874874874874877e-06, "loss": 0.5214, "step": 21230 }, { "epoch": 63.78, "grad_norm": 12.792104721069336, "learning_rate": 7.873873873873873e-06, "loss": 0.5831, "step": 21240 }, { "epoch": 63.81, "grad_norm": 12.508368492126465, "learning_rate": 7.872872872872874e-06, "loss": 0.5702, "step": 21250 }, { "epoch": 63.84, "grad_norm": 12.276655197143555, "learning_rate": 7.871871871871872e-06, "loss": 0.574, "step": 21260 }, { "epoch": 63.87, "grad_norm": 11.787044525146484, "learning_rate": 7.870870870870871e-06, "loss": 0.5661, "step": 21270 }, { "epoch": 63.9, "grad_norm": 15.9622163772583, "learning_rate": 7.869869869869871e-06, "loss": 0.5474, "step": 21280 }, { "epoch": 63.93, "grad_norm": 22.360185623168945, "learning_rate": 7.86886886886887e-06, "loss": 0.5567, "step": 21290 }, { "epoch": 63.96, "grad_norm": 11.864326477050781, "learning_rate": 7.867867867867868e-06, "loss": 0.5897, "step": 21300 }, { "epoch": 63.99, "grad_norm": 23.33677101135254, "learning_rate": 7.866866866866869e-06, "loss": 0.5369, "step": 21310 }, { "epoch": 64.0, "eval_accuracy": 0.8639, "eval_loss": 0.4654677212238312, "eval_runtime": 12.5277, "eval_samples_per_second": 798.231, "eval_steps_per_second": 3.193, "step": 21312 }, { "epoch": 64.02, "grad_norm": 14.384602546691895, "learning_rate": 7.865865865865866e-06, "loss": 0.5792, "step": 21320 }, { "epoch": 64.05, "grad_norm": 24.945512771606445, "learning_rate": 7.864864864864866e-06, "loss": 0.5791, "step": 21330 }, { "epoch": 64.08, "grad_norm": 14.134567260742188, "learning_rate": 7.863863863863864e-06, "loss": 0.6084, "step": 21340 }, { "epoch": 64.11, "grad_norm": 15.435029983520508, "learning_rate": 7.862862862862863e-06, "loss": 0.5915, "step": 21350 }, { "epoch": 64.14, "grad_norm": 17.37160873413086, "learning_rate": 7.861861861861863e-06, "loss": 0.5845, "step": 21360 }, { "epoch": 64.17, "grad_norm": 12.681687355041504, "learning_rate": 7.860860860860862e-06, "loss": 0.5518, "step": 21370 }, { "epoch": 64.2, "grad_norm": 15.757386207580566, "learning_rate": 7.85985985985986e-06, "loss": 0.5454, "step": 21380 }, { "epoch": 64.23, "grad_norm": 16.375354766845703, "learning_rate": 7.858858858858859e-06, "loss": 0.62, "step": 21390 }, { "epoch": 64.26, "grad_norm": 15.337791442871094, "learning_rate": 7.85785785785786e-06, "loss": 0.5498, "step": 21400 }, { "epoch": 64.29, "grad_norm": 13.011787414550781, "learning_rate": 7.856856856856856e-06, "loss": 0.5191, "step": 21410 }, { "epoch": 64.32, "grad_norm": 14.646492958068848, "learning_rate": 7.855855855855857e-06, "loss": 0.5602, "step": 21420 }, { "epoch": 64.35, "grad_norm": 14.467238426208496, "learning_rate": 7.854854854854855e-06, "loss": 0.5091, "step": 21430 }, { "epoch": 64.38, "grad_norm": 16.313474655151367, "learning_rate": 7.853853853853854e-06, "loss": 0.5858, "step": 21440 }, { "epoch": 64.41, "grad_norm": 15.73123550415039, "learning_rate": 7.852852852852854e-06, "loss": 0.5917, "step": 21450 }, { "epoch": 64.44, "grad_norm": 17.798755645751953, "learning_rate": 7.851851851851853e-06, "loss": 0.5632, "step": 21460 }, { "epoch": 64.47, "grad_norm": 13.452340126037598, "learning_rate": 7.850850850850851e-06, "loss": 0.5486, "step": 21470 }, { "epoch": 64.5, "grad_norm": 12.10430908203125, "learning_rate": 7.849849849849852e-06, "loss": 0.5004, "step": 21480 }, { "epoch": 64.53, "grad_norm": 16.858976364135742, "learning_rate": 7.848848848848848e-06, "loss": 0.5309, "step": 21490 }, { "epoch": 64.56, "grad_norm": 16.18703269958496, "learning_rate": 7.847847847847849e-06, "loss": 0.6153, "step": 21500 }, { "epoch": 64.59, "grad_norm": 19.76614761352539, "learning_rate": 7.846846846846847e-06, "loss": 0.6109, "step": 21510 }, { "epoch": 64.62, "grad_norm": 15.78753662109375, "learning_rate": 7.845845845845846e-06, "loss": 0.5379, "step": 21520 }, { "epoch": 64.65, "grad_norm": 15.919106483459473, "learning_rate": 7.844844844844846e-06, "loss": 0.5909, "step": 21530 }, { "epoch": 64.68, "grad_norm": 16.947357177734375, "learning_rate": 7.843843843843845e-06, "loss": 0.5273, "step": 21540 }, { "epoch": 64.71, "grad_norm": 14.03024959564209, "learning_rate": 7.842842842842843e-06, "loss": 0.5747, "step": 21550 }, { "epoch": 64.74, "grad_norm": 14.933127403259277, "learning_rate": 7.841841841841844e-06, "loss": 0.4973, "step": 21560 }, { "epoch": 64.77, "grad_norm": 15.095874786376953, "learning_rate": 7.84084084084084e-06, "loss": 0.5504, "step": 21570 }, { "epoch": 64.8, "grad_norm": 14.683979034423828, "learning_rate": 7.839839839839841e-06, "loss": 0.5498, "step": 21580 }, { "epoch": 64.83, "grad_norm": 17.910722732543945, "learning_rate": 7.83883883883884e-06, "loss": 0.5901, "step": 21590 }, { "epoch": 64.86, "grad_norm": 13.064376831054688, "learning_rate": 7.837837837837838e-06, "loss": 0.6135, "step": 21600 }, { "epoch": 64.89, "grad_norm": 14.728261947631836, "learning_rate": 7.836836836836837e-06, "loss": 0.6248, "step": 21610 }, { "epoch": 64.92, "grad_norm": 17.558917999267578, "learning_rate": 7.835835835835837e-06, "loss": 0.589, "step": 21620 }, { "epoch": 64.95, "grad_norm": 16.495952606201172, "learning_rate": 7.834834834834836e-06, "loss": 0.543, "step": 21630 }, { "epoch": 64.98, "grad_norm": 24.232667922973633, "learning_rate": 7.833833833833834e-06, "loss": 0.5454, "step": 21640 }, { "epoch": 65.0, "eval_accuracy": 0.867, "eval_loss": 0.45735976099967957, "eval_runtime": 12.4469, "eval_samples_per_second": 803.414, "eval_steps_per_second": 3.214, "step": 21645 }, { "epoch": 65.02, "grad_norm": 20.44569969177246, "learning_rate": 7.832832832832834e-06, "loss": 0.5781, "step": 21650 }, { "epoch": 65.05, "grad_norm": 14.24367618560791, "learning_rate": 7.831831831831831e-06, "loss": 0.5046, "step": 21660 }, { "epoch": 65.08, "grad_norm": 18.760143280029297, "learning_rate": 7.830830830830832e-06, "loss": 0.5318, "step": 21670 }, { "epoch": 65.11, "grad_norm": 9.44014835357666, "learning_rate": 7.82982982982983e-06, "loss": 0.545, "step": 21680 }, { "epoch": 65.14, "grad_norm": 13.003317832946777, "learning_rate": 7.828828828828829e-06, "loss": 0.5661, "step": 21690 }, { "epoch": 65.17, "grad_norm": 12.037182807922363, "learning_rate": 7.827827827827829e-06, "loss": 0.5858, "step": 21700 }, { "epoch": 65.2, "grad_norm": 16.40542984008789, "learning_rate": 7.826826826826828e-06, "loss": 0.5875, "step": 21710 }, { "epoch": 65.23, "grad_norm": 23.315446853637695, "learning_rate": 7.825825825825826e-06, "loss": 0.5395, "step": 21720 }, { "epoch": 65.26, "grad_norm": 15.518813133239746, "learning_rate": 7.824824824824827e-06, "loss": 0.5867, "step": 21730 }, { "epoch": 65.29, "grad_norm": 12.166085243225098, "learning_rate": 7.823823823823823e-06, "loss": 0.5756, "step": 21740 }, { "epoch": 65.32, "grad_norm": 18.248538970947266, "learning_rate": 7.822822822822824e-06, "loss": 0.5688, "step": 21750 }, { "epoch": 65.35, "grad_norm": 18.594797134399414, "learning_rate": 7.821821821821822e-06, "loss": 0.5927, "step": 21760 }, { "epoch": 65.38, "grad_norm": 12.831320762634277, "learning_rate": 7.820820820820821e-06, "loss": 0.5372, "step": 21770 }, { "epoch": 65.41, "grad_norm": 13.435611724853516, "learning_rate": 7.819819819819821e-06, "loss": 0.5832, "step": 21780 }, { "epoch": 65.44, "grad_norm": 13.874771118164062, "learning_rate": 7.81881881881882e-06, "loss": 0.5114, "step": 21790 }, { "epoch": 65.47, "grad_norm": 14.710037231445312, "learning_rate": 7.817817817817818e-06, "loss": 0.5079, "step": 21800 }, { "epoch": 65.5, "grad_norm": 17.372941970825195, "learning_rate": 7.816816816816819e-06, "loss": 0.5225, "step": 21810 }, { "epoch": 65.53, "grad_norm": 16.734004974365234, "learning_rate": 7.815815815815816e-06, "loss": 0.524, "step": 21820 }, { "epoch": 65.56, "grad_norm": 17.2759952545166, "learning_rate": 7.814814814814816e-06, "loss": 0.5102, "step": 21830 }, { "epoch": 65.59, "grad_norm": 12.53348445892334, "learning_rate": 7.813813813813815e-06, "loss": 0.5968, "step": 21840 }, { "epoch": 65.62, "grad_norm": 15.117351531982422, "learning_rate": 7.812812812812813e-06, "loss": 0.5649, "step": 21850 }, { "epoch": 65.65, "grad_norm": 14.020438194274902, "learning_rate": 7.811811811811812e-06, "loss": 0.5632, "step": 21860 }, { "epoch": 65.68, "grad_norm": 19.232515335083008, "learning_rate": 7.810810810810812e-06, "loss": 0.5373, "step": 21870 }, { "epoch": 65.71, "grad_norm": 18.334613800048828, "learning_rate": 7.80980980980981e-06, "loss": 0.5736, "step": 21880 }, { "epoch": 65.74, "grad_norm": 14.187418937683105, "learning_rate": 7.80880880880881e-06, "loss": 0.5321, "step": 21890 }, { "epoch": 65.77, "grad_norm": 23.7889461517334, "learning_rate": 7.807807807807808e-06, "loss": 0.5544, "step": 21900 }, { "epoch": 65.8, "grad_norm": 13.015482902526855, "learning_rate": 7.806806806806806e-06, "loss": 0.6049, "step": 21910 }, { "epoch": 65.83, "grad_norm": 19.774629592895508, "learning_rate": 7.805805805805807e-06, "loss": 0.5379, "step": 21920 }, { "epoch": 65.86, "grad_norm": 17.81984519958496, "learning_rate": 7.804804804804805e-06, "loss": 0.6127, "step": 21930 }, { "epoch": 65.89, "grad_norm": 11.629871368408203, "learning_rate": 7.803803803803804e-06, "loss": 0.5709, "step": 21940 }, { "epoch": 65.92, "grad_norm": 14.506510734558105, "learning_rate": 7.802802802802804e-06, "loss": 0.5617, "step": 21950 }, { "epoch": 65.95, "grad_norm": 13.262490272521973, "learning_rate": 7.801801801801803e-06, "loss": 0.6193, "step": 21960 }, { "epoch": 65.98, "grad_norm": 16.1383113861084, "learning_rate": 7.800800800800801e-06, "loss": 0.5471, "step": 21970 }, { "epoch": 66.0, "eval_accuracy": 0.8655, "eval_loss": 0.4578593671321869, "eval_runtime": 12.6482, "eval_samples_per_second": 790.628, "eval_steps_per_second": 3.163, "step": 21978 }, { "epoch": 66.01, "grad_norm": 14.458390235900879, "learning_rate": 7.799799799799802e-06, "loss": 0.6371, "step": 21980 }, { "epoch": 66.04, "grad_norm": 13.121495246887207, "learning_rate": 7.798798798798799e-06, "loss": 0.5389, "step": 21990 }, { "epoch": 66.07, "grad_norm": 14.864715576171875, "learning_rate": 7.797797797797799e-06, "loss": 0.5968, "step": 22000 }, { "epoch": 66.1, "grad_norm": 10.540728569030762, "learning_rate": 7.796796796796797e-06, "loss": 0.5739, "step": 22010 }, { "epoch": 66.13, "grad_norm": 20.390697479248047, "learning_rate": 7.795795795795796e-06, "loss": 0.5319, "step": 22020 }, { "epoch": 66.16, "grad_norm": 19.64850616455078, "learning_rate": 7.794794794794796e-06, "loss": 0.5881, "step": 22030 }, { "epoch": 66.19, "grad_norm": 22.785123825073242, "learning_rate": 7.793793793793795e-06, "loss": 0.5249, "step": 22040 }, { "epoch": 66.22, "grad_norm": 17.400972366333008, "learning_rate": 7.792792792792793e-06, "loss": 0.5222, "step": 22050 }, { "epoch": 66.25, "grad_norm": 17.724403381347656, "learning_rate": 7.791791791791792e-06, "loss": 0.6046, "step": 22060 }, { "epoch": 66.28, "grad_norm": 10.515296936035156, "learning_rate": 7.79079079079079e-06, "loss": 0.5991, "step": 22070 }, { "epoch": 66.31, "grad_norm": 19.60765266418457, "learning_rate": 7.78978978978979e-06, "loss": 0.5919, "step": 22080 }, { "epoch": 66.34, "grad_norm": 20.284080505371094, "learning_rate": 7.78878878878879e-06, "loss": 0.5722, "step": 22090 }, { "epoch": 66.37, "grad_norm": 18.877843856811523, "learning_rate": 7.787787787787788e-06, "loss": 0.5648, "step": 22100 }, { "epoch": 66.4, "grad_norm": 15.429301261901855, "learning_rate": 7.786786786786787e-06, "loss": 0.5848, "step": 22110 }, { "epoch": 66.43, "grad_norm": 17.803064346313477, "learning_rate": 7.785785785785787e-06, "loss": 0.5631, "step": 22120 }, { "epoch": 66.46, "grad_norm": 14.009722709655762, "learning_rate": 7.784784784784786e-06, "loss": 0.563, "step": 22130 }, { "epoch": 66.49, "grad_norm": 13.050360679626465, "learning_rate": 7.783783783783784e-06, "loss": 0.5608, "step": 22140 }, { "epoch": 66.52, "grad_norm": 14.04554557800293, "learning_rate": 7.782782782782783e-06, "loss": 0.5285, "step": 22150 }, { "epoch": 66.55, "grad_norm": 14.689785957336426, "learning_rate": 7.781781781781781e-06, "loss": 0.5928, "step": 22160 }, { "epoch": 66.58, "grad_norm": 18.265892028808594, "learning_rate": 7.780780780780782e-06, "loss": 0.5736, "step": 22170 }, { "epoch": 66.61, "grad_norm": 12.933117866516113, "learning_rate": 7.77977977977978e-06, "loss": 0.5685, "step": 22180 }, { "epoch": 66.64, "grad_norm": 17.895145416259766, "learning_rate": 7.778778778778779e-06, "loss": 0.5729, "step": 22190 }, { "epoch": 66.67, "grad_norm": 19.048784255981445, "learning_rate": 7.77777777777778e-06, "loss": 0.6241, "step": 22200 }, { "epoch": 66.7, "grad_norm": 12.441827774047852, "learning_rate": 7.776776776776778e-06, "loss": 0.5249, "step": 22210 }, { "epoch": 66.73, "grad_norm": 12.524911880493164, "learning_rate": 7.775775775775776e-06, "loss": 0.5657, "step": 22220 }, { "epoch": 66.76, "grad_norm": 25.477313995361328, "learning_rate": 7.774774774774777e-06, "loss": 0.5484, "step": 22230 }, { "epoch": 66.79, "grad_norm": 15.488204002380371, "learning_rate": 7.773773773773774e-06, "loss": 0.5484, "step": 22240 }, { "epoch": 66.82, "grad_norm": 12.64756965637207, "learning_rate": 7.772772772772774e-06, "loss": 0.4966, "step": 22250 }, { "epoch": 66.85, "grad_norm": 20.53177261352539, "learning_rate": 7.771771771771772e-06, "loss": 0.6213, "step": 22260 }, { "epoch": 66.88, "grad_norm": 12.159225463867188, "learning_rate": 7.770770770770771e-06, "loss": 0.6016, "step": 22270 }, { "epoch": 66.91, "grad_norm": 20.43048858642578, "learning_rate": 7.769769769769771e-06, "loss": 0.5002, "step": 22280 }, { "epoch": 66.94, "grad_norm": 16.07718276977539, "learning_rate": 7.76876876876877e-06, "loss": 0.612, "step": 22290 }, { "epoch": 66.97, "grad_norm": 14.752755165100098, "learning_rate": 7.767767767767769e-06, "loss": 0.5301, "step": 22300 }, { "epoch": 67.0, "grad_norm": 13.626973152160645, "learning_rate": 7.766766766766767e-06, "loss": 0.5816, "step": 22310 }, { "epoch": 67.0, "eval_accuracy": 0.8662, "eval_loss": 0.46097758412361145, "eval_runtime": 12.9948, "eval_samples_per_second": 769.54, "eval_steps_per_second": 3.078, "step": 22311 }, { "epoch": 67.03, "grad_norm": 17.61612892150879, "learning_rate": 7.765765765765766e-06, "loss": 0.5121, "step": 22320 }, { "epoch": 67.06, "grad_norm": 14.522309303283691, "learning_rate": 7.764764764764764e-06, "loss": 0.5258, "step": 22330 }, { "epoch": 67.09, "grad_norm": 12.155220031738281, "learning_rate": 7.763763763763765e-06, "loss": 0.5355, "step": 22340 }, { "epoch": 67.12, "grad_norm": 17.316837310791016, "learning_rate": 7.762762762762763e-06, "loss": 0.5247, "step": 22350 }, { "epoch": 67.15, "grad_norm": 20.70302963256836, "learning_rate": 7.761761761761762e-06, "loss": 0.5918, "step": 22360 }, { "epoch": 67.18, "grad_norm": 15.649313926696777, "learning_rate": 7.760760760760762e-06, "loss": 0.5231, "step": 22370 }, { "epoch": 67.21, "grad_norm": 12.764701843261719, "learning_rate": 7.75975975975976e-06, "loss": 0.5763, "step": 22380 }, { "epoch": 67.24, "grad_norm": 22.652385711669922, "learning_rate": 7.75875875875876e-06, "loss": 0.5285, "step": 22390 }, { "epoch": 67.27, "grad_norm": 20.876888275146484, "learning_rate": 7.757757757757758e-06, "loss": 0.6043, "step": 22400 }, { "epoch": 67.3, "grad_norm": 16.850923538208008, "learning_rate": 7.756756756756756e-06, "loss": 0.6162, "step": 22410 }, { "epoch": 67.33, "grad_norm": 12.737711906433105, "learning_rate": 7.755755755755757e-06, "loss": 0.5823, "step": 22420 }, { "epoch": 67.36, "grad_norm": 15.0642728805542, "learning_rate": 7.754754754754755e-06, "loss": 0.5225, "step": 22430 }, { "epoch": 67.39, "grad_norm": 14.839635848999023, "learning_rate": 7.753753753753754e-06, "loss": 0.572, "step": 22440 }, { "epoch": 67.42, "grad_norm": 15.499557495117188, "learning_rate": 7.752752752752754e-06, "loss": 0.5427, "step": 22450 }, { "epoch": 67.45, "grad_norm": 21.27898597717285, "learning_rate": 7.751751751751753e-06, "loss": 0.5442, "step": 22460 }, { "epoch": 67.48, "grad_norm": 18.203609466552734, "learning_rate": 7.750750750750751e-06, "loss": 0.5685, "step": 22470 }, { "epoch": 67.51, "grad_norm": 14.603145599365234, "learning_rate": 7.749749749749752e-06, "loss": 0.6034, "step": 22480 }, { "epoch": 67.54, "grad_norm": 19.590543746948242, "learning_rate": 7.748748748748749e-06, "loss": 0.5384, "step": 22490 }, { "epoch": 67.57, "grad_norm": 17.51311683654785, "learning_rate": 7.747747747747749e-06, "loss": 0.5626, "step": 22500 }, { "epoch": 67.6, "grad_norm": 14.876330375671387, "learning_rate": 7.746746746746747e-06, "loss": 0.5592, "step": 22510 }, { "epoch": 67.63, "grad_norm": 22.465009689331055, "learning_rate": 7.745745745745746e-06, "loss": 0.5905, "step": 22520 }, { "epoch": 67.66, "grad_norm": 14.902283668518066, "learning_rate": 7.744744744744745e-06, "loss": 0.5325, "step": 22530 }, { "epoch": 67.69, "grad_norm": 12.203268051147461, "learning_rate": 7.743743743743745e-06, "loss": 0.482, "step": 22540 }, { "epoch": 67.72, "grad_norm": 17.009416580200195, "learning_rate": 7.742742742742744e-06, "loss": 0.5318, "step": 22550 }, { "epoch": 67.75, "grad_norm": 32.27000427246094, "learning_rate": 7.741741741741742e-06, "loss": 0.5071, "step": 22560 }, { "epoch": 67.78, "grad_norm": 14.510608673095703, "learning_rate": 7.74074074074074e-06, "loss": 0.5233, "step": 22570 }, { "epoch": 67.81, "grad_norm": 16.449337005615234, "learning_rate": 7.73973973973974e-06, "loss": 0.4803, "step": 22580 }, { "epoch": 67.84, "grad_norm": 19.513935089111328, "learning_rate": 7.73873873873874e-06, "loss": 0.6083, "step": 22590 }, { "epoch": 67.87, "grad_norm": 17.661775588989258, "learning_rate": 7.737737737737738e-06, "loss": 0.5604, "step": 22600 }, { "epoch": 67.9, "grad_norm": 19.88722038269043, "learning_rate": 7.736736736736737e-06, "loss": 0.5425, "step": 22610 }, { "epoch": 67.93, "grad_norm": 21.868816375732422, "learning_rate": 7.735735735735737e-06, "loss": 0.5772, "step": 22620 }, { "epoch": 67.96, "grad_norm": 17.84311294555664, "learning_rate": 7.734734734734736e-06, "loss": 0.5472, "step": 22630 }, { "epoch": 67.99, "grad_norm": 10.967425346374512, "learning_rate": 7.733733733733734e-06, "loss": 0.5262, "step": 22640 }, { "epoch": 68.0, "eval_accuracy": 0.8646, "eval_loss": 0.4631381928920746, "eval_runtime": 12.8581, "eval_samples_per_second": 777.719, "eval_steps_per_second": 3.111, "step": 22644 }, { "epoch": 68.02, "grad_norm": 25.05714225769043, "learning_rate": 7.732732732732733e-06, "loss": 0.5734, "step": 22650 }, { "epoch": 68.05, "grad_norm": 15.997255325317383, "learning_rate": 7.731731731731731e-06, "loss": 0.5517, "step": 22660 }, { "epoch": 68.08, "grad_norm": 14.824918746948242, "learning_rate": 7.730730730730732e-06, "loss": 0.5681, "step": 22670 }, { "epoch": 68.11, "grad_norm": 23.816343307495117, "learning_rate": 7.72972972972973e-06, "loss": 0.5502, "step": 22680 }, { "epoch": 68.14, "grad_norm": 18.046607971191406, "learning_rate": 7.728728728728729e-06, "loss": 0.5043, "step": 22690 }, { "epoch": 68.17, "grad_norm": 15.784799575805664, "learning_rate": 7.72772772772773e-06, "loss": 0.5441, "step": 22700 }, { "epoch": 68.2, "grad_norm": 11.996554374694824, "learning_rate": 7.726726726726728e-06, "loss": 0.5802, "step": 22710 }, { "epoch": 68.23, "grad_norm": 13.040654182434082, "learning_rate": 7.725725725725726e-06, "loss": 0.5738, "step": 22720 }, { "epoch": 68.26, "grad_norm": 18.27982521057129, "learning_rate": 7.724724724724727e-06, "loss": 0.5618, "step": 22730 }, { "epoch": 68.29, "grad_norm": 16.63296127319336, "learning_rate": 7.723723723723724e-06, "loss": 0.5308, "step": 22740 }, { "epoch": 68.32, "grad_norm": 15.186198234558105, "learning_rate": 7.722722722722722e-06, "loss": 0.5959, "step": 22750 }, { "epoch": 68.35, "grad_norm": 19.347002029418945, "learning_rate": 7.721721721721722e-06, "loss": 0.5802, "step": 22760 }, { "epoch": 68.38, "grad_norm": 15.88683032989502, "learning_rate": 7.720720720720721e-06, "loss": 0.5528, "step": 22770 }, { "epoch": 68.41, "grad_norm": 15.911577224731445, "learning_rate": 7.71971971971972e-06, "loss": 0.5677, "step": 22780 }, { "epoch": 68.44, "grad_norm": 11.782918930053711, "learning_rate": 7.71871871871872e-06, "loss": 0.5572, "step": 22790 }, { "epoch": 68.47, "grad_norm": 14.087546348571777, "learning_rate": 7.717717717717719e-06, "loss": 0.5194, "step": 22800 }, { "epoch": 68.5, "grad_norm": 16.74912452697754, "learning_rate": 7.716716716716717e-06, "loss": 0.5816, "step": 22810 }, { "epoch": 68.53, "grad_norm": 16.07416534423828, "learning_rate": 7.715715715715716e-06, "loss": 0.5926, "step": 22820 }, { "epoch": 68.56, "grad_norm": 13.526765823364258, "learning_rate": 7.714714714714714e-06, "loss": 0.528, "step": 22830 }, { "epoch": 68.59, "grad_norm": 15.224496841430664, "learning_rate": 7.713713713713715e-06, "loss": 0.5792, "step": 22840 }, { "epoch": 68.62, "grad_norm": 13.790152549743652, "learning_rate": 7.712712712712713e-06, "loss": 0.6411, "step": 22850 }, { "epoch": 68.65, "grad_norm": 14.932406425476074, "learning_rate": 7.711711711711712e-06, "loss": 0.5628, "step": 22860 }, { "epoch": 68.68, "grad_norm": 15.2655668258667, "learning_rate": 7.710710710710712e-06, "loss": 0.5397, "step": 22870 }, { "epoch": 68.71, "grad_norm": 19.84419822692871, "learning_rate": 7.70970970970971e-06, "loss": 0.5996, "step": 22880 }, { "epoch": 68.74, "grad_norm": 15.053720474243164, "learning_rate": 7.70870870870871e-06, "loss": 0.532, "step": 22890 }, { "epoch": 68.77, "grad_norm": 13.848394393920898, "learning_rate": 7.707707707707708e-06, "loss": 0.4833, "step": 22900 }, { "epoch": 68.8, "grad_norm": 18.321563720703125, "learning_rate": 7.706706706706707e-06, "loss": 0.5969, "step": 22910 }, { "epoch": 68.83, "grad_norm": 17.008100509643555, "learning_rate": 7.705705705705707e-06, "loss": 0.5224, "step": 22920 }, { "epoch": 68.86, "grad_norm": 9.902338981628418, "learning_rate": 7.704704704704705e-06, "loss": 0.5066, "step": 22930 }, { "epoch": 68.89, "grad_norm": 19.7772159576416, "learning_rate": 7.703703703703704e-06, "loss": 0.5297, "step": 22940 }, { "epoch": 68.92, "grad_norm": 17.880611419677734, "learning_rate": 7.702702702702704e-06, "loss": 0.5247, "step": 22950 }, { "epoch": 68.95, "grad_norm": 12.586747169494629, "learning_rate": 7.701701701701703e-06, "loss": 0.5011, "step": 22960 }, { "epoch": 68.98, "grad_norm": 18.35675621032715, "learning_rate": 7.700700700700701e-06, "loss": 0.5163, "step": 22970 }, { "epoch": 69.0, "eval_accuracy": 0.8677, "eval_loss": 0.45321527123451233, "eval_runtime": 12.8451, "eval_samples_per_second": 778.506, "eval_steps_per_second": 3.114, "step": 22977 }, { "epoch": 69.01, "grad_norm": 13.263096809387207, "learning_rate": 7.6996996996997e-06, "loss": 0.5317, "step": 22980 }, { "epoch": 69.04, "grad_norm": 22.147701263427734, "learning_rate": 7.698698698698699e-06, "loss": 0.5439, "step": 22990 }, { "epoch": 69.07, "grad_norm": 18.904695510864258, "learning_rate": 7.697697697697697e-06, "loss": 0.5396, "step": 23000 }, { "epoch": 69.1, "grad_norm": 14.03147029876709, "learning_rate": 7.696696696696698e-06, "loss": 0.5593, "step": 23010 }, { "epoch": 69.13, "grad_norm": 11.370265007019043, "learning_rate": 7.695695695695696e-06, "loss": 0.5129, "step": 23020 }, { "epoch": 69.16, "grad_norm": 13.232316017150879, "learning_rate": 7.694694694694695e-06, "loss": 0.573, "step": 23030 }, { "epoch": 69.19, "grad_norm": 19.535900115966797, "learning_rate": 7.693693693693695e-06, "loss": 0.5484, "step": 23040 }, { "epoch": 69.22, "grad_norm": 22.441991806030273, "learning_rate": 7.692692692692694e-06, "loss": 0.5777, "step": 23050 }, { "epoch": 69.25, "grad_norm": 14.082656860351562, "learning_rate": 7.691691691691692e-06, "loss": 0.4827, "step": 23060 }, { "epoch": 69.28, "grad_norm": 18.965608596801758, "learning_rate": 7.69069069069069e-06, "loss": 0.5555, "step": 23070 }, { "epoch": 69.31, "grad_norm": 21.62281608581543, "learning_rate": 7.68968968968969e-06, "loss": 0.4925, "step": 23080 }, { "epoch": 69.34, "grad_norm": 13.503475189208984, "learning_rate": 7.68868868868869e-06, "loss": 0.5216, "step": 23090 }, { "epoch": 69.37, "grad_norm": 18.784984588623047, "learning_rate": 7.687687687687688e-06, "loss": 0.526, "step": 23100 }, { "epoch": 69.4, "grad_norm": 18.865676879882812, "learning_rate": 7.686686686686687e-06, "loss": 0.553, "step": 23110 }, { "epoch": 69.43, "grad_norm": 16.839923858642578, "learning_rate": 7.685685685685687e-06, "loss": 0.5339, "step": 23120 }, { "epoch": 69.46, "grad_norm": 14.643489837646484, "learning_rate": 7.684684684684686e-06, "loss": 0.5262, "step": 23130 }, { "epoch": 69.49, "grad_norm": 15.389371871948242, "learning_rate": 7.683683683683684e-06, "loss": 0.5784, "step": 23140 }, { "epoch": 69.52, "grad_norm": 12.793866157531738, "learning_rate": 7.682682682682683e-06, "loss": 0.5171, "step": 23150 }, { "epoch": 69.55, "grad_norm": 11.249611854553223, "learning_rate": 7.681681681681682e-06, "loss": 0.5073, "step": 23160 }, { "epoch": 69.58, "grad_norm": 16.497753143310547, "learning_rate": 7.680680680680682e-06, "loss": 0.5207, "step": 23170 }, { "epoch": 69.61, "grad_norm": 16.578771591186523, "learning_rate": 7.67967967967968e-06, "loss": 0.5069, "step": 23180 }, { "epoch": 69.64, "grad_norm": 17.70914077758789, "learning_rate": 7.678678678678679e-06, "loss": 0.5507, "step": 23190 }, { "epoch": 69.67, "grad_norm": 14.014601707458496, "learning_rate": 7.67767767767768e-06, "loss": 0.5078, "step": 23200 }, { "epoch": 69.7, "grad_norm": 17.132427215576172, "learning_rate": 7.676676676676678e-06, "loss": 0.541, "step": 23210 }, { "epoch": 69.73, "grad_norm": 23.833972930908203, "learning_rate": 7.675675675675676e-06, "loss": 0.5316, "step": 23220 }, { "epoch": 69.76, "grad_norm": 15.896707534790039, "learning_rate": 7.674674674674675e-06, "loss": 0.5324, "step": 23230 }, { "epoch": 69.79, "grad_norm": 22.634628295898438, "learning_rate": 7.673673673673674e-06, "loss": 0.5618, "step": 23240 }, { "epoch": 69.82, "grad_norm": 20.63096809387207, "learning_rate": 7.672672672672672e-06, "loss": 0.5282, "step": 23250 }, { "epoch": 69.85, "grad_norm": 16.865144729614258, "learning_rate": 7.671671671671673e-06, "loss": 0.5597, "step": 23260 }, { "epoch": 69.88, "grad_norm": 12.029297828674316, "learning_rate": 7.670670670670671e-06, "loss": 0.52, "step": 23270 }, { "epoch": 69.91, "grad_norm": 19.67845344543457, "learning_rate": 7.66966966966967e-06, "loss": 0.582, "step": 23280 }, { "epoch": 69.94, "grad_norm": 10.350509643554688, "learning_rate": 7.66866866866867e-06, "loss": 0.5364, "step": 23290 }, { "epoch": 69.97, "grad_norm": 16.220821380615234, "learning_rate": 7.667667667667669e-06, "loss": 0.5446, "step": 23300 }, { "epoch": 70.0, "grad_norm": 57.43446350097656, "learning_rate": 7.666666666666667e-06, "loss": 0.5231, "step": 23310 }, { "epoch": 70.0, "eval_accuracy": 0.867, "eval_loss": 0.46347084641456604, "eval_runtime": 13.0652, "eval_samples_per_second": 765.395, "eval_steps_per_second": 3.062, "step": 23310 }, { "epoch": 70.03, "grad_norm": 15.071653366088867, "learning_rate": 7.665665665665666e-06, "loss": 0.5315, "step": 23320 }, { "epoch": 70.06, "grad_norm": 33.71089553833008, "learning_rate": 7.664664664664664e-06, "loss": 0.5371, "step": 23330 }, { "epoch": 70.09, "grad_norm": 18.549217224121094, "learning_rate": 7.663663663663665e-06, "loss": 0.5398, "step": 23340 }, { "epoch": 70.12, "grad_norm": 19.0047550201416, "learning_rate": 7.662662662662663e-06, "loss": 0.5, "step": 23350 }, { "epoch": 70.15, "grad_norm": 12.97152042388916, "learning_rate": 7.661661661661662e-06, "loss": 0.5523, "step": 23360 }, { "epoch": 70.18, "grad_norm": 16.551597595214844, "learning_rate": 7.660660660660662e-06, "loss": 0.5394, "step": 23370 }, { "epoch": 70.21, "grad_norm": 17.33650779724121, "learning_rate": 7.65965965965966e-06, "loss": 0.5095, "step": 23380 }, { "epoch": 70.24, "grad_norm": 20.949827194213867, "learning_rate": 7.65865865865866e-06, "loss": 0.5707, "step": 23390 }, { "epoch": 70.27, "grad_norm": 17.115488052368164, "learning_rate": 7.657657657657658e-06, "loss": 0.5053, "step": 23400 }, { "epoch": 70.3, "grad_norm": 18.152801513671875, "learning_rate": 7.656656656656657e-06, "loss": 0.5563, "step": 23410 }, { "epoch": 70.33, "grad_norm": 24.085355758666992, "learning_rate": 7.655655655655657e-06, "loss": 0.5389, "step": 23420 }, { "epoch": 70.36, "grad_norm": 18.19046401977539, "learning_rate": 7.654654654654655e-06, "loss": 0.5402, "step": 23430 }, { "epoch": 70.39, "grad_norm": 19.540538787841797, "learning_rate": 7.653653653653654e-06, "loss": 0.5509, "step": 23440 }, { "epoch": 70.42, "grad_norm": 18.179523468017578, "learning_rate": 7.652652652652653e-06, "loss": 0.5563, "step": 23450 }, { "epoch": 70.45, "grad_norm": 11.197782516479492, "learning_rate": 7.651651651651653e-06, "loss": 0.4984, "step": 23460 }, { "epoch": 70.48, "grad_norm": 18.672710418701172, "learning_rate": 7.650650650650652e-06, "loss": 0.553, "step": 23470 }, { "epoch": 70.51, "grad_norm": 20.223995208740234, "learning_rate": 7.64964964964965e-06, "loss": 0.54, "step": 23480 }, { "epoch": 70.54, "grad_norm": 19.691944122314453, "learning_rate": 7.648648648648649e-06, "loss": 0.6133, "step": 23490 }, { "epoch": 70.57, "grad_norm": 13.218948364257812, "learning_rate": 7.647647647647647e-06, "loss": 0.536, "step": 23500 }, { "epoch": 70.6, "grad_norm": 18.09910774230957, "learning_rate": 7.646646646646648e-06, "loss": 0.5601, "step": 23510 }, { "epoch": 70.63, "grad_norm": 15.12651252746582, "learning_rate": 7.645645645645646e-06, "loss": 0.5298, "step": 23520 }, { "epoch": 70.66, "grad_norm": 13.266009330749512, "learning_rate": 7.644644644644645e-06, "loss": 0.5261, "step": 23530 }, { "epoch": 70.69, "grad_norm": 15.739737510681152, "learning_rate": 7.643643643643645e-06, "loss": 0.5563, "step": 23540 }, { "epoch": 70.72, "grad_norm": 19.626802444458008, "learning_rate": 7.642642642642644e-06, "loss": 0.5593, "step": 23550 }, { "epoch": 70.75, "grad_norm": 15.771201133728027, "learning_rate": 7.641641641641642e-06, "loss": 0.4422, "step": 23560 }, { "epoch": 70.78, "grad_norm": 12.177371978759766, "learning_rate": 7.640640640640641e-06, "loss": 0.5231, "step": 23570 }, { "epoch": 70.81, "grad_norm": 17.018787384033203, "learning_rate": 7.63963963963964e-06, "loss": 0.5769, "step": 23580 }, { "epoch": 70.84, "grad_norm": 13.767796516418457, "learning_rate": 7.63863863863864e-06, "loss": 0.5271, "step": 23590 }, { "epoch": 70.87, "grad_norm": 13.441535949707031, "learning_rate": 7.637637637637638e-06, "loss": 0.5156, "step": 23600 }, { "epoch": 70.9, "grad_norm": 8.215331077575684, "learning_rate": 7.636636636636637e-06, "loss": 0.506, "step": 23610 }, { "epoch": 70.93, "grad_norm": 16.722057342529297, "learning_rate": 7.635635635635637e-06, "loss": 0.611, "step": 23620 }, { "epoch": 70.96, "grad_norm": 13.369221687316895, "learning_rate": 7.634634634634636e-06, "loss": 0.5239, "step": 23630 }, { "epoch": 70.99, "grad_norm": 18.794593811035156, "learning_rate": 7.633633633633634e-06, "loss": 0.5672, "step": 23640 }, { "epoch": 71.0, "eval_accuracy": 0.8668, "eval_loss": 0.4625888764858246, "eval_runtime": 12.8291, "eval_samples_per_second": 779.477, "eval_steps_per_second": 3.118, "step": 23643 }, { "epoch": 71.02, "grad_norm": 14.65259075164795, "learning_rate": 7.632632632632633e-06, "loss": 0.5242, "step": 23650 }, { "epoch": 71.05, "grad_norm": 14.861987113952637, "learning_rate": 7.631631631631632e-06, "loss": 0.525, "step": 23660 }, { "epoch": 71.08, "grad_norm": 17.436002731323242, "learning_rate": 7.63063063063063e-06, "loss": 0.5318, "step": 23670 }, { "epoch": 71.11, "grad_norm": 16.081445693969727, "learning_rate": 7.62962962962963e-06, "loss": 0.5621, "step": 23680 }, { "epoch": 71.14, "grad_norm": 14.570060729980469, "learning_rate": 7.628628628628629e-06, "loss": 0.545, "step": 23690 }, { "epoch": 71.17, "grad_norm": 22.60666847229004, "learning_rate": 7.6276276276276285e-06, "loss": 0.5597, "step": 23700 }, { "epoch": 71.2, "grad_norm": 15.404147148132324, "learning_rate": 7.626626626626628e-06, "loss": 0.5265, "step": 23710 }, { "epoch": 71.23, "grad_norm": 14.775761604309082, "learning_rate": 7.6256256256256266e-06, "loss": 0.5357, "step": 23720 }, { "epoch": 71.26, "grad_norm": 10.40296745300293, "learning_rate": 7.624624624624624e-06, "loss": 0.5307, "step": 23730 }, { "epoch": 71.29, "grad_norm": 21.786359786987305, "learning_rate": 7.623623623623624e-06, "loss": 0.5797, "step": 23740 }, { "epoch": 71.32, "grad_norm": 15.039482116699219, "learning_rate": 7.622622622622623e-06, "loss": 0.4593, "step": 23750 }, { "epoch": 71.35, "grad_norm": 16.6463623046875, "learning_rate": 7.621621621621622e-06, "loss": 0.5371, "step": 23760 }, { "epoch": 71.38, "grad_norm": 18.40976333618164, "learning_rate": 7.620620620620621e-06, "loss": 0.5448, "step": 23770 }, { "epoch": 71.41, "grad_norm": 18.807512283325195, "learning_rate": 7.619619619619621e-06, "loss": 0.5824, "step": 23780 }, { "epoch": 71.44, "grad_norm": 18.5842342376709, "learning_rate": 7.618618618618619e-06, "loss": 0.5171, "step": 23790 }, { "epoch": 71.47, "grad_norm": 23.46646499633789, "learning_rate": 7.617617617617619e-06, "loss": 0.5398, "step": 23800 }, { "epoch": 71.5, "grad_norm": 12.604337692260742, "learning_rate": 7.616616616616618e-06, "loss": 0.5405, "step": 23810 }, { "epoch": 71.53, "grad_norm": 19.489194869995117, "learning_rate": 7.615615615615616e-06, "loss": 0.5279, "step": 23820 }, { "epoch": 71.56, "grad_norm": 17.195911407470703, "learning_rate": 7.614614614614615e-06, "loss": 0.5924, "step": 23830 }, { "epoch": 71.59, "grad_norm": 16.526309967041016, "learning_rate": 7.613613613613614e-06, "loss": 0.5628, "step": 23840 }, { "epoch": 71.62, "grad_norm": 19.324848175048828, "learning_rate": 7.612612612612613e-06, "loss": 0.5232, "step": 23850 }, { "epoch": 71.65, "grad_norm": 17.28827476501465, "learning_rate": 7.611611611611612e-06, "loss": 0.5247, "step": 23860 }, { "epoch": 71.68, "grad_norm": 16.0518798828125, "learning_rate": 7.610610610610611e-06, "loss": 0.5137, "step": 23870 }, { "epoch": 71.71, "grad_norm": 15.90246295928955, "learning_rate": 7.609609609609611e-06, "loss": 0.5948, "step": 23880 }, { "epoch": 71.74, "grad_norm": 13.580601692199707, "learning_rate": 7.6086086086086095e-06, "loss": 0.5067, "step": 23890 }, { "epoch": 71.77, "grad_norm": 16.561983108520508, "learning_rate": 7.607607607607608e-06, "loss": 0.5639, "step": 23900 }, { "epoch": 71.8, "grad_norm": 22.43711280822754, "learning_rate": 7.606606606606607e-06, "loss": 0.5548, "step": 23910 }, { "epoch": 71.83, "grad_norm": 15.016554832458496, "learning_rate": 7.605605605605606e-06, "loss": 0.5917, "step": 23920 }, { "epoch": 71.86, "grad_norm": 15.802587509155273, "learning_rate": 7.6046046046046055e-06, "loss": 0.6077, "step": 23930 }, { "epoch": 71.89, "grad_norm": 12.077566146850586, "learning_rate": 7.603603603603604e-06, "loss": 0.486, "step": 23940 }, { "epoch": 71.92, "grad_norm": 17.41840171813965, "learning_rate": 7.6026026026026036e-06, "loss": 0.5488, "step": 23950 }, { "epoch": 71.95, "grad_norm": 17.22233009338379, "learning_rate": 7.601601601601602e-06, "loss": 0.4889, "step": 23960 }, { "epoch": 71.98, "grad_norm": 14.471308708190918, "learning_rate": 7.600600600600602e-06, "loss": 0.501, "step": 23970 }, { "epoch": 72.0, "eval_accuracy": 0.8677, "eval_loss": 0.4600684642791748, "eval_runtime": 12.7849, "eval_samples_per_second": 782.172, "eval_steps_per_second": 3.129, "step": 23976 }, { "epoch": 72.01, "grad_norm": 15.971799850463867, "learning_rate": 7.599599599599599e-06, "loss": 0.5027, "step": 23980 }, { "epoch": 72.04, "grad_norm": 15.979533195495605, "learning_rate": 7.598598598598599e-06, "loss": 0.5537, "step": 23990 }, { "epoch": 72.07, "grad_norm": 16.601274490356445, "learning_rate": 7.597597597597598e-06, "loss": 0.4777, "step": 24000 }, { "epoch": 72.1, "grad_norm": 13.306760787963867, "learning_rate": 7.596596596596597e-06, "loss": 0.5637, "step": 24010 }, { "epoch": 72.13, "grad_norm": 19.46541976928711, "learning_rate": 7.595595595595596e-06, "loss": 0.5209, "step": 24020 }, { "epoch": 72.16, "grad_norm": 10.344480514526367, "learning_rate": 7.594594594594596e-06, "loss": 0.5183, "step": 24030 }, { "epoch": 72.19, "grad_norm": 15.620344161987305, "learning_rate": 7.593593593593594e-06, "loss": 0.5722, "step": 24040 }, { "epoch": 72.22, "grad_norm": 16.568490982055664, "learning_rate": 7.592592592592594e-06, "loss": 0.5324, "step": 24050 }, { "epoch": 72.25, "grad_norm": 11.025808334350586, "learning_rate": 7.591591591591592e-06, "loss": 0.5042, "step": 24060 }, { "epoch": 72.28, "grad_norm": 14.100255012512207, "learning_rate": 7.590590590590591e-06, "loss": 0.5028, "step": 24070 }, { "epoch": 72.31, "grad_norm": 11.300607681274414, "learning_rate": 7.5895895895895895e-06, "loss": 0.4996, "step": 24080 }, { "epoch": 72.34, "grad_norm": 15.547744750976562, "learning_rate": 7.588588588588589e-06, "loss": 0.549, "step": 24090 }, { "epoch": 72.37, "grad_norm": 17.27249526977539, "learning_rate": 7.587587587587588e-06, "loss": 0.5353, "step": 24100 }, { "epoch": 72.4, "grad_norm": 11.9943208694458, "learning_rate": 7.586586586586587e-06, "loss": 0.5581, "step": 24110 }, { "epoch": 72.43, "grad_norm": 9.500299453735352, "learning_rate": 7.5855855855855865e-06, "loss": 0.5421, "step": 24120 }, { "epoch": 72.46, "grad_norm": 17.034353256225586, "learning_rate": 7.584584584584586e-06, "loss": 0.5442, "step": 24130 }, { "epoch": 72.49, "grad_norm": 12.58971881866455, "learning_rate": 7.5835835835835845e-06, "loss": 0.5692, "step": 24140 }, { "epoch": 72.52, "grad_norm": 13.629549980163574, "learning_rate": 7.582582582582583e-06, "loss": 0.561, "step": 24150 }, { "epoch": 72.55, "grad_norm": 13.327164649963379, "learning_rate": 7.581581581581582e-06, "loss": 0.5717, "step": 24160 }, { "epoch": 72.58, "grad_norm": 14.426154136657715, "learning_rate": 7.580580580580581e-06, "loss": 0.5369, "step": 24170 }, { "epoch": 72.61, "grad_norm": 12.07056999206543, "learning_rate": 7.57957957957958e-06, "loss": 0.5526, "step": 24180 }, { "epoch": 72.64, "grad_norm": 13.597389221191406, "learning_rate": 7.578578578578579e-06, "loss": 0.5784, "step": 24190 }, { "epoch": 72.67, "grad_norm": 19.362857818603516, "learning_rate": 7.577577577577579e-06, "loss": 0.5699, "step": 24200 }, { "epoch": 72.7, "grad_norm": 15.222396850585938, "learning_rate": 7.576576576576577e-06, "loss": 0.4642, "step": 24210 }, { "epoch": 72.73, "grad_norm": 21.673702239990234, "learning_rate": 7.575575575575577e-06, "loss": 0.5819, "step": 24220 }, { "epoch": 72.76, "grad_norm": 18.147018432617188, "learning_rate": 7.574574574574574e-06, "loss": 0.5674, "step": 24230 }, { "epoch": 72.79, "grad_norm": 12.631148338317871, "learning_rate": 7.573573573573574e-06, "loss": 0.5157, "step": 24240 }, { "epoch": 72.82, "grad_norm": 14.27245044708252, "learning_rate": 7.572572572572573e-06, "loss": 0.5214, "step": 24250 }, { "epoch": 72.85, "grad_norm": 15.539676666259766, "learning_rate": 7.571571571571572e-06, "loss": 0.5118, "step": 24260 }, { "epoch": 72.88, "grad_norm": 15.721545219421387, "learning_rate": 7.570570570570571e-06, "loss": 0.5546, "step": 24270 }, { "epoch": 72.91, "grad_norm": 13.978747367858887, "learning_rate": 7.569569569569571e-06, "loss": 0.5183, "step": 24280 }, { "epoch": 72.94, "grad_norm": 15.935824394226074, "learning_rate": 7.568568568568569e-06, "loss": 0.5509, "step": 24290 }, { "epoch": 72.97, "grad_norm": 12.331801414489746, "learning_rate": 7.567567567567569e-06, "loss": 0.527, "step": 24300 }, { "epoch": 73.0, "eval_accuracy": 0.8644, "eval_loss": 0.4660574793815613, "eval_runtime": 12.9139, "eval_samples_per_second": 774.361, "eval_steps_per_second": 3.097, "step": 24309 }, { "epoch": 73.0, "grad_norm": 15.559767723083496, "learning_rate": 7.566566566566567e-06, "loss": 0.4842, "step": 24310 }, { "epoch": 73.03, "grad_norm": 16.145465850830078, "learning_rate": 7.565565565565566e-06, "loss": 0.5281, "step": 24320 }, { "epoch": 73.06, "grad_norm": 14.819173812866211, "learning_rate": 7.5645645645645646e-06, "loss": 0.5776, "step": 24330 }, { "epoch": 73.09, "grad_norm": 17.449100494384766, "learning_rate": 7.563563563563564e-06, "loss": 0.506, "step": 24340 }, { "epoch": 73.12, "grad_norm": 17.26336669921875, "learning_rate": 7.5625625625625634e-06, "loss": 0.5094, "step": 24350 }, { "epoch": 73.15, "grad_norm": 18.2810001373291, "learning_rate": 7.561561561561562e-06, "loss": 0.5599, "step": 24360 }, { "epoch": 73.18, "grad_norm": 16.956859588623047, "learning_rate": 7.5605605605605615e-06, "loss": 0.5338, "step": 24370 }, { "epoch": 73.21, "grad_norm": 13.581554412841797, "learning_rate": 7.559559559559561e-06, "loss": 0.5648, "step": 24380 }, { "epoch": 73.24, "grad_norm": 11.054035186767578, "learning_rate": 7.5585585585585595e-06, "loss": 0.5501, "step": 24390 }, { "epoch": 73.27, "grad_norm": 17.458660125732422, "learning_rate": 7.557557557557558e-06, "loss": 0.5212, "step": 24400 }, { "epoch": 73.3, "grad_norm": 11.59261417388916, "learning_rate": 7.556556556556557e-06, "loss": 0.559, "step": 24410 }, { "epoch": 73.33, "grad_norm": 16.86316680908203, "learning_rate": 7.555555555555556e-06, "loss": 0.58, "step": 24420 }, { "epoch": 73.36, "grad_norm": 19.530414581298828, "learning_rate": 7.554554554554555e-06, "loss": 0.5764, "step": 24430 }, { "epoch": 73.39, "grad_norm": 12.657683372497559, "learning_rate": 7.553553553553554e-06, "loss": 0.5493, "step": 24440 }, { "epoch": 73.42, "grad_norm": 14.319578170776367, "learning_rate": 7.552552552552554e-06, "loss": 0.5061, "step": 24450 }, { "epoch": 73.45, "grad_norm": 15.400496482849121, "learning_rate": 7.551551551551552e-06, "loss": 0.5655, "step": 24460 }, { "epoch": 73.48, "grad_norm": 20.86901092529297, "learning_rate": 7.550550550550552e-06, "loss": 0.5578, "step": 24470 }, { "epoch": 73.51, "grad_norm": 15.07816219329834, "learning_rate": 7.549549549549549e-06, "loss": 0.5714, "step": 24480 }, { "epoch": 73.54, "grad_norm": 17.34552764892578, "learning_rate": 7.548548548548549e-06, "loss": 0.5304, "step": 24490 }, { "epoch": 73.57, "grad_norm": 14.773884773254395, "learning_rate": 7.547547547547548e-06, "loss": 0.5399, "step": 24500 }, { "epoch": 73.6, "grad_norm": 19.51259422302246, "learning_rate": 7.546546546546547e-06, "loss": 0.5351, "step": 24510 }, { "epoch": 73.63, "grad_norm": 16.412368774414062, "learning_rate": 7.545545545545546e-06, "loss": 0.4782, "step": 24520 }, { "epoch": 73.66, "grad_norm": 15.846576690673828, "learning_rate": 7.544544544544545e-06, "loss": 0.5483, "step": 24530 }, { "epoch": 73.69, "grad_norm": 15.892062187194824, "learning_rate": 7.543543543543544e-06, "loss": 0.5103, "step": 24540 }, { "epoch": 73.72, "grad_norm": 14.08358383178711, "learning_rate": 7.542542542542544e-06, "loss": 0.5717, "step": 24550 }, { "epoch": 73.75, "grad_norm": 14.289830207824707, "learning_rate": 7.5415415415415416e-06, "loss": 0.5413, "step": 24560 }, { "epoch": 73.78, "grad_norm": 16.19147300720215, "learning_rate": 7.540540540540541e-06, "loss": 0.5163, "step": 24570 }, { "epoch": 73.81, "grad_norm": 13.138388633728027, "learning_rate": 7.53953953953954e-06, "loss": 0.5051, "step": 24580 }, { "epoch": 73.84, "grad_norm": 20.523103713989258, "learning_rate": 7.538538538538539e-06, "loss": 0.5301, "step": 24590 }, { "epoch": 73.87, "grad_norm": 13.263509750366211, "learning_rate": 7.5375375375375385e-06, "loss": 0.5616, "step": 24600 }, { "epoch": 73.9, "grad_norm": 12.0683012008667, "learning_rate": 7.536536536536537e-06, "loss": 0.5498, "step": 24610 }, { "epoch": 73.93, "grad_norm": 20.918397903442383, "learning_rate": 7.5355355355355365e-06, "loss": 0.5083, "step": 24620 }, { "epoch": 73.96, "grad_norm": 20.743820190429688, "learning_rate": 7.534534534534535e-06, "loss": 0.5541, "step": 24630 }, { "epoch": 73.99, "grad_norm": 26.155380249023438, "learning_rate": 7.5335335335335346e-06, "loss": 0.5618, "step": 24640 }, { "epoch": 74.0, "eval_accuracy": 0.8664, "eval_loss": 0.46769315004348755, "eval_runtime": 13.034, "eval_samples_per_second": 767.225, "eval_steps_per_second": 3.069, "step": 24642 }, { "epoch": 74.02, "grad_norm": 14.723753929138184, "learning_rate": 7.532532532532532e-06, "loss": 0.5415, "step": 24650 }, { "epoch": 74.05, "grad_norm": 14.85951042175293, "learning_rate": 7.531531531531532e-06, "loss": 0.4975, "step": 24660 }, { "epoch": 74.08, "grad_norm": 15.907417297363281, "learning_rate": 7.530530530530531e-06, "loss": 0.5815, "step": 24670 }, { "epoch": 74.11, "grad_norm": 18.325946807861328, "learning_rate": 7.52952952952953e-06, "loss": 0.5234, "step": 24680 }, { "epoch": 74.14, "grad_norm": 13.661330223083496, "learning_rate": 7.528528528528529e-06, "loss": 0.5401, "step": 24690 }, { "epoch": 74.17, "grad_norm": 17.674766540527344, "learning_rate": 7.527527527527529e-06, "loss": 0.5339, "step": 24700 }, { "epoch": 74.2, "grad_norm": 14.289420127868652, "learning_rate": 7.526526526526527e-06, "loss": 0.5384, "step": 24710 }, { "epoch": 74.23, "grad_norm": 15.812458038330078, "learning_rate": 7.525525525525527e-06, "loss": 0.541, "step": 24720 }, { "epoch": 74.26, "grad_norm": 13.972175598144531, "learning_rate": 7.5245245245245245e-06, "loss": 0.5716, "step": 24730 }, { "epoch": 74.29, "grad_norm": 14.59525203704834, "learning_rate": 7.523523523523524e-06, "loss": 0.5133, "step": 24740 }, { "epoch": 74.32, "grad_norm": 15.350430488586426, "learning_rate": 7.5225225225225225e-06, "loss": 0.5353, "step": 24750 }, { "epoch": 74.35, "grad_norm": 18.458045959472656, "learning_rate": 7.521521521521522e-06, "loss": 0.5476, "step": 24760 }, { "epoch": 74.38, "grad_norm": 16.9620361328125, "learning_rate": 7.520520520520521e-06, "loss": 0.5185, "step": 24770 }, { "epoch": 74.41, "grad_norm": 19.114097595214844, "learning_rate": 7.51951951951952e-06, "loss": 0.5245, "step": 24780 }, { "epoch": 74.44, "grad_norm": 17.223175048828125, "learning_rate": 7.518518518518519e-06, "loss": 0.5483, "step": 24790 }, { "epoch": 74.47, "grad_norm": 19.7282772064209, "learning_rate": 7.517517517517519e-06, "loss": 0.511, "step": 24800 }, { "epoch": 74.5, "grad_norm": 13.930578231811523, "learning_rate": 7.516516516516517e-06, "loss": 0.5263, "step": 24810 }, { "epoch": 74.53, "grad_norm": 12.889962196350098, "learning_rate": 7.515515515515516e-06, "loss": 0.5838, "step": 24820 }, { "epoch": 74.56, "grad_norm": 13.918925285339355, "learning_rate": 7.514514514514515e-06, "loss": 0.5182, "step": 24830 }, { "epoch": 74.59, "grad_norm": 13.224120140075684, "learning_rate": 7.513513513513514e-06, "loss": 0.5643, "step": 24840 }, { "epoch": 74.62, "grad_norm": 13.113409996032715, "learning_rate": 7.5125125125125135e-06, "loss": 0.5046, "step": 24850 }, { "epoch": 74.65, "grad_norm": 13.158830642700195, "learning_rate": 7.511511511511512e-06, "loss": 0.4659, "step": 24860 }, { "epoch": 74.68, "grad_norm": 14.555266380310059, "learning_rate": 7.5105105105105116e-06, "loss": 0.4796, "step": 24870 }, { "epoch": 74.71, "grad_norm": 15.64537239074707, "learning_rate": 7.50950950950951e-06, "loss": 0.5034, "step": 24880 }, { "epoch": 74.74, "grad_norm": 20.45342254638672, "learning_rate": 7.50850850850851e-06, "loss": 0.5726, "step": 24890 }, { "epoch": 74.77, "grad_norm": 13.022698402404785, "learning_rate": 7.507507507507507e-06, "loss": 0.5209, "step": 24900 }, { "epoch": 74.8, "grad_norm": 22.128602981567383, "learning_rate": 7.506506506506507e-06, "loss": 0.6244, "step": 24910 }, { "epoch": 74.83, "grad_norm": 13.853561401367188, "learning_rate": 7.505505505505506e-06, "loss": 0.5215, "step": 24920 }, { "epoch": 74.86, "grad_norm": 14.449660301208496, "learning_rate": 7.504504504504505e-06, "loss": 0.5369, "step": 24930 }, { "epoch": 74.89, "grad_norm": 16.451210021972656, "learning_rate": 7.503503503503504e-06, "loss": 0.537, "step": 24940 }, { "epoch": 74.92, "grad_norm": 17.204198837280273, "learning_rate": 7.502502502502504e-06, "loss": 0.4918, "step": 24950 }, { "epoch": 74.95, "grad_norm": 13.26692008972168, "learning_rate": 7.501501501501502e-06, "loss": 0.5672, "step": 24960 }, { "epoch": 74.98, "grad_norm": 11.932439804077148, "learning_rate": 7.500500500500502e-06, "loss": 0.5161, "step": 24970 }, { "epoch": 75.0, "eval_accuracy": 0.8691, "eval_loss": 0.4629597067832947, "eval_runtime": 12.3855, "eval_samples_per_second": 807.397, "eval_steps_per_second": 3.23, "step": 24975 }, { "epoch": 75.02, "grad_norm": 20.42967414855957, "learning_rate": 7.4994994994994995e-06, "loss": 0.4581, "step": 24980 }, { "epoch": 75.05, "grad_norm": 13.88707160949707, "learning_rate": 7.498498498498499e-06, "loss": 0.4407, "step": 24990 }, { "epoch": 75.08, "grad_norm": 12.693586349487305, "learning_rate": 7.4974974974974975e-06, "loss": 0.5471, "step": 25000 }, { "epoch": 75.11, "grad_norm": 18.46637725830078, "learning_rate": 7.496496496496497e-06, "loss": 0.5798, "step": 25010 }, { "epoch": 75.14, "grad_norm": 22.856239318847656, "learning_rate": 7.495495495495496e-06, "loss": 0.496, "step": 25020 }, { "epoch": 75.17, "grad_norm": 15.663064002990723, "learning_rate": 7.494494494494495e-06, "loss": 0.5584, "step": 25030 }, { "epoch": 75.2, "grad_norm": 21.31612205505371, "learning_rate": 7.4934934934934944e-06, "loss": 0.4742, "step": 25040 }, { "epoch": 75.23, "grad_norm": 13.868929862976074, "learning_rate": 7.492492492492494e-06, "loss": 0.5257, "step": 25050 }, { "epoch": 75.26, "grad_norm": 13.549492835998535, "learning_rate": 7.491491491491492e-06, "loss": 0.5409, "step": 25060 }, { "epoch": 75.29, "grad_norm": 13.97533130645752, "learning_rate": 7.490490490490491e-06, "loss": 0.5411, "step": 25070 }, { "epoch": 75.32, "grad_norm": 14.013882637023926, "learning_rate": 7.48948948948949e-06, "loss": 0.5462, "step": 25080 }, { "epoch": 75.35, "grad_norm": 16.790821075439453, "learning_rate": 7.488488488488489e-06, "loss": 0.5867, "step": 25090 }, { "epoch": 75.38, "grad_norm": 20.78099250793457, "learning_rate": 7.487487487487488e-06, "loss": 0.5547, "step": 25100 }, { "epoch": 75.41, "grad_norm": 11.377620697021484, "learning_rate": 7.486486486486487e-06, "loss": 0.5413, "step": 25110 }, { "epoch": 75.44, "grad_norm": 27.481689453125, "learning_rate": 7.485485485485487e-06, "loss": 0.5153, "step": 25120 }, { "epoch": 75.47, "grad_norm": 13.65371036529541, "learning_rate": 7.484484484484485e-06, "loss": 0.5683, "step": 25130 }, { "epoch": 75.5, "grad_norm": 12.149045944213867, "learning_rate": 7.483483483483485e-06, "loss": 0.5496, "step": 25140 }, { "epoch": 75.53, "grad_norm": 13.61082935333252, "learning_rate": 7.482482482482482e-06, "loss": 0.4959, "step": 25150 }, { "epoch": 75.56, "grad_norm": 20.277263641357422, "learning_rate": 7.481481481481482e-06, "loss": 0.4964, "step": 25160 }, { "epoch": 75.59, "grad_norm": 13.979513168334961, "learning_rate": 7.480480480480481e-06, "loss": 0.5109, "step": 25170 }, { "epoch": 75.62, "grad_norm": 14.793264389038086, "learning_rate": 7.47947947947948e-06, "loss": 0.5199, "step": 25180 }, { "epoch": 75.65, "grad_norm": 15.40218448638916, "learning_rate": 7.478478478478479e-06, "loss": 0.517, "step": 25190 }, { "epoch": 75.68, "grad_norm": 10.543524742126465, "learning_rate": 7.477477477477479e-06, "loss": 0.582, "step": 25200 }, { "epoch": 75.71, "grad_norm": 28.832815170288086, "learning_rate": 7.476476476476477e-06, "loss": 0.5057, "step": 25210 }, { "epoch": 75.74, "grad_norm": 12.903115272521973, "learning_rate": 7.475475475475477e-06, "loss": 0.5065, "step": 25220 }, { "epoch": 75.77, "grad_norm": 24.768508911132812, "learning_rate": 7.4744744744744745e-06, "loss": 0.5163, "step": 25230 }, { "epoch": 75.8, "grad_norm": 11.10902214050293, "learning_rate": 7.473473473473474e-06, "loss": 0.5206, "step": 25240 }, { "epoch": 75.83, "grad_norm": 10.880926132202148, "learning_rate": 7.4724724724724726e-06, "loss": 0.49, "step": 25250 }, { "epoch": 75.86, "grad_norm": 20.442588806152344, "learning_rate": 7.471471471471472e-06, "loss": 0.5405, "step": 25260 }, { "epoch": 75.89, "grad_norm": 15.126239776611328, "learning_rate": 7.4704704704704714e-06, "loss": 0.4991, "step": 25270 }, { "epoch": 75.92, "grad_norm": 15.257558822631836, "learning_rate": 7.46946946946947e-06, "loss": 0.5766, "step": 25280 }, { "epoch": 75.95, "grad_norm": 14.302605628967285, "learning_rate": 7.4684684684684695e-06, "loss": 0.5137, "step": 25290 }, { "epoch": 75.98, "grad_norm": 15.673317909240723, "learning_rate": 7.467467467467469e-06, "loss": 0.5158, "step": 25300 }, { "epoch": 76.0, "eval_accuracy": 0.8671, "eval_loss": 0.46911802887916565, "eval_runtime": 12.7971, "eval_samples_per_second": 781.425, "eval_steps_per_second": 3.126, "step": 25308 }, { "epoch": 76.01, "grad_norm": 14.649646759033203, "learning_rate": 7.466466466466467e-06, "loss": 0.4862, "step": 25310 }, { "epoch": 76.04, "grad_norm": 13.316688537597656, "learning_rate": 7.465465465465466e-06, "loss": 0.4907, "step": 25320 }, { "epoch": 76.07, "grad_norm": 18.16513442993164, "learning_rate": 7.464464464464465e-06, "loss": 0.5469, "step": 25330 }, { "epoch": 76.1, "grad_norm": 18.091474533081055, "learning_rate": 7.463463463463464e-06, "loss": 0.5662, "step": 25340 }, { "epoch": 76.13, "grad_norm": 13.058123588562012, "learning_rate": 7.462462462462463e-06, "loss": 0.5114, "step": 25350 }, { "epoch": 76.16, "grad_norm": 32.16606140136719, "learning_rate": 7.461461461461462e-06, "loss": 0.5055, "step": 25360 }, { "epoch": 76.19, "grad_norm": 10.640914916992188, "learning_rate": 7.460460460460462e-06, "loss": 0.5011, "step": 25370 }, { "epoch": 76.22, "grad_norm": 12.13187026977539, "learning_rate": 7.45945945945946e-06, "loss": 0.5331, "step": 25380 }, { "epoch": 76.25, "grad_norm": 22.112607955932617, "learning_rate": 7.45845845845846e-06, "loss": 0.5492, "step": 25390 }, { "epoch": 76.28, "grad_norm": 20.70840072631836, "learning_rate": 7.457457457457457e-06, "loss": 0.5628, "step": 25400 }, { "epoch": 76.31, "grad_norm": 19.347606658935547, "learning_rate": 7.456456456456457e-06, "loss": 0.5265, "step": 25410 }, { "epoch": 76.34, "grad_norm": 15.124089241027832, "learning_rate": 7.455455455455456e-06, "loss": 0.4816, "step": 25420 }, { "epoch": 76.37, "grad_norm": 11.108996391296387, "learning_rate": 7.454454454454455e-06, "loss": 0.5009, "step": 25430 }, { "epoch": 76.4, "grad_norm": 24.092477798461914, "learning_rate": 7.453453453453454e-06, "loss": 0.5121, "step": 25440 }, { "epoch": 76.43, "grad_norm": 17.434934616088867, "learning_rate": 7.452452452452453e-06, "loss": 0.4952, "step": 25450 }, { "epoch": 76.46, "grad_norm": 18.704214096069336, "learning_rate": 7.451451451451452e-06, "loss": 0.4373, "step": 25460 }, { "epoch": 76.49, "grad_norm": 18.26543426513672, "learning_rate": 7.450450450450452e-06, "loss": 0.5725, "step": 25470 }, { "epoch": 76.52, "grad_norm": 25.885238647460938, "learning_rate": 7.4494494494494496e-06, "loss": 0.537, "step": 25480 }, { "epoch": 76.55, "grad_norm": 30.565876007080078, "learning_rate": 7.448448448448449e-06, "loss": 0.4964, "step": 25490 }, { "epoch": 76.58, "grad_norm": 20.226112365722656, "learning_rate": 7.447447447447448e-06, "loss": 0.5429, "step": 25500 }, { "epoch": 76.61, "grad_norm": 18.410228729248047, "learning_rate": 7.446446446446447e-06, "loss": 0.4911, "step": 25510 }, { "epoch": 76.64, "grad_norm": 24.221858978271484, "learning_rate": 7.4454454454454465e-06, "loss": 0.4685, "step": 25520 }, { "epoch": 76.67, "grad_norm": 20.43824577331543, "learning_rate": 7.444444444444445e-06, "loss": 0.5016, "step": 25530 }, { "epoch": 76.7, "grad_norm": 13.232439994812012, "learning_rate": 7.4434434434434445e-06, "loss": 0.5267, "step": 25540 }, { "epoch": 76.73, "grad_norm": 16.06315040588379, "learning_rate": 7.442442442442443e-06, "loss": 0.5382, "step": 25550 }, { "epoch": 76.76, "grad_norm": 15.746918678283691, "learning_rate": 7.441441441441442e-06, "loss": 0.5118, "step": 25560 }, { "epoch": 76.79, "grad_norm": 22.798219680786133, "learning_rate": 7.44044044044044e-06, "loss": 0.4976, "step": 25570 }, { "epoch": 76.82, "grad_norm": 15.366040229797363, "learning_rate": 7.43943943943944e-06, "loss": 0.5226, "step": 25580 }, { "epoch": 76.85, "grad_norm": 15.96611499786377, "learning_rate": 7.438438438438439e-06, "loss": 0.5613, "step": 25590 }, { "epoch": 76.88, "grad_norm": 19.329378128051758, "learning_rate": 7.437437437437438e-06, "loss": 0.4833, "step": 25600 }, { "epoch": 76.91, "grad_norm": 18.379074096679688, "learning_rate": 7.436436436436437e-06, "loss": 0.5026, "step": 25610 }, { "epoch": 76.94, "grad_norm": 12.549742698669434, "learning_rate": 7.435435435435437e-06, "loss": 0.5504, "step": 25620 }, { "epoch": 76.97, "grad_norm": 15.263510704040527, "learning_rate": 7.434434434434435e-06, "loss": 0.4608, "step": 25630 }, { "epoch": 77.0, "grad_norm": 21.36216163635254, "learning_rate": 7.433433433433434e-06, "loss": 0.54, "step": 25640 }, { "epoch": 77.0, "eval_accuracy": 0.8696, "eval_loss": 0.46450892090797424, "eval_runtime": 13.007, "eval_samples_per_second": 768.818, "eval_steps_per_second": 3.075, "step": 25641 }, { "epoch": 77.03, "grad_norm": 14.115147590637207, "learning_rate": 7.4324324324324324e-06, "loss": 0.4776, "step": 25650 }, { "epoch": 77.06, "grad_norm": 19.314167022705078, "learning_rate": 7.431431431431432e-06, "loss": 0.5811, "step": 25660 }, { "epoch": 77.09, "grad_norm": 11.57456111907959, "learning_rate": 7.4304304304304305e-06, "loss": 0.5489, "step": 25670 }, { "epoch": 77.12, "grad_norm": 17.31260108947754, "learning_rate": 7.42942942942943e-06, "loss": 0.5489, "step": 25680 }, { "epoch": 77.15, "grad_norm": 18.1948299407959, "learning_rate": 7.428428428428429e-06, "loss": 0.4979, "step": 25690 }, { "epoch": 77.18, "grad_norm": 14.55479907989502, "learning_rate": 7.427427427427428e-06, "loss": 0.5327, "step": 25700 }, { "epoch": 77.21, "grad_norm": 24.24778938293457, "learning_rate": 7.426426426426427e-06, "loss": 0.4662, "step": 25710 }, { "epoch": 77.24, "grad_norm": 13.166237831115723, "learning_rate": 7.425425425425427e-06, "loss": 0.4897, "step": 25720 }, { "epoch": 77.27, "grad_norm": 14.406869888305664, "learning_rate": 7.424424424424425e-06, "loss": 0.4897, "step": 25730 }, { "epoch": 77.3, "grad_norm": 16.864351272583008, "learning_rate": 7.423423423423424e-06, "loss": 0.5531, "step": 25740 }, { "epoch": 77.33, "grad_norm": 17.601715087890625, "learning_rate": 7.422422422422423e-06, "loss": 0.4555, "step": 25750 }, { "epoch": 77.36, "grad_norm": 12.461166381835938, "learning_rate": 7.421421421421422e-06, "loss": 0.4522, "step": 25760 }, { "epoch": 77.39, "grad_norm": 20.531606674194336, "learning_rate": 7.4204204204204215e-06, "loss": 0.544, "step": 25770 }, { "epoch": 77.42, "grad_norm": 14.070215225219727, "learning_rate": 7.41941941941942e-06, "loss": 0.5131, "step": 25780 }, { "epoch": 77.45, "grad_norm": 14.558828353881836, "learning_rate": 7.4184184184184195e-06, "loss": 0.5136, "step": 25790 }, { "epoch": 77.48, "grad_norm": 19.045860290527344, "learning_rate": 7.417417417417418e-06, "loss": 0.5237, "step": 25800 }, { "epoch": 77.51, "grad_norm": 24.205039978027344, "learning_rate": 7.416416416416417e-06, "loss": 0.5299, "step": 25810 }, { "epoch": 77.54, "grad_norm": 12.875337600708008, "learning_rate": 7.415415415415415e-06, "loss": 0.5001, "step": 25820 }, { "epoch": 77.57, "grad_norm": 19.669151306152344, "learning_rate": 7.414414414414415e-06, "loss": 0.5513, "step": 25830 }, { "epoch": 77.6, "grad_norm": 18.091136932373047, "learning_rate": 7.413413413413414e-06, "loss": 0.5318, "step": 25840 }, { "epoch": 77.63, "grad_norm": 13.371623039245605, "learning_rate": 7.412412412412413e-06, "loss": 0.5119, "step": 25850 }, { "epoch": 77.66, "grad_norm": 17.000181198120117, "learning_rate": 7.411411411411412e-06, "loss": 0.5288, "step": 25860 }, { "epoch": 77.69, "grad_norm": 19.7879638671875, "learning_rate": 7.410410410410412e-06, "loss": 0.555, "step": 25870 }, { "epoch": 77.72, "grad_norm": 18.344942092895508, "learning_rate": 7.40940940940941e-06, "loss": 0.5115, "step": 25880 }, { "epoch": 77.75, "grad_norm": 15.039511680603027, "learning_rate": 7.408408408408409e-06, "loss": 0.5399, "step": 25890 }, { "epoch": 77.78, "grad_norm": 12.591846466064453, "learning_rate": 7.4074074074074075e-06, "loss": 0.4651, "step": 25900 }, { "epoch": 77.81, "grad_norm": 15.273913383483887, "learning_rate": 7.406406406406407e-06, "loss": 0.5634, "step": 25910 }, { "epoch": 77.84, "grad_norm": 14.827644348144531, "learning_rate": 7.4054054054054055e-06, "loss": 0.4924, "step": 25920 }, { "epoch": 77.87, "grad_norm": 17.303077697753906, "learning_rate": 7.404404404404405e-06, "loss": 0.5312, "step": 25930 }, { "epoch": 77.9, "grad_norm": 17.895198822021484, "learning_rate": 7.403403403403404e-06, "loss": 0.5065, "step": 25940 }, { "epoch": 77.93, "grad_norm": 14.915367126464844, "learning_rate": 7.402402402402403e-06, "loss": 0.5569, "step": 25950 }, { "epoch": 77.96, "grad_norm": 20.017568588256836, "learning_rate": 7.4014014014014024e-06, "loss": 0.5149, "step": 25960 }, { "epoch": 77.99, "grad_norm": 13.076175689697266, "learning_rate": 7.400400400400402e-06, "loss": 0.5352, "step": 25970 }, { "epoch": 78.0, "eval_accuracy": 0.8649, "eval_loss": 0.48047134280204773, "eval_runtime": 12.8629, "eval_samples_per_second": 777.431, "eval_steps_per_second": 3.11, "step": 25974 }, { "epoch": 78.02, "grad_norm": 12.429010391235352, "learning_rate": 7.3993993993994e-06, "loss": 0.48, "step": 25980 }, { "epoch": 78.05, "grad_norm": 12.912761688232422, "learning_rate": 7.398398398398399e-06, "loss": 0.5868, "step": 25990 }, { "epoch": 78.08, "grad_norm": 12.608386039733887, "learning_rate": 7.397397397397398e-06, "loss": 0.5076, "step": 26000 }, { "epoch": 78.11, "grad_norm": 17.142749786376953, "learning_rate": 7.396396396396397e-06, "loss": 0.5145, "step": 26010 }, { "epoch": 78.14, "grad_norm": 18.803665161132812, "learning_rate": 7.395395395395396e-06, "loss": 0.4547, "step": 26020 }, { "epoch": 78.17, "grad_norm": 18.04738426208496, "learning_rate": 7.394394394394395e-06, "loss": 0.5698, "step": 26030 }, { "epoch": 78.2, "grad_norm": 15.603890419006348, "learning_rate": 7.393393393393395e-06, "loss": 0.5145, "step": 26040 }, { "epoch": 78.23, "grad_norm": 13.2296142578125, "learning_rate": 7.392392392392393e-06, "loss": 0.487, "step": 26050 }, { "epoch": 78.26, "grad_norm": 20.099300384521484, "learning_rate": 7.391391391391392e-06, "loss": 0.524, "step": 26060 }, { "epoch": 78.29, "grad_norm": 12.897528648376465, "learning_rate": 7.39039039039039e-06, "loss": 0.5418, "step": 26070 }, { "epoch": 78.32, "grad_norm": 15.419456481933594, "learning_rate": 7.38938938938939e-06, "loss": 0.555, "step": 26080 }, { "epoch": 78.35, "grad_norm": 16.353063583374023, "learning_rate": 7.388388388388389e-06, "loss": 0.5425, "step": 26090 }, { "epoch": 78.38, "grad_norm": 15.16103458404541, "learning_rate": 7.387387387387388e-06, "loss": 0.526, "step": 26100 }, { "epoch": 78.41, "grad_norm": 12.94898509979248, "learning_rate": 7.386386386386387e-06, "loss": 0.5487, "step": 26110 }, { "epoch": 78.44, "grad_norm": 17.046621322631836, "learning_rate": 7.385385385385386e-06, "loss": 0.5144, "step": 26120 }, { "epoch": 78.47, "grad_norm": 19.95154571533203, "learning_rate": 7.384384384384385e-06, "loss": 0.5222, "step": 26130 }, { "epoch": 78.5, "grad_norm": 18.736618041992188, "learning_rate": 7.383383383383383e-06, "loss": 0.5622, "step": 26140 }, { "epoch": 78.53, "grad_norm": 18.860132217407227, "learning_rate": 7.3823823823823825e-06, "loss": 0.5259, "step": 26150 }, { "epoch": 78.56, "grad_norm": 17.85606575012207, "learning_rate": 7.381381381381382e-06, "loss": 0.4899, "step": 26160 }, { "epoch": 78.59, "grad_norm": 13.834342002868652, "learning_rate": 7.3803803803803806e-06, "loss": 0.4883, "step": 26170 }, { "epoch": 78.62, "grad_norm": 17.14006233215332, "learning_rate": 7.37937937937938e-06, "loss": 0.5362, "step": 26180 }, { "epoch": 78.65, "grad_norm": 10.08365535736084, "learning_rate": 7.3783783783783794e-06, "loss": 0.4879, "step": 26190 }, { "epoch": 78.68, "grad_norm": 13.915678977966309, "learning_rate": 7.377377377377378e-06, "loss": 0.5099, "step": 26200 }, { "epoch": 78.71, "grad_norm": 12.106389045715332, "learning_rate": 7.3763763763763775e-06, "loss": 0.508, "step": 26210 }, { "epoch": 78.74, "grad_norm": 21.481420516967773, "learning_rate": 7.375375375375377e-06, "loss": 0.5478, "step": 26220 }, { "epoch": 78.77, "grad_norm": 10.372005462646484, "learning_rate": 7.374374374374375e-06, "loss": 0.5376, "step": 26230 }, { "epoch": 78.8, "grad_norm": 11.340289115905762, "learning_rate": 7.373373373373373e-06, "loss": 0.5383, "step": 26240 }, { "epoch": 78.83, "grad_norm": 11.110851287841797, "learning_rate": 7.372372372372373e-06, "loss": 0.5323, "step": 26250 }, { "epoch": 78.86, "grad_norm": 20.465383529663086, "learning_rate": 7.371371371371372e-06, "loss": 0.5645, "step": 26260 }, { "epoch": 78.89, "grad_norm": 14.545747756958008, "learning_rate": 7.370370370370371e-06, "loss": 0.5126, "step": 26270 }, { "epoch": 78.92, "grad_norm": 16.517396926879883, "learning_rate": 7.36936936936937e-06, "loss": 0.5487, "step": 26280 }, { "epoch": 78.95, "grad_norm": 17.911067962646484, "learning_rate": 7.36836836836837e-06, "loss": 0.4834, "step": 26290 }, { "epoch": 78.98, "grad_norm": 12.054357528686523, "learning_rate": 7.367367367367368e-06, "loss": 0.5433, "step": 26300 }, { "epoch": 79.0, "eval_accuracy": 0.867, "eval_loss": 0.4695671498775482, "eval_runtime": 12.7001, "eval_samples_per_second": 787.398, "eval_steps_per_second": 3.15, "step": 26307 }, { "epoch": 79.01, "grad_norm": 16.02682113647461, "learning_rate": 7.366366366366367e-06, "loss": 0.5008, "step": 26310 }, { "epoch": 79.04, "grad_norm": 8.246256828308105, "learning_rate": 7.365365365365365e-06, "loss": 0.4939, "step": 26320 }, { "epoch": 79.07, "grad_norm": 24.42353630065918, "learning_rate": 7.364364364364365e-06, "loss": 0.5352, "step": 26330 }, { "epoch": 79.1, "grad_norm": 18.5328369140625, "learning_rate": 7.363363363363364e-06, "loss": 0.5155, "step": 26340 }, { "epoch": 79.13, "grad_norm": 15.051130294799805, "learning_rate": 7.362362362362363e-06, "loss": 0.4789, "step": 26350 }, { "epoch": 79.16, "grad_norm": 18.929536819458008, "learning_rate": 7.361361361361362e-06, "loss": 0.572, "step": 26360 }, { "epoch": 79.19, "grad_norm": 17.490177154541016, "learning_rate": 7.360360360360361e-06, "loss": 0.4708, "step": 26370 }, { "epoch": 79.22, "grad_norm": 17.31424331665039, "learning_rate": 7.35935935935936e-06, "loss": 0.49, "step": 26380 }, { "epoch": 79.25, "grad_norm": 18.38802719116211, "learning_rate": 7.358358358358358e-06, "loss": 0.4939, "step": 26390 }, { "epoch": 79.28, "grad_norm": 15.984980583190918, "learning_rate": 7.3573573573573575e-06, "loss": 0.5251, "step": 26400 }, { "epoch": 79.31, "grad_norm": 17.431676864624023, "learning_rate": 7.356356356356357e-06, "loss": 0.4669, "step": 26410 }, { "epoch": 79.34, "grad_norm": 15.049677848815918, "learning_rate": 7.355355355355356e-06, "loss": 0.5497, "step": 26420 }, { "epoch": 79.37, "grad_norm": 19.36207389831543, "learning_rate": 7.354354354354355e-06, "loss": 0.5202, "step": 26430 }, { "epoch": 79.4, "grad_norm": 19.58750343322754, "learning_rate": 7.3533533533533545e-06, "loss": 0.5266, "step": 26440 }, { "epoch": 79.43, "grad_norm": 25.741657257080078, "learning_rate": 7.352352352352353e-06, "loss": 0.5046, "step": 26450 }, { "epoch": 79.46, "grad_norm": 13.647472381591797, "learning_rate": 7.3513513513513525e-06, "loss": 0.4464, "step": 26460 }, { "epoch": 79.49, "grad_norm": 13.559154510498047, "learning_rate": 7.350350350350351e-06, "loss": 0.5424, "step": 26470 }, { "epoch": 79.52, "grad_norm": 11.44558334350586, "learning_rate": 7.34934934934935e-06, "loss": 0.4915, "step": 26480 }, { "epoch": 79.55, "grad_norm": 13.8262939453125, "learning_rate": 7.348348348348348e-06, "loss": 0.4972, "step": 26490 }, { "epoch": 79.58, "grad_norm": 12.904470443725586, "learning_rate": 7.347347347347348e-06, "loss": 0.5266, "step": 26500 }, { "epoch": 79.61, "grad_norm": 15.852788925170898, "learning_rate": 7.346346346346347e-06, "loss": 0.4829, "step": 26510 }, { "epoch": 79.64, "grad_norm": 12.143349647521973, "learning_rate": 7.345345345345346e-06, "loss": 0.5313, "step": 26520 }, { "epoch": 79.67, "grad_norm": 16.085865020751953, "learning_rate": 7.344344344344345e-06, "loss": 0.4879, "step": 26530 }, { "epoch": 79.7, "grad_norm": 16.444753646850586, "learning_rate": 7.343343343343345e-06, "loss": 0.5562, "step": 26540 }, { "epoch": 79.73, "grad_norm": 15.056065559387207, "learning_rate": 7.342342342342343e-06, "loss": 0.5133, "step": 26550 }, { "epoch": 79.76, "grad_norm": 21.398672103881836, "learning_rate": 7.341341341341342e-06, "loss": 0.4803, "step": 26560 }, { "epoch": 79.79, "grad_norm": 17.769912719726562, "learning_rate": 7.3403403403403404e-06, "loss": 0.5133, "step": 26570 }, { "epoch": 79.82, "grad_norm": 16.689434051513672, "learning_rate": 7.33933933933934e-06, "loss": 0.5437, "step": 26580 }, { "epoch": 79.85, "grad_norm": 16.95115089416504, "learning_rate": 7.3383383383383385e-06, "loss": 0.536, "step": 26590 }, { "epoch": 79.88, "grad_norm": 20.207969665527344, "learning_rate": 7.337337337337338e-06, "loss": 0.5318, "step": 26600 }, { "epoch": 79.91, "grad_norm": 19.271556854248047, "learning_rate": 7.336336336336337e-06, "loss": 0.5122, "step": 26610 }, { "epoch": 79.94, "grad_norm": 15.942670822143555, "learning_rate": 7.335335335335336e-06, "loss": 0.513, "step": 26620 }, { "epoch": 79.97, "grad_norm": 18.47221565246582, "learning_rate": 7.334334334334335e-06, "loss": 0.5029, "step": 26630 }, { "epoch": 80.0, "grad_norm": 129.9085235595703, "learning_rate": 7.333333333333333e-06, "loss": 0.5555, "step": 26640 }, { "epoch": 80.0, "eval_accuracy": 0.8657, "eval_loss": 0.4745276868343353, "eval_runtime": 13.0471, "eval_samples_per_second": 766.451, "eval_steps_per_second": 3.066, "step": 26640 }, { "epoch": 80.03, "grad_norm": 19.516761779785156, "learning_rate": 7.332332332332333e-06, "loss": 0.5288, "step": 26650 }, { "epoch": 80.06, "grad_norm": 15.747294425964355, "learning_rate": 7.331331331331332e-06, "loss": 0.495, "step": 26660 }, { "epoch": 80.09, "grad_norm": 15.504973411560059, "learning_rate": 7.330330330330331e-06, "loss": 0.5094, "step": 26670 }, { "epoch": 80.12, "grad_norm": 11.713431358337402, "learning_rate": 7.32932932932933e-06, "loss": 0.5242, "step": 26680 }, { "epoch": 80.15, "grad_norm": 14.051051139831543, "learning_rate": 7.328328328328329e-06, "loss": 0.5044, "step": 26690 }, { "epoch": 80.18, "grad_norm": 16.7624568939209, "learning_rate": 7.327327327327328e-06, "loss": 0.536, "step": 26700 }, { "epoch": 80.21, "grad_norm": 12.7097806930542, "learning_rate": 7.3263263263263275e-06, "loss": 0.529, "step": 26710 }, { "epoch": 80.24, "grad_norm": 17.222837448120117, "learning_rate": 7.325325325325326e-06, "loss": 0.52, "step": 26720 }, { "epoch": 80.27, "grad_norm": 14.705011367797852, "learning_rate": 7.324324324324325e-06, "loss": 0.5685, "step": 26730 }, { "epoch": 80.3, "grad_norm": 17.78485870361328, "learning_rate": 7.323323323323323e-06, "loss": 0.5046, "step": 26740 }, { "epoch": 80.33, "grad_norm": 12.968180656433105, "learning_rate": 7.322322322322323e-06, "loss": 0.5051, "step": 26750 }, { "epoch": 80.36, "grad_norm": 20.615371704101562, "learning_rate": 7.321321321321322e-06, "loss": 0.5554, "step": 26760 }, { "epoch": 80.39, "grad_norm": 20.71367073059082, "learning_rate": 7.320320320320321e-06, "loss": 0.4645, "step": 26770 }, { "epoch": 80.42, "grad_norm": 14.724589347839355, "learning_rate": 7.31931931931932e-06, "loss": 0.5227, "step": 26780 }, { "epoch": 80.45, "grad_norm": 19.262706756591797, "learning_rate": 7.31831831831832e-06, "loss": 0.5481, "step": 26790 }, { "epoch": 80.48, "grad_norm": 13.716856002807617, "learning_rate": 7.317317317317318e-06, "loss": 0.4711, "step": 26800 }, { "epoch": 80.51, "grad_norm": 17.449464797973633, "learning_rate": 7.316316316316316e-06, "loss": 0.4541, "step": 26810 }, { "epoch": 80.54, "grad_norm": 16.110658645629883, "learning_rate": 7.3153153153153155e-06, "loss": 0.5067, "step": 26820 }, { "epoch": 80.57, "grad_norm": 9.302728652954102, "learning_rate": 7.314314314314315e-06, "loss": 0.5089, "step": 26830 }, { "epoch": 80.6, "grad_norm": 16.48388671875, "learning_rate": 7.3133133133133135e-06, "loss": 0.5377, "step": 26840 }, { "epoch": 80.63, "grad_norm": 13.761683464050293, "learning_rate": 7.312312312312313e-06, "loss": 0.4673, "step": 26850 }, { "epoch": 80.66, "grad_norm": 16.251955032348633, "learning_rate": 7.311311311311312e-06, "loss": 0.5625, "step": 26860 }, { "epoch": 80.69, "grad_norm": 21.661922454833984, "learning_rate": 7.310310310310311e-06, "loss": 0.536, "step": 26870 }, { "epoch": 80.72, "grad_norm": 18.71791648864746, "learning_rate": 7.3093093093093104e-06, "loss": 0.5402, "step": 26880 }, { "epoch": 80.75, "grad_norm": 13.311158180236816, "learning_rate": 7.308308308308308e-06, "loss": 0.4404, "step": 26890 }, { "epoch": 80.78, "grad_norm": 14.995808601379395, "learning_rate": 7.307307307307308e-06, "loss": 0.5359, "step": 26900 }, { "epoch": 80.81, "grad_norm": 17.181751251220703, "learning_rate": 7.306306306306307e-06, "loss": 0.5081, "step": 26910 }, { "epoch": 80.84, "grad_norm": 12.407795906066895, "learning_rate": 7.305305305305306e-06, "loss": 0.5269, "step": 26920 }, { "epoch": 80.87, "grad_norm": 13.916088104248047, "learning_rate": 7.304304304304305e-06, "loss": 0.5152, "step": 26930 }, { "epoch": 80.9, "grad_norm": 23.82271957397461, "learning_rate": 7.303303303303304e-06, "loss": 0.5478, "step": 26940 }, { "epoch": 80.93, "grad_norm": 10.362595558166504, "learning_rate": 7.302302302302303e-06, "loss": 0.4607, "step": 26950 }, { "epoch": 80.96, "grad_norm": 13.769407272338867, "learning_rate": 7.3013013013013026e-06, "loss": 0.453, "step": 26960 }, { "epoch": 80.99, "grad_norm": 17.067766189575195, "learning_rate": 7.3003003003003e-06, "loss": 0.5248, "step": 26970 }, { "epoch": 81.0, "eval_accuracy": 0.8655, "eval_loss": 0.47673192620277405, "eval_runtime": 12.8835, "eval_samples_per_second": 776.186, "eval_steps_per_second": 3.105, "step": 26973 }, { "epoch": 81.02, "grad_norm": 19.173913955688477, "learning_rate": 7.2992992992993e-06, "loss": 0.5459, "step": 26980 }, { "epoch": 81.05, "grad_norm": 17.089609146118164, "learning_rate": 7.298298298298298e-06, "loss": 0.469, "step": 26990 }, { "epoch": 81.08, "grad_norm": 19.796131134033203, "learning_rate": 7.297297297297298e-06, "loss": 0.5065, "step": 27000 }, { "epoch": 81.11, "grad_norm": 13.776817321777344, "learning_rate": 7.296296296296297e-06, "loss": 0.5056, "step": 27010 }, { "epoch": 81.14, "grad_norm": 17.408824920654297, "learning_rate": 7.295295295295296e-06, "loss": 0.511, "step": 27020 }, { "epoch": 81.17, "grad_norm": 41.608097076416016, "learning_rate": 7.294294294294295e-06, "loss": 0.5315, "step": 27030 }, { "epoch": 81.2, "grad_norm": 14.133171081542969, "learning_rate": 7.293293293293294e-06, "loss": 0.4975, "step": 27040 }, { "epoch": 81.23, "grad_norm": 14.316442489624023, "learning_rate": 7.292292292292293e-06, "loss": 0.5426, "step": 27050 }, { "epoch": 81.26, "grad_norm": 15.659419059753418, "learning_rate": 7.291291291291291e-06, "loss": 0.4493, "step": 27060 }, { "epoch": 81.29, "grad_norm": 16.83542823791504, "learning_rate": 7.2902902902902905e-06, "loss": 0.5195, "step": 27070 }, { "epoch": 81.32, "grad_norm": 15.329632759094238, "learning_rate": 7.28928928928929e-06, "loss": 0.511, "step": 27080 }, { "epoch": 81.35, "grad_norm": 15.858656883239746, "learning_rate": 7.2882882882882885e-06, "loss": 0.4848, "step": 27090 }, { "epoch": 81.38, "grad_norm": 17.962413787841797, "learning_rate": 7.287287287287288e-06, "loss": 0.4607, "step": 27100 }, { "epoch": 81.41, "grad_norm": 13.855925559997559, "learning_rate": 7.2862862862862874e-06, "loss": 0.4947, "step": 27110 }, { "epoch": 81.44, "grad_norm": 15.741239547729492, "learning_rate": 7.285285285285286e-06, "loss": 0.496, "step": 27120 }, { "epoch": 81.47, "grad_norm": 15.112213134765625, "learning_rate": 7.2842842842842855e-06, "loss": 0.5332, "step": 27130 }, { "epoch": 81.5, "grad_norm": 14.706315040588379, "learning_rate": 7.283283283283283e-06, "loss": 0.5166, "step": 27140 }, { "epoch": 81.53, "grad_norm": 17.013132095336914, "learning_rate": 7.282282282282283e-06, "loss": 0.4587, "step": 27150 }, { "epoch": 81.56, "grad_norm": 14.217673301696777, "learning_rate": 7.281281281281281e-06, "loss": 0.5012, "step": 27160 }, { "epoch": 81.59, "grad_norm": 16.70499610900879, "learning_rate": 7.280280280280281e-06, "loss": 0.5238, "step": 27170 }, { "epoch": 81.62, "grad_norm": 18.069900512695312, "learning_rate": 7.27927927927928e-06, "loss": 0.5143, "step": 27180 }, { "epoch": 81.65, "grad_norm": 20.117631912231445, "learning_rate": 7.278278278278279e-06, "loss": 0.4739, "step": 27190 }, { "epoch": 81.68, "grad_norm": 20.28511619567871, "learning_rate": 7.277277277277278e-06, "loss": 0.4929, "step": 27200 }, { "epoch": 81.71, "grad_norm": 13.58857250213623, "learning_rate": 7.276276276276278e-06, "loss": 0.527, "step": 27210 }, { "epoch": 81.74, "grad_norm": 12.338050842285156, "learning_rate": 7.275275275275275e-06, "loss": 0.4731, "step": 27220 }, { "epoch": 81.77, "grad_norm": 10.727715492248535, "learning_rate": 7.274274274274275e-06, "loss": 0.6063, "step": 27230 }, { "epoch": 81.8, "grad_norm": 19.270235061645508, "learning_rate": 7.273273273273273e-06, "loss": 0.5, "step": 27240 }, { "epoch": 81.83, "grad_norm": 11.240653991699219, "learning_rate": 7.272272272272273e-06, "loss": 0.4847, "step": 27250 }, { "epoch": 81.86, "grad_norm": 16.444652557373047, "learning_rate": 7.271271271271272e-06, "loss": 0.466, "step": 27260 }, { "epoch": 81.89, "grad_norm": 12.542959213256836, "learning_rate": 7.270270270270271e-06, "loss": 0.5569, "step": 27270 }, { "epoch": 81.92, "grad_norm": 16.11238670349121, "learning_rate": 7.26926926926927e-06, "loss": 0.5396, "step": 27280 }, { "epoch": 81.95, "grad_norm": 23.949087142944336, "learning_rate": 7.268268268268269e-06, "loss": 0.5495, "step": 27290 }, { "epoch": 81.98, "grad_norm": 17.89312744140625, "learning_rate": 7.267267267267268e-06, "loss": 0.4648, "step": 27300 }, { "epoch": 82.0, "eval_accuracy": 0.8681, "eval_loss": 0.4730146825313568, "eval_runtime": 12.8199, "eval_samples_per_second": 780.04, "eval_steps_per_second": 3.12, "step": 27306 }, { "epoch": 82.01, "grad_norm": 23.988683700561523, "learning_rate": 7.266266266266266e-06, "loss": 0.5477, "step": 27310 }, { "epoch": 82.04, "grad_norm": 11.013201713562012, "learning_rate": 7.2652652652652655e-06, "loss": 0.4666, "step": 27320 }, { "epoch": 82.07, "grad_norm": 14.66703987121582, "learning_rate": 7.264264264264265e-06, "loss": 0.467, "step": 27330 }, { "epoch": 82.1, "grad_norm": 19.158966064453125, "learning_rate": 7.263263263263264e-06, "loss": 0.5321, "step": 27340 }, { "epoch": 82.13, "grad_norm": 17.552034378051758, "learning_rate": 7.262262262262263e-06, "loss": 0.5071, "step": 27350 }, { "epoch": 82.16, "grad_norm": 11.901301383972168, "learning_rate": 7.2612612612612625e-06, "loss": 0.4968, "step": 27360 }, { "epoch": 82.19, "grad_norm": 14.851964950561523, "learning_rate": 7.260260260260261e-06, "loss": 0.5088, "step": 27370 }, { "epoch": 82.22, "grad_norm": 13.52309799194336, "learning_rate": 7.2592592592592605e-06, "loss": 0.5089, "step": 27380 }, { "epoch": 82.25, "grad_norm": 15.34189224243164, "learning_rate": 7.258258258258258e-06, "loss": 0.4828, "step": 27390 }, { "epoch": 82.28, "grad_norm": 21.503339767456055, "learning_rate": 7.257257257257258e-06, "loss": 0.5525, "step": 27400 }, { "epoch": 82.31, "grad_norm": 16.29681968688965, "learning_rate": 7.256256256256256e-06, "loss": 0.4926, "step": 27410 }, { "epoch": 82.34, "grad_norm": 20.325592041015625, "learning_rate": 7.255255255255256e-06, "loss": 0.5453, "step": 27420 }, { "epoch": 82.37, "grad_norm": 22.434194564819336, "learning_rate": 7.254254254254255e-06, "loss": 0.5411, "step": 27430 }, { "epoch": 82.4, "grad_norm": 19.42127227783203, "learning_rate": 7.253253253253254e-06, "loss": 0.5606, "step": 27440 }, { "epoch": 82.43, "grad_norm": 12.874723434448242, "learning_rate": 7.252252252252253e-06, "loss": 0.539, "step": 27450 }, { "epoch": 82.46, "grad_norm": 13.633169174194336, "learning_rate": 7.251251251251253e-06, "loss": 0.4834, "step": 27460 }, { "epoch": 82.49, "grad_norm": 16.47105598449707, "learning_rate": 7.25025025025025e-06, "loss": 0.4278, "step": 27470 }, { "epoch": 82.52, "grad_norm": 14.252326965332031, "learning_rate": 7.24924924924925e-06, "loss": 0.4782, "step": 27480 }, { "epoch": 82.55, "grad_norm": 15.690261840820312, "learning_rate": 7.2482482482482484e-06, "loss": 0.5556, "step": 27490 }, { "epoch": 82.58, "grad_norm": 12.490242958068848, "learning_rate": 7.247247247247248e-06, "loss": 0.5361, "step": 27500 }, { "epoch": 82.61, "grad_norm": 16.0650634765625, "learning_rate": 7.2462462462462465e-06, "loss": 0.5288, "step": 27510 }, { "epoch": 82.64, "grad_norm": 16.423114776611328, "learning_rate": 7.245245245245246e-06, "loss": 0.5194, "step": 27520 }, { "epoch": 82.67, "grad_norm": 14.915671348571777, "learning_rate": 7.244244244244245e-06, "loss": 0.5426, "step": 27530 }, { "epoch": 82.7, "grad_norm": 19.004558563232422, "learning_rate": 7.243243243243244e-06, "loss": 0.5105, "step": 27540 }, { "epoch": 82.73, "grad_norm": 13.798174858093262, "learning_rate": 7.242242242242243e-06, "loss": 0.5387, "step": 27550 }, { "epoch": 82.76, "grad_norm": 19.4901180267334, "learning_rate": 7.241241241241241e-06, "loss": 0.4877, "step": 27560 }, { "epoch": 82.79, "grad_norm": 15.883666038513184, "learning_rate": 7.240240240240241e-06, "loss": 0.5695, "step": 27570 }, { "epoch": 82.82, "grad_norm": 14.284727096557617, "learning_rate": 7.23923923923924e-06, "loss": 0.5544, "step": 27580 }, { "epoch": 82.85, "grad_norm": 12.819677352905273, "learning_rate": 7.238238238238239e-06, "loss": 0.5676, "step": 27590 }, { "epoch": 82.88, "grad_norm": 18.853370666503906, "learning_rate": 7.237237237237238e-06, "loss": 0.4831, "step": 27600 }, { "epoch": 82.91, "grad_norm": 20.63605308532715, "learning_rate": 7.236236236236237e-06, "loss": 0.5092, "step": 27610 }, { "epoch": 82.94, "grad_norm": 15.875106811523438, "learning_rate": 7.235235235235236e-06, "loss": 0.4739, "step": 27620 }, { "epoch": 82.97, "grad_norm": 20.798343658447266, "learning_rate": 7.2342342342342355e-06, "loss": 0.5853, "step": 27630 }, { "epoch": 83.0, "eval_accuracy": 0.8656, "eval_loss": 0.4780651926994324, "eval_runtime": 12.8752, "eval_samples_per_second": 776.688, "eval_steps_per_second": 3.107, "step": 27639 }, { "epoch": 83.0, "grad_norm": 11.860279083251953, "learning_rate": 7.233233233233233e-06, "loss": 0.505, "step": 27640 }, { "epoch": 83.03, "grad_norm": 18.203495025634766, "learning_rate": 7.232232232232233e-06, "loss": 0.5636, "step": 27650 }, { "epoch": 83.06, "grad_norm": 13.584324836730957, "learning_rate": 7.231231231231231e-06, "loss": 0.4661, "step": 27660 }, { "epoch": 83.09, "grad_norm": 13.434273719787598, "learning_rate": 7.230230230230231e-06, "loss": 0.4884, "step": 27670 }, { "epoch": 83.12, "grad_norm": 26.261144638061523, "learning_rate": 7.22922922922923e-06, "loss": 0.5101, "step": 27680 }, { "epoch": 83.15, "grad_norm": 15.838640213012695, "learning_rate": 7.228228228228229e-06, "loss": 0.4783, "step": 27690 }, { "epoch": 83.18, "grad_norm": 19.25486946105957, "learning_rate": 7.227227227227228e-06, "loss": 0.5055, "step": 27700 }, { "epoch": 83.21, "grad_norm": 12.068188667297363, "learning_rate": 7.226226226226228e-06, "loss": 0.5237, "step": 27710 }, { "epoch": 83.24, "grad_norm": 16.15995979309082, "learning_rate": 7.2252252252252254e-06, "loss": 0.4966, "step": 27720 }, { "epoch": 83.27, "grad_norm": 18.45575714111328, "learning_rate": 7.224224224224224e-06, "loss": 0.5047, "step": 27730 }, { "epoch": 83.3, "grad_norm": 21.30694580078125, "learning_rate": 7.2232232232232235e-06, "loss": 0.515, "step": 27740 }, { "epoch": 83.33, "grad_norm": 14.493951797485352, "learning_rate": 7.222222222222223e-06, "loss": 0.508, "step": 27750 }, { "epoch": 83.36, "grad_norm": 9.595751762390137, "learning_rate": 7.2212212212212215e-06, "loss": 0.4961, "step": 27760 }, { "epoch": 83.39, "grad_norm": 19.376615524291992, "learning_rate": 7.220220220220221e-06, "loss": 0.5333, "step": 27770 }, { "epoch": 83.42, "grad_norm": 19.98113250732422, "learning_rate": 7.21921921921922e-06, "loss": 0.4903, "step": 27780 }, { "epoch": 83.45, "grad_norm": 12.427263259887695, "learning_rate": 7.218218218218219e-06, "loss": 0.4672, "step": 27790 }, { "epoch": 83.48, "grad_norm": 16.322376251220703, "learning_rate": 7.217217217217218e-06, "loss": 0.5028, "step": 27800 }, { "epoch": 83.51, "grad_norm": 16.018421173095703, "learning_rate": 7.216216216216216e-06, "loss": 0.5217, "step": 27810 }, { "epoch": 83.54, "grad_norm": 10.323281288146973, "learning_rate": 7.215215215215216e-06, "loss": 0.4973, "step": 27820 }, { "epoch": 83.57, "grad_norm": 10.037334442138672, "learning_rate": 7.214214214214215e-06, "loss": 0.5405, "step": 27830 }, { "epoch": 83.6, "grad_norm": 13.62839412689209, "learning_rate": 7.213213213213214e-06, "loss": 0.509, "step": 27840 }, { "epoch": 83.63, "grad_norm": 14.484914779663086, "learning_rate": 7.212212212212213e-06, "loss": 0.5128, "step": 27850 }, { "epoch": 83.66, "grad_norm": 13.376684188842773, "learning_rate": 7.211211211211212e-06, "loss": 0.5263, "step": 27860 }, { "epoch": 83.69, "grad_norm": 19.595674514770508, "learning_rate": 7.210210210210211e-06, "loss": 0.4579, "step": 27870 }, { "epoch": 83.72, "grad_norm": 15.632843017578125, "learning_rate": 7.2092092092092106e-06, "loss": 0.4914, "step": 27880 }, { "epoch": 83.75, "grad_norm": 16.465303421020508, "learning_rate": 7.208208208208208e-06, "loss": 0.5398, "step": 27890 }, { "epoch": 83.78, "grad_norm": 19.066591262817383, "learning_rate": 7.207207207207208e-06, "loss": 0.4711, "step": 27900 }, { "epoch": 83.81, "grad_norm": 15.890423774719238, "learning_rate": 7.206206206206206e-06, "loss": 0.56, "step": 27910 }, { "epoch": 83.84, "grad_norm": 20.270509719848633, "learning_rate": 7.205205205205206e-06, "loss": 0.5004, "step": 27920 }, { "epoch": 83.87, "grad_norm": 14.511930465698242, "learning_rate": 7.204204204204205e-06, "loss": 0.5083, "step": 27930 }, { "epoch": 83.9, "grad_norm": 15.224581718444824, "learning_rate": 7.203203203203204e-06, "loss": 0.5081, "step": 27940 }, { "epoch": 83.93, "grad_norm": 14.492236137390137, "learning_rate": 7.202202202202203e-06, "loss": 0.5216, "step": 27950 }, { "epoch": 83.96, "grad_norm": 15.13267707824707, "learning_rate": 7.201201201201202e-06, "loss": 0.451, "step": 27960 }, { "epoch": 83.99, "grad_norm": 20.824356079101562, "learning_rate": 7.2002002002002005e-06, "loss": 0.5298, "step": 27970 }, { "epoch": 84.0, "eval_accuracy": 0.869, "eval_loss": 0.4728562533855438, "eval_runtime": 13.0251, "eval_samples_per_second": 767.751, "eval_steps_per_second": 3.071, "step": 27972 }, { "epoch": 84.02, "grad_norm": 10.574320793151855, "learning_rate": 7.199199199199199e-06, "loss": 0.604, "step": 27980 }, { "epoch": 84.05, "grad_norm": 14.731752395629883, "learning_rate": 7.1981981981981985e-06, "loss": 0.4887, "step": 27990 }, { "epoch": 84.08, "grad_norm": 10.434557914733887, "learning_rate": 7.197197197197198e-06, "loss": 0.4773, "step": 28000 }, { "epoch": 84.11, "grad_norm": 13.42420768737793, "learning_rate": 7.1961961961961965e-06, "loss": 0.5254, "step": 28010 }, { "epoch": 84.14, "grad_norm": 13.98668098449707, "learning_rate": 7.195195195195196e-06, "loss": 0.4986, "step": 28020 }, { "epoch": 84.17, "grad_norm": 12.513934135437012, "learning_rate": 7.194194194194195e-06, "loss": 0.4707, "step": 28030 }, { "epoch": 84.2, "grad_norm": 15.887160301208496, "learning_rate": 7.193193193193194e-06, "loss": 0.4697, "step": 28040 }, { "epoch": 84.23, "grad_norm": 15.136001586914062, "learning_rate": 7.1921921921921935e-06, "loss": 0.5131, "step": 28050 }, { "epoch": 84.26, "grad_norm": 13.415595054626465, "learning_rate": 7.191191191191191e-06, "loss": 0.4972, "step": 28060 }, { "epoch": 84.29, "grad_norm": 16.6083984375, "learning_rate": 7.190190190190191e-06, "loss": 0.4587, "step": 28070 }, { "epoch": 84.32, "grad_norm": 28.22075843811035, "learning_rate": 7.189189189189189e-06, "loss": 0.5245, "step": 28080 }, { "epoch": 84.35, "grad_norm": 12.82327938079834, "learning_rate": 7.188188188188189e-06, "loss": 0.5054, "step": 28090 }, { "epoch": 84.38, "grad_norm": 17.963396072387695, "learning_rate": 7.187187187187188e-06, "loss": 0.5275, "step": 28100 }, { "epoch": 84.41, "grad_norm": 13.487298965454102, "learning_rate": 7.186186186186187e-06, "loss": 0.4571, "step": 28110 }, { "epoch": 84.44, "grad_norm": 17.037858963012695, "learning_rate": 7.185185185185186e-06, "loss": 0.5254, "step": 28120 }, { "epoch": 84.47, "grad_norm": 16.970691680908203, "learning_rate": 7.184184184184186e-06, "loss": 0.4804, "step": 28130 }, { "epoch": 84.5, "grad_norm": 19.333240509033203, "learning_rate": 7.183183183183183e-06, "loss": 0.5244, "step": 28140 }, { "epoch": 84.53, "grad_norm": 12.293442726135254, "learning_rate": 7.182182182182183e-06, "loss": 0.4906, "step": 28150 }, { "epoch": 84.56, "grad_norm": 13.594487190246582, "learning_rate": 7.181181181181181e-06, "loss": 0.4918, "step": 28160 }, { "epoch": 84.59, "grad_norm": 13.262083053588867, "learning_rate": 7.180180180180181e-06, "loss": 0.5092, "step": 28170 }, { "epoch": 84.62, "grad_norm": 20.580219268798828, "learning_rate": 7.1791791791791794e-06, "loss": 0.5616, "step": 28180 }, { "epoch": 84.65, "grad_norm": 19.55314064025879, "learning_rate": 7.178178178178179e-06, "loss": 0.5074, "step": 28190 }, { "epoch": 84.68, "grad_norm": 16.784488677978516, "learning_rate": 7.177177177177178e-06, "loss": 0.5366, "step": 28200 }, { "epoch": 84.71, "grad_norm": 15.993558883666992, "learning_rate": 7.176176176176177e-06, "loss": 0.5448, "step": 28210 }, { "epoch": 84.74, "grad_norm": 15.678027153015137, "learning_rate": 7.1751751751751755e-06, "loss": 0.5437, "step": 28220 }, { "epoch": 84.77, "grad_norm": 11.770125389099121, "learning_rate": 7.174174174174174e-06, "loss": 0.535, "step": 28230 }, { "epoch": 84.8, "grad_norm": 18.855430603027344, "learning_rate": 7.1731731731731735e-06, "loss": 0.5175, "step": 28240 }, { "epoch": 84.83, "grad_norm": 23.09416389465332, "learning_rate": 7.172172172172173e-06, "loss": 0.5511, "step": 28250 }, { "epoch": 84.86, "grad_norm": 15.866230964660645, "learning_rate": 7.1711711711711716e-06, "loss": 0.51, "step": 28260 }, { "epoch": 84.89, "grad_norm": 16.555824279785156, "learning_rate": 7.170170170170171e-06, "loss": 0.4589, "step": 28270 }, { "epoch": 84.92, "grad_norm": 13.851365089416504, "learning_rate": 7.1691691691691705e-06, "loss": 0.5211, "step": 28280 }, { "epoch": 84.95, "grad_norm": 12.860268592834473, "learning_rate": 7.168168168168169e-06, "loss": 0.4454, "step": 28290 }, { "epoch": 84.98, "grad_norm": 14.178278923034668, "learning_rate": 7.167167167167167e-06, "loss": 0.4484, "step": 28300 }, { "epoch": 85.0, "eval_accuracy": 0.869, "eval_loss": 0.4740825593471527, "eval_runtime": 12.7553, "eval_samples_per_second": 783.987, "eval_steps_per_second": 3.136, "step": 28305 }, { "epoch": 85.02, "grad_norm": 18.63173484802246, "learning_rate": 7.166166166166166e-06, "loss": 0.6098, "step": 28310 }, { "epoch": 85.05, "grad_norm": 19.61794662475586, "learning_rate": 7.165165165165166e-06, "loss": 0.5282, "step": 28320 }, { "epoch": 85.08, "grad_norm": 20.200681686401367, "learning_rate": 7.164164164164164e-06, "loss": 0.4699, "step": 28330 }, { "epoch": 85.11, "grad_norm": 23.22762680053711, "learning_rate": 7.163163163163164e-06, "loss": 0.5258, "step": 28340 }, { "epoch": 85.14, "grad_norm": 13.644779205322266, "learning_rate": 7.162162162162163e-06, "loss": 0.5212, "step": 28350 }, { "epoch": 85.17, "grad_norm": 15.226165771484375, "learning_rate": 7.161161161161162e-06, "loss": 0.4739, "step": 28360 }, { "epoch": 85.2, "grad_norm": 19.651981353759766, "learning_rate": 7.160160160160161e-06, "loss": 0.472, "step": 28370 }, { "epoch": 85.23, "grad_norm": 12.74783992767334, "learning_rate": 7.159159159159161e-06, "loss": 0.4689, "step": 28380 }, { "epoch": 85.26, "grad_norm": 13.573821067810059, "learning_rate": 7.158158158158158e-06, "loss": 0.5248, "step": 28390 }, { "epoch": 85.29, "grad_norm": 17.988346099853516, "learning_rate": 7.157157157157158e-06, "loss": 0.5036, "step": 28400 }, { "epoch": 85.32, "grad_norm": 16.88225746154785, "learning_rate": 7.156156156156156e-06, "loss": 0.5256, "step": 28410 }, { "epoch": 85.35, "grad_norm": 17.99980926513672, "learning_rate": 7.155155155155156e-06, "loss": 0.5283, "step": 28420 }, { "epoch": 85.38, "grad_norm": 16.545686721801758, "learning_rate": 7.1541541541541545e-06, "loss": 0.5181, "step": 28430 }, { "epoch": 85.41, "grad_norm": 18.98824691772461, "learning_rate": 7.153153153153154e-06, "loss": 0.4657, "step": 28440 }, { "epoch": 85.44, "grad_norm": 14.740886688232422, "learning_rate": 7.152152152152153e-06, "loss": 0.5669, "step": 28450 }, { "epoch": 85.47, "grad_norm": 15.921751022338867, "learning_rate": 7.151151151151152e-06, "loss": 0.4818, "step": 28460 }, { "epoch": 85.5, "grad_norm": 13.95246410369873, "learning_rate": 7.1501501501501505e-06, "loss": 0.4806, "step": 28470 }, { "epoch": 85.53, "grad_norm": 14.615503311157227, "learning_rate": 7.149149149149149e-06, "loss": 0.4843, "step": 28480 }, { "epoch": 85.56, "grad_norm": 22.90113639831543, "learning_rate": 7.1481481481481486e-06, "loss": 0.5608, "step": 28490 }, { "epoch": 85.59, "grad_norm": 17.773834228515625, "learning_rate": 7.147147147147148e-06, "loss": 0.5189, "step": 28500 }, { "epoch": 85.62, "grad_norm": 16.468957901000977, "learning_rate": 7.146146146146147e-06, "loss": 0.5018, "step": 28510 }, { "epoch": 85.65, "grad_norm": 20.70461654663086, "learning_rate": 7.145145145145146e-06, "loss": 0.4561, "step": 28520 }, { "epoch": 85.68, "grad_norm": 11.857874870300293, "learning_rate": 7.144144144144145e-06, "loss": 0.516, "step": 28530 }, { "epoch": 85.71, "grad_norm": 13.240480422973633, "learning_rate": 7.143143143143144e-06, "loss": 0.4974, "step": 28540 }, { "epoch": 85.74, "grad_norm": 14.336261749267578, "learning_rate": 7.142142142142142e-06, "loss": 0.4871, "step": 28550 }, { "epoch": 85.77, "grad_norm": 9.17609977722168, "learning_rate": 7.141141141141141e-06, "loss": 0.4628, "step": 28560 }, { "epoch": 85.8, "grad_norm": 12.698349952697754, "learning_rate": 7.140140140140141e-06, "loss": 0.5396, "step": 28570 }, { "epoch": 85.83, "grad_norm": 17.80215072631836, "learning_rate": 7.139139139139139e-06, "loss": 0.4676, "step": 28580 }, { "epoch": 85.86, "grad_norm": 20.226518630981445, "learning_rate": 7.138138138138139e-06, "loss": 0.5154, "step": 28590 }, { "epoch": 85.89, "grad_norm": 15.085134506225586, "learning_rate": 7.137137137137138e-06, "loss": 0.494, "step": 28600 }, { "epoch": 85.92, "grad_norm": 12.757584571838379, "learning_rate": 7.136136136136137e-06, "loss": 0.537, "step": 28610 }, { "epoch": 85.95, "grad_norm": 12.16426944732666, "learning_rate": 7.135135135135136e-06, "loss": 0.4782, "step": 28620 }, { "epoch": 85.98, "grad_norm": 12.735384941101074, "learning_rate": 7.134134134134135e-06, "loss": 0.4765, "step": 28630 }, { "epoch": 86.0, "eval_accuracy": 0.8633, "eval_loss": 0.4877474009990692, "eval_runtime": 12.5121, "eval_samples_per_second": 799.225, "eval_steps_per_second": 3.197, "step": 28638 }, { "epoch": 86.01, "grad_norm": 20.190876007080078, "learning_rate": 7.133133133133133e-06, "loss": 0.5358, "step": 28640 }, { "epoch": 86.04, "grad_norm": 16.767045974731445, "learning_rate": 7.132132132132132e-06, "loss": 0.5106, "step": 28650 }, { "epoch": 86.07, "grad_norm": 16.47939682006836, "learning_rate": 7.1311311311311315e-06, "loss": 0.5083, "step": 28660 }, { "epoch": 86.1, "grad_norm": 12.725757598876953, "learning_rate": 7.130130130130131e-06, "loss": 0.4462, "step": 28670 }, { "epoch": 86.13, "grad_norm": 21.844676971435547, "learning_rate": 7.1291291291291295e-06, "loss": 0.4942, "step": 28680 }, { "epoch": 86.16, "grad_norm": 13.987099647521973, "learning_rate": 7.128128128128129e-06, "loss": 0.4893, "step": 28690 }, { "epoch": 86.19, "grad_norm": 12.143754959106445, "learning_rate": 7.127127127127128e-06, "loss": 0.4771, "step": 28700 }, { "epoch": 86.22, "grad_norm": 19.128433227539062, "learning_rate": 7.126126126126127e-06, "loss": 0.503, "step": 28710 }, { "epoch": 86.25, "grad_norm": 16.6035099029541, "learning_rate": 7.1251251251251256e-06, "loss": 0.5514, "step": 28720 }, { "epoch": 86.28, "grad_norm": 16.854846954345703, "learning_rate": 7.124124124124124e-06, "loss": 0.4651, "step": 28730 }, { "epoch": 86.31, "grad_norm": 13.618094444274902, "learning_rate": 7.123123123123124e-06, "loss": 0.5122, "step": 28740 }, { "epoch": 86.34, "grad_norm": 14.200691223144531, "learning_rate": 7.122122122122122e-06, "loss": 0.5339, "step": 28750 }, { "epoch": 86.37, "grad_norm": 14.8397855758667, "learning_rate": 7.121121121121122e-06, "loss": 0.5011, "step": 28760 }, { "epoch": 86.4, "grad_norm": 11.214278221130371, "learning_rate": 7.120120120120121e-06, "loss": 0.5291, "step": 28770 }, { "epoch": 86.43, "grad_norm": 18.572744369506836, "learning_rate": 7.11911911911912e-06, "loss": 0.5285, "step": 28780 }, { "epoch": 86.46, "grad_norm": 18.296024322509766, "learning_rate": 7.118118118118119e-06, "loss": 0.5268, "step": 28790 }, { "epoch": 86.49, "grad_norm": 13.891396522521973, "learning_rate": 7.117117117117117e-06, "loss": 0.5011, "step": 28800 }, { "epoch": 86.52, "grad_norm": 14.529227256774902, "learning_rate": 7.116116116116116e-06, "loss": 0.4931, "step": 28810 }, { "epoch": 86.55, "grad_norm": 17.182336807250977, "learning_rate": 7.115115115115116e-06, "loss": 0.4914, "step": 28820 }, { "epoch": 86.58, "grad_norm": 17.16726303100586, "learning_rate": 7.114114114114114e-06, "loss": 0.5218, "step": 28830 }, { "epoch": 86.61, "grad_norm": 14.171222686767578, "learning_rate": 7.113113113113114e-06, "loss": 0.5402, "step": 28840 }, { "epoch": 86.64, "grad_norm": 21.08846664428711, "learning_rate": 7.112112112112113e-06, "loss": 0.5499, "step": 28850 }, { "epoch": 86.67, "grad_norm": 17.825288772583008, "learning_rate": 7.111111111111112e-06, "loss": 0.4919, "step": 28860 }, { "epoch": 86.7, "grad_norm": 15.799781799316406, "learning_rate": 7.110110110110111e-06, "loss": 0.5057, "step": 28870 }, { "epoch": 86.73, "grad_norm": 10.760543823242188, "learning_rate": 7.10910910910911e-06, "loss": 0.4696, "step": 28880 }, { "epoch": 86.76, "grad_norm": 12.362149238586426, "learning_rate": 7.1081081081081085e-06, "loss": 0.5351, "step": 28890 }, { "epoch": 86.79, "grad_norm": 22.476179122924805, "learning_rate": 7.107107107107107e-06, "loss": 0.5215, "step": 28900 }, { "epoch": 86.82, "grad_norm": 12.070683479309082, "learning_rate": 7.1061061061061065e-06, "loss": 0.4834, "step": 28910 }, { "epoch": 86.85, "grad_norm": 18.42873191833496, "learning_rate": 7.105105105105106e-06, "loss": 0.4361, "step": 28920 }, { "epoch": 86.88, "grad_norm": 25.31146812438965, "learning_rate": 7.1041041041041045e-06, "loss": 0.5506, "step": 28930 }, { "epoch": 86.91, "grad_norm": 13.773992538452148, "learning_rate": 7.103103103103104e-06, "loss": 0.5195, "step": 28940 }, { "epoch": 86.94, "grad_norm": 20.223474502563477, "learning_rate": 7.102102102102103e-06, "loss": 0.4811, "step": 28950 }, { "epoch": 86.97, "grad_norm": 13.442011833190918, "learning_rate": 7.101101101101102e-06, "loss": 0.4472, "step": 28960 }, { "epoch": 87.0, "grad_norm": 18.693674087524414, "learning_rate": 7.100100100100101e-06, "loss": 0.5409, "step": 28970 }, { "epoch": 87.0, "eval_accuracy": 0.8664, "eval_loss": 0.4806751012802124, "eval_runtime": 12.7527, "eval_samples_per_second": 784.148, "eval_steps_per_second": 3.137, "step": 28971 }, { "epoch": 87.03, "grad_norm": 11.919361114501953, "learning_rate": 7.099099099099099e-06, "loss": 0.445, "step": 28980 }, { "epoch": 87.06, "grad_norm": 19.516067504882812, "learning_rate": 7.098098098098099e-06, "loss": 0.5561, "step": 28990 }, { "epoch": 87.09, "grad_norm": 13.301137924194336, "learning_rate": 7.097097097097097e-06, "loss": 0.443, "step": 29000 }, { "epoch": 87.12, "grad_norm": 17.93448829650879, "learning_rate": 7.096096096096097e-06, "loss": 0.4778, "step": 29010 }, { "epoch": 87.15, "grad_norm": 13.911527633666992, "learning_rate": 7.095095095095096e-06, "loss": 0.4956, "step": 29020 }, { "epoch": 87.18, "grad_norm": 19.80583953857422, "learning_rate": 7.094094094094095e-06, "loss": 0.4452, "step": 29030 }, { "epoch": 87.21, "grad_norm": 15.735163688659668, "learning_rate": 7.093093093093094e-06, "loss": 0.5091, "step": 29040 }, { "epoch": 87.24, "grad_norm": 28.821273803710938, "learning_rate": 7.092092092092092e-06, "loss": 0.5307, "step": 29050 }, { "epoch": 87.27, "grad_norm": 30.416255950927734, "learning_rate": 7.091091091091091e-06, "loss": 0.5255, "step": 29060 }, { "epoch": 87.3, "grad_norm": 15.77895450592041, "learning_rate": 7.090090090090091e-06, "loss": 0.4988, "step": 29070 }, { "epoch": 87.33, "grad_norm": 17.492530822753906, "learning_rate": 7.089089089089089e-06, "loss": 0.4559, "step": 29080 }, { "epoch": 87.36, "grad_norm": 18.281818389892578, "learning_rate": 7.088088088088089e-06, "loss": 0.4955, "step": 29090 }, { "epoch": 87.39, "grad_norm": 17.973648071289062, "learning_rate": 7.087087087087087e-06, "loss": 0.4519, "step": 29100 }, { "epoch": 87.42, "grad_norm": 13.263154029846191, "learning_rate": 7.086086086086087e-06, "loss": 0.4735, "step": 29110 }, { "epoch": 87.45, "grad_norm": 14.076695442199707, "learning_rate": 7.085085085085086e-06, "loss": 0.5984, "step": 29120 }, { "epoch": 87.48, "grad_norm": 11.15629768371582, "learning_rate": 7.084084084084085e-06, "loss": 0.5091, "step": 29130 }, { "epoch": 87.51, "grad_norm": 13.458136558532715, "learning_rate": 7.0830830830830835e-06, "loss": 0.5102, "step": 29140 }, { "epoch": 87.54, "grad_norm": 15.646551132202148, "learning_rate": 7.082082082082082e-06, "loss": 0.5237, "step": 29150 }, { "epoch": 87.57, "grad_norm": 16.076602935791016, "learning_rate": 7.0810810810810815e-06, "loss": 0.4742, "step": 29160 }, { "epoch": 87.6, "grad_norm": 17.216720581054688, "learning_rate": 7.080080080080081e-06, "loss": 0.4867, "step": 29170 }, { "epoch": 87.63, "grad_norm": 15.960564613342285, "learning_rate": 7.0790790790790796e-06, "loss": 0.526, "step": 29180 }, { "epoch": 87.66, "grad_norm": 12.131356239318848, "learning_rate": 7.078078078078079e-06, "loss": 0.5324, "step": 29190 }, { "epoch": 87.69, "grad_norm": 14.858342170715332, "learning_rate": 7.0770770770770784e-06, "loss": 0.4377, "step": 29200 }, { "epoch": 87.72, "grad_norm": 18.87102508544922, "learning_rate": 7.076076076076077e-06, "loss": 0.5009, "step": 29210 }, { "epoch": 87.75, "grad_norm": 17.45355987548828, "learning_rate": 7.075075075075075e-06, "loss": 0.5376, "step": 29220 }, { "epoch": 87.78, "grad_norm": 14.351983070373535, "learning_rate": 7.074074074074074e-06, "loss": 0.5263, "step": 29230 }, { "epoch": 87.81, "grad_norm": 11.981090545654297, "learning_rate": 7.073073073073074e-06, "loss": 0.4976, "step": 29240 }, { "epoch": 87.84, "grad_norm": 15.420083999633789, "learning_rate": 7.072072072072072e-06, "loss": 0.5039, "step": 29250 }, { "epoch": 87.87, "grad_norm": 13.468639373779297, "learning_rate": 7.071071071071072e-06, "loss": 0.4937, "step": 29260 }, { "epoch": 87.9, "grad_norm": 17.320049285888672, "learning_rate": 7.070070070070071e-06, "loss": 0.4758, "step": 29270 }, { "epoch": 87.93, "grad_norm": 15.816226959228516, "learning_rate": 7.06906906906907e-06, "loss": 0.4588, "step": 29280 }, { "epoch": 87.96, "grad_norm": 25.267274856567383, "learning_rate": 7.068068068068069e-06, "loss": 0.5345, "step": 29290 }, { "epoch": 87.99, "grad_norm": 17.635854721069336, "learning_rate": 7.067067067067067e-06, "loss": 0.4778, "step": 29300 }, { "epoch": 88.0, "eval_accuracy": 0.8677, "eval_loss": 0.475267231464386, "eval_runtime": 12.4175, "eval_samples_per_second": 805.314, "eval_steps_per_second": 3.221, "step": 29304 }, { "epoch": 88.02, "grad_norm": 19.624452590942383, "learning_rate": 7.066066066066066e-06, "loss": 0.5788, "step": 29310 }, { "epoch": 88.05, "grad_norm": 13.224385261535645, "learning_rate": 7.065065065065066e-06, "loss": 0.5006, "step": 29320 }, { "epoch": 88.08, "grad_norm": 11.519399642944336, "learning_rate": 7.064064064064064e-06, "loss": 0.4921, "step": 29330 }, { "epoch": 88.11, "grad_norm": 18.72711944580078, "learning_rate": 7.063063063063064e-06, "loss": 0.512, "step": 29340 }, { "epoch": 88.14, "grad_norm": 18.440258026123047, "learning_rate": 7.0620620620620625e-06, "loss": 0.4926, "step": 29350 }, { "epoch": 88.17, "grad_norm": 15.627878189086914, "learning_rate": 7.061061061061062e-06, "loss": 0.4521, "step": 29360 }, { "epoch": 88.2, "grad_norm": 17.182098388671875, "learning_rate": 7.060060060060061e-06, "loss": 0.4928, "step": 29370 }, { "epoch": 88.23, "grad_norm": 19.698566436767578, "learning_rate": 7.059059059059059e-06, "loss": 0.5339, "step": 29380 }, { "epoch": 88.26, "grad_norm": 17.014854431152344, "learning_rate": 7.0580580580580585e-06, "loss": 0.488, "step": 29390 }, { "epoch": 88.29, "grad_norm": 15.220260620117188, "learning_rate": 7.057057057057057e-06, "loss": 0.4544, "step": 29400 }, { "epoch": 88.32, "grad_norm": 14.810593605041504, "learning_rate": 7.0560560560560566e-06, "loss": 0.4522, "step": 29410 }, { "epoch": 88.35, "grad_norm": 18.512983322143555, "learning_rate": 7.055055055055056e-06, "loss": 0.5206, "step": 29420 }, { "epoch": 88.38, "grad_norm": 20.590789794921875, "learning_rate": 7.054054054054055e-06, "loss": 0.4482, "step": 29430 }, { "epoch": 88.41, "grad_norm": 12.691764831542969, "learning_rate": 7.053053053053054e-06, "loss": 0.459, "step": 29440 }, { "epoch": 88.44, "grad_norm": 14.093256950378418, "learning_rate": 7.052052052052053e-06, "loss": 0.4907, "step": 29450 }, { "epoch": 88.47, "grad_norm": 15.214815139770508, "learning_rate": 7.051051051051052e-06, "loss": 0.5184, "step": 29460 }, { "epoch": 88.5, "grad_norm": 12.073654174804688, "learning_rate": 7.05005005005005e-06, "loss": 0.5084, "step": 29470 }, { "epoch": 88.53, "grad_norm": 15.3258695602417, "learning_rate": 7.049049049049049e-06, "loss": 0.5069, "step": 29480 }, { "epoch": 88.56, "grad_norm": 16.83928680419922, "learning_rate": 7.048048048048049e-06, "loss": 0.5081, "step": 29490 }, { "epoch": 88.59, "grad_norm": 15.47718334197998, "learning_rate": 7.047047047047047e-06, "loss": 0.4896, "step": 29500 }, { "epoch": 88.62, "grad_norm": 11.8483247756958, "learning_rate": 7.046046046046047e-06, "loss": 0.5388, "step": 29510 }, { "epoch": 88.65, "grad_norm": 16.320606231689453, "learning_rate": 7.045045045045046e-06, "loss": 0.462, "step": 29520 }, { "epoch": 88.68, "grad_norm": 15.262286186218262, "learning_rate": 7.044044044044045e-06, "loss": 0.4784, "step": 29530 }, { "epoch": 88.71, "grad_norm": 12.162701606750488, "learning_rate": 7.043043043043044e-06, "loss": 0.5195, "step": 29540 }, { "epoch": 88.74, "grad_norm": 15.841780662536621, "learning_rate": 7.042042042042042e-06, "loss": 0.5207, "step": 29550 }, { "epoch": 88.77, "grad_norm": 13.804239273071289, "learning_rate": 7.041041041041041e-06, "loss": 0.5029, "step": 29560 }, { "epoch": 88.8, "grad_norm": 12.5025634765625, "learning_rate": 7.04004004004004e-06, "loss": 0.4648, "step": 29570 }, { "epoch": 88.83, "grad_norm": 15.818353652954102, "learning_rate": 7.0390390390390395e-06, "loss": 0.4804, "step": 29580 }, { "epoch": 88.86, "grad_norm": 13.170658111572266, "learning_rate": 7.038038038038039e-06, "loss": 0.493, "step": 29590 }, { "epoch": 88.89, "grad_norm": 12.559015274047852, "learning_rate": 7.0370370370370375e-06, "loss": 0.5041, "step": 29600 }, { "epoch": 88.92, "grad_norm": 10.675616264343262, "learning_rate": 7.036036036036037e-06, "loss": 0.5035, "step": 29610 }, { "epoch": 88.95, "grad_norm": 13.700398445129395, "learning_rate": 7.035035035035036e-06, "loss": 0.4693, "step": 29620 }, { "epoch": 88.98, "grad_norm": 20.062053680419922, "learning_rate": 7.034034034034034e-06, "loss": 0.508, "step": 29630 }, { "epoch": 89.0, "eval_accuracy": 0.867, "eval_loss": 0.4750248193740845, "eval_runtime": 12.8132, "eval_samples_per_second": 780.446, "eval_steps_per_second": 3.122, "step": 29637 }, { "epoch": 89.01, "grad_norm": 16.882631301879883, "learning_rate": 7.0330330330330336e-06, "loss": 0.3747, "step": 29640 }, { "epoch": 89.04, "grad_norm": 16.249284744262695, "learning_rate": 7.032032032032032e-06, "loss": 0.5164, "step": 29650 }, { "epoch": 89.07, "grad_norm": 26.293668746948242, "learning_rate": 7.031031031031032e-06, "loss": 0.4515, "step": 29660 }, { "epoch": 89.1, "grad_norm": 22.30623435974121, "learning_rate": 7.03003003003003e-06, "loss": 0.5377, "step": 29670 }, { "epoch": 89.13, "grad_norm": 19.941524505615234, "learning_rate": 7.02902902902903e-06, "loss": 0.5314, "step": 29680 }, { "epoch": 89.16, "grad_norm": 15.501513481140137, "learning_rate": 7.028028028028029e-06, "loss": 0.445, "step": 29690 }, { "epoch": 89.19, "grad_norm": 19.918249130249023, "learning_rate": 7.027027027027028e-06, "loss": 0.5079, "step": 29700 }, { "epoch": 89.22, "grad_norm": 13.900308609008789, "learning_rate": 7.026026026026027e-06, "loss": 0.4472, "step": 29710 }, { "epoch": 89.25, "grad_norm": 11.860645294189453, "learning_rate": 7.025025025025025e-06, "loss": 0.4591, "step": 29720 }, { "epoch": 89.28, "grad_norm": 15.179025650024414, "learning_rate": 7.024024024024024e-06, "loss": 0.5286, "step": 29730 }, { "epoch": 89.31, "grad_norm": 17.204662322998047, "learning_rate": 7.023023023023024e-06, "loss": 0.4999, "step": 29740 }, { "epoch": 89.34, "grad_norm": 12.837636947631836, "learning_rate": 7.022022022022022e-06, "loss": 0.4789, "step": 29750 }, { "epoch": 89.37, "grad_norm": 21.773744583129883, "learning_rate": 7.021021021021022e-06, "loss": 0.438, "step": 29760 }, { "epoch": 89.4, "grad_norm": 25.687503814697266, "learning_rate": 7.020020020020021e-06, "loss": 0.5505, "step": 29770 }, { "epoch": 89.43, "grad_norm": 19.49062728881836, "learning_rate": 7.01901901901902e-06, "loss": 0.4961, "step": 29780 }, { "epoch": 89.46, "grad_norm": 13.12169075012207, "learning_rate": 7.018018018018019e-06, "loss": 0.494, "step": 29790 }, { "epoch": 89.49, "grad_norm": 17.911029815673828, "learning_rate": 7.017017017017017e-06, "loss": 0.502, "step": 29800 }, { "epoch": 89.52, "grad_norm": 43.872135162353516, "learning_rate": 7.0160160160160164e-06, "loss": 0.5371, "step": 29810 }, { "epoch": 89.55, "grad_norm": 9.673543930053711, "learning_rate": 7.015015015015015e-06, "loss": 0.4873, "step": 29820 }, { "epoch": 89.58, "grad_norm": 14.422541618347168, "learning_rate": 7.0140140140140145e-06, "loss": 0.4758, "step": 29830 }, { "epoch": 89.61, "grad_norm": 11.973191261291504, "learning_rate": 7.013013013013014e-06, "loss": 0.4872, "step": 29840 }, { "epoch": 89.64, "grad_norm": 13.877923965454102, "learning_rate": 7.0120120120120125e-06, "loss": 0.4936, "step": 29850 }, { "epoch": 89.67, "grad_norm": 15.526756286621094, "learning_rate": 7.011011011011012e-06, "loss": 0.49, "step": 29860 }, { "epoch": 89.7, "grad_norm": 33.905311584472656, "learning_rate": 7.010010010010011e-06, "loss": 0.4935, "step": 29870 }, { "epoch": 89.73, "grad_norm": 16.815303802490234, "learning_rate": 7.009009009009009e-06, "loss": 0.5419, "step": 29880 }, { "epoch": 89.76, "grad_norm": 16.048723220825195, "learning_rate": 7.008008008008009e-06, "loss": 0.444, "step": 29890 }, { "epoch": 89.79, "grad_norm": 27.505531311035156, "learning_rate": 7.007007007007007e-06, "loss": 0.514, "step": 29900 }, { "epoch": 89.82, "grad_norm": 14.29491138458252, "learning_rate": 7.006006006006007e-06, "loss": 0.4577, "step": 29910 }, { "epoch": 89.85, "grad_norm": 13.674175262451172, "learning_rate": 7.005005005005005e-06, "loss": 0.498, "step": 29920 }, { "epoch": 89.88, "grad_norm": 16.51004981994629, "learning_rate": 7.004004004004005e-06, "loss": 0.5176, "step": 29930 }, { "epoch": 89.91, "grad_norm": 10.969858169555664, "learning_rate": 7.003003003003004e-06, "loss": 0.4456, "step": 29940 }, { "epoch": 89.94, "grad_norm": 15.421603202819824, "learning_rate": 7.002002002002003e-06, "loss": 0.4372, "step": 29950 }, { "epoch": 89.97, "grad_norm": 12.754996299743652, "learning_rate": 7.001001001001002e-06, "loss": 0.4877, "step": 29960 }, { "epoch": 90.0, "grad_norm": 64.5278549194336, "learning_rate": 7e-06, "loss": 0.4567, "step": 29970 }, { "epoch": 90.0, "eval_accuracy": 0.8681, "eval_loss": 0.48157060146331787, "eval_runtime": 12.9235, "eval_samples_per_second": 773.787, "eval_steps_per_second": 3.095, "step": 29970 }, { "epoch": 90.03, "grad_norm": 9.665785789489746, "learning_rate": 6.998998998998999e-06, "loss": 0.446, "step": 29980 }, { "epoch": 90.06, "grad_norm": 31.673931121826172, "learning_rate": 6.997997997997999e-06, "loss": 0.496, "step": 29990 }, { "epoch": 90.09, "grad_norm": 16.958635330200195, "learning_rate": 6.996996996996997e-06, "loss": 0.5747, "step": 30000 }, { "epoch": 90.12, "grad_norm": 16.371265411376953, "learning_rate": 6.995995995995997e-06, "loss": 0.5298, "step": 30010 }, { "epoch": 90.15, "grad_norm": 15.15768814086914, "learning_rate": 6.994994994994995e-06, "loss": 0.4442, "step": 30020 }, { "epoch": 90.18, "grad_norm": 16.892885208129883, "learning_rate": 6.993993993993995e-06, "loss": 0.4983, "step": 30030 }, { "epoch": 90.21, "grad_norm": 15.908172607421875, "learning_rate": 6.992992992992994e-06, "loss": 0.5088, "step": 30040 }, { "epoch": 90.24, "grad_norm": 23.386457443237305, "learning_rate": 6.991991991991992e-06, "loss": 0.4715, "step": 30050 }, { "epoch": 90.27, "grad_norm": 14.932313919067383, "learning_rate": 6.9909909909909915e-06, "loss": 0.499, "step": 30060 }, { "epoch": 90.3, "grad_norm": 16.232872009277344, "learning_rate": 6.98998998998999e-06, "loss": 0.514, "step": 30070 }, { "epoch": 90.33, "grad_norm": 10.346344947814941, "learning_rate": 6.9889889889889895e-06, "loss": 0.5141, "step": 30080 }, { "epoch": 90.36, "grad_norm": 13.837724685668945, "learning_rate": 6.987987987987989e-06, "loss": 0.5231, "step": 30090 }, { "epoch": 90.39, "grad_norm": 9.678596496582031, "learning_rate": 6.9869869869869876e-06, "loss": 0.4349, "step": 30100 }, { "epoch": 90.42, "grad_norm": 23.609298706054688, "learning_rate": 6.985985985985987e-06, "loss": 0.5431, "step": 30110 }, { "epoch": 90.45, "grad_norm": 17.669897079467773, "learning_rate": 6.984984984984986e-06, "loss": 0.4584, "step": 30120 }, { "epoch": 90.48, "grad_norm": 18.255775451660156, "learning_rate": 6.983983983983984e-06, "loss": 0.4486, "step": 30130 }, { "epoch": 90.51, "grad_norm": 17.63195037841797, "learning_rate": 6.982982982982983e-06, "loss": 0.5097, "step": 30140 }, { "epoch": 90.54, "grad_norm": 16.73892593383789, "learning_rate": 6.981981981981982e-06, "loss": 0.5431, "step": 30150 }, { "epoch": 90.57, "grad_norm": 14.955709457397461, "learning_rate": 6.980980980980982e-06, "loss": 0.4592, "step": 30160 }, { "epoch": 90.6, "grad_norm": 14.634506225585938, "learning_rate": 6.97997997997998e-06, "loss": 0.487, "step": 30170 }, { "epoch": 90.63, "grad_norm": 13.358802795410156, "learning_rate": 6.97897897897898e-06, "loss": 0.4685, "step": 30180 }, { "epoch": 90.66, "grad_norm": 12.058484077453613, "learning_rate": 6.977977977977979e-06, "loss": 0.5341, "step": 30190 }, { "epoch": 90.69, "grad_norm": 12.642533302307129, "learning_rate": 6.976976976976978e-06, "loss": 0.5201, "step": 30200 }, { "epoch": 90.72, "grad_norm": 12.306208610534668, "learning_rate": 6.975975975975977e-06, "loss": 0.4551, "step": 30210 }, { "epoch": 90.75, "grad_norm": 17.12994384765625, "learning_rate": 6.974974974974975e-06, "loss": 0.5189, "step": 30220 }, { "epoch": 90.78, "grad_norm": 14.606392860412598, "learning_rate": 6.973973973973974e-06, "loss": 0.509, "step": 30230 }, { "epoch": 90.81, "grad_norm": 18.8613224029541, "learning_rate": 6.972972972972973e-06, "loss": 0.5104, "step": 30240 }, { "epoch": 90.84, "grad_norm": 17.76389503479004, "learning_rate": 6.971971971971972e-06, "loss": 0.5088, "step": 30250 }, { "epoch": 90.87, "grad_norm": 13.460197448730469, "learning_rate": 6.970970970970972e-06, "loss": 0.4629, "step": 30260 }, { "epoch": 90.9, "grad_norm": 14.830193519592285, "learning_rate": 6.9699699699699704e-06, "loss": 0.5278, "step": 30270 }, { "epoch": 90.93, "grad_norm": 12.67893123626709, "learning_rate": 6.96896896896897e-06, "loss": 0.5304, "step": 30280 }, { "epoch": 90.96, "grad_norm": 17.25163459777832, "learning_rate": 6.967967967967969e-06, "loss": 0.5088, "step": 30290 }, { "epoch": 90.99, "grad_norm": 21.83803939819336, "learning_rate": 6.966966966966967e-06, "loss": 0.4828, "step": 30300 }, { "epoch": 91.0, "eval_accuracy": 0.8659, "eval_loss": 0.4806377589702606, "eval_runtime": 12.629, "eval_samples_per_second": 791.831, "eval_steps_per_second": 3.167, "step": 30303 }, { "epoch": 91.02, "grad_norm": 19.721609115600586, "learning_rate": 6.9659659659659665e-06, "loss": 0.4452, "step": 30310 }, { "epoch": 91.05, "grad_norm": 14.063167572021484, "learning_rate": 6.964964964964965e-06, "loss": 0.5178, "step": 30320 }, { "epoch": 91.08, "grad_norm": 17.104015350341797, "learning_rate": 6.9639639639639646e-06, "loss": 0.4676, "step": 30330 }, { "epoch": 91.11, "grad_norm": 16.848560333251953, "learning_rate": 6.962962962962964e-06, "loss": 0.4997, "step": 30340 }, { "epoch": 91.14, "grad_norm": 15.187195777893066, "learning_rate": 6.961961961961963e-06, "loss": 0.483, "step": 30350 }, { "epoch": 91.17, "grad_norm": 12.064476013183594, "learning_rate": 6.960960960960962e-06, "loss": 0.4512, "step": 30360 }, { "epoch": 91.2, "grad_norm": 12.98769760131836, "learning_rate": 6.959959959959961e-06, "loss": 0.4519, "step": 30370 }, { "epoch": 91.23, "grad_norm": 15.146438598632812, "learning_rate": 6.958958958958959e-06, "loss": 0.4664, "step": 30380 }, { "epoch": 91.26, "grad_norm": 18.251422882080078, "learning_rate": 6.957957957957958e-06, "loss": 0.4693, "step": 30390 }, { "epoch": 91.29, "grad_norm": 12.003494262695312, "learning_rate": 6.956956956956957e-06, "loss": 0.5161, "step": 30400 }, { "epoch": 91.32, "grad_norm": 15.278481483459473, "learning_rate": 6.955955955955957e-06, "loss": 0.4667, "step": 30410 }, { "epoch": 91.35, "grad_norm": 14.551234245300293, "learning_rate": 6.954954954954955e-06, "loss": 0.569, "step": 30420 }, { "epoch": 91.38, "grad_norm": 17.031883239746094, "learning_rate": 6.953953953953955e-06, "loss": 0.5276, "step": 30430 }, { "epoch": 91.41, "grad_norm": 13.356664657592773, "learning_rate": 6.952952952952954e-06, "loss": 0.4812, "step": 30440 }, { "epoch": 91.44, "grad_norm": 13.456093788146973, "learning_rate": 6.951951951951953e-06, "loss": 0.4337, "step": 30450 }, { "epoch": 91.47, "grad_norm": 13.668466567993164, "learning_rate": 6.950950950950952e-06, "loss": 0.5183, "step": 30460 }, { "epoch": 91.5, "grad_norm": 12.544217109680176, "learning_rate": 6.94994994994995e-06, "loss": 0.4868, "step": 30470 }, { "epoch": 91.53, "grad_norm": 14.428853988647461, "learning_rate": 6.948948948948949e-06, "loss": 0.4896, "step": 30480 }, { "epoch": 91.56, "grad_norm": 14.298720359802246, "learning_rate": 6.947947947947948e-06, "loss": 0.5876, "step": 30490 }, { "epoch": 91.59, "grad_norm": 19.13186264038086, "learning_rate": 6.9469469469469474e-06, "loss": 0.4919, "step": 30500 }, { "epoch": 91.62, "grad_norm": 13.007296562194824, "learning_rate": 6.945945945945947e-06, "loss": 0.498, "step": 30510 }, { "epoch": 91.65, "grad_norm": 15.038235664367676, "learning_rate": 6.9449449449449455e-06, "loss": 0.5064, "step": 30520 }, { "epoch": 91.68, "grad_norm": 15.917678833007812, "learning_rate": 6.943943943943945e-06, "loss": 0.534, "step": 30530 }, { "epoch": 91.71, "grad_norm": 20.900001525878906, "learning_rate": 6.942942942942944e-06, "loss": 0.4255, "step": 30540 }, { "epoch": 91.74, "grad_norm": 14.348830223083496, "learning_rate": 6.941941941941942e-06, "loss": 0.5043, "step": 30550 }, { "epoch": 91.77, "grad_norm": 20.730100631713867, "learning_rate": 6.9409409409409416e-06, "loss": 0.4839, "step": 30560 }, { "epoch": 91.8, "grad_norm": 20.62845230102539, "learning_rate": 6.93993993993994e-06, "loss": 0.4621, "step": 30570 }, { "epoch": 91.83, "grad_norm": 17.458580017089844, "learning_rate": 6.93893893893894e-06, "loss": 0.4559, "step": 30580 }, { "epoch": 91.86, "grad_norm": 16.647769927978516, "learning_rate": 6.937937937937938e-06, "loss": 0.5066, "step": 30590 }, { "epoch": 91.89, "grad_norm": 12.121013641357422, "learning_rate": 6.936936936936938e-06, "loss": 0.4387, "step": 30600 }, { "epoch": 91.92, "grad_norm": 17.93021011352539, "learning_rate": 6.935935935935937e-06, "loss": 0.4736, "step": 30610 }, { "epoch": 91.95, "grad_norm": 12.964322090148926, "learning_rate": 6.934934934934936e-06, "loss": 0.4667, "step": 30620 }, { "epoch": 91.98, "grad_norm": 11.97743034362793, "learning_rate": 6.933933933933934e-06, "loss": 0.4357, "step": 30630 }, { "epoch": 92.0, "eval_accuracy": 0.8676, "eval_loss": 0.4770067036151886, "eval_runtime": 12.9018, "eval_samples_per_second": 775.083, "eval_steps_per_second": 3.1, "step": 30636 }, { "epoch": 92.01, "grad_norm": 12.249384880065918, "learning_rate": 6.932932932932933e-06, "loss": 0.4796, "step": 30640 }, { "epoch": 92.04, "grad_norm": 17.532352447509766, "learning_rate": 6.931931931931932e-06, "loss": 0.4583, "step": 30650 }, { "epoch": 92.07, "grad_norm": 16.517398834228516, "learning_rate": 6.930930930930932e-06, "loss": 0.4828, "step": 30660 }, { "epoch": 92.1, "grad_norm": 14.836200714111328, "learning_rate": 6.92992992992993e-06, "loss": 0.4577, "step": 30670 }, { "epoch": 92.13, "grad_norm": 13.0523099899292, "learning_rate": 6.92892892892893e-06, "loss": 0.4332, "step": 30680 }, { "epoch": 92.16, "grad_norm": 19.937950134277344, "learning_rate": 6.927927927927928e-06, "loss": 0.543, "step": 30690 }, { "epoch": 92.19, "grad_norm": 13.427345275878906, "learning_rate": 6.926926926926928e-06, "loss": 0.4133, "step": 30700 }, { "epoch": 92.22, "grad_norm": 23.241262435913086, "learning_rate": 6.9259259259259256e-06, "loss": 0.4852, "step": 30710 }, { "epoch": 92.25, "grad_norm": 13.879694938659668, "learning_rate": 6.924924924924925e-06, "loss": 0.4679, "step": 30720 }, { "epoch": 92.28, "grad_norm": 10.753642082214355, "learning_rate": 6.9239239239239244e-06, "loss": 0.5127, "step": 30730 }, { "epoch": 92.31, "grad_norm": 24.71851921081543, "learning_rate": 6.922922922922923e-06, "loss": 0.4803, "step": 30740 }, { "epoch": 92.34, "grad_norm": 18.72035026550293, "learning_rate": 6.9219219219219225e-06, "loss": 0.4954, "step": 30750 }, { "epoch": 92.37, "grad_norm": 13.558321952819824, "learning_rate": 6.920920920920922e-06, "loss": 0.4936, "step": 30760 }, { "epoch": 92.4, "grad_norm": 16.504188537597656, "learning_rate": 6.9199199199199205e-06, "loss": 0.482, "step": 30770 }, { "epoch": 92.43, "grad_norm": 20.247203826904297, "learning_rate": 6.91891891891892e-06, "loss": 0.5022, "step": 30780 }, { "epoch": 92.46, "grad_norm": 10.014521598815918, "learning_rate": 6.917917917917919e-06, "loss": 0.4989, "step": 30790 }, { "epoch": 92.49, "grad_norm": 10.720829010009766, "learning_rate": 6.916916916916917e-06, "loss": 0.4706, "step": 30800 }, { "epoch": 92.52, "grad_norm": 20.679336547851562, "learning_rate": 6.915915915915916e-06, "loss": 0.4413, "step": 30810 }, { "epoch": 92.55, "grad_norm": 17.162893295288086, "learning_rate": 6.914914914914915e-06, "loss": 0.5152, "step": 30820 }, { "epoch": 92.58, "grad_norm": 13.337507247924805, "learning_rate": 6.913913913913915e-06, "loss": 0.4619, "step": 30830 }, { "epoch": 92.61, "grad_norm": 16.43746566772461, "learning_rate": 6.912912912912913e-06, "loss": 0.5025, "step": 30840 }, { "epoch": 92.64, "grad_norm": 16.886571884155273, "learning_rate": 6.911911911911913e-06, "loss": 0.4913, "step": 30850 }, { "epoch": 92.67, "grad_norm": 16.11522102355957, "learning_rate": 6.910910910910912e-06, "loss": 0.4888, "step": 30860 }, { "epoch": 92.7, "grad_norm": 19.795459747314453, "learning_rate": 6.909909909909911e-06, "loss": 0.5074, "step": 30870 }, { "epoch": 92.73, "grad_norm": 16.05385398864746, "learning_rate": 6.908908908908909e-06, "loss": 0.4741, "step": 30880 }, { "epoch": 92.76, "grad_norm": 19.977645874023438, "learning_rate": 6.907907907907908e-06, "loss": 0.4973, "step": 30890 }, { "epoch": 92.79, "grad_norm": 14.197464942932129, "learning_rate": 6.906906906906907e-06, "loss": 0.5066, "step": 30900 }, { "epoch": 92.82, "grad_norm": 15.407529830932617, "learning_rate": 6.905905905905907e-06, "loss": 0.465, "step": 30910 }, { "epoch": 92.85, "grad_norm": 17.440927505493164, "learning_rate": 6.904904904904905e-06, "loss": 0.4783, "step": 30920 }, { "epoch": 92.88, "grad_norm": 14.656294822692871, "learning_rate": 6.903903903903905e-06, "loss": 0.4681, "step": 30930 }, { "epoch": 92.91, "grad_norm": 13.552583694458008, "learning_rate": 6.902902902902903e-06, "loss": 0.453, "step": 30940 }, { "epoch": 92.94, "grad_norm": 20.369585037231445, "learning_rate": 6.901901901901903e-06, "loss": 0.5547, "step": 30950 }, { "epoch": 92.97, "grad_norm": 15.291086196899414, "learning_rate": 6.900900900900901e-06, "loss": 0.5117, "step": 30960 }, { "epoch": 93.0, "eval_accuracy": 0.8714, "eval_loss": 0.47406187653541565, "eval_runtime": 12.4967, "eval_samples_per_second": 800.208, "eval_steps_per_second": 3.201, "step": 30969 } ], "logging_steps": 10, "max_steps": 99900, "num_input_tokens_seen": 0, "num_train_epochs": 300, "save_steps": 500, "total_flos": 9.851232158788608e+19, "train_batch_size": 128, "trial_name": null, "trial_params": null }