{ "best_metric": 0.7225942611694336, "best_model_checkpoint": "/data/sora/Projects/safe-sora/outputs/cost/reward-harmlessness/checkpoint-1216", "epoch": 3.2, "eval_steps": 76, "global_step": 1216, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "step": 0, "train_accuracy": 0.4375 }, { "epoch": 0.002631578947368421, "grad_norm": 82.71702575683594, "learning_rate": 4.347826086956522e-07, "loss": 2.3203, "step": 1 }, { "epoch": 0.002631578947368421, "step": 1, "train_accuracy": 0.453125 }, { "epoch": 0.005263157894736842, "grad_norm": 33.55039978027344, "learning_rate": 8.695652173913044e-07, "loss": 2.1104, "step": 2 }, { "epoch": 0.005263157894736842, "step": 2, "train_accuracy": 0.40625 }, { "epoch": 0.007894736842105263, "grad_norm": 120.34129333496094, "learning_rate": 1.3043478260869566e-06, "loss": 2.4414, "step": 3 }, { "epoch": 0.007894736842105263, "step": 3, "train_accuracy": 0.3125 }, { "epoch": 0.010526315789473684, "grad_norm": 75.6687240600586, "learning_rate": 1.7391304347826088e-06, "loss": 2.4277, "step": 4 }, { "epoch": 0.010526315789473684, "step": 4, "train_accuracy": 0.46875 }, { "epoch": 0.013157894736842105, "grad_norm": 60.28215026855469, "learning_rate": 2.173913043478261e-06, "loss": 2.3184, "step": 5 }, { "epoch": 0.013157894736842105, "step": 5, "train_accuracy": 0.4375 }, { "epoch": 0.015789473684210527, "grad_norm": 86.08032989501953, "learning_rate": 2.6086956521739132e-06, "loss": 2.3105, "step": 6 }, { "epoch": 0.015789473684210527, "step": 6, "train_accuracy": 0.53125 }, { "epoch": 0.018421052631578946, "grad_norm": 243.07888793945312, "learning_rate": 3.043478260869566e-06, "loss": 2.499, "step": 7 }, { "epoch": 0.018421052631578946, "step": 7, "train_accuracy": 0.515625 }, { "epoch": 0.021052631578947368, "grad_norm": 176.44801330566406, "learning_rate": 3.4782608695652175e-06, "loss": 2.2891, "step": 8 }, { "epoch": 0.021052631578947368, "step": 8, "train_accuracy": 0.578125 }, { "epoch": 0.02368421052631579, "grad_norm": 28.586137771606445, "learning_rate": 3.91304347826087e-06, "loss": 2.1494, "step": 9 }, { "epoch": 0.02368421052631579, "step": 9, "train_accuracy": 0.59375 }, { "epoch": 0.02631578947368421, "grad_norm": 21.67037582397461, "learning_rate": 4.347826086956522e-06, "loss": 1.9541, "step": 10 }, { "epoch": 0.02631578947368421, "step": 10, "train_accuracy": 0.578125 }, { "epoch": 0.02894736842105263, "grad_norm": 111.27401733398438, "learning_rate": 4.782608695652174e-06, "loss": 2.1279, "step": 11 }, { "epoch": 0.02894736842105263, "step": 11, "train_accuracy": 0.640625 }, { "epoch": 0.031578947368421054, "grad_norm": 106.67390441894531, "learning_rate": 5.2173913043478265e-06, "loss": 2.0986, "step": 12 }, { "epoch": 0.031578947368421054, "step": 12, "train_accuracy": 0.59375 }, { "epoch": 0.034210526315789476, "grad_norm": 58.74238586425781, "learning_rate": 5.652173913043479e-06, "loss": 1.8252, "step": 13 }, { "epoch": 0.034210526315789476, "step": 13, "train_accuracy": 0.625 }, { "epoch": 0.03684210526315789, "grad_norm": 78.9658203125, "learning_rate": 6.086956521739132e-06, "loss": 1.8164, "step": 14 }, { "epoch": 0.03684210526315789, "step": 14, "train_accuracy": 0.625 }, { "epoch": 0.039473684210526314, "grad_norm": 8.593488693237305, "learning_rate": 6.521739130434783e-06, "loss": 1.7021, "step": 15 }, { "epoch": 0.039473684210526314, "step": 15, "train_accuracy": 0.65625 }, { "epoch": 0.042105263157894736, "grad_norm": 91.2029037475586, "learning_rate": 6.956521739130435e-06, "loss": 2.0029, "step": 16 }, { "epoch": 0.042105263157894736, "step": 16, "train_accuracy": 0.765625 }, { "epoch": 0.04473684210526316, "grad_norm": 67.36499786376953, "learning_rate": 7.391304347826087e-06, "loss": 1.6465, "step": 17 }, { "epoch": 0.04473684210526316, "step": 17, "train_accuracy": 0.625 }, { "epoch": 0.04736842105263158, "grad_norm": 105.28680419921875, "learning_rate": 7.82608695652174e-06, "loss": 1.7539, "step": 18 }, { "epoch": 0.04736842105263158, "step": 18, "train_accuracy": 0.640625 }, { "epoch": 0.05, "grad_norm": 9.923203468322754, "learning_rate": 8.260869565217392e-06, "loss": 1.7881, "step": 19 }, { "epoch": 0.05, "step": 19, "train_accuracy": 0.671875 }, { "epoch": 0.05263157894736842, "grad_norm": 58.246089935302734, "learning_rate": 8.695652173913044e-06, "loss": 1.877, "step": 20 }, { "epoch": 0.05263157894736842, "step": 20, "train_accuracy": 0.546875 }, { "epoch": 0.05526315789473684, "grad_norm": 41.38032913208008, "learning_rate": 9.130434782608697e-06, "loss": 1.8271, "step": 21 }, { "epoch": 0.05526315789473684, "step": 21, "train_accuracy": 0.640625 }, { "epoch": 0.05789473684210526, "grad_norm": 67.59843444824219, "learning_rate": 9.565217391304349e-06, "loss": 1.9014, "step": 22 }, { "epoch": 0.05789473684210526, "step": 22, "train_accuracy": 0.796875 }, { "epoch": 0.060526315789473685, "grad_norm": 19.102436065673828, "learning_rate": 1e-05, "loss": 1.3848, "step": 23 }, { "epoch": 0.060526315789473685, "step": 23, "train_accuracy": 0.546875 }, { "epoch": 0.06315789473684211, "grad_norm": 9.836153984069824, "learning_rate": 1.0434782608695653e-05, "loss": 1.7988, "step": 24 }, { "epoch": 0.06315789473684211, "step": 24, "train_accuracy": 0.59375 }, { "epoch": 0.06578947368421052, "grad_norm": 18.77242088317871, "learning_rate": 1.0869565217391305e-05, "loss": 1.5508, "step": 25 }, { "epoch": 0.06578947368421052, "step": 25, "train_accuracy": 0.6875 }, { "epoch": 0.06842105263157895, "grad_norm": 33.33855438232422, "learning_rate": 1.1304347826086957e-05, "loss": 1.7383, "step": 26 }, { "epoch": 0.06842105263157895, "step": 26, "train_accuracy": 0.78125 }, { "epoch": 0.07105263157894737, "grad_norm": 27.66554832458496, "learning_rate": 1.1739130434782611e-05, "loss": 1.5332, "step": 27 }, { "epoch": 0.07105263157894737, "step": 27, "train_accuracy": 0.578125 }, { "epoch": 0.07368421052631578, "grad_norm": 47.21514892578125, "learning_rate": 1.2173913043478263e-05, "loss": 1.7275, "step": 28 }, { "epoch": 0.07368421052631578, "step": 28, "train_accuracy": 0.640625 }, { "epoch": 0.07631578947368421, "grad_norm": 5.483724594116211, "learning_rate": 1.2608695652173915e-05, "loss": 1.6904, "step": 29 }, { "epoch": 0.07631578947368421, "step": 29, "train_accuracy": 0.59375 }, { "epoch": 0.07894736842105263, "grad_norm": 20.393482208251953, "learning_rate": 1.3043478260869566e-05, "loss": 1.6572, "step": 30 }, { "epoch": 0.07894736842105263, "step": 30, "train_accuracy": 0.640625 }, { "epoch": 0.08157894736842106, "grad_norm": 47.08179473876953, "learning_rate": 1.3478260869565218e-05, "loss": 1.8018, "step": 31 }, { "epoch": 0.08157894736842106, "step": 31, "train_accuracy": 0.71875 }, { "epoch": 0.08421052631578947, "grad_norm": 28.04905891418457, "learning_rate": 1.391304347826087e-05, "loss": 1.5576, "step": 32 }, { "epoch": 0.08421052631578947, "step": 32, "train_accuracy": 0.65625 }, { "epoch": 0.0868421052631579, "grad_norm": 8.057256698608398, "learning_rate": 1.4347826086956522e-05, "loss": 1.5527, "step": 33 }, { "epoch": 0.0868421052631579, "step": 33, "train_accuracy": 0.65625 }, { "epoch": 0.08947368421052632, "grad_norm": 4.107302665710449, "learning_rate": 1.4782608695652174e-05, "loss": 1.4678, "step": 34 }, { "epoch": 0.08947368421052632, "step": 34, "train_accuracy": 0.796875 }, { "epoch": 0.09210526315789473, "grad_norm": 24.317855834960938, "learning_rate": 1.5217391304347828e-05, "loss": 1.5107, "step": 35 }, { "epoch": 0.09210526315789473, "step": 35, "train_accuracy": 0.703125 }, { "epoch": 0.09473684210526316, "grad_norm": 56.14430236816406, "learning_rate": 1.565217391304348e-05, "loss": 1.7305, "step": 36 }, { "epoch": 0.09473684210526316, "step": 36, "train_accuracy": 0.75 }, { "epoch": 0.09736842105263158, "grad_norm": 5.203139305114746, "learning_rate": 1.6086956521739132e-05, "loss": 1.4668, "step": 37 }, { "epoch": 0.09736842105263158, "step": 37, "train_accuracy": 0.640625 }, { "epoch": 0.1, "grad_norm": 16.80422019958496, "learning_rate": 1.6521739130434785e-05, "loss": 1.4746, "step": 38 }, { "epoch": 0.1, "step": 38, "train_accuracy": 0.734375 }, { "epoch": 0.10263157894736842, "grad_norm": 32.82760238647461, "learning_rate": 1.6956521739130437e-05, "loss": 1.5713, "step": 39 }, { "epoch": 0.10263157894736842, "step": 39, "train_accuracy": 0.6875 }, { "epoch": 0.10526315789473684, "grad_norm": 34.93852996826172, "learning_rate": 1.739130434782609e-05, "loss": 1.7637, "step": 40 }, { "epoch": 0.10526315789473684, "step": 40, "train_accuracy": 0.734375 }, { "epoch": 0.10789473684210527, "grad_norm": 6.973869800567627, "learning_rate": 1.782608695652174e-05, "loss": 1.4702, "step": 41 }, { "epoch": 0.10789473684210527, "step": 41, "train_accuracy": 0.703125 }, { "epoch": 0.11052631578947368, "grad_norm": 9.916731834411621, "learning_rate": 1.8260869565217393e-05, "loss": 1.666, "step": 42 }, { "epoch": 0.11052631578947368, "step": 42, "train_accuracy": 0.640625 }, { "epoch": 0.11315789473684211, "grad_norm": 28.54862403869629, "learning_rate": 1.8695652173913045e-05, "loss": 1.7041, "step": 43 }, { "epoch": 0.11315789473684211, "step": 43, "train_accuracy": 0.734375 }, { "epoch": 0.11578947368421053, "grad_norm": 13.393380165100098, "learning_rate": 1.9130434782608697e-05, "loss": 1.4229, "step": 44 }, { "epoch": 0.11578947368421053, "step": 44, "train_accuracy": 0.71875 }, { "epoch": 0.11842105263157894, "grad_norm": 8.206025123596191, "learning_rate": 1.956521739130435e-05, "loss": 1.5791, "step": 45 }, { "epoch": 0.11842105263157894, "step": 45, "train_accuracy": 0.71875 }, { "epoch": 0.12105263157894737, "grad_norm": 19.74668312072754, "learning_rate": 2e-05, "loss": 1.5811, "step": 46 }, { "epoch": 0.12105263157894737, "step": 46, "train_accuracy": 0.609375 }, { "epoch": 0.12368421052631579, "grad_norm": 17.365779876708984, "learning_rate": 1.9999977286993863e-05, "loss": 1.6475, "step": 47 }, { "epoch": 0.12368421052631579, "step": 47, "train_accuracy": 0.734375 }, { "epoch": 0.12631578947368421, "grad_norm": 15.396943092346191, "learning_rate": 1.9999909148078624e-05, "loss": 1.5986, "step": 48 }, { "epoch": 0.12631578947368421, "step": 48, "train_accuracy": 0.6875 }, { "epoch": 0.12894736842105264, "grad_norm": 4.93903923034668, "learning_rate": 1.9999795583563814e-05, "loss": 1.4766, "step": 49 }, { "epoch": 0.12894736842105264, "step": 49, "train_accuracy": 0.734375 }, { "epoch": 0.13157894736842105, "grad_norm": 24.193723678588867, "learning_rate": 1.9999636593965306e-05, "loss": 1.5166, "step": 50 }, { "epoch": 0.13157894736842105, "step": 50, "train_accuracy": 0.78125 }, { "epoch": 0.13421052631578947, "grad_norm": 14.069121360778809, "learning_rate": 1.999943218000533e-05, "loss": 1.4102, "step": 51 }, { "epoch": 0.13421052631578947, "step": 51, "train_accuracy": 0.671875 }, { "epoch": 0.1368421052631579, "grad_norm": 6.808143615722656, "learning_rate": 1.999918234261246e-05, "loss": 1.6113, "step": 52 }, { "epoch": 0.1368421052631579, "step": 52, "train_accuracy": 0.765625 }, { "epoch": 0.1394736842105263, "grad_norm": 19.879804611206055, "learning_rate": 1.9998887082921605e-05, "loss": 1.5615, "step": 53 }, { "epoch": 0.1394736842105263, "step": 53, "train_accuracy": 0.71875 }, { "epoch": 0.14210526315789473, "grad_norm": 11.96185302734375, "learning_rate": 1.999854640227401e-05, "loss": 1.3799, "step": 54 }, { "epoch": 0.14210526315789473, "step": 54, "train_accuracy": 0.734375 }, { "epoch": 0.14473684210526316, "grad_norm": 15.333646774291992, "learning_rate": 1.9998160302217254e-05, "loss": 1.2549, "step": 55 }, { "epoch": 0.14473684210526316, "step": 55, "train_accuracy": 0.609375 }, { "epoch": 0.14736842105263157, "grad_norm": 17.457860946655273, "learning_rate": 1.9997728784505232e-05, "loss": 1.6514, "step": 56 }, { "epoch": 0.14736842105263157, "step": 56, "train_accuracy": 0.78125 }, { "epoch": 0.15, "grad_norm": 37.52303695678711, "learning_rate": 1.999725185109816e-05, "loss": 1.418, "step": 57 }, { "epoch": 0.15, "step": 57, "train_accuracy": 0.6875 }, { "epoch": 0.15263157894736842, "grad_norm": 15.378766059875488, "learning_rate": 1.999672950416256e-05, "loss": 1.5928, "step": 58 }, { "epoch": 0.15263157894736842, "step": 58, "train_accuracy": 0.640625 }, { "epoch": 0.15526315789473685, "grad_norm": 22.888885498046875, "learning_rate": 1.9996161746071238e-05, "loss": 1.3926, "step": 59 }, { "epoch": 0.15526315789473685, "step": 59, "train_accuracy": 0.6875 }, { "epoch": 0.15789473684210525, "grad_norm": 60.397743225097656, "learning_rate": 1.9995548579403296e-05, "loss": 2.126, "step": 60 }, { "epoch": 0.15789473684210525, "step": 60, "train_accuracy": 0.6875 }, { "epoch": 0.16052631578947368, "grad_norm": 5.263916969299316, "learning_rate": 1.9994890006944105e-05, "loss": 1.3906, "step": 61 }, { "epoch": 0.16052631578947368, "step": 61, "train_accuracy": 0.671875 }, { "epoch": 0.1631578947368421, "grad_norm": 24.444305419921875, "learning_rate": 1.99941860316853e-05, "loss": 1.4248, "step": 62 }, { "epoch": 0.1631578947368421, "step": 62, "train_accuracy": 0.640625 }, { "epoch": 0.16578947368421051, "grad_norm": 30.14607048034668, "learning_rate": 1.999343665682476e-05, "loss": 1.6807, "step": 63 }, { "epoch": 0.16578947368421051, "step": 63, "train_accuracy": 0.6875 }, { "epoch": 0.16842105263157894, "grad_norm": 5.177918910980225, "learning_rate": 1.999264188576659e-05, "loss": 1.582, "step": 64 }, { "epoch": 0.16842105263157894, "step": 64, "train_accuracy": 0.75 }, { "epoch": 0.17105263157894737, "grad_norm": 3.7263214588165283, "learning_rate": 1.9991801722121124e-05, "loss": 1.4702, "step": 65 }, { "epoch": 0.17105263157894737, "step": 65, "train_accuracy": 0.765625 }, { "epoch": 0.1736842105263158, "grad_norm": 14.105743408203125, "learning_rate": 1.9990916169704886e-05, "loss": 1.5986, "step": 66 }, { "epoch": 0.1736842105263158, "step": 66, "train_accuracy": 0.640625 }, { "epoch": 0.1763157894736842, "grad_norm": 30.84016990661621, "learning_rate": 1.9989985232540592e-05, "loss": 1.7539, "step": 67 }, { "epoch": 0.1763157894736842, "step": 67, "train_accuracy": 0.734375 }, { "epoch": 0.17894736842105263, "grad_norm": 5.946470737457275, "learning_rate": 1.9989008914857115e-05, "loss": 1.5361, "step": 68 }, { "epoch": 0.17894736842105263, "step": 68, "train_accuracy": 0.65625 }, { "epoch": 0.18157894736842106, "grad_norm": 21.31287384033203, "learning_rate": 1.998798722108948e-05, "loss": 1.4482, "step": 69 }, { "epoch": 0.18157894736842106, "step": 69, "train_accuracy": 0.71875 }, { "epoch": 0.18421052631578946, "grad_norm": 8.728236198425293, "learning_rate": 1.998692015587883e-05, "loss": 1.4541, "step": 70 }, { "epoch": 0.18421052631578946, "step": 70, "train_accuracy": 0.703125 }, { "epoch": 0.1868421052631579, "grad_norm": 24.142559051513672, "learning_rate": 1.998580772407242e-05, "loss": 1.458, "step": 71 }, { "epoch": 0.1868421052631579, "step": 71, "train_accuracy": 0.8125 }, { "epoch": 0.18947368421052632, "grad_norm": 8.70908260345459, "learning_rate": 1.9984649930723586e-05, "loss": 1.2969, "step": 72 }, { "epoch": 0.18947368421052632, "step": 72, "train_accuracy": 0.703125 }, { "epoch": 0.19210526315789472, "grad_norm": 18.371227264404297, "learning_rate": 1.9983446781091715e-05, "loss": 1.5898, "step": 73 }, { "epoch": 0.19210526315789472, "step": 73, "train_accuracy": 0.703125 }, { "epoch": 0.19473684210526315, "grad_norm": 21.964420318603516, "learning_rate": 1.9982198280642244e-05, "loss": 1.5381, "step": 74 }, { "epoch": 0.19473684210526315, "step": 74, "train_accuracy": 0.78125 }, { "epoch": 0.19736842105263158, "grad_norm": 11.249500274658203, "learning_rate": 1.9980904435046603e-05, "loss": 1.5513, "step": 75 }, { "epoch": 0.19736842105263158, "step": 75, "train_accuracy": 0.71875 }, { "epoch": 0.2, "grad_norm": 8.206984519958496, "learning_rate": 1.9979565250182228e-05, "loss": 1.5552, "step": 76 }, { "epoch": 0.2, "eval_accuracy": 0.6919280886650085, "eval_max_score": 6.5, "eval_min_score": -4.59375, "eval_runtime": 151.8642, "eval_samples_per_second": 18.681, "eval_steps_per_second": 0.296, "step": 76 }, { "epoch": 0.2, "step": 76, "train_accuracy": 0.703125 }, { "epoch": 0.2026315789473684, "grad_norm": 29.587980270385742, "learning_rate": 1.997818073213249e-05, "loss": 1.5024, "step": 77 }, { "epoch": 0.2026315789473684, "step": 77, "train_accuracy": 0.671875 }, { "epoch": 0.20526315789473684, "grad_norm": 10.721640586853027, "learning_rate": 1.9976750887186708e-05, "loss": 1.4014, "step": 78 }, { "epoch": 0.20526315789473684, "step": 78, "train_accuracy": 0.71875 }, { "epoch": 0.20789473684210527, "grad_norm": 8.728314399719238, "learning_rate": 1.9975275721840105e-05, "loss": 1.3784, "step": 79 }, { "epoch": 0.20789473684210527, "step": 79, "train_accuracy": 0.71875 }, { "epoch": 0.21052631578947367, "grad_norm": 7.245014667510986, "learning_rate": 1.9973755242793756e-05, "loss": 1.5894, "step": 80 }, { "epoch": 0.21052631578947367, "step": 80, "train_accuracy": 0.734375 }, { "epoch": 0.2131578947368421, "grad_norm": 4.6964850425720215, "learning_rate": 1.9972189456954595e-05, "loss": 1.4492, "step": 81 }, { "epoch": 0.2131578947368421, "step": 81, "train_accuracy": 0.75 }, { "epoch": 0.21578947368421053, "grad_norm": 5.349942207336426, "learning_rate": 1.9970578371435367e-05, "loss": 1.249, "step": 82 }, { "epoch": 0.21578947368421053, "step": 82, "train_accuracy": 0.625 }, { "epoch": 0.21842105263157896, "grad_norm": 4.20225715637207, "learning_rate": 1.996892199355459e-05, "loss": 1.541, "step": 83 }, { "epoch": 0.21842105263157896, "step": 83, "train_accuracy": 0.578125 }, { "epoch": 0.22105263157894736, "grad_norm": 13.42335033416748, "learning_rate": 1.996722033083652e-05, "loss": 1.4404, "step": 84 }, { "epoch": 0.22105263157894736, "step": 84, "train_accuracy": 0.75 }, { "epoch": 0.2236842105263158, "grad_norm": 2.8606903553009033, "learning_rate": 1.9965473391011144e-05, "loss": 1.4155, "step": 85 }, { "epoch": 0.2236842105263158, "step": 85, "train_accuracy": 0.6875 }, { "epoch": 0.22631578947368422, "grad_norm": 14.601333618164062, "learning_rate": 1.9963681182014107e-05, "loss": 1.3247, "step": 86 }, { "epoch": 0.22631578947368422, "step": 86, "train_accuracy": 0.765625 }, { "epoch": 0.22894736842105262, "grad_norm": 11.857205390930176, "learning_rate": 1.99618437119867e-05, "loss": 1.4092, "step": 87 }, { "epoch": 0.22894736842105262, "step": 87, "train_accuracy": 0.734375 }, { "epoch": 0.23157894736842105, "grad_norm": 3.8118960857391357, "learning_rate": 1.9959960989275816e-05, "loss": 1.2725, "step": 88 }, { "epoch": 0.23157894736842105, "step": 88, "train_accuracy": 0.703125 }, { "epoch": 0.23421052631578948, "grad_norm": 24.30862045288086, "learning_rate": 1.9958033022433916e-05, "loss": 1.4478, "step": 89 }, { "epoch": 0.23421052631578948, "step": 89, "train_accuracy": 0.71875 }, { "epoch": 0.23684210526315788, "grad_norm": 17.567001342773438, "learning_rate": 1.9956059820218982e-05, "loss": 1.415, "step": 90 }, { "epoch": 0.23684210526315788, "step": 90, "train_accuracy": 0.71875 }, { "epoch": 0.2394736842105263, "grad_norm": 5.111862659454346, "learning_rate": 1.9954041391594486e-05, "loss": 1.6006, "step": 91 }, { "epoch": 0.2394736842105263, "step": 91, "train_accuracy": 0.703125 }, { "epoch": 0.24210526315789474, "grad_norm": 19.393651962280273, "learning_rate": 1.9951977745729343e-05, "loss": 1.623, "step": 92 }, { "epoch": 0.24210526315789474, "step": 92, "train_accuracy": 0.734375 }, { "epoch": 0.24473684210526317, "grad_norm": 28.0972843170166, "learning_rate": 1.9949868891997877e-05, "loss": 1.8125, "step": 93 }, { "epoch": 0.24473684210526317, "step": 93, "train_accuracy": 0.71875 }, { "epoch": 0.24736842105263157, "grad_norm": 5.5879106521606445, "learning_rate": 1.9947714839979765e-05, "loss": 1.3486, "step": 94 }, { "epoch": 0.24736842105263157, "step": 94, "train_accuracy": 0.671875 }, { "epoch": 0.25, "grad_norm": 6.477169513702393, "learning_rate": 1.994551559946001e-05, "loss": 1.3794, "step": 95 }, { "epoch": 0.25, "step": 95, "train_accuracy": 0.71875 }, { "epoch": 0.25263157894736843, "grad_norm": 25.532249450683594, "learning_rate": 1.9943271180428883e-05, "loss": 1.4663, "step": 96 }, { "epoch": 0.25263157894736843, "step": 96, "train_accuracy": 0.71875 }, { "epoch": 0.25526315789473686, "grad_norm": 4.028436660766602, "learning_rate": 1.9940981593081884e-05, "loss": 1.4131, "step": 97 }, { "epoch": 0.25526315789473686, "step": 97, "train_accuracy": 0.734375 }, { "epoch": 0.2578947368421053, "grad_norm": 7.103154182434082, "learning_rate": 1.9938646847819693e-05, "loss": 1.208, "step": 98 }, { "epoch": 0.2578947368421053, "step": 98, "train_accuracy": 0.765625 }, { "epoch": 0.26052631578947366, "grad_norm": 18.36396026611328, "learning_rate": 1.9936266955248133e-05, "loss": 1.46, "step": 99 }, { "epoch": 0.26052631578947366, "step": 99, "train_accuracy": 0.71875 }, { "epoch": 0.2631578947368421, "grad_norm": 2.634291648864746, "learning_rate": 1.9933841926178104e-05, "loss": 1.4102, "step": 100 }, { "epoch": 0.2631578947368421, "step": 100, "train_accuracy": 0.84375 }, { "epoch": 0.2657894736842105, "grad_norm": 4.079046249389648, "learning_rate": 1.9931371771625545e-05, "loss": 1.1465, "step": 101 }, { "epoch": 0.2657894736842105, "step": 101, "train_accuracy": 0.71875 }, { "epoch": 0.26842105263157895, "grad_norm": 4.469728469848633, "learning_rate": 1.9928856502811383e-05, "loss": 1.3628, "step": 102 }, { "epoch": 0.26842105263157895, "step": 102, "train_accuracy": 0.640625 }, { "epoch": 0.2710526315789474, "grad_norm": 11.426676750183105, "learning_rate": 1.992629613116148e-05, "loss": 1.7158, "step": 103 }, { "epoch": 0.2710526315789474, "step": 103, "train_accuracy": 0.765625 }, { "epoch": 0.2736842105263158, "grad_norm": 4.418765544891357, "learning_rate": 1.992369066830659e-05, "loss": 1.4443, "step": 104 }, { "epoch": 0.2736842105263158, "step": 104, "train_accuracy": 0.75 }, { "epoch": 0.27631578947368424, "grad_norm": 7.475268840789795, "learning_rate": 1.992104012608228e-05, "loss": 1.5762, "step": 105 }, { "epoch": 0.27631578947368424, "step": 105, "train_accuracy": 0.640625 }, { "epoch": 0.2789473684210526, "grad_norm": 10.006223678588867, "learning_rate": 1.991834451652892e-05, "loss": 1.4473, "step": 106 }, { "epoch": 0.2789473684210526, "step": 106, "train_accuracy": 0.734375 }, { "epoch": 0.28157894736842104, "grad_norm": 4.488539695739746, "learning_rate": 1.9915603851891577e-05, "loss": 1.3716, "step": 107 }, { "epoch": 0.28157894736842104, "step": 107, "train_accuracy": 0.6875 }, { "epoch": 0.28421052631578947, "grad_norm": 31.295486450195312, "learning_rate": 1.991281814462001e-05, "loss": 1.6162, "step": 108 }, { "epoch": 0.28421052631578947, "step": 108, "train_accuracy": 0.75 }, { "epoch": 0.2868421052631579, "grad_norm": 4.2928290367126465, "learning_rate": 1.9909987407368565e-05, "loss": 1.2925, "step": 109 }, { "epoch": 0.2868421052631579, "step": 109, "train_accuracy": 0.609375 }, { "epoch": 0.2894736842105263, "grad_norm": 33.315330505371094, "learning_rate": 1.9907111652996156e-05, "loss": 1.6572, "step": 110 }, { "epoch": 0.2894736842105263, "step": 110, "train_accuracy": 0.75 }, { "epoch": 0.29210526315789476, "grad_norm": 24.655488967895508, "learning_rate": 1.9904190894566194e-05, "loss": 1.4414, "step": 111 }, { "epoch": 0.29210526315789476, "step": 111, "train_accuracy": 0.65625 }, { "epoch": 0.29473684210526313, "grad_norm": 15.197490692138672, "learning_rate": 1.990122514534651e-05, "loss": 1.522, "step": 112 }, { "epoch": 0.29473684210526313, "step": 112, "train_accuracy": 0.6875 }, { "epoch": 0.29736842105263156, "grad_norm": 28.140966415405273, "learning_rate": 1.989821441880933e-05, "loss": 1.688, "step": 113 }, { "epoch": 0.29736842105263156, "step": 113, "train_accuracy": 0.703125 }, { "epoch": 0.3, "grad_norm": 20.251832962036133, "learning_rate": 1.9895158728631176e-05, "loss": 1.4038, "step": 114 }, { "epoch": 0.3, "step": 114, "train_accuracy": 0.65625 }, { "epoch": 0.3026315789473684, "grad_norm": 4.522186279296875, "learning_rate": 1.9892058088692834e-05, "loss": 1.584, "step": 115 }, { "epoch": 0.3026315789473684, "step": 115, "train_accuracy": 0.75 }, { "epoch": 0.30526315789473685, "grad_norm": 31.17378044128418, "learning_rate": 1.9888912513079276e-05, "loss": 1.7578, "step": 116 }, { "epoch": 0.30526315789473685, "step": 116, "train_accuracy": 0.71875 }, { "epoch": 0.3078947368421053, "grad_norm": 25.056249618530273, "learning_rate": 1.9885722016079594e-05, "loss": 1.5654, "step": 117 }, { "epoch": 0.3078947368421053, "step": 117, "train_accuracy": 0.796875 }, { "epoch": 0.3105263157894737, "grad_norm": 10.085057258605957, "learning_rate": 1.9882486612186943e-05, "loss": 1.3105, "step": 118 }, { "epoch": 0.3105263157894737, "step": 118, "train_accuracy": 0.8125 }, { "epoch": 0.3131578947368421, "grad_norm": 11.358416557312012, "learning_rate": 1.9879206316098477e-05, "loss": 1.416, "step": 119 }, { "epoch": 0.3131578947368421, "step": 119, "train_accuracy": 0.734375 }, { "epoch": 0.3157894736842105, "grad_norm": 13.83195972442627, "learning_rate": 1.9875881142715272e-05, "loss": 1.3457, "step": 120 }, { "epoch": 0.3157894736842105, "step": 120, "train_accuracy": 0.71875 }, { "epoch": 0.31842105263157894, "grad_norm": 23.987102508544922, "learning_rate": 1.987251110714226e-05, "loss": 1.4287, "step": 121 }, { "epoch": 0.31842105263157894, "step": 121, "train_accuracy": 0.5625 }, { "epoch": 0.32105263157894737, "grad_norm": 19.9754581451416, "learning_rate": 1.986909622468818e-05, "loss": 1.6104, "step": 122 }, { "epoch": 0.32105263157894737, "step": 122, "train_accuracy": 0.609375 }, { "epoch": 0.3236842105263158, "grad_norm": 10.121444702148438, "learning_rate": 1.9865636510865466e-05, "loss": 1.3672, "step": 123 }, { "epoch": 0.3236842105263158, "step": 123, "train_accuracy": 0.71875 }, { "epoch": 0.3263157894736842, "grad_norm": 14.994632720947266, "learning_rate": 1.986213198139023e-05, "loss": 1.5693, "step": 124 }, { "epoch": 0.3263157894736842, "step": 124, "train_accuracy": 0.6875 }, { "epoch": 0.32894736842105265, "grad_norm": 19.378555297851562, "learning_rate": 1.9858582652182146e-05, "loss": 1.5337, "step": 125 }, { "epoch": 0.32894736842105265, "step": 125, "train_accuracy": 0.640625 }, { "epoch": 0.33157894736842103, "grad_norm": 27.150787353515625, "learning_rate": 1.9854988539364403e-05, "loss": 1.4336, "step": 126 }, { "epoch": 0.33157894736842103, "step": 126, "train_accuracy": 0.703125 }, { "epoch": 0.33421052631578946, "grad_norm": 6.9245100021362305, "learning_rate": 1.9851349659263624e-05, "loss": 1.5127, "step": 127 }, { "epoch": 0.33421052631578946, "step": 127, "train_accuracy": 0.71875 }, { "epoch": 0.3368421052631579, "grad_norm": 15.940958976745605, "learning_rate": 1.9847666028409787e-05, "loss": 1.3149, "step": 128 }, { "epoch": 0.3368421052631579, "step": 128, "train_accuracy": 0.671875 }, { "epoch": 0.3394736842105263, "grad_norm": 27.410593032836914, "learning_rate": 1.984393766353616e-05, "loss": 1.5811, "step": 129 }, { "epoch": 0.3394736842105263, "step": 129, "train_accuracy": 0.671875 }, { "epoch": 0.34210526315789475, "grad_norm": 19.065568923950195, "learning_rate": 1.9840164581579217e-05, "loss": 1.6299, "step": 130 }, { "epoch": 0.34210526315789475, "step": 130, "train_accuracy": 0.765625 }, { "epoch": 0.3447368421052632, "grad_norm": 3.3670198917388916, "learning_rate": 1.983634679967857e-05, "loss": 1.4824, "step": 131 }, { "epoch": 0.3447368421052632, "step": 131, "train_accuracy": 0.765625 }, { "epoch": 0.3473684210526316, "grad_norm": 20.537992477416992, "learning_rate": 1.9832484335176866e-05, "loss": 1.4312, "step": 132 }, { "epoch": 0.3473684210526316, "step": 132, "train_accuracy": 0.640625 }, { "epoch": 0.35, "grad_norm": 18.157855987548828, "learning_rate": 1.9828577205619757e-05, "loss": 1.3296, "step": 133 }, { "epoch": 0.35, "step": 133, "train_accuracy": 0.640625 }, { "epoch": 0.3526315789473684, "grad_norm": 10.166598320007324, "learning_rate": 1.982462542875576e-05, "loss": 1.3745, "step": 134 }, { "epoch": 0.3526315789473684, "step": 134, "train_accuracy": 0.71875 }, { "epoch": 0.35526315789473684, "grad_norm": 7.348676681518555, "learning_rate": 1.9820629022536234e-05, "loss": 1.416, "step": 135 }, { "epoch": 0.35526315789473684, "step": 135, "train_accuracy": 0.78125 }, { "epoch": 0.35789473684210527, "grad_norm": 5.667979717254639, "learning_rate": 1.9816588005115255e-05, "loss": 1.1729, "step": 136 }, { "epoch": 0.35789473684210527, "step": 136, "train_accuracy": 0.578125 }, { "epoch": 0.3605263157894737, "grad_norm": 18.713518142700195, "learning_rate": 1.9812502394849554e-05, "loss": 1.665, "step": 137 }, { "epoch": 0.3605263157894737, "step": 137, "train_accuracy": 0.65625 }, { "epoch": 0.3631578947368421, "grad_norm": 17.586336135864258, "learning_rate": 1.9808372210298425e-05, "loss": 1.645, "step": 138 }, { "epoch": 0.3631578947368421, "step": 138, "train_accuracy": 0.78125 }, { "epoch": 0.36578947368421055, "grad_norm": 4.807186603546143, "learning_rate": 1.980419747022365e-05, "loss": 1.4297, "step": 139 }, { "epoch": 0.36578947368421055, "step": 139, "train_accuracy": 0.765625 }, { "epoch": 0.3684210526315789, "grad_norm": 11.270689010620117, "learning_rate": 1.9799978193589407e-05, "loss": 1.5156, "step": 140 }, { "epoch": 0.3684210526315789, "step": 140, "train_accuracy": 0.765625 }, { "epoch": 0.37105263157894736, "grad_norm": 6.3195343017578125, "learning_rate": 1.9795714399562198e-05, "loss": 1.4634, "step": 141 }, { "epoch": 0.37105263157894736, "step": 141, "train_accuracy": 0.78125 }, { "epoch": 0.3736842105263158, "grad_norm": 8.104639053344727, "learning_rate": 1.979140610751073e-05, "loss": 1.2305, "step": 142 }, { "epoch": 0.3736842105263158, "step": 142, "train_accuracy": 0.625 }, { "epoch": 0.3763157894736842, "grad_norm": 6.707549571990967, "learning_rate": 1.9787053337005855e-05, "loss": 1.5098, "step": 143 }, { "epoch": 0.3763157894736842, "step": 143, "train_accuracy": 0.796875 }, { "epoch": 0.37894736842105264, "grad_norm": 4.008877277374268, "learning_rate": 1.9782656107820476e-05, "loss": 1.2354, "step": 144 }, { "epoch": 0.37894736842105264, "step": 144, "train_accuracy": 0.71875 }, { "epoch": 0.3815789473684211, "grad_norm": 16.74215316772461, "learning_rate": 1.9778214439929453e-05, "loss": 1.314, "step": 145 }, { "epoch": 0.3815789473684211, "step": 145, "train_accuracy": 0.71875 }, { "epoch": 0.38421052631578945, "grad_norm": 6.672449111938477, "learning_rate": 1.9773728353509512e-05, "loss": 1.3169, "step": 146 }, { "epoch": 0.38421052631578945, "step": 146, "train_accuracy": 0.8125 }, { "epoch": 0.3868421052631579, "grad_norm": 3.9568095207214355, "learning_rate": 1.9769197868939153e-05, "loss": 1.1396, "step": 147 }, { "epoch": 0.3868421052631579, "step": 147, "train_accuracy": 0.6875 }, { "epoch": 0.3894736842105263, "grad_norm": 3.1683335304260254, "learning_rate": 1.9764623006798554e-05, "loss": 1.3745, "step": 148 }, { "epoch": 0.3894736842105263, "step": 148, "train_accuracy": 0.78125 }, { "epoch": 0.39210526315789473, "grad_norm": 5.442292213439941, "learning_rate": 1.9760003787869504e-05, "loss": 1.4702, "step": 149 }, { "epoch": 0.39210526315789473, "step": 149, "train_accuracy": 0.65625 }, { "epoch": 0.39473684210526316, "grad_norm": 3.1763217449188232, "learning_rate": 1.9755340233135265e-05, "loss": 1.5659, "step": 150 }, { "epoch": 0.39473684210526316, "step": 150, "train_accuracy": 0.671875 }, { "epoch": 0.3973684210526316, "grad_norm": 9.710025787353516, "learning_rate": 1.9750632363780503e-05, "loss": 1.5723, "step": 151 }, { "epoch": 0.3973684210526316, "step": 151, "train_accuracy": 0.734375 }, { "epoch": 0.4, "grad_norm": 11.149826049804688, "learning_rate": 1.9745880201191198e-05, "loss": 1.6113, "step": 152 }, { "epoch": 0.4, "eval_accuracy": 0.6922805905342102, "eval_max_score": 6.78125, "eval_min_score": -4.21875, "eval_runtime": 151.353, "eval_samples_per_second": 18.744, "eval_steps_per_second": 0.297, "step": 152 }, { "epoch": 0.4, "step": 152, "train_accuracy": 0.6875 }, { "epoch": 0.4026315789473684, "grad_norm": 17.558015823364258, "learning_rate": 1.9741083766954527e-05, "loss": 1.4136, "step": 153 }, { "epoch": 0.4026315789473684, "step": 153, "train_accuracy": 0.71875 }, { "epoch": 0.4052631578947368, "grad_norm": 6.4166178703308105, "learning_rate": 1.9736243082858772e-05, "loss": 1.4238, "step": 154 }, { "epoch": 0.4052631578947368, "step": 154, "train_accuracy": 0.78125 }, { "epoch": 0.40789473684210525, "grad_norm": 16.045656204223633, "learning_rate": 1.973135817089324e-05, "loss": 1.3569, "step": 155 }, { "epoch": 0.40789473684210525, "step": 155, "train_accuracy": 0.609375 }, { "epoch": 0.4105263157894737, "grad_norm": 11.161890983581543, "learning_rate": 1.972642905324813e-05, "loss": 1.7246, "step": 156 }, { "epoch": 0.4105263157894737, "step": 156, "train_accuracy": 0.609375 }, { "epoch": 0.4131578947368421, "grad_norm": 10.395303726196289, "learning_rate": 1.9721455752314468e-05, "loss": 1.2827, "step": 157 }, { "epoch": 0.4131578947368421, "step": 157, "train_accuracy": 0.703125 }, { "epoch": 0.41578947368421054, "grad_norm": 6.473245143890381, "learning_rate": 1.9716438290683964e-05, "loss": 1.667, "step": 158 }, { "epoch": 0.41578947368421054, "step": 158, "train_accuracy": 0.6875 }, { "epoch": 0.41842105263157897, "grad_norm": 26.937177658081055, "learning_rate": 1.9711376691148958e-05, "loss": 1.5586, "step": 159 }, { "epoch": 0.41842105263157897, "step": 159, "train_accuracy": 0.703125 }, { "epoch": 0.42105263157894735, "grad_norm": 8.536986351013184, "learning_rate": 1.970627097670227e-05, "loss": 1.4272, "step": 160 }, { "epoch": 0.42105263157894735, "step": 160, "train_accuracy": 0.734375 }, { "epoch": 0.4236842105263158, "grad_norm": 6.249920845031738, "learning_rate": 1.9701121170537125e-05, "loss": 1.5605, "step": 161 }, { "epoch": 0.4236842105263158, "step": 161, "train_accuracy": 0.75 }, { "epoch": 0.4263157894736842, "grad_norm": 20.86353302001953, "learning_rate": 1.9695927296047044e-05, "loss": 1.3545, "step": 162 }, { "epoch": 0.4263157894736842, "step": 162, "train_accuracy": 0.625 }, { "epoch": 0.42894736842105263, "grad_norm": 18.73291778564453, "learning_rate": 1.969068937682572e-05, "loss": 1.5581, "step": 163 }, { "epoch": 0.42894736842105263, "step": 163, "train_accuracy": 0.75 }, { "epoch": 0.43157894736842106, "grad_norm": 3.9674601554870605, "learning_rate": 1.968540743666694e-05, "loss": 1.3638, "step": 164 }, { "epoch": 0.43157894736842106, "step": 164, "train_accuracy": 0.796875 }, { "epoch": 0.4342105263157895, "grad_norm": 10.880136489868164, "learning_rate": 1.9680081499564446e-05, "loss": 1.248, "step": 165 }, { "epoch": 0.4342105263157895, "step": 165, "train_accuracy": 0.6875 }, { "epoch": 0.4368421052631579, "grad_norm": 23.13611602783203, "learning_rate": 1.967471158971185e-05, "loss": 1.4531, "step": 166 }, { "epoch": 0.4368421052631579, "step": 166, "train_accuracy": 0.609375 }, { "epoch": 0.4394736842105263, "grad_norm": 8.650835990905762, "learning_rate": 1.966929773150251e-05, "loss": 1.3369, "step": 167 }, { "epoch": 0.4394736842105263, "step": 167, "train_accuracy": 0.765625 }, { "epoch": 0.4421052631578947, "grad_norm": 4.803102493286133, "learning_rate": 1.966383994952942e-05, "loss": 1.2661, "step": 168 }, { "epoch": 0.4421052631578947, "step": 168, "train_accuracy": 0.703125 }, { "epoch": 0.44473684210526315, "grad_norm": 16.57624053955078, "learning_rate": 1.9658338268585113e-05, "loss": 1.4502, "step": 169 }, { "epoch": 0.44473684210526315, "step": 169, "train_accuracy": 0.65625 }, { "epoch": 0.4473684210526316, "grad_norm": 11.971793174743652, "learning_rate": 1.965279271366153e-05, "loss": 1.3433, "step": 170 }, { "epoch": 0.4473684210526316, "step": 170, "train_accuracy": 0.65625 }, { "epoch": 0.45, "grad_norm": 5.445674419403076, "learning_rate": 1.9647203309949913e-05, "loss": 1.5737, "step": 171 }, { "epoch": 0.45, "step": 171, "train_accuracy": 0.6875 }, { "epoch": 0.45263157894736844, "grad_norm": 10.206953048706055, "learning_rate": 1.96415700828407e-05, "loss": 1.3804, "step": 172 }, { "epoch": 0.45263157894736844, "step": 172, "train_accuracy": 0.765625 }, { "epoch": 0.45526315789473687, "grad_norm": 11.522626876831055, "learning_rate": 1.963589305792339e-05, "loss": 1.2734, "step": 173 }, { "epoch": 0.45526315789473687, "step": 173, "train_accuracy": 0.65625 }, { "epoch": 0.45789473684210524, "grad_norm": 4.055401802062988, "learning_rate": 1.9630172260986447e-05, "loss": 1.4268, "step": 174 }, { "epoch": 0.45789473684210524, "step": 174, "train_accuracy": 0.71875 }, { "epoch": 0.4605263157894737, "grad_norm": 8.796987533569336, "learning_rate": 1.9624407718017165e-05, "loss": 1.3555, "step": 175 }, { "epoch": 0.4605263157894737, "step": 175, "train_accuracy": 0.734375 }, { "epoch": 0.4631578947368421, "grad_norm": 3.7710328102111816, "learning_rate": 1.961859945520157e-05, "loss": 1.2656, "step": 176 }, { "epoch": 0.4631578947368421, "step": 176, "train_accuracy": 0.734375 }, { "epoch": 0.46578947368421053, "grad_norm": 5.011978626251221, "learning_rate": 1.961274749892428e-05, "loss": 1.2974, "step": 177 }, { "epoch": 0.46578947368421053, "step": 177, "train_accuracy": 0.65625 }, { "epoch": 0.46842105263157896, "grad_norm": 6.948405742645264, "learning_rate": 1.9606851875768404e-05, "loss": 1.5273, "step": 178 }, { "epoch": 0.46842105263157896, "step": 178, "train_accuracy": 0.640625 }, { "epoch": 0.4710526315789474, "grad_norm": 14.704864501953125, "learning_rate": 1.96009126125154e-05, "loss": 1.7246, "step": 179 }, { "epoch": 0.4710526315789474, "step": 179, "train_accuracy": 0.640625 }, { "epoch": 0.47368421052631576, "grad_norm": 4.491285800933838, "learning_rate": 1.9594929736144978e-05, "loss": 1.4385, "step": 180 }, { "epoch": 0.47368421052631576, "step": 180, "train_accuracy": 0.65625 }, { "epoch": 0.4763157894736842, "grad_norm": 6.6391401290893555, "learning_rate": 1.9588903273834954e-05, "loss": 1.4521, "step": 181 }, { "epoch": 0.4763157894736842, "step": 181, "train_accuracy": 0.71875 }, { "epoch": 0.4789473684210526, "grad_norm": 8.603846549987793, "learning_rate": 1.9582833252961143e-05, "loss": 1.334, "step": 182 }, { "epoch": 0.4789473684210526, "step": 182, "train_accuracy": 0.625 }, { "epoch": 0.48157894736842105, "grad_norm": 5.131749153137207, "learning_rate": 1.9576719701097238e-05, "loss": 1.2568, "step": 183 }, { "epoch": 0.48157894736842105, "step": 183, "train_accuracy": 0.578125 }, { "epoch": 0.4842105263157895, "grad_norm": 4.089670658111572, "learning_rate": 1.957056264601466e-05, "loss": 1.394, "step": 184 }, { "epoch": 0.4842105263157895, "step": 184, "train_accuracy": 0.703125 }, { "epoch": 0.4868421052631579, "grad_norm": 2.8140037059783936, "learning_rate": 1.956436211568246e-05, "loss": 1.4531, "step": 185 }, { "epoch": 0.4868421052631579, "step": 185, "train_accuracy": 0.71875 }, { "epoch": 0.48947368421052634, "grad_norm": 19.234317779541016, "learning_rate": 1.9558118138267166e-05, "loss": 1.3413, "step": 186 }, { "epoch": 0.48947368421052634, "step": 186, "train_accuracy": 0.6875 }, { "epoch": 0.4921052631578947, "grad_norm": 3.8546056747436523, "learning_rate": 1.9551830742132684e-05, "loss": 1.4771, "step": 187 }, { "epoch": 0.4921052631578947, "step": 187, "train_accuracy": 0.671875 }, { "epoch": 0.49473684210526314, "grad_norm": 6.853395462036133, "learning_rate": 1.9545499955840145e-05, "loss": 1.3462, "step": 188 }, { "epoch": 0.49473684210526314, "step": 188, "train_accuracy": 0.703125 }, { "epoch": 0.49736842105263157, "grad_norm": 2.805311918258667, "learning_rate": 1.953912580814779e-05, "loss": 1.3135, "step": 189 }, { "epoch": 0.49736842105263157, "step": 189, "train_accuracy": 0.765625 }, { "epoch": 0.5, "grad_norm": 7.851466178894043, "learning_rate": 1.953270832801083e-05, "loss": 1.2954, "step": 190 }, { "epoch": 0.5, "step": 190, "train_accuracy": 0.71875 }, { "epoch": 0.5026315789473684, "grad_norm": 19.671802520751953, "learning_rate": 1.9526247544581315e-05, "loss": 1.3569, "step": 191 }, { "epoch": 0.5026315789473684, "step": 191, "train_accuracy": 0.59375 }, { "epoch": 0.5052631578947369, "grad_norm": 4.735298156738281, "learning_rate": 1.9519743487208008e-05, "loss": 1.4355, "step": 192 }, { "epoch": 0.5052631578947369, "step": 192, "train_accuracy": 0.8125 }, { "epoch": 0.5078947368421053, "grad_norm": 6.873162269592285, "learning_rate": 1.9513196185436248e-05, "loss": 1.439, "step": 193 }, { "epoch": 0.5078947368421053, "step": 193, "train_accuracy": 0.75 }, { "epoch": 0.5105263157894737, "grad_norm": 13.055280685424805, "learning_rate": 1.9506605669007815e-05, "loss": 1.584, "step": 194 }, { "epoch": 0.5105263157894737, "step": 194, "train_accuracy": 0.671875 }, { "epoch": 0.5131578947368421, "grad_norm": 6.923583030700684, "learning_rate": 1.94999719678608e-05, "loss": 1.4561, "step": 195 }, { "epoch": 0.5131578947368421, "step": 195, "train_accuracy": 0.703125 }, { "epoch": 0.5157894736842106, "grad_norm": 4.102768421173096, "learning_rate": 1.9493295112129464e-05, "loss": 1.6006, "step": 196 }, { "epoch": 0.5157894736842106, "step": 196, "train_accuracy": 0.6875 }, { "epoch": 0.5184210526315789, "grad_norm": 9.221837997436523, "learning_rate": 1.9486575132144095e-05, "loss": 1.4302, "step": 197 }, { "epoch": 0.5184210526315789, "step": 197, "train_accuracy": 0.71875 }, { "epoch": 0.5210526315789473, "grad_norm": 3.7125375270843506, "learning_rate": 1.9479812058430886e-05, "loss": 1.3184, "step": 198 }, { "epoch": 0.5210526315789473, "step": 198, "train_accuracy": 0.71875 }, { "epoch": 0.5236842105263158, "grad_norm": 6.4270243644714355, "learning_rate": 1.9473005921711778e-05, "loss": 1.3823, "step": 199 }, { "epoch": 0.5236842105263158, "step": 199, "train_accuracy": 0.6875 }, { "epoch": 0.5263157894736842, "grad_norm": 4.891445159912109, "learning_rate": 1.9466156752904344e-05, "loss": 1.4551, "step": 200 }, { "epoch": 0.5263157894736842, "step": 200, "train_accuracy": 0.859375 }, { "epoch": 0.5289473684210526, "grad_norm": 10.10925006866455, "learning_rate": 1.945926458312162e-05, "loss": 1.2012, "step": 201 }, { "epoch": 0.5289473684210526, "step": 201, "train_accuracy": 0.78125 }, { "epoch": 0.531578947368421, "grad_norm": 17.743282318115234, "learning_rate": 1.945232944367199e-05, "loss": 1.3965, "step": 202 }, { "epoch": 0.531578947368421, "step": 202, "train_accuracy": 0.671875 }, { "epoch": 0.5342105263157895, "grad_norm": 5.660726547241211, "learning_rate": 1.9445351366059025e-05, "loss": 1.3765, "step": 203 }, { "epoch": 0.5342105263157895, "step": 203, "train_accuracy": 0.6875 }, { "epoch": 0.5368421052631579, "grad_norm": 17.128097534179688, "learning_rate": 1.9438330381981348e-05, "loss": 1.291, "step": 204 }, { "epoch": 0.5368421052631579, "step": 204, "train_accuracy": 0.734375 }, { "epoch": 0.5394736842105263, "grad_norm": 5.735154151916504, "learning_rate": 1.9431266523332488e-05, "loss": 1.7031, "step": 205 }, { "epoch": 0.5394736842105263, "step": 205, "train_accuracy": 0.703125 }, { "epoch": 0.5421052631578948, "grad_norm": 8.40402603149414, "learning_rate": 1.9424159822200744e-05, "loss": 1.3916, "step": 206 }, { "epoch": 0.5421052631578948, "step": 206, "train_accuracy": 0.828125 }, { "epoch": 0.5447368421052632, "grad_norm": 8.66818618774414, "learning_rate": 1.941701031086902e-05, "loss": 1.3877, "step": 207 }, { "epoch": 0.5447368421052632, "step": 207, "train_accuracy": 0.6875 }, { "epoch": 0.5473684210526316, "grad_norm": 16.262784957885742, "learning_rate": 1.9409818021814698e-05, "loss": 1.4619, "step": 208 }, { "epoch": 0.5473684210526316, "step": 208, "train_accuracy": 0.796875 }, { "epoch": 0.55, "grad_norm": 2.998375177383423, "learning_rate": 1.9402582987709477e-05, "loss": 1.3452, "step": 209 }, { "epoch": 0.55, "step": 209, "train_accuracy": 0.8125 }, { "epoch": 0.5526315789473685, "grad_norm": 2.688655138015747, "learning_rate": 1.9395305241419234e-05, "loss": 1.3125, "step": 210 }, { "epoch": 0.5526315789473685, "step": 210, "train_accuracy": 0.71875 }, { "epoch": 0.5552631578947368, "grad_norm": 9.62772274017334, "learning_rate": 1.9387984816003868e-05, "loss": 1.3271, "step": 211 }, { "epoch": 0.5552631578947368, "step": 211, "train_accuracy": 0.640625 }, { "epoch": 0.5578947368421052, "grad_norm": 4.155248165130615, "learning_rate": 1.9380621744717144e-05, "loss": 1.3545, "step": 212 }, { "epoch": 0.5578947368421052, "step": 212, "train_accuracy": 0.65625 }, { "epoch": 0.5605263157894737, "grad_norm": 12.761597633361816, "learning_rate": 1.9373216061006576e-05, "loss": 1.4141, "step": 213 }, { "epoch": 0.5605263157894737, "step": 213, "train_accuracy": 0.703125 }, { "epoch": 0.5631578947368421, "grad_norm": 7.922176837921143, "learning_rate": 1.9365767798513216e-05, "loss": 1.2588, "step": 214 }, { "epoch": 0.5631578947368421, "step": 214, "train_accuracy": 0.765625 }, { "epoch": 0.5657894736842105, "grad_norm": 6.710723876953125, "learning_rate": 1.9358276991071556e-05, "loss": 1.3638, "step": 215 }, { "epoch": 0.5657894736842105, "step": 215, "train_accuracy": 0.734375 }, { "epoch": 0.5684210526315789, "grad_norm": 3.7787835597991943, "learning_rate": 1.935074367270935e-05, "loss": 1.2544, "step": 216 }, { "epoch": 0.5684210526315789, "step": 216, "train_accuracy": 0.671875 }, { "epoch": 0.5710526315789474, "grad_norm": 7.387803554534912, "learning_rate": 1.9343167877647457e-05, "loss": 1.3369, "step": 217 }, { "epoch": 0.5710526315789474, "step": 217, "train_accuracy": 0.875 }, { "epoch": 0.5736842105263158, "grad_norm": 3.0733940601348877, "learning_rate": 1.9335549640299688e-05, "loss": 1.2944, "step": 218 }, { "epoch": 0.5736842105263158, "step": 218, "train_accuracy": 0.734375 }, { "epoch": 0.5763157894736842, "grad_norm": 16.344900131225586, "learning_rate": 1.9327888995272667e-05, "loss": 1.1758, "step": 219 }, { "epoch": 0.5763157894736842, "step": 219, "train_accuracy": 0.71875 }, { "epoch": 0.5789473684210527, "grad_norm": 6.135222434997559, "learning_rate": 1.9320185977365643e-05, "loss": 1.3555, "step": 220 }, { "epoch": 0.5789473684210527, "step": 220, "train_accuracy": 0.75 }, { "epoch": 0.5815789473684211, "grad_norm": 5.324588298797607, "learning_rate": 1.9312440621570355e-05, "loss": 1.4307, "step": 221 }, { "epoch": 0.5815789473684211, "step": 221, "train_accuracy": 0.703125 }, { "epoch": 0.5842105263157895, "grad_norm": 4.523622512817383, "learning_rate": 1.9304652963070868e-05, "loss": 1.2983, "step": 222 }, { "epoch": 0.5842105263157895, "step": 222, "train_accuracy": 0.765625 }, { "epoch": 0.5868421052631579, "grad_norm": 4.101083278656006, "learning_rate": 1.9296823037243406e-05, "loss": 1.3501, "step": 223 }, { "epoch": 0.5868421052631579, "step": 223, "train_accuracy": 0.75 }, { "epoch": 0.5894736842105263, "grad_norm": 8.83919620513916, "learning_rate": 1.9288950879656205e-05, "loss": 1.2852, "step": 224 }, { "epoch": 0.5894736842105263, "step": 224, "train_accuracy": 0.65625 }, { "epoch": 0.5921052631578947, "grad_norm": 9.160242080688477, "learning_rate": 1.9281036526069333e-05, "loss": 1.3491, "step": 225 }, { "epoch": 0.5921052631578947, "step": 225, "train_accuracy": 0.71875 }, { "epoch": 0.5947368421052631, "grad_norm": 10.763931274414062, "learning_rate": 1.927308001243454e-05, "loss": 1.3047, "step": 226 }, { "epoch": 0.5947368421052631, "step": 226, "train_accuracy": 0.671875 }, { "epoch": 0.5973684210526315, "grad_norm": 9.965821266174316, "learning_rate": 1.92650813748951e-05, "loss": 1.4458, "step": 227 }, { "epoch": 0.5973684210526315, "step": 227, "train_accuracy": 0.71875 }, { "epoch": 0.6, "grad_norm": 3.940614938735962, "learning_rate": 1.9257040649785633e-05, "loss": 1.4888, "step": 228 }, { "epoch": 0.6, "eval_accuracy": 0.7053225040435791, "eval_max_score": 5.4375, "eval_min_score": -5.0, "eval_runtime": 151.3974, "eval_samples_per_second": 18.739, "eval_steps_per_second": 0.297, "step": 228 }, { "epoch": 0.6, "step": 228, "train_accuracy": 0.703125 }, { "epoch": 0.6026315789473684, "grad_norm": 12.34426498413086, "learning_rate": 1.9248957873631947e-05, "loss": 1.4258, "step": 229 }, { "epoch": 0.6026315789473684, "step": 229, "train_accuracy": 0.65625 }, { "epoch": 0.6052631578947368, "grad_norm": 13.6069917678833, "learning_rate": 1.9240833083150864e-05, "loss": 1.5156, "step": 230 }, { "epoch": 0.6052631578947368, "step": 230, "train_accuracy": 0.6875 }, { "epoch": 0.6078947368421053, "grad_norm": 4.457098484039307, "learning_rate": 1.9232666315250078e-05, "loss": 1.3008, "step": 231 }, { "epoch": 0.6078947368421053, "step": 231, "train_accuracy": 0.734375 }, { "epoch": 0.6105263157894737, "grad_norm": 6.309927463531494, "learning_rate": 1.922445760702795e-05, "loss": 1.3384, "step": 232 }, { "epoch": 0.6105263157894737, "step": 232, "train_accuracy": 0.796875 }, { "epoch": 0.6131578947368421, "grad_norm": 7.830813407897949, "learning_rate": 1.9216206995773373e-05, "loss": 1.2866, "step": 233 }, { "epoch": 0.6131578947368421, "step": 233, "train_accuracy": 0.671875 }, { "epoch": 0.6157894736842106, "grad_norm": 10.779923439025879, "learning_rate": 1.9207914518965585e-05, "loss": 1.4932, "step": 234 }, { "epoch": 0.6157894736842106, "step": 234, "train_accuracy": 0.640625 }, { "epoch": 0.618421052631579, "grad_norm": 2.9620766639709473, "learning_rate": 1.9199580214274e-05, "loss": 1.3242, "step": 235 }, { "epoch": 0.618421052631579, "step": 235, "train_accuracy": 0.71875 }, { "epoch": 0.6210526315789474, "grad_norm": 8.38760757446289, "learning_rate": 1.9191204119558034e-05, "loss": 1.3672, "step": 236 }, { "epoch": 0.6210526315789474, "step": 236, "train_accuracy": 0.703125 }, { "epoch": 0.6236842105263158, "grad_norm": 5.167934417724609, "learning_rate": 1.9182786272866955e-05, "loss": 1.3555, "step": 237 }, { "epoch": 0.6236842105263158, "step": 237, "train_accuracy": 0.75 }, { "epoch": 0.6263157894736842, "grad_norm": 12.282572746276855, "learning_rate": 1.9174326712439674e-05, "loss": 1.6484, "step": 238 }, { "epoch": 0.6263157894736842, "step": 238, "train_accuracy": 0.78125 }, { "epoch": 0.6289473684210526, "grad_norm": 14.087172508239746, "learning_rate": 1.916582547670461e-05, "loss": 1.4004, "step": 239 }, { "epoch": 0.6289473684210526, "step": 239, "train_accuracy": 0.703125 }, { "epoch": 0.631578947368421, "grad_norm": 3.8323986530303955, "learning_rate": 1.9157282604279482e-05, "loss": 1.3779, "step": 240 }, { "epoch": 0.631578947368421, "step": 240, "train_accuracy": 0.8125 }, { "epoch": 0.6342105263157894, "grad_norm": 9.426977157592773, "learning_rate": 1.9148698133971156e-05, "loss": 1.4463, "step": 241 }, { "epoch": 0.6342105263157894, "step": 241, "train_accuracy": 0.640625 }, { "epoch": 0.6368421052631579, "grad_norm": 6.784728527069092, "learning_rate": 1.914007210477545e-05, "loss": 1.3521, "step": 242 }, { "epoch": 0.6368421052631579, "step": 242, "train_accuracy": 0.734375 }, { "epoch": 0.6394736842105263, "grad_norm": 7.3046650886535645, "learning_rate": 1.913140455587698e-05, "loss": 1.4111, "step": 243 }, { "epoch": 0.6394736842105263, "step": 243, "train_accuracy": 0.6875 }, { "epoch": 0.6421052631578947, "grad_norm": 18.1398983001709, "learning_rate": 1.9122695526648968e-05, "loss": 1.5938, "step": 244 }, { "epoch": 0.6421052631578947, "step": 244, "train_accuracy": 0.734375 }, { "epoch": 0.6447368421052632, "grad_norm": 4.717808246612549, "learning_rate": 1.911394505665306e-05, "loss": 1.3379, "step": 245 }, { "epoch": 0.6447368421052632, "step": 245, "train_accuracy": 0.71875 }, { "epoch": 0.6473684210526316, "grad_norm": 5.125016212463379, "learning_rate": 1.9105153185639142e-05, "loss": 1.2539, "step": 246 }, { "epoch": 0.6473684210526316, "step": 246, "train_accuracy": 0.828125 }, { "epoch": 0.65, "grad_norm": 5.333531856536865, "learning_rate": 1.9096319953545186e-05, "loss": 1.374, "step": 247 }, { "epoch": 0.65, "step": 247, "train_accuracy": 0.703125 }, { "epoch": 0.6526315789473685, "grad_norm": 11.1005220413208, "learning_rate": 1.908744540049704e-05, "loss": 1.4688, "step": 248 }, { "epoch": 0.6526315789473685, "step": 248, "train_accuracy": 0.734375 }, { "epoch": 0.6552631578947369, "grad_norm": 18.011417388916016, "learning_rate": 1.9078529566808265e-05, "loss": 1.5732, "step": 249 }, { "epoch": 0.6552631578947369, "step": 249, "train_accuracy": 0.78125 }, { "epoch": 0.6578947368421053, "grad_norm": 9.390484809875488, "learning_rate": 1.9069572492979933e-05, "loss": 1.1738, "step": 250 }, { "epoch": 0.6578947368421053, "step": 250, "train_accuracy": 0.71875 }, { "epoch": 0.6605263157894737, "grad_norm": 21.22768211364746, "learning_rate": 1.906057421970046e-05, "loss": 1.5244, "step": 251 }, { "epoch": 0.6605263157894737, "step": 251, "train_accuracy": 0.703125 }, { "epoch": 0.6631578947368421, "grad_norm": 3.9320061206817627, "learning_rate": 1.9051534787845414e-05, "loss": 1.5396, "step": 252 }, { "epoch": 0.6631578947368421, "step": 252, "train_accuracy": 0.65625 }, { "epoch": 0.6657894736842105, "grad_norm": 16.053037643432617, "learning_rate": 1.9042454238477326e-05, "loss": 1.4902, "step": 253 }, { "epoch": 0.6657894736842105, "step": 253, "train_accuracy": 0.71875 }, { "epoch": 0.6684210526315789, "grad_norm": 3.7904529571533203, "learning_rate": 1.9033332612845516e-05, "loss": 1.3354, "step": 254 }, { "epoch": 0.6684210526315789, "step": 254, "train_accuracy": 0.625 }, { "epoch": 0.6710526315789473, "grad_norm": 3.9885382652282715, "learning_rate": 1.9024169952385887e-05, "loss": 1.5967, "step": 255 }, { "epoch": 0.6710526315789473, "step": 255, "train_accuracy": 0.734375 }, { "epoch": 0.6736842105263158, "grad_norm": 5.4799885749816895, "learning_rate": 1.9014966298720752e-05, "loss": 1.5703, "step": 256 }, { "epoch": 0.6736842105263158, "step": 256, "train_accuracy": 0.71875 }, { "epoch": 0.6763157894736842, "grad_norm": 3.2502963542938232, "learning_rate": 1.9005721693658642e-05, "loss": 1.2104, "step": 257 }, { "epoch": 0.6763157894736842, "step": 257, "train_accuracy": 0.578125 }, { "epoch": 0.6789473684210526, "grad_norm": 10.936656951904297, "learning_rate": 1.899643617919411e-05, "loss": 1.519, "step": 258 }, { "epoch": 0.6789473684210526, "step": 258, "train_accuracy": 0.796875 }, { "epoch": 0.6815789473684211, "grad_norm": 3.5721821784973145, "learning_rate": 1.898710979750755e-05, "loss": 1.3594, "step": 259 }, { "epoch": 0.6815789473684211, "step": 259, "train_accuracy": 0.765625 }, { "epoch": 0.6842105263157895, "grad_norm": 2.8655872344970703, "learning_rate": 1.8977742590964985e-05, "loss": 1.3838, "step": 260 }, { "epoch": 0.6842105263157895, "step": 260, "train_accuracy": 0.796875 }, { "epoch": 0.6868421052631579, "grad_norm": 4.112249374389648, "learning_rate": 1.8968334602117906e-05, "loss": 1.168, "step": 261 }, { "epoch": 0.6868421052631579, "step": 261, "train_accuracy": 0.71875 }, { "epoch": 0.6894736842105263, "grad_norm": 23.64423370361328, "learning_rate": 1.8958885873703055e-05, "loss": 1.5669, "step": 262 }, { "epoch": 0.6894736842105263, "step": 262, "train_accuracy": 0.765625 }, { "epoch": 0.6921052631578948, "grad_norm": 9.901530265808105, "learning_rate": 1.8949396448642233e-05, "loss": 1.1182, "step": 263 }, { "epoch": 0.6921052631578948, "step": 263, "train_accuracy": 0.734375 }, { "epoch": 0.6947368421052632, "grad_norm": 8.222607612609863, "learning_rate": 1.8939866370042116e-05, "loss": 1.4614, "step": 264 }, { "epoch": 0.6947368421052632, "step": 264, "train_accuracy": 0.796875 }, { "epoch": 0.6973684210526315, "grad_norm": 3.2499775886535645, "learning_rate": 1.8930295681194054e-05, "loss": 1.2705, "step": 265 }, { "epoch": 0.6973684210526315, "step": 265, "train_accuracy": 0.734375 }, { "epoch": 0.7, "grad_norm": 21.207712173461914, "learning_rate": 1.8920684425573865e-05, "loss": 1.4531, "step": 266 }, { "epoch": 0.7, "step": 266, "train_accuracy": 0.75 }, { "epoch": 0.7026315789473684, "grad_norm": 3.7196009159088135, "learning_rate": 1.8911032646841657e-05, "loss": 1.3164, "step": 267 }, { "epoch": 0.7026315789473684, "step": 267, "train_accuracy": 0.796875 }, { "epoch": 0.7052631578947368, "grad_norm": 3.2149417400360107, "learning_rate": 1.8901340388841602e-05, "loss": 1.251, "step": 268 }, { "epoch": 0.7052631578947368, "step": 268, "train_accuracy": 0.703125 }, { "epoch": 0.7078947368421052, "grad_norm": 5.346362113952637, "learning_rate": 1.889160769560177e-05, "loss": 1.5195, "step": 269 }, { "epoch": 0.7078947368421052, "step": 269, "train_accuracy": 0.734375 }, { "epoch": 0.7105263157894737, "grad_norm": 19.827333450317383, "learning_rate": 1.8881834611333906e-05, "loss": 1.3813, "step": 270 }, { "epoch": 0.7105263157894737, "step": 270, "train_accuracy": 0.671875 }, { "epoch": 0.7131578947368421, "grad_norm": 3.6186683177948, "learning_rate": 1.887202118043323e-05, "loss": 1.3633, "step": 271 }, { "epoch": 0.7131578947368421, "step": 271, "train_accuracy": 0.6875 }, { "epoch": 0.7157894736842105, "grad_norm": 7.749443531036377, "learning_rate": 1.886216744747825e-05, "loss": 1.3379, "step": 272 }, { "epoch": 0.7157894736842105, "step": 272, "train_accuracy": 0.734375 }, { "epoch": 0.718421052631579, "grad_norm": 18.11931610107422, "learning_rate": 1.885227345723054e-05, "loss": 1.7515, "step": 273 }, { "epoch": 0.718421052631579, "step": 273, "train_accuracy": 0.75 }, { "epoch": 0.7210526315789474, "grad_norm": 4.238664627075195, "learning_rate": 1.8842339254634558e-05, "loss": 1.3262, "step": 274 }, { "epoch": 0.7210526315789474, "step": 274, "train_accuracy": 0.8125 }, { "epoch": 0.7236842105263158, "grad_norm": 5.915544033050537, "learning_rate": 1.8832364884817424e-05, "loss": 1.3804, "step": 275 }, { "epoch": 0.7236842105263158, "step": 275, "train_accuracy": 0.765625 }, { "epoch": 0.7263157894736842, "grad_norm": 4.206833839416504, "learning_rate": 1.8822350393088717e-05, "loss": 1.1641, "step": 276 }, { "epoch": 0.7263157894736842, "step": 276, "train_accuracy": 0.703125 }, { "epoch": 0.7289473684210527, "grad_norm": 15.429242134094238, "learning_rate": 1.8812295824940284e-05, "loss": 1.3164, "step": 277 }, { "epoch": 0.7289473684210527, "step": 277, "train_accuracy": 0.71875 }, { "epoch": 0.7315789473684211, "grad_norm": 4.126450061798096, "learning_rate": 1.8802201226046023e-05, "loss": 1.208, "step": 278 }, { "epoch": 0.7315789473684211, "step": 278, "train_accuracy": 0.765625 }, { "epoch": 0.7342105263157894, "grad_norm": 5.984684944152832, "learning_rate": 1.879206664226166e-05, "loss": 1.333, "step": 279 }, { "epoch": 0.7342105263157894, "step": 279, "train_accuracy": 0.578125 }, { "epoch": 0.7368421052631579, "grad_norm": 3.0508310794830322, "learning_rate": 1.8781892119624578e-05, "loss": 1.499, "step": 280 }, { "epoch": 0.7368421052631579, "step": 280, "train_accuracy": 0.734375 }, { "epoch": 0.7394736842105263, "grad_norm": 5.437434673309326, "learning_rate": 1.877167770435357e-05, "loss": 1.3906, "step": 281 }, { "epoch": 0.7394736842105263, "step": 281, "train_accuracy": 0.71875 }, { "epoch": 0.7421052631578947, "grad_norm": 14.601706504821777, "learning_rate": 1.8761423442848655e-05, "loss": 1.4321, "step": 282 }, { "epoch": 0.7421052631578947, "step": 282, "train_accuracy": 0.703125 }, { "epoch": 0.7447368421052631, "grad_norm": 5.646981716156006, "learning_rate": 1.875112938169085e-05, "loss": 1.4238, "step": 283 }, { "epoch": 0.7447368421052631, "step": 283, "train_accuracy": 0.703125 }, { "epoch": 0.7473684210526316, "grad_norm": 3.7609148025512695, "learning_rate": 1.874079556764197e-05, "loss": 1.1875, "step": 284 }, { "epoch": 0.7473684210526316, "step": 284, "train_accuracy": 0.703125 }, { "epoch": 0.75, "grad_norm": 3.995249032974243, "learning_rate": 1.8730422047644417e-05, "loss": 1.5713, "step": 285 }, { "epoch": 0.75, "step": 285, "train_accuracy": 0.703125 }, { "epoch": 0.7526315789473684, "grad_norm": 13.330533027648926, "learning_rate": 1.8720008868820954e-05, "loss": 1.4902, "step": 286 }, { "epoch": 0.7526315789473684, "step": 286, "train_accuracy": 0.78125 }, { "epoch": 0.7552631578947369, "grad_norm": 9.846002578735352, "learning_rate": 1.8709556078474497e-05, "loss": 1.2695, "step": 287 }, { "epoch": 0.7552631578947369, "step": 287, "train_accuracy": 0.734375 }, { "epoch": 0.7578947368421053, "grad_norm": 2.9425392150878906, "learning_rate": 1.8699063724087905e-05, "loss": 1.1797, "step": 288 }, { "epoch": 0.7578947368421053, "step": 288, "train_accuracy": 0.625 }, { "epoch": 0.7605263157894737, "grad_norm": 9.827260971069336, "learning_rate": 1.868853185332376e-05, "loss": 1.5898, "step": 289 }, { "epoch": 0.7605263157894737, "step": 289, "train_accuracy": 0.71875 }, { "epoch": 0.7631578947368421, "grad_norm": 6.012570858001709, "learning_rate": 1.867796051402415e-05, "loss": 1.3945, "step": 290 }, { "epoch": 0.7631578947368421, "step": 290, "train_accuracy": 0.75 }, { "epoch": 0.7657894736842106, "grad_norm": 12.465781211853027, "learning_rate": 1.8667349754210456e-05, "loss": 1.1934, "step": 291 }, { "epoch": 0.7657894736842106, "step": 291, "train_accuracy": 0.75 }, { "epoch": 0.7684210526315789, "grad_norm": 11.289711952209473, "learning_rate": 1.865669962208313e-05, "loss": 1.2749, "step": 292 }, { "epoch": 0.7684210526315789, "step": 292, "train_accuracy": 0.671875 }, { "epoch": 0.7710526315789473, "grad_norm": 9.718608856201172, "learning_rate": 1.864601016602147e-05, "loss": 1.3511, "step": 293 }, { "epoch": 0.7710526315789473, "step": 293, "train_accuracy": 0.6875 }, { "epoch": 0.7736842105263158, "grad_norm": 14.851384162902832, "learning_rate": 1.863528143458342e-05, "loss": 1.3115, "step": 294 }, { "epoch": 0.7736842105263158, "step": 294, "train_accuracy": 0.578125 }, { "epoch": 0.7763157894736842, "grad_norm": 27.674755096435547, "learning_rate": 1.8624513476505316e-05, "loss": 1.9219, "step": 295 }, { "epoch": 0.7763157894736842, "step": 295, "train_accuracy": 0.671875 }, { "epoch": 0.7789473684210526, "grad_norm": 3.874488115310669, "learning_rate": 1.861370634070171e-05, "loss": 1.5938, "step": 296 }, { "epoch": 0.7789473684210526, "step": 296, "train_accuracy": 0.671875 }, { "epoch": 0.781578947368421, "grad_norm": 12.267851829528809, "learning_rate": 1.8602860076265107e-05, "loss": 1.2485, "step": 297 }, { "epoch": 0.781578947368421, "step": 297, "train_accuracy": 0.75 }, { "epoch": 0.7842105263157895, "grad_norm": 10.122313499450684, "learning_rate": 1.859197473246576e-05, "loss": 1.4756, "step": 298 }, { "epoch": 0.7842105263157895, "step": 298, "train_accuracy": 0.6875 }, { "epoch": 0.7868421052631579, "grad_norm": 11.50515079498291, "learning_rate": 1.8581050358751444e-05, "loss": 1.2676, "step": 299 }, { "epoch": 0.7868421052631579, "step": 299, "train_accuracy": 0.765625 }, { "epoch": 0.7894736842105263, "grad_norm": 12.21772289276123, "learning_rate": 1.857008700474723e-05, "loss": 1.1899, "step": 300 }, { "epoch": 0.7894736842105263, "step": 300, "train_accuracy": 0.71875 }, { "epoch": 0.7921052631578948, "grad_norm": 8.307805061340332, "learning_rate": 1.8559084720255276e-05, "loss": 1.418, "step": 301 }, { "epoch": 0.7921052631578948, "step": 301, "train_accuracy": 0.640625 }, { "epoch": 0.7947368421052632, "grad_norm": 4.6956610679626465, "learning_rate": 1.8548043555254556e-05, "loss": 1.4775, "step": 302 }, { "epoch": 0.7947368421052632, "step": 302, "train_accuracy": 0.703125 }, { "epoch": 0.7973684210526316, "grad_norm": 11.48013687133789, "learning_rate": 1.853696355990069e-05, "loss": 1.665, "step": 303 }, { "epoch": 0.7973684210526316, "step": 303, "train_accuracy": 0.828125 }, { "epoch": 0.8, "grad_norm": 15.654217720031738, "learning_rate": 1.852584478452568e-05, "loss": 1.2144, "step": 304 }, { "epoch": 0.8, "eval_accuracy": 0.7014451622962952, "eval_max_score": 4.8125, "eval_min_score": -5.15625, "eval_runtime": 151.0436, "eval_samples_per_second": 18.783, "eval_steps_per_second": 0.298, "step": 304 }, { "epoch": 0.8, "step": 304, "train_accuracy": 0.6875 }, { "epoch": 0.8026315789473685, "grad_norm": 6.716573238372803, "learning_rate": 1.8514687279637677e-05, "loss": 1.1758, "step": 305 }, { "epoch": 0.8026315789473685, "step": 305, "train_accuracy": 0.765625 }, { "epoch": 0.8052631578947368, "grad_norm": 12.827051162719727, "learning_rate": 1.8503491095920788e-05, "loss": 1.5117, "step": 306 }, { "epoch": 0.8052631578947368, "step": 306, "train_accuracy": 0.78125 }, { "epoch": 0.8078947368421052, "grad_norm": 15.274651527404785, "learning_rate": 1.849225628423481e-05, "loss": 1.377, "step": 307 }, { "epoch": 0.8078947368421052, "step": 307, "train_accuracy": 0.703125 }, { "epoch": 0.8105263157894737, "grad_norm": 12.7550048828125, "learning_rate": 1.8480982895615005e-05, "loss": 1.2598, "step": 308 }, { "epoch": 0.8105263157894737, "step": 308, "train_accuracy": 0.59375 }, { "epoch": 0.8131578947368421, "grad_norm": 12.460929870605469, "learning_rate": 1.846967098127189e-05, "loss": 1.3872, "step": 309 }, { "epoch": 0.8131578947368421, "step": 309, "train_accuracy": 0.640625 }, { "epoch": 0.8157894736842105, "grad_norm": 21.332063674926758, "learning_rate": 1.8458320592590976e-05, "loss": 1.4429, "step": 310 }, { "epoch": 0.8157894736842105, "step": 310, "train_accuracy": 0.734375 }, { "epoch": 0.8184210526315789, "grad_norm": 27.87535285949707, "learning_rate": 1.8446931781132553e-05, "loss": 1.7881, "step": 311 }, { "epoch": 0.8184210526315789, "step": 311, "train_accuracy": 0.671875 }, { "epoch": 0.8210526315789474, "grad_norm": 3.5377941131591797, "learning_rate": 1.843550459863145e-05, "loss": 1.417, "step": 312 }, { "epoch": 0.8210526315789474, "step": 312, "train_accuracy": 0.5625 }, { "epoch": 0.8236842105263158, "grad_norm": 21.97355842590332, "learning_rate": 1.8424039096996804e-05, "loss": 1.6807, "step": 313 }, { "epoch": 0.8236842105263158, "step": 313, "train_accuracy": 0.6875 }, { "epoch": 0.8263157894736842, "grad_norm": 27.53736686706543, "learning_rate": 1.8412535328311813e-05, "loss": 1.5273, "step": 314 }, { "epoch": 0.8263157894736842, "step": 314, "train_accuracy": 0.671875 }, { "epoch": 0.8289473684210527, "grad_norm": 15.879623413085938, "learning_rate": 1.8400993344833513e-05, "loss": 1.3857, "step": 315 }, { "epoch": 0.8289473684210527, "step": 315, "train_accuracy": 0.734375 }, { "epoch": 0.8315789473684211, "grad_norm": 4.012557029724121, "learning_rate": 1.8389413198992528e-05, "loss": 1.4648, "step": 316 }, { "epoch": 0.8315789473684211, "step": 316, "train_accuracy": 0.78125 }, { "epoch": 0.8342105263157895, "grad_norm": 11.08835220336914, "learning_rate": 1.8377794943392848e-05, "loss": 1.3896, "step": 317 }, { "epoch": 0.8342105263157895, "step": 317, "train_accuracy": 0.765625 }, { "epoch": 0.8368421052631579, "grad_norm": 24.538671493530273, "learning_rate": 1.8366138630811573e-05, "loss": 1.4434, "step": 318 }, { "epoch": 0.8368421052631579, "step": 318, "train_accuracy": 0.625 }, { "epoch": 0.8394736842105263, "grad_norm": 29.274654388427734, "learning_rate": 1.835444431419868e-05, "loss": 1.7217, "step": 319 }, { "epoch": 0.8394736842105263, "step": 319, "train_accuracy": 0.6875 }, { "epoch": 0.8421052631578947, "grad_norm": 12.992687225341797, "learning_rate": 1.834271204667679e-05, "loss": 1.459, "step": 320 }, { "epoch": 0.8421052631578947, "step": 320, "train_accuracy": 0.78125 }, { "epoch": 0.8447368421052631, "grad_norm": 14.843409538269043, "learning_rate": 1.8330941881540917e-05, "loss": 1.4219, "step": 321 }, { "epoch": 0.8447368421052631, "step": 321, "train_accuracy": 0.71875 }, { "epoch": 0.8473684210526315, "grad_norm": 35.8585319519043, "learning_rate": 1.8319133872258224e-05, "loss": 1.7939, "step": 322 }, { "epoch": 0.8473684210526315, "step": 322, "train_accuracy": 0.609375 }, { "epoch": 0.85, "grad_norm": 30.27836036682129, "learning_rate": 1.830728807246779e-05, "loss": 1.4526, "step": 323 }, { "epoch": 0.85, "step": 323, "train_accuracy": 0.71875 }, { "epoch": 0.8526315789473684, "grad_norm": 13.607982635498047, "learning_rate": 1.8295404535980357e-05, "loss": 1.4277, "step": 324 }, { "epoch": 0.8526315789473684, "step": 324, "train_accuracy": 0.640625 }, { "epoch": 0.8552631578947368, "grad_norm": 2.9818201065063477, "learning_rate": 1.8283483316778097e-05, "loss": 1.2144, "step": 325 }, { "epoch": 0.8552631578947368, "step": 325, "train_accuracy": 0.625 }, { "epoch": 0.8578947368421053, "grad_norm": 26.039756774902344, "learning_rate": 1.827152446901435e-05, "loss": 1.6626, "step": 326 }, { "epoch": 0.8578947368421053, "step": 326, "train_accuracy": 0.78125 }, { "epoch": 0.8605263157894737, "grad_norm": 34.77094268798828, "learning_rate": 1.82595280470134e-05, "loss": 1.8359, "step": 327 }, { "epoch": 0.8605263157894737, "step": 327, "train_accuracy": 0.703125 }, { "epoch": 0.8631578947368421, "grad_norm": 22.246620178222656, "learning_rate": 1.8247494105270198e-05, "loss": 1.5605, "step": 328 }, { "epoch": 0.8631578947368421, "step": 328, "train_accuracy": 0.703125 }, { "epoch": 0.8657894736842106, "grad_norm": 3.848651885986328, "learning_rate": 1.8235422698450153e-05, "loss": 1.3169, "step": 329 }, { "epoch": 0.8657894736842106, "step": 329, "train_accuracy": 0.671875 }, { "epoch": 0.868421052631579, "grad_norm": 15.153877258300781, "learning_rate": 1.8223313881388845e-05, "loss": 1.5996, "step": 330 }, { "epoch": 0.868421052631579, "step": 330, "train_accuracy": 0.6875 }, { "epoch": 0.8710526315789474, "grad_norm": 9.871405601501465, "learning_rate": 1.8211167709091805e-05, "loss": 1.2954, "step": 331 }, { "epoch": 0.8710526315789474, "step": 331, "train_accuracy": 0.703125 }, { "epoch": 0.8736842105263158, "grad_norm": 30.780324935913086, "learning_rate": 1.8198984236734246e-05, "loss": 1.7251, "step": 332 }, { "epoch": 0.8736842105263158, "step": 332, "train_accuracy": 0.765625 }, { "epoch": 0.8763157894736842, "grad_norm": 21.378108978271484, "learning_rate": 1.818676351966083e-05, "loss": 1.5273, "step": 333 }, { "epoch": 0.8763157894736842, "step": 333, "train_accuracy": 0.640625 }, { "epoch": 0.8789473684210526, "grad_norm": 5.893415451049805, "learning_rate": 1.81745056133854e-05, "loss": 1.3877, "step": 334 }, { "epoch": 0.8789473684210526, "step": 334, "train_accuracy": 0.734375 }, { "epoch": 0.881578947368421, "grad_norm": 12.588119506835938, "learning_rate": 1.8162210573590733e-05, "loss": 1.4844, "step": 335 }, { "epoch": 0.881578947368421, "step": 335, "train_accuracy": 0.78125 }, { "epoch": 0.8842105263157894, "grad_norm": 19.006074905395508, "learning_rate": 1.8149878456128296e-05, "loss": 1.4629, "step": 336 }, { "epoch": 0.8842105263157894, "step": 336, "train_accuracy": 0.75 }, { "epoch": 0.8868421052631579, "grad_norm": 15.575494766235352, "learning_rate": 1.8137509317017976e-05, "loss": 1.5322, "step": 337 }, { "epoch": 0.8868421052631579, "step": 337, "train_accuracy": 0.53125 }, { "epoch": 0.8894736842105263, "grad_norm": 8.873583793640137, "learning_rate": 1.8125103212447842e-05, "loss": 1.6191, "step": 338 }, { "epoch": 0.8894736842105263, "step": 338, "train_accuracy": 0.703125 }, { "epoch": 0.8921052631578947, "grad_norm": 3.71246337890625, "learning_rate": 1.8112660198773883e-05, "loss": 1.4995, "step": 339 }, { "epoch": 0.8921052631578947, "step": 339, "train_accuracy": 0.75 }, { "epoch": 0.8947368421052632, "grad_norm": 10.060272216796875, "learning_rate": 1.8100180332519746e-05, "loss": 1.2642, "step": 340 }, { "epoch": 0.8947368421052632, "step": 340, "train_accuracy": 0.671875 }, { "epoch": 0.8973684210526316, "grad_norm": 15.318142890930176, "learning_rate": 1.8087663670376483e-05, "loss": 1.5645, "step": 341 }, { "epoch": 0.8973684210526316, "step": 341, "train_accuracy": 0.640625 }, { "epoch": 0.9, "grad_norm": 18.948949813842773, "learning_rate": 1.80751102692023e-05, "loss": 1.6025, "step": 342 }, { "epoch": 0.9, "step": 342, "train_accuracy": 0.796875 }, { "epoch": 0.9026315789473685, "grad_norm": 5.428231716156006, "learning_rate": 1.80625201860223e-05, "loss": 1.4634, "step": 343 }, { "epoch": 0.9026315789473685, "step": 343, "train_accuracy": 0.78125 }, { "epoch": 0.9052631578947369, "grad_norm": 4.9712934494018555, "learning_rate": 1.8049893478028203e-05, "loss": 1.5166, "step": 344 }, { "epoch": 0.9052631578947369, "step": 344, "train_accuracy": 0.828125 }, { "epoch": 0.9078947368421053, "grad_norm": 15.7850341796875, "learning_rate": 1.803723020257811e-05, "loss": 1.4326, "step": 345 }, { "epoch": 0.9078947368421053, "step": 345, "train_accuracy": 0.6875 }, { "epoch": 0.9105263157894737, "grad_norm": 14.186681747436523, "learning_rate": 1.8024530417196228e-05, "loss": 1.5503, "step": 346 }, { "epoch": 0.9105263157894737, "step": 346, "train_accuracy": 0.65625 }, { "epoch": 0.9131578947368421, "grad_norm": 2.5712296962738037, "learning_rate": 1.8011794179572628e-05, "loss": 1.2954, "step": 347 }, { "epoch": 0.9131578947368421, "step": 347, "train_accuracy": 0.71875 }, { "epoch": 0.9157894736842105, "grad_norm": 3.0300934314727783, "learning_rate": 1.7999021547562943e-05, "loss": 1.4614, "step": 348 }, { "epoch": 0.9157894736842105, "step": 348, "train_accuracy": 0.71875 }, { "epoch": 0.9184210526315789, "grad_norm": 11.72201156616211, "learning_rate": 1.7986212579188163e-05, "loss": 1.312, "step": 349 }, { "epoch": 0.9184210526315789, "step": 349, "train_accuracy": 0.6875 }, { "epoch": 0.9210526315789473, "grad_norm": 2.3370659351348877, "learning_rate": 1.7973367332634314e-05, "loss": 1.3076, "step": 350 }, { "epoch": 0.9210526315789473, "step": 350, "train_accuracy": 0.703125 }, { "epoch": 0.9236842105263158, "grad_norm": 7.042386531829834, "learning_rate": 1.796048586625223e-05, "loss": 1.2827, "step": 351 }, { "epoch": 0.9236842105263158, "step": 351, "train_accuracy": 0.671875 }, { "epoch": 0.9263157894736842, "grad_norm": 3.6882100105285645, "learning_rate": 1.7947568238557282e-05, "loss": 1.6367, "step": 352 }, { "epoch": 0.9263157894736842, "step": 352, "train_accuracy": 0.671875 }, { "epoch": 0.9289473684210526, "grad_norm": 2.304743766784668, "learning_rate": 1.793461450822909e-05, "loss": 1.3926, "step": 353 }, { "epoch": 0.9289473684210526, "step": 353, "train_accuracy": 0.75 }, { "epoch": 0.9315789473684211, "grad_norm": 8.389480590820312, "learning_rate": 1.7921624734111292e-05, "loss": 1.3057, "step": 354 }, { "epoch": 0.9315789473684211, "step": 354, "train_accuracy": 0.8125 }, { "epoch": 0.9342105263157895, "grad_norm": 7.985996246337891, "learning_rate": 1.7908598975211256e-05, "loss": 1.3267, "step": 355 }, { "epoch": 0.9342105263157895, "step": 355, "train_accuracy": 0.59375 }, { "epoch": 0.9368421052631579, "grad_norm": 9.184244155883789, "learning_rate": 1.7895537290699806e-05, "loss": 1.4209, "step": 356 }, { "epoch": 0.9368421052631579, "step": 356, "train_accuracy": 0.796875 }, { "epoch": 0.9394736842105263, "grad_norm": 13.210733413696289, "learning_rate": 1.7882439739910964e-05, "loss": 1.3384, "step": 357 }, { "epoch": 0.9394736842105263, "step": 357, "train_accuracy": 0.71875 }, { "epoch": 0.9421052631578948, "grad_norm": 7.19890832901001, "learning_rate": 1.7869306382341682e-05, "loss": 1.4727, "step": 358 }, { "epoch": 0.9421052631578948, "step": 358, "train_accuracy": 0.703125 }, { "epoch": 0.9447368421052632, "grad_norm": 4.545177936553955, "learning_rate": 1.7856137277651567e-05, "loss": 1.395, "step": 359 }, { "epoch": 0.9447368421052632, "step": 359, "train_accuracy": 0.703125 }, { "epoch": 0.9473684210526315, "grad_norm": 14.59473991394043, "learning_rate": 1.784293248566261e-05, "loss": 1.3652, "step": 360 }, { "epoch": 0.9473684210526315, "step": 360, "train_accuracy": 0.703125 }, { "epoch": 0.95, "grad_norm": 3.9617745876312256, "learning_rate": 1.7829692066358914e-05, "loss": 1.2046, "step": 361 }, { "epoch": 0.95, "step": 361, "train_accuracy": 0.71875 }, { "epoch": 0.9526315789473684, "grad_norm": 12.78911018371582, "learning_rate": 1.7816416079886427e-05, "loss": 1.4561, "step": 362 }, { "epoch": 0.9526315789473684, "step": 362, "train_accuracy": 0.75 }, { "epoch": 0.9552631578947368, "grad_norm": 2.799234628677368, "learning_rate": 1.780310458655266e-05, "loss": 1.2793, "step": 363 }, { "epoch": 0.9552631578947368, "step": 363, "train_accuracy": 0.6875 }, { "epoch": 0.9578947368421052, "grad_norm": 4.567415714263916, "learning_rate": 1.7789757646826416e-05, "loss": 1.5762, "step": 364 }, { "epoch": 0.9578947368421052, "step": 364, "train_accuracy": 0.765625 }, { "epoch": 0.9605263157894737, "grad_norm": 2.880171775817871, "learning_rate": 1.7776375321337523e-05, "loss": 1.4688, "step": 365 }, { "epoch": 0.9605263157894737, "step": 365, "train_accuracy": 0.71875 }, { "epoch": 0.9631578947368421, "grad_norm": 4.474959373474121, "learning_rate": 1.7762957670876547e-05, "loss": 1.25, "step": 366 }, { "epoch": 0.9631578947368421, "step": 366, "train_accuracy": 0.671875 }, { "epoch": 0.9657894736842105, "grad_norm": 4.31421422958374, "learning_rate": 1.7749504756394528e-05, "loss": 1.5176, "step": 367 }, { "epoch": 0.9657894736842105, "step": 367, "train_accuracy": 0.625 }, { "epoch": 0.968421052631579, "grad_norm": 3.9677927494049072, "learning_rate": 1.7736016639002683e-05, "loss": 1.3076, "step": 368 }, { "epoch": 0.968421052631579, "step": 368, "train_accuracy": 0.671875 }, { "epoch": 0.9710526315789474, "grad_norm": 2.26666522026062, "learning_rate": 1.7722493379972163e-05, "loss": 1.4307, "step": 369 }, { "epoch": 0.9710526315789474, "step": 369, "train_accuracy": 0.703125 }, { "epoch": 0.9736842105263158, "grad_norm": 6.098262310028076, "learning_rate": 1.770893504073373e-05, "loss": 1.4111, "step": 370 }, { "epoch": 0.9736842105263158, "step": 370, "train_accuracy": 0.734375 }, { "epoch": 0.9763157894736842, "grad_norm": 8.281463623046875, "learning_rate": 1.769534168287752e-05, "loss": 1.4824, "step": 371 }, { "epoch": 0.9763157894736842, "step": 371, "train_accuracy": 0.6875 }, { "epoch": 0.9789473684210527, "grad_norm": 6.954835891723633, "learning_rate": 1.7681713368152733e-05, "loss": 1.4072, "step": 372 }, { "epoch": 0.9789473684210527, "step": 372, "train_accuracy": 0.6875 }, { "epoch": 0.9815789473684211, "grad_norm": 5.460451126098633, "learning_rate": 1.7668050158467367e-05, "loss": 1.5469, "step": 373 }, { "epoch": 0.9815789473684211, "step": 373, "train_accuracy": 0.765625 }, { "epoch": 0.9842105263157894, "grad_norm": 2.4143896102905273, "learning_rate": 1.765435211588794e-05, "loss": 1.2954, "step": 374 }, { "epoch": 0.9842105263157894, "step": 374, "train_accuracy": 0.71875 }, { "epoch": 0.9868421052631579, "grad_norm": 3.317214012145996, "learning_rate": 1.7640619302639194e-05, "loss": 1.2959, "step": 375 }, { "epoch": 0.9868421052631579, "step": 375, "train_accuracy": 0.65625 }, { "epoch": 0.9894736842105263, "grad_norm": 10.918193817138672, "learning_rate": 1.762685178110382e-05, "loss": 1.5464, "step": 376 }, { "epoch": 0.9894736842105263, "step": 376, "train_accuracy": 0.734375 }, { "epoch": 0.9921052631578947, "grad_norm": 5.174256801605225, "learning_rate": 1.7613049613822188e-05, "loss": 1.1704, "step": 377 }, { "epoch": 0.9921052631578947, "step": 377, "train_accuracy": 0.65625 }, { "epoch": 0.9947368421052631, "grad_norm": 4.161032199859619, "learning_rate": 1.759921286349203e-05, "loss": 1.2808, "step": 378 }, { "epoch": 0.9947368421052631, "step": 378, "train_accuracy": 0.625 }, { "epoch": 0.9973684210526316, "grad_norm": 8.024152755737305, "learning_rate": 1.7585341592968188e-05, "loss": 1.1694, "step": 379 }, { "epoch": 0.9973684210526316, "step": 379, "train_accuracy": 0.65625 }, { "epoch": 1.0, "grad_norm": 6.739095687866211, "learning_rate": 1.7571435865262314e-05, "loss": 1.3779, "step": 380 }, { "epoch": 1.0, "eval_accuracy": 0.7010927200317383, "eval_max_score": 5.375, "eval_min_score": -5.71875, "eval_runtime": 151.0289, "eval_samples_per_second": 18.784, "eval_steps_per_second": 0.298, "step": 380 }, { "epoch": 1.0, "step": 380, "train_accuracy": 0.828125 }, { "epoch": 1.0026315789473683, "grad_norm": 4.438934326171875, "learning_rate": 1.7557495743542586e-05, "loss": 1.0659, "step": 381 }, { "epoch": 1.0026315789473683, "step": 381, "train_accuracy": 0.796875 }, { "epoch": 1.0052631578947369, "grad_norm": 8.325281143188477, "learning_rate": 1.7543521291133413e-05, "loss": 1.2104, "step": 382 }, { "epoch": 1.0052631578947369, "step": 382, "train_accuracy": 0.765625 }, { "epoch": 1.0078947368421052, "grad_norm": 4.250606536865234, "learning_rate": 1.752951257151516e-05, "loss": 1.2192, "step": 383 }, { "epoch": 1.0078947368421052, "step": 383, "train_accuracy": 0.71875 }, { "epoch": 1.0105263157894737, "grad_norm": 8.005126953125, "learning_rate": 1.751546964832386e-05, "loss": 1.2773, "step": 384 }, { "epoch": 1.0105263157894737, "step": 384, "train_accuracy": 0.765625 }, { "epoch": 1.013157894736842, "grad_norm": 3.093712329864502, "learning_rate": 1.7501392585350903e-05, "loss": 1.1494, "step": 385 }, { "epoch": 1.013157894736842, "step": 385, "train_accuracy": 0.78125 }, { "epoch": 1.0157894736842106, "grad_norm": 4.209375381469727, "learning_rate": 1.7487281446542782e-05, "loss": 1.1475, "step": 386 }, { "epoch": 1.0157894736842106, "step": 386, "train_accuracy": 0.703125 }, { "epoch": 1.018421052631579, "grad_norm": 11.179862022399902, "learning_rate": 1.747313629600077e-05, "loss": 1.4062, "step": 387 }, { "epoch": 1.018421052631579, "step": 387, "train_accuracy": 0.71875 }, { "epoch": 1.0210526315789474, "grad_norm": 2.9365897178649902, "learning_rate": 1.745895719798065e-05, "loss": 1.2676, "step": 388 }, { "epoch": 1.0210526315789474, "step": 388, "train_accuracy": 0.859375 }, { "epoch": 1.0236842105263158, "grad_norm": 6.062369346618652, "learning_rate": 1.74447442168924e-05, "loss": 1.1021, "step": 389 }, { "epoch": 1.0236842105263158, "step": 389, "train_accuracy": 0.703125 }, { "epoch": 1.0263157894736843, "grad_norm": 4.148881435394287, "learning_rate": 1.743049741729993e-05, "loss": 1.2554, "step": 390 }, { "epoch": 1.0263157894736843, "step": 390, "train_accuracy": 0.78125 }, { "epoch": 1.0289473684210526, "grad_norm": 4.890829563140869, "learning_rate": 1.741621686392077e-05, "loss": 1.1567, "step": 391 }, { "epoch": 1.0289473684210526, "step": 391, "train_accuracy": 0.71875 }, { "epoch": 1.0315789473684212, "grad_norm": 5.793519973754883, "learning_rate": 1.740190262162578e-05, "loss": 1.2739, "step": 392 }, { "epoch": 1.0315789473684212, "step": 392, "train_accuracy": 0.765625 }, { "epoch": 1.0342105263157895, "grad_norm": 9.222723960876465, "learning_rate": 1.7387554755438857e-05, "loss": 1.3457, "step": 393 }, { "epoch": 1.0342105263157895, "step": 393, "train_accuracy": 0.796875 }, { "epoch": 1.0368421052631578, "grad_norm": 6.217665195465088, "learning_rate": 1.7373173330536628e-05, "loss": 1.4707, "step": 394 }, { "epoch": 1.0368421052631578, "step": 394, "train_accuracy": 0.734375 }, { "epoch": 1.0394736842105263, "grad_norm": 3.183716058731079, "learning_rate": 1.7358758412248176e-05, "loss": 1.1362, "step": 395 }, { "epoch": 1.0394736842105263, "step": 395, "train_accuracy": 0.703125 }, { "epoch": 1.0421052631578946, "grad_norm": 8.913907051086426, "learning_rate": 1.734431006605473e-05, "loss": 1.3467, "step": 396 }, { "epoch": 1.0421052631578946, "step": 396, "train_accuracy": 0.765625 }, { "epoch": 1.0447368421052632, "grad_norm": 14.859869003295898, "learning_rate": 1.7329828357589356e-05, "loss": 1.3984, "step": 397 }, { "epoch": 1.0447368421052632, "step": 397, "train_accuracy": 0.75 }, { "epoch": 1.0473684210526315, "grad_norm": 6.55880880355835, "learning_rate": 1.731531335263669e-05, "loss": 1.27, "step": 398 }, { "epoch": 1.0473684210526315, "step": 398, "train_accuracy": 0.71875 }, { "epoch": 1.05, "grad_norm": 3.913179874420166, "learning_rate": 1.7300765117132608e-05, "loss": 1.2803, "step": 399 }, { "epoch": 1.05, "step": 399, "train_accuracy": 0.828125 }, { "epoch": 1.0526315789473684, "grad_norm": 15.469132423400879, "learning_rate": 1.7286183717163942e-05, "loss": 1.2852, "step": 400 }, { "epoch": 1.0526315789473684, "step": 400, "train_accuracy": 0.734375 }, { "epoch": 1.055263157894737, "grad_norm": 17.792240142822266, "learning_rate": 1.7271569218968175e-05, "loss": 1.4697, "step": 401 }, { "epoch": 1.055263157894737, "step": 401, "train_accuracy": 0.71875 }, { "epoch": 1.0578947368421052, "grad_norm": 8.638894081115723, "learning_rate": 1.7256921688933145e-05, "loss": 1.5859, "step": 402 }, { "epoch": 1.0578947368421052, "step": 402, "train_accuracy": 0.75 }, { "epoch": 1.0605263157894738, "grad_norm": 4.139883995056152, "learning_rate": 1.7242241193596747e-05, "loss": 1.2485, "step": 403 }, { "epoch": 1.0605263157894738, "step": 403, "train_accuracy": 0.734375 }, { "epoch": 1.063157894736842, "grad_norm": 5.416935443878174, "learning_rate": 1.722752779964661e-05, "loss": 1.2969, "step": 404 }, { "epoch": 1.063157894736842, "step": 404, "train_accuracy": 0.6875 }, { "epoch": 1.0657894736842106, "grad_norm": 18.150854110717773, "learning_rate": 1.7212781573919818e-05, "loss": 1.2886, "step": 405 }, { "epoch": 1.0657894736842106, "step": 405, "train_accuracy": 0.734375 }, { "epoch": 1.068421052631579, "grad_norm": 17.88170623779297, "learning_rate": 1.7198002583402588e-05, "loss": 1.4951, "step": 406 }, { "epoch": 1.068421052631579, "step": 406, "train_accuracy": 0.734375 }, { "epoch": 1.0710526315789473, "grad_norm": 7.877701759338379, "learning_rate": 1.718319089522999e-05, "loss": 1.2217, "step": 407 }, { "epoch": 1.0710526315789473, "step": 407, "train_accuracy": 0.828125 }, { "epoch": 1.0736842105263158, "grad_norm": 3.5272531509399414, "learning_rate": 1.7168346576685616e-05, "loss": 1.1587, "step": 408 }, { "epoch": 1.0736842105263158, "step": 408, "train_accuracy": 0.75 }, { "epoch": 1.0763157894736841, "grad_norm": 4.579117298126221, "learning_rate": 1.7153469695201278e-05, "loss": 1.2354, "step": 409 }, { "epoch": 1.0763157894736841, "step": 409, "train_accuracy": 0.78125 }, { "epoch": 1.0789473684210527, "grad_norm": 14.453102111816406, "learning_rate": 1.713856031835672e-05, "loss": 1.374, "step": 410 }, { "epoch": 1.0789473684210527, "step": 410, "train_accuracy": 0.875 }, { "epoch": 1.081578947368421, "grad_norm": 7.408053874969482, "learning_rate": 1.7123618513879296e-05, "loss": 1.0933, "step": 411 }, { "epoch": 1.081578947368421, "step": 411, "train_accuracy": 0.8125 }, { "epoch": 1.0842105263157895, "grad_norm": 5.875626564025879, "learning_rate": 1.710864434964367e-05, "loss": 1.127, "step": 412 }, { "epoch": 1.0842105263157895, "step": 412, "train_accuracy": 0.734375 }, { "epoch": 1.0868421052631578, "grad_norm": 3.377965211868286, "learning_rate": 1.709363789367149e-05, "loss": 1.2173, "step": 413 }, { "epoch": 1.0868421052631578, "step": 413, "train_accuracy": 0.6875 }, { "epoch": 1.0894736842105264, "grad_norm": 3.2645585536956787, "learning_rate": 1.7078599214131105e-05, "loss": 1.3379, "step": 414 }, { "epoch": 1.0894736842105264, "step": 414, "train_accuracy": 0.765625 }, { "epoch": 1.0921052631578947, "grad_norm": 5.091524124145508, "learning_rate": 1.7063528379337238e-05, "loss": 1.272, "step": 415 }, { "epoch": 1.0921052631578947, "step": 415, "train_accuracy": 0.75 }, { "epoch": 1.0947368421052632, "grad_norm": 5.960602283477783, "learning_rate": 1.7048425457750685e-05, "loss": 1.272, "step": 416 }, { "epoch": 1.0947368421052632, "step": 416, "train_accuracy": 0.765625 }, { "epoch": 1.0973684210526315, "grad_norm": 4.821347713470459, "learning_rate": 1.7033290517977996e-05, "loss": 1.1245, "step": 417 }, { "epoch": 1.0973684210526315, "step": 417, "train_accuracy": 0.71875 }, { "epoch": 1.1, "grad_norm": 3.2067134380340576, "learning_rate": 1.7018123628771166e-05, "loss": 1.2695, "step": 418 }, { "epoch": 1.1, "step": 418, "train_accuracy": 0.734375 }, { "epoch": 1.1026315789473684, "grad_norm": 3.3393802642822266, "learning_rate": 1.7002924859027322e-05, "loss": 1.0884, "step": 419 }, { "epoch": 1.1026315789473684, "step": 419, "train_accuracy": 0.84375 }, { "epoch": 1.1052631578947367, "grad_norm": 4.24766731262207, "learning_rate": 1.698769427778842e-05, "loss": 0.9609, "step": 420 }, { "epoch": 1.1052631578947367, "step": 420, "train_accuracy": 0.75 }, { "epoch": 1.1078947368421053, "grad_norm": 13.302772521972656, "learning_rate": 1.6972431954240906e-05, "loss": 1.1318, "step": 421 }, { "epoch": 1.1078947368421053, "step": 421, "train_accuracy": 0.75 }, { "epoch": 1.1105263157894736, "grad_norm": 3.8378937244415283, "learning_rate": 1.6957137957715442e-05, "loss": 1.2275, "step": 422 }, { "epoch": 1.1105263157894736, "step": 422, "train_accuracy": 0.765625 }, { "epoch": 1.1131578947368421, "grad_norm": 5.301788806915283, "learning_rate": 1.6941812357686547e-05, "loss": 1.3276, "step": 423 }, { "epoch": 1.1131578947368421, "step": 423, "train_accuracy": 0.71875 }, { "epoch": 1.1157894736842104, "grad_norm": 3.5003018379211426, "learning_rate": 1.6926455223772317e-05, "loss": 1.2417, "step": 424 }, { "epoch": 1.1157894736842104, "step": 424, "train_accuracy": 0.796875 }, { "epoch": 1.118421052631579, "grad_norm": 7.428293704986572, "learning_rate": 1.6911066625734082e-05, "loss": 1.1333, "step": 425 }, { "epoch": 1.118421052631579, "step": 425, "train_accuracy": 0.765625 }, { "epoch": 1.1210526315789473, "grad_norm": 5.929080009460449, "learning_rate": 1.689564663347611e-05, "loss": 1.251, "step": 426 }, { "epoch": 1.1210526315789473, "step": 426, "train_accuracy": 0.625 }, { "epoch": 1.1236842105263158, "grad_norm": 4.198963165283203, "learning_rate": 1.6880195317045274e-05, "loss": 1.4849, "step": 427 }, { "epoch": 1.1236842105263158, "step": 427, "train_accuracy": 0.8125 }, { "epoch": 1.1263157894736842, "grad_norm": 5.26378059387207, "learning_rate": 1.6864712746630745e-05, "loss": 1.4326, "step": 428 }, { "epoch": 1.1263157894736842, "step": 428, "train_accuracy": 0.671875 }, { "epoch": 1.1289473684210527, "grad_norm": 9.662724494934082, "learning_rate": 1.6849198992563666e-05, "loss": 1.3887, "step": 429 }, { "epoch": 1.1289473684210527, "step": 429, "train_accuracy": 0.640625 }, { "epoch": 1.131578947368421, "grad_norm": 3.6311473846435547, "learning_rate": 1.6833654125316832e-05, "loss": 1.2671, "step": 430 }, { "epoch": 1.131578947368421, "step": 430, "train_accuracy": 0.734375 }, { "epoch": 1.1342105263157896, "grad_norm": 4.167621612548828, "learning_rate": 1.681807821550438e-05, "loss": 1.4048, "step": 431 }, { "epoch": 1.1342105263157896, "step": 431, "train_accuracy": 0.734375 }, { "epoch": 1.1368421052631579, "grad_norm": 2.69230318069458, "learning_rate": 1.6802471333881456e-05, "loss": 1.2554, "step": 432 }, { "epoch": 1.1368421052631579, "step": 432, "train_accuracy": 0.8125 }, { "epoch": 1.1394736842105262, "grad_norm": 3.8613479137420654, "learning_rate": 1.6786833551343896e-05, "loss": 1.1846, "step": 433 }, { "epoch": 1.1394736842105262, "step": 433, "train_accuracy": 0.8125 }, { "epoch": 1.1421052631578947, "grad_norm": 3.424915075302124, "learning_rate": 1.677116493892792e-05, "loss": 1.3999, "step": 434 }, { "epoch": 1.1421052631578947, "step": 434, "train_accuracy": 0.765625 }, { "epoch": 1.1447368421052633, "grad_norm": 6.499844551086426, "learning_rate": 1.6755465567809776e-05, "loss": 1.2183, "step": 435 }, { "epoch": 1.1447368421052633, "step": 435, "train_accuracy": 0.78125 }, { "epoch": 1.1473684210526316, "grad_norm": 8.367511749267578, "learning_rate": 1.6739735509305452e-05, "loss": 1.1621, "step": 436 }, { "epoch": 1.1473684210526316, "step": 436, "train_accuracy": 0.703125 }, { "epoch": 1.15, "grad_norm": 3.175813913345337, "learning_rate": 1.6723974834870327e-05, "loss": 1.2539, "step": 437 }, { "epoch": 1.15, "step": 437, "train_accuracy": 0.71875 }, { "epoch": 1.1526315789473685, "grad_norm": 2.990525245666504, "learning_rate": 1.6708183616098864e-05, "loss": 1.1562, "step": 438 }, { "epoch": 1.1526315789473685, "step": 438, "train_accuracy": 0.78125 }, { "epoch": 1.1552631578947368, "grad_norm": 2.673862934112549, "learning_rate": 1.669236192472427e-05, "loss": 1.1377, "step": 439 }, { "epoch": 1.1552631578947368, "step": 439, "train_accuracy": 0.765625 }, { "epoch": 1.1578947368421053, "grad_norm": 3.241729497909546, "learning_rate": 1.667650983261818e-05, "loss": 1.2139, "step": 440 }, { "epoch": 1.1578947368421053, "step": 440, "train_accuracy": 0.71875 }, { "epoch": 1.1605263157894736, "grad_norm": 3.701186418533325, "learning_rate": 1.6660627411790327e-05, "loss": 1.3027, "step": 441 }, { "epoch": 1.1605263157894736, "step": 441, "train_accuracy": 0.75 }, { "epoch": 1.1631578947368422, "grad_norm": 3.050072431564331, "learning_rate": 1.664471473438822e-05, "loss": 1.1738, "step": 442 }, { "epoch": 1.1631578947368422, "step": 442, "train_accuracy": 0.703125 }, { "epoch": 1.1657894736842105, "grad_norm": 8.023958206176758, "learning_rate": 1.66287718726968e-05, "loss": 1.2134, "step": 443 }, { "epoch": 1.1657894736842105, "step": 443, "train_accuracy": 0.71875 }, { "epoch": 1.168421052631579, "grad_norm": 7.616509914398193, "learning_rate": 1.6612798899138134e-05, "loss": 1.145, "step": 444 }, { "epoch": 1.168421052631579, "step": 444, "train_accuracy": 0.71875 }, { "epoch": 1.1710526315789473, "grad_norm": 8.092353820800781, "learning_rate": 1.6596795886271067e-05, "loss": 1.0767, "step": 445 }, { "epoch": 1.1710526315789473, "step": 445, "train_accuracy": 0.765625 }, { "epoch": 1.1736842105263159, "grad_norm": 9.653562545776367, "learning_rate": 1.6580762906790913e-05, "loss": 1.1504, "step": 446 }, { "epoch": 1.1736842105263159, "step": 446, "train_accuracy": 0.671875 }, { "epoch": 1.1763157894736842, "grad_norm": 4.4737067222595215, "learning_rate": 1.65647000335291e-05, "loss": 1.2695, "step": 447 }, { "epoch": 1.1763157894736842, "step": 447, "train_accuracy": 0.71875 }, { "epoch": 1.1789473684210527, "grad_norm": 5.050171375274658, "learning_rate": 1.6548607339452853e-05, "loss": 1.3516, "step": 448 }, { "epoch": 1.1789473684210527, "step": 448, "train_accuracy": 0.8125 }, { "epoch": 1.181578947368421, "grad_norm": 3.861908197402954, "learning_rate": 1.6532484897664868e-05, "loss": 1.0137, "step": 449 }, { "epoch": 1.181578947368421, "step": 449, "train_accuracy": 0.78125 }, { "epoch": 1.1842105263157894, "grad_norm": 3.3620777130126953, "learning_rate": 1.6516332781402965e-05, "loss": 1.1274, "step": 450 }, { "epoch": 1.1842105263157894, "step": 450, "train_accuracy": 0.78125 }, { "epoch": 1.186842105263158, "grad_norm": 3.629772186279297, "learning_rate": 1.6500151064039768e-05, "loss": 1.1787, "step": 451 }, { "epoch": 1.186842105263158, "step": 451, "train_accuracy": 0.765625 }, { "epoch": 1.1894736842105262, "grad_norm": 6.909435749053955, "learning_rate": 1.6483939819082368e-05, "loss": 1.1567, "step": 452 }, { "epoch": 1.1894736842105262, "step": 452, "train_accuracy": 0.796875 }, { "epoch": 1.1921052631578948, "grad_norm": 8.454055786132812, "learning_rate": 1.646769912017199e-05, "loss": 1.3823, "step": 453 }, { "epoch": 1.1921052631578948, "step": 453, "train_accuracy": 0.6875 }, { "epoch": 1.194736842105263, "grad_norm": 4.320656776428223, "learning_rate": 1.645142904108364e-05, "loss": 1.1353, "step": 454 }, { "epoch": 1.194736842105263, "step": 454, "train_accuracy": 0.65625 }, { "epoch": 1.1973684210526316, "grad_norm": 12.18797779083252, "learning_rate": 1.6435129655725813e-05, "loss": 1.4482, "step": 455 }, { "epoch": 1.1973684210526316, "step": 455, "train_accuracy": 0.71875 }, { "epoch": 1.2, "grad_norm": 3.5705020427703857, "learning_rate": 1.6418801038140114e-05, "loss": 1.1245, "step": 456 }, { "epoch": 1.2, "eval_accuracy": 0.7049700617790222, "eval_max_score": 5.6875, "eval_min_score": -7.0, "eval_runtime": 151.3491, "eval_samples_per_second": 18.745, "eval_steps_per_second": 0.297, "step": 456 }, { "epoch": 1.2, "step": 456, "train_accuracy": 0.828125 }, { "epoch": 1.2026315789473685, "grad_norm": 9.271757125854492, "learning_rate": 1.6402443262500936e-05, "loss": 1.3506, "step": 457 }, { "epoch": 1.2026315789473685, "step": 457, "train_accuracy": 0.765625 }, { "epoch": 1.2052631578947368, "grad_norm": 6.957113265991211, "learning_rate": 1.6386056403115135e-05, "loss": 1.3022, "step": 458 }, { "epoch": 1.2052631578947368, "step": 458, "train_accuracy": 0.75 }, { "epoch": 1.2078947368421054, "grad_norm": 3.5725152492523193, "learning_rate": 1.6369640534421675e-05, "loss": 1.1992, "step": 459 }, { "epoch": 1.2078947368421054, "step": 459, "train_accuracy": 0.765625 }, { "epoch": 1.2105263157894737, "grad_norm": 11.582955360412598, "learning_rate": 1.6353195730991308e-05, "loss": 1.2861, "step": 460 }, { "epoch": 1.2105263157894737, "step": 460, "train_accuracy": 0.75 }, { "epoch": 1.2131578947368422, "grad_norm": 6.4823527336120605, "learning_rate": 1.633672206752621e-05, "loss": 1.4482, "step": 461 }, { "epoch": 1.2131578947368422, "step": 461, "train_accuracy": 0.671875 }, { "epoch": 1.2157894736842105, "grad_norm": 15.534878730773926, "learning_rate": 1.6320219618859668e-05, "loss": 1.2925, "step": 462 }, { "epoch": 1.2157894736842105, "step": 462, "train_accuracy": 0.8125 }, { "epoch": 1.2184210526315788, "grad_norm": 4.061946868896484, "learning_rate": 1.6303688459955728e-05, "loss": 1.2837, "step": 463 }, { "epoch": 1.2184210526315788, "step": 463, "train_accuracy": 0.78125 }, { "epoch": 1.2210526315789474, "grad_norm": 2.7848575115203857, "learning_rate": 1.628712866590885e-05, "loss": 1.188, "step": 464 }, { "epoch": 1.2210526315789474, "step": 464, "train_accuracy": 0.8125 }, { "epoch": 1.2236842105263157, "grad_norm": 9.240684509277344, "learning_rate": 1.627054031194358e-05, "loss": 1.2993, "step": 465 }, { "epoch": 1.2236842105263157, "step": 465, "train_accuracy": 0.703125 }, { "epoch": 1.2263157894736842, "grad_norm": 3.1299266815185547, "learning_rate": 1.6253923473414185e-05, "loss": 1.3467, "step": 466 }, { "epoch": 1.2263157894736842, "step": 466, "train_accuracy": 0.78125 }, { "epoch": 1.2289473684210526, "grad_norm": 4.89497184753418, "learning_rate": 1.623727822580434e-05, "loss": 1.1733, "step": 467 }, { "epoch": 1.2289473684210526, "step": 467, "train_accuracy": 0.78125 }, { "epoch": 1.231578947368421, "grad_norm": 3.6426620483398438, "learning_rate": 1.6220604644726778e-05, "loss": 1.0483, "step": 468 }, { "epoch": 1.231578947368421, "step": 468, "train_accuracy": 0.6875 }, { "epoch": 1.2342105263157894, "grad_norm": 4.336991786956787, "learning_rate": 1.620390280592291e-05, "loss": 1.3164, "step": 469 }, { "epoch": 1.2342105263157894, "step": 469, "train_accuracy": 0.75 }, { "epoch": 1.236842105263158, "grad_norm": 9.696093559265137, "learning_rate": 1.6187172785262544e-05, "loss": 1.2285, "step": 470 }, { "epoch": 1.236842105263158, "step": 470, "train_accuracy": 0.84375 }, { "epoch": 1.2394736842105263, "grad_norm": 10.798023223876953, "learning_rate": 1.6170414658743488e-05, "loss": 1.2935, "step": 471 }, { "epoch": 1.2394736842105263, "step": 471, "train_accuracy": 0.828125 }, { "epoch": 1.2421052631578948, "grad_norm": 6.651049613952637, "learning_rate": 1.6153628502491228e-05, "loss": 1.0566, "step": 472 }, { "epoch": 1.2421052631578948, "step": 472, "train_accuracy": 0.703125 }, { "epoch": 1.2447368421052631, "grad_norm": 15.750227928161621, "learning_rate": 1.613681439275858e-05, "loss": 1.4531, "step": 473 }, { "epoch": 1.2447368421052631, "step": 473, "train_accuracy": 0.703125 }, { "epoch": 1.2473684210526317, "grad_norm": 12.205035209655762, "learning_rate": 1.6119972405925332e-05, "loss": 1.3672, "step": 474 }, { "epoch": 1.2473684210526317, "step": 474, "train_accuracy": 0.71875 }, { "epoch": 1.25, "grad_norm": 4.43392276763916, "learning_rate": 1.6103102618497922e-05, "loss": 1.3491, "step": 475 }, { "epoch": 1.25, "step": 475, "train_accuracy": 0.71875 }, { "epoch": 1.2526315789473683, "grad_norm": 4.819791793823242, "learning_rate": 1.6086205107109067e-05, "loss": 1.165, "step": 476 }, { "epoch": 1.2526315789473683, "step": 476, "train_accuracy": 0.734375 }, { "epoch": 1.2552631578947369, "grad_norm": 4.512659549713135, "learning_rate": 1.6069279948517416e-05, "loss": 1.146, "step": 477 }, { "epoch": 1.2552631578947369, "step": 477, "train_accuracy": 0.765625 }, { "epoch": 1.2578947368421054, "grad_norm": 11.076248168945312, "learning_rate": 1.6052327219607223e-05, "loss": 1.207, "step": 478 }, { "epoch": 1.2578947368421054, "step": 478, "train_accuracy": 0.71875 }, { "epoch": 1.2605263157894737, "grad_norm": 13.904632568359375, "learning_rate": 1.603534699738797e-05, "loss": 1.3438, "step": 479 }, { "epoch": 1.2605263157894737, "step": 479, "train_accuracy": 0.671875 }, { "epoch": 1.263157894736842, "grad_norm": 4.511821746826172, "learning_rate": 1.601833935899404e-05, "loss": 1.1997, "step": 480 }, { "epoch": 1.263157894736842, "step": 480, "train_accuracy": 0.765625 }, { "epoch": 1.2657894736842106, "grad_norm": 5.979182243347168, "learning_rate": 1.600130438168435e-05, "loss": 1.2031, "step": 481 }, { "epoch": 1.2657894736842106, "step": 481, "train_accuracy": 0.671875 }, { "epoch": 1.268421052631579, "grad_norm": 10.593663215637207, "learning_rate": 1.5984242142842003e-05, "loss": 1.293, "step": 482 }, { "epoch": 1.268421052631579, "step": 482, "train_accuracy": 0.6875 }, { "epoch": 1.2710526315789474, "grad_norm": 16.006187438964844, "learning_rate": 1.5967152719973954e-05, "loss": 1.4736, "step": 483 }, { "epoch": 1.2710526315789474, "step": 483, "train_accuracy": 0.78125 }, { "epoch": 1.2736842105263158, "grad_norm": 7.199328899383545, "learning_rate": 1.5950036190710637e-05, "loss": 1.3931, "step": 484 }, { "epoch": 1.2736842105263158, "step": 484, "train_accuracy": 0.671875 }, { "epoch": 1.2763157894736843, "grad_norm": 12.612074851989746, "learning_rate": 1.593289263280561e-05, "loss": 1.2217, "step": 485 }, { "epoch": 1.2763157894736843, "step": 485, "train_accuracy": 0.8125 }, { "epoch": 1.2789473684210526, "grad_norm": 13.020115852355957, "learning_rate": 1.5915722124135227e-05, "loss": 1.4404, "step": 486 }, { "epoch": 1.2789473684210526, "step": 486, "train_accuracy": 0.734375 }, { "epoch": 1.2815789473684212, "grad_norm": 15.411005020141602, "learning_rate": 1.5898524742698257e-05, "loss": 1.627, "step": 487 }, { "epoch": 1.2815789473684212, "step": 487, "train_accuracy": 0.703125 }, { "epoch": 1.2842105263157895, "grad_norm": 2.5929243564605713, "learning_rate": 1.5881300566615555e-05, "loss": 1.2729, "step": 488 }, { "epoch": 1.2842105263157895, "step": 488, "train_accuracy": 0.796875 }, { "epoch": 1.2868421052631578, "grad_norm": 19.66521644592285, "learning_rate": 1.5864049674129677e-05, "loss": 1.6582, "step": 489 }, { "epoch": 1.2868421052631578, "step": 489, "train_accuracy": 0.765625 }, { "epoch": 1.2894736842105263, "grad_norm": 7.616007328033447, "learning_rate": 1.584677214360455e-05, "loss": 1.2085, "step": 490 }, { "epoch": 1.2894736842105263, "step": 490, "train_accuracy": 0.65625 }, { "epoch": 1.2921052631578949, "grad_norm": 3.8917739391326904, "learning_rate": 1.5829468053525104e-05, "loss": 1.3672, "step": 491 }, { "epoch": 1.2921052631578949, "step": 491, "train_accuracy": 0.75 }, { "epoch": 1.2947368421052632, "grad_norm": 9.001242637634277, "learning_rate": 1.5812137482496925e-05, "loss": 1.4292, "step": 492 }, { "epoch": 1.2947368421052632, "step": 492, "train_accuracy": 0.78125 }, { "epoch": 1.2973684210526315, "grad_norm": 3.5048999786376953, "learning_rate": 1.5794780509245876e-05, "loss": 1.2676, "step": 493 }, { "epoch": 1.2973684210526315, "step": 493, "train_accuracy": 0.765625 }, { "epoch": 1.3, "grad_norm": 17.744800567626953, "learning_rate": 1.5777397212617776e-05, "loss": 1.4194, "step": 494 }, { "epoch": 1.3, "step": 494, "train_accuracy": 0.703125 }, { "epoch": 1.3026315789473684, "grad_norm": 15.553146362304688, "learning_rate": 1.5759987671578007e-05, "loss": 1.2896, "step": 495 }, { "epoch": 1.3026315789473684, "step": 495, "train_accuracy": 0.734375 }, { "epoch": 1.305263157894737, "grad_norm": 8.133227348327637, "learning_rate": 1.5742551965211167e-05, "loss": 1.1895, "step": 496 }, { "epoch": 1.305263157894737, "step": 496, "train_accuracy": 0.734375 }, { "epoch": 1.3078947368421052, "grad_norm": 3.2711598873138428, "learning_rate": 1.572509017272072e-05, "loss": 1.4629, "step": 497 }, { "epoch": 1.3078947368421052, "step": 497, "train_accuracy": 0.75 }, { "epoch": 1.3105263157894738, "grad_norm": 8.634398460388184, "learning_rate": 1.5707602373428628e-05, "loss": 1.3159, "step": 498 }, { "epoch": 1.3105263157894738, "step": 498, "train_accuracy": 0.6875 }, { "epoch": 1.313157894736842, "grad_norm": 7.313762664794922, "learning_rate": 1.5690088646774983e-05, "loss": 1.3647, "step": 499 }, { "epoch": 1.313157894736842, "step": 499, "train_accuracy": 0.78125 }, { "epoch": 1.3157894736842106, "grad_norm": 6.731086730957031, "learning_rate": 1.5672549072317667e-05, "loss": 1.1221, "step": 500 }, { "epoch": 1.3157894736842106, "step": 500, "train_accuracy": 0.609375 }, { "epoch": 1.318421052631579, "grad_norm": 6.832492351531982, "learning_rate": 1.5654983729731978e-05, "loss": 1.2764, "step": 501 }, { "epoch": 1.318421052631579, "step": 501, "train_accuracy": 0.765625 }, { "epoch": 1.3210526315789473, "grad_norm": 3.859423875808716, "learning_rate": 1.5637392698810247e-05, "loss": 1.2837, "step": 502 }, { "epoch": 1.3210526315789473, "step": 502, "train_accuracy": 0.828125 }, { "epoch": 1.3236842105263158, "grad_norm": 7.8143534660339355, "learning_rate": 1.5619776059461523e-05, "loss": 1.1621, "step": 503 }, { "epoch": 1.3236842105263158, "step": 503, "train_accuracy": 0.765625 }, { "epoch": 1.3263157894736843, "grad_norm": 4.160390853881836, "learning_rate": 1.5602133891711175e-05, "loss": 1.4302, "step": 504 }, { "epoch": 1.3263157894736843, "step": 504, "train_accuracy": 0.734375 }, { "epoch": 1.3289473684210527, "grad_norm": 2.8057210445404053, "learning_rate": 1.558446627570053e-05, "loss": 1.1064, "step": 505 }, { "epoch": 1.3289473684210527, "step": 505, "train_accuracy": 0.59375 }, { "epoch": 1.331578947368421, "grad_norm": 5.990983009338379, "learning_rate": 1.5566773291686527e-05, "loss": 1.4414, "step": 506 }, { "epoch": 1.331578947368421, "step": 506, "train_accuracy": 0.671875 }, { "epoch": 1.3342105263157895, "grad_norm": 8.961088180541992, "learning_rate": 1.554905502004133e-05, "loss": 1.3325, "step": 507 }, { "epoch": 1.3342105263157895, "step": 507, "train_accuracy": 0.75 }, { "epoch": 1.3368421052631578, "grad_norm": 5.393622875213623, "learning_rate": 1.5531311541251995e-05, "loss": 1.248, "step": 508 }, { "epoch": 1.3368421052631578, "step": 508, "train_accuracy": 0.640625 }, { "epoch": 1.3394736842105264, "grad_norm": 6.529601097106934, "learning_rate": 1.5513542935920058e-05, "loss": 1.3818, "step": 509 }, { "epoch": 1.3394736842105264, "step": 509, "train_accuracy": 0.796875 }, { "epoch": 1.3421052631578947, "grad_norm": 13.529115676879883, "learning_rate": 1.5495749284761213e-05, "loss": 1.5435, "step": 510 }, { "epoch": 1.3421052631578947, "step": 510, "train_accuracy": 0.6875 }, { "epoch": 1.3447368421052632, "grad_norm": 5.483336925506592, "learning_rate": 1.5477930668604917e-05, "loss": 1.2622, "step": 511 }, { "epoch": 1.3447368421052632, "step": 511, "train_accuracy": 0.6875 }, { "epoch": 1.3473684210526315, "grad_norm": 3.2029190063476562, "learning_rate": 1.5460087168394042e-05, "loss": 1.1821, "step": 512 }, { "epoch": 1.3473684210526315, "step": 512, "train_accuracy": 0.890625 }, { "epoch": 1.35, "grad_norm": 15.539850234985352, "learning_rate": 1.5442218865184493e-05, "loss": 1.0308, "step": 513 }, { "epoch": 1.35, "step": 513, "train_accuracy": 0.796875 }, { "epoch": 1.3526315789473684, "grad_norm": 5.350340366363525, "learning_rate": 1.5424325840144847e-05, "loss": 1.2207, "step": 514 }, { "epoch": 1.3526315789473684, "step": 514, "train_accuracy": 0.796875 }, { "epoch": 1.3552631578947367, "grad_norm": 8.735867500305176, "learning_rate": 1.5406408174555978e-05, "loss": 1.29, "step": 515 }, { "epoch": 1.3552631578947367, "step": 515, "train_accuracy": 0.78125 }, { "epoch": 1.3578947368421053, "grad_norm": 4.313580513000488, "learning_rate": 1.53884659498107e-05, "loss": 1.0488, "step": 516 }, { "epoch": 1.3578947368421053, "step": 516, "train_accuracy": 0.734375 }, { "epoch": 1.3605263157894738, "grad_norm": 5.2027387619018555, "learning_rate": 1.537049924741338e-05, "loss": 1.1738, "step": 517 }, { "epoch": 1.3605263157894738, "step": 517, "train_accuracy": 0.6875 }, { "epoch": 1.3631578947368421, "grad_norm": 7.884912967681885, "learning_rate": 1.5352508148979585e-05, "loss": 1.5806, "step": 518 }, { "epoch": 1.3631578947368421, "step": 518, "train_accuracy": 0.6875 }, { "epoch": 1.3657894736842104, "grad_norm": 3.1361117362976074, "learning_rate": 1.5334492736235703e-05, "loss": 1.144, "step": 519 }, { "epoch": 1.3657894736842104, "step": 519, "train_accuracy": 0.75 }, { "epoch": 1.368421052631579, "grad_norm": 8.120806694030762, "learning_rate": 1.5316453091018572e-05, "loss": 1.0537, "step": 520 }, { "epoch": 1.368421052631579, "step": 520, "train_accuracy": 0.6875 }, { "epoch": 1.3710526315789473, "grad_norm": 3.6984429359436035, "learning_rate": 1.5298389295275098e-05, "loss": 1.0972, "step": 521 }, { "epoch": 1.3710526315789473, "step": 521, "train_accuracy": 0.75 }, { "epoch": 1.3736842105263158, "grad_norm": 4.505008220672607, "learning_rate": 1.5280301431061907e-05, "loss": 1.4058, "step": 522 }, { "epoch": 1.3736842105263158, "step": 522, "train_accuracy": 0.734375 }, { "epoch": 1.3763157894736842, "grad_norm": 3.8629140853881836, "learning_rate": 1.5262189580544955e-05, "loss": 1.2212, "step": 523 }, { "epoch": 1.3763157894736842, "step": 523, "train_accuracy": 0.6875 }, { "epoch": 1.3789473684210527, "grad_norm": 6.1243109703063965, "learning_rate": 1.5244053825999152e-05, "loss": 1.3076, "step": 524 }, { "epoch": 1.3789473684210527, "step": 524, "train_accuracy": 0.828125 }, { "epoch": 1.381578947368421, "grad_norm": 13.835949897766113, "learning_rate": 1.5225894249808005e-05, "loss": 1.2031, "step": 525 }, { "epoch": 1.381578947368421, "step": 525, "train_accuracy": 0.765625 }, { "epoch": 1.3842105263157896, "grad_norm": 5.034552097320557, "learning_rate": 1.5207710934463218e-05, "loss": 1.3672, "step": 526 }, { "epoch": 1.3842105263157896, "step": 526, "train_accuracy": 0.71875 }, { "epoch": 1.3868421052631579, "grad_norm": 10.014500617980957, "learning_rate": 1.5189503962564347e-05, "loss": 1.1841, "step": 527 }, { "epoch": 1.3868421052631579, "step": 527, "train_accuracy": 0.8125 }, { "epoch": 1.3894736842105262, "grad_norm": 14.438556671142578, "learning_rate": 1.5171273416818406e-05, "loss": 1.3545, "step": 528 }, { "epoch": 1.3894736842105262, "step": 528, "train_accuracy": 0.6875 }, { "epoch": 1.3921052631578947, "grad_norm": 4.237582206726074, "learning_rate": 1.5153019380039493e-05, "loss": 1.3281, "step": 529 }, { "epoch": 1.3921052631578947, "step": 529, "train_accuracy": 0.640625 }, { "epoch": 1.3947368421052633, "grad_norm": 13.250812530517578, "learning_rate": 1.513474193514842e-05, "loss": 1.4199, "step": 530 }, { "epoch": 1.3947368421052633, "step": 530, "train_accuracy": 0.734375 }, { "epoch": 1.3973684210526316, "grad_norm": 9.62955379486084, "learning_rate": 1.5116441165172328e-05, "loss": 1.4917, "step": 531 }, { "epoch": 1.3973684210526316, "step": 531, "train_accuracy": 0.703125 }, { "epoch": 1.4, "grad_norm": 3.8189797401428223, "learning_rate": 1.5098117153244317e-05, "loss": 1.293, "step": 532 }, { "epoch": 1.4, "eval_accuracy": 0.7084949016571045, "eval_max_score": 5.59375, "eval_min_score": -5.59375, "eval_runtime": 151.3104, "eval_samples_per_second": 18.75, "eval_steps_per_second": 0.297, "step": 532 }, { "epoch": 1.4, "step": 532, "train_accuracy": 0.703125 }, { "epoch": 1.4026315789473685, "grad_norm": 7.4259843826293945, "learning_rate": 1.5079769982603067e-05, "loss": 1.3726, "step": 533 }, { "epoch": 1.4026315789473685, "step": 533, "train_accuracy": 0.75 }, { "epoch": 1.4052631578947368, "grad_norm": 3.3982386589050293, "learning_rate": 1.5061399736592457e-05, "loss": 1.2163, "step": 534 }, { "epoch": 1.4052631578947368, "step": 534, "train_accuracy": 0.703125 }, { "epoch": 1.4078947368421053, "grad_norm": 17.446184158325195, "learning_rate": 1.504300649866119e-05, "loss": 1.1636, "step": 535 }, { "epoch": 1.4078947368421053, "step": 535, "train_accuracy": 0.703125 }, { "epoch": 1.4105263157894736, "grad_norm": 9.651148796081543, "learning_rate": 1.5024590352362412e-05, "loss": 1.5947, "step": 536 }, { "epoch": 1.4105263157894736, "step": 536, "train_accuracy": 0.71875 }, { "epoch": 1.4131578947368422, "grad_norm": 4.520640850067139, "learning_rate": 1.5006151381353328e-05, "loss": 1.353, "step": 537 }, { "epoch": 1.4131578947368422, "step": 537, "train_accuracy": 0.75 }, { "epoch": 1.4157894736842105, "grad_norm": 4.592413902282715, "learning_rate": 1.4987689669394836e-05, "loss": 1.0796, "step": 538 }, { "epoch": 1.4157894736842105, "step": 538, "train_accuracy": 0.703125 }, { "epoch": 1.418421052631579, "grad_norm": 8.244244575500488, "learning_rate": 1.4969205300351128e-05, "loss": 1.2051, "step": 539 }, { "epoch": 1.418421052631579, "step": 539, "train_accuracy": 0.78125 }, { "epoch": 1.4210526315789473, "grad_norm": 22.695865631103516, "learning_rate": 1.4950698358189322e-05, "loss": 1.6543, "step": 540 }, { "epoch": 1.4210526315789473, "step": 540, "train_accuracy": 0.671875 }, { "epoch": 1.4236842105263157, "grad_norm": 14.040301322937012, "learning_rate": 1.4932168926979074e-05, "loss": 1.354, "step": 541 }, { "epoch": 1.4236842105263157, "step": 541, "train_accuracy": 0.796875 }, { "epoch": 1.4263157894736842, "grad_norm": 12.0126314163208, "learning_rate": 1.4913617090892206e-05, "loss": 1.3936, "step": 542 }, { "epoch": 1.4263157894736842, "step": 542, "train_accuracy": 0.8125 }, { "epoch": 1.4289473684210527, "grad_norm": 9.563346862792969, "learning_rate": 1.4895042934202306e-05, "loss": 1.1978, "step": 543 }, { "epoch": 1.4289473684210527, "step": 543, "train_accuracy": 0.6875 }, { "epoch": 1.431578947368421, "grad_norm": 15.859170913696289, "learning_rate": 1.4876446541284365e-05, "loss": 1.3774, "step": 544 }, { "epoch": 1.431578947368421, "step": 544, "train_accuracy": 0.859375 }, { "epoch": 1.4342105263157894, "grad_norm": 17.58700942993164, "learning_rate": 1.485782799661438e-05, "loss": 1.2646, "step": 545 }, { "epoch": 1.4342105263157894, "step": 545, "train_accuracy": 0.65625 }, { "epoch": 1.436842105263158, "grad_norm": 8.851125717163086, "learning_rate": 1.4839187384768971e-05, "loss": 1.2417, "step": 546 }, { "epoch": 1.436842105263158, "step": 546, "train_accuracy": 0.71875 }, { "epoch": 1.4394736842105262, "grad_norm": 3.8548920154571533, "learning_rate": 1.4820524790425007e-05, "loss": 1.2793, "step": 547 }, { "epoch": 1.4394736842105262, "step": 547, "train_accuracy": 0.703125 }, { "epoch": 1.4421052631578948, "grad_norm": 9.304672241210938, "learning_rate": 1.4801840298359216e-05, "loss": 1.314, "step": 548 }, { "epoch": 1.4421052631578948, "step": 548, "train_accuracy": 0.765625 }, { "epoch": 1.444736842105263, "grad_norm": 10.30390453338623, "learning_rate": 1.4783133993447789e-05, "loss": 1.3345, "step": 549 }, { "epoch": 1.444736842105263, "step": 549, "train_accuracy": 0.71875 }, { "epoch": 1.4473684210526316, "grad_norm": 10.108946800231934, "learning_rate": 1.4764405960666011e-05, "loss": 1.374, "step": 550 }, { "epoch": 1.4473684210526316, "step": 550, "train_accuracy": 0.71875 }, { "epoch": 1.45, "grad_norm": 4.669325351715088, "learning_rate": 1.4745656285087866e-05, "loss": 1.2256, "step": 551 }, { "epoch": 1.45, "step": 551, "train_accuracy": 0.8125 }, { "epoch": 1.4526315789473685, "grad_norm": 6.70346736907959, "learning_rate": 1.4726885051885654e-05, "loss": 1.3486, "step": 552 }, { "epoch": 1.4526315789473685, "step": 552, "train_accuracy": 0.84375 }, { "epoch": 1.4552631578947368, "grad_norm": 3.9752542972564697, "learning_rate": 1.4708092346329604e-05, "loss": 1.147, "step": 553 }, { "epoch": 1.4552631578947368, "step": 553, "train_accuracy": 0.8125 }, { "epoch": 1.4578947368421051, "grad_norm": 8.792821884155273, "learning_rate": 1.468927825378748e-05, "loss": 1.3594, "step": 554 }, { "epoch": 1.4578947368421051, "step": 554, "train_accuracy": 0.6875 }, { "epoch": 1.4605263157894737, "grad_norm": 5.539193153381348, "learning_rate": 1.4670442859724204e-05, "loss": 1.0762, "step": 555 }, { "epoch": 1.4605263157894737, "step": 555, "train_accuracy": 0.78125 }, { "epoch": 1.4631578947368422, "grad_norm": 9.367209434509277, "learning_rate": 1.4651586249701458e-05, "loss": 1.2529, "step": 556 }, { "epoch": 1.4631578947368422, "step": 556, "train_accuracy": 0.75 }, { "epoch": 1.4657894736842105, "grad_norm": 13.890340805053711, "learning_rate": 1.4632708509377305e-05, "loss": 1.3589, "step": 557 }, { "epoch": 1.4657894736842105, "step": 557, "train_accuracy": 0.75 }, { "epoch": 1.4684210526315788, "grad_norm": 3.0693435668945312, "learning_rate": 1.461380972450579e-05, "loss": 1.1621, "step": 558 }, { "epoch": 1.4684210526315788, "step": 558, "train_accuracy": 0.78125 }, { "epoch": 1.4710526315789474, "grad_norm": 9.083732604980469, "learning_rate": 1.4594889980936554e-05, "loss": 1.3843, "step": 559 }, { "epoch": 1.4710526315789474, "step": 559, "train_accuracy": 0.6875 }, { "epoch": 1.4736842105263157, "grad_norm": 8.28622055053711, "learning_rate": 1.4575949364614453e-05, "loss": 1.2578, "step": 560 }, { "epoch": 1.4736842105263157, "step": 560, "train_accuracy": 0.703125 }, { "epoch": 1.4763157894736842, "grad_norm": 10.74136734008789, "learning_rate": 1.4556987961579148e-05, "loss": 1.5986, "step": 561 }, { "epoch": 1.4763157894736842, "step": 561, "train_accuracy": 0.640625 }, { "epoch": 1.4789473684210526, "grad_norm": 5.316714763641357, "learning_rate": 1.4538005857964735e-05, "loss": 1.5508, "step": 562 }, { "epoch": 1.4789473684210526, "step": 562, "train_accuracy": 0.671875 }, { "epoch": 1.481578947368421, "grad_norm": 3.3459677696228027, "learning_rate": 1.451900313999934e-05, "loss": 1.332, "step": 563 }, { "epoch": 1.481578947368421, "step": 563, "train_accuracy": 0.8125 }, { "epoch": 1.4842105263157894, "grad_norm": 8.240880012512207, "learning_rate": 1.4499979894004733e-05, "loss": 1.001, "step": 564 }, { "epoch": 1.4842105263157894, "step": 564, "train_accuracy": 0.65625 }, { "epoch": 1.486842105263158, "grad_norm": 8.606131553649902, "learning_rate": 1.4480936206395936e-05, "loss": 1.436, "step": 565 }, { "epoch": 1.486842105263158, "step": 565, "train_accuracy": 0.765625 }, { "epoch": 1.4894736842105263, "grad_norm": 3.6511523723602295, "learning_rate": 1.4461872163680826e-05, "loss": 1.1348, "step": 566 }, { "epoch": 1.4894736842105263, "step": 566, "train_accuracy": 0.671875 }, { "epoch": 1.4921052631578946, "grad_norm": 5.405193328857422, "learning_rate": 1.4442787852459748e-05, "loss": 1.1343, "step": 567 }, { "epoch": 1.4921052631578946, "step": 567, "train_accuracy": 0.65625 }, { "epoch": 1.4947368421052631, "grad_norm": 5.214520454406738, "learning_rate": 1.4423683359425118e-05, "loss": 1.4219, "step": 568 }, { "epoch": 1.4947368421052631, "step": 568, "train_accuracy": 0.640625 }, { "epoch": 1.4973684210526317, "grad_norm": 2.837728500366211, "learning_rate": 1.4404558771361027e-05, "loss": 1.3423, "step": 569 }, { "epoch": 1.4973684210526317, "step": 569, "train_accuracy": 0.8125 }, { "epoch": 1.5, "grad_norm": 3.266947031021118, "learning_rate": 1.4385414175142855e-05, "loss": 1.1948, "step": 570 }, { "epoch": 1.5, "step": 570, "train_accuracy": 0.671875 }, { "epoch": 1.5026315789473683, "grad_norm": 12.204270362854004, "learning_rate": 1.4366249657736866e-05, "loss": 1.3867, "step": 571 }, { "epoch": 1.5026315789473683, "step": 571, "train_accuracy": 0.828125 }, { "epoch": 1.5052631578947369, "grad_norm": 6.56190299987793, "learning_rate": 1.4347065306199823e-05, "loss": 1.2837, "step": 572 }, { "epoch": 1.5052631578947369, "step": 572, "train_accuracy": 0.8125 }, { "epoch": 1.5078947368421054, "grad_norm": 5.147476673126221, "learning_rate": 1.4327861207678581e-05, "loss": 1.2783, "step": 573 }, { "epoch": 1.5078947368421054, "step": 573, "train_accuracy": 0.6875 }, { "epoch": 1.5105263157894737, "grad_norm": 3.911860704421997, "learning_rate": 1.4308637449409705e-05, "loss": 1.3105, "step": 574 }, { "epoch": 1.5105263157894737, "step": 574, "train_accuracy": 0.71875 }, { "epoch": 1.513157894736842, "grad_norm": 4.936273097991943, "learning_rate": 1.4289394118719061e-05, "loss": 1.4072, "step": 575 }, { "epoch": 1.513157894736842, "step": 575, "train_accuracy": 0.703125 }, { "epoch": 1.5157894736842106, "grad_norm": 4.839917182922363, "learning_rate": 1.4270131303021431e-05, "loss": 1.4307, "step": 576 }, { "epoch": 1.5157894736842106, "step": 576, "train_accuracy": 0.671875 }, { "epoch": 1.518421052631579, "grad_norm": 9.490140914916992, "learning_rate": 1.4250849089820095e-05, "loss": 1.3818, "step": 577 }, { "epoch": 1.518421052631579, "step": 577, "train_accuracy": 0.734375 }, { "epoch": 1.5210526315789474, "grad_norm": 3.4175305366516113, "learning_rate": 1.423154756670647e-05, "loss": 1.0977, "step": 578 }, { "epoch": 1.5210526315789474, "step": 578, "train_accuracy": 0.75 }, { "epoch": 1.5236842105263158, "grad_norm": 7.429515838623047, "learning_rate": 1.4212226821359672e-05, "loss": 1.1333, "step": 579 }, { "epoch": 1.5236842105263158, "step": 579, "train_accuracy": 0.796875 }, { "epoch": 1.526315789473684, "grad_norm": 4.309225559234619, "learning_rate": 1.4192886941546141e-05, "loss": 0.9233, "step": 580 }, { "epoch": 1.526315789473684, "step": 580, "train_accuracy": 0.765625 }, { "epoch": 1.5289473684210526, "grad_norm": 15.609049797058105, "learning_rate": 1.4173528015119247e-05, "loss": 1.4473, "step": 581 }, { "epoch": 1.5289473684210526, "step": 581, "train_accuracy": 0.640625 }, { "epoch": 1.5315789473684212, "grad_norm": 8.803685188293457, "learning_rate": 1.4154150130018867e-05, "loss": 1.293, "step": 582 }, { "epoch": 1.5315789473684212, "step": 582, "train_accuracy": 0.8125 }, { "epoch": 1.5342105263157895, "grad_norm": 3.669485092163086, "learning_rate": 1.4134753374271003e-05, "loss": 1.0322, "step": 583 }, { "epoch": 1.5342105263157895, "step": 583, "train_accuracy": 0.734375 }, { "epoch": 1.5368421052631578, "grad_norm": 6.106133460998535, "learning_rate": 1.4115337835987388e-05, "loss": 1.4111, "step": 584 }, { "epoch": 1.5368421052631578, "step": 584, "train_accuracy": 0.828125 }, { "epoch": 1.5394736842105263, "grad_norm": 5.285925388336182, "learning_rate": 1.4095903603365067e-05, "loss": 1.21, "step": 585 }, { "epoch": 1.5394736842105263, "step": 585, "train_accuracy": 0.734375 }, { "epoch": 1.5421052631578949, "grad_norm": 7.868580341339111, "learning_rate": 1.4076450764686005e-05, "loss": 1.2734, "step": 586 }, { "epoch": 1.5421052631578949, "step": 586, "train_accuracy": 0.828125 }, { "epoch": 1.5447368421052632, "grad_norm": 9.833069801330566, "learning_rate": 1.40569794083167e-05, "loss": 1.1172, "step": 587 }, { "epoch": 1.5447368421052632, "step": 587, "train_accuracy": 0.71875 }, { "epoch": 1.5473684210526315, "grad_norm": 12.843183517456055, "learning_rate": 1.4037489622707749e-05, "loss": 1.4414, "step": 588 }, { "epoch": 1.5473684210526315, "step": 588, "train_accuracy": 0.703125 }, { "epoch": 1.55, "grad_norm": 4.2173662185668945, "learning_rate": 1.4017981496393484e-05, "loss": 1.186, "step": 589 }, { "epoch": 1.55, "step": 589, "train_accuracy": 0.734375 }, { "epoch": 1.5526315789473686, "grad_norm": 9.550527572631836, "learning_rate": 1.3998455117991542e-05, "loss": 1.1289, "step": 590 }, { "epoch": 1.5526315789473686, "step": 590, "train_accuracy": 0.671875 }, { "epoch": 1.555263157894737, "grad_norm": 17.769779205322266, "learning_rate": 1.3978910576202472e-05, "loss": 1.9341, "step": 591 }, { "epoch": 1.555263157894737, "step": 591, "train_accuracy": 0.6875 }, { "epoch": 1.5578947368421052, "grad_norm": 11.969633102416992, "learning_rate": 1.395934795980933e-05, "loss": 1.4902, "step": 592 }, { "epoch": 1.5578947368421052, "step": 592, "train_accuracy": 0.71875 }, { "epoch": 1.5605263157894735, "grad_norm": 2.4993820190429688, "learning_rate": 1.3939767357677287e-05, "loss": 1.1899, "step": 593 }, { "epoch": 1.5605263157894735, "step": 593, "train_accuracy": 0.796875 }, { "epoch": 1.563157894736842, "grad_norm": 6.35914421081543, "learning_rate": 1.3920168858753208e-05, "loss": 1.2163, "step": 594 }, { "epoch": 1.563157894736842, "step": 594, "train_accuracy": 0.703125 }, { "epoch": 1.5657894736842106, "grad_norm": 9.9996337890625, "learning_rate": 1.3900552552065259e-05, "loss": 1.3228, "step": 595 }, { "epoch": 1.5657894736842106, "step": 595, "train_accuracy": 0.6875 }, { "epoch": 1.568421052631579, "grad_norm": 8.151016235351562, "learning_rate": 1.3880918526722497e-05, "loss": 1.5107, "step": 596 }, { "epoch": 1.568421052631579, "step": 596, "train_accuracy": 0.78125 }, { "epoch": 1.5710526315789473, "grad_norm": 5.698625564575195, "learning_rate": 1.3861266871914473e-05, "loss": 1.4619, "step": 597 }, { "epoch": 1.5710526315789473, "step": 597, "train_accuracy": 0.734375 }, { "epoch": 1.5736842105263158, "grad_norm": 10.527148246765137, "learning_rate": 1.3841597676910816e-05, "loss": 1.3984, "step": 598 }, { "epoch": 1.5736842105263158, "step": 598, "train_accuracy": 0.78125 }, { "epoch": 1.5763157894736843, "grad_norm": 4.670735836029053, "learning_rate": 1.3821911031060834e-05, "loss": 1.3779, "step": 599 }, { "epoch": 1.5763157894736843, "step": 599, "train_accuracy": 0.640625 }, { "epoch": 1.5789473684210527, "grad_norm": 5.227207183837891, "learning_rate": 1.3802207023793112e-05, "loss": 1.3438, "step": 600 }, { "epoch": 1.5789473684210527, "step": 600, "train_accuracy": 0.75 }, { "epoch": 1.581578947368421, "grad_norm": 7.337916851043701, "learning_rate": 1.3782485744615098e-05, "loss": 1.3682, "step": 601 }, { "epoch": 1.581578947368421, "step": 601, "train_accuracy": 0.703125 }, { "epoch": 1.5842105263157895, "grad_norm": 9.321327209472656, "learning_rate": 1.3762747283112692e-05, "loss": 1.3521, "step": 602 }, { "epoch": 1.5842105263157895, "step": 602, "train_accuracy": 0.625 }, { "epoch": 1.586842105263158, "grad_norm": 2.6610355377197266, "learning_rate": 1.3742991728949862e-05, "loss": 1.252, "step": 603 }, { "epoch": 1.586842105263158, "step": 603, "train_accuracy": 0.78125 }, { "epoch": 1.5894736842105264, "grad_norm": 4.487646102905273, "learning_rate": 1.3723219171868207e-05, "loss": 1.1855, "step": 604 }, { "epoch": 1.5894736842105264, "step": 604, "train_accuracy": 0.734375 }, { "epoch": 1.5921052631578947, "grad_norm": 2.9246814250946045, "learning_rate": 1.370342970168657e-05, "loss": 1.3052, "step": 605 }, { "epoch": 1.5921052631578947, "step": 605, "train_accuracy": 0.8125 }, { "epoch": 1.594736842105263, "grad_norm": 11.696772575378418, "learning_rate": 1.3683623408300626e-05, "loss": 1.2334, "step": 606 }, { "epoch": 1.594736842105263, "step": 606, "train_accuracy": 0.828125 }, { "epoch": 1.5973684210526315, "grad_norm": 10.42992115020752, "learning_rate": 1.3663800381682465e-05, "loss": 1.1255, "step": 607 }, { "epoch": 1.5973684210526315, "step": 607, "train_accuracy": 0.8125 }, { "epoch": 1.6, "grad_norm": 8.260746955871582, "learning_rate": 1.3643960711880191e-05, "loss": 1.1284, "step": 608 }, { "epoch": 1.6, "eval_accuracy": 0.7060275077819824, "eval_max_score": 5.46875, "eval_min_score": -6.65625, "eval_runtime": 151.2596, "eval_samples_per_second": 18.756, "eval_steps_per_second": 0.298, "step": 608 }, { "epoch": 1.6, "step": 608, "train_accuracy": 0.703125 }, { "epoch": 1.6026315789473684, "grad_norm": 3.6682422161102295, "learning_rate": 1.3624104489017513e-05, "loss": 1.2915, "step": 609 }, { "epoch": 1.6026315789473684, "step": 609, "train_accuracy": 0.859375 }, { "epoch": 1.6052631578947367, "grad_norm": 7.5316596031188965, "learning_rate": 1.3604231803293336e-05, "loss": 1.1611, "step": 610 }, { "epoch": 1.6052631578947367, "step": 610, "train_accuracy": 0.765625 }, { "epoch": 1.6078947368421053, "grad_norm": 10.11808967590332, "learning_rate": 1.3584342744981343e-05, "loss": 1.2471, "step": 611 }, { "epoch": 1.6078947368421053, "step": 611, "train_accuracy": 0.703125 }, { "epoch": 1.6105263157894738, "grad_norm": 5.6028242111206055, "learning_rate": 1.3564437404429595e-05, "loss": 1.5654, "step": 612 }, { "epoch": 1.6105263157894738, "step": 612, "train_accuracy": 0.703125 }, { "epoch": 1.6131578947368421, "grad_norm": 4.102721691131592, "learning_rate": 1.3544515872060118e-05, "loss": 1.1572, "step": 613 }, { "epoch": 1.6131578947368421, "step": 613, "train_accuracy": 0.671875 }, { "epoch": 1.6157894736842104, "grad_norm": 3.3373892307281494, "learning_rate": 1.3524578238368489e-05, "loss": 1.3301, "step": 614 }, { "epoch": 1.6157894736842104, "step": 614, "train_accuracy": 0.75 }, { "epoch": 1.618421052631579, "grad_norm": 19.588586807250977, "learning_rate": 1.350462459392343e-05, "loss": 1.3608, "step": 615 }, { "epoch": 1.618421052631579, "step": 615, "train_accuracy": 0.6875 }, { "epoch": 1.6210526315789475, "grad_norm": 4.01005220413208, "learning_rate": 1.3484655029366387e-05, "loss": 1.4072, "step": 616 }, { "epoch": 1.6210526315789475, "step": 616, "train_accuracy": 0.734375 }, { "epoch": 1.6236842105263158, "grad_norm": 3.243781566619873, "learning_rate": 1.3464669635411127e-05, "loss": 1.3027, "step": 617 }, { "epoch": 1.6236842105263158, "step": 617, "train_accuracy": 0.75 }, { "epoch": 1.6263157894736842, "grad_norm": 6.2855753898620605, "learning_rate": 1.344466850284333e-05, "loss": 1.209, "step": 618 }, { "epoch": 1.6263157894736842, "step": 618, "train_accuracy": 0.734375 }, { "epoch": 1.6289473684210525, "grad_norm": 8.140256881713867, "learning_rate": 1.3424651722520164e-05, "loss": 1.543, "step": 619 }, { "epoch": 1.6289473684210525, "step": 619, "train_accuracy": 0.71875 }, { "epoch": 1.631578947368421, "grad_norm": 4.431771278381348, "learning_rate": 1.3404619385369876e-05, "loss": 1.3579, "step": 620 }, { "epoch": 1.631578947368421, "step": 620, "train_accuracy": 0.78125 }, { "epoch": 1.6342105263157896, "grad_norm": 11.76955509185791, "learning_rate": 1.3384571582391392e-05, "loss": 1.1875, "step": 621 }, { "epoch": 1.6342105263157896, "step": 621, "train_accuracy": 0.625 }, { "epoch": 1.6368421052631579, "grad_norm": 6.960719585418701, "learning_rate": 1.3364508404653879e-05, "loss": 1.5352, "step": 622 }, { "epoch": 1.6368421052631579, "step": 622, "train_accuracy": 0.78125 }, { "epoch": 1.6394736842105262, "grad_norm": 4.3750762939453125, "learning_rate": 1.3344429943296358e-05, "loss": 1.3037, "step": 623 }, { "epoch": 1.6394736842105262, "step": 623, "train_accuracy": 0.8125 }, { "epoch": 1.6421052631578947, "grad_norm": 11.1054105758667, "learning_rate": 1.3324336289527272e-05, "loss": 1.4268, "step": 624 }, { "epoch": 1.6421052631578947, "step": 624, "train_accuracy": 0.609375 }, { "epoch": 1.6447368421052633, "grad_norm": 14.030521392822266, "learning_rate": 1.3304227534624072e-05, "loss": 1.4775, "step": 625 }, { "epoch": 1.6447368421052633, "step": 625, "train_accuracy": 0.75 }, { "epoch": 1.6473684210526316, "grad_norm": 12.589299201965332, "learning_rate": 1.328410376993282e-05, "loss": 1.2598, "step": 626 }, { "epoch": 1.6473684210526316, "step": 626, "train_accuracy": 0.625 }, { "epoch": 1.65, "grad_norm": 4.208714962005615, "learning_rate": 1.3263965086867752e-05, "loss": 1.4551, "step": 627 }, { "epoch": 1.65, "step": 627, "train_accuracy": 0.875 }, { "epoch": 1.6526315789473685, "grad_norm": 8.422722816467285, "learning_rate": 1.3243811576910873e-05, "loss": 1.1475, "step": 628 }, { "epoch": 1.6526315789473685, "step": 628, "train_accuracy": 0.765625 }, { "epoch": 1.655263157894737, "grad_norm": 11.778885841369629, "learning_rate": 1.3223643331611538e-05, "loss": 1.2666, "step": 629 }, { "epoch": 1.655263157894737, "step": 629, "train_accuracy": 0.765625 }, { "epoch": 1.6578947368421053, "grad_norm": 3.499476432800293, "learning_rate": 1.3203460442586052e-05, "loss": 1.1436, "step": 630 }, { "epoch": 1.6578947368421053, "step": 630, "train_accuracy": 0.796875 }, { "epoch": 1.6605263157894736, "grad_norm": 9.276093482971191, "learning_rate": 1.3183263001517224e-05, "loss": 1.2021, "step": 631 }, { "epoch": 1.6605263157894736, "step": 631, "train_accuracy": 0.875 }, { "epoch": 1.663157894736842, "grad_norm": 5.010464668273926, "learning_rate": 1.3163051100153979e-05, "loss": 0.9712, "step": 632 }, { "epoch": 1.663157894736842, "step": 632, "train_accuracy": 0.71875 }, { "epoch": 1.6657894736842105, "grad_norm": 4.382296085357666, "learning_rate": 1.314282483031092e-05, "loss": 1.3379, "step": 633 }, { "epoch": 1.6657894736842105, "step": 633, "train_accuracy": 0.765625 }, { "epoch": 1.668421052631579, "grad_norm": 8.575241088867188, "learning_rate": 1.3122584283867932e-05, "loss": 1.3179, "step": 634 }, { "epoch": 1.668421052631579, "step": 634, "train_accuracy": 0.71875 }, { "epoch": 1.6710526315789473, "grad_norm": 9.079877853393555, "learning_rate": 1.3102329552769742e-05, "loss": 1.1982, "step": 635 }, { "epoch": 1.6710526315789473, "step": 635, "train_accuracy": 0.6875 }, { "epoch": 1.6736842105263157, "grad_norm": 4.818666458129883, "learning_rate": 1.3082060729025515e-05, "loss": 1.3408, "step": 636 }, { "epoch": 1.6736842105263157, "step": 636, "train_accuracy": 0.75 }, { "epoch": 1.6763157894736842, "grad_norm": 4.9298601150512695, "learning_rate": 1.3061777904708437e-05, "loss": 1.3608, "step": 637 }, { "epoch": 1.6763157894736842, "step": 637, "train_accuracy": 0.734375 }, { "epoch": 1.6789473684210527, "grad_norm": 3.88163685798645, "learning_rate": 1.3041481171955293e-05, "loss": 1.2104, "step": 638 }, { "epoch": 1.6789473684210527, "step": 638, "train_accuracy": 0.796875 }, { "epoch": 1.681578947368421, "grad_norm": 3.570984125137329, "learning_rate": 1.3021170622966039e-05, "loss": 1.3003, "step": 639 }, { "epoch": 1.681578947368421, "step": 639, "train_accuracy": 0.671875 }, { "epoch": 1.6842105263157894, "grad_norm": 3.3834519386291504, "learning_rate": 1.300084635000341e-05, "loss": 1.293, "step": 640 }, { "epoch": 1.6842105263157894, "step": 640, "train_accuracy": 0.75 }, { "epoch": 1.686842105263158, "grad_norm": 6.387214660644531, "learning_rate": 1.298050844539246e-05, "loss": 1.3428, "step": 641 }, { "epoch": 1.686842105263158, "step": 641, "train_accuracy": 0.6875 }, { "epoch": 1.6894736842105265, "grad_norm": 5.291227340698242, "learning_rate": 1.2960157001520193e-05, "loss": 1.0625, "step": 642 }, { "epoch": 1.6894736842105265, "step": 642, "train_accuracy": 0.703125 }, { "epoch": 1.6921052631578948, "grad_norm": 8.750785827636719, "learning_rate": 1.2939792110835094e-05, "loss": 1.2295, "step": 643 }, { "epoch": 1.6921052631578948, "step": 643, "train_accuracy": 0.671875 }, { "epoch": 1.694736842105263, "grad_norm": 5.106113910675049, "learning_rate": 1.2919413865846744e-05, "loss": 1.4746, "step": 644 }, { "epoch": 1.694736842105263, "step": 644, "train_accuracy": 0.703125 }, { "epoch": 1.6973684210526314, "grad_norm": 3.0670228004455566, "learning_rate": 1.2899022359125381e-05, "loss": 1.3691, "step": 645 }, { "epoch": 1.6973684210526314, "step": 645, "train_accuracy": 0.765625 }, { "epoch": 1.7, "grad_norm": 11.231282234191895, "learning_rate": 1.2878617683301493e-05, "loss": 1.3296, "step": 646 }, { "epoch": 1.7, "step": 646, "train_accuracy": 0.78125 }, { "epoch": 1.7026315789473685, "grad_norm": 4.981436252593994, "learning_rate": 1.2858199931065382e-05, "loss": 1.2363, "step": 647 }, { "epoch": 1.7026315789473685, "step": 647, "train_accuracy": 0.65625 }, { "epoch": 1.7052631578947368, "grad_norm": 5.563715934753418, "learning_rate": 1.2837769195166757e-05, "loss": 1.2451, "step": 648 }, { "epoch": 1.7052631578947368, "step": 648, "train_accuracy": 0.65625 }, { "epoch": 1.7078947368421051, "grad_norm": 4.0916972160339355, "learning_rate": 1.2817325568414299e-05, "loss": 1.4819, "step": 649 }, { "epoch": 1.7078947368421051, "step": 649, "train_accuracy": 0.734375 }, { "epoch": 1.7105263157894737, "grad_norm": 2.385584831237793, "learning_rate": 1.2796869143675254e-05, "loss": 1.3257, "step": 650 }, { "epoch": 1.7105263157894737, "step": 650, "train_accuracy": 0.828125 }, { "epoch": 1.7131578947368422, "grad_norm": 6.606967926025391, "learning_rate": 1.2776400013875006e-05, "loss": 1.145, "step": 651 }, { "epoch": 1.7131578947368422, "step": 651, "train_accuracy": 0.671875 }, { "epoch": 1.7157894736842105, "grad_norm": 6.199295520782471, "learning_rate": 1.2755918271996645e-05, "loss": 1.3477, "step": 652 }, { "epoch": 1.7157894736842105, "step": 652, "train_accuracy": 0.734375 }, { "epoch": 1.7184210526315788, "grad_norm": 3.354233980178833, "learning_rate": 1.2735424011080562e-05, "loss": 1.2456, "step": 653 }, { "epoch": 1.7184210526315788, "step": 653, "train_accuracy": 0.75 }, { "epoch": 1.7210526315789474, "grad_norm": 2.5530078411102295, "learning_rate": 1.2714917324224003e-05, "loss": 1.3369, "step": 654 }, { "epoch": 1.7210526315789474, "step": 654, "train_accuracy": 0.703125 }, { "epoch": 1.723684210526316, "grad_norm": 2.3354504108428955, "learning_rate": 1.2694398304580677e-05, "loss": 1.2271, "step": 655 }, { "epoch": 1.723684210526316, "step": 655, "train_accuracy": 0.71875 }, { "epoch": 1.7263157894736842, "grad_norm": 4.240665435791016, "learning_rate": 1.2673867045360304e-05, "loss": 1.3633, "step": 656 }, { "epoch": 1.7263157894736842, "step": 656, "train_accuracy": 0.71875 }, { "epoch": 1.7289473684210526, "grad_norm": 2.5598838329315186, "learning_rate": 1.2653323639828208e-05, "loss": 1.3408, "step": 657 }, { "epoch": 1.7289473684210526, "step": 657, "train_accuracy": 0.796875 }, { "epoch": 1.731578947368421, "grad_norm": 4.531867027282715, "learning_rate": 1.2632768181304888e-05, "loss": 1.3018, "step": 658 }, { "epoch": 1.731578947368421, "step": 658, "train_accuracy": 0.828125 }, { "epoch": 1.7342105263157894, "grad_norm": 2.8843955993652344, "learning_rate": 1.2612200763165597e-05, "loss": 1.3086, "step": 659 }, { "epoch": 1.7342105263157894, "step": 659, "train_accuracy": 0.78125 }, { "epoch": 1.736842105263158, "grad_norm": 2.640134572982788, "learning_rate": 1.2591621478839911e-05, "loss": 1.231, "step": 660 }, { "epoch": 1.736842105263158, "step": 660, "train_accuracy": 0.703125 }, { "epoch": 1.7394736842105263, "grad_norm": 2.6228537559509277, "learning_rate": 1.2571030421811314e-05, "loss": 1.3301, "step": 661 }, { "epoch": 1.7394736842105263, "step": 661, "train_accuracy": 0.734375 }, { "epoch": 1.7421052631578946, "grad_norm": 4.74251127243042, "learning_rate": 1.2550427685616767e-05, "loss": 1.3125, "step": 662 }, { "epoch": 1.7421052631578946, "step": 662, "train_accuracy": 0.75 }, { "epoch": 1.7447368421052631, "grad_norm": 8.167641639709473, "learning_rate": 1.2529813363846284e-05, "loss": 1.3848, "step": 663 }, { "epoch": 1.7447368421052631, "step": 663, "train_accuracy": 0.625 }, { "epoch": 1.7473684210526317, "grad_norm": 3.8640027046203613, "learning_rate": 1.2509187550142507e-05, "loss": 1.2764, "step": 664 }, { "epoch": 1.7473684210526317, "step": 664, "train_accuracy": 0.703125 }, { "epoch": 1.75, "grad_norm": 3.9919190406799316, "learning_rate": 1.2488550338200285e-05, "loss": 1.3896, "step": 665 }, { "epoch": 1.75, "step": 665, "train_accuracy": 0.75 }, { "epoch": 1.7526315789473683, "grad_norm": 6.61153507232666, "learning_rate": 1.2467901821766241e-05, "loss": 1.0601, "step": 666 }, { "epoch": 1.7526315789473683, "step": 666, "train_accuracy": 0.671875 }, { "epoch": 1.7552631578947369, "grad_norm": 3.031205892562866, "learning_rate": 1.2447242094638349e-05, "loss": 1.3569, "step": 667 }, { "epoch": 1.7552631578947369, "step": 667, "train_accuracy": 0.71875 }, { "epoch": 1.7578947368421054, "grad_norm": 5.902942657470703, "learning_rate": 1.2426571250665517e-05, "loss": 1.1152, "step": 668 }, { "epoch": 1.7578947368421054, "step": 668, "train_accuracy": 0.734375 }, { "epoch": 1.7605263157894737, "grad_norm": 2.655332326889038, "learning_rate": 1.2405889383747144e-05, "loss": 1.3159, "step": 669 }, { "epoch": 1.7605263157894737, "step": 669, "train_accuracy": 0.71875 }, { "epoch": 1.763157894736842, "grad_norm": 5.942189693450928, "learning_rate": 1.2385196587832702e-05, "loss": 1.2368, "step": 670 }, { "epoch": 1.763157894736842, "step": 670, "train_accuracy": 0.71875 }, { "epoch": 1.7657894736842106, "grad_norm": 5.477725028991699, "learning_rate": 1.236449295692131e-05, "loss": 1.2134, "step": 671 }, { "epoch": 1.7657894736842106, "step": 671, "train_accuracy": 0.734375 }, { "epoch": 1.768421052631579, "grad_norm": 3.354971408843994, "learning_rate": 1.234377858506131e-05, "loss": 1.3511, "step": 672 }, { "epoch": 1.768421052631579, "step": 672, "train_accuracy": 0.71875 }, { "epoch": 1.7710526315789474, "grad_norm": 3.9351298809051514, "learning_rate": 1.2323053566349834e-05, "loss": 1.2578, "step": 673 }, { "epoch": 1.7710526315789474, "step": 673, "train_accuracy": 0.75 }, { "epoch": 1.7736842105263158, "grad_norm": 4.374091148376465, "learning_rate": 1.2302317994932373e-05, "loss": 1.3262, "step": 674 }, { "epoch": 1.7736842105263158, "step": 674, "train_accuracy": 0.734375 }, { "epoch": 1.776315789473684, "grad_norm": 7.173335552215576, "learning_rate": 1.2281571965002363e-05, "loss": 1.0754, "step": 675 }, { "epoch": 1.776315789473684, "step": 675, "train_accuracy": 0.75 }, { "epoch": 1.7789473684210526, "grad_norm": 6.442569732666016, "learning_rate": 1.2260815570800743e-05, "loss": 1.3569, "step": 676 }, { "epoch": 1.7789473684210526, "step": 676, "train_accuracy": 0.671875 }, { "epoch": 1.7815789473684212, "grad_norm": 3.3768608570098877, "learning_rate": 1.2240048906615536e-05, "loss": 1.373, "step": 677 }, { "epoch": 1.7815789473684212, "step": 677, "train_accuracy": 0.703125 }, { "epoch": 1.7842105263157895, "grad_norm": 8.247380256652832, "learning_rate": 1.2219272066781416e-05, "loss": 1.3501, "step": 678 }, { "epoch": 1.7842105263157895, "step": 678, "train_accuracy": 0.796875 }, { "epoch": 1.7868421052631578, "grad_norm": 3.1177291870117188, "learning_rate": 1.219848514567928e-05, "loss": 1.167, "step": 679 }, { "epoch": 1.7868421052631578, "step": 679, "train_accuracy": 0.84375 }, { "epoch": 1.7894736842105263, "grad_norm": 3.9514224529266357, "learning_rate": 1.2177688237735823e-05, "loss": 1.4126, "step": 680 }, { "epoch": 1.7894736842105263, "step": 680, "train_accuracy": 0.765625 }, { "epoch": 1.7921052631578949, "grad_norm": 5.616189002990723, "learning_rate": 1.2156881437423103e-05, "loss": 1.3857, "step": 681 }, { "epoch": 1.7921052631578949, "step": 681, "train_accuracy": 0.8125 }, { "epoch": 1.7947368421052632, "grad_norm": 12.491791725158691, "learning_rate": 1.2136064839258119e-05, "loss": 1.3823, "step": 682 }, { "epoch": 1.7947368421052632, "step": 682, "train_accuracy": 0.875 }, { "epoch": 1.7973684210526315, "grad_norm": 5.0712504386901855, "learning_rate": 1.2115238537802371e-05, "loss": 1.3481, "step": 683 }, { "epoch": 1.7973684210526315, "step": 683, "train_accuracy": 0.875 }, { "epoch": 1.8, "grad_norm": 2.5290355682373047, "learning_rate": 1.2094402627661447e-05, "loss": 1.0576, "step": 684 }, { "epoch": 1.8, "eval_accuracy": 0.7134296894073486, "eval_max_score": 5.09375, "eval_min_score": -5.46875, "eval_runtime": 151.5395, "eval_samples_per_second": 18.721, "eval_steps_per_second": 0.297, "step": 684 }, { "epoch": 1.8, "step": 684, "train_accuracy": 0.875 }, { "epoch": 1.8026315789473686, "grad_norm": 4.08900260925293, "learning_rate": 1.2073557203484571e-05, "loss": 1.0898, "step": 685 }, { "epoch": 1.8026315789473686, "step": 685, "train_accuracy": 0.796875 }, { "epoch": 1.805263157894737, "grad_norm": 3.661402940750122, "learning_rate": 1.2052702359964201e-05, "loss": 1.27, "step": 686 }, { "epoch": 1.805263157894737, "step": 686, "train_accuracy": 0.734375 }, { "epoch": 1.8078947368421052, "grad_norm": 3.4118309020996094, "learning_rate": 1.2031838191835569e-05, "loss": 1.416, "step": 687 }, { "epoch": 1.8078947368421052, "step": 687, "train_accuracy": 0.78125 }, { "epoch": 1.8105263157894735, "grad_norm": 7.626739978790283, "learning_rate": 1.2010964793876274e-05, "loss": 1.2495, "step": 688 }, { "epoch": 1.8105263157894735, "step": 688, "train_accuracy": 0.6875 }, { "epoch": 1.813157894736842, "grad_norm": 3.9986813068389893, "learning_rate": 1.1990082260905836e-05, "loss": 1.4053, "step": 689 }, { "epoch": 1.813157894736842, "step": 689, "train_accuracy": 0.828125 }, { "epoch": 1.8157894736842106, "grad_norm": 5.343590259552002, "learning_rate": 1.1969190687785278e-05, "loss": 1.1636, "step": 690 }, { "epoch": 1.8157894736842106, "step": 690, "train_accuracy": 0.671875 }, { "epoch": 1.818421052631579, "grad_norm": 3.1849772930145264, "learning_rate": 1.1948290169416682e-05, "loss": 1.3369, "step": 691 }, { "epoch": 1.818421052631579, "step": 691, "train_accuracy": 0.71875 }, { "epoch": 1.8210526315789473, "grad_norm": 8.547561645507812, "learning_rate": 1.1927380800742772e-05, "loss": 1.2246, "step": 692 }, { "epoch": 1.8210526315789473, "step": 692, "train_accuracy": 0.84375 }, { "epoch": 1.8236842105263158, "grad_norm": 4.1838202476501465, "learning_rate": 1.1906462676746471e-05, "loss": 1.1265, "step": 693 }, { "epoch": 1.8236842105263158, "step": 693, "train_accuracy": 0.71875 }, { "epoch": 1.8263157894736843, "grad_norm": 3.1758480072021484, "learning_rate": 1.1885535892450473e-05, "loss": 1.1968, "step": 694 }, { "epoch": 1.8263157894736843, "step": 694, "train_accuracy": 0.75 }, { "epoch": 1.8289473684210527, "grad_norm": 3.9789164066314697, "learning_rate": 1.1864600542916813e-05, "loss": 1.4497, "step": 695 }, { "epoch": 1.8289473684210527, "step": 695, "train_accuracy": 0.71875 }, { "epoch": 1.831578947368421, "grad_norm": 3.6292474269866943, "learning_rate": 1.1843656723246442e-05, "loss": 1.4697, "step": 696 }, { "epoch": 1.831578947368421, "step": 696, "train_accuracy": 0.8125 }, { "epoch": 1.8342105263157895, "grad_norm": 4.709868907928467, "learning_rate": 1.1822704528578771e-05, "loss": 1.2129, "step": 697 }, { "epoch": 1.8342105263157895, "step": 697, "train_accuracy": 0.75 }, { "epoch": 1.836842105263158, "grad_norm": 2.8560163974761963, "learning_rate": 1.1801744054091275e-05, "loss": 1.2881, "step": 698 }, { "epoch": 1.836842105263158, "step": 698, "train_accuracy": 0.765625 }, { "epoch": 1.8394736842105264, "grad_norm": 3.929037094116211, "learning_rate": 1.1780775394999026e-05, "loss": 1.2056, "step": 699 }, { "epoch": 1.8394736842105264, "step": 699, "train_accuracy": 0.6875 }, { "epoch": 1.8421052631578947, "grad_norm": 4.683406352996826, "learning_rate": 1.1759798646554284e-05, "loss": 1.5181, "step": 700 }, { "epoch": 1.8421052631578947, "step": 700, "train_accuracy": 0.71875 }, { "epoch": 1.844736842105263, "grad_norm": 8.26242733001709, "learning_rate": 1.1738813904046044e-05, "loss": 1.3506, "step": 701 }, { "epoch": 1.844736842105263, "step": 701, "train_accuracy": 0.703125 }, { "epoch": 1.8473684210526315, "grad_norm": 4.075243949890137, "learning_rate": 1.1717821262799633e-05, "loss": 1.2554, "step": 702 }, { "epoch": 1.8473684210526315, "step": 702, "train_accuracy": 0.640625 }, { "epoch": 1.85, "grad_norm": 3.1833837032318115, "learning_rate": 1.1696820818176242e-05, "loss": 1.2778, "step": 703 }, { "epoch": 1.85, "step": 703, "train_accuracy": 0.734375 }, { "epoch": 1.8526315789473684, "grad_norm": 7.329124450683594, "learning_rate": 1.1675812665572522e-05, "loss": 1.1284, "step": 704 }, { "epoch": 1.8526315789473684, "step": 704, "train_accuracy": 0.765625 }, { "epoch": 1.8552631578947367, "grad_norm": 13.962849617004395, "learning_rate": 1.165479690042013e-05, "loss": 1.3296, "step": 705 }, { "epoch": 1.8552631578947367, "step": 705, "train_accuracy": 0.8125 }, { "epoch": 1.8578947368421053, "grad_norm": 5.744749069213867, "learning_rate": 1.1633773618185302e-05, "loss": 1.2402, "step": 706 }, { "epoch": 1.8578947368421053, "step": 706, "train_accuracy": 0.8125 }, { "epoch": 1.8605263157894738, "grad_norm": 3.8666727542877197, "learning_rate": 1.1612742914368436e-05, "loss": 1.2944, "step": 707 }, { "epoch": 1.8605263157894738, "step": 707, "train_accuracy": 0.671875 }, { "epoch": 1.8631578947368421, "grad_norm": 4.552417278289795, "learning_rate": 1.1591704884503625e-05, "loss": 1.4844, "step": 708 }, { "epoch": 1.8631578947368421, "step": 708, "train_accuracy": 0.78125 }, { "epoch": 1.8657894736842104, "grad_norm": 6.385856628417969, "learning_rate": 1.1570659624158252e-05, "loss": 1.1055, "step": 709 }, { "epoch": 1.8657894736842104, "step": 709, "train_accuracy": 0.828125 }, { "epoch": 1.868421052631579, "grad_norm": 9.559865951538086, "learning_rate": 1.154960722893254e-05, "loss": 1.3003, "step": 710 }, { "epoch": 1.868421052631579, "step": 710, "train_accuracy": 0.75 }, { "epoch": 1.8710526315789475, "grad_norm": 7.753382205963135, "learning_rate": 1.1528547794459128e-05, "loss": 1.314, "step": 711 }, { "epoch": 1.8710526315789475, "step": 711, "train_accuracy": 0.703125 }, { "epoch": 1.8736842105263158, "grad_norm": 6.830855369567871, "learning_rate": 1.1507481416402631e-05, "loss": 1.2871, "step": 712 }, { "epoch": 1.8736842105263158, "step": 712, "train_accuracy": 0.703125 }, { "epoch": 1.8763157894736842, "grad_norm": 9.252273559570312, "learning_rate": 1.14864081904592e-05, "loss": 1.4844, "step": 713 }, { "epoch": 1.8763157894736842, "step": 713, "train_accuracy": 0.6875 }, { "epoch": 1.8789473684210525, "grad_norm": 3.4712741374969482, "learning_rate": 1.1465328212356096e-05, "loss": 1.2153, "step": 714 }, { "epoch": 1.8789473684210525, "step": 714, "train_accuracy": 0.703125 }, { "epoch": 1.881578947368421, "grad_norm": 10.48102855682373, "learning_rate": 1.1444241577851259e-05, "loss": 1.3228, "step": 715 }, { "epoch": 1.881578947368421, "step": 715, "train_accuracy": 0.75 }, { "epoch": 1.8842105263157896, "grad_norm": 13.115138053894043, "learning_rate": 1.1423148382732854e-05, "loss": 1.3706, "step": 716 }, { "epoch": 1.8842105263157896, "step": 716, "train_accuracy": 0.703125 }, { "epoch": 1.8868421052631579, "grad_norm": 12.306279182434082, "learning_rate": 1.1402048722818862e-05, "loss": 1.3506, "step": 717 }, { "epoch": 1.8868421052631579, "step": 717, "train_accuracy": 0.75 }, { "epoch": 1.8894736842105262, "grad_norm": 5.738865375518799, "learning_rate": 1.1380942693956616e-05, "loss": 1.3467, "step": 718 }, { "epoch": 1.8894736842105262, "step": 718, "train_accuracy": 0.71875 }, { "epoch": 1.8921052631578947, "grad_norm": 11.618247032165527, "learning_rate": 1.1359830392022397e-05, "loss": 1.3203, "step": 719 }, { "epoch": 1.8921052631578947, "step": 719, "train_accuracy": 0.71875 }, { "epoch": 1.8947368421052633, "grad_norm": 10.782801628112793, "learning_rate": 1.1338711912920966e-05, "loss": 1.2524, "step": 720 }, { "epoch": 1.8947368421052633, "step": 720, "train_accuracy": 0.796875 }, { "epoch": 1.8973684210526316, "grad_norm": 10.74484920501709, "learning_rate": 1.1317587352585158e-05, "loss": 1.1309, "step": 721 }, { "epoch": 1.8973684210526316, "step": 721, "train_accuracy": 0.734375 }, { "epoch": 1.9, "grad_norm": 10.578213691711426, "learning_rate": 1.1296456806975425e-05, "loss": 1.2578, "step": 722 }, { "epoch": 1.9, "step": 722, "train_accuracy": 0.796875 }, { "epoch": 1.9026315789473685, "grad_norm": 6.203153610229492, "learning_rate": 1.1275320372079409e-05, "loss": 1.0459, "step": 723 }, { "epoch": 1.9026315789473685, "step": 723, "train_accuracy": 0.75 }, { "epoch": 1.905263157894737, "grad_norm": 3.5795280933380127, "learning_rate": 1.1254178143911505e-05, "loss": 1.2227, "step": 724 }, { "epoch": 1.905263157894737, "step": 724, "train_accuracy": 0.734375 }, { "epoch": 1.9078947368421053, "grad_norm": 3.7006609439849854, "learning_rate": 1.1233030218512424e-05, "loss": 1.1079, "step": 725 }, { "epoch": 1.9078947368421053, "step": 725, "train_accuracy": 0.71875 }, { "epoch": 1.9105263157894736, "grad_norm": 7.2685418128967285, "learning_rate": 1.121187669194876e-05, "loss": 1.1953, "step": 726 }, { "epoch": 1.9105263157894736, "step": 726, "train_accuracy": 0.734375 }, { "epoch": 1.913157894736842, "grad_norm": 10.036980628967285, "learning_rate": 1.1190717660312546e-05, "loss": 1.4414, "step": 727 }, { "epoch": 1.913157894736842, "step": 727, "train_accuracy": 0.703125 }, { "epoch": 1.9157894736842105, "grad_norm": 18.515457153320312, "learning_rate": 1.1169553219720828e-05, "loss": 1.5098, "step": 728 }, { "epoch": 1.9157894736842105, "step": 728, "train_accuracy": 0.640625 }, { "epoch": 1.918421052631579, "grad_norm": 10.847208023071289, "learning_rate": 1.1148383466315215e-05, "loss": 1.2827, "step": 729 }, { "epoch": 1.918421052631579, "step": 729, "train_accuracy": 0.828125 }, { "epoch": 1.9210526315789473, "grad_norm": 2.8338921070098877, "learning_rate": 1.112720849626146e-05, "loss": 1.2183, "step": 730 }, { "epoch": 1.9210526315789473, "step": 730, "train_accuracy": 0.71875 }, { "epoch": 1.9236842105263157, "grad_norm": 2.897954225540161, "learning_rate": 1.1106028405749005e-05, "loss": 1.228, "step": 731 }, { "epoch": 1.9236842105263157, "step": 731, "train_accuracy": 0.65625 }, { "epoch": 1.9263157894736842, "grad_norm": 4.838603973388672, "learning_rate": 1.108484329099056e-05, "loss": 1.2529, "step": 732 }, { "epoch": 1.9263157894736842, "step": 732, "train_accuracy": 0.796875 }, { "epoch": 1.9289473684210527, "grad_norm": 15.432750701904297, "learning_rate": 1.1063653248221647e-05, "loss": 1.1182, "step": 733 }, { "epoch": 1.9289473684210527, "step": 733, "train_accuracy": 0.703125 }, { "epoch": 1.931578947368421, "grad_norm": 24.764320373535156, "learning_rate": 1.1042458373700182e-05, "loss": 1.8174, "step": 734 }, { "epoch": 1.931578947368421, "step": 734, "train_accuracy": 0.75 }, { "epoch": 1.9342105263157894, "grad_norm": 7.018698692321777, "learning_rate": 1.102125876370603e-05, "loss": 1.25, "step": 735 }, { "epoch": 1.9342105263157894, "step": 735, "train_accuracy": 0.875 }, { "epoch": 1.936842105263158, "grad_norm": 6.331981658935547, "learning_rate": 1.1000054514540563e-05, "loss": 1.3584, "step": 736 }, { "epoch": 1.936842105263158, "step": 736, "train_accuracy": 0.703125 }, { "epoch": 1.9394736842105265, "grad_norm": 4.260552406311035, "learning_rate": 1.0978845722526233e-05, "loss": 0.9893, "step": 737 }, { "epoch": 1.9394736842105265, "step": 737, "train_accuracy": 0.828125 }, { "epoch": 1.9421052631578948, "grad_norm": 7.735119342803955, "learning_rate": 1.095763248400612e-05, "loss": 1.2583, "step": 738 }, { "epoch": 1.9421052631578948, "step": 738, "train_accuracy": 0.75 }, { "epoch": 1.944736842105263, "grad_norm": 8.540193557739258, "learning_rate": 1.093641489534351e-05, "loss": 1.2437, "step": 739 }, { "epoch": 1.944736842105263, "step": 739, "train_accuracy": 0.71875 }, { "epoch": 1.9473684210526314, "grad_norm": 22.239505767822266, "learning_rate": 1.0915193052921444e-05, "loss": 1.6079, "step": 740 }, { "epoch": 1.9473684210526314, "step": 740, "train_accuracy": 0.796875 }, { "epoch": 1.95, "grad_norm": 2.21054744720459, "learning_rate": 1.0893967053142296e-05, "loss": 1.0818, "step": 741 }, { "epoch": 1.95, "step": 741, "train_accuracy": 0.796875 }, { "epoch": 1.9526315789473685, "grad_norm": 7.5065131187438965, "learning_rate": 1.0872736992427313e-05, "loss": 1.3613, "step": 742 }, { "epoch": 1.9526315789473685, "step": 742, "train_accuracy": 0.78125 }, { "epoch": 1.9552631578947368, "grad_norm": 3.3990285396575928, "learning_rate": 1.0851502967216199e-05, "loss": 1.2324, "step": 743 }, { "epoch": 1.9552631578947368, "step": 743, "train_accuracy": 0.6875 }, { "epoch": 1.9578947368421051, "grad_norm": 5.191620826721191, "learning_rate": 1.0830265073966659e-05, "loss": 1.4365, "step": 744 }, { "epoch": 1.9578947368421051, "step": 744, "train_accuracy": 0.8125 }, { "epoch": 1.9605263157894737, "grad_norm": 12.306520462036133, "learning_rate": 1.0809023409153975e-05, "loss": 1.3271, "step": 745 }, { "epoch": 1.9605263157894737, "step": 745, "train_accuracy": 0.71875 }, { "epoch": 1.9631578947368422, "grad_norm": 18.141544342041016, "learning_rate": 1.078777806927056e-05, "loss": 1.5117, "step": 746 }, { "epoch": 1.9631578947368422, "step": 746, "train_accuracy": 0.71875 }, { "epoch": 1.9657894736842105, "grad_norm": 17.377368927001953, "learning_rate": 1.076652915082552e-05, "loss": 1.5439, "step": 747 }, { "epoch": 1.9657894736842105, "step": 747, "train_accuracy": 0.84375 }, { "epoch": 1.9684210526315788, "grad_norm": 7.152597427368164, "learning_rate": 1.0745276750344217e-05, "loss": 1.1499, "step": 748 }, { "epoch": 1.9684210526315788, "step": 748, "train_accuracy": 0.640625 }, { "epoch": 1.9710526315789474, "grad_norm": 9.160234451293945, "learning_rate": 1.0724020964367836e-05, "loss": 1.4727, "step": 749 }, { "epoch": 1.9710526315789474, "step": 749, "train_accuracy": 0.796875 }, { "epoch": 1.973684210526316, "grad_norm": 12.51110553741455, "learning_rate": 1.070276188945293e-05, "loss": 1.5566, "step": 750 }, { "epoch": 1.973684210526316, "step": 750, "train_accuracy": 0.78125 }, { "epoch": 1.9763157894736842, "grad_norm": 14.964673042297363, "learning_rate": 1.0681499622171006e-05, "loss": 1.2637, "step": 751 }, { "epoch": 1.9763157894736842, "step": 751, "train_accuracy": 0.8125 }, { "epoch": 1.9789473684210526, "grad_norm": 7.775798797607422, "learning_rate": 1.0660234259108058e-05, "loss": 1.0046, "step": 752 }, { "epoch": 1.9789473684210526, "step": 752, "train_accuracy": 0.78125 }, { "epoch": 1.981578947368421, "grad_norm": 6.908376216888428, "learning_rate": 1.0638965896864155e-05, "loss": 1.1677, "step": 753 }, { "epoch": 1.981578947368421, "step": 753, "train_accuracy": 0.8125 }, { "epoch": 1.9842105263157894, "grad_norm": 4.5304975509643555, "learning_rate": 1.0617694632052985e-05, "loss": 1.3921, "step": 754 }, { "epoch": 1.9842105263157894, "step": 754, "train_accuracy": 0.78125 }, { "epoch": 1.986842105263158, "grad_norm": 3.6965484619140625, "learning_rate": 1.0596420561301421e-05, "loss": 1.2329, "step": 755 }, { "epoch": 1.986842105263158, "step": 755, "train_accuracy": 0.6875 }, { "epoch": 1.9894736842105263, "grad_norm": 10.095308303833008, "learning_rate": 1.0575143781249085e-05, "loss": 1.3174, "step": 756 }, { "epoch": 1.9894736842105263, "step": 756, "train_accuracy": 0.71875 }, { "epoch": 1.9921052631578946, "grad_norm": 18.230331420898438, "learning_rate": 1.0553864388547898e-05, "loss": 1.417, "step": 757 }, { "epoch": 1.9921052631578946, "step": 757, "train_accuracy": 0.796875 }, { "epoch": 1.9947368421052631, "grad_norm": 15.718420028686523, "learning_rate": 1.0532582479861661e-05, "loss": 1.2627, "step": 758 }, { "epoch": 1.9947368421052631, "step": 758, "train_accuracy": 0.671875 }, { "epoch": 1.9973684210526317, "grad_norm": 5.491973400115967, "learning_rate": 1.05112981518656e-05, "loss": 1.3433, "step": 759 }, { "epoch": 1.9973684210526317, "step": 759, "train_accuracy": 0.875 }, { "epoch": 2.0, "grad_norm": 3.911191701889038, "learning_rate": 1.0490011501245922e-05, "loss": 0.9978, "step": 760 }, { "epoch": 2.0, "eval_accuracy": 0.7095523476600647, "eval_max_score": 5.4375, "eval_min_score": -7.53125, "eval_runtime": 151.3191, "eval_samples_per_second": 18.748, "eval_steps_per_second": 0.297, "step": 760 }, { "epoch": 2.0, "step": 760, "train_accuracy": 0.703125 }, { "epoch": 2.0026315789473683, "grad_norm": 11.913331985473633, "learning_rate": 1.0468722624699401e-05, "loss": 1.1938, "step": 761 }, { "epoch": 2.0026315789473683, "step": 761, "train_accuracy": 0.796875 }, { "epoch": 2.0052631578947366, "grad_norm": 3.7143356800079346, "learning_rate": 1.0447431618932908e-05, "loss": 0.9375, "step": 762 }, { "epoch": 2.0052631578947366, "step": 762, "train_accuracy": 0.6875 }, { "epoch": 2.0078947368421054, "grad_norm": 11.733597755432129, "learning_rate": 1.0426138580662994e-05, "loss": 1.0718, "step": 763 }, { "epoch": 2.0078947368421054, "step": 763, "train_accuracy": 0.765625 }, { "epoch": 2.0105263157894737, "grad_norm": 5.847121715545654, "learning_rate": 1.040484360661544e-05, "loss": 0.9795, "step": 764 }, { "epoch": 2.0105263157894737, "step": 764, "train_accuracy": 0.78125 }, { "epoch": 2.013157894736842, "grad_norm": 10.243609428405762, "learning_rate": 1.0383546793524821e-05, "loss": 1.1621, "step": 765 }, { "epoch": 2.013157894736842, "step": 765, "train_accuracy": 0.828125 }, { "epoch": 2.0157894736842104, "grad_norm": 4.13088846206665, "learning_rate": 1.0362248238134069e-05, "loss": 1.2065, "step": 766 }, { "epoch": 2.0157894736842104, "step": 766, "train_accuracy": 0.78125 }, { "epoch": 2.018421052631579, "grad_norm": 13.116296768188477, "learning_rate": 1.0340948037194022e-05, "loss": 1.1763, "step": 767 }, { "epoch": 2.018421052631579, "step": 767, "train_accuracy": 0.875 }, { "epoch": 2.0210526315789474, "grad_norm": 9.769474983215332, "learning_rate": 1.0319646287463007e-05, "loss": 0.9854, "step": 768 }, { "epoch": 2.0210526315789474, "step": 768, "train_accuracy": 0.734375 }, { "epoch": 2.0236842105263158, "grad_norm": 5.3271870613098145, "learning_rate": 1.0298343085706373e-05, "loss": 1.1587, "step": 769 }, { "epoch": 2.0236842105263158, "step": 769, "train_accuracy": 0.765625 }, { "epoch": 2.026315789473684, "grad_norm": 5.8491339683532715, "learning_rate": 1.0277038528696069e-05, "loss": 1.1216, "step": 770 }, { "epoch": 2.026315789473684, "step": 770, "train_accuracy": 0.859375 }, { "epoch": 2.028947368421053, "grad_norm": 4.986546516418457, "learning_rate": 1.0255732713210207e-05, "loss": 0.8696, "step": 771 }, { "epoch": 2.028947368421053, "step": 771, "train_accuracy": 0.828125 }, { "epoch": 2.031578947368421, "grad_norm": 6.160093784332275, "learning_rate": 1.0234425736032607e-05, "loss": 0.8853, "step": 772 }, { "epoch": 2.031578947368421, "step": 772, "train_accuracy": 0.859375 }, { "epoch": 2.0342105263157895, "grad_norm": 5.357452869415283, "learning_rate": 1.021311769395237e-05, "loss": 1.061, "step": 773 }, { "epoch": 2.0342105263157895, "step": 773, "train_accuracy": 0.78125 }, { "epoch": 2.036842105263158, "grad_norm": 4.175728797912598, "learning_rate": 1.0191808683763435e-05, "loss": 1.1689, "step": 774 }, { "epoch": 2.036842105263158, "step": 774, "train_accuracy": 0.84375 }, { "epoch": 2.039473684210526, "grad_norm": 12.665717124938965, "learning_rate": 1.0170498802264137e-05, "loss": 1.2319, "step": 775 }, { "epoch": 2.039473684210526, "step": 775, "train_accuracy": 0.78125 }, { "epoch": 2.042105263157895, "grad_norm": 3.677302360534668, "learning_rate": 1.0149188146256772e-05, "loss": 1.1147, "step": 776 }, { "epoch": 2.042105263157895, "step": 776, "train_accuracy": 0.734375 }, { "epoch": 2.044736842105263, "grad_norm": 9.071887016296387, "learning_rate": 1.012787681254715e-05, "loss": 1.4141, "step": 777 }, { "epoch": 2.044736842105263, "step": 777, "train_accuracy": 0.796875 }, { "epoch": 2.0473684210526315, "grad_norm": 4.74293327331543, "learning_rate": 1.0106564897944161e-05, "loss": 1.2603, "step": 778 }, { "epoch": 2.0473684210526315, "step": 778, "train_accuracy": 0.796875 }, { "epoch": 2.05, "grad_norm": 6.72886323928833, "learning_rate": 1.0085252499259339e-05, "loss": 1.2246, "step": 779 }, { "epoch": 2.05, "step": 779, "train_accuracy": 0.875 }, { "epoch": 2.0526315789473686, "grad_norm": 7.605119705200195, "learning_rate": 1.0063939713306408e-05, "loss": 1.0317, "step": 780 }, { "epoch": 2.0526315789473686, "step": 780, "train_accuracy": 0.890625 }, { "epoch": 2.055263157894737, "grad_norm": 3.3540549278259277, "learning_rate": 1.0042626636900857e-05, "loss": 0.8687, "step": 781 }, { "epoch": 2.055263157894737, "step": 781, "train_accuracy": 0.765625 }, { "epoch": 2.057894736842105, "grad_norm": 3.413945436477661, "learning_rate": 1.0021313366859492e-05, "loss": 0.9585, "step": 782 }, { "epoch": 2.057894736842105, "step": 782, "train_accuracy": 0.859375 }, { "epoch": 2.0605263157894735, "grad_norm": 8.70964527130127, "learning_rate": 1e-05, "loss": 0.9995, "step": 783 }, { "epoch": 2.0605263157894735, "step": 783, "train_accuracy": 0.75 }, { "epoch": 2.0631578947368423, "grad_norm": 3.214557409286499, "learning_rate": 9.97868663314051e-06, "loss": 1.1606, "step": 784 }, { "epoch": 2.0631578947368423, "step": 784, "train_accuracy": 0.765625 }, { "epoch": 2.0657894736842106, "grad_norm": 3.9605302810668945, "learning_rate": 9.957373363099145e-06, "loss": 1.1567, "step": 785 }, { "epoch": 2.0657894736842106, "step": 785, "train_accuracy": 0.875 }, { "epoch": 2.068421052631579, "grad_norm": 4.05962610244751, "learning_rate": 9.936060286693592e-06, "loss": 1.2017, "step": 786 }, { "epoch": 2.068421052631579, "step": 786, "train_accuracy": 0.78125 }, { "epoch": 2.0710526315789473, "grad_norm": 5.312036991119385, "learning_rate": 9.914747500740664e-06, "loss": 1.1226, "step": 787 }, { "epoch": 2.0710526315789473, "step": 787, "train_accuracy": 0.890625 }, { "epoch": 2.0736842105263156, "grad_norm": 3.9408695697784424, "learning_rate": 9.893435102055837e-06, "loss": 0.9087, "step": 788 }, { "epoch": 2.0736842105263156, "step": 788, "train_accuracy": 0.859375 }, { "epoch": 2.0763157894736843, "grad_norm": 7.020488739013672, "learning_rate": 9.872123187452853e-06, "loss": 0.832, "step": 789 }, { "epoch": 2.0763157894736843, "step": 789, "train_accuracy": 0.765625 }, { "epoch": 2.0789473684210527, "grad_norm": 11.928730964660645, "learning_rate": 9.850811853743228e-06, "loss": 0.9893, "step": 790 }, { "epoch": 2.0789473684210527, "step": 790, "train_accuracy": 0.859375 }, { "epoch": 2.081578947368421, "grad_norm": 3.8629775047302246, "learning_rate": 9.829501197735866e-06, "loss": 1.0938, "step": 791 }, { "epoch": 2.081578947368421, "step": 791, "train_accuracy": 0.84375 }, { "epoch": 2.0842105263157893, "grad_norm": 5.992745876312256, "learning_rate": 9.808191316236567e-06, "loss": 1.0854, "step": 792 }, { "epoch": 2.0842105263157893, "step": 792, "train_accuracy": 0.8125 }, { "epoch": 2.086842105263158, "grad_norm": 13.866453170776367, "learning_rate": 9.786882306047634e-06, "loss": 1.1436, "step": 793 }, { "epoch": 2.086842105263158, "step": 793, "train_accuracy": 0.796875 }, { "epoch": 2.0894736842105264, "grad_norm": 6.951359748840332, "learning_rate": 9.765574263967397e-06, "loss": 0.9795, "step": 794 }, { "epoch": 2.0894736842105264, "step": 794, "train_accuracy": 0.78125 }, { "epoch": 2.0921052631578947, "grad_norm": 3.920224666595459, "learning_rate": 9.7442672867898e-06, "loss": 0.9082, "step": 795 }, { "epoch": 2.0921052631578947, "step": 795, "train_accuracy": 0.765625 }, { "epoch": 2.094736842105263, "grad_norm": 9.016509056091309, "learning_rate": 9.722961471303933e-06, "loss": 1.0391, "step": 796 }, { "epoch": 2.094736842105263, "step": 796, "train_accuracy": 0.765625 }, { "epoch": 2.0973684210526318, "grad_norm": 4.021119594573975, "learning_rate": 9.701656914293633e-06, "loss": 1.0806, "step": 797 }, { "epoch": 2.0973684210526318, "step": 797, "train_accuracy": 0.765625 }, { "epoch": 2.1, "grad_norm": 10.586660385131836, "learning_rate": 9.680353712536996e-06, "loss": 1.2656, "step": 798 }, { "epoch": 2.1, "step": 798, "train_accuracy": 0.71875 }, { "epoch": 2.1026315789473684, "grad_norm": 8.6112699508667, "learning_rate": 9.659051962805981e-06, "loss": 1.4502, "step": 799 }, { "epoch": 2.1026315789473684, "step": 799, "train_accuracy": 0.78125 }, { "epoch": 2.1052631578947367, "grad_norm": 11.76105785369873, "learning_rate": 9.637751761865935e-06, "loss": 1.4658, "step": 800 }, { "epoch": 2.1052631578947367, "step": 800, "train_accuracy": 0.90625 }, { "epoch": 2.1078947368421055, "grad_norm": 3.611421823501587, "learning_rate": 9.616453206475179e-06, "loss": 0.8423, "step": 801 }, { "epoch": 2.1078947368421055, "step": 801, "train_accuracy": 0.828125 }, { "epoch": 2.110526315789474, "grad_norm": 7.084579944610596, "learning_rate": 9.595156393384563e-06, "loss": 1.0503, "step": 802 }, { "epoch": 2.110526315789474, "step": 802, "train_accuracy": 0.6875 }, { "epoch": 2.113157894736842, "grad_norm": 7.71038293838501, "learning_rate": 9.573861419337006e-06, "loss": 1.0415, "step": 803 }, { "epoch": 2.113157894736842, "step": 803, "train_accuracy": 0.828125 }, { "epoch": 2.1157894736842104, "grad_norm": 3.812354564666748, "learning_rate": 9.552568381067094e-06, "loss": 1.0215, "step": 804 }, { "epoch": 2.1157894736842104, "step": 804, "train_accuracy": 0.78125 }, { "epoch": 2.1184210526315788, "grad_norm": 5.703183650970459, "learning_rate": 9.531277375300599e-06, "loss": 0.9658, "step": 805 }, { "epoch": 2.1184210526315788, "step": 805, "train_accuracy": 0.84375 }, { "epoch": 2.1210526315789475, "grad_norm": 4.764461517333984, "learning_rate": 9.50998849875408e-06, "loss": 1.0361, "step": 806 }, { "epoch": 2.1210526315789475, "step": 806, "train_accuracy": 0.859375 }, { "epoch": 2.123684210526316, "grad_norm": 3.6121408939361572, "learning_rate": 9.488701848134402e-06, "loss": 1.1016, "step": 807 }, { "epoch": 2.123684210526316, "step": 807, "train_accuracy": 0.84375 }, { "epoch": 2.126315789473684, "grad_norm": 5.072896480560303, "learning_rate": 9.467417520138342e-06, "loss": 1.1719, "step": 808 }, { "epoch": 2.126315789473684, "step": 808, "train_accuracy": 0.84375 }, { "epoch": 2.1289473684210525, "grad_norm": 7.073615550994873, "learning_rate": 9.446135611452104e-06, "loss": 1.0371, "step": 809 }, { "epoch": 2.1289473684210525, "step": 809, "train_accuracy": 0.84375 }, { "epoch": 2.1315789473684212, "grad_norm": 4.111389636993408, "learning_rate": 9.42485621875092e-06, "loss": 1.1528, "step": 810 }, { "epoch": 2.1315789473684212, "step": 810, "train_accuracy": 0.6875 }, { "epoch": 2.1342105263157896, "grad_norm": 5.092117786407471, "learning_rate": 9.40357943869858e-06, "loss": 1.2002, "step": 811 }, { "epoch": 2.1342105263157896, "step": 811, "train_accuracy": 0.78125 }, { "epoch": 2.136842105263158, "grad_norm": 6.998875617980957, "learning_rate": 9.382305367947018e-06, "loss": 1.1992, "step": 812 }, { "epoch": 2.136842105263158, "step": 812, "train_accuracy": 0.78125 }, { "epoch": 2.139473684210526, "grad_norm": 4.185628890991211, "learning_rate": 9.361034103135847e-06, "loss": 0.8735, "step": 813 }, { "epoch": 2.139473684210526, "step": 813, "train_accuracy": 0.8125 }, { "epoch": 2.1421052631578945, "grad_norm": 5.1156325340271, "learning_rate": 9.339765740891946e-06, "loss": 1.0894, "step": 814 }, { "epoch": 2.1421052631578945, "step": 814, "train_accuracy": 0.875 }, { "epoch": 2.1447368421052633, "grad_norm": 7.7576189041137695, "learning_rate": 9.318500377828998e-06, "loss": 0.9214, "step": 815 }, { "epoch": 2.1447368421052633, "step": 815, "train_accuracy": 0.765625 }, { "epoch": 2.1473684210526316, "grad_norm": 4.9661126136779785, "learning_rate": 9.297238110547075e-06, "loss": 1.2227, "step": 816 }, { "epoch": 2.1473684210526316, "step": 816, "train_accuracy": 0.828125 }, { "epoch": 2.15, "grad_norm": 3.897723913192749, "learning_rate": 9.275979035632167e-06, "loss": 0.9333, "step": 817 }, { "epoch": 2.15, "step": 817, "train_accuracy": 0.828125 }, { "epoch": 2.1526315789473682, "grad_norm": 13.26174545288086, "learning_rate": 9.254723249655784e-06, "loss": 1.1143, "step": 818 }, { "epoch": 2.1526315789473682, "step": 818, "train_accuracy": 0.75 }, { "epoch": 2.155263157894737, "grad_norm": 7.426464557647705, "learning_rate": 9.233470849174484e-06, "loss": 1.2188, "step": 819 }, { "epoch": 2.155263157894737, "step": 819, "train_accuracy": 0.921875 }, { "epoch": 2.1578947368421053, "grad_norm": 5.145357608795166, "learning_rate": 9.212221930729442e-06, "loss": 1.075, "step": 820 }, { "epoch": 2.1578947368421053, "step": 820, "train_accuracy": 0.859375 }, { "epoch": 2.1605263157894736, "grad_norm": 13.33061695098877, "learning_rate": 9.190976590846028e-06, "loss": 1.3589, "step": 821 }, { "epoch": 2.1605263157894736, "step": 821, "train_accuracy": 0.75 }, { "epoch": 2.163157894736842, "grad_norm": 15.381673812866211, "learning_rate": 9.169734926033343e-06, "loss": 1.6045, "step": 822 }, { "epoch": 2.163157894736842, "step": 822, "train_accuracy": 0.8125 }, { "epoch": 2.1657894736842107, "grad_norm": 5.126620769500732, "learning_rate": 9.148497032783804e-06, "loss": 1.2603, "step": 823 }, { "epoch": 2.1657894736842107, "step": 823, "train_accuracy": 0.78125 }, { "epoch": 2.168421052631579, "grad_norm": 6.827272891998291, "learning_rate": 9.127263007572688e-06, "loss": 1.0527, "step": 824 }, { "epoch": 2.168421052631579, "step": 824, "train_accuracy": 0.78125 }, { "epoch": 2.1710526315789473, "grad_norm": 11.240569114685059, "learning_rate": 9.106032946857708e-06, "loss": 1.0391, "step": 825 }, { "epoch": 2.1710526315789473, "step": 825, "train_accuracy": 0.78125 }, { "epoch": 2.1736842105263157, "grad_norm": 5.50578498840332, "learning_rate": 9.084806947078558e-06, "loss": 0.9722, "step": 826 }, { "epoch": 2.1736842105263157, "step": 826, "train_accuracy": 0.796875 }, { "epoch": 2.1763157894736844, "grad_norm": 6.608829021453857, "learning_rate": 9.063585104656494e-06, "loss": 1.0566, "step": 827 }, { "epoch": 2.1763157894736844, "step": 827, "train_accuracy": 0.796875 }, { "epoch": 2.1789473684210527, "grad_norm": 7.2398176193237305, "learning_rate": 9.042367515993884e-06, "loss": 1.0808, "step": 828 }, { "epoch": 2.1789473684210527, "step": 828, "train_accuracy": 0.828125 }, { "epoch": 2.181578947368421, "grad_norm": 5.127984046936035, "learning_rate": 9.021154277473772e-06, "loss": 1.063, "step": 829 }, { "epoch": 2.181578947368421, "step": 829, "train_accuracy": 0.78125 }, { "epoch": 2.1842105263157894, "grad_norm": 5.692725658416748, "learning_rate": 8.999945485459439e-06, "loss": 1.0215, "step": 830 }, { "epoch": 2.1842105263157894, "step": 830, "train_accuracy": 0.8125 }, { "epoch": 2.1868421052631577, "grad_norm": 9.340150833129883, "learning_rate": 8.978741236293972e-06, "loss": 1.0005, "step": 831 }, { "epoch": 2.1868421052631577, "step": 831, "train_accuracy": 0.765625 }, { "epoch": 2.1894736842105265, "grad_norm": 16.168376922607422, "learning_rate": 8.957541626299821e-06, "loss": 1.3848, "step": 832 }, { "epoch": 2.1894736842105265, "step": 832, "train_accuracy": 0.859375 }, { "epoch": 2.192105263157895, "grad_norm": 12.331050872802734, "learning_rate": 8.936346751778358e-06, "loss": 1.1753, "step": 833 }, { "epoch": 2.192105263157895, "step": 833, "train_accuracy": 0.78125 }, { "epoch": 2.194736842105263, "grad_norm": 5.23305082321167, "learning_rate": 8.915156709009445e-06, "loss": 1.0239, "step": 834 }, { "epoch": 2.194736842105263, "step": 834, "train_accuracy": 0.796875 }, { "epoch": 2.1973684210526314, "grad_norm": 4.159104347229004, "learning_rate": 8.893971594250998e-06, "loss": 1.105, "step": 835 }, { "epoch": 2.1973684210526314, "step": 835, "train_accuracy": 0.765625 }, { "epoch": 2.2, "grad_norm": 9.729438781738281, "learning_rate": 8.872791503738543e-06, "loss": 1.5513, "step": 836 }, { "epoch": 2.2, "eval_accuracy": 0.7173070311546326, "eval_max_score": 8.125, "eval_min_score": -8.625, "eval_runtime": 151.7446, "eval_samples_per_second": 18.696, "eval_steps_per_second": 0.297, "step": 836 }, { "epoch": 2.2, "step": 836, "train_accuracy": 0.90625 }, { "epoch": 2.2026315789473685, "grad_norm": 6.211835861206055, "learning_rate": 8.851616533684788e-06, "loss": 0.9358, "step": 837 }, { "epoch": 2.2026315789473685, "step": 837, "train_accuracy": 0.78125 }, { "epoch": 2.205263157894737, "grad_norm": 4.620204448699951, "learning_rate": 8.830446780279175e-06, "loss": 1.1157, "step": 838 }, { "epoch": 2.205263157894737, "step": 838, "train_accuracy": 0.765625 }, { "epoch": 2.207894736842105, "grad_norm": 11.254515647888184, "learning_rate": 8.809282339687457e-06, "loss": 1.0879, "step": 839 }, { "epoch": 2.207894736842105, "step": 839, "train_accuracy": 0.8125 }, { "epoch": 2.2105263157894735, "grad_norm": 4.592057704925537, "learning_rate": 8.788123308051244e-06, "loss": 1.0962, "step": 840 }, { "epoch": 2.2105263157894735, "step": 840, "train_accuracy": 0.796875 }, { "epoch": 2.213157894736842, "grad_norm": 7.248822212219238, "learning_rate": 8.766969781487579e-06, "loss": 1.0967, "step": 841 }, { "epoch": 2.213157894736842, "step": 841, "train_accuracy": 0.796875 }, { "epoch": 2.2157894736842105, "grad_norm": 4.9665985107421875, "learning_rate": 8.7458218560885e-06, "loss": 1.0249, "step": 842 }, { "epoch": 2.2157894736842105, "step": 842, "train_accuracy": 0.84375 }, { "epoch": 2.218421052631579, "grad_norm": 7.975915431976318, "learning_rate": 8.724679627920595e-06, "loss": 1.2192, "step": 843 }, { "epoch": 2.218421052631579, "step": 843, "train_accuracy": 0.765625 }, { "epoch": 2.221052631578947, "grad_norm": 3.6533076763153076, "learning_rate": 8.703543193024578e-06, "loss": 1.0068, "step": 844 }, { "epoch": 2.221052631578947, "step": 844, "train_accuracy": 0.859375 }, { "epoch": 2.223684210526316, "grad_norm": 4.886638164520264, "learning_rate": 8.682412647414845e-06, "loss": 0.9976, "step": 845 }, { "epoch": 2.223684210526316, "step": 845, "train_accuracy": 0.84375 }, { "epoch": 2.2263157894736842, "grad_norm": 8.30583381652832, "learning_rate": 8.661288087079038e-06, "loss": 1.0015, "step": 846 }, { "epoch": 2.2263157894736842, "step": 846, "train_accuracy": 0.890625 }, { "epoch": 2.2289473684210526, "grad_norm": 16.96817970275879, "learning_rate": 8.640169607977606e-06, "loss": 1.2681, "step": 847 }, { "epoch": 2.2289473684210526, "step": 847, "train_accuracy": 0.84375 }, { "epoch": 2.231578947368421, "grad_norm": 8.309298515319824, "learning_rate": 8.619057306043388e-06, "loss": 1.1211, "step": 848 }, { "epoch": 2.231578947368421, "step": 848, "train_accuracy": 0.75 }, { "epoch": 2.2342105263157896, "grad_norm": 4.597687244415283, "learning_rate": 8.597951277181143e-06, "loss": 0.9634, "step": 849 }, { "epoch": 2.2342105263157896, "step": 849, "train_accuracy": 0.890625 }, { "epoch": 2.236842105263158, "grad_norm": 6.457274436950684, "learning_rate": 8.576851617267151e-06, "loss": 1.0137, "step": 850 }, { "epoch": 2.236842105263158, "step": 850, "train_accuracy": 0.796875 }, { "epoch": 2.2394736842105263, "grad_norm": 9.658507347106934, "learning_rate": 8.555758422148746e-06, "loss": 1.0164, "step": 851 }, { "epoch": 2.2394736842105263, "step": 851, "train_accuracy": 0.8125 }, { "epoch": 2.2421052631578946, "grad_norm": 12.8440580368042, "learning_rate": 8.534671787643909e-06, "loss": 1.1245, "step": 852 }, { "epoch": 2.2421052631578946, "step": 852, "train_accuracy": 0.765625 }, { "epoch": 2.2447368421052634, "grad_norm": 9.525819778442383, "learning_rate": 8.513591809540804e-06, "loss": 1.1011, "step": 853 }, { "epoch": 2.2447368421052634, "step": 853, "train_accuracy": 0.828125 }, { "epoch": 2.2473684210526317, "grad_norm": 10.477482795715332, "learning_rate": 8.492518583597374e-06, "loss": 1.0659, "step": 854 }, { "epoch": 2.2473684210526317, "step": 854, "train_accuracy": 0.828125 }, { "epoch": 2.25, "grad_norm": 5.5181708335876465, "learning_rate": 8.471452205540873e-06, "loss": 1.1631, "step": 855 }, { "epoch": 2.25, "step": 855, "train_accuracy": 0.84375 }, { "epoch": 2.2526315789473683, "grad_norm": 4.574328422546387, "learning_rate": 8.450392771067463e-06, "loss": 0.8623, "step": 856 }, { "epoch": 2.2526315789473683, "step": 856, "train_accuracy": 0.796875 }, { "epoch": 2.2552631578947366, "grad_norm": 16.64838981628418, "learning_rate": 8.429340375841753e-06, "loss": 1.3433, "step": 857 }, { "epoch": 2.2552631578947366, "step": 857, "train_accuracy": 0.796875 }, { "epoch": 2.2578947368421054, "grad_norm": 7.746898174285889, "learning_rate": 8.408295115496376e-06, "loss": 1.147, "step": 858 }, { "epoch": 2.2578947368421054, "step": 858, "train_accuracy": 0.734375 }, { "epoch": 2.2605263157894737, "grad_norm": 4.429878234863281, "learning_rate": 8.387257085631567e-06, "loss": 1.0591, "step": 859 }, { "epoch": 2.2605263157894737, "step": 859, "train_accuracy": 0.796875 }, { "epoch": 2.263157894736842, "grad_norm": 8.77043342590332, "learning_rate": 8.366226381814698e-06, "loss": 1.0776, "step": 860 }, { "epoch": 2.263157894736842, "step": 860, "train_accuracy": 0.765625 }, { "epoch": 2.2657894736842104, "grad_norm": 11.112512588500977, "learning_rate": 8.345203099579874e-06, "loss": 1.1587, "step": 861 }, { "epoch": 2.2657894736842104, "step": 861, "train_accuracy": 0.8125 }, { "epoch": 2.268421052631579, "grad_norm": 3.823777914047241, "learning_rate": 8.32418733442748e-06, "loss": 0.9067, "step": 862 }, { "epoch": 2.268421052631579, "step": 862, "train_accuracy": 0.75 }, { "epoch": 2.2710526315789474, "grad_norm": 5.121189117431641, "learning_rate": 8.30317918182376e-06, "loss": 1.3232, "step": 863 }, { "epoch": 2.2710526315789474, "step": 863, "train_accuracy": 0.796875 }, { "epoch": 2.2736842105263158, "grad_norm": 13.663050651550293, "learning_rate": 8.282178737200369e-06, "loss": 1.1323, "step": 864 }, { "epoch": 2.2736842105263158, "step": 864, "train_accuracy": 0.71875 }, { "epoch": 2.276315789473684, "grad_norm": 11.7109956741333, "learning_rate": 8.261186095953959e-06, "loss": 1.2798, "step": 865 }, { "epoch": 2.276315789473684, "step": 865, "train_accuracy": 0.734375 }, { "epoch": 2.2789473684210524, "grad_norm": 3.7948272228240967, "learning_rate": 8.240201353445721e-06, "loss": 1.0508, "step": 866 }, { "epoch": 2.2789473684210524, "step": 866, "train_accuracy": 0.765625 }, { "epoch": 2.281578947368421, "grad_norm": 4.340938568115234, "learning_rate": 8.219224605000979e-06, "loss": 1.1313, "step": 867 }, { "epoch": 2.281578947368421, "step": 867, "train_accuracy": 0.75 }, { "epoch": 2.2842105263157895, "grad_norm": 9.902191162109375, "learning_rate": 8.198255945908727e-06, "loss": 1.1665, "step": 868 }, { "epoch": 2.2842105263157895, "step": 868, "train_accuracy": 0.78125 }, { "epoch": 2.286842105263158, "grad_norm": 3.9820899963378906, "learning_rate": 8.177295471421232e-06, "loss": 1.2061, "step": 869 }, { "epoch": 2.286842105263158, "step": 869, "train_accuracy": 0.71875 }, { "epoch": 2.2894736842105265, "grad_norm": 11.108955383300781, "learning_rate": 8.156343276753563e-06, "loss": 1.2837, "step": 870 }, { "epoch": 2.2894736842105265, "step": 870, "train_accuracy": 0.796875 }, { "epoch": 2.292105263157895, "grad_norm": 10.159497261047363, "learning_rate": 8.13539945708319e-06, "loss": 1.0798, "step": 871 }, { "epoch": 2.292105263157895, "step": 871, "train_accuracy": 0.90625 }, { "epoch": 2.294736842105263, "grad_norm": 5.310483932495117, "learning_rate": 8.114464107549532e-06, "loss": 0.9238, "step": 872 }, { "epoch": 2.294736842105263, "step": 872, "train_accuracy": 0.84375 }, { "epoch": 2.2973684210526315, "grad_norm": 3.798041582107544, "learning_rate": 8.09353732325353e-06, "loss": 1.0635, "step": 873 }, { "epoch": 2.2973684210526315, "step": 873, "train_accuracy": 0.859375 }, { "epoch": 2.3, "grad_norm": 7.455261707305908, "learning_rate": 8.072619199257232e-06, "loss": 1.1836, "step": 874 }, { "epoch": 2.3, "step": 874, "train_accuracy": 0.765625 }, { "epoch": 2.3026315789473686, "grad_norm": 5.471553325653076, "learning_rate": 8.05170983058332e-06, "loss": 1.064, "step": 875 }, { "epoch": 2.3026315789473686, "step": 875, "train_accuracy": 0.765625 }, { "epoch": 2.305263157894737, "grad_norm": 4.1305694580078125, "learning_rate": 8.030809312214726e-06, "loss": 1.1958, "step": 876 }, { "epoch": 2.305263157894737, "step": 876, "train_accuracy": 0.75 }, { "epoch": 2.307894736842105, "grad_norm": 5.602186679840088, "learning_rate": 8.009917739094164e-06, "loss": 1.1851, "step": 877 }, { "epoch": 2.307894736842105, "step": 877, "train_accuracy": 0.6875 }, { "epoch": 2.3105263157894735, "grad_norm": 6.608561038970947, "learning_rate": 7.98903520612373e-06, "loss": 1.2207, "step": 878 }, { "epoch": 2.3105263157894735, "step": 878, "train_accuracy": 0.796875 }, { "epoch": 2.3131578947368423, "grad_norm": 7.152756214141846, "learning_rate": 7.968161808164431e-06, "loss": 1.1006, "step": 879 }, { "epoch": 2.3131578947368423, "step": 879, "train_accuracy": 0.75 }, { "epoch": 2.3157894736842106, "grad_norm": 4.266940593719482, "learning_rate": 7.9472976400358e-06, "loss": 1.2261, "step": 880 }, { "epoch": 2.3157894736842106, "step": 880, "train_accuracy": 0.734375 }, { "epoch": 2.318421052631579, "grad_norm": 11.65020751953125, "learning_rate": 7.926442796515429e-06, "loss": 1.501, "step": 881 }, { "epoch": 2.318421052631579, "step": 881, "train_accuracy": 0.84375 }, { "epoch": 2.3210526315789473, "grad_norm": 4.304981708526611, "learning_rate": 7.905597372338558e-06, "loss": 1.0825, "step": 882 }, { "epoch": 2.3210526315789473, "step": 882, "train_accuracy": 0.765625 }, { "epoch": 2.3236842105263156, "grad_norm": 4.352926731109619, "learning_rate": 7.88476146219763e-06, "loss": 1.0073, "step": 883 }, { "epoch": 2.3236842105263156, "step": 883, "train_accuracy": 0.828125 }, { "epoch": 2.3263157894736843, "grad_norm": 5.145933628082275, "learning_rate": 7.863935160741886e-06, "loss": 1.2544, "step": 884 }, { "epoch": 2.3263157894736843, "step": 884, "train_accuracy": 0.859375 }, { "epoch": 2.3289473684210527, "grad_norm": 4.757798194885254, "learning_rate": 7.843118562576899e-06, "loss": 0.916, "step": 885 }, { "epoch": 2.3289473684210527, "step": 885, "train_accuracy": 0.765625 }, { "epoch": 2.331578947368421, "grad_norm": 5.980506896972656, "learning_rate": 7.822311762264182e-06, "loss": 1.3086, "step": 886 }, { "epoch": 2.331578947368421, "step": 886, "train_accuracy": 0.765625 }, { "epoch": 2.3342105263157893, "grad_norm": 4.120883941650391, "learning_rate": 7.801514854320724e-06, "loss": 1.1172, "step": 887 }, { "epoch": 2.3342105263157893, "step": 887, "train_accuracy": 0.734375 }, { "epoch": 2.336842105263158, "grad_norm": 3.8416996002197266, "learning_rate": 7.780727933218589e-06, "loss": 1.0269, "step": 888 }, { "epoch": 2.336842105263158, "step": 888, "train_accuracy": 0.765625 }, { "epoch": 2.3394736842105264, "grad_norm": 4.1616597175598145, "learning_rate": 7.759951093384467e-06, "loss": 1.3921, "step": 889 }, { "epoch": 2.3394736842105264, "step": 889, "train_accuracy": 0.765625 }, { "epoch": 2.3421052631578947, "grad_norm": 7.139702796936035, "learning_rate": 7.739184429199262e-06, "loss": 1.0664, "step": 890 }, { "epoch": 2.3421052631578947, "step": 890, "train_accuracy": 0.828125 }, { "epoch": 2.344736842105263, "grad_norm": 3.5956788063049316, "learning_rate": 7.71842803499764e-06, "loss": 1.0742, "step": 891 }, { "epoch": 2.344736842105263, "step": 891, "train_accuracy": 0.84375 }, { "epoch": 2.3473684210526318, "grad_norm": 5.877930164337158, "learning_rate": 7.697682005067627e-06, "loss": 1.0059, "step": 892 }, { "epoch": 2.3473684210526318, "step": 892, "train_accuracy": 0.8125 }, { "epoch": 2.35, "grad_norm": 8.738607406616211, "learning_rate": 7.67694643365017e-06, "loss": 0.9917, "step": 893 }, { "epoch": 2.35, "step": 893, "train_accuracy": 0.796875 }, { "epoch": 2.3526315789473684, "grad_norm": 5.848970413208008, "learning_rate": 7.65622141493869e-06, "loss": 1.3345, "step": 894 }, { "epoch": 2.3526315789473684, "step": 894, "train_accuracy": 0.765625 }, { "epoch": 2.3552631578947367, "grad_norm": 8.351785659790039, "learning_rate": 7.635507043078692e-06, "loss": 1.1919, "step": 895 }, { "epoch": 2.3552631578947367, "step": 895, "train_accuracy": 0.75 }, { "epoch": 2.3578947368421055, "grad_norm": 7.921352386474609, "learning_rate": 7.614803412167299e-06, "loss": 1.0029, "step": 896 }, { "epoch": 2.3578947368421055, "step": 896, "train_accuracy": 0.8125 }, { "epoch": 2.360526315789474, "grad_norm": 4.196382522583008, "learning_rate": 7.594110616252859e-06, "loss": 0.9062, "step": 897 }, { "epoch": 2.360526315789474, "step": 897, "train_accuracy": 0.75 }, { "epoch": 2.363157894736842, "grad_norm": 5.209059238433838, "learning_rate": 7.573428749334482e-06, "loss": 1.1943, "step": 898 }, { "epoch": 2.363157894736842, "step": 898, "train_accuracy": 0.875 }, { "epoch": 2.3657894736842104, "grad_norm": 3.660780191421509, "learning_rate": 7.552757905361652e-06, "loss": 0.8054, "step": 899 }, { "epoch": 2.3657894736842104, "step": 899, "train_accuracy": 0.75 }, { "epoch": 2.3684210526315788, "grad_norm": 6.490506172180176, "learning_rate": 7.532098178233761e-06, "loss": 1.1636, "step": 900 }, { "epoch": 2.3684210526315788, "step": 900, "train_accuracy": 0.84375 }, { "epoch": 2.3710526315789475, "grad_norm": 4.385256767272949, "learning_rate": 7.5114496617997205e-06, "loss": 1.1226, "step": 901 }, { "epoch": 2.3710526315789475, "step": 901, "train_accuracy": 0.890625 }, { "epoch": 2.373684210526316, "grad_norm": 5.040467262268066, "learning_rate": 7.4908124498574964e-06, "loss": 0.855, "step": 902 }, { "epoch": 2.373684210526316, "step": 902, "train_accuracy": 0.859375 }, { "epoch": 2.376315789473684, "grad_norm": 8.077067375183105, "learning_rate": 7.470186636153722e-06, "loss": 1.1592, "step": 903 }, { "epoch": 2.376315789473684, "step": 903, "train_accuracy": 0.734375 }, { "epoch": 2.3789473684210525, "grad_norm": 4.04118537902832, "learning_rate": 7.449572314383237e-06, "loss": 1.1375, "step": 904 }, { "epoch": 2.3789473684210525, "step": 904, "train_accuracy": 0.671875 }, { "epoch": 2.3815789473684212, "grad_norm": 6.724638938903809, "learning_rate": 7.428969578188692e-06, "loss": 1.1167, "step": 905 }, { "epoch": 2.3815789473684212, "step": 905, "train_accuracy": 0.765625 }, { "epoch": 2.3842105263157896, "grad_norm": 8.281147003173828, "learning_rate": 7.408378521160091e-06, "loss": 1.064, "step": 906 }, { "epoch": 2.3842105263157896, "step": 906, "train_accuracy": 0.84375 }, { "epoch": 2.386842105263158, "grad_norm": 5.431487083435059, "learning_rate": 7.387799236834408e-06, "loss": 0.96, "step": 907 }, { "epoch": 2.386842105263158, "step": 907, "train_accuracy": 0.75 }, { "epoch": 2.389473684210526, "grad_norm": 5.093214511871338, "learning_rate": 7.367231818695113e-06, "loss": 1.0693, "step": 908 }, { "epoch": 2.389473684210526, "step": 908, "train_accuracy": 0.78125 }, { "epoch": 2.3921052631578945, "grad_norm": 7.53461217880249, "learning_rate": 7.346676360171792e-06, "loss": 1.4072, "step": 909 }, { "epoch": 2.3921052631578945, "step": 909, "train_accuracy": 0.703125 }, { "epoch": 2.3947368421052633, "grad_norm": 7.927123069763184, "learning_rate": 7.326132954639699e-06, "loss": 0.9409, "step": 910 }, { "epoch": 2.3947368421052633, "step": 910, "train_accuracy": 0.796875 }, { "epoch": 2.3973684210526316, "grad_norm": 4.111649513244629, "learning_rate": 7.3056016954193235e-06, "loss": 1.1978, "step": 911 }, { "epoch": 2.3973684210526316, "step": 911, "train_accuracy": 0.875 }, { "epoch": 2.4, "grad_norm": 8.101927757263184, "learning_rate": 7.285082675775998e-06, "loss": 0.9675, "step": 912 }, { "epoch": 2.4, "eval_accuracy": 0.709199845790863, "eval_max_score": 7.4375, "eval_min_score": -9.75, "eval_runtime": 151.321, "eval_samples_per_second": 18.748, "eval_steps_per_second": 0.297, "step": 912 }, { "epoch": 2.4, "step": 912, "train_accuracy": 0.78125 }, { "epoch": 2.4026315789473682, "grad_norm": 6.553269386291504, "learning_rate": 7.26457598891944e-06, "loss": 1.0269, "step": 913 }, { "epoch": 2.4026315789473682, "step": 913, "train_accuracy": 0.703125 }, { "epoch": 2.405263157894737, "grad_norm": 11.197577476501465, "learning_rate": 7.2440817280033555e-06, "loss": 1.3179, "step": 914 }, { "epoch": 2.405263157894737, "step": 914, "train_accuracy": 0.78125 }, { "epoch": 2.4078947368421053, "grad_norm": 12.725153923034668, "learning_rate": 7.223599986124994e-06, "loss": 1.3213, "step": 915 }, { "epoch": 2.4078947368421053, "step": 915, "train_accuracy": 0.875 }, { "epoch": 2.4105263157894736, "grad_norm": 3.8353846073150635, "learning_rate": 7.20313085632475e-06, "loss": 0.9346, "step": 916 }, { "epoch": 2.4105263157894736, "step": 916, "train_accuracy": 0.859375 }, { "epoch": 2.413157894736842, "grad_norm": 4.217707633972168, "learning_rate": 7.182674431585703e-06, "loss": 1.0293, "step": 917 }, { "epoch": 2.413157894736842, "step": 917, "train_accuracy": 0.828125 }, { "epoch": 2.4157894736842107, "grad_norm": 3.886138916015625, "learning_rate": 7.162230804833249e-06, "loss": 1.249, "step": 918 }, { "epoch": 2.4157894736842107, "step": 918, "train_accuracy": 0.734375 }, { "epoch": 2.418421052631579, "grad_norm": 4.176194190979004, "learning_rate": 7.14180006893462e-06, "loss": 1.1792, "step": 919 }, { "epoch": 2.418421052631579, "step": 919, "train_accuracy": 0.796875 }, { "epoch": 2.4210526315789473, "grad_norm": 4.922031402587891, "learning_rate": 7.121382316698511e-06, "loss": 1.1738, "step": 920 }, { "epoch": 2.4210526315789473, "step": 920, "train_accuracy": 0.828125 }, { "epoch": 2.4236842105263157, "grad_norm": 10.639328002929688, "learning_rate": 7.1009776408746205e-06, "loss": 1.0972, "step": 921 }, { "epoch": 2.4236842105263157, "step": 921, "train_accuracy": 0.875 }, { "epoch": 2.4263157894736844, "grad_norm": 4.55307149887085, "learning_rate": 7.08058613415326e-06, "loss": 1.1309, "step": 922 }, { "epoch": 2.4263157894736844, "step": 922, "train_accuracy": 0.78125 }, { "epoch": 2.4289473684210527, "grad_norm": 3.423119306564331, "learning_rate": 7.060207889164909e-06, "loss": 1.1821, "step": 923 }, { "epoch": 2.4289473684210527, "step": 923, "train_accuracy": 0.75 }, { "epoch": 2.431578947368421, "grad_norm": 3.477151870727539, "learning_rate": 7.03984299847981e-06, "loss": 1.1514, "step": 924 }, { "epoch": 2.431578947368421, "step": 924, "train_accuracy": 0.875 }, { "epoch": 2.4342105263157894, "grad_norm": 3.4678921699523926, "learning_rate": 7.01949155460754e-06, "loss": 0.8901, "step": 925 }, { "epoch": 2.4342105263157894, "step": 925, "train_accuracy": 0.796875 }, { "epoch": 2.4368421052631577, "grad_norm": 3.4634525775909424, "learning_rate": 6.999153649996595e-06, "loss": 1.0435, "step": 926 }, { "epoch": 2.4368421052631577, "step": 926, "train_accuracy": 0.8125 }, { "epoch": 2.4394736842105265, "grad_norm": 8.086597442626953, "learning_rate": 6.978829377033962e-06, "loss": 1.0676, "step": 927 }, { "epoch": 2.4394736842105265, "step": 927, "train_accuracy": 0.765625 }, { "epoch": 2.442105263157895, "grad_norm": 6.657654762268066, "learning_rate": 6.9585188280447094e-06, "loss": 1.2402, "step": 928 }, { "epoch": 2.442105263157895, "step": 928, "train_accuracy": 0.734375 }, { "epoch": 2.444736842105263, "grad_norm": 7.919809818267822, "learning_rate": 6.938222095291565e-06, "loss": 1.1038, "step": 929 }, { "epoch": 2.444736842105263, "step": 929, "train_accuracy": 0.8125 }, { "epoch": 2.4473684210526314, "grad_norm": 5.626051425933838, "learning_rate": 6.917939270974485e-06, "loss": 1.0713, "step": 930 }, { "epoch": 2.4473684210526314, "step": 930, "train_accuracy": 0.765625 }, { "epoch": 2.45, "grad_norm": 3.9721519947052, "learning_rate": 6.897670447230263e-06, "loss": 1.041, "step": 931 }, { "epoch": 2.45, "step": 931, "train_accuracy": 0.78125 }, { "epoch": 2.4526315789473685, "grad_norm": 5.174097061157227, "learning_rate": 6.87741571613207e-06, "loss": 1.2671, "step": 932 }, { "epoch": 2.4526315789473685, "step": 932, "train_accuracy": 0.859375 }, { "epoch": 2.455263157894737, "grad_norm": 8.099953651428223, "learning_rate": 6.8571751696890835e-06, "loss": 1.0227, "step": 933 }, { "epoch": 2.455263157894737, "step": 933, "train_accuracy": 0.84375 }, { "epoch": 2.457894736842105, "grad_norm": 11.488319396972656, "learning_rate": 6.836948899846024e-06, "loss": 1.146, "step": 934 }, { "epoch": 2.457894736842105, "step": 934, "train_accuracy": 0.78125 }, { "epoch": 2.4605263157894735, "grad_norm": 8.036605834960938, "learning_rate": 6.816736998482778e-06, "loss": 1.0474, "step": 935 }, { "epoch": 2.4605263157894735, "step": 935, "train_accuracy": 0.859375 }, { "epoch": 2.463157894736842, "grad_norm": 5.297451019287109, "learning_rate": 6.796539557413951e-06, "loss": 1.1514, "step": 936 }, { "epoch": 2.463157894736842, "step": 936, "train_accuracy": 0.8125 }, { "epoch": 2.4657894736842105, "grad_norm": 5.096290111541748, "learning_rate": 6.776356668388464e-06, "loss": 1.0708, "step": 937 }, { "epoch": 2.4657894736842105, "step": 937, "train_accuracy": 0.734375 }, { "epoch": 2.468421052631579, "grad_norm": 4.746647834777832, "learning_rate": 6.756188423089131e-06, "loss": 1.1162, "step": 938 }, { "epoch": 2.468421052631579, "step": 938, "train_accuracy": 0.828125 }, { "epoch": 2.4710526315789476, "grad_norm": 4.6044158935546875, "learning_rate": 6.736034913132253e-06, "loss": 0.9243, "step": 939 }, { "epoch": 2.4710526315789476, "step": 939, "train_accuracy": 0.875 }, { "epoch": 2.473684210526316, "grad_norm": 5.2404866218566895, "learning_rate": 6.715896230067183e-06, "loss": 0.9922, "step": 940 }, { "epoch": 2.473684210526316, "step": 940, "train_accuracy": 0.859375 }, { "epoch": 2.4763157894736842, "grad_norm": 4.998322010040283, "learning_rate": 6.695772465375929e-06, "loss": 1.0859, "step": 941 }, { "epoch": 2.4763157894736842, "step": 941, "train_accuracy": 0.84375 }, { "epoch": 2.4789473684210526, "grad_norm": 6.812887668609619, "learning_rate": 6.675663710472733e-06, "loss": 0.8792, "step": 942 }, { "epoch": 2.4789473684210526, "step": 942, "train_accuracy": 0.78125 }, { "epoch": 2.481578947368421, "grad_norm": 10.387646675109863, "learning_rate": 6.655570056703646e-06, "loss": 1.3091, "step": 943 }, { "epoch": 2.481578947368421, "step": 943, "train_accuracy": 0.75 }, { "epoch": 2.4842105263157896, "grad_norm": 5.717840194702148, "learning_rate": 6.635491595346122e-06, "loss": 1.2109, "step": 944 }, { "epoch": 2.4842105263157896, "step": 944, "train_accuracy": 0.75 }, { "epoch": 2.486842105263158, "grad_norm": 8.672900199890137, "learning_rate": 6.615428417608611e-06, "loss": 1.2695, "step": 945 }, { "epoch": 2.486842105263158, "step": 945, "train_accuracy": 0.796875 }, { "epoch": 2.4894736842105263, "grad_norm": 4.839511871337891, "learning_rate": 6.5953806146301245e-06, "loss": 1.0244, "step": 946 }, { "epoch": 2.4894736842105263, "step": 946, "train_accuracy": 0.75 }, { "epoch": 2.4921052631578946, "grad_norm": 4.422204971313477, "learning_rate": 6.575348277479838e-06, "loss": 1.0557, "step": 947 }, { "epoch": 2.4921052631578946, "step": 947, "train_accuracy": 0.828125 }, { "epoch": 2.4947368421052634, "grad_norm": 11.49475383758545, "learning_rate": 6.555331497156671e-06, "loss": 1.1226, "step": 948 }, { "epoch": 2.4947368421052634, "step": 948, "train_accuracy": 0.875 }, { "epoch": 2.4973684210526317, "grad_norm": 6.345265865325928, "learning_rate": 6.535330364588875e-06, "loss": 1.0649, "step": 949 }, { "epoch": 2.4973684210526317, "step": 949, "train_accuracy": 0.734375 }, { "epoch": 2.5, "grad_norm": 7.858110427856445, "learning_rate": 6.515344970633617e-06, "loss": 1.3809, "step": 950 }, { "epoch": 2.5, "step": 950, "train_accuracy": 0.84375 }, { "epoch": 2.5026315789473683, "grad_norm": 9.39013957977295, "learning_rate": 6.495375406076574e-06, "loss": 0.9399, "step": 951 }, { "epoch": 2.5026315789473683, "step": 951, "train_accuracy": 0.71875 }, { "epoch": 2.5052631578947366, "grad_norm": 4.707727432250977, "learning_rate": 6.4754217616315125e-06, "loss": 1.1304, "step": 952 }, { "epoch": 2.5052631578947366, "step": 952, "train_accuracy": 0.8125 }, { "epoch": 2.5078947368421054, "grad_norm": 3.284626007080078, "learning_rate": 6.455484127939885e-06, "loss": 0.9111, "step": 953 }, { "epoch": 2.5078947368421054, "step": 953, "train_accuracy": 0.859375 }, { "epoch": 2.5105263157894737, "grad_norm": 11.840646743774414, "learning_rate": 6.4355625955704096e-06, "loss": 1.1851, "step": 954 }, { "epoch": 2.5105263157894737, "step": 954, "train_accuracy": 0.765625 }, { "epoch": 2.513157894736842, "grad_norm": 6.086373329162598, "learning_rate": 6.415657255018662e-06, "loss": 1.3726, "step": 955 }, { "epoch": 2.513157894736842, "step": 955, "train_accuracy": 0.703125 }, { "epoch": 2.515789473684211, "grad_norm": 10.698202133178711, "learning_rate": 6.3957681967066695e-06, "loss": 1.5376, "step": 956 }, { "epoch": 2.515789473684211, "step": 956, "train_accuracy": 0.8125 }, { "epoch": 2.518421052631579, "grad_norm": 3.7446768283843994, "learning_rate": 6.375895510982491e-06, "loss": 0.9751, "step": 957 }, { "epoch": 2.518421052631579, "step": 957, "train_accuracy": 0.921875 }, { "epoch": 2.5210526315789474, "grad_norm": 4.004354953765869, "learning_rate": 6.356039288119815e-06, "loss": 0.9243, "step": 958 }, { "epoch": 2.5210526315789474, "step": 958, "train_accuracy": 0.765625 }, { "epoch": 2.5236842105263158, "grad_norm": 4.783187389373779, "learning_rate": 6.336199618317538e-06, "loss": 1.0728, "step": 959 }, { "epoch": 2.5236842105263158, "step": 959, "train_accuracy": 0.84375 }, { "epoch": 2.526315789473684, "grad_norm": 4.596762180328369, "learning_rate": 6.316376591699378e-06, "loss": 0.9634, "step": 960 }, { "epoch": 2.526315789473684, "step": 960, "train_accuracy": 0.828125 }, { "epoch": 2.5289473684210524, "grad_norm": 5.435537815093994, "learning_rate": 6.2965702983134314e-06, "loss": 1.0088, "step": 961 }, { "epoch": 2.5289473684210524, "step": 961, "train_accuracy": 0.8125 }, { "epoch": 2.531578947368421, "grad_norm": 6.328050136566162, "learning_rate": 6.276780828131798e-06, "loss": 1.0698, "step": 962 }, { "epoch": 2.531578947368421, "step": 962, "train_accuracy": 0.8125 }, { "epoch": 2.5342105263157895, "grad_norm": 5.2449798583984375, "learning_rate": 6.257008271050141e-06, "loss": 0.9053, "step": 963 }, { "epoch": 2.5342105263157895, "step": 963, "train_accuracy": 0.703125 }, { "epoch": 2.536842105263158, "grad_norm": 4.180750370025635, "learning_rate": 6.237252716887307e-06, "loss": 1.2451, "step": 964 }, { "epoch": 2.536842105263158, "step": 964, "train_accuracy": 0.78125 }, { "epoch": 2.5394736842105265, "grad_norm": 7.252860069274902, "learning_rate": 6.217514255384907e-06, "loss": 1.1162, "step": 965 }, { "epoch": 2.5394736842105265, "step": 965, "train_accuracy": 0.765625 }, { "epoch": 2.542105263157895, "grad_norm": 7.941829681396484, "learning_rate": 6.197792976206887e-06, "loss": 1.0415, "step": 966 }, { "epoch": 2.542105263157895, "step": 966, "train_accuracy": 0.71875 }, { "epoch": 2.544736842105263, "grad_norm": 5.3874359130859375, "learning_rate": 6.178088968939166e-06, "loss": 1.0444, "step": 967 }, { "epoch": 2.544736842105263, "step": 967, "train_accuracy": 0.8125 }, { "epoch": 2.5473684210526315, "grad_norm": 3.811061382293701, "learning_rate": 6.158402323089184e-06, "loss": 1.0552, "step": 968 }, { "epoch": 2.5473684210526315, "step": 968, "train_accuracy": 0.828125 }, { "epoch": 2.55, "grad_norm": 4.76802921295166, "learning_rate": 6.138733128085529e-06, "loss": 0.958, "step": 969 }, { "epoch": 2.55, "step": 969, "train_accuracy": 0.90625 }, { "epoch": 2.5526315789473686, "grad_norm": 5.948366165161133, "learning_rate": 6.119081473277502e-06, "loss": 0.9478, "step": 970 }, { "epoch": 2.5526315789473686, "step": 970, "train_accuracy": 0.875 }, { "epoch": 2.555263157894737, "grad_norm": 8.939352989196777, "learning_rate": 6.0994474479347435e-06, "loss": 1.0188, "step": 971 }, { "epoch": 2.555263157894737, "step": 971, "train_accuracy": 0.796875 }, { "epoch": 2.557894736842105, "grad_norm": 4.726570129394531, "learning_rate": 6.079831141246792e-06, "loss": 1.0972, "step": 972 }, { "epoch": 2.557894736842105, "step": 972, "train_accuracy": 0.703125 }, { "epoch": 2.5605263157894735, "grad_norm": 6.62232780456543, "learning_rate": 6.060232642322717e-06, "loss": 1.1201, "step": 973 }, { "epoch": 2.5605263157894735, "step": 973, "train_accuracy": 0.90625 }, { "epoch": 2.5631578947368423, "grad_norm": 6.835370063781738, "learning_rate": 6.040652040190672e-06, "loss": 1.063, "step": 974 }, { "epoch": 2.5631578947368423, "step": 974, "train_accuracy": 0.75 }, { "epoch": 2.5657894736842106, "grad_norm": 9.736258506774902, "learning_rate": 6.021089423797535e-06, "loss": 1.2314, "step": 975 }, { "epoch": 2.5657894736842106, "step": 975, "train_accuracy": 0.8125 }, { "epoch": 2.568421052631579, "grad_norm": 4.367083549499512, "learning_rate": 6.001544882008461e-06, "loss": 1.0264, "step": 976 }, { "epoch": 2.568421052631579, "step": 976, "train_accuracy": 0.828125 }, { "epoch": 2.5710526315789473, "grad_norm": 6.833847522735596, "learning_rate": 5.982018503606519e-06, "loss": 1.5303, "step": 977 }, { "epoch": 2.5710526315789473, "step": 977, "train_accuracy": 0.84375 }, { "epoch": 2.5736842105263156, "grad_norm": 4.34575080871582, "learning_rate": 5.962510377292252e-06, "loss": 1.0857, "step": 978 }, { "epoch": 2.5736842105263156, "step": 978, "train_accuracy": 0.734375 }, { "epoch": 2.5763157894736843, "grad_norm": 4.48021125793457, "learning_rate": 5.943020591683306e-06, "loss": 1.2573, "step": 979 }, { "epoch": 2.5763157894736843, "step": 979, "train_accuracy": 0.78125 }, { "epoch": 2.5789473684210527, "grad_norm": 7.216438293457031, "learning_rate": 5.923549235313997e-06, "loss": 1.3081, "step": 980 }, { "epoch": 2.5789473684210527, "step": 980, "train_accuracy": 0.875 }, { "epoch": 2.581578947368421, "grad_norm": 5.685892105102539, "learning_rate": 5.904096396634935e-06, "loss": 0.9727, "step": 981 }, { "epoch": 2.581578947368421, "step": 981, "train_accuracy": 0.8125 }, { "epoch": 2.5842105263157897, "grad_norm": 3.4921514987945557, "learning_rate": 5.884662164012616e-06, "loss": 1.0112, "step": 982 }, { "epoch": 2.5842105263157897, "step": 982, "train_accuracy": 0.828125 }, { "epoch": 2.586842105263158, "grad_norm": 3.5595004558563232, "learning_rate": 5.8652466257289974e-06, "loss": 0.9966, "step": 983 }, { "epoch": 2.586842105263158, "step": 983, "train_accuracy": 0.734375 }, { "epoch": 2.5894736842105264, "grad_norm": 10.218847274780273, "learning_rate": 5.845849869981137e-06, "loss": 1.2466, "step": 984 }, { "epoch": 2.5894736842105264, "step": 984, "train_accuracy": 0.765625 }, { "epoch": 2.5921052631578947, "grad_norm": 6.644645690917969, "learning_rate": 5.826471984880754e-06, "loss": 0.9751, "step": 985 }, { "epoch": 2.5921052631578947, "step": 985, "train_accuracy": 0.78125 }, { "epoch": 2.594736842105263, "grad_norm": 7.35904598236084, "learning_rate": 5.807113058453862e-06, "loss": 1.2002, "step": 986 }, { "epoch": 2.594736842105263, "step": 986, "train_accuracy": 0.703125 }, { "epoch": 2.5973684210526313, "grad_norm": 5.091441631317139, "learning_rate": 5.7877731786403304e-06, "loss": 1.1816, "step": 987 }, { "epoch": 2.5973684210526313, "step": 987, "train_accuracy": 0.734375 }, { "epoch": 2.6, "grad_norm": 10.243165969848633, "learning_rate": 5.768452433293532e-06, "loss": 1.1782, "step": 988 }, { "epoch": 2.6, "eval_accuracy": 0.7113147974014282, "eval_max_score": 5.84375, "eval_min_score": -10.25, "eval_runtime": 151.3594, "eval_samples_per_second": 18.743, "eval_steps_per_second": 0.297, "step": 988 }, { "epoch": 2.6, "step": 988, "train_accuracy": 0.765625 }, { "epoch": 2.6026315789473684, "grad_norm": 3.4999477863311768, "learning_rate": 5.7491509101799055e-06, "loss": 1.0269, "step": 989 }, { "epoch": 2.6026315789473684, "step": 989, "train_accuracy": 0.765625 }, { "epoch": 2.6052631578947367, "grad_norm": 3.9146370887756348, "learning_rate": 5.729868696978574e-06, "loss": 1.1133, "step": 990 }, { "epoch": 2.6052631578947367, "step": 990, "train_accuracy": 0.8125 }, { "epoch": 2.6078947368421055, "grad_norm": 7.038024425506592, "learning_rate": 5.710605881280939e-06, "loss": 1.1978, "step": 991 }, { "epoch": 2.6078947368421055, "step": 991, "train_accuracy": 0.71875 }, { "epoch": 2.610526315789474, "grad_norm": 4.451397895812988, "learning_rate": 5.6913625505902966e-06, "loss": 1.0557, "step": 992 }, { "epoch": 2.610526315789474, "step": 992, "train_accuracy": 0.796875 }, { "epoch": 2.613157894736842, "grad_norm": 4.300213813781738, "learning_rate": 5.6721387923214215e-06, "loss": 1.23, "step": 993 }, { "epoch": 2.613157894736842, "step": 993, "train_accuracy": 0.71875 }, { "epoch": 2.6157894736842104, "grad_norm": 3.7951157093048096, "learning_rate": 5.65293469380018e-06, "loss": 1.144, "step": 994 }, { "epoch": 2.6157894736842104, "step": 994, "train_accuracy": 0.71875 }, { "epoch": 2.6184210526315788, "grad_norm": 4.785224437713623, "learning_rate": 5.633750342263136e-06, "loss": 1.3882, "step": 995 }, { "epoch": 2.6184210526315788, "step": 995, "train_accuracy": 0.796875 }, { "epoch": 2.6210526315789475, "grad_norm": 15.803621292114258, "learning_rate": 5.614585824857148e-06, "loss": 1.2593, "step": 996 }, { "epoch": 2.6210526315789475, "step": 996, "train_accuracy": 0.8125 }, { "epoch": 2.623684210526316, "grad_norm": 4.082406997680664, "learning_rate": 5.595441228638976e-06, "loss": 0.9116, "step": 997 }, { "epoch": 2.623684210526316, "step": 997, "train_accuracy": 0.84375 }, { "epoch": 2.626315789473684, "grad_norm": 9.289669036865234, "learning_rate": 5.576316640574886e-06, "loss": 1.2944, "step": 998 }, { "epoch": 2.626315789473684, "step": 998, "train_accuracy": 0.796875 }, { "epoch": 2.6289473684210525, "grad_norm": 6.445914268493652, "learning_rate": 5.557212147540254e-06, "loss": 1.1318, "step": 999 }, { "epoch": 2.6289473684210525, "step": 999, "train_accuracy": 0.796875 }, { "epoch": 2.6315789473684212, "grad_norm": 11.213533401489258, "learning_rate": 5.538127836319176e-06, "loss": 1.1318, "step": 1000 }, { "epoch": 2.6315789473684212, "step": 1000, "train_accuracy": 0.8125 }, { "epoch": 2.6342105263157896, "grad_norm": 15.603263854980469, "learning_rate": 5.519063793604067e-06, "loss": 1.2778, "step": 1001 }, { "epoch": 2.6342105263157896, "step": 1001, "train_accuracy": 0.84375 }, { "epoch": 2.636842105263158, "grad_norm": 10.475622177124023, "learning_rate": 5.50002010599527e-06, "loss": 1.126, "step": 1002 }, { "epoch": 2.636842105263158, "step": 1002, "train_accuracy": 0.8125 }, { "epoch": 2.639473684210526, "grad_norm": 5.916659832000732, "learning_rate": 5.480996860000664e-06, "loss": 1.2539, "step": 1003 }, { "epoch": 2.639473684210526, "step": 1003, "train_accuracy": 0.859375 }, { "epoch": 2.6421052631578945, "grad_norm": 6.055222511291504, "learning_rate": 5.461994142035269e-06, "loss": 1.0186, "step": 1004 }, { "epoch": 2.6421052631578945, "step": 1004, "train_accuracy": 0.828125 }, { "epoch": 2.6447368421052633, "grad_norm": 17.587617874145508, "learning_rate": 5.443012038420856e-06, "loss": 1.2661, "step": 1005 }, { "epoch": 2.6447368421052633, "step": 1005, "train_accuracy": 0.859375 }, { "epoch": 2.6473684210526316, "grad_norm": 4.647197246551514, "learning_rate": 5.424050635385552e-06, "loss": 1.0176, "step": 1006 }, { "epoch": 2.6473684210526316, "step": 1006, "train_accuracy": 0.78125 }, { "epoch": 2.65, "grad_norm": 3.68725323677063, "learning_rate": 5.405110019063449e-06, "loss": 0.9331, "step": 1007 }, { "epoch": 2.65, "step": 1007, "train_accuracy": 0.828125 }, { "epoch": 2.6526315789473687, "grad_norm": 5.881679534912109, "learning_rate": 5.3861902754942104e-06, "loss": 0.8833, "step": 1008 }, { "epoch": 2.6526315789473687, "step": 1008, "train_accuracy": 0.78125 }, { "epoch": 2.655263157894737, "grad_norm": 10.934441566467285, "learning_rate": 5.367291490622699e-06, "loss": 1.2524, "step": 1009 }, { "epoch": 2.655263157894737, "step": 1009, "train_accuracy": 0.796875 }, { "epoch": 2.6578947368421053, "grad_norm": 4.606790542602539, "learning_rate": 5.348413750298542e-06, "loss": 1.1841, "step": 1010 }, { "epoch": 2.6578947368421053, "step": 1010, "train_accuracy": 0.765625 }, { "epoch": 2.6605263157894736, "grad_norm": 9.060807228088379, "learning_rate": 5.329557140275802e-06, "loss": 1.4224, "step": 1011 }, { "epoch": 2.6605263157894736, "step": 1011, "train_accuracy": 0.75 }, { "epoch": 2.663157894736842, "grad_norm": 3.719575881958008, "learning_rate": 5.310721746212522e-06, "loss": 1.0898, "step": 1012 }, { "epoch": 2.663157894736842, "step": 1012, "train_accuracy": 0.828125 }, { "epoch": 2.6657894736842103, "grad_norm": 4.574690341949463, "learning_rate": 5.291907653670402e-06, "loss": 1.2852, "step": 1013 }, { "epoch": 2.6657894736842103, "step": 1013, "train_accuracy": 0.765625 }, { "epoch": 2.668421052631579, "grad_norm": 5.748828411102295, "learning_rate": 5.273114948114346e-06, "loss": 1.0542, "step": 1014 }, { "epoch": 2.668421052631579, "step": 1014, "train_accuracy": 0.84375 }, { "epoch": 2.6710526315789473, "grad_norm": 3.7456870079040527, "learning_rate": 5.254343714912139e-06, "loss": 1.1528, "step": 1015 }, { "epoch": 2.6710526315789473, "step": 1015, "train_accuracy": 0.671875 }, { "epoch": 2.6736842105263157, "grad_norm": 4.605350494384766, "learning_rate": 5.2355940393339914e-06, "loss": 1.3657, "step": 1016 }, { "epoch": 2.6736842105263157, "step": 1016, "train_accuracy": 0.84375 }, { "epoch": 2.6763157894736844, "grad_norm": 9.041370391845703, "learning_rate": 5.216866006552213e-06, "loss": 1.1028, "step": 1017 }, { "epoch": 2.6763157894736844, "step": 1017, "train_accuracy": 0.71875 }, { "epoch": 2.6789473684210527, "grad_norm": 4.867583274841309, "learning_rate": 5.198159701640784e-06, "loss": 1.1343, "step": 1018 }, { "epoch": 2.6789473684210527, "step": 1018, "train_accuracy": 0.78125 }, { "epoch": 2.681578947368421, "grad_norm": 8.326457977294922, "learning_rate": 5.179475209574991e-06, "loss": 1.0972, "step": 1019 }, { "epoch": 2.681578947368421, "step": 1019, "train_accuracy": 0.75 }, { "epoch": 2.6842105263157894, "grad_norm": 3.207512140274048, "learning_rate": 5.1608126152310286e-06, "loss": 1.0972, "step": 1020 }, { "epoch": 2.6842105263157894, "step": 1020, "train_accuracy": 0.828125 }, { "epoch": 2.6868421052631577, "grad_norm": 9.606292724609375, "learning_rate": 5.142172003385622e-06, "loss": 1.1665, "step": 1021 }, { "epoch": 2.6868421052631577, "step": 1021, "train_accuracy": 0.71875 }, { "epoch": 2.6894736842105265, "grad_norm": 8.760821342468262, "learning_rate": 5.123553458715635e-06, "loss": 1.2441, "step": 1022 }, { "epoch": 2.6894736842105265, "step": 1022, "train_accuracy": 0.765625 }, { "epoch": 2.692105263157895, "grad_norm": 4.255366802215576, "learning_rate": 5.104957065797696e-06, "loss": 1.0542, "step": 1023 }, { "epoch": 2.692105263157895, "step": 1023, "train_accuracy": 0.75 }, { "epoch": 2.694736842105263, "grad_norm": 3.35002064704895, "learning_rate": 5.086382909107797e-06, "loss": 0.8862, "step": 1024 }, { "epoch": 2.694736842105263, "step": 1024, "train_accuracy": 0.859375 }, { "epoch": 2.6973684210526314, "grad_norm": 7.932205677032471, "learning_rate": 5.067831073020928e-06, "loss": 1.1333, "step": 1025 }, { "epoch": 2.6973684210526314, "step": 1025, "train_accuracy": 0.796875 }, { "epoch": 2.7, "grad_norm": 8.935729026794434, "learning_rate": 5.049301641810682e-06, "loss": 1.0669, "step": 1026 }, { "epoch": 2.7, "step": 1026, "train_accuracy": 0.84375 }, { "epoch": 2.7026315789473685, "grad_norm": 4.217672348022461, "learning_rate": 5.030794699648875e-06, "loss": 1.1006, "step": 1027 }, { "epoch": 2.7026315789473685, "step": 1027, "train_accuracy": 0.765625 }, { "epoch": 2.705263157894737, "grad_norm": 17.088626861572266, "learning_rate": 5.012310330605167e-06, "loss": 1.3882, "step": 1028 }, { "epoch": 2.705263157894737, "step": 1028, "train_accuracy": 0.8125 }, { "epoch": 2.707894736842105, "grad_norm": 6.726995944976807, "learning_rate": 4.9938486186466736e-06, "loss": 1.0659, "step": 1029 }, { "epoch": 2.707894736842105, "step": 1029, "train_accuracy": 0.859375 }, { "epoch": 2.7105263157894735, "grad_norm": 3.387362241744995, "learning_rate": 4.975409647637591e-06, "loss": 0.9692, "step": 1030 }, { "epoch": 2.7105263157894735, "step": 1030, "train_accuracy": 0.84375 }, { "epoch": 2.713157894736842, "grad_norm": 5.338740825653076, "learning_rate": 4.9569935013388125e-06, "loss": 1.2466, "step": 1031 }, { "epoch": 2.713157894736842, "step": 1031, "train_accuracy": 0.796875 }, { "epoch": 2.7157894736842105, "grad_norm": 10.06348705291748, "learning_rate": 4.938600263407546e-06, "loss": 1.0942, "step": 1032 }, { "epoch": 2.7157894736842105, "step": 1032, "train_accuracy": 0.78125 }, { "epoch": 2.718421052631579, "grad_norm": 12.942997932434082, "learning_rate": 4.9202300173969364e-06, "loss": 1.2822, "step": 1033 }, { "epoch": 2.718421052631579, "step": 1033, "train_accuracy": 0.796875 }, { "epoch": 2.7210526315789476, "grad_norm": 7.932915687561035, "learning_rate": 4.901882846755687e-06, "loss": 1.0498, "step": 1034 }, { "epoch": 2.7210526315789476, "step": 1034, "train_accuracy": 0.6875 }, { "epoch": 2.723684210526316, "grad_norm": 7.6717352867126465, "learning_rate": 4.883558834827675e-06, "loss": 1.2739, "step": 1035 }, { "epoch": 2.723684210526316, "step": 1035, "train_accuracy": 0.765625 }, { "epoch": 2.7263157894736842, "grad_norm": 6.054356575012207, "learning_rate": 4.865258064851579e-06, "loss": 1.208, "step": 1036 }, { "epoch": 2.7263157894736842, "step": 1036, "train_accuracy": 0.796875 }, { "epoch": 2.7289473684210526, "grad_norm": 4.3364362716674805, "learning_rate": 4.846980619960509e-06, "loss": 1.1011, "step": 1037 }, { "epoch": 2.7289473684210526, "step": 1037, "train_accuracy": 0.765625 }, { "epoch": 2.731578947368421, "grad_norm": 3.8560869693756104, "learning_rate": 4.8287265831815924e-06, "loss": 1.0205, "step": 1038 }, { "epoch": 2.731578947368421, "step": 1038, "train_accuracy": 0.796875 }, { "epoch": 2.734210526315789, "grad_norm": 5.655264377593994, "learning_rate": 4.810496037435654e-06, "loss": 1.1133, "step": 1039 }, { "epoch": 2.734210526315789, "step": 1039, "train_accuracy": 0.890625 }, { "epoch": 2.736842105263158, "grad_norm": 4.983482360839844, "learning_rate": 4.792289065536783e-06, "loss": 0.8828, "step": 1040 }, { "epoch": 2.736842105263158, "step": 1040, "train_accuracy": 0.796875 }, { "epoch": 2.7394736842105263, "grad_norm": 3.325807809829712, "learning_rate": 4.774105750192001e-06, "loss": 1.0684, "step": 1041 }, { "epoch": 2.7394736842105263, "step": 1041, "train_accuracy": 0.75 }, { "epoch": 2.7421052631578946, "grad_norm": 4.7382893562316895, "learning_rate": 4.7559461740008475e-06, "loss": 1.2109, "step": 1042 }, { "epoch": 2.7421052631578946, "step": 1042, "train_accuracy": 0.75 }, { "epoch": 2.7447368421052634, "grad_norm": 5.645082950592041, "learning_rate": 4.7378104194550485e-06, "loss": 1.2959, "step": 1043 }, { "epoch": 2.7447368421052634, "step": 1043, "train_accuracy": 0.71875 }, { "epoch": 2.7473684210526317, "grad_norm": 4.51600456237793, "learning_rate": 4.719698568938092e-06, "loss": 1.25, "step": 1044 }, { "epoch": 2.7473684210526317, "step": 1044, "train_accuracy": 0.78125 }, { "epoch": 2.75, "grad_norm": 5.984616756439209, "learning_rate": 4.701610704724906e-06, "loss": 1.1694, "step": 1045 }, { "epoch": 2.75, "step": 1045, "train_accuracy": 0.9375 }, { "epoch": 2.7526315789473683, "grad_norm": 5.485406398773193, "learning_rate": 4.6835469089814304e-06, "loss": 0.8899, "step": 1046 }, { "epoch": 2.7526315789473683, "step": 1046, "train_accuracy": 0.796875 }, { "epoch": 2.7552631578947366, "grad_norm": 4.4941534996032715, "learning_rate": 4.665507263764299e-06, "loss": 1.2178, "step": 1047 }, { "epoch": 2.7552631578947366, "step": 1047, "train_accuracy": 0.859375 }, { "epoch": 2.7578947368421054, "grad_norm": 6.920727252960205, "learning_rate": 4.6474918510204145e-06, "loss": 1.0352, "step": 1048 }, { "epoch": 2.7578947368421054, "step": 1048, "train_accuracy": 0.796875 }, { "epoch": 2.7605263157894737, "grad_norm": 5.5470356941223145, "learning_rate": 4.629500752586625e-06, "loss": 1.0195, "step": 1049 }, { "epoch": 2.7605263157894737, "step": 1049, "train_accuracy": 0.75 }, { "epoch": 2.763157894736842, "grad_norm": 5.659350872039795, "learning_rate": 4.611534050189304e-06, "loss": 1.0259, "step": 1050 }, { "epoch": 2.763157894736842, "step": 1050, "train_accuracy": 0.828125 }, { "epoch": 2.765789473684211, "grad_norm": 8.783293724060059, "learning_rate": 4.593591825444028e-06, "loss": 1.1479, "step": 1051 }, { "epoch": 2.765789473684211, "step": 1051, "train_accuracy": 0.75 }, { "epoch": 2.768421052631579, "grad_norm": 3.798257827758789, "learning_rate": 4.575674159855156e-06, "loss": 1.0312, "step": 1052 }, { "epoch": 2.768421052631579, "step": 1052, "train_accuracy": 0.8125 }, { "epoch": 2.7710526315789474, "grad_norm": 5.9507155418396, "learning_rate": 4.557781134815509e-06, "loss": 0.9458, "step": 1053 }, { "epoch": 2.7710526315789474, "step": 1053, "train_accuracy": 0.796875 }, { "epoch": 2.7736842105263158, "grad_norm": 8.45976448059082, "learning_rate": 4.539912831605959e-06, "loss": 1.3091, "step": 1054 }, { "epoch": 2.7736842105263158, "step": 1054, "train_accuracy": 0.765625 }, { "epoch": 2.776315789473684, "grad_norm": 9.173246383666992, "learning_rate": 4.522069331395085e-06, "loss": 1.127, "step": 1055 }, { "epoch": 2.776315789473684, "step": 1055, "train_accuracy": 0.78125 }, { "epoch": 2.7789473684210524, "grad_norm": 5.204906940460205, "learning_rate": 4.504250715238791e-06, "loss": 1.0767, "step": 1056 }, { "epoch": 2.7789473684210524, "step": 1056, "train_accuracy": 0.75 }, { "epoch": 2.781578947368421, "grad_norm": 5.059564113616943, "learning_rate": 4.486457064079943e-06, "loss": 1.1562, "step": 1057 }, { "epoch": 2.781578947368421, "step": 1057, "train_accuracy": 0.734375 }, { "epoch": 2.7842105263157895, "grad_norm": 3.4430267810821533, "learning_rate": 4.468688458748006e-06, "loss": 1.0962, "step": 1058 }, { "epoch": 2.7842105263157895, "step": 1058, "train_accuracy": 0.75 }, { "epoch": 2.786842105263158, "grad_norm": 10.936015129089355, "learning_rate": 4.450944979958668e-06, "loss": 1.3589, "step": 1059 }, { "epoch": 2.786842105263158, "step": 1059, "train_accuracy": 0.8125 }, { "epoch": 2.7894736842105265, "grad_norm": 3.82942533493042, "learning_rate": 4.433226708313475e-06, "loss": 1.1372, "step": 1060 }, { "epoch": 2.7894736842105265, "step": 1060, "train_accuracy": 0.8125 }, { "epoch": 2.792105263157895, "grad_norm": 2.9924509525299072, "learning_rate": 4.415533724299471e-06, "loss": 0.9639, "step": 1061 }, { "epoch": 2.792105263157895, "step": 1061, "train_accuracy": 0.8125 }, { "epoch": 2.794736842105263, "grad_norm": 8.909473419189453, "learning_rate": 4.397866108288828e-06, "loss": 1.1021, "step": 1062 }, { "epoch": 2.794736842105263, "step": 1062, "train_accuracy": 0.890625 }, { "epoch": 2.7973684210526315, "grad_norm": 9.16897201538086, "learning_rate": 4.380223940538478e-06, "loss": 1.1138, "step": 1063 }, { "epoch": 2.7973684210526315, "step": 1063, "train_accuracy": 0.8125 }, { "epoch": 2.8, "grad_norm": 4.704495906829834, "learning_rate": 4.362607301189756e-06, "loss": 1.0015, "step": 1064 }, { "epoch": 2.8, "eval_accuracy": 0.7099048495292664, "eval_max_score": 7.96875, "eval_min_score": -8.875, "eval_runtime": 151.1898, "eval_samples_per_second": 18.764, "eval_steps_per_second": 0.298, "step": 1064 }, { "epoch": 2.8, "step": 1064, "train_accuracy": 0.703125 }, { "epoch": 2.8026315789473686, "grad_norm": 5.374510765075684, "learning_rate": 4.345016270268029e-06, "loss": 1.1904, "step": 1065 }, { "epoch": 2.8026315789473686, "step": 1065, "train_accuracy": 0.78125 }, { "epoch": 2.805263157894737, "grad_norm": 6.463529586791992, "learning_rate": 4.327450927682334e-06, "loss": 1.1279, "step": 1066 }, { "epoch": 2.805263157894737, "step": 1066, "train_accuracy": 0.875 }, { "epoch": 2.807894736842105, "grad_norm": 4.455600738525391, "learning_rate": 4.309911353225019e-06, "loss": 0.9233, "step": 1067 }, { "epoch": 2.807894736842105, "step": 1067, "train_accuracy": 0.84375 }, { "epoch": 2.8105263157894735, "grad_norm": 7.1986589431762695, "learning_rate": 4.2923976265713765e-06, "loss": 0.8882, "step": 1068 }, { "epoch": 2.8105263157894735, "step": 1068, "train_accuracy": 0.75 }, { "epoch": 2.8131578947368423, "grad_norm": 4.7391462326049805, "learning_rate": 4.274909827279283e-06, "loss": 1.2251, "step": 1069 }, { "epoch": 2.8131578947368423, "step": 1069, "train_accuracy": 0.796875 }, { "epoch": 2.8157894736842106, "grad_norm": 4.475101470947266, "learning_rate": 4.257448034788837e-06, "loss": 1.0859, "step": 1070 }, { "epoch": 2.8157894736842106, "step": 1070, "train_accuracy": 0.8125 }, { "epoch": 2.818421052631579, "grad_norm": 5.544163703918457, "learning_rate": 4.240012328421998e-06, "loss": 0.9385, "step": 1071 }, { "epoch": 2.818421052631579, "step": 1071, "train_accuracy": 0.875 }, { "epoch": 2.8210526315789473, "grad_norm": 3.5065155029296875, "learning_rate": 4.222602787382223e-06, "loss": 0.8945, "step": 1072 }, { "epoch": 2.8210526315789473, "step": 1072, "train_accuracy": 0.75 }, { "epoch": 2.8236842105263156, "grad_norm": 5.043632507324219, "learning_rate": 4.2052194907541255e-06, "loss": 1.311, "step": 1073 }, { "epoch": 2.8236842105263156, "step": 1073, "train_accuracy": 0.796875 }, { "epoch": 2.8263157894736843, "grad_norm": 3.5553743839263916, "learning_rate": 4.187862517503077e-06, "loss": 0.9697, "step": 1074 }, { "epoch": 2.8263157894736843, "step": 1074, "train_accuracy": 0.828125 }, { "epoch": 2.8289473684210527, "grad_norm": 3.987929344177246, "learning_rate": 4.1705319464749e-06, "loss": 1.0498, "step": 1075 }, { "epoch": 2.8289473684210527, "step": 1075, "train_accuracy": 0.78125 }, { "epoch": 2.831578947368421, "grad_norm": 5.27581262588501, "learning_rate": 4.153227856395452e-06, "loss": 1.2754, "step": 1076 }, { "epoch": 2.831578947368421, "step": 1076, "train_accuracy": 0.75 }, { "epoch": 2.8342105263157897, "grad_norm": 4.249098300933838, "learning_rate": 4.135950325870328e-06, "loss": 0.9639, "step": 1077 }, { "epoch": 2.8342105263157897, "step": 1077, "train_accuracy": 0.78125 }, { "epoch": 2.836842105263158, "grad_norm": 3.359182357788086, "learning_rate": 4.118699433384446e-06, "loss": 0.7979, "step": 1078 }, { "epoch": 2.836842105263158, "step": 1078, "train_accuracy": 0.875 }, { "epoch": 2.8394736842105264, "grad_norm": 5.7397589683532715, "learning_rate": 4.101475257301746e-06, "loss": 1.0815, "step": 1079 }, { "epoch": 2.8394736842105264, "step": 1079, "train_accuracy": 0.734375 }, { "epoch": 2.8421052631578947, "grad_norm": 3.884835720062256, "learning_rate": 4.084277875864776e-06, "loss": 1.0981, "step": 1080 }, { "epoch": 2.8421052631578947, "step": 1080, "train_accuracy": 0.796875 }, { "epoch": 2.844736842105263, "grad_norm": 5.433359622955322, "learning_rate": 4.067107367194397e-06, "loss": 1.2388, "step": 1081 }, { "epoch": 2.844736842105263, "step": 1081, "train_accuracy": 0.765625 }, { "epoch": 2.8473684210526313, "grad_norm": 6.485168933868408, "learning_rate": 4.049963809289368e-06, "loss": 1.3252, "step": 1082 }, { "epoch": 2.8473684210526313, "step": 1082, "train_accuracy": 0.8125 }, { "epoch": 2.85, "grad_norm": 3.8819828033447266, "learning_rate": 4.032847280026051e-06, "loss": 1.1392, "step": 1083 }, { "epoch": 2.85, "step": 1083, "train_accuracy": 0.71875 }, { "epoch": 2.8526315789473684, "grad_norm": 4.981328964233398, "learning_rate": 4.015757857157999e-06, "loss": 1.2705, "step": 1084 }, { "epoch": 2.8526315789473684, "step": 1084, "train_accuracy": 0.703125 }, { "epoch": 2.8552631578947367, "grad_norm": 7.025356769561768, "learning_rate": 3.998695618315655e-06, "loss": 1.02, "step": 1085 }, { "epoch": 2.8552631578947367, "step": 1085, "train_accuracy": 0.796875 }, { "epoch": 2.8578947368421055, "grad_norm": 6.157021522521973, "learning_rate": 3.9816606410059625e-06, "loss": 1.2275, "step": 1086 }, { "epoch": 2.8578947368421055, "step": 1086, "train_accuracy": 0.75 }, { "epoch": 2.860526315789474, "grad_norm": 4.282430648803711, "learning_rate": 3.964653002612031e-06, "loss": 0.9609, "step": 1087 }, { "epoch": 2.860526315789474, "step": 1087, "train_accuracy": 0.796875 }, { "epoch": 2.863157894736842, "grad_norm": 3.4297397136688232, "learning_rate": 3.94767278039278e-06, "loss": 1.103, "step": 1088 }, { "epoch": 2.863157894736842, "step": 1088, "train_accuracy": 0.8125 }, { "epoch": 2.8657894736842104, "grad_norm": 3.6751058101654053, "learning_rate": 3.930720051482585e-06, "loss": 0.9619, "step": 1089 }, { "epoch": 2.8657894736842104, "step": 1089, "train_accuracy": 0.765625 }, { "epoch": 2.8684210526315788, "grad_norm": 4.563840389251709, "learning_rate": 3.9137948928909374e-06, "loss": 1.3232, "step": 1090 }, { "epoch": 2.8684210526315788, "step": 1090, "train_accuracy": 0.78125 }, { "epoch": 2.8710526315789475, "grad_norm": 4.432309150695801, "learning_rate": 3.896897381502081e-06, "loss": 1.0767, "step": 1091 }, { "epoch": 2.8710526315789475, "step": 1091, "train_accuracy": 0.65625 }, { "epoch": 2.873684210526316, "grad_norm": 7.510792255401611, "learning_rate": 3.880027594074671e-06, "loss": 1.2344, "step": 1092 }, { "epoch": 2.873684210526316, "step": 1092, "train_accuracy": 0.8125 }, { "epoch": 2.876315789473684, "grad_norm": 3.3003294467926025, "learning_rate": 3.863185607241425e-06, "loss": 0.9473, "step": 1093 }, { "epoch": 2.876315789473684, "step": 1093, "train_accuracy": 0.78125 }, { "epoch": 2.8789473684210525, "grad_norm": 4.405003547668457, "learning_rate": 3.846371497508775e-06, "loss": 1.1001, "step": 1094 }, { "epoch": 2.8789473684210525, "step": 1094, "train_accuracy": 0.75 }, { "epoch": 2.8815789473684212, "grad_norm": 4.802317142486572, "learning_rate": 3.829585341256515e-06, "loss": 1.0273, "step": 1095 }, { "epoch": 2.8815789473684212, "step": 1095, "train_accuracy": 0.875 }, { "epoch": 2.8842105263157896, "grad_norm": 3.6775407791137695, "learning_rate": 3.812827214737459e-06, "loss": 0.9351, "step": 1096 }, { "epoch": 2.8842105263157896, "step": 1096, "train_accuracy": 0.75 }, { "epoch": 2.886842105263158, "grad_norm": 3.8660459518432617, "learning_rate": 3.796097194077093e-06, "loss": 1.1362, "step": 1097 }, { "epoch": 2.886842105263158, "step": 1097, "train_accuracy": 0.765625 }, { "epoch": 2.889473684210526, "grad_norm": 9.875020027160645, "learning_rate": 3.77939535527323e-06, "loss": 1.2524, "step": 1098 }, { "epoch": 2.889473684210526, "step": 1098, "train_accuracy": 0.71875 }, { "epoch": 2.8921052631578945, "grad_norm": 3.817460298538208, "learning_rate": 3.7627217741956625e-06, "loss": 1.0151, "step": 1099 }, { "epoch": 2.8921052631578945, "step": 1099, "train_accuracy": 0.6875 }, { "epoch": 2.8947368421052633, "grad_norm": 7.155078411102295, "learning_rate": 3.7460765265858213e-06, "loss": 1.1729, "step": 1100 }, { "epoch": 2.8947368421052633, "step": 1100, "train_accuracy": 0.796875 }, { "epoch": 2.8973684210526316, "grad_norm": 5.495885848999023, "learning_rate": 3.729459688056427e-06, "loss": 0.9888, "step": 1101 }, { "epoch": 2.8973684210526316, "step": 1101, "train_accuracy": 0.71875 }, { "epoch": 2.9, "grad_norm": 5.324273586273193, "learning_rate": 3.712871334091154e-06, "loss": 1.239, "step": 1102 }, { "epoch": 2.9, "step": 1102, "train_accuracy": 0.78125 }, { "epoch": 2.9026315789473687, "grad_norm": 4.5909905433654785, "learning_rate": 3.696311540044276e-06, "loss": 1.0537, "step": 1103 }, { "epoch": 2.9026315789473687, "step": 1103, "train_accuracy": 0.828125 }, { "epoch": 2.905263157894737, "grad_norm": 3.9828968048095703, "learning_rate": 3.6797803811403354e-06, "loss": 1.0264, "step": 1104 }, { "epoch": 2.905263157894737, "step": 1104, "train_accuracy": 0.859375 }, { "epoch": 2.9078947368421053, "grad_norm": 4.042853355407715, "learning_rate": 3.663277932473791e-06, "loss": 0.9209, "step": 1105 }, { "epoch": 2.9078947368421053, "step": 1105, "train_accuracy": 0.84375 }, { "epoch": 2.9105263157894736, "grad_norm": 6.130189418792725, "learning_rate": 3.646804269008697e-06, "loss": 1.2104, "step": 1106 }, { "epoch": 2.9105263157894736, "step": 1106, "train_accuracy": 0.84375 }, { "epoch": 2.913157894736842, "grad_norm": 6.112072944641113, "learning_rate": 3.630359465578324e-06, "loss": 1.2646, "step": 1107 }, { "epoch": 2.913157894736842, "step": 1107, "train_accuracy": 0.78125 }, { "epoch": 2.9157894736842103, "grad_norm": 8.988405227661133, "learning_rate": 3.613943596884865e-06, "loss": 1.1348, "step": 1108 }, { "epoch": 2.9157894736842103, "step": 1108, "train_accuracy": 0.65625 }, { "epoch": 2.918421052631579, "grad_norm": 7.373897552490234, "learning_rate": 3.597556737499064e-06, "loss": 1.3979, "step": 1109 }, { "epoch": 2.918421052631579, "step": 1109, "train_accuracy": 0.84375 }, { "epoch": 2.9210526315789473, "grad_norm": 4.439242839813232, "learning_rate": 3.5811989618598863e-06, "loss": 0.8975, "step": 1110 }, { "epoch": 2.9210526315789473, "step": 1110, "train_accuracy": 0.703125 }, { "epoch": 2.9236842105263157, "grad_norm": 8.199309349060059, "learning_rate": 3.564870344274185e-06, "loss": 1.2334, "step": 1111 }, { "epoch": 2.9236842105263157, "step": 1111, "train_accuracy": 0.8125 }, { "epoch": 2.9263157894736844, "grad_norm": 4.312633037567139, "learning_rate": 3.5485709589163576e-06, "loss": 1.2168, "step": 1112 }, { "epoch": 2.9263157894736844, "step": 1112, "train_accuracy": 0.828125 }, { "epoch": 2.9289473684210527, "grad_norm": 3.521815776824951, "learning_rate": 3.532300879828013e-06, "loss": 1.0327, "step": 1113 }, { "epoch": 2.9289473684210527, "step": 1113, "train_accuracy": 0.875 }, { "epoch": 2.931578947368421, "grad_norm": 5.077023983001709, "learning_rate": 3.516060180917632e-06, "loss": 1.041, "step": 1114 }, { "epoch": 2.931578947368421, "step": 1114, "train_accuracy": 0.8125 }, { "epoch": 2.9342105263157894, "grad_norm": 4.366433143615723, "learning_rate": 3.499848935960234e-06, "loss": 1.0889, "step": 1115 }, { "epoch": 2.9342105263157894, "step": 1115, "train_accuracy": 0.734375 }, { "epoch": 2.9368421052631577, "grad_norm": 5.330564498901367, "learning_rate": 3.483667218597039e-06, "loss": 1.4038, "step": 1116 }, { "epoch": 2.9368421052631577, "step": 1116, "train_accuracy": 0.828125 }, { "epoch": 2.9394736842105265, "grad_norm": 3.7983169555664062, "learning_rate": 3.467515102335136e-06, "loss": 0.8452, "step": 1117 }, { "epoch": 2.9394736842105265, "step": 1117, "train_accuracy": 0.765625 }, { "epoch": 2.942105263157895, "grad_norm": 4.798806667327881, "learning_rate": 3.4513926605471504e-06, "loss": 0.9983, "step": 1118 }, { "epoch": 2.942105263157895, "step": 1118, "train_accuracy": 0.765625 }, { "epoch": 2.944736842105263, "grad_norm": 3.3308746814727783, "learning_rate": 3.435299966470903e-06, "loss": 0.957, "step": 1119 }, { "epoch": 2.944736842105263, "step": 1119, "train_accuracy": 0.78125 }, { "epoch": 2.9473684210526314, "grad_norm": 5.4025044441223145, "learning_rate": 3.4192370932090892e-06, "loss": 1.1567, "step": 1120 }, { "epoch": 2.9473684210526314, "step": 1120, "train_accuracy": 0.8125 }, { "epoch": 2.95, "grad_norm": 12.746258735656738, "learning_rate": 3.4032041137289327e-06, "loss": 1.1982, "step": 1121 }, { "epoch": 2.95, "step": 1121, "train_accuracy": 0.765625 }, { "epoch": 2.9526315789473685, "grad_norm": 4.749011039733887, "learning_rate": 3.387201100861869e-06, "loss": 1.1455, "step": 1122 }, { "epoch": 2.9526315789473685, "step": 1122, "train_accuracy": 0.765625 }, { "epoch": 2.955263157894737, "grad_norm": 4.1148200035095215, "learning_rate": 3.371228127303203e-06, "loss": 1.1035, "step": 1123 }, { "epoch": 2.955263157894737, "step": 1123, "train_accuracy": 0.8125 }, { "epoch": 2.957894736842105, "grad_norm": 3.990781545639038, "learning_rate": 3.355285265611784e-06, "loss": 0.9458, "step": 1124 }, { "epoch": 2.957894736842105, "step": 1124, "train_accuracy": 0.796875 }, { "epoch": 2.9605263157894735, "grad_norm": 4.621012210845947, "learning_rate": 3.339372588209672e-06, "loss": 0.979, "step": 1125 }, { "epoch": 2.9605263157894735, "step": 1125, "train_accuracy": 0.828125 }, { "epoch": 2.963157894736842, "grad_norm": 4.63665771484375, "learning_rate": 3.323490167381823e-06, "loss": 1.2349, "step": 1126 }, { "epoch": 2.963157894736842, "step": 1126, "train_accuracy": 0.84375 }, { "epoch": 2.9657894736842105, "grad_norm": 3.3280749320983887, "learning_rate": 3.307638075275731e-06, "loss": 0.9976, "step": 1127 }, { "epoch": 2.9657894736842105, "step": 1127, "train_accuracy": 0.828125 }, { "epoch": 2.968421052631579, "grad_norm": 3.5311524868011475, "learning_rate": 3.2918163839011408e-06, "loss": 1.0435, "step": 1128 }, { "epoch": 2.968421052631579, "step": 1128, "train_accuracy": 0.828125 }, { "epoch": 2.9710526315789476, "grad_norm": 6.561064720153809, "learning_rate": 3.2760251651296737e-06, "loss": 1.1294, "step": 1129 }, { "epoch": 2.9710526315789476, "step": 1129, "train_accuracy": 0.734375 }, { "epoch": 2.973684210526316, "grad_norm": 3.985593318939209, "learning_rate": 3.2602644906945536e-06, "loss": 1.168, "step": 1130 }, { "epoch": 2.973684210526316, "step": 1130, "train_accuracy": 0.75 }, { "epoch": 2.9763157894736842, "grad_norm": 3.7214221954345703, "learning_rate": 3.244534432190225e-06, "loss": 1.1406, "step": 1131 }, { "epoch": 2.9763157894736842, "step": 1131, "train_accuracy": 0.84375 }, { "epoch": 2.9789473684210526, "grad_norm": 5.923731327056885, "learning_rate": 3.228835061072084e-06, "loss": 1.1328, "step": 1132 }, { "epoch": 2.9789473684210526, "step": 1132, "train_accuracy": 0.703125 }, { "epoch": 2.981578947368421, "grad_norm": 4.362364768981934, "learning_rate": 3.2131664486561022e-06, "loss": 1.0903, "step": 1133 }, { "epoch": 2.981578947368421, "step": 1133, "train_accuracy": 0.8125 }, { "epoch": 2.984210526315789, "grad_norm": 3.981119155883789, "learning_rate": 3.197528666118549e-06, "loss": 1.1641, "step": 1134 }, { "epoch": 2.984210526315789, "step": 1134, "train_accuracy": 0.765625 }, { "epoch": 2.986842105263158, "grad_norm": 7.54674768447876, "learning_rate": 3.1819217844956216e-06, "loss": 1.1636, "step": 1135 }, { "epoch": 2.986842105263158, "step": 1135, "train_accuracy": 0.8125 }, { "epoch": 2.9894736842105263, "grad_norm": 7.682521820068359, "learning_rate": 3.1663458746831734e-06, "loss": 1.1372, "step": 1136 }, { "epoch": 2.9894736842105263, "step": 1136, "train_accuracy": 0.828125 }, { "epoch": 2.9921052631578946, "grad_norm": 6.8332672119140625, "learning_rate": 3.1508010074363384e-06, "loss": 0.9878, "step": 1137 }, { "epoch": 2.9921052631578946, "step": 1137, "train_accuracy": 0.765625 }, { "epoch": 2.9947368421052634, "grad_norm": 10.780973434448242, "learning_rate": 3.1352872533692603e-06, "loss": 1.1616, "step": 1138 }, { "epoch": 2.9947368421052634, "step": 1138, "train_accuracy": 0.859375 }, { "epoch": 2.9973684210526317, "grad_norm": 6.914345741271973, "learning_rate": 3.119804682954728e-06, "loss": 1.1758, "step": 1139 }, { "epoch": 2.9973684210526317, "step": 1139, "train_accuracy": 0.796875 }, { "epoch": 3.0, "grad_norm": 28.889413833618164, "learning_rate": 3.1043533665238944e-06, "loss": 0.9897, "step": 1140 }, { "epoch": 3.0, "eval_accuracy": 0.7127246856689453, "eval_max_score": 6.8125, "eval_min_score": -8.25, "eval_runtime": 151.4052, "eval_samples_per_second": 18.738, "eval_steps_per_second": 0.297, "step": 1140 }, { "epoch": 3.0, "step": 1140, "train_accuracy": 0.890625 }, { "epoch": 3.0026315789473683, "grad_norm": 91.07470703125, "learning_rate": 3.0889333742659187e-06, "loss": 0.9707, "step": 1141 }, { "epoch": 3.0026315789473683, "step": 1141, "train_accuracy": 0.84375 }, { "epoch": 3.0052631578947366, "grad_norm": 6.031994819641113, "learning_rate": 3.0735447762276872e-06, "loss": 1.0366, "step": 1142 }, { "epoch": 3.0052631578947366, "step": 1142, "train_accuracy": 0.859375 }, { "epoch": 3.0078947368421054, "grad_norm": 4.031905651092529, "learning_rate": 3.0581876423134527e-06, "loss": 0.7954, "step": 1143 }, { "epoch": 3.0078947368421054, "step": 1143, "train_accuracy": 0.921875 }, { "epoch": 3.0105263157894737, "grad_norm": 7.002809047698975, "learning_rate": 3.042862042284559e-06, "loss": 0.7837, "step": 1144 }, { "epoch": 3.0105263157894737, "step": 1144, "train_accuracy": 0.828125 }, { "epoch": 3.013157894736842, "grad_norm": 4.162775039672852, "learning_rate": 3.027568045759094e-06, "loss": 1.0454, "step": 1145 }, { "epoch": 3.013157894736842, "step": 1145, "train_accuracy": 0.859375 }, { "epoch": 3.0157894736842104, "grad_norm": 6.396974086761475, "learning_rate": 3.0123057222115835e-06, "loss": 1.0654, "step": 1146 }, { "epoch": 3.0157894736842104, "step": 1146, "train_accuracy": 0.859375 }, { "epoch": 3.018421052631579, "grad_norm": 3.9046335220336914, "learning_rate": 2.9970751409726785e-06, "loss": 0.886, "step": 1147 }, { "epoch": 3.018421052631579, "step": 1147, "train_accuracy": 0.8125 }, { "epoch": 3.0210526315789474, "grad_norm": 7.014838218688965, "learning_rate": 2.981876371228836e-06, "loss": 1.019, "step": 1148 }, { "epoch": 3.0210526315789474, "step": 1148, "train_accuracy": 0.90625 }, { "epoch": 3.0236842105263158, "grad_norm": 6.357172012329102, "learning_rate": 2.9667094820220044e-06, "loss": 0.873, "step": 1149 }, { "epoch": 3.0236842105263158, "step": 1149, "train_accuracy": 0.859375 }, { "epoch": 3.026315789473684, "grad_norm": 3.373042583465576, "learning_rate": 2.951574542249315e-06, "loss": 0.958, "step": 1150 }, { "epoch": 3.026315789473684, "step": 1150, "train_accuracy": 0.875 }, { "epoch": 3.028947368421053, "grad_norm": 4.8992743492126465, "learning_rate": 2.936471620662763e-06, "loss": 0.8518, "step": 1151 }, { "epoch": 3.028947368421053, "step": 1151, "train_accuracy": 0.90625 }, { "epoch": 3.031578947368421, "grad_norm": 3.5850627422332764, "learning_rate": 2.9214007858688986e-06, "loss": 0.9675, "step": 1152 }, { "epoch": 3.031578947368421, "step": 1152, "train_accuracy": 0.828125 }, { "epoch": 3.0342105263157895, "grad_norm": 9.761622428894043, "learning_rate": 2.906362106328515e-06, "loss": 0.9331, "step": 1153 }, { "epoch": 3.0342105263157895, "step": 1153, "train_accuracy": 0.8125 }, { "epoch": 3.036842105263158, "grad_norm": 4.369013786315918, "learning_rate": 2.8913556503563356e-06, "loss": 1.0645, "step": 1154 }, { "epoch": 3.036842105263158, "step": 1154, "train_accuracy": 0.921875 }, { "epoch": 3.039473684210526, "grad_norm": 3.727263927459717, "learning_rate": 2.876381486120706e-06, "loss": 0.8364, "step": 1155 }, { "epoch": 3.039473684210526, "step": 1155, "train_accuracy": 0.859375 }, { "epoch": 3.042105263157895, "grad_norm": 3.8516135215759277, "learning_rate": 2.861439681643283e-06, "loss": 0.853, "step": 1156 }, { "epoch": 3.042105263157895, "step": 1156, "train_accuracy": 0.84375 }, { "epoch": 3.044736842105263, "grad_norm": 4.059918403625488, "learning_rate": 2.846530304798727e-06, "loss": 1.0015, "step": 1157 }, { "epoch": 3.044736842105263, "step": 1157, "train_accuracy": 0.890625 }, { "epoch": 3.0473684210526315, "grad_norm": 4.983440399169922, "learning_rate": 2.831653423314389e-06, "loss": 0.7947, "step": 1158 }, { "epoch": 3.0473684210526315, "step": 1158, "train_accuracy": 0.875 }, { "epoch": 3.05, "grad_norm": 7.605556011199951, "learning_rate": 2.816809104770012e-06, "loss": 0.9277, "step": 1159 }, { "epoch": 3.05, "step": 1159, "train_accuracy": 0.921875 }, { "epoch": 3.0526315789473686, "grad_norm": 3.867741584777832, "learning_rate": 2.8019974165974127e-06, "loss": 0.9189, "step": 1160 }, { "epoch": 3.0526315789473686, "step": 1160, "train_accuracy": 0.9375 }, { "epoch": 3.055263157894737, "grad_norm": 5.580239772796631, "learning_rate": 2.787218426080184e-06, "loss": 0.7227, "step": 1161 }, { "epoch": 3.055263157894737, "step": 1161, "train_accuracy": 0.921875 }, { "epoch": 3.057894736842105, "grad_norm": 4.5618672370910645, "learning_rate": 2.7724722003533945e-06, "loss": 0.9521, "step": 1162 }, { "epoch": 3.057894736842105, "step": 1162, "train_accuracy": 0.921875 }, { "epoch": 3.0605263157894735, "grad_norm": 7.543132305145264, "learning_rate": 2.7577588064032533e-06, "loss": 0.7461, "step": 1163 }, { "epoch": 3.0605263157894735, "step": 1163, "train_accuracy": 0.859375 }, { "epoch": 3.0631578947368423, "grad_norm": 4.477904319763184, "learning_rate": 2.7430783110668557e-06, "loss": 0.9336, "step": 1164 }, { "epoch": 3.0631578947368423, "step": 1164, "train_accuracy": 0.828125 }, { "epoch": 3.0657894736842106, "grad_norm": 4.662795543670654, "learning_rate": 2.7284307810318257e-06, "loss": 0.8987, "step": 1165 }, { "epoch": 3.0657894736842106, "step": 1165, "train_accuracy": 0.921875 }, { "epoch": 3.068421052631579, "grad_norm": 5.89164400100708, "learning_rate": 2.7138162828360628e-06, "loss": 0.7316, "step": 1166 }, { "epoch": 3.068421052631579, "step": 1166, "train_accuracy": 0.875 }, { "epoch": 3.0710526315789473, "grad_norm": 10.350142478942871, "learning_rate": 2.699234882867393e-06, "loss": 1.0649, "step": 1167 }, { "epoch": 3.0710526315789473, "step": 1167, "train_accuracy": 0.8125 }, { "epoch": 3.0736842105263156, "grad_norm": 4.684883117675781, "learning_rate": 2.6846866473633126e-06, "loss": 0.8745, "step": 1168 }, { "epoch": 3.0736842105263156, "step": 1168, "train_accuracy": 0.90625 }, { "epoch": 3.0763157894736843, "grad_norm": 6.489368915557861, "learning_rate": 2.6701716424106425e-06, "loss": 0.8135, "step": 1169 }, { "epoch": 3.0763157894736843, "step": 1169, "train_accuracy": 0.828125 }, { "epoch": 3.0789473684210527, "grad_norm": 5.5763702392578125, "learning_rate": 2.6556899339452757e-06, "loss": 0.8262, "step": 1170 }, { "epoch": 3.0789473684210527, "step": 1170, "train_accuracy": 0.828125 }, { "epoch": 3.081578947368421, "grad_norm": 4.210776329040527, "learning_rate": 2.641241587751824e-06, "loss": 0.8428, "step": 1171 }, { "epoch": 3.081578947368421, "step": 1171, "train_accuracy": 0.765625 }, { "epoch": 3.0842105263157893, "grad_norm": 5.931998252868652, "learning_rate": 2.626826669463377e-06, "loss": 0.9688, "step": 1172 }, { "epoch": 3.0842105263157893, "step": 1172, "train_accuracy": 0.9375 }, { "epoch": 3.086842105263158, "grad_norm": 4.297722816467285, "learning_rate": 2.6124452445611458e-06, "loss": 0.7803, "step": 1173 }, { "epoch": 3.086842105263158, "step": 1173, "train_accuracy": 0.921875 }, { "epoch": 3.0894736842105264, "grad_norm": 4.3388495445251465, "learning_rate": 2.5980973783742236e-06, "loss": 0.7305, "step": 1174 }, { "epoch": 3.0894736842105264, "step": 1174, "train_accuracy": 0.859375 }, { "epoch": 3.0921052631578947, "grad_norm": 6.159016132354736, "learning_rate": 2.583783136079231e-06, "loss": 0.8511, "step": 1175 }, { "epoch": 3.0921052631578947, "step": 1175, "train_accuracy": 0.78125 }, { "epoch": 3.094736842105263, "grad_norm": 7.548987865447998, "learning_rate": 2.5695025827000752e-06, "loss": 0.9966, "step": 1176 }, { "epoch": 3.094736842105263, "step": 1176, "train_accuracy": 0.90625 }, { "epoch": 3.0973684210526318, "grad_norm": 8.062982559204102, "learning_rate": 2.555255783107603e-06, "loss": 0.8833, "step": 1177 }, { "epoch": 3.0973684210526318, "step": 1177, "train_accuracy": 0.921875 }, { "epoch": 3.1, "grad_norm": 6.597980976104736, "learning_rate": 2.5410428020193568e-06, "loss": 0.7385, "step": 1178 }, { "epoch": 3.1, "step": 1178, "train_accuracy": 0.875 }, { "epoch": 3.1026315789473684, "grad_norm": 11.51430892944336, "learning_rate": 2.5268637039992296e-06, "loss": 1.0493, "step": 1179 }, { "epoch": 3.1026315789473684, "step": 1179, "train_accuracy": 0.859375 }, { "epoch": 3.1052631578947367, "grad_norm": 9.351119041442871, "learning_rate": 2.5127185534572173e-06, "loss": 0.938, "step": 1180 }, { "epoch": 3.1052631578947367, "step": 1180, "train_accuracy": 0.90625 }, { "epoch": 3.1078947368421055, "grad_norm": 7.316296577453613, "learning_rate": 2.4986074146490967e-06, "loss": 0.8232, "step": 1181 }, { "epoch": 3.1078947368421055, "step": 1181, "train_accuracy": 0.78125 }, { "epoch": 3.110526315789474, "grad_norm": 6.263866424560547, "learning_rate": 2.4845303516761442e-06, "loss": 0.9543, "step": 1182 }, { "epoch": 3.110526315789474, "step": 1182, "train_accuracy": 0.890625 }, { "epoch": 3.113157894736842, "grad_norm": 6.839787006378174, "learning_rate": 2.4704874284848425e-06, "loss": 0.9771, "step": 1183 }, { "epoch": 3.113157894736842, "step": 1183, "train_accuracy": 0.8125 }, { "epoch": 3.1157894736842104, "grad_norm": 6.001226902008057, "learning_rate": 2.456478708866591e-06, "loss": 0.8281, "step": 1184 }, { "epoch": 3.1157894736842104, "step": 1184, "train_accuracy": 0.84375 }, { "epoch": 3.1184210526315788, "grad_norm": 5.544653415679932, "learning_rate": 2.4425042564574186e-06, "loss": 0.7673, "step": 1185 }, { "epoch": 3.1184210526315788, "step": 1185, "train_accuracy": 0.8125 }, { "epoch": 3.1210526315789475, "grad_norm": 11.175527572631836, "learning_rate": 2.4285641347376887e-06, "loss": 0.8721, "step": 1186 }, { "epoch": 3.1210526315789475, "step": 1186, "train_accuracy": 0.765625 }, { "epoch": 3.123684210526316, "grad_norm": 6.080649375915527, "learning_rate": 2.4146584070318145e-06, "loss": 0.9204, "step": 1187 }, { "epoch": 3.123684210526316, "step": 1187, "train_accuracy": 0.875 }, { "epoch": 3.126315789473684, "grad_norm": 8.601665496826172, "learning_rate": 2.400787136507975e-06, "loss": 0.8398, "step": 1188 }, { "epoch": 3.126315789473684, "step": 1188, "train_accuracy": 0.84375 }, { "epoch": 3.1289473684210525, "grad_norm": 9.983627319335938, "learning_rate": 2.3869503861778176e-06, "loss": 0.8701, "step": 1189 }, { "epoch": 3.1289473684210525, "step": 1189, "train_accuracy": 0.90625 }, { "epoch": 3.1315789473684212, "grad_norm": 3.987053394317627, "learning_rate": 2.373148218896182e-06, "loss": 0.6858, "step": 1190 }, { "epoch": 3.1315789473684212, "step": 1190, "train_accuracy": 0.859375 }, { "epoch": 3.1342105263157896, "grad_norm": 6.030102729797363, "learning_rate": 2.35938069736081e-06, "loss": 0.8005, "step": 1191 }, { "epoch": 3.1342105263157896, "step": 1191, "train_accuracy": 0.84375 }, { "epoch": 3.136842105263158, "grad_norm": 4.159539222717285, "learning_rate": 2.3456478841120634e-06, "loss": 0.8113, "step": 1192 }, { "epoch": 3.136842105263158, "step": 1192, "train_accuracy": 0.84375 }, { "epoch": 3.139473684210526, "grad_norm": 5.042561054229736, "learning_rate": 2.331949841532636e-06, "loss": 0.8757, "step": 1193 }, { "epoch": 3.139473684210526, "step": 1193, "train_accuracy": 0.84375 }, { "epoch": 3.1421052631578945, "grad_norm": 12.693655967712402, "learning_rate": 2.318286631847272e-06, "loss": 0.9448, "step": 1194 }, { "epoch": 3.1421052631578945, "step": 1194, "train_accuracy": 0.859375 }, { "epoch": 3.1447368421052633, "grad_norm": 4.802595615386963, "learning_rate": 2.3046583171224835e-06, "loss": 0.8074, "step": 1195 }, { "epoch": 3.1447368421052633, "step": 1195, "train_accuracy": 0.796875 }, { "epoch": 3.1473684210526316, "grad_norm": 5.915050983428955, "learning_rate": 2.2910649592662724e-06, "loss": 1.0713, "step": 1196 }, { "epoch": 3.1473684210526316, "step": 1196, "train_accuracy": 0.890625 }, { "epoch": 3.15, "grad_norm": 4.1914472579956055, "learning_rate": 2.2775066200278383e-06, "loss": 0.7119, "step": 1197 }, { "epoch": 3.15, "step": 1197, "train_accuracy": 0.875 }, { "epoch": 3.1526315789473682, "grad_norm": 7.356982231140137, "learning_rate": 2.2639833609973182e-06, "loss": 0.8606, "step": 1198 }, { "epoch": 3.1526315789473682, "step": 1198, "train_accuracy": 0.96875 }, { "epoch": 3.155263157894737, "grad_norm": 4.53258752822876, "learning_rate": 2.250495243605475e-06, "loss": 0.6907, "step": 1199 }, { "epoch": 3.155263157894737, "step": 1199, "train_accuracy": 0.875 }, { "epoch": 3.1578947368421053, "grad_norm": 6.945924758911133, "learning_rate": 2.2370423291234543e-06, "loss": 0.7888, "step": 1200 }, { "epoch": 3.1578947368421053, "step": 1200, "train_accuracy": 0.875 }, { "epoch": 3.1605263157894736, "grad_norm": 8.83996868133545, "learning_rate": 2.2236246786624794e-06, "loss": 0.9097, "step": 1201 }, { "epoch": 3.1605263157894736, "step": 1201, "train_accuracy": 0.8125 }, { "epoch": 3.163157894736842, "grad_norm": 6.74652099609375, "learning_rate": 2.210242353173586e-06, "loss": 1.0459, "step": 1202 }, { "epoch": 3.163157894736842, "step": 1202, "train_accuracy": 0.90625 }, { "epoch": 3.1657894736842107, "grad_norm": 5.417960166931152, "learning_rate": 2.196895413447343e-06, "loss": 0.835, "step": 1203 }, { "epoch": 3.1657894736842107, "step": 1203, "train_accuracy": 0.890625 }, { "epoch": 3.168421052631579, "grad_norm": 5.827296733856201, "learning_rate": 2.1835839201135743e-06, "loss": 0.8953, "step": 1204 }, { "epoch": 3.168421052631579, "step": 1204, "train_accuracy": 0.828125 }, { "epoch": 3.1710526315789473, "grad_norm": 13.554121971130371, "learning_rate": 2.170307933641087e-06, "loss": 0.9985, "step": 1205 }, { "epoch": 3.1710526315789473, "step": 1205, "train_accuracy": 0.84375 }, { "epoch": 3.1736842105263157, "grad_norm": 6.409447193145752, "learning_rate": 2.157067514337392e-06, "loss": 1.0347, "step": 1206 }, { "epoch": 3.1736842105263157, "step": 1206, "train_accuracy": 0.890625 }, { "epoch": 3.1763157894736844, "grad_norm": 7.8958234786987305, "learning_rate": 2.143862722348434e-06, "loss": 0.9468, "step": 1207 }, { "epoch": 3.1763157894736844, "step": 1207, "train_accuracy": 0.796875 }, { "epoch": 3.1789473684210527, "grad_norm": 5.160824298858643, "learning_rate": 2.1306936176583206e-06, "loss": 0.8315, "step": 1208 }, { "epoch": 3.1789473684210527, "step": 1208, "train_accuracy": 0.859375 }, { "epoch": 3.181578947368421, "grad_norm": 6.878368377685547, "learning_rate": 2.117560260089039e-06, "loss": 1.0015, "step": 1209 }, { "epoch": 3.181578947368421, "step": 1209, "train_accuracy": 0.90625 }, { "epoch": 3.1842105263157894, "grad_norm": 6.8526930809021, "learning_rate": 2.1044627093001966e-06, "loss": 0.9253, "step": 1210 }, { "epoch": 3.1842105263157894, "step": 1210, "train_accuracy": 0.890625 }, { "epoch": 3.1868421052631577, "grad_norm": 6.191732883453369, "learning_rate": 2.091401024788745e-06, "loss": 0.8765, "step": 1211 }, { "epoch": 3.1868421052631577, "step": 1211, "train_accuracy": 0.875 }, { "epoch": 3.1894736842105265, "grad_norm": 7.804241180419922, "learning_rate": 2.078375265888707e-06, "loss": 0.9224, "step": 1212 }, { "epoch": 3.1894736842105265, "step": 1212, "train_accuracy": 0.84375 }, { "epoch": 3.192105263157895, "grad_norm": 5.588228225708008, "learning_rate": 2.0653854917709115e-06, "loss": 1.0356, "step": 1213 }, { "epoch": 3.192105263157895, "step": 1213, "train_accuracy": 0.875 }, { "epoch": 3.194736842105263, "grad_norm": 9.253522872924805, "learning_rate": 2.0524317614427225e-06, "loss": 1.0024, "step": 1214 }, { "epoch": 3.194736842105263, "step": 1214, "train_accuracy": 0.828125 }, { "epoch": 3.1973684210526314, "grad_norm": 4.661523818969727, "learning_rate": 2.039514133747771e-06, "loss": 0.6868, "step": 1215 }, { "epoch": 3.1973684210526314, "step": 1215, "train_accuracy": 0.84375 }, { "epoch": 3.2, "grad_norm": 6.566859245300293, "learning_rate": 2.0266326673656877e-06, "loss": 1.1104, "step": 1216 }, { "epoch": 3.2, "eval_accuracy": 0.7225942611694336, "eval_max_score": 10.875, "eval_min_score": -13.625, "eval_runtime": 151.3303, "eval_samples_per_second": 18.747, "eval_steps_per_second": 0.297, "step": 1216 } ], "logging_steps": 1.0, "max_steps": 1520, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 76, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.927371051866522e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }