{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2, "eval_steps": 500, "global_step": 80, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "Batch Mean": -1.4581298828125, "accuracy": 0.28125, "epoch": 0, "step": 0 }, { "Batch Mean": -1.4786376953125, "accuracy": 0.46875, "epoch": 0, "step": 0 }, { "Batch Mean": -1.486572265625, "accuracy": 0.5, "epoch": 0, "step": 0 }, { "Batch Mean": -1.439697265625, "accuracy": 0.625, "epoch": 0, "step": 0 }, { "epoch": 0.0025, "grad_norm": 2.7191572189331055, "learning_rate": 1.5000000000000002e-07, "loss": 0.6927, "step": 1 }, { "Batch Mean": -1.4107666015625, "accuracy": 0.4375, "epoch": 0.0025, "step": 1 }, { "Batch Mean": -1.4342041015625, "accuracy": 0.5, "epoch": 0.0025, "step": 1 }, { "Batch Mean": -1.45263671875, "accuracy": 0.5625, "epoch": 0.0025, "step": 1 }, { "Batch Mean": -1.4517822265625, "accuracy": 0.5625, "epoch": 0.0025, "step": 1 }, { "epoch": 0.005, "grad_norm": 3.204066038131714, "learning_rate": 3.0000000000000004e-07, "loss": 0.6964, "step": 2 }, { "Batch Mean": -1.4908447265625, "accuracy": 0.59375, "epoch": 0.005, "step": 2 }, { "Batch Mean": -1.425048828125, "accuracy": 0.4375, "epoch": 0.005, "step": 2 }, { "Batch Mean": -1.464111328125, "accuracy": 0.375, "epoch": 0.005, "step": 2 }, { "Batch Mean": -1.4324951171875, "accuracy": 0.59375, "epoch": 0.005, "step": 2 }, { "epoch": 0.0075, "grad_norm": 3.103353261947632, "learning_rate": 4.5e-07, "loss": 0.6991, "step": 3 }, { "Batch Mean": -1.494140625, "accuracy": 0.46875, "epoch": 0.0075, "step": 3 }, { "Batch Mean": -1.4178466796875, "accuracy": 0.625, "epoch": 0.0075, "step": 3 }, { "Batch Mean": -1.520751953125, "accuracy": 0.59375, "epoch": 0.0075, "step": 3 }, { "Batch Mean": -1.4844970703125, "accuracy": 0.5625, "epoch": 0.0075, "step": 3 }, { "epoch": 0.01, "grad_norm": 3.3672587871551514, "learning_rate": 6.000000000000001e-07, "loss": 0.6883, "step": 4 }, { "Batch Mean": -1.4312744140625, "accuracy": 0.4375, "epoch": 0.01, "step": 4 }, { "Batch Mean": -1.4820556640625, "accuracy": 0.5625, "epoch": 0.01, "step": 4 }, { "Batch Mean": -1.4405517578125, "accuracy": 0.5, "epoch": 0.01, "step": 4 }, { "Batch Mean": -1.4302978515625, "accuracy": 0.53125, "epoch": 0.01, "step": 4 }, { "epoch": 0.0125, "grad_norm": 3.158576011657715, "learning_rate": 7.5e-07, "loss": 0.7012, "step": 5 }, { "Batch Mean": -1.4569091796875, "accuracy": 0.40625, "epoch": 0.0125, "step": 5 }, { "Batch Mean": -1.46435546875, "accuracy": 0.5, "epoch": 0.0125, "step": 5 }, { "Batch Mean": -1.4354248046875, "accuracy": 0.5625, "epoch": 0.0125, "step": 5 }, { "Batch Mean": -1.47412109375, "accuracy": 0.40625, "epoch": 0.0125, "step": 5 }, { "epoch": 0.015, "grad_norm": 4.888192176818848, "learning_rate": 9e-07, "loss": 0.7118, "step": 6 }, { "Batch Mean": -1.4361572265625, "accuracy": 0.53125, "epoch": 0.015, "step": 6 }, { "Batch Mean": -1.4234619140625, "accuracy": 0.625, "epoch": 0.015, "step": 6 }, { "Batch Mean": -1.4453125, "accuracy": 0.375, "epoch": 0.015, "step": 6 }, { "Batch Mean": -1.44287109375, "accuracy": 0.5, "epoch": 0.015, "step": 6 }, { "epoch": 0.0175, "grad_norm": 3.654751777648926, "learning_rate": 1.05e-06, "loss": 0.6901, "step": 7 }, { "Batch Mean": -1.4200439453125, "accuracy": 0.53125, "epoch": 0.0175, "step": 7 }, { "Batch Mean": -1.406494140625, "accuracy": 0.4375, "epoch": 0.0175, "step": 7 }, { "Batch Mean": -1.4012451171875, "accuracy": 0.53125, "epoch": 0.0175, "step": 7 }, { "Batch Mean": -1.4122314453125, "accuracy": 0.4375, "epoch": 0.0175, "step": 7 }, { "epoch": 0.02, "grad_norm": 3.2707793712615967, "learning_rate": 1.2000000000000002e-06, "loss": 0.7026, "step": 8 }, { "Batch Mean": -1.400634765625, "accuracy": 0.5625, "epoch": 0.02, "step": 8 }, { "Batch Mean": -1.3936767578125, "accuracy": 0.46875, "epoch": 0.02, "step": 8 }, { "Batch Mean": -1.4110107421875, "accuracy": 0.34375, "epoch": 0.02, "step": 8 }, { "Batch Mean": -1.4215087890625, "accuracy": 0.625, "epoch": 0.02, "step": 8 }, { "epoch": 0.0225, "grad_norm": 3.053551197052002, "learning_rate": 1.35e-06, "loss": 0.6859, "step": 9 }, { "Batch Mean": -1.35302734375, "accuracy": 0.5625, "epoch": 0.0225, "step": 9 }, { "Batch Mean": -1.35003662109375, "accuracy": 0.40625, "epoch": 0.0225, "step": 9 }, { "Batch Mean": -1.39306640625, "accuracy": 0.5625, "epoch": 0.0225, "step": 9 }, { "Batch Mean": -1.3843994140625, "accuracy": 0.53125, "epoch": 0.0225, "step": 9 }, { "epoch": 0.025, "grad_norm": 2.9442760944366455, "learning_rate": 1.5e-06, "loss": 0.6853, "step": 10 }, { "Batch Mean": -1.31396484375, "accuracy": 0.46875, "epoch": 0.025, "step": 10 }, { "Batch Mean": -1.33154296875, "accuracy": 0.5625, "epoch": 0.025, "step": 10 }, { "Batch Mean": -1.3260498046875, "accuracy": 0.46875, "epoch": 0.025, "step": 10 }, { "Batch Mean": -1.3170166015625, "accuracy": 0.4375, "epoch": 0.025, "step": 10 }, { "epoch": 0.0275, "grad_norm": 2.729567050933838, "learning_rate": 1.65e-06, "loss": 0.6946, "step": 11 }, { "Batch Mean": -1.24346923828125, "accuracy": 0.4375, "epoch": 0.0275, "step": 11 }, { "Batch Mean": -1.239013671875, "accuracy": 0.59375, "epoch": 0.0275, "step": 11 }, { "Batch Mean": -1.3074951171875, "accuracy": 0.5, "epoch": 0.0275, "step": 11 }, { "Batch Mean": -1.24664306640625, "accuracy": 0.53125, "epoch": 0.0275, "step": 11 }, { "epoch": 0.03, "grad_norm": 2.8832643032073975, "learning_rate": 1.8e-06, "loss": 0.6869, "step": 12 }, { "Batch Mean": -1.2061767578125, "accuracy": 0.625, "epoch": 0.03, "step": 12 }, { "Batch Mean": -1.09735107421875, "accuracy": 0.5625, "epoch": 0.03, "step": 12 }, { "Batch Mean": -1.1669921875, "accuracy": 0.625, "epoch": 0.03, "step": 12 }, { "Batch Mean": -1.107421875, "accuracy": 0.53125, "epoch": 0.03, "step": 12 }, { "epoch": 0.0325, "grad_norm": 3.347060441970825, "learning_rate": 1.95e-06, "loss": 0.676, "step": 13 }, { "Batch Mean": -0.99713134765625, "accuracy": 0.53125, "epoch": 0.0325, "step": 13 }, { "Batch Mean": -0.992431640625, "accuracy": 0.6875, "epoch": 0.0325, "step": 13 }, { "Batch Mean": -1.08367919921875, "accuracy": 0.65625, "epoch": 0.0325, "step": 13 }, { "Batch Mean": -1.073486328125, "accuracy": 0.5625, "epoch": 0.0325, "step": 13 }, { "epoch": 0.035, "grad_norm": 3.0629279613494873, "learning_rate": 2.1e-06, "loss": 0.6446, "step": 14 }, { "Batch Mean": -1.027008056640625, "accuracy": 0.5625, "epoch": 0.035, "step": 14 }, { "Batch Mean": -1.04302978515625, "accuracy": 0.625, "epoch": 0.035, "step": 14 }, { "Batch Mean": -0.986724853515625, "accuracy": 0.71875, "epoch": 0.035, "step": 14 }, { "Batch Mean": -1.010406494140625, "accuracy": 0.65625, "epoch": 0.035, "step": 14 }, { "epoch": 0.0375, "grad_norm": 3.297088146209717, "learning_rate": 2.25e-06, "loss": 0.6466, "step": 15 }, { "Batch Mean": -0.945648193359375, "accuracy": 0.625, "epoch": 0.0375, "step": 15 }, { "Batch Mean": -0.90460205078125, "accuracy": 0.625, "epoch": 0.0375, "step": 15 }, { "Batch Mean": -0.9103546142578125, "accuracy": 0.625, "epoch": 0.0375, "step": 15 }, { "Batch Mean": -0.84765625, "accuracy": 0.8125, "epoch": 0.0375, "step": 15 }, { "epoch": 0.04, "grad_norm": 3.339815855026245, "learning_rate": 2.4000000000000003e-06, "loss": 0.6261, "step": 16 }, { "Batch Mean": -0.7341957092285156, "accuracy": 0.65625, "epoch": 0.04, "step": 16 }, { "Batch Mean": -0.6576881408691406, "accuracy": 0.8125, "epoch": 0.04, "step": 16 }, { "Batch Mean": -0.7573471069335938, "accuracy": 0.75, "epoch": 0.04, "step": 16 }, { "Batch Mean": -0.8988265991210938, "accuracy": 0.84375, "epoch": 0.04, "step": 16 }, { "epoch": 0.0425, "grad_norm": 4.010303974151611, "learning_rate": 2.55e-06, "loss": 0.6324, "step": 17 }, { "Batch Mean": -0.45727968215942383, "accuracy": 0.59375, "epoch": 0.0425, "step": 17 }, { "Batch Mean": -0.40456533432006836, "accuracy": 0.6875, "epoch": 0.0425, "step": 17 }, { "Batch Mean": -0.4847888946533203, "accuracy": 0.59375, "epoch": 0.0425, "step": 17 }, { "Batch Mean": -0.31931304931640625, "accuracy": 0.65625, "epoch": 0.0425, "step": 17 }, { "epoch": 0.045, "grad_norm": 4.431520462036133, "learning_rate": 2.7e-06, "loss": 0.62, "step": 18 }, { "Batch Mean": -0.0693979263305664, "accuracy": 0.8125, "epoch": 0.045, "step": 18 }, { "Batch Mean": -0.23062896728515625, "accuracy": 0.625, "epoch": 0.045, "step": 18 }, { "Batch Mean": -0.10647201538085938, "accuracy": 0.65625, "epoch": 0.045, "step": 18 }, { "Batch Mean": -0.07384902238845825, "accuracy": 0.6875, "epoch": 0.045, "step": 18 }, { "epoch": 0.0475, "grad_norm": 5.421309947967529, "learning_rate": 2.85e-06, "loss": 0.5896, "step": 19 }, { "Batch Mean": -0.12799835205078125, "accuracy": 0.6875, "epoch": 0.0475, "step": 19 }, { "Batch Mean": 0.0684967041015625, "accuracy": 0.875, "epoch": 0.0475, "step": 19 }, { "Batch Mean": -0.014011383056640625, "accuracy": 0.65625, "epoch": 0.0475, "step": 19 }, { "Batch Mean": 0.0633087158203125, "accuracy": 0.84375, "epoch": 0.0475, "step": 19 }, { "epoch": 0.05, "grad_norm": 5.102872848510742, "learning_rate": 3e-06, "loss": 0.4931, "step": 20 }, { "Batch Mean": 0.19290733337402344, "accuracy": 0.8125, "epoch": 0.05, "step": 20 }, { "Batch Mean": 0.29687976837158203, "accuracy": 0.53125, "epoch": 0.05, "step": 20 }, { "Batch Mean": 0.103363037109375, "accuracy": 0.625, "epoch": 0.05, "step": 20 }, { "Batch Mean": 0.3869609832763672, "accuracy": 0.71875, "epoch": 0.05, "step": 20 }, { "epoch": 0.0525, "grad_norm": 7.569705486297607, "learning_rate": 2.992105263157895e-06, "loss": 0.5976, "step": 21 }, { "Batch Mean": 0.47769927978515625, "accuracy": 0.71875, "epoch": 0.0525, "step": 21 }, { "Batch Mean": 0.5898284912109375, "accuracy": 0.875, "epoch": 0.0525, "step": 21 }, { "Batch Mean": 0.3037242889404297, "accuracy": 0.75, "epoch": 0.0525, "step": 21 }, { "Batch Mean": -0.037357330322265625, "accuracy": 0.75, "epoch": 0.0525, "step": 21 }, { "epoch": 0.055, "grad_norm": 7.273393630981445, "learning_rate": 2.9842105263157896e-06, "loss": 0.4982, "step": 22 }, { "Batch Mean": 0.78411865234375, "accuracy": 0.59375, "epoch": 0.055, "step": 22 }, { "Batch Mean": 0.3693389892578125, "accuracy": 0.625, "epoch": 0.055, "step": 22 }, { "Batch Mean": 0.3277778625488281, "accuracy": 0.59375, "epoch": 0.055, "step": 22 }, { "Batch Mean": 0.23564910888671875, "accuracy": 0.84375, "epoch": 0.055, "step": 22 }, { "epoch": 0.0575, "grad_norm": 10.978965759277344, "learning_rate": 2.9763157894736843e-06, "loss": 0.6967, "step": 23 }, { "Batch Mean": 0.3892631530761719, "accuracy": 0.71875, "epoch": 0.0575, "step": 23 }, { "Batch Mean": 0.4144134521484375, "accuracy": 0.65625, "epoch": 0.0575, "step": 23 }, { "Batch Mean": 0.201019287109375, "accuracy": 0.6875, "epoch": 0.0575, "step": 23 }, { "Batch Mean": 0.15361404418945312, "accuracy": 0.5625, "epoch": 0.0575, "step": 23 }, { "epoch": 0.06, "grad_norm": 10.72164249420166, "learning_rate": 2.968421052631579e-06, "loss": 0.657, "step": 24 }, { "Batch Mean": 0.32332611083984375, "accuracy": 0.65625, "epoch": 0.06, "step": 24 }, { "Batch Mean": -0.45644378662109375, "accuracy": 0.71875, "epoch": 0.06, "step": 24 }, { "Batch Mean": 0.10271453857421875, "accuracy": 0.78125, "epoch": 0.06, "step": 24 }, { "Batch Mean": 0.5616731643676758, "accuracy": 0.78125, "epoch": 0.06, "step": 24 }, { "epoch": 0.0625, "grad_norm": 10.953572273254395, "learning_rate": 2.960526315789474e-06, "loss": 0.6313, "step": 25 }, { "Batch Mean": -0.08791732788085938, "accuracy": 0.71875, "epoch": 0.0625, "step": 25 }, { "Batch Mean": -0.12505340576171875, "accuracy": 0.71875, "epoch": 0.0625, "step": 25 }, { "Batch Mean": 0.2984886169433594, "accuracy": 0.75, "epoch": 0.0625, "step": 25 }, { "Batch Mean": -0.2277584969997406, "accuracy": 0.65625, "epoch": 0.0625, "step": 25 }, { "epoch": 0.065, "grad_norm": 8.867247581481934, "learning_rate": 2.9526315789473685e-06, "loss": 0.5531, "step": 26 }, { "Batch Mean": -0.310638427734375, "accuracy": 0.875, "epoch": 0.065, "step": 26 }, { "Batch Mean": 0.05762290954589844, "accuracy": 0.71875, "epoch": 0.065, "step": 26 }, { "Batch Mean": -0.3841552734375, "accuracy": 0.75, "epoch": 0.065, "step": 26 }, { "Batch Mean": -0.13448715209960938, "accuracy": 0.78125, "epoch": 0.065, "step": 26 }, { "epoch": 0.0675, "grad_norm": 7.167004585266113, "learning_rate": 2.9447368421052633e-06, "loss": 0.4927, "step": 27 }, { "Batch Mean": -0.5082488059997559, "accuracy": 0.59375, "epoch": 0.0675, "step": 27 }, { "Batch Mean": -0.5335745811462402, "accuracy": 0.65625, "epoch": 0.0675, "step": 27 }, { "Batch Mean": -0.3728065490722656, "accuracy": 0.65625, "epoch": 0.0675, "step": 27 }, { "Batch Mean": -0.48749029636383057, "accuracy": 0.65625, "epoch": 0.0675, "step": 27 }, { "epoch": 0.07, "grad_norm": 9.99916934967041, "learning_rate": 2.936842105263158e-06, "loss": 0.6787, "step": 28 }, { "Batch Mean": -0.5768375396728516, "accuracy": 0.78125, "epoch": 0.07, "step": 28 }, { "Batch Mean": -0.36152684688568115, "accuracy": 0.71875, "epoch": 0.07, "step": 28 }, { "Batch Mean": -0.6082801818847656, "accuracy": 0.59375, "epoch": 0.07, "step": 28 }, { "Batch Mean": -0.5176200866699219, "accuracy": 0.65625, "epoch": 0.07, "step": 28 }, { "epoch": 0.0725, "grad_norm": 6.558942794799805, "learning_rate": 2.9289473684210528e-06, "loss": 0.571, "step": 29 }, { "Batch Mean": -0.3009366989135742, "accuracy": 0.78125, "epoch": 0.0725, "step": 29 }, { "Batch Mean": -0.4234275817871094, "accuracy": 0.75, "epoch": 0.0725, "step": 29 }, { "Batch Mean": -0.4476432800292969, "accuracy": 0.78125, "epoch": 0.0725, "step": 29 }, { "Batch Mean": -0.6630382537841797, "accuracy": 0.71875, "epoch": 0.0725, "step": 29 }, { "epoch": 0.075, "grad_norm": 5.937437534332275, "learning_rate": 2.9210526315789475e-06, "loss": 0.5233, "step": 30 }, { "Batch Mean": -0.47089385986328125, "accuracy": 0.65625, "epoch": 0.075, "step": 30 }, { "Batch Mean": -0.5186127424240112, "accuracy": 0.78125, "epoch": 0.075, "step": 30 }, { "Batch Mean": -0.5250816345214844, "accuracy": 0.5625, "epoch": 0.075, "step": 30 }, { "Batch Mean": -0.3480682373046875, "accuracy": 0.8125, "epoch": 0.075, "step": 30 }, { "epoch": 0.0775, "grad_norm": 5.8368072509765625, "learning_rate": 2.9131578947368423e-06, "loss": 0.5172, "step": 31 }, { "Batch Mean": -0.29285621643066406, "accuracy": 0.8125, "epoch": 0.0775, "step": 31 }, { "Batch Mean": -0.3106422424316406, "accuracy": 0.84375, "epoch": 0.0775, "step": 31 }, { "Batch Mean": 0.005329132080078125, "accuracy": 0.71875, "epoch": 0.0775, "step": 31 }, { "Batch Mean": -0.1413421630859375, "accuracy": 0.78125, "epoch": 0.0775, "step": 31 }, { "epoch": 0.08, "grad_norm": 5.706140995025635, "learning_rate": 2.905263157894737e-06, "loss": 0.5095, "step": 32 }, { "Batch Mean": -0.04312324523925781, "accuracy": 0.6875, "epoch": 0.08, "step": 32 }, { "Batch Mean": -0.10883808135986328, "accuracy": 0.71875, "epoch": 0.08, "step": 32 }, { "Batch Mean": 0.3197288513183594, "accuracy": 0.71875, "epoch": 0.08, "step": 32 }, { "Batch Mean": -0.13158416748046875, "accuracy": 0.65625, "epoch": 0.08, "step": 32 }, { "epoch": 0.0825, "grad_norm": 6.042052268981934, "learning_rate": 2.8973684210526318e-06, "loss": 0.5717, "step": 33 }, { "Batch Mean": 0.0721282958984375, "accuracy": 0.75, "epoch": 0.0825, "step": 33 }, { "Batch Mean": 0.05409049987792969, "accuracy": 0.71875, "epoch": 0.0825, "step": 33 }, { "Batch Mean": -0.04035043716430664, "accuracy": 0.625, "epoch": 0.0825, "step": 33 }, { "Batch Mean": -0.04631471633911133, "accuracy": 0.71875, "epoch": 0.0825, "step": 33 }, { "epoch": 0.085, "grad_norm": 5.908041954040527, "learning_rate": 2.8894736842105265e-06, "loss": 0.5446, "step": 34 }, { "Batch Mean": 0.2712249755859375, "accuracy": 0.84375, "epoch": 0.085, "step": 34 }, { "Batch Mean": 0.179473876953125, "accuracy": 0.8125, "epoch": 0.085, "step": 34 }, { "Batch Mean": -0.01055145263671875, "accuracy": 0.78125, "epoch": 0.085, "step": 34 }, { "Batch Mean": 0.06919479370117188, "accuracy": 0.8125, "epoch": 0.085, "step": 34 }, { "epoch": 0.0875, "grad_norm": 4.990839958190918, "learning_rate": 2.8815789473684213e-06, "loss": 0.4607, "step": 35 }, { "Batch Mean": -0.017984390258789062, "accuracy": 0.78125, "epoch": 0.0875, "step": 35 }, { "Batch Mean": 0.075164794921875, "accuracy": 0.78125, "epoch": 0.0875, "step": 35 }, { "Batch Mean": 0.20074462890625, "accuracy": 0.8125, "epoch": 0.0875, "step": 35 }, { "Batch Mean": -0.03507876396179199, "accuracy": 0.65625, "epoch": 0.0875, "step": 35 }, { "epoch": 0.09, "grad_norm": 5.7467803955078125, "learning_rate": 2.873684210526316e-06, "loss": 0.5038, "step": 36 }, { "Batch Mean": 0.2868976593017578, "accuracy": 0.75, "epoch": 0.09, "step": 36 }, { "Batch Mean": 0.16400146484375, "accuracy": 0.65625, "epoch": 0.09, "step": 36 }, { "Batch Mean": 0.2293224334716797, "accuracy": 0.71875, "epoch": 0.09, "step": 36 }, { "Batch Mean": 0.2969036102294922, "accuracy": 0.6875, "epoch": 0.09, "step": 36 }, { "epoch": 0.0925, "grad_norm": 6.629448413848877, "learning_rate": 2.8657894736842103e-06, "loss": 0.5233, "step": 37 }, { "Batch Mean": -0.07112598419189453, "accuracy": 0.6875, "epoch": 0.0925, "step": 37 }, { "Batch Mean": 0.25348663330078125, "accuracy": 0.9375, "epoch": 0.0925, "step": 37 }, { "Batch Mean": 0.2884788513183594, "accuracy": 0.6875, "epoch": 0.0925, "step": 37 }, { "Batch Mean": 0.06340456008911133, "accuracy": 0.71875, "epoch": 0.0925, "step": 37 }, { "epoch": 0.095, "grad_norm": 6.545988082885742, "learning_rate": 2.857894736842105e-06, "loss": 0.521, "step": 38 }, { "Batch Mean": 0.372711181640625, "accuracy": 0.8125, "epoch": 0.095, "step": 38 }, { "Batch Mean": 0.2590770721435547, "accuracy": 0.53125, "epoch": 0.095, "step": 38 }, { "Batch Mean": 0.016815185546875, "accuracy": 0.6875, "epoch": 0.095, "step": 38 }, { "Batch Mean": 0.0049419403076171875, "accuracy": 0.71875, "epoch": 0.095, "step": 38 }, { "epoch": 0.0975, "grad_norm": 9.898524284362793, "learning_rate": 2.85e-06, "loss": 0.6255, "step": 39 }, { "Batch Mean": 0.6515955924987793, "accuracy": 0.6875, "epoch": 0.0975, "step": 39 }, { "Batch Mean": 0.4063148498535156, "accuracy": 0.8125, "epoch": 0.0975, "step": 39 }, { "Batch Mean": 0.1270294189453125, "accuracy": 0.71875, "epoch": 0.0975, "step": 39 }, { "Batch Mean": 0.4789772033691406, "accuracy": 0.6875, "epoch": 0.0975, "step": 39 }, { "epoch": 0.1, "grad_norm": 6.953475475311279, "learning_rate": 2.8421052631578946e-06, "loss": 0.4934, "step": 40 }, { "Batch Mean": 0.25176239013671875, "accuracy": 0.78125, "epoch": 0.1, "step": 40 }, { "Batch Mean": 0.4009513854980469, "accuracy": 0.65625, "epoch": 0.1, "step": 40 }, { "Batch Mean": 0.6202306747436523, "accuracy": 0.78125, "epoch": 0.1, "step": 40 }, { "Batch Mean": 0.2911343574523926, "accuracy": 0.78125, "epoch": 0.1, "step": 40 }, { "epoch": 0.1025, "grad_norm": 7.0007123947143555, "learning_rate": 2.8342105263157897e-06, "loss": 0.4957, "step": 41 }, { "Batch Mean": 0.13779544830322266, "accuracy": 0.625, "epoch": 0.1025, "step": 41 }, { "Batch Mean": 0.5141849517822266, "accuracy": 0.84375, "epoch": 0.1025, "step": 41 }, { "Batch Mean": 0.12182235717773438, "accuracy": 0.71875, "epoch": 0.1025, "step": 41 }, { "Batch Mean": 0.09358537197113037, "accuracy": 0.65625, "epoch": 0.1025, "step": 41 }, { "epoch": 0.105, "grad_norm": 8.165699005126953, "learning_rate": 2.8263157894736845e-06, "loss": 0.5642, "step": 42 }, { "Batch Mean": 0.26740550994873047, "accuracy": 0.8125, "epoch": 0.105, "step": 42 }, { "Batch Mean": -0.07419204711914062, "accuracy": 0.71875, "epoch": 0.105, "step": 42 }, { "Batch Mean": 0.2999420166015625, "accuracy": 0.78125, "epoch": 0.105, "step": 42 }, { "Batch Mean": -0.2398681640625, "accuracy": 0.75, "epoch": 0.105, "step": 42 }, { "epoch": 0.1075, "grad_norm": 7.090755939483643, "learning_rate": 2.8184210526315792e-06, "loss": 0.5136, "step": 43 }, { "Batch Mean": 0.3058357238769531, "accuracy": 0.875, "epoch": 0.1075, "step": 43 }, { "Batch Mean": 0.10181450843811035, "accuracy": 0.78125, "epoch": 0.1075, "step": 43 }, { "Batch Mean": -0.07529067993164062, "accuracy": 0.71875, "epoch": 0.1075, "step": 43 }, { "Batch Mean": 0.46073150634765625, "accuracy": 0.8125, "epoch": 0.1075, "step": 43 }, { "epoch": 0.11, "grad_norm": 5.939328670501709, "learning_rate": 2.810526315789474e-06, "loss": 0.464, "step": 44 }, { "Batch Mean": -0.13095474243164062, "accuracy": 0.6875, "epoch": 0.11, "step": 44 }, { "Batch Mean": 0.32462239265441895, "accuracy": 0.875, "epoch": 0.11, "step": 44 }, { "Batch Mean": -0.15337753295898438, "accuracy": 0.78125, "epoch": 0.11, "step": 44 }, { "Batch Mean": 0.38422298431396484, "accuracy": 0.71875, "epoch": 0.11, "step": 44 }, { "epoch": 0.1125, "grad_norm": 6.517725944519043, "learning_rate": 2.8026315789473687e-06, "loss": 0.4854, "step": 45 }, { "Batch Mean": 0.290924072265625, "accuracy": 0.625, "epoch": 0.1125, "step": 45 }, { "Batch Mean": 0.03897809982299805, "accuracy": 0.875, "epoch": 0.1125, "step": 45 }, { "Batch Mean": 0.20547938346862793, "accuracy": 0.875, "epoch": 0.1125, "step": 45 }, { "Batch Mean": 0.3288555145263672, "accuracy": 0.75, "epoch": 0.1125, "step": 45 }, { "epoch": 0.115, "grad_norm": 5.711620330810547, "learning_rate": 2.7947368421052635e-06, "loss": 0.4129, "step": 46 }, { "Batch Mean": 0.331978440284729, "accuracy": 0.78125, "epoch": 0.115, "step": 46 }, { "Batch Mean": -0.12884771823883057, "accuracy": 0.71875, "epoch": 0.115, "step": 46 }, { "Batch Mean": 0.2715787887573242, "accuracy": 0.65625, "epoch": 0.115, "step": 46 }, { "Batch Mean": 0.3961639404296875, "accuracy": 0.78125, "epoch": 0.115, "step": 46 }, { "epoch": 0.1175, "grad_norm": 6.815968036651611, "learning_rate": 2.7868421052631578e-06, "loss": 0.5217, "step": 47 }, { "Batch Mean": -0.05124783515930176, "accuracy": 0.84375, "epoch": 0.1175, "step": 47 }, { "Batch Mean": -0.4043617248535156, "accuracy": 0.78125, "epoch": 0.1175, "step": 47 }, { "Batch Mean": 0.21244239807128906, "accuracy": 0.78125, "epoch": 0.1175, "step": 47 }, { "Batch Mean": -0.09090805053710938, "accuracy": 0.75, "epoch": 0.1175, "step": 47 }, { "epoch": 0.12, "grad_norm": 6.305139541625977, "learning_rate": 2.7789473684210525e-06, "loss": 0.4484, "step": 48 }, { "Batch Mean": 0.3022747039794922, "accuracy": 0.65625, "epoch": 0.12, "step": 48 }, { "Batch Mean": -0.013670921325683594, "accuracy": 0.6875, "epoch": 0.12, "step": 48 }, { "Batch Mean": 0.4046478271484375, "accuracy": 0.84375, "epoch": 0.12, "step": 48 }, { "Batch Mean": 0.16419363021850586, "accuracy": 0.84375, "epoch": 0.12, "step": 48 }, { "epoch": 0.1225, "grad_norm": 5.598595142364502, "learning_rate": 2.7710526315789473e-06, "loss": 0.4684, "step": 49 }, { "Batch Mean": -0.24893569946289062, "accuracy": 0.6875, "epoch": 0.1225, "step": 49 }, { "Batch Mean": -0.2393360137939453, "accuracy": 0.8125, "epoch": 0.1225, "step": 49 }, { "Batch Mean": 0.2698392868041992, "accuracy": 0.8125, "epoch": 0.1225, "step": 49 }, { "Batch Mean": -0.3564453125, "accuracy": 0.75, "epoch": 0.1225, "step": 49 }, { "epoch": 0.125, "grad_norm": 6.394057750701904, "learning_rate": 2.763157894736842e-06, "loss": 0.4703, "step": 50 }, { "Batch Mean": -0.09824085235595703, "accuracy": 0.71875, "epoch": 0.125, "step": 50 }, { "Batch Mean": -0.1602630615234375, "accuracy": 0.8125, "epoch": 0.125, "step": 50 }, { "Batch Mean": -0.6205692291259766, "accuracy": 0.625, "epoch": 0.125, "step": 50 }, { "Batch Mean": 0.06192302703857422, "accuracy": 0.75, "epoch": 0.125, "step": 50 }, { "epoch": 0.1275, "grad_norm": 7.542079925537109, "learning_rate": 2.7552631578947368e-06, "loss": 0.4731, "step": 51 }, { "Batch Mean": -0.24329090118408203, "accuracy": 0.71875, "epoch": 0.1275, "step": 51 }, { "Batch Mean": 0.277587890625, "accuracy": 0.78125, "epoch": 0.1275, "step": 51 }, { "Batch Mean": -0.1536083221435547, "accuracy": 0.8125, "epoch": 0.1275, "step": 51 }, { "Batch Mean": -0.2829427719116211, "accuracy": 0.90625, "epoch": 0.1275, "step": 51 }, { "epoch": 0.13, "grad_norm": 6.608920097351074, "learning_rate": 2.7473684210526315e-06, "loss": 0.4472, "step": 52 }, { "Batch Mean": -0.2534487247467041, "accuracy": 0.78125, "epoch": 0.13, "step": 52 }, { "Batch Mean": -0.3897590637207031, "accuracy": 0.6875, "epoch": 0.13, "step": 52 }, { "Batch Mean": 0.0982666015625, "accuracy": 0.75, "epoch": 0.13, "step": 52 }, { "Batch Mean": -0.19083404541015625, "accuracy": 0.78125, "epoch": 0.13, "step": 52 }, { "epoch": 0.1325, "grad_norm": 9.115386962890625, "learning_rate": 2.7394736842105263e-06, "loss": 0.4964, "step": 53 }, { "Batch Mean": -0.07914352416992188, "accuracy": 0.84375, "epoch": 0.1325, "step": 53 }, { "Batch Mean": -0.8162860870361328, "accuracy": 0.75, "epoch": 0.1325, "step": 53 }, { "Batch Mean": -0.9538593292236328, "accuracy": 0.71875, "epoch": 0.1325, "step": 53 }, { "Batch Mean": 0.025072097778320312, "accuracy": 0.78125, "epoch": 0.1325, "step": 53 }, { "epoch": 0.135, "grad_norm": 9.654952049255371, "learning_rate": 2.7315789473684214e-06, "loss": 0.4771, "step": 54 }, { "Batch Mean": -0.2607238292694092, "accuracy": 0.65625, "epoch": 0.135, "step": 54 }, { "Batch Mean": 0.07077789306640625, "accuracy": 0.875, "epoch": 0.135, "step": 54 }, { "Batch Mean": -0.2121124267578125, "accuracy": 0.84375, "epoch": 0.135, "step": 54 }, { "Batch Mean": 0.040355682373046875, "accuracy": 0.6875, "epoch": 0.135, "step": 54 }, { "epoch": 0.1375, "grad_norm": 9.226275444030762, "learning_rate": 2.723684210526316e-06, "loss": 0.477, "step": 55 }, { "Batch Mean": 0.14949023723602295, "accuracy": 0.75, "epoch": 0.1375, "step": 55 }, { "Batch Mean": -0.2880672216415405, "accuracy": 0.875, "epoch": 0.1375, "step": 55 }, { "Batch Mean": -0.037652015686035156, "accuracy": 0.75, "epoch": 0.1375, "step": 55 }, { "Batch Mean": 0.11230850219726562, "accuracy": 0.78125, "epoch": 0.1375, "step": 55 }, { "epoch": 0.14, "grad_norm": 8.696858406066895, "learning_rate": 2.715789473684211e-06, "loss": 0.4385, "step": 56 }, { "Batch Mean": -0.14437103271484375, "accuracy": 0.84375, "epoch": 0.14, "step": 56 }, { "Batch Mean": -0.3502960205078125, "accuracy": 0.78125, "epoch": 0.14, "step": 56 }, { "Batch Mean": -0.3359222412109375, "accuracy": 0.65625, "epoch": 0.14, "step": 56 }, { "Batch Mean": -0.4460906982421875, "accuracy": 0.71875, "epoch": 0.14, "step": 56 }, { "epoch": 0.1425, "grad_norm": 10.204813003540039, "learning_rate": 2.7078947368421052e-06, "loss": 0.4971, "step": 57 }, { "Batch Mean": -0.0918121337890625, "accuracy": 0.75, "epoch": 0.1425, "step": 57 }, { "Batch Mean": 0.1797332763671875, "accuracy": 0.78125, "epoch": 0.1425, "step": 57 }, { "Batch Mean": -0.22362709045410156, "accuracy": 0.65625, "epoch": 0.1425, "step": 57 }, { "Batch Mean": -0.932403564453125, "accuracy": 0.78125, "epoch": 0.1425, "step": 57 }, { "epoch": 0.145, "grad_norm": 9.547924995422363, "learning_rate": 2.7e-06, "loss": 0.5235, "step": 58 }, { "Batch Mean": -0.69256591796875, "accuracy": 0.75, "epoch": 0.145, "step": 58 }, { "Batch Mean": -0.408052921295166, "accuracy": 0.71875, "epoch": 0.145, "step": 58 }, { "Batch Mean": -0.7247238159179688, "accuracy": 0.75, "epoch": 0.145, "step": 58 }, { "Batch Mean": -0.5294733047485352, "accuracy": 0.8125, "epoch": 0.145, "step": 58 }, { "epoch": 0.1475, "grad_norm": 8.18185043334961, "learning_rate": 2.6921052631578947e-06, "loss": 0.4697, "step": 59 }, { "Batch Mean": -0.562103271484375, "accuracy": 0.75, "epoch": 0.1475, "step": 59 }, { "Batch Mean": -0.36240386962890625, "accuracy": 0.71875, "epoch": 0.1475, "step": 59 }, { "Batch Mean": -0.8479537963867188, "accuracy": 0.75, "epoch": 0.1475, "step": 59 }, { "Batch Mean": -0.5514106750488281, "accuracy": 0.8125, "epoch": 0.1475, "step": 59 }, { "epoch": 0.15, "grad_norm": 9.638142585754395, "learning_rate": 2.6842105263157895e-06, "loss": 0.4854, "step": 60 }, { "Batch Mean": -0.9713249206542969, "accuracy": 0.875, "epoch": 0.15, "step": 60 }, { "Batch Mean": -1.4701347351074219, "accuracy": 0.8125, "epoch": 0.15, "step": 60 }, { "Batch Mean": -0.8054180145263672, "accuracy": 0.6875, "epoch": 0.15, "step": 60 }, { "Batch Mean": -1.1165752410888672, "accuracy": 0.875, "epoch": 0.15, "step": 60 }, { "epoch": 0.1525, "grad_norm": 9.138744354248047, "learning_rate": 2.6763157894736842e-06, "loss": 0.4093, "step": 61 }, { "Batch Mean": -1.2550277709960938, "accuracy": 0.78125, "epoch": 0.1525, "step": 61 }, { "Batch Mean": -0.9237594604492188, "accuracy": 0.6875, "epoch": 0.1525, "step": 61 }, { "Batch Mean": -0.9178142547607422, "accuracy": 0.875, "epoch": 0.1525, "step": 61 }, { "Batch Mean": -0.8621349334716797, "accuracy": 0.75, "epoch": 0.1525, "step": 61 }, { "epoch": 0.155, "grad_norm": 9.812451362609863, "learning_rate": 2.668421052631579e-06, "loss": 0.4354, "step": 62 }, { "Batch Mean": -1.3034553527832031, "accuracy": 0.78125, "epoch": 0.155, "step": 62 }, { "Batch Mean": -1.0795440673828125, "accuracy": 0.78125, "epoch": 0.155, "step": 62 }, { "Batch Mean": -1.0960693359375, "accuracy": 0.84375, "epoch": 0.155, "step": 62 }, { "Batch Mean": -1.2091312408447266, "accuracy": 0.6875, "epoch": 0.155, "step": 62 }, { "epoch": 0.1575, "grad_norm": 9.518035888671875, "learning_rate": 2.6605263157894737e-06, "loss": 0.4399, "step": 63 }, { "Batch Mean": -1.405853271484375, "accuracy": 0.6875, "epoch": 0.1575, "step": 63 }, { "Batch Mean": -1.4421844482421875, "accuracy": 0.71875, "epoch": 0.1575, "step": 63 }, { "Batch Mean": -1.2391834259033203, "accuracy": 0.6875, "epoch": 0.1575, "step": 63 }, { "Batch Mean": -0.881195068359375, "accuracy": 0.8125, "epoch": 0.1575, "step": 63 }, { "epoch": 0.16, "grad_norm": 10.348162651062012, "learning_rate": 2.6526315789473685e-06, "loss": 0.537, "step": 64 }, { "Batch Mean": -1.0128021240234375, "accuracy": 0.84375, "epoch": 0.16, "step": 64 }, { "Batch Mean": -1.0150184631347656, "accuracy": 0.84375, "epoch": 0.16, "step": 64 }, { "Batch Mean": -1.497243881225586, "accuracy": 0.78125, "epoch": 0.16, "step": 64 }, { "Batch Mean": -0.910819947719574, "accuracy": 0.75, "epoch": 0.16, "step": 64 }, { "epoch": 0.1625, "grad_norm": 8.633638381958008, "learning_rate": 2.644736842105263e-06, "loss": 0.4436, "step": 65 }, { "Batch Mean": -1.0223121643066406, "accuracy": 0.71875, "epoch": 0.1625, "step": 65 }, { "Batch Mean": -0.5706081390380859, "accuracy": 0.78125, "epoch": 0.1625, "step": 65 }, { "Batch Mean": -0.965911865234375, "accuracy": 0.8125, "epoch": 0.1625, "step": 65 }, { "Batch Mean": -0.7304267883300781, "accuracy": 0.8125, "epoch": 0.1625, "step": 65 }, { "epoch": 0.165, "grad_norm": 8.088103294372559, "learning_rate": 2.636842105263158e-06, "loss": 0.4446, "step": 66 }, { "Batch Mean": -0.4677067697048187, "accuracy": 0.8125, "epoch": 0.165, "step": 66 }, { "Batch Mean": -1.4533824920654297, "accuracy": 0.78125, "epoch": 0.165, "step": 66 }, { "Batch Mean": -0.78509521484375, "accuracy": 0.84375, "epoch": 0.165, "step": 66 }, { "Batch Mean": -0.8427619934082031, "accuracy": 0.78125, "epoch": 0.165, "step": 66 }, { "epoch": 0.1675, "grad_norm": 7.766864776611328, "learning_rate": 2.6289473684210527e-06, "loss": 0.412, "step": 67 }, { "Batch Mean": -1.0267219543457031, "accuracy": 0.75, "epoch": 0.1675, "step": 67 }, { "Batch Mean": -0.0344390869140625, "accuracy": 0.78125, "epoch": 0.1675, "step": 67 }, { "Batch Mean": -0.7120513916015625, "accuracy": 0.8125, "epoch": 0.1675, "step": 67 }, { "Batch Mean": -0.8848686218261719, "accuracy": 0.75, "epoch": 0.1675, "step": 67 }, { "epoch": 0.17, "grad_norm": 8.952485084533691, "learning_rate": 2.6210526315789474e-06, "loss": 0.4073, "step": 68 }, { "Batch Mean": -1.2683296203613281, "accuracy": 0.78125, "epoch": 0.17, "step": 68 }, { "Batch Mean": -0.9470596313476562, "accuracy": 0.6875, "epoch": 0.17, "step": 68 }, { "Batch Mean": -1.2335700988769531, "accuracy": 0.78125, "epoch": 0.17, "step": 68 }, { "Batch Mean": -0.9984736442565918, "accuracy": 0.71875, "epoch": 0.17, "step": 68 }, { "epoch": 0.1725, "grad_norm": 8.944815635681152, "learning_rate": 2.613157894736842e-06, "loss": 0.4827, "step": 69 }, { "Batch Mean": -0.6530609130859375, "accuracy": 0.8125, "epoch": 0.1725, "step": 69 }, { "Batch Mean": -0.6013336181640625, "accuracy": 0.6875, "epoch": 0.1725, "step": 69 }, { "Batch Mean": -1.4489421844482422, "accuracy": 0.8125, "epoch": 0.1725, "step": 69 }, { "Batch Mean": -0.9736480712890625, "accuracy": 0.90625, "epoch": 0.1725, "step": 69 }, { "epoch": 0.175, "grad_norm": 8.779143333435059, "learning_rate": 2.605263157894737e-06, "loss": 0.4578, "step": 70 }, { "Batch Mean": -0.9107780456542969, "accuracy": 0.84375, "epoch": 0.175, "step": 70 }, { "Batch Mean": -1.1361122131347656, "accuracy": 0.6875, "epoch": 0.175, "step": 70 }, { "Batch Mean": -0.6527862548828125, "accuracy": 0.9375, "epoch": 0.175, "step": 70 }, { "Batch Mean": -0.7553470134735107, "accuracy": 0.6875, "epoch": 0.175, "step": 70 }, { "epoch": 0.1775, "grad_norm": 8.647814750671387, "learning_rate": 2.5973684210526317e-06, "loss": 0.4257, "step": 71 }, { "Batch Mean": -0.41971588134765625, "accuracy": 0.75, "epoch": 0.1775, "step": 71 }, { "Batch Mean": -0.705718994140625, "accuracy": 0.875, "epoch": 0.1775, "step": 71 }, { "Batch Mean": -1.0686330795288086, "accuracy": 0.71875, "epoch": 0.1775, "step": 71 }, { "Batch Mean": -0.8464865684509277, "accuracy": 0.78125, "epoch": 0.1775, "step": 71 }, { "epoch": 0.18, "grad_norm": 8.784235000610352, "learning_rate": 2.5894736842105264e-06, "loss": 0.3921, "step": 72 }, { "Batch Mean": -0.7266769409179688, "accuracy": 0.8125, "epoch": 0.18, "step": 72 }, { "Batch Mean": -0.7239456176757812, "accuracy": 0.8125, "epoch": 0.18, "step": 72 }, { "Batch Mean": -0.6862373352050781, "accuracy": 0.8125, "epoch": 0.18, "step": 72 }, { "Batch Mean": -0.525360107421875, "accuracy": 0.875, "epoch": 0.18, "step": 72 }, { "epoch": 0.1825, "grad_norm": 7.80237340927124, "learning_rate": 2.581578947368421e-06, "loss": 0.374, "step": 73 }, { "Batch Mean": -0.9130859375, "accuracy": 0.84375, "epoch": 0.1825, "step": 73 }, { "Batch Mean": -0.35595703125, "accuracy": 0.875, "epoch": 0.1825, "step": 73 }, { "Batch Mean": -0.8892440795898438, "accuracy": 0.6875, "epoch": 0.1825, "step": 73 }, { "Batch Mean": -0.4263725280761719, "accuracy": 0.71875, "epoch": 0.1825, "step": 73 }, { "epoch": 0.185, "grad_norm": 7.894434452056885, "learning_rate": 2.573684210526316e-06, "loss": 0.4405, "step": 74 }, { "Batch Mean": -0.6322441101074219, "accuracy": 0.8125, "epoch": 0.185, "step": 74 }, { "Batch Mean": -0.456390380859375, "accuracy": 0.875, "epoch": 0.185, "step": 74 }, { "Batch Mean": -1.1346385478973389, "accuracy": 0.84375, "epoch": 0.185, "step": 74 }, { "Batch Mean": -0.8706645965576172, "accuracy": 0.8125, "epoch": 0.185, "step": 74 }, { "epoch": 0.1875, "grad_norm": 8.292348861694336, "learning_rate": 2.5657894736842107e-06, "loss": 0.3895, "step": 75 }, { "Batch Mean": -1.0383148193359375, "accuracy": 0.8125, "epoch": 0.1875, "step": 75 }, { "Batch Mean": 0.19762420654296875, "accuracy": 0.78125, "epoch": 0.1875, "step": 75 }, { "Batch Mean": 0.10494613647460938, "accuracy": 0.78125, "epoch": 0.1875, "step": 75 }, { "Batch Mean": -0.5181140899658203, "accuracy": 0.84375, "epoch": 0.1875, "step": 75 }, { "epoch": 0.19, "grad_norm": 8.882412910461426, "learning_rate": 2.5578947368421054e-06, "loss": 0.3805, "step": 76 }, { "Batch Mean": -0.4382622241973877, "accuracy": 0.75, "epoch": 0.19, "step": 76 }, { "Batch Mean": -0.16241741180419922, "accuracy": 0.90625, "epoch": 0.19, "step": 76 }, { "Batch Mean": -0.8776988983154297, "accuracy": 0.875, "epoch": 0.19, "step": 76 }, { "Batch Mean": -0.6540908813476562, "accuracy": 0.8125, "epoch": 0.19, "step": 76 }, { "epoch": 0.1925, "grad_norm": 10.025094032287598, "learning_rate": 2.55e-06, "loss": 0.4001, "step": 77 }, { "Batch Mean": 0.06690788269042969, "accuracy": 0.78125, "epoch": 0.1925, "step": 77 }, { "Batch Mean": -0.03551149368286133, "accuracy": 0.75, "epoch": 0.1925, "step": 77 }, { "Batch Mean": 0.17040252685546875, "accuracy": 0.78125, "epoch": 0.1925, "step": 77 }, { "Batch Mean": -0.019598007202148438, "accuracy": 0.90625, "epoch": 0.1925, "step": 77 }, { "epoch": 0.195, "grad_norm": 9.164822578430176, "learning_rate": 2.542105263157895e-06, "loss": 0.3807, "step": 78 }, { "Batch Mean": 0.0378570556640625, "accuracy": 0.75, "epoch": 0.195, "step": 78 }, { "Batch Mean": 0.3024101257324219, "accuracy": 0.75, "epoch": 0.195, "step": 78 }, { "Batch Mean": 0.1015625, "accuracy": 0.6875, "epoch": 0.195, "step": 78 }, { "Batch Mean": 0.10402488708496094, "accuracy": 0.875, "epoch": 0.195, "step": 78 }, { "epoch": 0.1975, "grad_norm": 9.871844291687012, "learning_rate": 2.5342105263157892e-06, "loss": 0.4781, "step": 79 }, { "Batch Mean": 0.1356794238090515, "accuracy": 0.8125, "epoch": 0.1975, "step": 79 }, { "Batch Mean": 0.0782623291015625, "accuracy": 0.78125, "epoch": 0.1975, "step": 79 }, { "Batch Mean": -0.12647247314453125, "accuracy": 0.8125, "epoch": 0.1975, "step": 79 }, { "Batch Mean": 0.2567100524902344, "accuracy": 0.78125, "epoch": 0.1975, "step": 79 }, { "epoch": 0.2, "grad_norm": 9.033759117126465, "learning_rate": 2.526315789473684e-06, "loss": 0.4288, "step": 80 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 80, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }