JW17's picture
Add files using upload-large-folder tool
d89a705 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2,
"eval_steps": 500,
"global_step": 80,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"Batch Mean": -1.4581298828125,
"accuracy": 0.28125,
"epoch": 0,
"step": 0
},
{
"Batch Mean": -1.4786376953125,
"accuracy": 0.46875,
"epoch": 0,
"step": 0
},
{
"Batch Mean": -1.486572265625,
"accuracy": 0.5,
"epoch": 0,
"step": 0
},
{
"Batch Mean": -1.439697265625,
"accuracy": 0.625,
"epoch": 0,
"step": 0
},
{
"epoch": 0.0025,
"grad_norm": 2.7191572189331055,
"learning_rate": 1.5000000000000002e-07,
"loss": 0.6927,
"step": 1
},
{
"Batch Mean": -1.4107666015625,
"accuracy": 0.4375,
"epoch": 0.0025,
"step": 1
},
{
"Batch Mean": -1.4342041015625,
"accuracy": 0.5,
"epoch": 0.0025,
"step": 1
},
{
"Batch Mean": -1.45263671875,
"accuracy": 0.5625,
"epoch": 0.0025,
"step": 1
},
{
"Batch Mean": -1.4517822265625,
"accuracy": 0.5625,
"epoch": 0.0025,
"step": 1
},
{
"epoch": 0.005,
"grad_norm": 3.204066038131714,
"learning_rate": 3.0000000000000004e-07,
"loss": 0.6964,
"step": 2
},
{
"Batch Mean": -1.4908447265625,
"accuracy": 0.59375,
"epoch": 0.005,
"step": 2
},
{
"Batch Mean": -1.425048828125,
"accuracy": 0.4375,
"epoch": 0.005,
"step": 2
},
{
"Batch Mean": -1.464111328125,
"accuracy": 0.375,
"epoch": 0.005,
"step": 2
},
{
"Batch Mean": -1.4324951171875,
"accuracy": 0.59375,
"epoch": 0.005,
"step": 2
},
{
"epoch": 0.0075,
"grad_norm": 3.103353261947632,
"learning_rate": 4.5e-07,
"loss": 0.6991,
"step": 3
},
{
"Batch Mean": -1.494140625,
"accuracy": 0.46875,
"epoch": 0.0075,
"step": 3
},
{
"Batch Mean": -1.4178466796875,
"accuracy": 0.625,
"epoch": 0.0075,
"step": 3
},
{
"Batch Mean": -1.520751953125,
"accuracy": 0.59375,
"epoch": 0.0075,
"step": 3
},
{
"Batch Mean": -1.4844970703125,
"accuracy": 0.5625,
"epoch": 0.0075,
"step": 3
},
{
"epoch": 0.01,
"grad_norm": 3.3672587871551514,
"learning_rate": 6.000000000000001e-07,
"loss": 0.6883,
"step": 4
},
{
"Batch Mean": -1.4312744140625,
"accuracy": 0.4375,
"epoch": 0.01,
"step": 4
},
{
"Batch Mean": -1.4820556640625,
"accuracy": 0.5625,
"epoch": 0.01,
"step": 4
},
{
"Batch Mean": -1.4405517578125,
"accuracy": 0.5,
"epoch": 0.01,
"step": 4
},
{
"Batch Mean": -1.4302978515625,
"accuracy": 0.53125,
"epoch": 0.01,
"step": 4
},
{
"epoch": 0.0125,
"grad_norm": 3.158576011657715,
"learning_rate": 7.5e-07,
"loss": 0.7012,
"step": 5
},
{
"Batch Mean": -1.4569091796875,
"accuracy": 0.40625,
"epoch": 0.0125,
"step": 5
},
{
"Batch Mean": -1.46435546875,
"accuracy": 0.5,
"epoch": 0.0125,
"step": 5
},
{
"Batch Mean": -1.4354248046875,
"accuracy": 0.5625,
"epoch": 0.0125,
"step": 5
},
{
"Batch Mean": -1.47412109375,
"accuracy": 0.40625,
"epoch": 0.0125,
"step": 5
},
{
"epoch": 0.015,
"grad_norm": 4.888192176818848,
"learning_rate": 9e-07,
"loss": 0.7118,
"step": 6
},
{
"Batch Mean": -1.4361572265625,
"accuracy": 0.53125,
"epoch": 0.015,
"step": 6
},
{
"Batch Mean": -1.4234619140625,
"accuracy": 0.625,
"epoch": 0.015,
"step": 6
},
{
"Batch Mean": -1.4453125,
"accuracy": 0.375,
"epoch": 0.015,
"step": 6
},
{
"Batch Mean": -1.44287109375,
"accuracy": 0.5,
"epoch": 0.015,
"step": 6
},
{
"epoch": 0.0175,
"grad_norm": 3.654751777648926,
"learning_rate": 1.05e-06,
"loss": 0.6901,
"step": 7
},
{
"Batch Mean": -1.4200439453125,
"accuracy": 0.53125,
"epoch": 0.0175,
"step": 7
},
{
"Batch Mean": -1.406494140625,
"accuracy": 0.4375,
"epoch": 0.0175,
"step": 7
},
{
"Batch Mean": -1.4012451171875,
"accuracy": 0.53125,
"epoch": 0.0175,
"step": 7
},
{
"Batch Mean": -1.4122314453125,
"accuracy": 0.4375,
"epoch": 0.0175,
"step": 7
},
{
"epoch": 0.02,
"grad_norm": 3.2707793712615967,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.7026,
"step": 8
},
{
"Batch Mean": -1.400634765625,
"accuracy": 0.5625,
"epoch": 0.02,
"step": 8
},
{
"Batch Mean": -1.3936767578125,
"accuracy": 0.46875,
"epoch": 0.02,
"step": 8
},
{
"Batch Mean": -1.4110107421875,
"accuracy": 0.34375,
"epoch": 0.02,
"step": 8
},
{
"Batch Mean": -1.4215087890625,
"accuracy": 0.625,
"epoch": 0.02,
"step": 8
},
{
"epoch": 0.0225,
"grad_norm": 3.053551197052002,
"learning_rate": 1.35e-06,
"loss": 0.6859,
"step": 9
},
{
"Batch Mean": -1.35302734375,
"accuracy": 0.5625,
"epoch": 0.0225,
"step": 9
},
{
"Batch Mean": -1.35003662109375,
"accuracy": 0.40625,
"epoch": 0.0225,
"step": 9
},
{
"Batch Mean": -1.39306640625,
"accuracy": 0.5625,
"epoch": 0.0225,
"step": 9
},
{
"Batch Mean": -1.3843994140625,
"accuracy": 0.53125,
"epoch": 0.0225,
"step": 9
},
{
"epoch": 0.025,
"grad_norm": 2.9442760944366455,
"learning_rate": 1.5e-06,
"loss": 0.6853,
"step": 10
},
{
"Batch Mean": -1.31396484375,
"accuracy": 0.46875,
"epoch": 0.025,
"step": 10
},
{
"Batch Mean": -1.33154296875,
"accuracy": 0.5625,
"epoch": 0.025,
"step": 10
},
{
"Batch Mean": -1.3260498046875,
"accuracy": 0.46875,
"epoch": 0.025,
"step": 10
},
{
"Batch Mean": -1.3170166015625,
"accuracy": 0.4375,
"epoch": 0.025,
"step": 10
},
{
"epoch": 0.0275,
"grad_norm": 2.729567050933838,
"learning_rate": 1.65e-06,
"loss": 0.6946,
"step": 11
},
{
"Batch Mean": -1.24346923828125,
"accuracy": 0.4375,
"epoch": 0.0275,
"step": 11
},
{
"Batch Mean": -1.239013671875,
"accuracy": 0.59375,
"epoch": 0.0275,
"step": 11
},
{
"Batch Mean": -1.3074951171875,
"accuracy": 0.5,
"epoch": 0.0275,
"step": 11
},
{
"Batch Mean": -1.24664306640625,
"accuracy": 0.53125,
"epoch": 0.0275,
"step": 11
},
{
"epoch": 0.03,
"grad_norm": 2.8832643032073975,
"learning_rate": 1.8e-06,
"loss": 0.6869,
"step": 12
},
{
"Batch Mean": -1.2061767578125,
"accuracy": 0.625,
"epoch": 0.03,
"step": 12
},
{
"Batch Mean": -1.09735107421875,
"accuracy": 0.5625,
"epoch": 0.03,
"step": 12
},
{
"Batch Mean": -1.1669921875,
"accuracy": 0.625,
"epoch": 0.03,
"step": 12
},
{
"Batch Mean": -1.107421875,
"accuracy": 0.53125,
"epoch": 0.03,
"step": 12
},
{
"epoch": 0.0325,
"grad_norm": 3.347060441970825,
"learning_rate": 1.95e-06,
"loss": 0.676,
"step": 13
},
{
"Batch Mean": -0.99713134765625,
"accuracy": 0.53125,
"epoch": 0.0325,
"step": 13
},
{
"Batch Mean": -0.992431640625,
"accuracy": 0.6875,
"epoch": 0.0325,
"step": 13
},
{
"Batch Mean": -1.08367919921875,
"accuracy": 0.65625,
"epoch": 0.0325,
"step": 13
},
{
"Batch Mean": -1.073486328125,
"accuracy": 0.5625,
"epoch": 0.0325,
"step": 13
},
{
"epoch": 0.035,
"grad_norm": 3.0629279613494873,
"learning_rate": 2.1e-06,
"loss": 0.6446,
"step": 14
},
{
"Batch Mean": -1.027008056640625,
"accuracy": 0.5625,
"epoch": 0.035,
"step": 14
},
{
"Batch Mean": -1.04302978515625,
"accuracy": 0.625,
"epoch": 0.035,
"step": 14
},
{
"Batch Mean": -0.986724853515625,
"accuracy": 0.71875,
"epoch": 0.035,
"step": 14
},
{
"Batch Mean": -1.010406494140625,
"accuracy": 0.65625,
"epoch": 0.035,
"step": 14
},
{
"epoch": 0.0375,
"grad_norm": 3.297088146209717,
"learning_rate": 2.25e-06,
"loss": 0.6466,
"step": 15
},
{
"Batch Mean": -0.945648193359375,
"accuracy": 0.625,
"epoch": 0.0375,
"step": 15
},
{
"Batch Mean": -0.90460205078125,
"accuracy": 0.625,
"epoch": 0.0375,
"step": 15
},
{
"Batch Mean": -0.9103546142578125,
"accuracy": 0.625,
"epoch": 0.0375,
"step": 15
},
{
"Batch Mean": -0.84765625,
"accuracy": 0.8125,
"epoch": 0.0375,
"step": 15
},
{
"epoch": 0.04,
"grad_norm": 3.339815855026245,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.6261,
"step": 16
},
{
"Batch Mean": -0.7341957092285156,
"accuracy": 0.65625,
"epoch": 0.04,
"step": 16
},
{
"Batch Mean": -0.6576881408691406,
"accuracy": 0.8125,
"epoch": 0.04,
"step": 16
},
{
"Batch Mean": -0.7573471069335938,
"accuracy": 0.75,
"epoch": 0.04,
"step": 16
},
{
"Batch Mean": -0.8988265991210938,
"accuracy": 0.84375,
"epoch": 0.04,
"step": 16
},
{
"epoch": 0.0425,
"grad_norm": 4.010303974151611,
"learning_rate": 2.55e-06,
"loss": 0.6324,
"step": 17
},
{
"Batch Mean": -0.45727968215942383,
"accuracy": 0.59375,
"epoch": 0.0425,
"step": 17
},
{
"Batch Mean": -0.40456533432006836,
"accuracy": 0.6875,
"epoch": 0.0425,
"step": 17
},
{
"Batch Mean": -0.4847888946533203,
"accuracy": 0.59375,
"epoch": 0.0425,
"step": 17
},
{
"Batch Mean": -0.31931304931640625,
"accuracy": 0.65625,
"epoch": 0.0425,
"step": 17
},
{
"epoch": 0.045,
"grad_norm": 4.431520462036133,
"learning_rate": 2.7e-06,
"loss": 0.62,
"step": 18
},
{
"Batch Mean": -0.0693979263305664,
"accuracy": 0.8125,
"epoch": 0.045,
"step": 18
},
{
"Batch Mean": -0.23062896728515625,
"accuracy": 0.625,
"epoch": 0.045,
"step": 18
},
{
"Batch Mean": -0.10647201538085938,
"accuracy": 0.65625,
"epoch": 0.045,
"step": 18
},
{
"Batch Mean": -0.07384902238845825,
"accuracy": 0.6875,
"epoch": 0.045,
"step": 18
},
{
"epoch": 0.0475,
"grad_norm": 5.421309947967529,
"learning_rate": 2.85e-06,
"loss": 0.5896,
"step": 19
},
{
"Batch Mean": -0.12799835205078125,
"accuracy": 0.6875,
"epoch": 0.0475,
"step": 19
},
{
"Batch Mean": 0.0684967041015625,
"accuracy": 0.875,
"epoch": 0.0475,
"step": 19
},
{
"Batch Mean": -0.014011383056640625,
"accuracy": 0.65625,
"epoch": 0.0475,
"step": 19
},
{
"Batch Mean": 0.0633087158203125,
"accuracy": 0.84375,
"epoch": 0.0475,
"step": 19
},
{
"epoch": 0.05,
"grad_norm": 5.102872848510742,
"learning_rate": 3e-06,
"loss": 0.4931,
"step": 20
},
{
"Batch Mean": 0.19290733337402344,
"accuracy": 0.8125,
"epoch": 0.05,
"step": 20
},
{
"Batch Mean": 0.29687976837158203,
"accuracy": 0.53125,
"epoch": 0.05,
"step": 20
},
{
"Batch Mean": 0.103363037109375,
"accuracy": 0.625,
"epoch": 0.05,
"step": 20
},
{
"Batch Mean": 0.3869609832763672,
"accuracy": 0.71875,
"epoch": 0.05,
"step": 20
},
{
"epoch": 0.0525,
"grad_norm": 7.569705486297607,
"learning_rate": 2.992105263157895e-06,
"loss": 0.5976,
"step": 21
},
{
"Batch Mean": 0.47769927978515625,
"accuracy": 0.71875,
"epoch": 0.0525,
"step": 21
},
{
"Batch Mean": 0.5898284912109375,
"accuracy": 0.875,
"epoch": 0.0525,
"step": 21
},
{
"Batch Mean": 0.3037242889404297,
"accuracy": 0.75,
"epoch": 0.0525,
"step": 21
},
{
"Batch Mean": -0.037357330322265625,
"accuracy": 0.75,
"epoch": 0.0525,
"step": 21
},
{
"epoch": 0.055,
"grad_norm": 7.273393630981445,
"learning_rate": 2.9842105263157896e-06,
"loss": 0.4982,
"step": 22
},
{
"Batch Mean": 0.78411865234375,
"accuracy": 0.59375,
"epoch": 0.055,
"step": 22
},
{
"Batch Mean": 0.3693389892578125,
"accuracy": 0.625,
"epoch": 0.055,
"step": 22
},
{
"Batch Mean": 0.3277778625488281,
"accuracy": 0.59375,
"epoch": 0.055,
"step": 22
},
{
"Batch Mean": 0.23564910888671875,
"accuracy": 0.84375,
"epoch": 0.055,
"step": 22
},
{
"epoch": 0.0575,
"grad_norm": 10.978965759277344,
"learning_rate": 2.9763157894736843e-06,
"loss": 0.6967,
"step": 23
},
{
"Batch Mean": 0.3892631530761719,
"accuracy": 0.71875,
"epoch": 0.0575,
"step": 23
},
{
"Batch Mean": 0.4144134521484375,
"accuracy": 0.65625,
"epoch": 0.0575,
"step": 23
},
{
"Batch Mean": 0.201019287109375,
"accuracy": 0.6875,
"epoch": 0.0575,
"step": 23
},
{
"Batch Mean": 0.15361404418945312,
"accuracy": 0.5625,
"epoch": 0.0575,
"step": 23
},
{
"epoch": 0.06,
"grad_norm": 10.72164249420166,
"learning_rate": 2.968421052631579e-06,
"loss": 0.657,
"step": 24
},
{
"Batch Mean": 0.32332611083984375,
"accuracy": 0.65625,
"epoch": 0.06,
"step": 24
},
{
"Batch Mean": -0.45644378662109375,
"accuracy": 0.71875,
"epoch": 0.06,
"step": 24
},
{
"Batch Mean": 0.10271453857421875,
"accuracy": 0.78125,
"epoch": 0.06,
"step": 24
},
{
"Batch Mean": 0.5616731643676758,
"accuracy": 0.78125,
"epoch": 0.06,
"step": 24
},
{
"epoch": 0.0625,
"grad_norm": 10.953572273254395,
"learning_rate": 2.960526315789474e-06,
"loss": 0.6313,
"step": 25
},
{
"Batch Mean": -0.08791732788085938,
"accuracy": 0.71875,
"epoch": 0.0625,
"step": 25
},
{
"Batch Mean": -0.12505340576171875,
"accuracy": 0.71875,
"epoch": 0.0625,
"step": 25
},
{
"Batch Mean": 0.2984886169433594,
"accuracy": 0.75,
"epoch": 0.0625,
"step": 25
},
{
"Batch Mean": -0.2277584969997406,
"accuracy": 0.65625,
"epoch": 0.0625,
"step": 25
},
{
"epoch": 0.065,
"grad_norm": 8.867247581481934,
"learning_rate": 2.9526315789473685e-06,
"loss": 0.5531,
"step": 26
},
{
"Batch Mean": -0.310638427734375,
"accuracy": 0.875,
"epoch": 0.065,
"step": 26
},
{
"Batch Mean": 0.05762290954589844,
"accuracy": 0.71875,
"epoch": 0.065,
"step": 26
},
{
"Batch Mean": -0.3841552734375,
"accuracy": 0.75,
"epoch": 0.065,
"step": 26
},
{
"Batch Mean": -0.13448715209960938,
"accuracy": 0.78125,
"epoch": 0.065,
"step": 26
},
{
"epoch": 0.0675,
"grad_norm": 7.167004585266113,
"learning_rate": 2.9447368421052633e-06,
"loss": 0.4927,
"step": 27
},
{
"Batch Mean": -0.5082488059997559,
"accuracy": 0.59375,
"epoch": 0.0675,
"step": 27
},
{
"Batch Mean": -0.5335745811462402,
"accuracy": 0.65625,
"epoch": 0.0675,
"step": 27
},
{
"Batch Mean": -0.3728065490722656,
"accuracy": 0.65625,
"epoch": 0.0675,
"step": 27
},
{
"Batch Mean": -0.48749029636383057,
"accuracy": 0.65625,
"epoch": 0.0675,
"step": 27
},
{
"epoch": 0.07,
"grad_norm": 9.99916934967041,
"learning_rate": 2.936842105263158e-06,
"loss": 0.6787,
"step": 28
},
{
"Batch Mean": -0.5768375396728516,
"accuracy": 0.78125,
"epoch": 0.07,
"step": 28
},
{
"Batch Mean": -0.36152684688568115,
"accuracy": 0.71875,
"epoch": 0.07,
"step": 28
},
{
"Batch Mean": -0.6082801818847656,
"accuracy": 0.59375,
"epoch": 0.07,
"step": 28
},
{
"Batch Mean": -0.5176200866699219,
"accuracy": 0.65625,
"epoch": 0.07,
"step": 28
},
{
"epoch": 0.0725,
"grad_norm": 6.558942794799805,
"learning_rate": 2.9289473684210528e-06,
"loss": 0.571,
"step": 29
},
{
"Batch Mean": -0.3009366989135742,
"accuracy": 0.78125,
"epoch": 0.0725,
"step": 29
},
{
"Batch Mean": -0.4234275817871094,
"accuracy": 0.75,
"epoch": 0.0725,
"step": 29
},
{
"Batch Mean": -0.4476432800292969,
"accuracy": 0.78125,
"epoch": 0.0725,
"step": 29
},
{
"Batch Mean": -0.6630382537841797,
"accuracy": 0.71875,
"epoch": 0.0725,
"step": 29
},
{
"epoch": 0.075,
"grad_norm": 5.937437534332275,
"learning_rate": 2.9210526315789475e-06,
"loss": 0.5233,
"step": 30
},
{
"Batch Mean": -0.47089385986328125,
"accuracy": 0.65625,
"epoch": 0.075,
"step": 30
},
{
"Batch Mean": -0.5186127424240112,
"accuracy": 0.78125,
"epoch": 0.075,
"step": 30
},
{
"Batch Mean": -0.5250816345214844,
"accuracy": 0.5625,
"epoch": 0.075,
"step": 30
},
{
"Batch Mean": -0.3480682373046875,
"accuracy": 0.8125,
"epoch": 0.075,
"step": 30
},
{
"epoch": 0.0775,
"grad_norm": 5.8368072509765625,
"learning_rate": 2.9131578947368423e-06,
"loss": 0.5172,
"step": 31
},
{
"Batch Mean": -0.29285621643066406,
"accuracy": 0.8125,
"epoch": 0.0775,
"step": 31
},
{
"Batch Mean": -0.3106422424316406,
"accuracy": 0.84375,
"epoch": 0.0775,
"step": 31
},
{
"Batch Mean": 0.005329132080078125,
"accuracy": 0.71875,
"epoch": 0.0775,
"step": 31
},
{
"Batch Mean": -0.1413421630859375,
"accuracy": 0.78125,
"epoch": 0.0775,
"step": 31
},
{
"epoch": 0.08,
"grad_norm": 5.706140995025635,
"learning_rate": 2.905263157894737e-06,
"loss": 0.5095,
"step": 32
},
{
"Batch Mean": -0.04312324523925781,
"accuracy": 0.6875,
"epoch": 0.08,
"step": 32
},
{
"Batch Mean": -0.10883808135986328,
"accuracy": 0.71875,
"epoch": 0.08,
"step": 32
},
{
"Batch Mean": 0.3197288513183594,
"accuracy": 0.71875,
"epoch": 0.08,
"step": 32
},
{
"Batch Mean": -0.13158416748046875,
"accuracy": 0.65625,
"epoch": 0.08,
"step": 32
},
{
"epoch": 0.0825,
"grad_norm": 6.042052268981934,
"learning_rate": 2.8973684210526318e-06,
"loss": 0.5717,
"step": 33
},
{
"Batch Mean": 0.0721282958984375,
"accuracy": 0.75,
"epoch": 0.0825,
"step": 33
},
{
"Batch Mean": 0.05409049987792969,
"accuracy": 0.71875,
"epoch": 0.0825,
"step": 33
},
{
"Batch Mean": -0.04035043716430664,
"accuracy": 0.625,
"epoch": 0.0825,
"step": 33
},
{
"Batch Mean": -0.04631471633911133,
"accuracy": 0.71875,
"epoch": 0.0825,
"step": 33
},
{
"epoch": 0.085,
"grad_norm": 5.908041954040527,
"learning_rate": 2.8894736842105265e-06,
"loss": 0.5446,
"step": 34
},
{
"Batch Mean": 0.2712249755859375,
"accuracy": 0.84375,
"epoch": 0.085,
"step": 34
},
{
"Batch Mean": 0.179473876953125,
"accuracy": 0.8125,
"epoch": 0.085,
"step": 34
},
{
"Batch Mean": -0.01055145263671875,
"accuracy": 0.78125,
"epoch": 0.085,
"step": 34
},
{
"Batch Mean": 0.06919479370117188,
"accuracy": 0.8125,
"epoch": 0.085,
"step": 34
},
{
"epoch": 0.0875,
"grad_norm": 4.990839958190918,
"learning_rate": 2.8815789473684213e-06,
"loss": 0.4607,
"step": 35
},
{
"Batch Mean": -0.017984390258789062,
"accuracy": 0.78125,
"epoch": 0.0875,
"step": 35
},
{
"Batch Mean": 0.075164794921875,
"accuracy": 0.78125,
"epoch": 0.0875,
"step": 35
},
{
"Batch Mean": 0.20074462890625,
"accuracy": 0.8125,
"epoch": 0.0875,
"step": 35
},
{
"Batch Mean": -0.03507876396179199,
"accuracy": 0.65625,
"epoch": 0.0875,
"step": 35
},
{
"epoch": 0.09,
"grad_norm": 5.7467803955078125,
"learning_rate": 2.873684210526316e-06,
"loss": 0.5038,
"step": 36
},
{
"Batch Mean": 0.2868976593017578,
"accuracy": 0.75,
"epoch": 0.09,
"step": 36
},
{
"Batch Mean": 0.16400146484375,
"accuracy": 0.65625,
"epoch": 0.09,
"step": 36
},
{
"Batch Mean": 0.2293224334716797,
"accuracy": 0.71875,
"epoch": 0.09,
"step": 36
},
{
"Batch Mean": 0.2969036102294922,
"accuracy": 0.6875,
"epoch": 0.09,
"step": 36
},
{
"epoch": 0.0925,
"grad_norm": 6.629448413848877,
"learning_rate": 2.8657894736842103e-06,
"loss": 0.5233,
"step": 37
},
{
"Batch Mean": -0.07112598419189453,
"accuracy": 0.6875,
"epoch": 0.0925,
"step": 37
},
{
"Batch Mean": 0.25348663330078125,
"accuracy": 0.9375,
"epoch": 0.0925,
"step": 37
},
{
"Batch Mean": 0.2884788513183594,
"accuracy": 0.6875,
"epoch": 0.0925,
"step": 37
},
{
"Batch Mean": 0.06340456008911133,
"accuracy": 0.71875,
"epoch": 0.0925,
"step": 37
},
{
"epoch": 0.095,
"grad_norm": 6.545988082885742,
"learning_rate": 2.857894736842105e-06,
"loss": 0.521,
"step": 38
},
{
"Batch Mean": 0.372711181640625,
"accuracy": 0.8125,
"epoch": 0.095,
"step": 38
},
{
"Batch Mean": 0.2590770721435547,
"accuracy": 0.53125,
"epoch": 0.095,
"step": 38
},
{
"Batch Mean": 0.016815185546875,
"accuracy": 0.6875,
"epoch": 0.095,
"step": 38
},
{
"Batch Mean": 0.0049419403076171875,
"accuracy": 0.71875,
"epoch": 0.095,
"step": 38
},
{
"epoch": 0.0975,
"grad_norm": 9.898524284362793,
"learning_rate": 2.85e-06,
"loss": 0.6255,
"step": 39
},
{
"Batch Mean": 0.6515955924987793,
"accuracy": 0.6875,
"epoch": 0.0975,
"step": 39
},
{
"Batch Mean": 0.4063148498535156,
"accuracy": 0.8125,
"epoch": 0.0975,
"step": 39
},
{
"Batch Mean": 0.1270294189453125,
"accuracy": 0.71875,
"epoch": 0.0975,
"step": 39
},
{
"Batch Mean": 0.4789772033691406,
"accuracy": 0.6875,
"epoch": 0.0975,
"step": 39
},
{
"epoch": 0.1,
"grad_norm": 6.953475475311279,
"learning_rate": 2.8421052631578946e-06,
"loss": 0.4934,
"step": 40
},
{
"Batch Mean": 0.25176239013671875,
"accuracy": 0.78125,
"epoch": 0.1,
"step": 40
},
{
"Batch Mean": 0.4009513854980469,
"accuracy": 0.65625,
"epoch": 0.1,
"step": 40
},
{
"Batch Mean": 0.6202306747436523,
"accuracy": 0.78125,
"epoch": 0.1,
"step": 40
},
{
"Batch Mean": 0.2911343574523926,
"accuracy": 0.78125,
"epoch": 0.1,
"step": 40
},
{
"epoch": 0.1025,
"grad_norm": 7.0007123947143555,
"learning_rate": 2.8342105263157897e-06,
"loss": 0.4957,
"step": 41
},
{
"Batch Mean": 0.13779544830322266,
"accuracy": 0.625,
"epoch": 0.1025,
"step": 41
},
{
"Batch Mean": 0.5141849517822266,
"accuracy": 0.84375,
"epoch": 0.1025,
"step": 41
},
{
"Batch Mean": 0.12182235717773438,
"accuracy": 0.71875,
"epoch": 0.1025,
"step": 41
},
{
"Batch Mean": 0.09358537197113037,
"accuracy": 0.65625,
"epoch": 0.1025,
"step": 41
},
{
"epoch": 0.105,
"grad_norm": 8.165699005126953,
"learning_rate": 2.8263157894736845e-06,
"loss": 0.5642,
"step": 42
},
{
"Batch Mean": 0.26740550994873047,
"accuracy": 0.8125,
"epoch": 0.105,
"step": 42
},
{
"Batch Mean": -0.07419204711914062,
"accuracy": 0.71875,
"epoch": 0.105,
"step": 42
},
{
"Batch Mean": 0.2999420166015625,
"accuracy": 0.78125,
"epoch": 0.105,
"step": 42
},
{
"Batch Mean": -0.2398681640625,
"accuracy": 0.75,
"epoch": 0.105,
"step": 42
},
{
"epoch": 0.1075,
"grad_norm": 7.090755939483643,
"learning_rate": 2.8184210526315792e-06,
"loss": 0.5136,
"step": 43
},
{
"Batch Mean": 0.3058357238769531,
"accuracy": 0.875,
"epoch": 0.1075,
"step": 43
},
{
"Batch Mean": 0.10181450843811035,
"accuracy": 0.78125,
"epoch": 0.1075,
"step": 43
},
{
"Batch Mean": -0.07529067993164062,
"accuracy": 0.71875,
"epoch": 0.1075,
"step": 43
},
{
"Batch Mean": 0.46073150634765625,
"accuracy": 0.8125,
"epoch": 0.1075,
"step": 43
},
{
"epoch": 0.11,
"grad_norm": 5.939328670501709,
"learning_rate": 2.810526315789474e-06,
"loss": 0.464,
"step": 44
},
{
"Batch Mean": -0.13095474243164062,
"accuracy": 0.6875,
"epoch": 0.11,
"step": 44
},
{
"Batch Mean": 0.32462239265441895,
"accuracy": 0.875,
"epoch": 0.11,
"step": 44
},
{
"Batch Mean": -0.15337753295898438,
"accuracy": 0.78125,
"epoch": 0.11,
"step": 44
},
{
"Batch Mean": 0.38422298431396484,
"accuracy": 0.71875,
"epoch": 0.11,
"step": 44
},
{
"epoch": 0.1125,
"grad_norm": 6.517725944519043,
"learning_rate": 2.8026315789473687e-06,
"loss": 0.4854,
"step": 45
},
{
"Batch Mean": 0.290924072265625,
"accuracy": 0.625,
"epoch": 0.1125,
"step": 45
},
{
"Batch Mean": 0.03897809982299805,
"accuracy": 0.875,
"epoch": 0.1125,
"step": 45
},
{
"Batch Mean": 0.20547938346862793,
"accuracy": 0.875,
"epoch": 0.1125,
"step": 45
},
{
"Batch Mean": 0.3288555145263672,
"accuracy": 0.75,
"epoch": 0.1125,
"step": 45
},
{
"epoch": 0.115,
"grad_norm": 5.711620330810547,
"learning_rate": 2.7947368421052635e-06,
"loss": 0.4129,
"step": 46
},
{
"Batch Mean": 0.331978440284729,
"accuracy": 0.78125,
"epoch": 0.115,
"step": 46
},
{
"Batch Mean": -0.12884771823883057,
"accuracy": 0.71875,
"epoch": 0.115,
"step": 46
},
{
"Batch Mean": 0.2715787887573242,
"accuracy": 0.65625,
"epoch": 0.115,
"step": 46
},
{
"Batch Mean": 0.3961639404296875,
"accuracy": 0.78125,
"epoch": 0.115,
"step": 46
},
{
"epoch": 0.1175,
"grad_norm": 6.815968036651611,
"learning_rate": 2.7868421052631578e-06,
"loss": 0.5217,
"step": 47
},
{
"Batch Mean": -0.05124783515930176,
"accuracy": 0.84375,
"epoch": 0.1175,
"step": 47
},
{
"Batch Mean": -0.4043617248535156,
"accuracy": 0.78125,
"epoch": 0.1175,
"step": 47
},
{
"Batch Mean": 0.21244239807128906,
"accuracy": 0.78125,
"epoch": 0.1175,
"step": 47
},
{
"Batch Mean": -0.09090805053710938,
"accuracy": 0.75,
"epoch": 0.1175,
"step": 47
},
{
"epoch": 0.12,
"grad_norm": 6.305139541625977,
"learning_rate": 2.7789473684210525e-06,
"loss": 0.4484,
"step": 48
},
{
"Batch Mean": 0.3022747039794922,
"accuracy": 0.65625,
"epoch": 0.12,
"step": 48
},
{
"Batch Mean": -0.013670921325683594,
"accuracy": 0.6875,
"epoch": 0.12,
"step": 48
},
{
"Batch Mean": 0.4046478271484375,
"accuracy": 0.84375,
"epoch": 0.12,
"step": 48
},
{
"Batch Mean": 0.16419363021850586,
"accuracy": 0.84375,
"epoch": 0.12,
"step": 48
},
{
"epoch": 0.1225,
"grad_norm": 5.598595142364502,
"learning_rate": 2.7710526315789473e-06,
"loss": 0.4684,
"step": 49
},
{
"Batch Mean": -0.24893569946289062,
"accuracy": 0.6875,
"epoch": 0.1225,
"step": 49
},
{
"Batch Mean": -0.2393360137939453,
"accuracy": 0.8125,
"epoch": 0.1225,
"step": 49
},
{
"Batch Mean": 0.2698392868041992,
"accuracy": 0.8125,
"epoch": 0.1225,
"step": 49
},
{
"Batch Mean": -0.3564453125,
"accuracy": 0.75,
"epoch": 0.1225,
"step": 49
},
{
"epoch": 0.125,
"grad_norm": 6.394057750701904,
"learning_rate": 2.763157894736842e-06,
"loss": 0.4703,
"step": 50
},
{
"Batch Mean": -0.09824085235595703,
"accuracy": 0.71875,
"epoch": 0.125,
"step": 50
},
{
"Batch Mean": -0.1602630615234375,
"accuracy": 0.8125,
"epoch": 0.125,
"step": 50
},
{
"Batch Mean": -0.6205692291259766,
"accuracy": 0.625,
"epoch": 0.125,
"step": 50
},
{
"Batch Mean": 0.06192302703857422,
"accuracy": 0.75,
"epoch": 0.125,
"step": 50
},
{
"epoch": 0.1275,
"grad_norm": 7.542079925537109,
"learning_rate": 2.7552631578947368e-06,
"loss": 0.4731,
"step": 51
},
{
"Batch Mean": -0.24329090118408203,
"accuracy": 0.71875,
"epoch": 0.1275,
"step": 51
},
{
"Batch Mean": 0.277587890625,
"accuracy": 0.78125,
"epoch": 0.1275,
"step": 51
},
{
"Batch Mean": -0.1536083221435547,
"accuracy": 0.8125,
"epoch": 0.1275,
"step": 51
},
{
"Batch Mean": -0.2829427719116211,
"accuracy": 0.90625,
"epoch": 0.1275,
"step": 51
},
{
"epoch": 0.13,
"grad_norm": 6.608920097351074,
"learning_rate": 2.7473684210526315e-06,
"loss": 0.4472,
"step": 52
},
{
"Batch Mean": -0.2534487247467041,
"accuracy": 0.78125,
"epoch": 0.13,
"step": 52
},
{
"Batch Mean": -0.3897590637207031,
"accuracy": 0.6875,
"epoch": 0.13,
"step": 52
},
{
"Batch Mean": 0.0982666015625,
"accuracy": 0.75,
"epoch": 0.13,
"step": 52
},
{
"Batch Mean": -0.19083404541015625,
"accuracy": 0.78125,
"epoch": 0.13,
"step": 52
},
{
"epoch": 0.1325,
"grad_norm": 9.115386962890625,
"learning_rate": 2.7394736842105263e-06,
"loss": 0.4964,
"step": 53
},
{
"Batch Mean": -0.07914352416992188,
"accuracy": 0.84375,
"epoch": 0.1325,
"step": 53
},
{
"Batch Mean": -0.8162860870361328,
"accuracy": 0.75,
"epoch": 0.1325,
"step": 53
},
{
"Batch Mean": -0.9538593292236328,
"accuracy": 0.71875,
"epoch": 0.1325,
"step": 53
},
{
"Batch Mean": 0.025072097778320312,
"accuracy": 0.78125,
"epoch": 0.1325,
"step": 53
},
{
"epoch": 0.135,
"grad_norm": 9.654952049255371,
"learning_rate": 2.7315789473684214e-06,
"loss": 0.4771,
"step": 54
},
{
"Batch Mean": -0.2607238292694092,
"accuracy": 0.65625,
"epoch": 0.135,
"step": 54
},
{
"Batch Mean": 0.07077789306640625,
"accuracy": 0.875,
"epoch": 0.135,
"step": 54
},
{
"Batch Mean": -0.2121124267578125,
"accuracy": 0.84375,
"epoch": 0.135,
"step": 54
},
{
"Batch Mean": 0.040355682373046875,
"accuracy": 0.6875,
"epoch": 0.135,
"step": 54
},
{
"epoch": 0.1375,
"grad_norm": 9.226275444030762,
"learning_rate": 2.723684210526316e-06,
"loss": 0.477,
"step": 55
},
{
"Batch Mean": 0.14949023723602295,
"accuracy": 0.75,
"epoch": 0.1375,
"step": 55
},
{
"Batch Mean": -0.2880672216415405,
"accuracy": 0.875,
"epoch": 0.1375,
"step": 55
},
{
"Batch Mean": -0.037652015686035156,
"accuracy": 0.75,
"epoch": 0.1375,
"step": 55
},
{
"Batch Mean": 0.11230850219726562,
"accuracy": 0.78125,
"epoch": 0.1375,
"step": 55
},
{
"epoch": 0.14,
"grad_norm": 8.696858406066895,
"learning_rate": 2.715789473684211e-06,
"loss": 0.4385,
"step": 56
},
{
"Batch Mean": -0.14437103271484375,
"accuracy": 0.84375,
"epoch": 0.14,
"step": 56
},
{
"Batch Mean": -0.3502960205078125,
"accuracy": 0.78125,
"epoch": 0.14,
"step": 56
},
{
"Batch Mean": -0.3359222412109375,
"accuracy": 0.65625,
"epoch": 0.14,
"step": 56
},
{
"Batch Mean": -0.4460906982421875,
"accuracy": 0.71875,
"epoch": 0.14,
"step": 56
},
{
"epoch": 0.1425,
"grad_norm": 10.204813003540039,
"learning_rate": 2.7078947368421052e-06,
"loss": 0.4971,
"step": 57
},
{
"Batch Mean": -0.0918121337890625,
"accuracy": 0.75,
"epoch": 0.1425,
"step": 57
},
{
"Batch Mean": 0.1797332763671875,
"accuracy": 0.78125,
"epoch": 0.1425,
"step": 57
},
{
"Batch Mean": -0.22362709045410156,
"accuracy": 0.65625,
"epoch": 0.1425,
"step": 57
},
{
"Batch Mean": -0.932403564453125,
"accuracy": 0.78125,
"epoch": 0.1425,
"step": 57
},
{
"epoch": 0.145,
"grad_norm": 9.547924995422363,
"learning_rate": 2.7e-06,
"loss": 0.5235,
"step": 58
},
{
"Batch Mean": -0.69256591796875,
"accuracy": 0.75,
"epoch": 0.145,
"step": 58
},
{
"Batch Mean": -0.408052921295166,
"accuracy": 0.71875,
"epoch": 0.145,
"step": 58
},
{
"Batch Mean": -0.7247238159179688,
"accuracy": 0.75,
"epoch": 0.145,
"step": 58
},
{
"Batch Mean": -0.5294733047485352,
"accuracy": 0.8125,
"epoch": 0.145,
"step": 58
},
{
"epoch": 0.1475,
"grad_norm": 8.18185043334961,
"learning_rate": 2.6921052631578947e-06,
"loss": 0.4697,
"step": 59
},
{
"Batch Mean": -0.562103271484375,
"accuracy": 0.75,
"epoch": 0.1475,
"step": 59
},
{
"Batch Mean": -0.36240386962890625,
"accuracy": 0.71875,
"epoch": 0.1475,
"step": 59
},
{
"Batch Mean": -0.8479537963867188,
"accuracy": 0.75,
"epoch": 0.1475,
"step": 59
},
{
"Batch Mean": -0.5514106750488281,
"accuracy": 0.8125,
"epoch": 0.1475,
"step": 59
},
{
"epoch": 0.15,
"grad_norm": 9.638142585754395,
"learning_rate": 2.6842105263157895e-06,
"loss": 0.4854,
"step": 60
},
{
"Batch Mean": -0.9713249206542969,
"accuracy": 0.875,
"epoch": 0.15,
"step": 60
},
{
"Batch Mean": -1.4701347351074219,
"accuracy": 0.8125,
"epoch": 0.15,
"step": 60
},
{
"Batch Mean": -0.8054180145263672,
"accuracy": 0.6875,
"epoch": 0.15,
"step": 60
},
{
"Batch Mean": -1.1165752410888672,
"accuracy": 0.875,
"epoch": 0.15,
"step": 60
},
{
"epoch": 0.1525,
"grad_norm": 9.138744354248047,
"learning_rate": 2.6763157894736842e-06,
"loss": 0.4093,
"step": 61
},
{
"Batch Mean": -1.2550277709960938,
"accuracy": 0.78125,
"epoch": 0.1525,
"step": 61
},
{
"Batch Mean": -0.9237594604492188,
"accuracy": 0.6875,
"epoch": 0.1525,
"step": 61
},
{
"Batch Mean": -0.9178142547607422,
"accuracy": 0.875,
"epoch": 0.1525,
"step": 61
},
{
"Batch Mean": -0.8621349334716797,
"accuracy": 0.75,
"epoch": 0.1525,
"step": 61
},
{
"epoch": 0.155,
"grad_norm": 9.812451362609863,
"learning_rate": 2.668421052631579e-06,
"loss": 0.4354,
"step": 62
},
{
"Batch Mean": -1.3034553527832031,
"accuracy": 0.78125,
"epoch": 0.155,
"step": 62
},
{
"Batch Mean": -1.0795440673828125,
"accuracy": 0.78125,
"epoch": 0.155,
"step": 62
},
{
"Batch Mean": -1.0960693359375,
"accuracy": 0.84375,
"epoch": 0.155,
"step": 62
},
{
"Batch Mean": -1.2091312408447266,
"accuracy": 0.6875,
"epoch": 0.155,
"step": 62
},
{
"epoch": 0.1575,
"grad_norm": 9.518035888671875,
"learning_rate": 2.6605263157894737e-06,
"loss": 0.4399,
"step": 63
},
{
"Batch Mean": -1.405853271484375,
"accuracy": 0.6875,
"epoch": 0.1575,
"step": 63
},
{
"Batch Mean": -1.4421844482421875,
"accuracy": 0.71875,
"epoch": 0.1575,
"step": 63
},
{
"Batch Mean": -1.2391834259033203,
"accuracy": 0.6875,
"epoch": 0.1575,
"step": 63
},
{
"Batch Mean": -0.881195068359375,
"accuracy": 0.8125,
"epoch": 0.1575,
"step": 63
},
{
"epoch": 0.16,
"grad_norm": 10.348162651062012,
"learning_rate": 2.6526315789473685e-06,
"loss": 0.537,
"step": 64
},
{
"Batch Mean": -1.0128021240234375,
"accuracy": 0.84375,
"epoch": 0.16,
"step": 64
},
{
"Batch Mean": -1.0150184631347656,
"accuracy": 0.84375,
"epoch": 0.16,
"step": 64
},
{
"Batch Mean": -1.497243881225586,
"accuracy": 0.78125,
"epoch": 0.16,
"step": 64
},
{
"Batch Mean": -0.910819947719574,
"accuracy": 0.75,
"epoch": 0.16,
"step": 64
},
{
"epoch": 0.1625,
"grad_norm": 8.633638381958008,
"learning_rate": 2.644736842105263e-06,
"loss": 0.4436,
"step": 65
},
{
"Batch Mean": -1.0223121643066406,
"accuracy": 0.71875,
"epoch": 0.1625,
"step": 65
},
{
"Batch Mean": -0.5706081390380859,
"accuracy": 0.78125,
"epoch": 0.1625,
"step": 65
},
{
"Batch Mean": -0.965911865234375,
"accuracy": 0.8125,
"epoch": 0.1625,
"step": 65
},
{
"Batch Mean": -0.7304267883300781,
"accuracy": 0.8125,
"epoch": 0.1625,
"step": 65
},
{
"epoch": 0.165,
"grad_norm": 8.088103294372559,
"learning_rate": 2.636842105263158e-06,
"loss": 0.4446,
"step": 66
},
{
"Batch Mean": -0.4677067697048187,
"accuracy": 0.8125,
"epoch": 0.165,
"step": 66
},
{
"Batch Mean": -1.4533824920654297,
"accuracy": 0.78125,
"epoch": 0.165,
"step": 66
},
{
"Batch Mean": -0.78509521484375,
"accuracy": 0.84375,
"epoch": 0.165,
"step": 66
},
{
"Batch Mean": -0.8427619934082031,
"accuracy": 0.78125,
"epoch": 0.165,
"step": 66
},
{
"epoch": 0.1675,
"grad_norm": 7.766864776611328,
"learning_rate": 2.6289473684210527e-06,
"loss": 0.412,
"step": 67
},
{
"Batch Mean": -1.0267219543457031,
"accuracy": 0.75,
"epoch": 0.1675,
"step": 67
},
{
"Batch Mean": -0.0344390869140625,
"accuracy": 0.78125,
"epoch": 0.1675,
"step": 67
},
{
"Batch Mean": -0.7120513916015625,
"accuracy": 0.8125,
"epoch": 0.1675,
"step": 67
},
{
"Batch Mean": -0.8848686218261719,
"accuracy": 0.75,
"epoch": 0.1675,
"step": 67
},
{
"epoch": 0.17,
"grad_norm": 8.952485084533691,
"learning_rate": 2.6210526315789474e-06,
"loss": 0.4073,
"step": 68
},
{
"Batch Mean": -1.2683296203613281,
"accuracy": 0.78125,
"epoch": 0.17,
"step": 68
},
{
"Batch Mean": -0.9470596313476562,
"accuracy": 0.6875,
"epoch": 0.17,
"step": 68
},
{
"Batch Mean": -1.2335700988769531,
"accuracy": 0.78125,
"epoch": 0.17,
"step": 68
},
{
"Batch Mean": -0.9984736442565918,
"accuracy": 0.71875,
"epoch": 0.17,
"step": 68
},
{
"epoch": 0.1725,
"grad_norm": 8.944815635681152,
"learning_rate": 2.613157894736842e-06,
"loss": 0.4827,
"step": 69
},
{
"Batch Mean": -0.6530609130859375,
"accuracy": 0.8125,
"epoch": 0.1725,
"step": 69
},
{
"Batch Mean": -0.6013336181640625,
"accuracy": 0.6875,
"epoch": 0.1725,
"step": 69
},
{
"Batch Mean": -1.4489421844482422,
"accuracy": 0.8125,
"epoch": 0.1725,
"step": 69
},
{
"Batch Mean": -0.9736480712890625,
"accuracy": 0.90625,
"epoch": 0.1725,
"step": 69
},
{
"epoch": 0.175,
"grad_norm": 8.779143333435059,
"learning_rate": 2.605263157894737e-06,
"loss": 0.4578,
"step": 70
},
{
"Batch Mean": -0.9107780456542969,
"accuracy": 0.84375,
"epoch": 0.175,
"step": 70
},
{
"Batch Mean": -1.1361122131347656,
"accuracy": 0.6875,
"epoch": 0.175,
"step": 70
},
{
"Batch Mean": -0.6527862548828125,
"accuracy": 0.9375,
"epoch": 0.175,
"step": 70
},
{
"Batch Mean": -0.7553470134735107,
"accuracy": 0.6875,
"epoch": 0.175,
"step": 70
},
{
"epoch": 0.1775,
"grad_norm": 8.647814750671387,
"learning_rate": 2.5973684210526317e-06,
"loss": 0.4257,
"step": 71
},
{
"Batch Mean": -0.41971588134765625,
"accuracy": 0.75,
"epoch": 0.1775,
"step": 71
},
{
"Batch Mean": -0.705718994140625,
"accuracy": 0.875,
"epoch": 0.1775,
"step": 71
},
{
"Batch Mean": -1.0686330795288086,
"accuracy": 0.71875,
"epoch": 0.1775,
"step": 71
},
{
"Batch Mean": -0.8464865684509277,
"accuracy": 0.78125,
"epoch": 0.1775,
"step": 71
},
{
"epoch": 0.18,
"grad_norm": 8.784235000610352,
"learning_rate": 2.5894736842105264e-06,
"loss": 0.3921,
"step": 72
},
{
"Batch Mean": -0.7266769409179688,
"accuracy": 0.8125,
"epoch": 0.18,
"step": 72
},
{
"Batch Mean": -0.7239456176757812,
"accuracy": 0.8125,
"epoch": 0.18,
"step": 72
},
{
"Batch Mean": -0.6862373352050781,
"accuracy": 0.8125,
"epoch": 0.18,
"step": 72
},
{
"Batch Mean": -0.525360107421875,
"accuracy": 0.875,
"epoch": 0.18,
"step": 72
},
{
"epoch": 0.1825,
"grad_norm": 7.80237340927124,
"learning_rate": 2.581578947368421e-06,
"loss": 0.374,
"step": 73
},
{
"Batch Mean": -0.9130859375,
"accuracy": 0.84375,
"epoch": 0.1825,
"step": 73
},
{
"Batch Mean": -0.35595703125,
"accuracy": 0.875,
"epoch": 0.1825,
"step": 73
},
{
"Batch Mean": -0.8892440795898438,
"accuracy": 0.6875,
"epoch": 0.1825,
"step": 73
},
{
"Batch Mean": -0.4263725280761719,
"accuracy": 0.71875,
"epoch": 0.1825,
"step": 73
},
{
"epoch": 0.185,
"grad_norm": 7.894434452056885,
"learning_rate": 2.573684210526316e-06,
"loss": 0.4405,
"step": 74
},
{
"Batch Mean": -0.6322441101074219,
"accuracy": 0.8125,
"epoch": 0.185,
"step": 74
},
{
"Batch Mean": -0.456390380859375,
"accuracy": 0.875,
"epoch": 0.185,
"step": 74
},
{
"Batch Mean": -1.1346385478973389,
"accuracy": 0.84375,
"epoch": 0.185,
"step": 74
},
{
"Batch Mean": -0.8706645965576172,
"accuracy": 0.8125,
"epoch": 0.185,
"step": 74
},
{
"epoch": 0.1875,
"grad_norm": 8.292348861694336,
"learning_rate": 2.5657894736842107e-06,
"loss": 0.3895,
"step": 75
},
{
"Batch Mean": -1.0383148193359375,
"accuracy": 0.8125,
"epoch": 0.1875,
"step": 75
},
{
"Batch Mean": 0.19762420654296875,
"accuracy": 0.78125,
"epoch": 0.1875,
"step": 75
},
{
"Batch Mean": 0.10494613647460938,
"accuracy": 0.78125,
"epoch": 0.1875,
"step": 75
},
{
"Batch Mean": -0.5181140899658203,
"accuracy": 0.84375,
"epoch": 0.1875,
"step": 75
},
{
"epoch": 0.19,
"grad_norm": 8.882412910461426,
"learning_rate": 2.5578947368421054e-06,
"loss": 0.3805,
"step": 76
},
{
"Batch Mean": -0.4382622241973877,
"accuracy": 0.75,
"epoch": 0.19,
"step": 76
},
{
"Batch Mean": -0.16241741180419922,
"accuracy": 0.90625,
"epoch": 0.19,
"step": 76
},
{
"Batch Mean": -0.8776988983154297,
"accuracy": 0.875,
"epoch": 0.19,
"step": 76
},
{
"Batch Mean": -0.6540908813476562,
"accuracy": 0.8125,
"epoch": 0.19,
"step": 76
},
{
"epoch": 0.1925,
"grad_norm": 10.025094032287598,
"learning_rate": 2.55e-06,
"loss": 0.4001,
"step": 77
},
{
"Batch Mean": 0.06690788269042969,
"accuracy": 0.78125,
"epoch": 0.1925,
"step": 77
},
{
"Batch Mean": -0.03551149368286133,
"accuracy": 0.75,
"epoch": 0.1925,
"step": 77
},
{
"Batch Mean": 0.17040252685546875,
"accuracy": 0.78125,
"epoch": 0.1925,
"step": 77
},
{
"Batch Mean": -0.019598007202148438,
"accuracy": 0.90625,
"epoch": 0.1925,
"step": 77
},
{
"epoch": 0.195,
"grad_norm": 9.164822578430176,
"learning_rate": 2.542105263157895e-06,
"loss": 0.3807,
"step": 78
},
{
"Batch Mean": 0.0378570556640625,
"accuracy": 0.75,
"epoch": 0.195,
"step": 78
},
{
"Batch Mean": 0.3024101257324219,
"accuracy": 0.75,
"epoch": 0.195,
"step": 78
},
{
"Batch Mean": 0.1015625,
"accuracy": 0.6875,
"epoch": 0.195,
"step": 78
},
{
"Batch Mean": 0.10402488708496094,
"accuracy": 0.875,
"epoch": 0.195,
"step": 78
},
{
"epoch": 0.1975,
"grad_norm": 9.871844291687012,
"learning_rate": 2.5342105263157892e-06,
"loss": 0.4781,
"step": 79
},
{
"Batch Mean": 0.1356794238090515,
"accuracy": 0.8125,
"epoch": 0.1975,
"step": 79
},
{
"Batch Mean": 0.0782623291015625,
"accuracy": 0.78125,
"epoch": 0.1975,
"step": 79
},
{
"Batch Mean": -0.12647247314453125,
"accuracy": 0.8125,
"epoch": 0.1975,
"step": 79
},
{
"Batch Mean": 0.2567100524902344,
"accuracy": 0.78125,
"epoch": 0.1975,
"step": 79
},
{
"epoch": 0.2,
"grad_norm": 9.033759117126465,
"learning_rate": 2.526315789473684e-06,
"loss": 0.4288,
"step": 80
}
],
"logging_steps": 1,
"max_steps": 400,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 80,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}