|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.2, |
|
"eval_steps": 500, |
|
"global_step": 80, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"Batch Mean": -1.4581298828125, |
|
"accuracy": 0.28125, |
|
"epoch": 0, |
|
"step": 0 |
|
}, |
|
{ |
|
"Batch Mean": -1.4786376953125, |
|
"accuracy": 0.46875, |
|
"epoch": 0, |
|
"step": 0 |
|
}, |
|
{ |
|
"Batch Mean": -1.486572265625, |
|
"accuracy": 0.5, |
|
"epoch": 0, |
|
"step": 0 |
|
}, |
|
{ |
|
"Batch Mean": -1.439697265625, |
|
"accuracy": 0.625, |
|
"epoch": 0, |
|
"step": 0 |
|
}, |
|
{ |
|
"epoch": 0.0025, |
|
"grad_norm": 2.7191572189331055, |
|
"learning_rate": 1.5000000000000002e-07, |
|
"loss": 0.6927, |
|
"step": 1 |
|
}, |
|
{ |
|
"Batch Mean": -1.4107666015625, |
|
"accuracy": 0.4375, |
|
"epoch": 0.0025, |
|
"step": 1 |
|
}, |
|
{ |
|
"Batch Mean": -1.4342041015625, |
|
"accuracy": 0.5, |
|
"epoch": 0.0025, |
|
"step": 1 |
|
}, |
|
{ |
|
"Batch Mean": -1.45263671875, |
|
"accuracy": 0.5625, |
|
"epoch": 0.0025, |
|
"step": 1 |
|
}, |
|
{ |
|
"Batch Mean": -1.4517822265625, |
|
"accuracy": 0.5625, |
|
"epoch": 0.0025, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.005, |
|
"grad_norm": 3.204066038131714, |
|
"learning_rate": 3.0000000000000004e-07, |
|
"loss": 0.6964, |
|
"step": 2 |
|
}, |
|
{ |
|
"Batch Mean": -1.4908447265625, |
|
"accuracy": 0.59375, |
|
"epoch": 0.005, |
|
"step": 2 |
|
}, |
|
{ |
|
"Batch Mean": -1.425048828125, |
|
"accuracy": 0.4375, |
|
"epoch": 0.005, |
|
"step": 2 |
|
}, |
|
{ |
|
"Batch Mean": -1.464111328125, |
|
"accuracy": 0.375, |
|
"epoch": 0.005, |
|
"step": 2 |
|
}, |
|
{ |
|
"Batch Mean": -1.4324951171875, |
|
"accuracy": 0.59375, |
|
"epoch": 0.005, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0075, |
|
"grad_norm": 3.103353261947632, |
|
"learning_rate": 4.5e-07, |
|
"loss": 0.6991, |
|
"step": 3 |
|
}, |
|
{ |
|
"Batch Mean": -1.494140625, |
|
"accuracy": 0.46875, |
|
"epoch": 0.0075, |
|
"step": 3 |
|
}, |
|
{ |
|
"Batch Mean": -1.4178466796875, |
|
"accuracy": 0.625, |
|
"epoch": 0.0075, |
|
"step": 3 |
|
}, |
|
{ |
|
"Batch Mean": -1.520751953125, |
|
"accuracy": 0.59375, |
|
"epoch": 0.0075, |
|
"step": 3 |
|
}, |
|
{ |
|
"Batch Mean": -1.4844970703125, |
|
"accuracy": 0.5625, |
|
"epoch": 0.0075, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.3672587871551514, |
|
"learning_rate": 6.000000000000001e-07, |
|
"loss": 0.6883, |
|
"step": 4 |
|
}, |
|
{ |
|
"Batch Mean": -1.4312744140625, |
|
"accuracy": 0.4375, |
|
"epoch": 0.01, |
|
"step": 4 |
|
}, |
|
{ |
|
"Batch Mean": -1.4820556640625, |
|
"accuracy": 0.5625, |
|
"epoch": 0.01, |
|
"step": 4 |
|
}, |
|
{ |
|
"Batch Mean": -1.4405517578125, |
|
"accuracy": 0.5, |
|
"epoch": 0.01, |
|
"step": 4 |
|
}, |
|
{ |
|
"Batch Mean": -1.4302978515625, |
|
"accuracy": 0.53125, |
|
"epoch": 0.01, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0125, |
|
"grad_norm": 3.158576011657715, |
|
"learning_rate": 7.5e-07, |
|
"loss": 0.7012, |
|
"step": 5 |
|
}, |
|
{ |
|
"Batch Mean": -1.4569091796875, |
|
"accuracy": 0.40625, |
|
"epoch": 0.0125, |
|
"step": 5 |
|
}, |
|
{ |
|
"Batch Mean": -1.46435546875, |
|
"accuracy": 0.5, |
|
"epoch": 0.0125, |
|
"step": 5 |
|
}, |
|
{ |
|
"Batch Mean": -1.4354248046875, |
|
"accuracy": 0.5625, |
|
"epoch": 0.0125, |
|
"step": 5 |
|
}, |
|
{ |
|
"Batch Mean": -1.47412109375, |
|
"accuracy": 0.40625, |
|
"epoch": 0.0125, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.015, |
|
"grad_norm": 4.888192176818848, |
|
"learning_rate": 9e-07, |
|
"loss": 0.7118, |
|
"step": 6 |
|
}, |
|
{ |
|
"Batch Mean": -1.4361572265625, |
|
"accuracy": 0.53125, |
|
"epoch": 0.015, |
|
"step": 6 |
|
}, |
|
{ |
|
"Batch Mean": -1.4234619140625, |
|
"accuracy": 0.625, |
|
"epoch": 0.015, |
|
"step": 6 |
|
}, |
|
{ |
|
"Batch Mean": -1.4453125, |
|
"accuracy": 0.375, |
|
"epoch": 0.015, |
|
"step": 6 |
|
}, |
|
{ |
|
"Batch Mean": -1.44287109375, |
|
"accuracy": 0.5, |
|
"epoch": 0.015, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0175, |
|
"grad_norm": 3.654751777648926, |
|
"learning_rate": 1.05e-06, |
|
"loss": 0.6901, |
|
"step": 7 |
|
}, |
|
{ |
|
"Batch Mean": -1.4200439453125, |
|
"accuracy": 0.53125, |
|
"epoch": 0.0175, |
|
"step": 7 |
|
}, |
|
{ |
|
"Batch Mean": -1.406494140625, |
|
"accuracy": 0.4375, |
|
"epoch": 0.0175, |
|
"step": 7 |
|
}, |
|
{ |
|
"Batch Mean": -1.4012451171875, |
|
"accuracy": 0.53125, |
|
"epoch": 0.0175, |
|
"step": 7 |
|
}, |
|
{ |
|
"Batch Mean": -1.4122314453125, |
|
"accuracy": 0.4375, |
|
"epoch": 0.0175, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.2707793712615967, |
|
"learning_rate": 1.2000000000000002e-06, |
|
"loss": 0.7026, |
|
"step": 8 |
|
}, |
|
{ |
|
"Batch Mean": -1.400634765625, |
|
"accuracy": 0.5625, |
|
"epoch": 0.02, |
|
"step": 8 |
|
}, |
|
{ |
|
"Batch Mean": -1.3936767578125, |
|
"accuracy": 0.46875, |
|
"epoch": 0.02, |
|
"step": 8 |
|
}, |
|
{ |
|
"Batch Mean": -1.4110107421875, |
|
"accuracy": 0.34375, |
|
"epoch": 0.02, |
|
"step": 8 |
|
}, |
|
{ |
|
"Batch Mean": -1.4215087890625, |
|
"accuracy": 0.625, |
|
"epoch": 0.02, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0225, |
|
"grad_norm": 3.053551197052002, |
|
"learning_rate": 1.35e-06, |
|
"loss": 0.6859, |
|
"step": 9 |
|
}, |
|
{ |
|
"Batch Mean": -1.35302734375, |
|
"accuracy": 0.5625, |
|
"epoch": 0.0225, |
|
"step": 9 |
|
}, |
|
{ |
|
"Batch Mean": -1.35003662109375, |
|
"accuracy": 0.40625, |
|
"epoch": 0.0225, |
|
"step": 9 |
|
}, |
|
{ |
|
"Batch Mean": -1.39306640625, |
|
"accuracy": 0.5625, |
|
"epoch": 0.0225, |
|
"step": 9 |
|
}, |
|
{ |
|
"Batch Mean": -1.3843994140625, |
|
"accuracy": 0.53125, |
|
"epoch": 0.0225, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.025, |
|
"grad_norm": 2.9442760944366455, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.6853, |
|
"step": 10 |
|
}, |
|
{ |
|
"Batch Mean": -1.31396484375, |
|
"accuracy": 0.46875, |
|
"epoch": 0.025, |
|
"step": 10 |
|
}, |
|
{ |
|
"Batch Mean": -1.33154296875, |
|
"accuracy": 0.5625, |
|
"epoch": 0.025, |
|
"step": 10 |
|
}, |
|
{ |
|
"Batch Mean": -1.3260498046875, |
|
"accuracy": 0.46875, |
|
"epoch": 0.025, |
|
"step": 10 |
|
}, |
|
{ |
|
"Batch Mean": -1.3170166015625, |
|
"accuracy": 0.4375, |
|
"epoch": 0.025, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0275, |
|
"grad_norm": 2.729567050933838, |
|
"learning_rate": 1.65e-06, |
|
"loss": 0.6946, |
|
"step": 11 |
|
}, |
|
{ |
|
"Batch Mean": -1.24346923828125, |
|
"accuracy": 0.4375, |
|
"epoch": 0.0275, |
|
"step": 11 |
|
}, |
|
{ |
|
"Batch Mean": -1.239013671875, |
|
"accuracy": 0.59375, |
|
"epoch": 0.0275, |
|
"step": 11 |
|
}, |
|
{ |
|
"Batch Mean": -1.3074951171875, |
|
"accuracy": 0.5, |
|
"epoch": 0.0275, |
|
"step": 11 |
|
}, |
|
{ |
|
"Batch Mean": -1.24664306640625, |
|
"accuracy": 0.53125, |
|
"epoch": 0.0275, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.8832643032073975, |
|
"learning_rate": 1.8e-06, |
|
"loss": 0.6869, |
|
"step": 12 |
|
}, |
|
{ |
|
"Batch Mean": -1.2061767578125, |
|
"accuracy": 0.625, |
|
"epoch": 0.03, |
|
"step": 12 |
|
}, |
|
{ |
|
"Batch Mean": -1.09735107421875, |
|
"accuracy": 0.5625, |
|
"epoch": 0.03, |
|
"step": 12 |
|
}, |
|
{ |
|
"Batch Mean": -1.1669921875, |
|
"accuracy": 0.625, |
|
"epoch": 0.03, |
|
"step": 12 |
|
}, |
|
{ |
|
"Batch Mean": -1.107421875, |
|
"accuracy": 0.53125, |
|
"epoch": 0.03, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0325, |
|
"grad_norm": 3.347060441970825, |
|
"learning_rate": 1.95e-06, |
|
"loss": 0.676, |
|
"step": 13 |
|
}, |
|
{ |
|
"Batch Mean": -0.99713134765625, |
|
"accuracy": 0.53125, |
|
"epoch": 0.0325, |
|
"step": 13 |
|
}, |
|
{ |
|
"Batch Mean": -0.992431640625, |
|
"accuracy": 0.6875, |
|
"epoch": 0.0325, |
|
"step": 13 |
|
}, |
|
{ |
|
"Batch Mean": -1.08367919921875, |
|
"accuracy": 0.65625, |
|
"epoch": 0.0325, |
|
"step": 13 |
|
}, |
|
{ |
|
"Batch Mean": -1.073486328125, |
|
"accuracy": 0.5625, |
|
"epoch": 0.0325, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.035, |
|
"grad_norm": 3.0629279613494873, |
|
"learning_rate": 2.1e-06, |
|
"loss": 0.6446, |
|
"step": 14 |
|
}, |
|
{ |
|
"Batch Mean": -1.027008056640625, |
|
"accuracy": 0.5625, |
|
"epoch": 0.035, |
|
"step": 14 |
|
}, |
|
{ |
|
"Batch Mean": -1.04302978515625, |
|
"accuracy": 0.625, |
|
"epoch": 0.035, |
|
"step": 14 |
|
}, |
|
{ |
|
"Batch Mean": -0.986724853515625, |
|
"accuracy": 0.71875, |
|
"epoch": 0.035, |
|
"step": 14 |
|
}, |
|
{ |
|
"Batch Mean": -1.010406494140625, |
|
"accuracy": 0.65625, |
|
"epoch": 0.035, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0375, |
|
"grad_norm": 3.297088146209717, |
|
"learning_rate": 2.25e-06, |
|
"loss": 0.6466, |
|
"step": 15 |
|
}, |
|
{ |
|
"Batch Mean": -0.945648193359375, |
|
"accuracy": 0.625, |
|
"epoch": 0.0375, |
|
"step": 15 |
|
}, |
|
{ |
|
"Batch Mean": -0.90460205078125, |
|
"accuracy": 0.625, |
|
"epoch": 0.0375, |
|
"step": 15 |
|
}, |
|
{ |
|
"Batch Mean": -0.9103546142578125, |
|
"accuracy": 0.625, |
|
"epoch": 0.0375, |
|
"step": 15 |
|
}, |
|
{ |
|
"Batch Mean": -0.84765625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.0375, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.339815855026245, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 0.6261, |
|
"step": 16 |
|
}, |
|
{ |
|
"Batch Mean": -0.7341957092285156, |
|
"accuracy": 0.65625, |
|
"epoch": 0.04, |
|
"step": 16 |
|
}, |
|
{ |
|
"Batch Mean": -0.6576881408691406, |
|
"accuracy": 0.8125, |
|
"epoch": 0.04, |
|
"step": 16 |
|
}, |
|
{ |
|
"Batch Mean": -0.7573471069335938, |
|
"accuracy": 0.75, |
|
"epoch": 0.04, |
|
"step": 16 |
|
}, |
|
{ |
|
"Batch Mean": -0.8988265991210938, |
|
"accuracy": 0.84375, |
|
"epoch": 0.04, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.0425, |
|
"grad_norm": 4.010303974151611, |
|
"learning_rate": 2.55e-06, |
|
"loss": 0.6324, |
|
"step": 17 |
|
}, |
|
{ |
|
"Batch Mean": -0.45727968215942383, |
|
"accuracy": 0.59375, |
|
"epoch": 0.0425, |
|
"step": 17 |
|
}, |
|
{ |
|
"Batch Mean": -0.40456533432006836, |
|
"accuracy": 0.6875, |
|
"epoch": 0.0425, |
|
"step": 17 |
|
}, |
|
{ |
|
"Batch Mean": -0.4847888946533203, |
|
"accuracy": 0.59375, |
|
"epoch": 0.0425, |
|
"step": 17 |
|
}, |
|
{ |
|
"Batch Mean": -0.31931304931640625, |
|
"accuracy": 0.65625, |
|
"epoch": 0.0425, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.045, |
|
"grad_norm": 4.431520462036133, |
|
"learning_rate": 2.7e-06, |
|
"loss": 0.62, |
|
"step": 18 |
|
}, |
|
{ |
|
"Batch Mean": -0.0693979263305664, |
|
"accuracy": 0.8125, |
|
"epoch": 0.045, |
|
"step": 18 |
|
}, |
|
{ |
|
"Batch Mean": -0.23062896728515625, |
|
"accuracy": 0.625, |
|
"epoch": 0.045, |
|
"step": 18 |
|
}, |
|
{ |
|
"Batch Mean": -0.10647201538085938, |
|
"accuracy": 0.65625, |
|
"epoch": 0.045, |
|
"step": 18 |
|
}, |
|
{ |
|
"Batch Mean": -0.07384902238845825, |
|
"accuracy": 0.6875, |
|
"epoch": 0.045, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0475, |
|
"grad_norm": 5.421309947967529, |
|
"learning_rate": 2.85e-06, |
|
"loss": 0.5896, |
|
"step": 19 |
|
}, |
|
{ |
|
"Batch Mean": -0.12799835205078125, |
|
"accuracy": 0.6875, |
|
"epoch": 0.0475, |
|
"step": 19 |
|
}, |
|
{ |
|
"Batch Mean": 0.0684967041015625, |
|
"accuracy": 0.875, |
|
"epoch": 0.0475, |
|
"step": 19 |
|
}, |
|
{ |
|
"Batch Mean": -0.014011383056640625, |
|
"accuracy": 0.65625, |
|
"epoch": 0.0475, |
|
"step": 19 |
|
}, |
|
{ |
|
"Batch Mean": 0.0633087158203125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.0475, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 5.102872848510742, |
|
"learning_rate": 3e-06, |
|
"loss": 0.4931, |
|
"step": 20 |
|
}, |
|
{ |
|
"Batch Mean": 0.19290733337402344, |
|
"accuracy": 0.8125, |
|
"epoch": 0.05, |
|
"step": 20 |
|
}, |
|
{ |
|
"Batch Mean": 0.29687976837158203, |
|
"accuracy": 0.53125, |
|
"epoch": 0.05, |
|
"step": 20 |
|
}, |
|
{ |
|
"Batch Mean": 0.103363037109375, |
|
"accuracy": 0.625, |
|
"epoch": 0.05, |
|
"step": 20 |
|
}, |
|
{ |
|
"Batch Mean": 0.3869609832763672, |
|
"accuracy": 0.71875, |
|
"epoch": 0.05, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0525, |
|
"grad_norm": 7.569705486297607, |
|
"learning_rate": 2.992105263157895e-06, |
|
"loss": 0.5976, |
|
"step": 21 |
|
}, |
|
{ |
|
"Batch Mean": 0.47769927978515625, |
|
"accuracy": 0.71875, |
|
"epoch": 0.0525, |
|
"step": 21 |
|
}, |
|
{ |
|
"Batch Mean": 0.5898284912109375, |
|
"accuracy": 0.875, |
|
"epoch": 0.0525, |
|
"step": 21 |
|
}, |
|
{ |
|
"Batch Mean": 0.3037242889404297, |
|
"accuracy": 0.75, |
|
"epoch": 0.0525, |
|
"step": 21 |
|
}, |
|
{ |
|
"Batch Mean": -0.037357330322265625, |
|
"accuracy": 0.75, |
|
"epoch": 0.0525, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.055, |
|
"grad_norm": 7.273393630981445, |
|
"learning_rate": 2.9842105263157896e-06, |
|
"loss": 0.4982, |
|
"step": 22 |
|
}, |
|
{ |
|
"Batch Mean": 0.78411865234375, |
|
"accuracy": 0.59375, |
|
"epoch": 0.055, |
|
"step": 22 |
|
}, |
|
{ |
|
"Batch Mean": 0.3693389892578125, |
|
"accuracy": 0.625, |
|
"epoch": 0.055, |
|
"step": 22 |
|
}, |
|
{ |
|
"Batch Mean": 0.3277778625488281, |
|
"accuracy": 0.59375, |
|
"epoch": 0.055, |
|
"step": 22 |
|
}, |
|
{ |
|
"Batch Mean": 0.23564910888671875, |
|
"accuracy": 0.84375, |
|
"epoch": 0.055, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0575, |
|
"grad_norm": 10.978965759277344, |
|
"learning_rate": 2.9763157894736843e-06, |
|
"loss": 0.6967, |
|
"step": 23 |
|
}, |
|
{ |
|
"Batch Mean": 0.3892631530761719, |
|
"accuracy": 0.71875, |
|
"epoch": 0.0575, |
|
"step": 23 |
|
}, |
|
{ |
|
"Batch Mean": 0.4144134521484375, |
|
"accuracy": 0.65625, |
|
"epoch": 0.0575, |
|
"step": 23 |
|
}, |
|
{ |
|
"Batch Mean": 0.201019287109375, |
|
"accuracy": 0.6875, |
|
"epoch": 0.0575, |
|
"step": 23 |
|
}, |
|
{ |
|
"Batch Mean": 0.15361404418945312, |
|
"accuracy": 0.5625, |
|
"epoch": 0.0575, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 10.72164249420166, |
|
"learning_rate": 2.968421052631579e-06, |
|
"loss": 0.657, |
|
"step": 24 |
|
}, |
|
{ |
|
"Batch Mean": 0.32332611083984375, |
|
"accuracy": 0.65625, |
|
"epoch": 0.06, |
|
"step": 24 |
|
}, |
|
{ |
|
"Batch Mean": -0.45644378662109375, |
|
"accuracy": 0.71875, |
|
"epoch": 0.06, |
|
"step": 24 |
|
}, |
|
{ |
|
"Batch Mean": 0.10271453857421875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.06, |
|
"step": 24 |
|
}, |
|
{ |
|
"Batch Mean": 0.5616731643676758, |
|
"accuracy": 0.78125, |
|
"epoch": 0.06, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 10.953572273254395, |
|
"learning_rate": 2.960526315789474e-06, |
|
"loss": 0.6313, |
|
"step": 25 |
|
}, |
|
{ |
|
"Batch Mean": -0.08791732788085938, |
|
"accuracy": 0.71875, |
|
"epoch": 0.0625, |
|
"step": 25 |
|
}, |
|
{ |
|
"Batch Mean": -0.12505340576171875, |
|
"accuracy": 0.71875, |
|
"epoch": 0.0625, |
|
"step": 25 |
|
}, |
|
{ |
|
"Batch Mean": 0.2984886169433594, |
|
"accuracy": 0.75, |
|
"epoch": 0.0625, |
|
"step": 25 |
|
}, |
|
{ |
|
"Batch Mean": -0.2277584969997406, |
|
"accuracy": 0.65625, |
|
"epoch": 0.0625, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.065, |
|
"grad_norm": 8.867247581481934, |
|
"learning_rate": 2.9526315789473685e-06, |
|
"loss": 0.5531, |
|
"step": 26 |
|
}, |
|
{ |
|
"Batch Mean": -0.310638427734375, |
|
"accuracy": 0.875, |
|
"epoch": 0.065, |
|
"step": 26 |
|
}, |
|
{ |
|
"Batch Mean": 0.05762290954589844, |
|
"accuracy": 0.71875, |
|
"epoch": 0.065, |
|
"step": 26 |
|
}, |
|
{ |
|
"Batch Mean": -0.3841552734375, |
|
"accuracy": 0.75, |
|
"epoch": 0.065, |
|
"step": 26 |
|
}, |
|
{ |
|
"Batch Mean": -0.13448715209960938, |
|
"accuracy": 0.78125, |
|
"epoch": 0.065, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.0675, |
|
"grad_norm": 7.167004585266113, |
|
"learning_rate": 2.9447368421052633e-06, |
|
"loss": 0.4927, |
|
"step": 27 |
|
}, |
|
{ |
|
"Batch Mean": -0.5082488059997559, |
|
"accuracy": 0.59375, |
|
"epoch": 0.0675, |
|
"step": 27 |
|
}, |
|
{ |
|
"Batch Mean": -0.5335745811462402, |
|
"accuracy": 0.65625, |
|
"epoch": 0.0675, |
|
"step": 27 |
|
}, |
|
{ |
|
"Batch Mean": -0.3728065490722656, |
|
"accuracy": 0.65625, |
|
"epoch": 0.0675, |
|
"step": 27 |
|
}, |
|
{ |
|
"Batch Mean": -0.48749029636383057, |
|
"accuracy": 0.65625, |
|
"epoch": 0.0675, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 9.99916934967041, |
|
"learning_rate": 2.936842105263158e-06, |
|
"loss": 0.6787, |
|
"step": 28 |
|
}, |
|
{ |
|
"Batch Mean": -0.5768375396728516, |
|
"accuracy": 0.78125, |
|
"epoch": 0.07, |
|
"step": 28 |
|
}, |
|
{ |
|
"Batch Mean": -0.36152684688568115, |
|
"accuracy": 0.71875, |
|
"epoch": 0.07, |
|
"step": 28 |
|
}, |
|
{ |
|
"Batch Mean": -0.6082801818847656, |
|
"accuracy": 0.59375, |
|
"epoch": 0.07, |
|
"step": 28 |
|
}, |
|
{ |
|
"Batch Mean": -0.5176200866699219, |
|
"accuracy": 0.65625, |
|
"epoch": 0.07, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.0725, |
|
"grad_norm": 6.558942794799805, |
|
"learning_rate": 2.9289473684210528e-06, |
|
"loss": 0.571, |
|
"step": 29 |
|
}, |
|
{ |
|
"Batch Mean": -0.3009366989135742, |
|
"accuracy": 0.78125, |
|
"epoch": 0.0725, |
|
"step": 29 |
|
}, |
|
{ |
|
"Batch Mean": -0.4234275817871094, |
|
"accuracy": 0.75, |
|
"epoch": 0.0725, |
|
"step": 29 |
|
}, |
|
{ |
|
"Batch Mean": -0.4476432800292969, |
|
"accuracy": 0.78125, |
|
"epoch": 0.0725, |
|
"step": 29 |
|
}, |
|
{ |
|
"Batch Mean": -0.6630382537841797, |
|
"accuracy": 0.71875, |
|
"epoch": 0.0725, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.075, |
|
"grad_norm": 5.937437534332275, |
|
"learning_rate": 2.9210526315789475e-06, |
|
"loss": 0.5233, |
|
"step": 30 |
|
}, |
|
{ |
|
"Batch Mean": -0.47089385986328125, |
|
"accuracy": 0.65625, |
|
"epoch": 0.075, |
|
"step": 30 |
|
}, |
|
{ |
|
"Batch Mean": -0.5186127424240112, |
|
"accuracy": 0.78125, |
|
"epoch": 0.075, |
|
"step": 30 |
|
}, |
|
{ |
|
"Batch Mean": -0.5250816345214844, |
|
"accuracy": 0.5625, |
|
"epoch": 0.075, |
|
"step": 30 |
|
}, |
|
{ |
|
"Batch Mean": -0.3480682373046875, |
|
"accuracy": 0.8125, |
|
"epoch": 0.075, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0775, |
|
"grad_norm": 5.8368072509765625, |
|
"learning_rate": 2.9131578947368423e-06, |
|
"loss": 0.5172, |
|
"step": 31 |
|
}, |
|
{ |
|
"Batch Mean": -0.29285621643066406, |
|
"accuracy": 0.8125, |
|
"epoch": 0.0775, |
|
"step": 31 |
|
}, |
|
{ |
|
"Batch Mean": -0.3106422424316406, |
|
"accuracy": 0.84375, |
|
"epoch": 0.0775, |
|
"step": 31 |
|
}, |
|
{ |
|
"Batch Mean": 0.005329132080078125, |
|
"accuracy": 0.71875, |
|
"epoch": 0.0775, |
|
"step": 31 |
|
}, |
|
{ |
|
"Batch Mean": -0.1413421630859375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.0775, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 5.706140995025635, |
|
"learning_rate": 2.905263157894737e-06, |
|
"loss": 0.5095, |
|
"step": 32 |
|
}, |
|
{ |
|
"Batch Mean": -0.04312324523925781, |
|
"accuracy": 0.6875, |
|
"epoch": 0.08, |
|
"step": 32 |
|
}, |
|
{ |
|
"Batch Mean": -0.10883808135986328, |
|
"accuracy": 0.71875, |
|
"epoch": 0.08, |
|
"step": 32 |
|
}, |
|
{ |
|
"Batch Mean": 0.3197288513183594, |
|
"accuracy": 0.71875, |
|
"epoch": 0.08, |
|
"step": 32 |
|
}, |
|
{ |
|
"Batch Mean": -0.13158416748046875, |
|
"accuracy": 0.65625, |
|
"epoch": 0.08, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0825, |
|
"grad_norm": 6.042052268981934, |
|
"learning_rate": 2.8973684210526318e-06, |
|
"loss": 0.5717, |
|
"step": 33 |
|
}, |
|
{ |
|
"Batch Mean": 0.0721282958984375, |
|
"accuracy": 0.75, |
|
"epoch": 0.0825, |
|
"step": 33 |
|
}, |
|
{ |
|
"Batch Mean": 0.05409049987792969, |
|
"accuracy": 0.71875, |
|
"epoch": 0.0825, |
|
"step": 33 |
|
}, |
|
{ |
|
"Batch Mean": -0.04035043716430664, |
|
"accuracy": 0.625, |
|
"epoch": 0.0825, |
|
"step": 33 |
|
}, |
|
{ |
|
"Batch Mean": -0.04631471633911133, |
|
"accuracy": 0.71875, |
|
"epoch": 0.0825, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.085, |
|
"grad_norm": 5.908041954040527, |
|
"learning_rate": 2.8894736842105265e-06, |
|
"loss": 0.5446, |
|
"step": 34 |
|
}, |
|
{ |
|
"Batch Mean": 0.2712249755859375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.085, |
|
"step": 34 |
|
}, |
|
{ |
|
"Batch Mean": 0.179473876953125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.085, |
|
"step": 34 |
|
}, |
|
{ |
|
"Batch Mean": -0.01055145263671875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.085, |
|
"step": 34 |
|
}, |
|
{ |
|
"Batch Mean": 0.06919479370117188, |
|
"accuracy": 0.8125, |
|
"epoch": 0.085, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0875, |
|
"grad_norm": 4.990839958190918, |
|
"learning_rate": 2.8815789473684213e-06, |
|
"loss": 0.4607, |
|
"step": 35 |
|
}, |
|
{ |
|
"Batch Mean": -0.017984390258789062, |
|
"accuracy": 0.78125, |
|
"epoch": 0.0875, |
|
"step": 35 |
|
}, |
|
{ |
|
"Batch Mean": 0.075164794921875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.0875, |
|
"step": 35 |
|
}, |
|
{ |
|
"Batch Mean": 0.20074462890625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.0875, |
|
"step": 35 |
|
}, |
|
{ |
|
"Batch Mean": -0.03507876396179199, |
|
"accuracy": 0.65625, |
|
"epoch": 0.0875, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 5.7467803955078125, |
|
"learning_rate": 2.873684210526316e-06, |
|
"loss": 0.5038, |
|
"step": 36 |
|
}, |
|
{ |
|
"Batch Mean": 0.2868976593017578, |
|
"accuracy": 0.75, |
|
"epoch": 0.09, |
|
"step": 36 |
|
}, |
|
{ |
|
"Batch Mean": 0.16400146484375, |
|
"accuracy": 0.65625, |
|
"epoch": 0.09, |
|
"step": 36 |
|
}, |
|
{ |
|
"Batch Mean": 0.2293224334716797, |
|
"accuracy": 0.71875, |
|
"epoch": 0.09, |
|
"step": 36 |
|
}, |
|
{ |
|
"Batch Mean": 0.2969036102294922, |
|
"accuracy": 0.6875, |
|
"epoch": 0.09, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0925, |
|
"grad_norm": 6.629448413848877, |
|
"learning_rate": 2.8657894736842103e-06, |
|
"loss": 0.5233, |
|
"step": 37 |
|
}, |
|
{ |
|
"Batch Mean": -0.07112598419189453, |
|
"accuracy": 0.6875, |
|
"epoch": 0.0925, |
|
"step": 37 |
|
}, |
|
{ |
|
"Batch Mean": 0.25348663330078125, |
|
"accuracy": 0.9375, |
|
"epoch": 0.0925, |
|
"step": 37 |
|
}, |
|
{ |
|
"Batch Mean": 0.2884788513183594, |
|
"accuracy": 0.6875, |
|
"epoch": 0.0925, |
|
"step": 37 |
|
}, |
|
{ |
|
"Batch Mean": 0.06340456008911133, |
|
"accuracy": 0.71875, |
|
"epoch": 0.0925, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.095, |
|
"grad_norm": 6.545988082885742, |
|
"learning_rate": 2.857894736842105e-06, |
|
"loss": 0.521, |
|
"step": 38 |
|
}, |
|
{ |
|
"Batch Mean": 0.372711181640625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.095, |
|
"step": 38 |
|
}, |
|
{ |
|
"Batch Mean": 0.2590770721435547, |
|
"accuracy": 0.53125, |
|
"epoch": 0.095, |
|
"step": 38 |
|
}, |
|
{ |
|
"Batch Mean": 0.016815185546875, |
|
"accuracy": 0.6875, |
|
"epoch": 0.095, |
|
"step": 38 |
|
}, |
|
{ |
|
"Batch Mean": 0.0049419403076171875, |
|
"accuracy": 0.71875, |
|
"epoch": 0.095, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0975, |
|
"grad_norm": 9.898524284362793, |
|
"learning_rate": 2.85e-06, |
|
"loss": 0.6255, |
|
"step": 39 |
|
}, |
|
{ |
|
"Batch Mean": 0.6515955924987793, |
|
"accuracy": 0.6875, |
|
"epoch": 0.0975, |
|
"step": 39 |
|
}, |
|
{ |
|
"Batch Mean": 0.4063148498535156, |
|
"accuracy": 0.8125, |
|
"epoch": 0.0975, |
|
"step": 39 |
|
}, |
|
{ |
|
"Batch Mean": 0.1270294189453125, |
|
"accuracy": 0.71875, |
|
"epoch": 0.0975, |
|
"step": 39 |
|
}, |
|
{ |
|
"Batch Mean": 0.4789772033691406, |
|
"accuracy": 0.6875, |
|
"epoch": 0.0975, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 6.953475475311279, |
|
"learning_rate": 2.8421052631578946e-06, |
|
"loss": 0.4934, |
|
"step": 40 |
|
}, |
|
{ |
|
"Batch Mean": 0.25176239013671875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1, |
|
"step": 40 |
|
}, |
|
{ |
|
"Batch Mean": 0.4009513854980469, |
|
"accuracy": 0.65625, |
|
"epoch": 0.1, |
|
"step": 40 |
|
}, |
|
{ |
|
"Batch Mean": 0.6202306747436523, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1, |
|
"step": 40 |
|
}, |
|
{ |
|
"Batch Mean": 0.2911343574523926, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1025, |
|
"grad_norm": 7.0007123947143555, |
|
"learning_rate": 2.8342105263157897e-06, |
|
"loss": 0.4957, |
|
"step": 41 |
|
}, |
|
{ |
|
"Batch Mean": 0.13779544830322266, |
|
"accuracy": 0.625, |
|
"epoch": 0.1025, |
|
"step": 41 |
|
}, |
|
{ |
|
"Batch Mean": 0.5141849517822266, |
|
"accuracy": 0.84375, |
|
"epoch": 0.1025, |
|
"step": 41 |
|
}, |
|
{ |
|
"Batch Mean": 0.12182235717773438, |
|
"accuracy": 0.71875, |
|
"epoch": 0.1025, |
|
"step": 41 |
|
}, |
|
{ |
|
"Batch Mean": 0.09358537197113037, |
|
"accuracy": 0.65625, |
|
"epoch": 0.1025, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.105, |
|
"grad_norm": 8.165699005126953, |
|
"learning_rate": 2.8263157894736845e-06, |
|
"loss": 0.5642, |
|
"step": 42 |
|
}, |
|
{ |
|
"Batch Mean": 0.26740550994873047, |
|
"accuracy": 0.8125, |
|
"epoch": 0.105, |
|
"step": 42 |
|
}, |
|
{ |
|
"Batch Mean": -0.07419204711914062, |
|
"accuracy": 0.71875, |
|
"epoch": 0.105, |
|
"step": 42 |
|
}, |
|
{ |
|
"Batch Mean": 0.2999420166015625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.105, |
|
"step": 42 |
|
}, |
|
{ |
|
"Batch Mean": -0.2398681640625, |
|
"accuracy": 0.75, |
|
"epoch": 0.105, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.1075, |
|
"grad_norm": 7.090755939483643, |
|
"learning_rate": 2.8184210526315792e-06, |
|
"loss": 0.5136, |
|
"step": 43 |
|
}, |
|
{ |
|
"Batch Mean": 0.3058357238769531, |
|
"accuracy": 0.875, |
|
"epoch": 0.1075, |
|
"step": 43 |
|
}, |
|
{ |
|
"Batch Mean": 0.10181450843811035, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1075, |
|
"step": 43 |
|
}, |
|
{ |
|
"Batch Mean": -0.07529067993164062, |
|
"accuracy": 0.71875, |
|
"epoch": 0.1075, |
|
"step": 43 |
|
}, |
|
{ |
|
"Batch Mean": 0.46073150634765625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1075, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 5.939328670501709, |
|
"learning_rate": 2.810526315789474e-06, |
|
"loss": 0.464, |
|
"step": 44 |
|
}, |
|
{ |
|
"Batch Mean": -0.13095474243164062, |
|
"accuracy": 0.6875, |
|
"epoch": 0.11, |
|
"step": 44 |
|
}, |
|
{ |
|
"Batch Mean": 0.32462239265441895, |
|
"accuracy": 0.875, |
|
"epoch": 0.11, |
|
"step": 44 |
|
}, |
|
{ |
|
"Batch Mean": -0.15337753295898438, |
|
"accuracy": 0.78125, |
|
"epoch": 0.11, |
|
"step": 44 |
|
}, |
|
{ |
|
"Batch Mean": 0.38422298431396484, |
|
"accuracy": 0.71875, |
|
"epoch": 0.11, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.1125, |
|
"grad_norm": 6.517725944519043, |
|
"learning_rate": 2.8026315789473687e-06, |
|
"loss": 0.4854, |
|
"step": 45 |
|
}, |
|
{ |
|
"Batch Mean": 0.290924072265625, |
|
"accuracy": 0.625, |
|
"epoch": 0.1125, |
|
"step": 45 |
|
}, |
|
{ |
|
"Batch Mean": 0.03897809982299805, |
|
"accuracy": 0.875, |
|
"epoch": 0.1125, |
|
"step": 45 |
|
}, |
|
{ |
|
"Batch Mean": 0.20547938346862793, |
|
"accuracy": 0.875, |
|
"epoch": 0.1125, |
|
"step": 45 |
|
}, |
|
{ |
|
"Batch Mean": 0.3288555145263672, |
|
"accuracy": 0.75, |
|
"epoch": 0.1125, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.115, |
|
"grad_norm": 5.711620330810547, |
|
"learning_rate": 2.7947368421052635e-06, |
|
"loss": 0.4129, |
|
"step": 46 |
|
}, |
|
{ |
|
"Batch Mean": 0.331978440284729, |
|
"accuracy": 0.78125, |
|
"epoch": 0.115, |
|
"step": 46 |
|
}, |
|
{ |
|
"Batch Mean": -0.12884771823883057, |
|
"accuracy": 0.71875, |
|
"epoch": 0.115, |
|
"step": 46 |
|
}, |
|
{ |
|
"Batch Mean": 0.2715787887573242, |
|
"accuracy": 0.65625, |
|
"epoch": 0.115, |
|
"step": 46 |
|
}, |
|
{ |
|
"Batch Mean": 0.3961639404296875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.115, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.1175, |
|
"grad_norm": 6.815968036651611, |
|
"learning_rate": 2.7868421052631578e-06, |
|
"loss": 0.5217, |
|
"step": 47 |
|
}, |
|
{ |
|
"Batch Mean": -0.05124783515930176, |
|
"accuracy": 0.84375, |
|
"epoch": 0.1175, |
|
"step": 47 |
|
}, |
|
{ |
|
"Batch Mean": -0.4043617248535156, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1175, |
|
"step": 47 |
|
}, |
|
{ |
|
"Batch Mean": 0.21244239807128906, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1175, |
|
"step": 47 |
|
}, |
|
{ |
|
"Batch Mean": -0.09090805053710938, |
|
"accuracy": 0.75, |
|
"epoch": 0.1175, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 6.305139541625977, |
|
"learning_rate": 2.7789473684210525e-06, |
|
"loss": 0.4484, |
|
"step": 48 |
|
}, |
|
{ |
|
"Batch Mean": 0.3022747039794922, |
|
"accuracy": 0.65625, |
|
"epoch": 0.12, |
|
"step": 48 |
|
}, |
|
{ |
|
"Batch Mean": -0.013670921325683594, |
|
"accuracy": 0.6875, |
|
"epoch": 0.12, |
|
"step": 48 |
|
}, |
|
{ |
|
"Batch Mean": 0.4046478271484375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.12, |
|
"step": 48 |
|
}, |
|
{ |
|
"Batch Mean": 0.16419363021850586, |
|
"accuracy": 0.84375, |
|
"epoch": 0.12, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.1225, |
|
"grad_norm": 5.598595142364502, |
|
"learning_rate": 2.7710526315789473e-06, |
|
"loss": 0.4684, |
|
"step": 49 |
|
}, |
|
{ |
|
"Batch Mean": -0.24893569946289062, |
|
"accuracy": 0.6875, |
|
"epoch": 0.1225, |
|
"step": 49 |
|
}, |
|
{ |
|
"Batch Mean": -0.2393360137939453, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1225, |
|
"step": 49 |
|
}, |
|
{ |
|
"Batch Mean": 0.2698392868041992, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1225, |
|
"step": 49 |
|
}, |
|
{ |
|
"Batch Mean": -0.3564453125, |
|
"accuracy": 0.75, |
|
"epoch": 0.1225, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 6.394057750701904, |
|
"learning_rate": 2.763157894736842e-06, |
|
"loss": 0.4703, |
|
"step": 50 |
|
}, |
|
{ |
|
"Batch Mean": -0.09824085235595703, |
|
"accuracy": 0.71875, |
|
"epoch": 0.125, |
|
"step": 50 |
|
}, |
|
{ |
|
"Batch Mean": -0.1602630615234375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.125, |
|
"step": 50 |
|
}, |
|
{ |
|
"Batch Mean": -0.6205692291259766, |
|
"accuracy": 0.625, |
|
"epoch": 0.125, |
|
"step": 50 |
|
}, |
|
{ |
|
"Batch Mean": 0.06192302703857422, |
|
"accuracy": 0.75, |
|
"epoch": 0.125, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1275, |
|
"grad_norm": 7.542079925537109, |
|
"learning_rate": 2.7552631578947368e-06, |
|
"loss": 0.4731, |
|
"step": 51 |
|
}, |
|
{ |
|
"Batch Mean": -0.24329090118408203, |
|
"accuracy": 0.71875, |
|
"epoch": 0.1275, |
|
"step": 51 |
|
}, |
|
{ |
|
"Batch Mean": 0.277587890625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1275, |
|
"step": 51 |
|
}, |
|
{ |
|
"Batch Mean": -0.1536083221435547, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1275, |
|
"step": 51 |
|
}, |
|
{ |
|
"Batch Mean": -0.2829427719116211, |
|
"accuracy": 0.90625, |
|
"epoch": 0.1275, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 6.608920097351074, |
|
"learning_rate": 2.7473684210526315e-06, |
|
"loss": 0.4472, |
|
"step": 52 |
|
}, |
|
{ |
|
"Batch Mean": -0.2534487247467041, |
|
"accuracy": 0.78125, |
|
"epoch": 0.13, |
|
"step": 52 |
|
}, |
|
{ |
|
"Batch Mean": -0.3897590637207031, |
|
"accuracy": 0.6875, |
|
"epoch": 0.13, |
|
"step": 52 |
|
}, |
|
{ |
|
"Batch Mean": 0.0982666015625, |
|
"accuracy": 0.75, |
|
"epoch": 0.13, |
|
"step": 52 |
|
}, |
|
{ |
|
"Batch Mean": -0.19083404541015625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.13, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.1325, |
|
"grad_norm": 9.115386962890625, |
|
"learning_rate": 2.7394736842105263e-06, |
|
"loss": 0.4964, |
|
"step": 53 |
|
}, |
|
{ |
|
"Batch Mean": -0.07914352416992188, |
|
"accuracy": 0.84375, |
|
"epoch": 0.1325, |
|
"step": 53 |
|
}, |
|
{ |
|
"Batch Mean": -0.8162860870361328, |
|
"accuracy": 0.75, |
|
"epoch": 0.1325, |
|
"step": 53 |
|
}, |
|
{ |
|
"Batch Mean": -0.9538593292236328, |
|
"accuracy": 0.71875, |
|
"epoch": 0.1325, |
|
"step": 53 |
|
}, |
|
{ |
|
"Batch Mean": 0.025072097778320312, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1325, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.135, |
|
"grad_norm": 9.654952049255371, |
|
"learning_rate": 2.7315789473684214e-06, |
|
"loss": 0.4771, |
|
"step": 54 |
|
}, |
|
{ |
|
"Batch Mean": -0.2607238292694092, |
|
"accuracy": 0.65625, |
|
"epoch": 0.135, |
|
"step": 54 |
|
}, |
|
{ |
|
"Batch Mean": 0.07077789306640625, |
|
"accuracy": 0.875, |
|
"epoch": 0.135, |
|
"step": 54 |
|
}, |
|
{ |
|
"Batch Mean": -0.2121124267578125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.135, |
|
"step": 54 |
|
}, |
|
{ |
|
"Batch Mean": 0.040355682373046875, |
|
"accuracy": 0.6875, |
|
"epoch": 0.135, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.1375, |
|
"grad_norm": 9.226275444030762, |
|
"learning_rate": 2.723684210526316e-06, |
|
"loss": 0.477, |
|
"step": 55 |
|
}, |
|
{ |
|
"Batch Mean": 0.14949023723602295, |
|
"accuracy": 0.75, |
|
"epoch": 0.1375, |
|
"step": 55 |
|
}, |
|
{ |
|
"Batch Mean": -0.2880672216415405, |
|
"accuracy": 0.875, |
|
"epoch": 0.1375, |
|
"step": 55 |
|
}, |
|
{ |
|
"Batch Mean": -0.037652015686035156, |
|
"accuracy": 0.75, |
|
"epoch": 0.1375, |
|
"step": 55 |
|
}, |
|
{ |
|
"Batch Mean": 0.11230850219726562, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1375, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 8.696858406066895, |
|
"learning_rate": 2.715789473684211e-06, |
|
"loss": 0.4385, |
|
"step": 56 |
|
}, |
|
{ |
|
"Batch Mean": -0.14437103271484375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.14, |
|
"step": 56 |
|
}, |
|
{ |
|
"Batch Mean": -0.3502960205078125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.14, |
|
"step": 56 |
|
}, |
|
{ |
|
"Batch Mean": -0.3359222412109375, |
|
"accuracy": 0.65625, |
|
"epoch": 0.14, |
|
"step": 56 |
|
}, |
|
{ |
|
"Batch Mean": -0.4460906982421875, |
|
"accuracy": 0.71875, |
|
"epoch": 0.14, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.1425, |
|
"grad_norm": 10.204813003540039, |
|
"learning_rate": 2.7078947368421052e-06, |
|
"loss": 0.4971, |
|
"step": 57 |
|
}, |
|
{ |
|
"Batch Mean": -0.0918121337890625, |
|
"accuracy": 0.75, |
|
"epoch": 0.1425, |
|
"step": 57 |
|
}, |
|
{ |
|
"Batch Mean": 0.1797332763671875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1425, |
|
"step": 57 |
|
}, |
|
{ |
|
"Batch Mean": -0.22362709045410156, |
|
"accuracy": 0.65625, |
|
"epoch": 0.1425, |
|
"step": 57 |
|
}, |
|
{ |
|
"Batch Mean": -0.932403564453125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1425, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.145, |
|
"grad_norm": 9.547924995422363, |
|
"learning_rate": 2.7e-06, |
|
"loss": 0.5235, |
|
"step": 58 |
|
}, |
|
{ |
|
"Batch Mean": -0.69256591796875, |
|
"accuracy": 0.75, |
|
"epoch": 0.145, |
|
"step": 58 |
|
}, |
|
{ |
|
"Batch Mean": -0.408052921295166, |
|
"accuracy": 0.71875, |
|
"epoch": 0.145, |
|
"step": 58 |
|
}, |
|
{ |
|
"Batch Mean": -0.7247238159179688, |
|
"accuracy": 0.75, |
|
"epoch": 0.145, |
|
"step": 58 |
|
}, |
|
{ |
|
"Batch Mean": -0.5294733047485352, |
|
"accuracy": 0.8125, |
|
"epoch": 0.145, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.1475, |
|
"grad_norm": 8.18185043334961, |
|
"learning_rate": 2.6921052631578947e-06, |
|
"loss": 0.4697, |
|
"step": 59 |
|
}, |
|
{ |
|
"Batch Mean": -0.562103271484375, |
|
"accuracy": 0.75, |
|
"epoch": 0.1475, |
|
"step": 59 |
|
}, |
|
{ |
|
"Batch Mean": -0.36240386962890625, |
|
"accuracy": 0.71875, |
|
"epoch": 0.1475, |
|
"step": 59 |
|
}, |
|
{ |
|
"Batch Mean": -0.8479537963867188, |
|
"accuracy": 0.75, |
|
"epoch": 0.1475, |
|
"step": 59 |
|
}, |
|
{ |
|
"Batch Mean": -0.5514106750488281, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1475, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 9.638142585754395, |
|
"learning_rate": 2.6842105263157895e-06, |
|
"loss": 0.4854, |
|
"step": 60 |
|
}, |
|
{ |
|
"Batch Mean": -0.9713249206542969, |
|
"accuracy": 0.875, |
|
"epoch": 0.15, |
|
"step": 60 |
|
}, |
|
{ |
|
"Batch Mean": -1.4701347351074219, |
|
"accuracy": 0.8125, |
|
"epoch": 0.15, |
|
"step": 60 |
|
}, |
|
{ |
|
"Batch Mean": -0.8054180145263672, |
|
"accuracy": 0.6875, |
|
"epoch": 0.15, |
|
"step": 60 |
|
}, |
|
{ |
|
"Batch Mean": -1.1165752410888672, |
|
"accuracy": 0.875, |
|
"epoch": 0.15, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1525, |
|
"grad_norm": 9.138744354248047, |
|
"learning_rate": 2.6763157894736842e-06, |
|
"loss": 0.4093, |
|
"step": 61 |
|
}, |
|
{ |
|
"Batch Mean": -1.2550277709960938, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1525, |
|
"step": 61 |
|
}, |
|
{ |
|
"Batch Mean": -0.9237594604492188, |
|
"accuracy": 0.6875, |
|
"epoch": 0.1525, |
|
"step": 61 |
|
}, |
|
{ |
|
"Batch Mean": -0.9178142547607422, |
|
"accuracy": 0.875, |
|
"epoch": 0.1525, |
|
"step": 61 |
|
}, |
|
{ |
|
"Batch Mean": -0.8621349334716797, |
|
"accuracy": 0.75, |
|
"epoch": 0.1525, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.155, |
|
"grad_norm": 9.812451362609863, |
|
"learning_rate": 2.668421052631579e-06, |
|
"loss": 0.4354, |
|
"step": 62 |
|
}, |
|
{ |
|
"Batch Mean": -1.3034553527832031, |
|
"accuracy": 0.78125, |
|
"epoch": 0.155, |
|
"step": 62 |
|
}, |
|
{ |
|
"Batch Mean": -1.0795440673828125, |
|
"accuracy": 0.78125, |
|
"epoch": 0.155, |
|
"step": 62 |
|
}, |
|
{ |
|
"Batch Mean": -1.0960693359375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.155, |
|
"step": 62 |
|
}, |
|
{ |
|
"Batch Mean": -1.2091312408447266, |
|
"accuracy": 0.6875, |
|
"epoch": 0.155, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.1575, |
|
"grad_norm": 9.518035888671875, |
|
"learning_rate": 2.6605263157894737e-06, |
|
"loss": 0.4399, |
|
"step": 63 |
|
}, |
|
{ |
|
"Batch Mean": -1.405853271484375, |
|
"accuracy": 0.6875, |
|
"epoch": 0.1575, |
|
"step": 63 |
|
}, |
|
{ |
|
"Batch Mean": -1.4421844482421875, |
|
"accuracy": 0.71875, |
|
"epoch": 0.1575, |
|
"step": 63 |
|
}, |
|
{ |
|
"Batch Mean": -1.2391834259033203, |
|
"accuracy": 0.6875, |
|
"epoch": 0.1575, |
|
"step": 63 |
|
}, |
|
{ |
|
"Batch Mean": -0.881195068359375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1575, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 10.348162651062012, |
|
"learning_rate": 2.6526315789473685e-06, |
|
"loss": 0.537, |
|
"step": 64 |
|
}, |
|
{ |
|
"Batch Mean": -1.0128021240234375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.16, |
|
"step": 64 |
|
}, |
|
{ |
|
"Batch Mean": -1.0150184631347656, |
|
"accuracy": 0.84375, |
|
"epoch": 0.16, |
|
"step": 64 |
|
}, |
|
{ |
|
"Batch Mean": -1.497243881225586, |
|
"accuracy": 0.78125, |
|
"epoch": 0.16, |
|
"step": 64 |
|
}, |
|
{ |
|
"Batch Mean": -0.910819947719574, |
|
"accuracy": 0.75, |
|
"epoch": 0.16, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.1625, |
|
"grad_norm": 8.633638381958008, |
|
"learning_rate": 2.644736842105263e-06, |
|
"loss": 0.4436, |
|
"step": 65 |
|
}, |
|
{ |
|
"Batch Mean": -1.0223121643066406, |
|
"accuracy": 0.71875, |
|
"epoch": 0.1625, |
|
"step": 65 |
|
}, |
|
{ |
|
"Batch Mean": -0.5706081390380859, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1625, |
|
"step": 65 |
|
}, |
|
{ |
|
"Batch Mean": -0.965911865234375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1625, |
|
"step": 65 |
|
}, |
|
{ |
|
"Batch Mean": -0.7304267883300781, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1625, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.165, |
|
"grad_norm": 8.088103294372559, |
|
"learning_rate": 2.636842105263158e-06, |
|
"loss": 0.4446, |
|
"step": 66 |
|
}, |
|
{ |
|
"Batch Mean": -0.4677067697048187, |
|
"accuracy": 0.8125, |
|
"epoch": 0.165, |
|
"step": 66 |
|
}, |
|
{ |
|
"Batch Mean": -1.4533824920654297, |
|
"accuracy": 0.78125, |
|
"epoch": 0.165, |
|
"step": 66 |
|
}, |
|
{ |
|
"Batch Mean": -0.78509521484375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.165, |
|
"step": 66 |
|
}, |
|
{ |
|
"Batch Mean": -0.8427619934082031, |
|
"accuracy": 0.78125, |
|
"epoch": 0.165, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.1675, |
|
"grad_norm": 7.766864776611328, |
|
"learning_rate": 2.6289473684210527e-06, |
|
"loss": 0.412, |
|
"step": 67 |
|
}, |
|
{ |
|
"Batch Mean": -1.0267219543457031, |
|
"accuracy": 0.75, |
|
"epoch": 0.1675, |
|
"step": 67 |
|
}, |
|
{ |
|
"Batch Mean": -0.0344390869140625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1675, |
|
"step": 67 |
|
}, |
|
{ |
|
"Batch Mean": -0.7120513916015625, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1675, |
|
"step": 67 |
|
}, |
|
{ |
|
"Batch Mean": -0.8848686218261719, |
|
"accuracy": 0.75, |
|
"epoch": 0.1675, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 8.952485084533691, |
|
"learning_rate": 2.6210526315789474e-06, |
|
"loss": 0.4073, |
|
"step": 68 |
|
}, |
|
{ |
|
"Batch Mean": -1.2683296203613281, |
|
"accuracy": 0.78125, |
|
"epoch": 0.17, |
|
"step": 68 |
|
}, |
|
{ |
|
"Batch Mean": -0.9470596313476562, |
|
"accuracy": 0.6875, |
|
"epoch": 0.17, |
|
"step": 68 |
|
}, |
|
{ |
|
"Batch Mean": -1.2335700988769531, |
|
"accuracy": 0.78125, |
|
"epoch": 0.17, |
|
"step": 68 |
|
}, |
|
{ |
|
"Batch Mean": -0.9984736442565918, |
|
"accuracy": 0.71875, |
|
"epoch": 0.17, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.1725, |
|
"grad_norm": 8.944815635681152, |
|
"learning_rate": 2.613157894736842e-06, |
|
"loss": 0.4827, |
|
"step": 69 |
|
}, |
|
{ |
|
"Batch Mean": -0.6530609130859375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1725, |
|
"step": 69 |
|
}, |
|
{ |
|
"Batch Mean": -0.6013336181640625, |
|
"accuracy": 0.6875, |
|
"epoch": 0.1725, |
|
"step": 69 |
|
}, |
|
{ |
|
"Batch Mean": -1.4489421844482422, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1725, |
|
"step": 69 |
|
}, |
|
{ |
|
"Batch Mean": -0.9736480712890625, |
|
"accuracy": 0.90625, |
|
"epoch": 0.1725, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.175, |
|
"grad_norm": 8.779143333435059, |
|
"learning_rate": 2.605263157894737e-06, |
|
"loss": 0.4578, |
|
"step": 70 |
|
}, |
|
{ |
|
"Batch Mean": -0.9107780456542969, |
|
"accuracy": 0.84375, |
|
"epoch": 0.175, |
|
"step": 70 |
|
}, |
|
{ |
|
"Batch Mean": -1.1361122131347656, |
|
"accuracy": 0.6875, |
|
"epoch": 0.175, |
|
"step": 70 |
|
}, |
|
{ |
|
"Batch Mean": -0.6527862548828125, |
|
"accuracy": 0.9375, |
|
"epoch": 0.175, |
|
"step": 70 |
|
}, |
|
{ |
|
"Batch Mean": -0.7553470134735107, |
|
"accuracy": 0.6875, |
|
"epoch": 0.175, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1775, |
|
"grad_norm": 8.647814750671387, |
|
"learning_rate": 2.5973684210526317e-06, |
|
"loss": 0.4257, |
|
"step": 71 |
|
}, |
|
{ |
|
"Batch Mean": -0.41971588134765625, |
|
"accuracy": 0.75, |
|
"epoch": 0.1775, |
|
"step": 71 |
|
}, |
|
{ |
|
"Batch Mean": -0.705718994140625, |
|
"accuracy": 0.875, |
|
"epoch": 0.1775, |
|
"step": 71 |
|
}, |
|
{ |
|
"Batch Mean": -1.0686330795288086, |
|
"accuracy": 0.71875, |
|
"epoch": 0.1775, |
|
"step": 71 |
|
}, |
|
{ |
|
"Batch Mean": -0.8464865684509277, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1775, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 8.784235000610352, |
|
"learning_rate": 2.5894736842105264e-06, |
|
"loss": 0.3921, |
|
"step": 72 |
|
}, |
|
{ |
|
"Batch Mean": -0.7266769409179688, |
|
"accuracy": 0.8125, |
|
"epoch": 0.18, |
|
"step": 72 |
|
}, |
|
{ |
|
"Batch Mean": -0.7239456176757812, |
|
"accuracy": 0.8125, |
|
"epoch": 0.18, |
|
"step": 72 |
|
}, |
|
{ |
|
"Batch Mean": -0.6862373352050781, |
|
"accuracy": 0.8125, |
|
"epoch": 0.18, |
|
"step": 72 |
|
}, |
|
{ |
|
"Batch Mean": -0.525360107421875, |
|
"accuracy": 0.875, |
|
"epoch": 0.18, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.1825, |
|
"grad_norm": 7.80237340927124, |
|
"learning_rate": 2.581578947368421e-06, |
|
"loss": 0.374, |
|
"step": 73 |
|
}, |
|
{ |
|
"Batch Mean": -0.9130859375, |
|
"accuracy": 0.84375, |
|
"epoch": 0.1825, |
|
"step": 73 |
|
}, |
|
{ |
|
"Batch Mean": -0.35595703125, |
|
"accuracy": 0.875, |
|
"epoch": 0.1825, |
|
"step": 73 |
|
}, |
|
{ |
|
"Batch Mean": -0.8892440795898438, |
|
"accuracy": 0.6875, |
|
"epoch": 0.1825, |
|
"step": 73 |
|
}, |
|
{ |
|
"Batch Mean": -0.4263725280761719, |
|
"accuracy": 0.71875, |
|
"epoch": 0.1825, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.185, |
|
"grad_norm": 7.894434452056885, |
|
"learning_rate": 2.573684210526316e-06, |
|
"loss": 0.4405, |
|
"step": 74 |
|
}, |
|
{ |
|
"Batch Mean": -0.6322441101074219, |
|
"accuracy": 0.8125, |
|
"epoch": 0.185, |
|
"step": 74 |
|
}, |
|
{ |
|
"Batch Mean": -0.456390380859375, |
|
"accuracy": 0.875, |
|
"epoch": 0.185, |
|
"step": 74 |
|
}, |
|
{ |
|
"Batch Mean": -1.1346385478973389, |
|
"accuracy": 0.84375, |
|
"epoch": 0.185, |
|
"step": 74 |
|
}, |
|
{ |
|
"Batch Mean": -0.8706645965576172, |
|
"accuracy": 0.8125, |
|
"epoch": 0.185, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 8.292348861694336, |
|
"learning_rate": 2.5657894736842107e-06, |
|
"loss": 0.3895, |
|
"step": 75 |
|
}, |
|
{ |
|
"Batch Mean": -1.0383148193359375, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1875, |
|
"step": 75 |
|
}, |
|
{ |
|
"Batch Mean": 0.19762420654296875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1875, |
|
"step": 75 |
|
}, |
|
{ |
|
"Batch Mean": 0.10494613647460938, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1875, |
|
"step": 75 |
|
}, |
|
{ |
|
"Batch Mean": -0.5181140899658203, |
|
"accuracy": 0.84375, |
|
"epoch": 0.1875, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 8.882412910461426, |
|
"learning_rate": 2.5578947368421054e-06, |
|
"loss": 0.3805, |
|
"step": 76 |
|
}, |
|
{ |
|
"Batch Mean": -0.4382622241973877, |
|
"accuracy": 0.75, |
|
"epoch": 0.19, |
|
"step": 76 |
|
}, |
|
{ |
|
"Batch Mean": -0.16241741180419922, |
|
"accuracy": 0.90625, |
|
"epoch": 0.19, |
|
"step": 76 |
|
}, |
|
{ |
|
"Batch Mean": -0.8776988983154297, |
|
"accuracy": 0.875, |
|
"epoch": 0.19, |
|
"step": 76 |
|
}, |
|
{ |
|
"Batch Mean": -0.6540908813476562, |
|
"accuracy": 0.8125, |
|
"epoch": 0.19, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.1925, |
|
"grad_norm": 10.025094032287598, |
|
"learning_rate": 2.55e-06, |
|
"loss": 0.4001, |
|
"step": 77 |
|
}, |
|
{ |
|
"Batch Mean": 0.06690788269042969, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1925, |
|
"step": 77 |
|
}, |
|
{ |
|
"Batch Mean": -0.03551149368286133, |
|
"accuracy": 0.75, |
|
"epoch": 0.1925, |
|
"step": 77 |
|
}, |
|
{ |
|
"Batch Mean": 0.17040252685546875, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1925, |
|
"step": 77 |
|
}, |
|
{ |
|
"Batch Mean": -0.019598007202148438, |
|
"accuracy": 0.90625, |
|
"epoch": 0.1925, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.195, |
|
"grad_norm": 9.164822578430176, |
|
"learning_rate": 2.542105263157895e-06, |
|
"loss": 0.3807, |
|
"step": 78 |
|
}, |
|
{ |
|
"Batch Mean": 0.0378570556640625, |
|
"accuracy": 0.75, |
|
"epoch": 0.195, |
|
"step": 78 |
|
}, |
|
{ |
|
"Batch Mean": 0.3024101257324219, |
|
"accuracy": 0.75, |
|
"epoch": 0.195, |
|
"step": 78 |
|
}, |
|
{ |
|
"Batch Mean": 0.1015625, |
|
"accuracy": 0.6875, |
|
"epoch": 0.195, |
|
"step": 78 |
|
}, |
|
{ |
|
"Batch Mean": 0.10402488708496094, |
|
"accuracy": 0.875, |
|
"epoch": 0.195, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.1975, |
|
"grad_norm": 9.871844291687012, |
|
"learning_rate": 2.5342105263157892e-06, |
|
"loss": 0.4781, |
|
"step": 79 |
|
}, |
|
{ |
|
"Batch Mean": 0.1356794238090515, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1975, |
|
"step": 79 |
|
}, |
|
{ |
|
"Batch Mean": 0.0782623291015625, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1975, |
|
"step": 79 |
|
}, |
|
{ |
|
"Batch Mean": -0.12647247314453125, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1975, |
|
"step": 79 |
|
}, |
|
{ |
|
"Batch Mean": 0.2567100524902344, |
|
"accuracy": 0.78125, |
|
"epoch": 0.1975, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 9.033759117126465, |
|
"learning_rate": 2.526315789473684e-06, |
|
"loss": 0.4288, |
|
"step": 80 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 80, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|