|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.5, |
|
"eval_steps": 500, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"Batch Mean": -3.4444580078125, |
|
"accuracy": 0.40625, |
|
"epoch": 0, |
|
"step": 0 |
|
}, |
|
{ |
|
"epoch": 0.0025, |
|
"grad_norm": 43.31455612182617, |
|
"learning_rate": 1.5000000000000002e-07, |
|
"loss": 0.9077, |
|
"step": 1 |
|
}, |
|
{ |
|
"Batch Mean": -3.43255615234375, |
|
"accuracy": 0.4921875, |
|
"epoch": 0.0025, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.005, |
|
"grad_norm": 50.06682205200195, |
|
"learning_rate": 3.0000000000000004e-07, |
|
"loss": 0.8907, |
|
"step": 2 |
|
}, |
|
{ |
|
"Batch Mean": -3.44775390625, |
|
"accuracy": 0.453125, |
|
"epoch": 0.005, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0075, |
|
"grad_norm": 47.389522552490234, |
|
"learning_rate": 4.5e-07, |
|
"loss": 0.8899, |
|
"step": 3 |
|
}, |
|
{ |
|
"Batch Mean": -3.4224853515625, |
|
"accuracy": 0.4375, |
|
"epoch": 0.0075, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 49.24280548095703, |
|
"learning_rate": 6.000000000000001e-07, |
|
"loss": 0.8833, |
|
"step": 4 |
|
}, |
|
{ |
|
"Batch Mean": -3.39697265625, |
|
"accuracy": 0.5234375, |
|
"epoch": 0.01, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0125, |
|
"grad_norm": 44.11930847167969, |
|
"learning_rate": 7.5e-07, |
|
"loss": 0.8661, |
|
"step": 5 |
|
}, |
|
{ |
|
"Batch Mean": -3.38104248046875, |
|
"accuracy": 0.5078125, |
|
"epoch": 0.0125, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.015, |
|
"grad_norm": 44.9741325378418, |
|
"learning_rate": 9e-07, |
|
"loss": 0.8805, |
|
"step": 6 |
|
}, |
|
{ |
|
"Batch Mean": -3.19671630859375, |
|
"accuracy": 0.5078125, |
|
"epoch": 0.015, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0175, |
|
"grad_norm": 42.32276153564453, |
|
"learning_rate": 1.05e-06, |
|
"loss": 0.8547, |
|
"step": 7 |
|
}, |
|
{ |
|
"Batch Mean": -3.1279296875, |
|
"accuracy": 0.4921875, |
|
"epoch": 0.0175, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 44.22520065307617, |
|
"learning_rate": 1.2000000000000002e-06, |
|
"loss": 0.8464, |
|
"step": 8 |
|
}, |
|
{ |
|
"Batch Mean": -2.520538330078125, |
|
"accuracy": 0.5, |
|
"epoch": 0.02, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0225, |
|
"grad_norm": 36.41325378417969, |
|
"learning_rate": 1.35e-06, |
|
"loss": 0.7946, |
|
"step": 9 |
|
}, |
|
{ |
|
"Batch Mean": -2.413787841796875, |
|
"accuracy": 0.515625, |
|
"epoch": 0.0225, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.025, |
|
"grad_norm": 31.74874496459961, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.7939, |
|
"step": 10 |
|
}, |
|
{ |
|
"Batch Mean": -2.218231201171875, |
|
"accuracy": 0.4921875, |
|
"epoch": 0.025, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0275, |
|
"grad_norm": 34.12691879272461, |
|
"learning_rate": 1.65e-06, |
|
"loss": 0.7505, |
|
"step": 11 |
|
}, |
|
{ |
|
"Batch Mean": -0.2393415868282318, |
|
"accuracy": 0.53125, |
|
"epoch": 0.0275, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 10.481990814208984, |
|
"learning_rate": 1.8e-06, |
|
"loss": 0.6779, |
|
"step": 12 |
|
}, |
|
{ |
|
"Batch Mean": 0.2733480930328369, |
|
"accuracy": 0.5703125, |
|
"epoch": 0.03, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0325, |
|
"grad_norm": 8.39962387084961, |
|
"learning_rate": 1.95e-06, |
|
"loss": 0.6728, |
|
"step": 13 |
|
}, |
|
{ |
|
"Batch Mean": 0.5041149854660034, |
|
"accuracy": 0.671875, |
|
"epoch": 0.0325, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.035, |
|
"grad_norm": 15.906760215759277, |
|
"learning_rate": 2.1e-06, |
|
"loss": 0.6679, |
|
"step": 14 |
|
}, |
|
{ |
|
"Batch Mean": 0.8821840286254883, |
|
"accuracy": 0.578125, |
|
"epoch": 0.035, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0375, |
|
"grad_norm": 19.03310775756836, |
|
"learning_rate": 2.25e-06, |
|
"loss": 0.7024, |
|
"step": 15 |
|
}, |
|
{ |
|
"Batch Mean": 1.1891746520996094, |
|
"accuracy": 0.5859375, |
|
"epoch": 0.0375, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 20.89366340637207, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 0.6868, |
|
"step": 16 |
|
}, |
|
{ |
|
"Batch Mean": 2.1533203125, |
|
"accuracy": 0.671875, |
|
"epoch": 0.04, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.0425, |
|
"grad_norm": 34.633541107177734, |
|
"learning_rate": 2.55e-06, |
|
"loss": 0.7297, |
|
"step": 17 |
|
}, |
|
{ |
|
"Batch Mean": 2.5652008056640625, |
|
"accuracy": 0.6484375, |
|
"epoch": 0.0425, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.045, |
|
"grad_norm": 41.46820068359375, |
|
"learning_rate": 2.7e-06, |
|
"loss": 0.7387, |
|
"step": 18 |
|
}, |
|
{ |
|
"Batch Mean": 2.4411849975585938, |
|
"accuracy": 0.75, |
|
"epoch": 0.045, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0475, |
|
"grad_norm": 42.9193000793457, |
|
"learning_rate": 2.85e-06, |
|
"loss": 0.6813, |
|
"step": 19 |
|
}, |
|
{ |
|
"Batch Mean": 2.1285018920898438, |
|
"accuracy": 0.703125, |
|
"epoch": 0.0475, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 37.16952133178711, |
|
"learning_rate": 3e-06, |
|
"loss": 0.672, |
|
"step": 20 |
|
}, |
|
{ |
|
"Batch Mean": 1.4230481386184692, |
|
"accuracy": 0.671875, |
|
"epoch": 0.05, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0525, |
|
"grad_norm": 27.2924747467041, |
|
"learning_rate": 2.992105263157895e-06, |
|
"loss": 0.647, |
|
"step": 21 |
|
}, |
|
{ |
|
"Batch Mean": 0.5414033532142639, |
|
"accuracy": 0.6953125, |
|
"epoch": 0.0525, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.055, |
|
"grad_norm": 11.927323341369629, |
|
"learning_rate": 2.9842105263157896e-06, |
|
"loss": 0.5981, |
|
"step": 22 |
|
}, |
|
{ |
|
"Batch Mean": -0.36795324087142944, |
|
"accuracy": 0.7265625, |
|
"epoch": 0.055, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0575, |
|
"grad_norm": 13.455949783325195, |
|
"learning_rate": 2.9763157894736843e-06, |
|
"loss": 0.5763, |
|
"step": 23 |
|
}, |
|
{ |
|
"Batch Mean": -1.0874066352844238, |
|
"accuracy": 0.7421875, |
|
"epoch": 0.0575, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 22.382274627685547, |
|
"learning_rate": 2.968421052631579e-06, |
|
"loss": 0.5763, |
|
"step": 24 |
|
}, |
|
{ |
|
"Batch Mean": -1.103028416633606, |
|
"accuracy": 0.6953125, |
|
"epoch": 0.06, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 19.854228973388672, |
|
"learning_rate": 2.960526315789474e-06, |
|
"loss": 0.6065, |
|
"step": 25 |
|
}, |
|
{ |
|
"Batch Mean": -0.8232302069664001, |
|
"accuracy": 0.734375, |
|
"epoch": 0.0625, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.065, |
|
"grad_norm": 16.642824172973633, |
|
"learning_rate": 2.9526315789473685e-06, |
|
"loss": 0.5282, |
|
"step": 26 |
|
}, |
|
{ |
|
"Batch Mean": -0.11966800689697266, |
|
"accuracy": 0.6875, |
|
"epoch": 0.065, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.0675, |
|
"grad_norm": 8.563494682312012, |
|
"learning_rate": 2.9447368421052633e-06, |
|
"loss": 0.5348, |
|
"step": 27 |
|
}, |
|
{ |
|
"Batch Mean": 0.5674030780792236, |
|
"accuracy": 0.671875, |
|
"epoch": 0.0675, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 13.211753845214844, |
|
"learning_rate": 2.936842105263158e-06, |
|
"loss": 0.5812, |
|
"step": 28 |
|
}, |
|
{ |
|
"Batch Mean": 1.1245126724243164, |
|
"accuracy": 0.6171875, |
|
"epoch": 0.07, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.0725, |
|
"grad_norm": 22.8548583984375, |
|
"learning_rate": 2.9289473684210528e-06, |
|
"loss": 0.7336, |
|
"step": 29 |
|
}, |
|
{ |
|
"Batch Mean": 1.2212319374084473, |
|
"accuracy": 0.6796875, |
|
"epoch": 0.0725, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.075, |
|
"grad_norm": 21.96833038330078, |
|
"learning_rate": 2.9210526315789475e-06, |
|
"loss": 0.6167, |
|
"step": 30 |
|
}, |
|
{ |
|
"Batch Mean": 0.7509702444076538, |
|
"accuracy": 0.796875, |
|
"epoch": 0.075, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0775, |
|
"grad_norm": 12.47424030303955, |
|
"learning_rate": 2.9131578947368423e-06, |
|
"loss": 0.463, |
|
"step": 31 |
|
}, |
|
{ |
|
"Batch Mean": 0.2412339150905609, |
|
"accuracy": 0.7578125, |
|
"epoch": 0.0775, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 9.769195556640625, |
|
"learning_rate": 2.905263157894737e-06, |
|
"loss": 0.5412, |
|
"step": 32 |
|
}, |
|
{ |
|
"Batch Mean": -0.5602660179138184, |
|
"accuracy": 0.7109375, |
|
"epoch": 0.08, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0825, |
|
"grad_norm": 11.323891639709473, |
|
"learning_rate": 2.8973684210526318e-06, |
|
"loss": 0.5601, |
|
"step": 33 |
|
}, |
|
{ |
|
"Batch Mean": -0.8669416904449463, |
|
"accuracy": 0.7734375, |
|
"epoch": 0.0825, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.085, |
|
"grad_norm": 15.461758613586426, |
|
"learning_rate": 2.8894736842105265e-06, |
|
"loss": 0.5147, |
|
"step": 34 |
|
}, |
|
{ |
|
"Batch Mean": -0.7635477781295776, |
|
"accuracy": 0.75, |
|
"epoch": 0.085, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0875, |
|
"grad_norm": 14.736496925354004, |
|
"learning_rate": 2.8815789473684213e-06, |
|
"loss": 0.5421, |
|
"step": 35 |
|
}, |
|
{ |
|
"Batch Mean": -0.5088728070259094, |
|
"accuracy": 0.734375, |
|
"epoch": 0.0875, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 11.982501029968262, |
|
"learning_rate": 2.873684210526316e-06, |
|
"loss": 0.5234, |
|
"step": 36 |
|
}, |
|
{ |
|
"Batch Mean": -0.03705340623855591, |
|
"accuracy": 0.7578125, |
|
"epoch": 0.09, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0925, |
|
"grad_norm": 7.854848861694336, |
|
"learning_rate": 2.8657894736842103e-06, |
|
"loss": 0.4996, |
|
"step": 37 |
|
}, |
|
{ |
|
"Batch Mean": 0.5426892042160034, |
|
"accuracy": 0.734375, |
|
"epoch": 0.0925, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.095, |
|
"grad_norm": 10.834790229797363, |
|
"learning_rate": 2.857894736842105e-06, |
|
"loss": 0.5187, |
|
"step": 38 |
|
}, |
|
{ |
|
"Batch Mean": 0.4622654914855957, |
|
"accuracy": 0.7734375, |
|
"epoch": 0.095, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0975, |
|
"grad_norm": 10.764225959777832, |
|
"learning_rate": 2.85e-06, |
|
"loss": 0.4723, |
|
"step": 39 |
|
}, |
|
{ |
|
"Batch Mean": 0.18012571334838867, |
|
"accuracy": 0.7890625, |
|
"epoch": 0.0975, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 9.500924110412598, |
|
"learning_rate": 2.8421052631578946e-06, |
|
"loss": 0.4545, |
|
"step": 40 |
|
}, |
|
{ |
|
"Batch Mean": -0.05992317199707031, |
|
"accuracy": 0.7890625, |
|
"epoch": 0.1, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1025, |
|
"grad_norm": 8.079994201660156, |
|
"learning_rate": 2.8342105263157897e-06, |
|
"loss": 0.49, |
|
"step": 41 |
|
}, |
|
{ |
|
"Batch Mean": 0.04392993450164795, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.1025, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.105, |
|
"grad_norm": 10.184918403625488, |
|
"learning_rate": 2.8263157894736845e-06, |
|
"loss": 0.3789, |
|
"step": 42 |
|
}, |
|
{ |
|
"Batch Mean": -0.2515716552734375, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.105, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.1075, |
|
"grad_norm": 9.894248008728027, |
|
"learning_rate": 2.8184210526315792e-06, |
|
"loss": 0.435, |
|
"step": 43 |
|
}, |
|
{ |
|
"Batch Mean": 0.15310335159301758, |
|
"accuracy": 0.7734375, |
|
"epoch": 0.1075, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 10.671575546264648, |
|
"learning_rate": 2.810526315789474e-06, |
|
"loss": 0.4291, |
|
"step": 44 |
|
}, |
|
{ |
|
"Batch Mean": 0.2563772201538086, |
|
"accuracy": 0.765625, |
|
"epoch": 0.11, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.1125, |
|
"grad_norm": 10.690945625305176, |
|
"learning_rate": 2.8026315789473687e-06, |
|
"loss": 0.4812, |
|
"step": 45 |
|
}, |
|
{ |
|
"Batch Mean": 0.046463966369628906, |
|
"accuracy": 0.875, |
|
"epoch": 0.1125, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.115, |
|
"grad_norm": 9.616545677185059, |
|
"learning_rate": 2.7947368421052635e-06, |
|
"loss": 0.3082, |
|
"step": 46 |
|
}, |
|
{ |
|
"Batch Mean": 0.11115455627441406, |
|
"accuracy": 0.75, |
|
"epoch": 0.115, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.1175, |
|
"grad_norm": 15.726456642150879, |
|
"learning_rate": 2.7868421052631578e-06, |
|
"loss": 0.5576, |
|
"step": 47 |
|
}, |
|
{ |
|
"Batch Mean": 0.2878689765930176, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1175, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 11.675898551940918, |
|
"learning_rate": 2.7789473684210525e-06, |
|
"loss": 0.472, |
|
"step": 48 |
|
}, |
|
{ |
|
"Batch Mean": -0.25928783416748047, |
|
"accuracy": 0.75, |
|
"epoch": 0.12, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.1225, |
|
"grad_norm": 15.394309997558594, |
|
"learning_rate": 2.7710526315789473e-06, |
|
"loss": 0.5011, |
|
"step": 49 |
|
}, |
|
{ |
|
"Batch Mean": -0.2885439097881317, |
|
"accuracy": 0.703125, |
|
"epoch": 0.1225, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 15.69314956665039, |
|
"learning_rate": 2.763157894736842e-06, |
|
"loss": 0.5974, |
|
"step": 50 |
|
}, |
|
{ |
|
"Batch Mean": -0.8389434814453125, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.125, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1275, |
|
"grad_norm": 16.744606018066406, |
|
"learning_rate": 2.7552631578947368e-06, |
|
"loss": 0.5151, |
|
"step": 51 |
|
}, |
|
{ |
|
"Batch Mean": -0.6685854196548462, |
|
"accuracy": 0.703125, |
|
"epoch": 0.1275, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 15.530059814453125, |
|
"learning_rate": 2.7473684210526315e-06, |
|
"loss": 0.5502, |
|
"step": 52 |
|
}, |
|
{ |
|
"Batch Mean": -0.5919044017791748, |
|
"accuracy": 0.71875, |
|
"epoch": 0.13, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.1325, |
|
"grad_norm": 15.14562702178955, |
|
"learning_rate": 2.7394736842105263e-06, |
|
"loss": 0.5305, |
|
"step": 53 |
|
}, |
|
{ |
|
"Batch Mean": -0.11876988410949707, |
|
"accuracy": 0.75, |
|
"epoch": 0.1325, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.135, |
|
"grad_norm": 7.011972427368164, |
|
"learning_rate": 2.7315789473684214e-06, |
|
"loss": 0.4802, |
|
"step": 54 |
|
}, |
|
{ |
|
"Batch Mean": 0.09675300121307373, |
|
"accuracy": 0.7578125, |
|
"epoch": 0.135, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.1375, |
|
"grad_norm": 8.37578010559082, |
|
"learning_rate": 2.723684210526316e-06, |
|
"loss": 0.4354, |
|
"step": 55 |
|
}, |
|
{ |
|
"Batch Mean": 0.3877286911010742, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.1375, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 8.380095481872559, |
|
"learning_rate": 2.715789473684211e-06, |
|
"loss": 0.4114, |
|
"step": 56 |
|
}, |
|
{ |
|
"Batch Mean": 0.7472854852676392, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.14, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.1425, |
|
"grad_norm": 12.794017791748047, |
|
"learning_rate": 2.7078947368421052e-06, |
|
"loss": 0.465, |
|
"step": 57 |
|
}, |
|
{ |
|
"Batch Mean": 0.5131087303161621, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.1425, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.145, |
|
"grad_norm": 8.367005348205566, |
|
"learning_rate": 2.7e-06, |
|
"loss": 0.4407, |
|
"step": 58 |
|
}, |
|
{ |
|
"Batch Mean": 0.5741372108459473, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.145, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.1475, |
|
"grad_norm": 9.066951751708984, |
|
"learning_rate": 2.6921052631578947e-06, |
|
"loss": 0.4468, |
|
"step": 59 |
|
}, |
|
{ |
|
"Batch Mean": 0.169053316116333, |
|
"accuracy": 0.734375, |
|
"epoch": 0.1475, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 6.888144016265869, |
|
"learning_rate": 2.6842105263157895e-06, |
|
"loss": 0.4849, |
|
"step": 60 |
|
}, |
|
{ |
|
"Batch Mean": -0.18010663986206055, |
|
"accuracy": 0.765625, |
|
"epoch": 0.15, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1525, |
|
"grad_norm": 6.846613883972168, |
|
"learning_rate": 2.6763157894736842e-06, |
|
"loss": 0.4643, |
|
"step": 61 |
|
}, |
|
{ |
|
"Batch Mean": -0.4085197448730469, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.1525, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.155, |
|
"grad_norm": 7.677695274353027, |
|
"learning_rate": 2.668421052631579e-06, |
|
"loss": 0.3926, |
|
"step": 62 |
|
}, |
|
{ |
|
"Batch Mean": -0.48792409896850586, |
|
"accuracy": 0.828125, |
|
"epoch": 0.155, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.1575, |
|
"grad_norm": 8.656214714050293, |
|
"learning_rate": 2.6605263157894737e-06, |
|
"loss": 0.4116, |
|
"step": 63 |
|
}, |
|
{ |
|
"Batch Mean": -0.19100165367126465, |
|
"accuracy": 0.7734375, |
|
"epoch": 0.1575, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 9.927976608276367, |
|
"learning_rate": 2.6526315789473685e-06, |
|
"loss": 0.4395, |
|
"step": 64 |
|
}, |
|
{ |
|
"Batch Mean": 0.025106430053710938, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.16, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.1625, |
|
"grad_norm": 8.97470474243164, |
|
"learning_rate": 2.644736842105263e-06, |
|
"loss": 0.4632, |
|
"step": 65 |
|
}, |
|
{ |
|
"Batch Mean": 0.4497801661491394, |
|
"accuracy": 0.796875, |
|
"epoch": 0.1625, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.165, |
|
"grad_norm": 11.160425186157227, |
|
"learning_rate": 2.636842105263158e-06, |
|
"loss": 0.5073, |
|
"step": 66 |
|
}, |
|
{ |
|
"Batch Mean": 0.6494235992431641, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.165, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.1675, |
|
"grad_norm": 11.464187622070312, |
|
"learning_rate": 2.6289473684210527e-06, |
|
"loss": 0.4467, |
|
"step": 67 |
|
}, |
|
{ |
|
"Batch Mean": -0.17097759246826172, |
|
"accuracy": 0.828125, |
|
"epoch": 0.1675, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 8.820326805114746, |
|
"learning_rate": 2.6210526315789474e-06, |
|
"loss": 0.4543, |
|
"step": 68 |
|
}, |
|
{ |
|
"Batch Mean": -0.1554449200630188, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.17, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.1725, |
|
"grad_norm": 9.248348236083984, |
|
"learning_rate": 2.613157894736842e-06, |
|
"loss": 0.3796, |
|
"step": 69 |
|
}, |
|
{ |
|
"Batch Mean": -0.21274375915527344, |
|
"accuracy": 0.75, |
|
"epoch": 0.1725, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.175, |
|
"grad_norm": 8.15241813659668, |
|
"learning_rate": 2.605263157894737e-06, |
|
"loss": 0.4445, |
|
"step": 70 |
|
}, |
|
{ |
|
"Batch Mean": -0.06371593475341797, |
|
"accuracy": 0.765625, |
|
"epoch": 0.175, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1775, |
|
"grad_norm": 6.543805122375488, |
|
"learning_rate": 2.5973684210526317e-06, |
|
"loss": 0.4376, |
|
"step": 71 |
|
}, |
|
{ |
|
"Batch Mean": -0.10743597149848938, |
|
"accuracy": 0.7265625, |
|
"epoch": 0.1775, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 7.594844341278076, |
|
"learning_rate": 2.5894736842105264e-06, |
|
"loss": 0.4363, |
|
"step": 72 |
|
}, |
|
{ |
|
"Batch Mean": 0.08045387268066406, |
|
"accuracy": 0.84375, |
|
"epoch": 0.18, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.1825, |
|
"grad_norm": 7.977687835693359, |
|
"learning_rate": 2.581578947368421e-06, |
|
"loss": 0.3616, |
|
"step": 73 |
|
}, |
|
{ |
|
"Batch Mean": -0.11765336990356445, |
|
"accuracy": 0.796875, |
|
"epoch": 0.1825, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.185, |
|
"grad_norm": 6.521571636199951, |
|
"learning_rate": 2.573684210526316e-06, |
|
"loss": 0.4001, |
|
"step": 74 |
|
}, |
|
{ |
|
"Batch Mean": 0.0607762336730957, |
|
"accuracy": 0.703125, |
|
"epoch": 0.185, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 8.15623950958252, |
|
"learning_rate": 2.5657894736842107e-06, |
|
"loss": 0.4941, |
|
"step": 75 |
|
}, |
|
{ |
|
"Batch Mean": 0.4910193681716919, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.1875, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 9.602405548095703, |
|
"learning_rate": 2.5578947368421054e-06, |
|
"loss": 0.4224, |
|
"step": 76 |
|
}, |
|
{ |
|
"Batch Mean": 0.06563162803649902, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.19, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.1925, |
|
"grad_norm": 6.577213764190674, |
|
"learning_rate": 2.55e-06, |
|
"loss": 0.4047, |
|
"step": 77 |
|
}, |
|
{ |
|
"Batch Mean": -0.08942317962646484, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.1925, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.195, |
|
"grad_norm": 16.202590942382812, |
|
"learning_rate": 2.542105263157895e-06, |
|
"loss": 0.4418, |
|
"step": 78 |
|
}, |
|
{ |
|
"Batch Mean": -0.37958288192749023, |
|
"accuracy": 0.8125, |
|
"epoch": 0.195, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.1975, |
|
"grad_norm": 23.45298194885254, |
|
"learning_rate": 2.5342105263157892e-06, |
|
"loss": 0.391, |
|
"step": 79 |
|
}, |
|
{ |
|
"Batch Mean": -0.5664916038513184, |
|
"accuracy": 0.8125, |
|
"epoch": 0.1975, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 10.679176330566406, |
|
"learning_rate": 2.526315789473684e-06, |
|
"loss": 0.4402, |
|
"step": 80 |
|
}, |
|
{ |
|
"Batch Mean": -0.31339550018310547, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.2, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2025, |
|
"grad_norm": 16.459426879882812, |
|
"learning_rate": 2.5184210526315787e-06, |
|
"loss": 0.3765, |
|
"step": 81 |
|
}, |
|
{ |
|
"Batch Mean": 0.20376300811767578, |
|
"accuracy": 0.78125, |
|
"epoch": 0.2025, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.205, |
|
"grad_norm": 8.887246131896973, |
|
"learning_rate": 2.510526315789474e-06, |
|
"loss": 0.4509, |
|
"step": 82 |
|
}, |
|
{ |
|
"Batch Mean": -0.054105013608932495, |
|
"accuracy": 0.796875, |
|
"epoch": 0.205, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.2075, |
|
"grad_norm": 8.116570472717285, |
|
"learning_rate": 2.5026315789473686e-06, |
|
"loss": 0.3906, |
|
"step": 83 |
|
}, |
|
{ |
|
"Batch Mean": 0.4383058547973633, |
|
"accuracy": 0.875, |
|
"epoch": 0.2075, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 9.300606727600098, |
|
"learning_rate": 2.4947368421052634e-06, |
|
"loss": 0.3099, |
|
"step": 84 |
|
}, |
|
{ |
|
"Batch Mean": 0.4242062568664551, |
|
"accuracy": 0.7890625, |
|
"epoch": 0.21, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.2125, |
|
"grad_norm": 8.7870512008667, |
|
"learning_rate": 2.486842105263158e-06, |
|
"loss": 0.3996, |
|
"step": 85 |
|
}, |
|
{ |
|
"Batch Mean": 0.22194945812225342, |
|
"accuracy": 0.78125, |
|
"epoch": 0.2125, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.215, |
|
"grad_norm": 8.691889762878418, |
|
"learning_rate": 2.478947368421053e-06, |
|
"loss": 0.4512, |
|
"step": 86 |
|
}, |
|
{ |
|
"Batch Mean": -0.14107918739318848, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.215, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.2175, |
|
"grad_norm": 7.0445146560668945, |
|
"learning_rate": 2.4710526315789476e-06, |
|
"loss": 0.3771, |
|
"step": 87 |
|
}, |
|
{ |
|
"Batch Mean": -0.35906583070755005, |
|
"accuracy": 0.765625, |
|
"epoch": 0.2175, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 9.609421730041504, |
|
"learning_rate": 2.4631578947368424e-06, |
|
"loss": 0.4558, |
|
"step": 88 |
|
}, |
|
{ |
|
"Batch Mean": -0.45313501358032227, |
|
"accuracy": 0.828125, |
|
"epoch": 0.22, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.2225, |
|
"grad_norm": 9.091865539550781, |
|
"learning_rate": 2.4552631578947367e-06, |
|
"loss": 0.4292, |
|
"step": 89 |
|
}, |
|
{ |
|
"Batch Mean": -0.35767626762390137, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.2225, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.225, |
|
"grad_norm": 7.811399459838867, |
|
"learning_rate": 2.4473684210526314e-06, |
|
"loss": 0.3823, |
|
"step": 90 |
|
}, |
|
{ |
|
"Batch Mean": 0.0022423267364501953, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.225, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2275, |
|
"grad_norm": 7.804200172424316, |
|
"learning_rate": 2.439473684210526e-06, |
|
"loss": 0.4079, |
|
"step": 91 |
|
}, |
|
{ |
|
"Batch Mean": 0.17678523063659668, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.2275, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 7.715360641479492, |
|
"learning_rate": 2.431578947368421e-06, |
|
"loss": 0.391, |
|
"step": 92 |
|
}, |
|
{ |
|
"Batch Mean": -0.22452354431152344, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.23, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.2325, |
|
"grad_norm": 7.197687149047852, |
|
"learning_rate": 2.4236842105263157e-06, |
|
"loss": 0.3864, |
|
"step": 93 |
|
}, |
|
{ |
|
"Batch Mean": 0.30264759063720703, |
|
"accuracy": 0.84375, |
|
"epoch": 0.2325, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.235, |
|
"grad_norm": 7.754411697387695, |
|
"learning_rate": 2.4157894736842104e-06, |
|
"loss": 0.3939, |
|
"step": 94 |
|
}, |
|
{ |
|
"Batch Mean": -0.2516747713088989, |
|
"accuracy": 0.7890625, |
|
"epoch": 0.235, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.2375, |
|
"grad_norm": 8.55639362335205, |
|
"learning_rate": 2.4078947368421056e-06, |
|
"loss": 0.4671, |
|
"step": 95 |
|
}, |
|
{ |
|
"Batch Mean": -0.15928316116333008, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.2375, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 7.463867664337158, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 0.471, |
|
"step": 96 |
|
}, |
|
{ |
|
"Batch Mean": -0.0638132095336914, |
|
"accuracy": 0.8125, |
|
"epoch": 0.24, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.2425, |
|
"grad_norm": 6.797260761260986, |
|
"learning_rate": 2.392105263157895e-06, |
|
"loss": 0.3892, |
|
"step": 97 |
|
}, |
|
{ |
|
"Batch Mean": -0.08082282543182373, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.2425, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.245, |
|
"grad_norm": 7.096107482910156, |
|
"learning_rate": 2.38421052631579e-06, |
|
"loss": 0.3942, |
|
"step": 98 |
|
}, |
|
{ |
|
"Batch Mean": 0.3185189962387085, |
|
"accuracy": 0.796875, |
|
"epoch": 0.245, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.2475, |
|
"grad_norm": 9.447708129882812, |
|
"learning_rate": 2.376315789473684e-06, |
|
"loss": 0.5037, |
|
"step": 99 |
|
}, |
|
{ |
|
"Batch Mean": 0.26651012897491455, |
|
"accuracy": 0.890625, |
|
"epoch": 0.2475, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 9.391190528869629, |
|
"learning_rate": 2.368421052631579e-06, |
|
"loss": 0.3352, |
|
"step": 100 |
|
}, |
|
{ |
|
"Batch Mean": -0.0040132105350494385, |
|
"accuracy": 0.7578125, |
|
"epoch": 0.25, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2525, |
|
"grad_norm": 7.999260425567627, |
|
"learning_rate": 2.3605263157894736e-06, |
|
"loss": 0.4583, |
|
"step": 101 |
|
}, |
|
{ |
|
"Batch Mean": 0.13753342628479004, |
|
"accuracy": 0.8125, |
|
"epoch": 0.2525, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.255, |
|
"grad_norm": 7.398046493530273, |
|
"learning_rate": 2.3526315789473684e-06, |
|
"loss": 0.3847, |
|
"step": 102 |
|
}, |
|
{ |
|
"Batch Mean": 0.15256333351135254, |
|
"accuracy": 0.8671875, |
|
"epoch": 0.255, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.2575, |
|
"grad_norm": 7.636544227600098, |
|
"learning_rate": 2.344736842105263e-06, |
|
"loss": 0.332, |
|
"step": 103 |
|
}, |
|
{ |
|
"Batch Mean": -0.2140512466430664, |
|
"accuracy": 0.75, |
|
"epoch": 0.2575, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 8.101052284240723, |
|
"learning_rate": 2.336842105263158e-06, |
|
"loss": 0.4434, |
|
"step": 104 |
|
}, |
|
{ |
|
"Batch Mean": -0.2993035316467285, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.26, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.2625, |
|
"grad_norm": 7.990828514099121, |
|
"learning_rate": 2.3289473684210526e-06, |
|
"loss": 0.3701, |
|
"step": 105 |
|
}, |
|
{ |
|
"Batch Mean": -0.0038167238235473633, |
|
"accuracy": 0.796875, |
|
"epoch": 0.2625, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.265, |
|
"grad_norm": 6.86246919631958, |
|
"learning_rate": 2.3210526315789473e-06, |
|
"loss": 0.3546, |
|
"step": 106 |
|
}, |
|
{ |
|
"Batch Mean": 0.4233437776565552, |
|
"accuracy": 0.796875, |
|
"epoch": 0.265, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.2675, |
|
"grad_norm": 9.27191162109375, |
|
"learning_rate": 2.313157894736842e-06, |
|
"loss": 0.414, |
|
"step": 107 |
|
}, |
|
{ |
|
"Batch Mean": 0.5853117108345032, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.2675, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 12.894753456115723, |
|
"learning_rate": 2.305263157894737e-06, |
|
"loss": 0.4258, |
|
"step": 108 |
|
}, |
|
{ |
|
"Batch Mean": 0.07786870002746582, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.27, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.2725, |
|
"grad_norm": 10.551774024963379, |
|
"learning_rate": 2.2973684210526316e-06, |
|
"loss": 0.455, |
|
"step": 109 |
|
}, |
|
{ |
|
"Batch Mean": 0.003017425537109375, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.2725, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.275, |
|
"grad_norm": 8.059015274047852, |
|
"learning_rate": 2.2894736842105263e-06, |
|
"loss": 0.3559, |
|
"step": 110 |
|
}, |
|
{ |
|
"Batch Mean": 0.08839225769042969, |
|
"accuracy": 0.859375, |
|
"epoch": 0.275, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2775, |
|
"grad_norm": 7.2062530517578125, |
|
"learning_rate": 2.281578947368421e-06, |
|
"loss": 0.2765, |
|
"step": 111 |
|
}, |
|
{ |
|
"Batch Mean": -0.4452195167541504, |
|
"accuracy": 0.890625, |
|
"epoch": 0.2775, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 9.328097343444824, |
|
"learning_rate": 2.273684210526316e-06, |
|
"loss": 0.2535, |
|
"step": 112 |
|
}, |
|
{ |
|
"Batch Mean": -0.2987861633300781, |
|
"accuracy": 0.828125, |
|
"epoch": 0.28, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.2825, |
|
"grad_norm": 10.32376766204834, |
|
"learning_rate": 2.2657894736842106e-06, |
|
"loss": 0.4152, |
|
"step": 113 |
|
}, |
|
{ |
|
"Batch Mean": -0.05189800262451172, |
|
"accuracy": 0.875, |
|
"epoch": 0.2825, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.285, |
|
"grad_norm": 7.734317779541016, |
|
"learning_rate": 2.2578947368421053e-06, |
|
"loss": 0.2874, |
|
"step": 114 |
|
}, |
|
{ |
|
"Batch Mean": 0.8154248595237732, |
|
"accuracy": 0.7578125, |
|
"epoch": 0.285, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.2875, |
|
"grad_norm": 16.42246437072754, |
|
"learning_rate": 2.25e-06, |
|
"loss": 0.517, |
|
"step": 115 |
|
}, |
|
{ |
|
"Batch Mean": 0.2764568328857422, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.2875, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 10.174060821533203, |
|
"learning_rate": 2.242105263157895e-06, |
|
"loss": 0.3843, |
|
"step": 116 |
|
}, |
|
{ |
|
"Batch Mean": -0.10035037994384766, |
|
"accuracy": 0.84375, |
|
"epoch": 0.29, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.2925, |
|
"grad_norm": 7.6288161277771, |
|
"learning_rate": 2.2342105263157895e-06, |
|
"loss": 0.3509, |
|
"step": 117 |
|
}, |
|
{ |
|
"Batch Mean": -0.22602462768554688, |
|
"accuracy": 0.84375, |
|
"epoch": 0.2925, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.295, |
|
"grad_norm": 8.613261222839355, |
|
"learning_rate": 2.2263157894736843e-06, |
|
"loss": 0.3731, |
|
"step": 118 |
|
}, |
|
{ |
|
"Batch Mean": -0.09805738925933838, |
|
"accuracy": 0.7265625, |
|
"epoch": 0.295, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.2975, |
|
"grad_norm": 10.688722610473633, |
|
"learning_rate": 2.218421052631579e-06, |
|
"loss": 0.4856, |
|
"step": 119 |
|
}, |
|
{ |
|
"Batch Mean": -0.21628332138061523, |
|
"accuracy": 0.8125, |
|
"epoch": 0.2975, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 7.811151027679443, |
|
"learning_rate": 2.2105263157894738e-06, |
|
"loss": 0.3613, |
|
"step": 120 |
|
}, |
|
{ |
|
"Batch Mean": -0.4057881832122803, |
|
"accuracy": 0.796875, |
|
"epoch": 0.3, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3025, |
|
"grad_norm": 8.88042163848877, |
|
"learning_rate": 2.2026315789473685e-06, |
|
"loss": 0.3821, |
|
"step": 121 |
|
}, |
|
{ |
|
"Batch Mean": -0.3958141803741455, |
|
"accuracy": 0.84375, |
|
"epoch": 0.3025, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.305, |
|
"grad_norm": 8.513867378234863, |
|
"learning_rate": 2.1947368421052633e-06, |
|
"loss": 0.3592, |
|
"step": 122 |
|
}, |
|
{ |
|
"Batch Mean": 0.11790943145751953, |
|
"accuracy": 0.8671875, |
|
"epoch": 0.305, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.3075, |
|
"grad_norm": 6.011488437652588, |
|
"learning_rate": 2.186842105263158e-06, |
|
"loss": 0.3249, |
|
"step": 123 |
|
}, |
|
{ |
|
"Batch Mean": 0.7885537147521973, |
|
"accuracy": 0.84375, |
|
"epoch": 0.3075, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 13.359249114990234, |
|
"learning_rate": 2.1789473684210528e-06, |
|
"loss": 0.3776, |
|
"step": 124 |
|
}, |
|
{ |
|
"Batch Mean": 0.906917929649353, |
|
"accuracy": 0.8125, |
|
"epoch": 0.31, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 13.69727897644043, |
|
"learning_rate": 2.1710526315789475e-06, |
|
"loss": 0.4525, |
|
"step": 125 |
|
}, |
|
{ |
|
"Batch Mean": 0.7685656547546387, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.3125, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.315, |
|
"grad_norm": 13.353426933288574, |
|
"learning_rate": 2.1631578947368423e-06, |
|
"loss": 0.3873, |
|
"step": 126 |
|
}, |
|
{ |
|
"Batch Mean": 0.31209874153137207, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.315, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.3175, |
|
"grad_norm": 8.501708030700684, |
|
"learning_rate": 2.155263157894737e-06, |
|
"loss": 0.3643, |
|
"step": 127 |
|
}, |
|
{ |
|
"Batch Mean": -0.0774112343788147, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.3175, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 6.79693078994751, |
|
"learning_rate": 2.1473684210526317e-06, |
|
"loss": 0.3361, |
|
"step": 128 |
|
}, |
|
{ |
|
"Batch Mean": -1.1531860828399658, |
|
"accuracy": 0.875, |
|
"epoch": 0.32, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.3225, |
|
"grad_norm": 18.78434181213379, |
|
"learning_rate": 2.1394736842105265e-06, |
|
"loss": 0.381, |
|
"step": 129 |
|
}, |
|
{ |
|
"Batch Mean": -0.9209874868392944, |
|
"accuracy": 0.828125, |
|
"epoch": 0.3225, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.325, |
|
"grad_norm": 15.549153327941895, |
|
"learning_rate": 2.1315789473684212e-06, |
|
"loss": 0.352, |
|
"step": 130 |
|
}, |
|
{ |
|
"Batch Mean": -0.5993038415908813, |
|
"accuracy": 0.7734375, |
|
"epoch": 0.325, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3275, |
|
"grad_norm": 11.017681121826172, |
|
"learning_rate": 2.123684210526316e-06, |
|
"loss": 0.4731, |
|
"step": 131 |
|
}, |
|
{ |
|
"Batch Mean": -0.5436639785766602, |
|
"accuracy": 0.859375, |
|
"epoch": 0.3275, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 9.474353790283203, |
|
"learning_rate": 2.1157894736842103e-06, |
|
"loss": 0.3224, |
|
"step": 132 |
|
}, |
|
{ |
|
"Batch Mean": 0.03529167175292969, |
|
"accuracy": 0.7890625, |
|
"epoch": 0.33, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.3325, |
|
"grad_norm": 9.17381763458252, |
|
"learning_rate": 2.107894736842105e-06, |
|
"loss": 0.4091, |
|
"step": 133 |
|
}, |
|
{ |
|
"Batch Mean": 0.37068402767181396, |
|
"accuracy": 0.7890625, |
|
"epoch": 0.3325, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.335, |
|
"grad_norm": 8.768671035766602, |
|
"learning_rate": 2.1e-06, |
|
"loss": 0.4094, |
|
"step": 134 |
|
}, |
|
{ |
|
"Batch Mean": 0.8418741226196289, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.335, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.3375, |
|
"grad_norm": 13.19222354888916, |
|
"learning_rate": 2.0921052631578945e-06, |
|
"loss": 0.4361, |
|
"step": 135 |
|
}, |
|
{ |
|
"Batch Mean": 0.38561856746673584, |
|
"accuracy": 0.828125, |
|
"epoch": 0.3375, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 7.658647537231445, |
|
"learning_rate": 2.0842105263157897e-06, |
|
"loss": 0.3619, |
|
"step": 136 |
|
}, |
|
{ |
|
"Batch Mean": 0.24846503138542175, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.34, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.3425, |
|
"grad_norm": 6.886051654815674, |
|
"learning_rate": 2.0763157894736845e-06, |
|
"loss": 0.3778, |
|
"step": 137 |
|
}, |
|
{ |
|
"Batch Mean": -0.3311891555786133, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.3425, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.345, |
|
"grad_norm": 7.944254398345947, |
|
"learning_rate": 2.068421052631579e-06, |
|
"loss": 0.3948, |
|
"step": 138 |
|
}, |
|
{ |
|
"Batch Mean": -0.568354606628418, |
|
"accuracy": 0.84375, |
|
"epoch": 0.345, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.3475, |
|
"grad_norm": 9.763813018798828, |
|
"learning_rate": 2.060526315789474e-06, |
|
"loss": 0.3355, |
|
"step": 139 |
|
}, |
|
{ |
|
"Batch Mean": -0.39496636390686035, |
|
"accuracy": 0.8671875, |
|
"epoch": 0.3475, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 7.516551971435547, |
|
"learning_rate": 2.0526315789473687e-06, |
|
"loss": 0.3501, |
|
"step": 140 |
|
}, |
|
{ |
|
"Batch Mean": -0.15581655502319336, |
|
"accuracy": 0.828125, |
|
"epoch": 0.35, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3525, |
|
"grad_norm": 8.155888557434082, |
|
"learning_rate": 2.0447368421052634e-06, |
|
"loss": 0.3773, |
|
"step": 141 |
|
}, |
|
{ |
|
"Batch Mean": 0.44981956481933594, |
|
"accuracy": 0.7578125, |
|
"epoch": 0.3525, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.355, |
|
"grad_norm": 9.395730018615723, |
|
"learning_rate": 2.0368421052631578e-06, |
|
"loss": 0.4603, |
|
"step": 142 |
|
}, |
|
{ |
|
"Batch Mean": 0.33932405710220337, |
|
"accuracy": 0.8125, |
|
"epoch": 0.355, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.3575, |
|
"grad_norm": 8.501273155212402, |
|
"learning_rate": 2.0289473684210525e-06, |
|
"loss": 0.4167, |
|
"step": 143 |
|
}, |
|
{ |
|
"Batch Mean": 0.31684207916259766, |
|
"accuracy": 0.875, |
|
"epoch": 0.3575, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 7.186644554138184, |
|
"learning_rate": 2.0210526315789473e-06, |
|
"loss": 0.3359, |
|
"step": 144 |
|
}, |
|
{ |
|
"Batch Mean": 0.14882159233093262, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.36, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.3625, |
|
"grad_norm": 7.065666675567627, |
|
"learning_rate": 2.013157894736842e-06, |
|
"loss": 0.3773, |
|
"step": 145 |
|
}, |
|
{ |
|
"Batch Mean": 0.010605573654174805, |
|
"accuracy": 0.8671875, |
|
"epoch": 0.3625, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.365, |
|
"grad_norm": 6.343169212341309, |
|
"learning_rate": 2.0052631578947367e-06, |
|
"loss": 0.3441, |
|
"step": 146 |
|
}, |
|
{ |
|
"Batch Mean": 0.13873529434204102, |
|
"accuracy": 0.828125, |
|
"epoch": 0.365, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.3675, |
|
"grad_norm": 7.586203098297119, |
|
"learning_rate": 1.9973684210526315e-06, |
|
"loss": 0.3723, |
|
"step": 147 |
|
}, |
|
{ |
|
"Batch Mean": -0.6648737192153931, |
|
"accuracy": 0.7890625, |
|
"epoch": 0.3675, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 11.366660118103027, |
|
"learning_rate": 1.9894736842105262e-06, |
|
"loss": 0.4372, |
|
"step": 148 |
|
}, |
|
{ |
|
"Batch Mean": -0.3633718490600586, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.37, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.3725, |
|
"grad_norm": 7.831110000610352, |
|
"learning_rate": 1.9815789473684214e-06, |
|
"loss": 0.4074, |
|
"step": 149 |
|
}, |
|
{ |
|
"Batch Mean": -0.12644743919372559, |
|
"accuracy": 0.8828125, |
|
"epoch": 0.3725, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 6.3676371574401855, |
|
"learning_rate": 1.973684210526316e-06, |
|
"loss": 0.304, |
|
"step": 150 |
|
}, |
|
{ |
|
"Batch Mean": 0.36059999465942383, |
|
"accuracy": 0.8125, |
|
"epoch": 0.375, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3775, |
|
"grad_norm": 8.31554889678955, |
|
"learning_rate": 1.965789473684211e-06, |
|
"loss": 0.3811, |
|
"step": 151 |
|
}, |
|
{ |
|
"Batch Mean": 0.3979158401489258, |
|
"accuracy": 0.8125, |
|
"epoch": 0.3775, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 9.095989227294922, |
|
"learning_rate": 1.9578947368421052e-06, |
|
"loss": 0.4376, |
|
"step": 152 |
|
}, |
|
{ |
|
"Batch Mean": 0.443440318107605, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.38, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.3825, |
|
"grad_norm": 8.482221603393555, |
|
"learning_rate": 1.95e-06, |
|
"loss": 0.3351, |
|
"step": 153 |
|
}, |
|
{ |
|
"Batch Mean": 0.3431110382080078, |
|
"accuracy": 0.828125, |
|
"epoch": 0.3825, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.385, |
|
"grad_norm": 8.352450370788574, |
|
"learning_rate": 1.9421052631578947e-06, |
|
"loss": 0.3627, |
|
"step": 154 |
|
}, |
|
{ |
|
"Batch Mean": 0.17504891753196716, |
|
"accuracy": 0.8828125, |
|
"epoch": 0.385, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.3875, |
|
"grad_norm": 6.995707035064697, |
|
"learning_rate": 1.9342105263157895e-06, |
|
"loss": 0.323, |
|
"step": 155 |
|
}, |
|
{ |
|
"Batch Mean": -0.1753711700439453, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.3875, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 7.297128677368164, |
|
"learning_rate": 1.926315789473684e-06, |
|
"loss": 0.3426, |
|
"step": 156 |
|
}, |
|
{ |
|
"Batch Mean": -0.17636048793792725, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.39, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.3925, |
|
"grad_norm": 6.796414375305176, |
|
"learning_rate": 1.918421052631579e-06, |
|
"loss": 0.3263, |
|
"step": 157 |
|
}, |
|
{ |
|
"Batch Mean": -0.7188338041305542, |
|
"accuracy": 0.875, |
|
"epoch": 0.3925, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.395, |
|
"grad_norm": 10.371746063232422, |
|
"learning_rate": 1.9105263157894737e-06, |
|
"loss": 0.3046, |
|
"step": 158 |
|
}, |
|
{ |
|
"Batch Mean": -0.5050258636474609, |
|
"accuracy": 0.8671875, |
|
"epoch": 0.395, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.3975, |
|
"grad_norm": 8.59885311126709, |
|
"learning_rate": 1.9026315789473684e-06, |
|
"loss": 0.3257, |
|
"step": 159 |
|
}, |
|
{ |
|
"Batch Mean": 0.15757006406784058, |
|
"accuracy": 0.875, |
|
"epoch": 0.3975, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 6.61456298828125, |
|
"learning_rate": 1.8947368421052632e-06, |
|
"loss": 0.2748, |
|
"step": 160 |
|
}, |
|
{ |
|
"Batch Mean": 0.4738607406616211, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.4, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4025, |
|
"grad_norm": 9.277711868286133, |
|
"learning_rate": 1.8868421052631577e-06, |
|
"loss": 0.3558, |
|
"step": 161 |
|
}, |
|
{ |
|
"Batch Mean": 0.9098443984985352, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.4025, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.405, |
|
"grad_norm": 14.112342834472656, |
|
"learning_rate": 1.8789473684210525e-06, |
|
"loss": 0.3479, |
|
"step": 162 |
|
}, |
|
{ |
|
"Batch Mean": -0.09790003299713135, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.405, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.4075, |
|
"grad_norm": 7.988643169403076, |
|
"learning_rate": 1.8710526315789476e-06, |
|
"loss": 0.3329, |
|
"step": 163 |
|
}, |
|
{ |
|
"Batch Mean": -0.3589191436767578, |
|
"accuracy": 0.875, |
|
"epoch": 0.4075, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 8.374390602111816, |
|
"learning_rate": 1.8631578947368424e-06, |
|
"loss": 0.3404, |
|
"step": 164 |
|
}, |
|
{ |
|
"Batch Mean": -0.2895526885986328, |
|
"accuracy": 0.8671875, |
|
"epoch": 0.41, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.4125, |
|
"grad_norm": 10.125029563903809, |
|
"learning_rate": 1.855263157894737e-06, |
|
"loss": 0.3828, |
|
"step": 165 |
|
}, |
|
{ |
|
"Batch Mean": -0.7614960670471191, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.4125, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.415, |
|
"grad_norm": 12.611261367797852, |
|
"learning_rate": 1.8473684210526317e-06, |
|
"loss": 0.3182, |
|
"step": 166 |
|
}, |
|
{ |
|
"Batch Mean": 0.047823190689086914, |
|
"accuracy": 0.84375, |
|
"epoch": 0.415, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.4175, |
|
"grad_norm": 7.5092644691467285, |
|
"learning_rate": 1.8394736842105264e-06, |
|
"loss": 0.3277, |
|
"step": 167 |
|
}, |
|
{ |
|
"Batch Mean": 0.3487553596496582, |
|
"accuracy": 0.8828125, |
|
"epoch": 0.4175, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 7.762388229370117, |
|
"learning_rate": 1.8315789473684211e-06, |
|
"loss": 0.3021, |
|
"step": 168 |
|
}, |
|
{ |
|
"Batch Mean": -0.34416115283966064, |
|
"accuracy": 0.7890625, |
|
"epoch": 0.42, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.4225, |
|
"grad_norm": 9.338401794433594, |
|
"learning_rate": 1.8236842105263159e-06, |
|
"loss": 0.4031, |
|
"step": 169 |
|
}, |
|
{ |
|
"Batch Mean": 0.013556480407714844, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.4225, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.425, |
|
"grad_norm": 8.208139419555664, |
|
"learning_rate": 1.8157894736842106e-06, |
|
"loss": 0.367, |
|
"step": 170 |
|
}, |
|
{ |
|
"Batch Mean": -0.1292668581008911, |
|
"accuracy": 0.8828125, |
|
"epoch": 0.425, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4275, |
|
"grad_norm": 6.575423717498779, |
|
"learning_rate": 1.8078947368421052e-06, |
|
"loss": 0.2778, |
|
"step": 171 |
|
}, |
|
{ |
|
"Batch Mean": -0.5196864604949951, |
|
"accuracy": 0.828125, |
|
"epoch": 0.4275, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 9.9501953125, |
|
"learning_rate": 1.8e-06, |
|
"loss": 0.388, |
|
"step": 172 |
|
}, |
|
{ |
|
"Batch Mean": -0.10827267169952393, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.43, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.4325, |
|
"grad_norm": 6.746122360229492, |
|
"learning_rate": 1.7921052631578947e-06, |
|
"loss": 0.3337, |
|
"step": 173 |
|
}, |
|
{ |
|
"Batch Mean": -0.20117001235485077, |
|
"accuracy": 0.859375, |
|
"epoch": 0.4325, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.435, |
|
"grad_norm": 7.591915130615234, |
|
"learning_rate": 1.7842105263157894e-06, |
|
"loss": 0.3576, |
|
"step": 174 |
|
}, |
|
{ |
|
"Batch Mean": 0.4478938579559326, |
|
"accuracy": 0.8125, |
|
"epoch": 0.435, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 8.740865707397461, |
|
"learning_rate": 1.7763157894736842e-06, |
|
"loss": 0.3836, |
|
"step": 175 |
|
}, |
|
{ |
|
"Batch Mean": 0.1370692253112793, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.4375, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 6.480626583099365, |
|
"learning_rate": 1.768421052631579e-06, |
|
"loss": 0.3648, |
|
"step": 176 |
|
}, |
|
{ |
|
"Batch Mean": 0.47721147537231445, |
|
"accuracy": 0.859375, |
|
"epoch": 0.44, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.4425, |
|
"grad_norm": 9.018311500549316, |
|
"learning_rate": 1.7605263157894739e-06, |
|
"loss": 0.4128, |
|
"step": 177 |
|
}, |
|
{ |
|
"Batch Mean": 0.13465499877929688, |
|
"accuracy": 0.8671875, |
|
"epoch": 0.4425, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.445, |
|
"grad_norm": 6.7764973640441895, |
|
"learning_rate": 1.7526315789473686e-06, |
|
"loss": 0.3124, |
|
"step": 178 |
|
}, |
|
{ |
|
"Batch Mean": 0.25302886962890625, |
|
"accuracy": 0.8671875, |
|
"epoch": 0.445, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.4475, |
|
"grad_norm": 7.894146919250488, |
|
"learning_rate": 1.7447368421052633e-06, |
|
"loss": 0.3335, |
|
"step": 179 |
|
}, |
|
{ |
|
"Batch Mean": -0.5794901847839355, |
|
"accuracy": 0.7734375, |
|
"epoch": 0.4475, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 8.826544761657715, |
|
"learning_rate": 1.736842105263158e-06, |
|
"loss": 0.3982, |
|
"step": 180 |
|
}, |
|
{ |
|
"Batch Mean": 0.16424822807312012, |
|
"accuracy": 0.84375, |
|
"epoch": 0.45, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4525, |
|
"grad_norm": 5.991344451904297, |
|
"learning_rate": 1.7289473684210526e-06, |
|
"loss": 0.3408, |
|
"step": 181 |
|
}, |
|
{ |
|
"Batch Mean": -0.13757801055908203, |
|
"accuracy": 0.8671875, |
|
"epoch": 0.4525, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.455, |
|
"grad_norm": 5.649535179138184, |
|
"learning_rate": 1.7210526315789474e-06, |
|
"loss": 0.3081, |
|
"step": 182 |
|
}, |
|
{ |
|
"Batch Mean": 0.13538789749145508, |
|
"accuracy": 0.78125, |
|
"epoch": 0.455, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.4575, |
|
"grad_norm": 7.989570140838623, |
|
"learning_rate": 1.7131578947368421e-06, |
|
"loss": 0.4532, |
|
"step": 183 |
|
}, |
|
{ |
|
"Batch Mean": -0.09452962875366211, |
|
"accuracy": 0.796875, |
|
"epoch": 0.4575, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 6.7350616455078125, |
|
"learning_rate": 1.7052631578947369e-06, |
|
"loss": 0.3937, |
|
"step": 184 |
|
}, |
|
{ |
|
"Batch Mean": -0.19620466232299805, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.46, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.4625, |
|
"grad_norm": 32.979637145996094, |
|
"learning_rate": 1.6973684210526316e-06, |
|
"loss": 0.4651, |
|
"step": 185 |
|
}, |
|
{ |
|
"Batch Mean": 0.203094482421875, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.4625, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.465, |
|
"grad_norm": 6.455933570861816, |
|
"learning_rate": 1.6894736842105264e-06, |
|
"loss": 0.3405, |
|
"step": 186 |
|
}, |
|
{ |
|
"Batch Mean": 0.39686131477355957, |
|
"accuracy": 0.875, |
|
"epoch": 0.465, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.4675, |
|
"grad_norm": 6.76831579208374, |
|
"learning_rate": 1.6815789473684209e-06, |
|
"loss": 0.3486, |
|
"step": 187 |
|
}, |
|
{ |
|
"Batch Mean": -0.008591651916503906, |
|
"accuracy": 0.859375, |
|
"epoch": 0.4675, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 5.692453861236572, |
|
"learning_rate": 1.6736842105263156e-06, |
|
"loss": 0.3288, |
|
"step": 188 |
|
}, |
|
{ |
|
"Batch Mean": -0.36408185958862305, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.47, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.4725, |
|
"grad_norm": 7.711721897125244, |
|
"learning_rate": 1.6657894736842104e-06, |
|
"loss": 0.4075, |
|
"step": 189 |
|
}, |
|
{ |
|
"Batch Mean": -0.4406242370605469, |
|
"accuracy": 0.890625, |
|
"epoch": 0.4725, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.475, |
|
"grad_norm": 7.3570756912231445, |
|
"learning_rate": 1.6578947368421056e-06, |
|
"loss": 0.3025, |
|
"step": 190 |
|
}, |
|
{ |
|
"Batch Mean": 0.1726367473602295, |
|
"accuracy": 0.8125, |
|
"epoch": 0.475, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4775, |
|
"grad_norm": 7.980853080749512, |
|
"learning_rate": 1.65e-06, |
|
"loss": 0.3876, |
|
"step": 191 |
|
}, |
|
{ |
|
"Batch Mean": -0.0961066484451294, |
|
"accuracy": 0.90625, |
|
"epoch": 0.4775, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 5.444026947021484, |
|
"learning_rate": 1.6421052631578948e-06, |
|
"loss": 0.2777, |
|
"step": 192 |
|
}, |
|
{ |
|
"Batch Mean": -0.12086796760559082, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.48, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.4825, |
|
"grad_norm": 5.993566989898682, |
|
"learning_rate": 1.6342105263157896e-06, |
|
"loss": 0.3327, |
|
"step": 193 |
|
}, |
|
{ |
|
"Batch Mean": 0.5805015563964844, |
|
"accuracy": 0.84375, |
|
"epoch": 0.4825, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.485, |
|
"grad_norm": 9.61883544921875, |
|
"learning_rate": 1.6263157894736843e-06, |
|
"loss": 0.3707, |
|
"step": 194 |
|
}, |
|
{ |
|
"Batch Mean": 0.8935675024986267, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.485, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.4875, |
|
"grad_norm": 12.503327369689941, |
|
"learning_rate": 1.618421052631579e-06, |
|
"loss": 0.3951, |
|
"step": 195 |
|
}, |
|
{ |
|
"Batch Mean": 0.2793365716934204, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.4875, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 7.763684272766113, |
|
"learning_rate": 1.6105263157894738e-06, |
|
"loss": 0.3955, |
|
"step": 196 |
|
}, |
|
{ |
|
"Batch Mean": -0.14763522148132324, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.49, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.4925, |
|
"grad_norm": 7.21512508392334, |
|
"learning_rate": 1.6026315789473683e-06, |
|
"loss": 0.3188, |
|
"step": 197 |
|
}, |
|
{ |
|
"Batch Mean": -0.5799511671066284, |
|
"accuracy": 0.8671875, |
|
"epoch": 0.4925, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.495, |
|
"grad_norm": 8.71732234954834, |
|
"learning_rate": 1.594736842105263e-06, |
|
"loss": 0.3139, |
|
"step": 198 |
|
}, |
|
{ |
|
"Batch Mean": -0.46822547912597656, |
|
"accuracy": 0.890625, |
|
"epoch": 0.495, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.4975, |
|
"grad_norm": 7.613549709320068, |
|
"learning_rate": 1.5868421052631578e-06, |
|
"loss": 0.2978, |
|
"step": 199 |
|
}, |
|
{ |
|
"Batch Mean": -0.370266854763031, |
|
"accuracy": 0.875, |
|
"epoch": 0.4975, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 7.669687271118164, |
|
"learning_rate": 1.5789473684210526e-06, |
|
"loss": 0.2982, |
|
"step": 200 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|