{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.25, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "Batch Mean": -3.4444580078125, "accuracy": 0.40625, "epoch": 0, "step": 0 }, { "epoch": 0.0025, "grad_norm": 43.31455612182617, "learning_rate": 1.5000000000000002e-07, "loss": 0.9077, "step": 1 }, { "Batch Mean": -3.43255615234375, "accuracy": 0.4921875, "epoch": 0.0025, "step": 1 }, { "epoch": 0.005, "grad_norm": 50.06682205200195, "learning_rate": 3.0000000000000004e-07, "loss": 0.8907, "step": 2 }, { "Batch Mean": -3.44775390625, "accuracy": 0.453125, "epoch": 0.005, "step": 2 }, { "epoch": 0.0075, "grad_norm": 47.389522552490234, "learning_rate": 4.5e-07, "loss": 0.8899, "step": 3 }, { "Batch Mean": -3.4224853515625, "accuracy": 0.4375, "epoch": 0.0075, "step": 3 }, { "epoch": 0.01, "grad_norm": 49.24280548095703, "learning_rate": 6.000000000000001e-07, "loss": 0.8833, "step": 4 }, { "Batch Mean": -3.39697265625, "accuracy": 0.5234375, "epoch": 0.01, "step": 4 }, { "epoch": 0.0125, "grad_norm": 44.11930847167969, "learning_rate": 7.5e-07, "loss": 0.8661, "step": 5 }, { "Batch Mean": -3.38104248046875, "accuracy": 0.5078125, "epoch": 0.0125, "step": 5 }, { "epoch": 0.015, "grad_norm": 44.9741325378418, "learning_rate": 9e-07, "loss": 0.8805, "step": 6 }, { "Batch Mean": -3.19671630859375, "accuracy": 0.5078125, "epoch": 0.015, "step": 6 }, { "epoch": 0.0175, "grad_norm": 42.32276153564453, "learning_rate": 1.05e-06, "loss": 0.8547, "step": 7 }, { "Batch Mean": -3.1279296875, "accuracy": 0.4921875, "epoch": 0.0175, "step": 7 }, { "epoch": 0.02, "grad_norm": 44.22520065307617, "learning_rate": 1.2000000000000002e-06, "loss": 0.8464, "step": 8 }, { "Batch Mean": -2.520538330078125, "accuracy": 0.5, "epoch": 0.02, "step": 8 }, { "epoch": 0.0225, "grad_norm": 36.41325378417969, "learning_rate": 1.35e-06, "loss": 0.7946, "step": 9 }, { "Batch Mean": -2.413787841796875, "accuracy": 0.515625, "epoch": 0.0225, "step": 9 }, { "epoch": 0.025, "grad_norm": 31.74874496459961, "learning_rate": 1.5e-06, "loss": 0.7939, "step": 10 }, { "Batch Mean": -2.218231201171875, "accuracy": 0.4921875, "epoch": 0.025, "step": 10 }, { "epoch": 0.0275, "grad_norm": 34.12691879272461, "learning_rate": 1.65e-06, "loss": 0.7505, "step": 11 }, { "Batch Mean": -0.2393415868282318, "accuracy": 0.53125, "epoch": 0.0275, "step": 11 }, { "epoch": 0.03, "grad_norm": 10.481990814208984, "learning_rate": 1.8e-06, "loss": 0.6779, "step": 12 }, { "Batch Mean": 0.2733480930328369, "accuracy": 0.5703125, "epoch": 0.03, "step": 12 }, { "epoch": 0.0325, "grad_norm": 8.39962387084961, "learning_rate": 1.95e-06, "loss": 0.6728, "step": 13 }, { "Batch Mean": 0.5041149854660034, "accuracy": 0.671875, "epoch": 0.0325, "step": 13 }, { "epoch": 0.035, "grad_norm": 15.906760215759277, "learning_rate": 2.1e-06, "loss": 0.6679, "step": 14 }, { "Batch Mean": 0.8821840286254883, "accuracy": 0.578125, "epoch": 0.035, "step": 14 }, { "epoch": 0.0375, "grad_norm": 19.03310775756836, "learning_rate": 2.25e-06, "loss": 0.7024, "step": 15 }, { "Batch Mean": 1.1891746520996094, "accuracy": 0.5859375, "epoch": 0.0375, "step": 15 }, { "epoch": 0.04, "grad_norm": 20.89366340637207, "learning_rate": 2.4000000000000003e-06, "loss": 0.6868, "step": 16 }, { "Batch Mean": 2.1533203125, "accuracy": 0.671875, "epoch": 0.04, "step": 16 }, { "epoch": 0.0425, "grad_norm": 34.633541107177734, "learning_rate": 2.55e-06, "loss": 0.7297, "step": 17 }, { "Batch Mean": 2.5652008056640625, "accuracy": 0.6484375, "epoch": 0.0425, "step": 17 }, { "epoch": 0.045, "grad_norm": 41.46820068359375, "learning_rate": 2.7e-06, "loss": 0.7387, "step": 18 }, { "Batch Mean": 2.4411849975585938, "accuracy": 0.75, "epoch": 0.045, "step": 18 }, { "epoch": 0.0475, "grad_norm": 42.9193000793457, "learning_rate": 2.85e-06, "loss": 0.6813, "step": 19 }, { "Batch Mean": 2.1285018920898438, "accuracy": 0.703125, "epoch": 0.0475, "step": 19 }, { "epoch": 0.05, "grad_norm": 37.16952133178711, "learning_rate": 3e-06, "loss": 0.672, "step": 20 }, { "Batch Mean": 1.4230481386184692, "accuracy": 0.671875, "epoch": 0.05, "step": 20 }, { "epoch": 0.0525, "grad_norm": 27.2924747467041, "learning_rate": 2.992105263157895e-06, "loss": 0.647, "step": 21 }, { "Batch Mean": 0.5414033532142639, "accuracy": 0.6953125, "epoch": 0.0525, "step": 21 }, { "epoch": 0.055, "grad_norm": 11.927323341369629, "learning_rate": 2.9842105263157896e-06, "loss": 0.5981, "step": 22 }, { "Batch Mean": -0.36795324087142944, "accuracy": 0.7265625, "epoch": 0.055, "step": 22 }, { "epoch": 0.0575, "grad_norm": 13.455949783325195, "learning_rate": 2.9763157894736843e-06, "loss": 0.5763, "step": 23 }, { "Batch Mean": -1.0874066352844238, "accuracy": 0.7421875, "epoch": 0.0575, "step": 23 }, { "epoch": 0.06, "grad_norm": 22.382274627685547, "learning_rate": 2.968421052631579e-06, "loss": 0.5763, "step": 24 }, { "Batch Mean": -1.103028416633606, "accuracy": 0.6953125, "epoch": 0.06, "step": 24 }, { "epoch": 0.0625, "grad_norm": 19.854228973388672, "learning_rate": 2.960526315789474e-06, "loss": 0.6065, "step": 25 }, { "Batch Mean": -0.8232302069664001, "accuracy": 0.734375, "epoch": 0.0625, "step": 25 }, { "epoch": 0.065, "grad_norm": 16.642824172973633, "learning_rate": 2.9526315789473685e-06, "loss": 0.5282, "step": 26 }, { "Batch Mean": -0.11966800689697266, "accuracy": 0.6875, "epoch": 0.065, "step": 26 }, { "epoch": 0.0675, "grad_norm": 8.563494682312012, "learning_rate": 2.9447368421052633e-06, "loss": 0.5348, "step": 27 }, { "Batch Mean": 0.5674030780792236, "accuracy": 0.671875, "epoch": 0.0675, "step": 27 }, { "epoch": 0.07, "grad_norm": 13.211753845214844, "learning_rate": 2.936842105263158e-06, "loss": 0.5812, "step": 28 }, { "Batch Mean": 1.1245126724243164, "accuracy": 0.6171875, "epoch": 0.07, "step": 28 }, { "epoch": 0.0725, "grad_norm": 22.8548583984375, "learning_rate": 2.9289473684210528e-06, "loss": 0.7336, "step": 29 }, { "Batch Mean": 1.2212319374084473, "accuracy": 0.6796875, "epoch": 0.0725, "step": 29 }, { "epoch": 0.075, "grad_norm": 21.96833038330078, "learning_rate": 2.9210526315789475e-06, "loss": 0.6167, "step": 30 }, { "Batch Mean": 0.7509702444076538, "accuracy": 0.796875, "epoch": 0.075, "step": 30 }, { "epoch": 0.0775, "grad_norm": 12.47424030303955, "learning_rate": 2.9131578947368423e-06, "loss": 0.463, "step": 31 }, { "Batch Mean": 0.2412339150905609, "accuracy": 0.7578125, "epoch": 0.0775, "step": 31 }, { "epoch": 0.08, "grad_norm": 9.769195556640625, "learning_rate": 2.905263157894737e-06, "loss": 0.5412, "step": 32 }, { "Batch Mean": -0.5602660179138184, "accuracy": 0.7109375, "epoch": 0.08, "step": 32 }, { "epoch": 0.0825, "grad_norm": 11.323891639709473, "learning_rate": 2.8973684210526318e-06, "loss": 0.5601, "step": 33 }, { "Batch Mean": -0.8669416904449463, "accuracy": 0.7734375, "epoch": 0.0825, "step": 33 }, { "epoch": 0.085, "grad_norm": 15.461758613586426, "learning_rate": 2.8894736842105265e-06, "loss": 0.5147, "step": 34 }, { "Batch Mean": -0.7635477781295776, "accuracy": 0.75, "epoch": 0.085, "step": 34 }, { "epoch": 0.0875, "grad_norm": 14.736496925354004, "learning_rate": 2.8815789473684213e-06, "loss": 0.5421, "step": 35 }, { "Batch Mean": -0.5088728070259094, "accuracy": 0.734375, "epoch": 0.0875, "step": 35 }, { "epoch": 0.09, "grad_norm": 11.982501029968262, "learning_rate": 2.873684210526316e-06, "loss": 0.5234, "step": 36 }, { "Batch Mean": -0.03705340623855591, "accuracy": 0.7578125, "epoch": 0.09, "step": 36 }, { "epoch": 0.0925, "grad_norm": 7.854848861694336, "learning_rate": 2.8657894736842103e-06, "loss": 0.4996, "step": 37 }, { "Batch Mean": 0.5426892042160034, "accuracy": 0.734375, "epoch": 0.0925, "step": 37 }, { "epoch": 0.095, "grad_norm": 10.834790229797363, "learning_rate": 2.857894736842105e-06, "loss": 0.5187, "step": 38 }, { "Batch Mean": 0.4622654914855957, "accuracy": 0.7734375, "epoch": 0.095, "step": 38 }, { "epoch": 0.0975, "grad_norm": 10.764225959777832, "learning_rate": 2.85e-06, "loss": 0.4723, "step": 39 }, { "Batch Mean": 0.18012571334838867, "accuracy": 0.7890625, "epoch": 0.0975, "step": 39 }, { "epoch": 0.1, "grad_norm": 9.500924110412598, "learning_rate": 2.8421052631578946e-06, "loss": 0.4545, "step": 40 }, { "Batch Mean": -0.05992317199707031, "accuracy": 0.7890625, "epoch": 0.1, "step": 40 }, { "epoch": 0.1025, "grad_norm": 8.079994201660156, "learning_rate": 2.8342105263157897e-06, "loss": 0.49, "step": 41 }, { "Batch Mean": 0.04392993450164795, "accuracy": 0.8515625, "epoch": 0.1025, "step": 41 }, { "epoch": 0.105, "grad_norm": 10.184918403625488, "learning_rate": 2.8263157894736845e-06, "loss": 0.3789, "step": 42 }, { "Batch Mean": -0.2515716552734375, "accuracy": 0.8046875, "epoch": 0.105, "step": 42 }, { "epoch": 0.1075, "grad_norm": 9.894248008728027, "learning_rate": 2.8184210526315792e-06, "loss": 0.435, "step": 43 }, { "Batch Mean": 0.15310335159301758, "accuracy": 0.7734375, "epoch": 0.1075, "step": 43 }, { "epoch": 0.11, "grad_norm": 10.671575546264648, "learning_rate": 2.810526315789474e-06, "loss": 0.4291, "step": 44 }, { "Batch Mean": 0.2563772201538086, "accuracy": 0.765625, "epoch": 0.11, "step": 44 }, { "epoch": 0.1125, "grad_norm": 10.690945625305176, "learning_rate": 2.8026315789473687e-06, "loss": 0.4812, "step": 45 }, { "Batch Mean": 0.046463966369628906, "accuracy": 0.875, "epoch": 0.1125, "step": 45 }, { "epoch": 0.115, "grad_norm": 9.616545677185059, "learning_rate": 2.7947368421052635e-06, "loss": 0.3082, "step": 46 }, { "Batch Mean": 0.11115455627441406, "accuracy": 0.75, "epoch": 0.115, "step": 46 }, { "epoch": 0.1175, "grad_norm": 15.726456642150879, "learning_rate": 2.7868421052631578e-06, "loss": 0.5576, "step": 47 }, { "Batch Mean": 0.2878689765930176, "accuracy": 0.8125, "epoch": 0.1175, "step": 47 }, { "epoch": 0.12, "grad_norm": 11.675898551940918, "learning_rate": 2.7789473684210525e-06, "loss": 0.472, "step": 48 }, { "Batch Mean": -0.25928783416748047, "accuracy": 0.75, "epoch": 0.12, "step": 48 }, { "epoch": 0.1225, "grad_norm": 15.394309997558594, "learning_rate": 2.7710526315789473e-06, "loss": 0.5011, "step": 49 }, { "Batch Mean": -0.2885439097881317, "accuracy": 0.703125, "epoch": 0.1225, "step": 49 }, { "epoch": 0.125, "grad_norm": 15.69314956665039, "learning_rate": 2.763157894736842e-06, "loss": 0.5974, "step": 50 }, { "Batch Mean": -0.8389434814453125, "accuracy": 0.8046875, "epoch": 0.125, "step": 50 }, { "epoch": 0.1275, "grad_norm": 16.744606018066406, "learning_rate": 2.7552631578947368e-06, "loss": 0.5151, "step": 51 }, { "Batch Mean": -0.6685854196548462, "accuracy": 0.703125, "epoch": 0.1275, "step": 51 }, { "epoch": 0.13, "grad_norm": 15.530059814453125, "learning_rate": 2.7473684210526315e-06, "loss": 0.5502, "step": 52 }, { "Batch Mean": -0.5919044017791748, "accuracy": 0.71875, "epoch": 0.13, "step": 52 }, { "epoch": 0.1325, "grad_norm": 15.14562702178955, "learning_rate": 2.7394736842105263e-06, "loss": 0.5305, "step": 53 }, { "Batch Mean": -0.11876988410949707, "accuracy": 0.75, "epoch": 0.1325, "step": 53 }, { "epoch": 0.135, "grad_norm": 7.011972427368164, "learning_rate": 2.7315789473684214e-06, "loss": 0.4802, "step": 54 }, { "Batch Mean": 0.09675300121307373, "accuracy": 0.7578125, "epoch": 0.135, "step": 54 }, { "epoch": 0.1375, "grad_norm": 8.37578010559082, "learning_rate": 2.723684210526316e-06, "loss": 0.4354, "step": 55 }, { "Batch Mean": 0.3877286911010742, "accuracy": 0.8046875, "epoch": 0.1375, "step": 55 }, { "epoch": 0.14, "grad_norm": 8.380095481872559, "learning_rate": 2.715789473684211e-06, "loss": 0.4114, "step": 56 }, { "Batch Mean": 0.7472854852676392, "accuracy": 0.8046875, "epoch": 0.14, "step": 56 }, { "epoch": 0.1425, "grad_norm": 12.794017791748047, "learning_rate": 2.7078947368421052e-06, "loss": 0.465, "step": 57 }, { "Batch Mean": 0.5131087303161621, "accuracy": 0.8046875, "epoch": 0.1425, "step": 57 }, { "epoch": 0.145, "grad_norm": 8.367005348205566, "learning_rate": 2.7e-06, "loss": 0.4407, "step": 58 }, { "Batch Mean": 0.5741372108459473, "accuracy": 0.8515625, "epoch": 0.145, "step": 58 }, { "epoch": 0.1475, "grad_norm": 9.066951751708984, "learning_rate": 2.6921052631578947e-06, "loss": 0.4468, "step": 59 }, { "Batch Mean": 0.169053316116333, "accuracy": 0.734375, "epoch": 0.1475, "step": 59 }, { "epoch": 0.15, "grad_norm": 6.888144016265869, "learning_rate": 2.6842105263157895e-06, "loss": 0.4849, "step": 60 }, { "Batch Mean": -0.18010663986206055, "accuracy": 0.765625, "epoch": 0.15, "step": 60 }, { "epoch": 0.1525, "grad_norm": 6.846613883972168, "learning_rate": 2.6763157894736842e-06, "loss": 0.4643, "step": 61 }, { "Batch Mean": -0.4085197448730469, "accuracy": 0.8359375, "epoch": 0.1525, "step": 61 }, { "epoch": 0.155, "grad_norm": 7.677695274353027, "learning_rate": 2.668421052631579e-06, "loss": 0.3926, "step": 62 }, { "Batch Mean": -0.48792409896850586, "accuracy": 0.828125, "epoch": 0.155, "step": 62 }, { "epoch": 0.1575, "grad_norm": 8.656214714050293, "learning_rate": 2.6605263157894737e-06, "loss": 0.4116, "step": 63 }, { "Batch Mean": -0.19100165367126465, "accuracy": 0.7734375, "epoch": 0.1575, "step": 63 }, { "epoch": 0.16, "grad_norm": 9.927976608276367, "learning_rate": 2.6526315789473685e-06, "loss": 0.4395, "step": 64 }, { "Batch Mean": 0.025106430053710938, "accuracy": 0.8046875, "epoch": 0.16, "step": 64 }, { "epoch": 0.1625, "grad_norm": 8.97470474243164, "learning_rate": 2.644736842105263e-06, "loss": 0.4632, "step": 65 }, { "Batch Mean": 0.4497801661491394, "accuracy": 0.796875, "epoch": 0.1625, "step": 65 }, { "epoch": 0.165, "grad_norm": 11.160425186157227, "learning_rate": 2.636842105263158e-06, "loss": 0.5073, "step": 66 }, { "Batch Mean": 0.6494235992431641, "accuracy": 0.8203125, "epoch": 0.165, "step": 66 }, { "epoch": 0.1675, "grad_norm": 11.464187622070312, "learning_rate": 2.6289473684210527e-06, "loss": 0.4467, "step": 67 }, { "Batch Mean": -0.17097759246826172, "accuracy": 0.828125, "epoch": 0.1675, "step": 67 }, { "epoch": 0.17, "grad_norm": 8.820326805114746, "learning_rate": 2.6210526315789474e-06, "loss": 0.4543, "step": 68 }, { "Batch Mean": -0.1554449200630188, "accuracy": 0.8046875, "epoch": 0.17, "step": 68 }, { "epoch": 0.1725, "grad_norm": 9.248348236083984, "learning_rate": 2.613157894736842e-06, "loss": 0.3796, "step": 69 }, { "Batch Mean": -0.21274375915527344, "accuracy": 0.75, "epoch": 0.1725, "step": 69 }, { "epoch": 0.175, "grad_norm": 8.15241813659668, "learning_rate": 2.605263157894737e-06, "loss": 0.4445, "step": 70 }, { "Batch Mean": -0.06371593475341797, "accuracy": 0.765625, "epoch": 0.175, "step": 70 }, { "epoch": 0.1775, "grad_norm": 6.543805122375488, "learning_rate": 2.5973684210526317e-06, "loss": 0.4376, "step": 71 }, { "Batch Mean": -0.10743597149848938, "accuracy": 0.7265625, "epoch": 0.1775, "step": 71 }, { "epoch": 0.18, "grad_norm": 7.594844341278076, "learning_rate": 2.5894736842105264e-06, "loss": 0.4363, "step": 72 }, { "Batch Mean": 0.08045387268066406, "accuracy": 0.84375, "epoch": 0.18, "step": 72 }, { "epoch": 0.1825, "grad_norm": 7.977687835693359, "learning_rate": 2.581578947368421e-06, "loss": 0.3616, "step": 73 }, { "Batch Mean": -0.11765336990356445, "accuracy": 0.796875, "epoch": 0.1825, "step": 73 }, { "epoch": 0.185, "grad_norm": 6.521571636199951, "learning_rate": 2.573684210526316e-06, "loss": 0.4001, "step": 74 }, { "Batch Mean": 0.0607762336730957, "accuracy": 0.703125, "epoch": 0.185, "step": 74 }, { "epoch": 0.1875, "grad_norm": 8.15623950958252, "learning_rate": 2.5657894736842107e-06, "loss": 0.4941, "step": 75 }, { "Batch Mean": 0.4910193681716919, "accuracy": 0.8046875, "epoch": 0.1875, "step": 75 }, { "epoch": 0.19, "grad_norm": 9.602405548095703, "learning_rate": 2.5578947368421054e-06, "loss": 0.4224, "step": 76 }, { "Batch Mean": 0.06563162803649902, "accuracy": 0.8046875, "epoch": 0.19, "step": 76 }, { "epoch": 0.1925, "grad_norm": 6.577213764190674, "learning_rate": 2.55e-06, "loss": 0.4047, "step": 77 }, { "Batch Mean": -0.08942317962646484, "accuracy": 0.8046875, "epoch": 0.1925, "step": 77 }, { "epoch": 0.195, "grad_norm": 16.202590942382812, "learning_rate": 2.542105263157895e-06, "loss": 0.4418, "step": 78 }, { "Batch Mean": -0.37958288192749023, "accuracy": 0.8125, "epoch": 0.195, "step": 78 }, { "epoch": 0.1975, "grad_norm": 23.45298194885254, "learning_rate": 2.5342105263157892e-06, "loss": 0.391, "step": 79 }, { "Batch Mean": -0.5664916038513184, "accuracy": 0.8125, "epoch": 0.1975, "step": 79 }, { "epoch": 0.2, "grad_norm": 10.679176330566406, "learning_rate": 2.526315789473684e-06, "loss": 0.4402, "step": 80 }, { "Batch Mean": -0.31339550018310547, "accuracy": 0.8359375, "epoch": 0.2, "step": 80 }, { "epoch": 0.2025, "grad_norm": 16.459426879882812, "learning_rate": 2.5184210526315787e-06, "loss": 0.3765, "step": 81 }, { "Batch Mean": 0.20376300811767578, "accuracy": 0.78125, "epoch": 0.2025, "step": 81 }, { "epoch": 0.205, "grad_norm": 8.887246131896973, "learning_rate": 2.510526315789474e-06, "loss": 0.4509, "step": 82 }, { "Batch Mean": -0.054105013608932495, "accuracy": 0.796875, "epoch": 0.205, "step": 82 }, { "epoch": 0.2075, "grad_norm": 8.116570472717285, "learning_rate": 2.5026315789473686e-06, "loss": 0.3906, "step": 83 }, { "Batch Mean": 0.4383058547973633, "accuracy": 0.875, "epoch": 0.2075, "step": 83 }, { "epoch": 0.21, "grad_norm": 9.300606727600098, "learning_rate": 2.4947368421052634e-06, "loss": 0.3099, "step": 84 }, { "Batch Mean": 0.4242062568664551, "accuracy": 0.7890625, "epoch": 0.21, "step": 84 }, { "epoch": 0.2125, "grad_norm": 8.7870512008667, "learning_rate": 2.486842105263158e-06, "loss": 0.3996, "step": 85 }, { "Batch Mean": 0.22194945812225342, "accuracy": 0.78125, "epoch": 0.2125, "step": 85 }, { "epoch": 0.215, "grad_norm": 8.691889762878418, "learning_rate": 2.478947368421053e-06, "loss": 0.4512, "step": 86 }, { "Batch Mean": -0.14107918739318848, "accuracy": 0.8359375, "epoch": 0.215, "step": 86 }, { "epoch": 0.2175, "grad_norm": 7.0445146560668945, "learning_rate": 2.4710526315789476e-06, "loss": 0.3771, "step": 87 }, { "Batch Mean": -0.35906583070755005, "accuracy": 0.765625, "epoch": 0.2175, "step": 87 }, { "epoch": 0.22, "grad_norm": 9.609421730041504, "learning_rate": 2.4631578947368424e-06, "loss": 0.4558, "step": 88 }, { "Batch Mean": -0.45313501358032227, "accuracy": 0.828125, "epoch": 0.22, "step": 88 }, { "epoch": 0.2225, "grad_norm": 9.091865539550781, "learning_rate": 2.4552631578947367e-06, "loss": 0.4292, "step": 89 }, { "Batch Mean": -0.35767626762390137, "accuracy": 0.8359375, "epoch": 0.2225, "step": 89 }, { "epoch": 0.225, "grad_norm": 7.811399459838867, "learning_rate": 2.4473684210526314e-06, "loss": 0.3823, "step": 90 }, { "Batch Mean": 0.0022423267364501953, "accuracy": 0.8046875, "epoch": 0.225, "step": 90 }, { "epoch": 0.2275, "grad_norm": 7.804200172424316, "learning_rate": 2.439473684210526e-06, "loss": 0.4079, "step": 91 }, { "Batch Mean": 0.17678523063659668, "accuracy": 0.8046875, "epoch": 0.2275, "step": 91 }, { "epoch": 0.23, "grad_norm": 7.715360641479492, "learning_rate": 2.431578947368421e-06, "loss": 0.391, "step": 92 }, { "Batch Mean": -0.22452354431152344, "accuracy": 0.8203125, "epoch": 0.23, "step": 92 }, { "epoch": 0.2325, "grad_norm": 7.197687149047852, "learning_rate": 2.4236842105263157e-06, "loss": 0.3864, "step": 93 }, { "Batch Mean": 0.30264759063720703, "accuracy": 0.84375, "epoch": 0.2325, "step": 93 }, { "epoch": 0.235, "grad_norm": 7.754411697387695, "learning_rate": 2.4157894736842104e-06, "loss": 0.3939, "step": 94 }, { "Batch Mean": -0.2516747713088989, "accuracy": 0.7890625, "epoch": 0.235, "step": 94 }, { "epoch": 0.2375, "grad_norm": 8.55639362335205, "learning_rate": 2.4078947368421056e-06, "loss": 0.4671, "step": 95 }, { "Batch Mean": -0.15928316116333008, "accuracy": 0.8046875, "epoch": 0.2375, "step": 95 }, { "epoch": 0.24, "grad_norm": 7.463867664337158, "learning_rate": 2.4000000000000003e-06, "loss": 0.471, "step": 96 }, { "Batch Mean": -0.0638132095336914, "accuracy": 0.8125, "epoch": 0.24, "step": 96 }, { "epoch": 0.2425, "grad_norm": 6.797260761260986, "learning_rate": 2.392105263157895e-06, "loss": 0.3892, "step": 97 }, { "Batch Mean": -0.08082282543182373, "accuracy": 0.8203125, "epoch": 0.2425, "step": 97 }, { "epoch": 0.245, "grad_norm": 7.096107482910156, "learning_rate": 2.38421052631579e-06, "loss": 0.3942, "step": 98 }, { "Batch Mean": 0.3185189962387085, "accuracy": 0.796875, "epoch": 0.245, "step": 98 }, { "epoch": 0.2475, "grad_norm": 9.447708129882812, "learning_rate": 2.376315789473684e-06, "loss": 0.5037, "step": 99 }, { "Batch Mean": 0.26651012897491455, "accuracy": 0.890625, "epoch": 0.2475, "step": 99 }, { "epoch": 0.25, "grad_norm": 9.391190528869629, "learning_rate": 2.368421052631579e-06, "loss": 0.3352, "step": 100 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }