diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4287 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 50, + "global_step": 594, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005050505050505051, + "grad_norm": 2.061766753199945, + "learning_rate": 1.0344827586206896e-06, + "loss": 1.9091, + "step": 1 + }, + { + "epoch": 0.005050505050505051, + "eval_loss": 1.7296912670135498, + "eval_runtime": 229.6516, + "eval_samples_per_second": 6.231, + "eval_steps_per_second": 0.779, + "step": 1 + }, + { + "epoch": 0.010101010101010102, + "grad_norm": 0.9340006537063892, + "learning_rate": 2.068965517241379e-06, + "loss": 1.9543, + "step": 2 + }, + { + "epoch": 0.015151515151515152, + "grad_norm": 0.8730658898573816, + "learning_rate": 3.103448275862069e-06, + "loss": 1.8991, + "step": 3 + }, + { + "epoch": 0.020202020202020204, + "grad_norm": 0.9760544939810394, + "learning_rate": 4.137931034482758e-06, + "loss": 1.9469, + "step": 4 + }, + { + "epoch": 0.025252525252525252, + "grad_norm": 1.1435345547054372, + "learning_rate": 5.172413793103449e-06, + "loss": 1.875, + "step": 5 + }, + { + "epoch": 0.030303030303030304, + "grad_norm": 0.6977064869142138, + "learning_rate": 6.206896551724138e-06, + "loss": 1.9303, + "step": 6 + }, + { + "epoch": 0.03535353535353535, + "grad_norm": 0.9852376303852637, + "learning_rate": 7.241379310344828e-06, + "loss": 1.9764, + "step": 7 + }, + { + "epoch": 0.04040404040404041, + "grad_norm": 1.5021550676895192, + "learning_rate": 8.275862068965517e-06, + "loss": 1.9526, + "step": 8 + }, + { + "epoch": 0.045454545454545456, + "grad_norm": 14.465654024779864, + "learning_rate": 9.310344827586207e-06, + "loss": 1.9238, + "step": 9 + }, + { + "epoch": 0.050505050505050504, + "grad_norm": 1.3153716741597974, + "learning_rate": 1.0344827586206898e-05, + "loss": 1.8548, + "step": 10 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 0.7251965259785947, + "learning_rate": 1.1379310344827586e-05, + "loss": 1.9568, + "step": 11 + }, + { + "epoch": 0.06060606060606061, + "grad_norm": 0.675323305346893, + "learning_rate": 1.2413793103448277e-05, + "loss": 2.0394, + "step": 12 + }, + { + "epoch": 0.06565656565656566, + "grad_norm": 0.7284662389076806, + "learning_rate": 1.3448275862068966e-05, + "loss": 1.9143, + "step": 13 + }, + { + "epoch": 0.0707070707070707, + "grad_norm": 0.9273147668903492, + "learning_rate": 1.4482758620689657e-05, + "loss": 1.8514, + "step": 14 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 0.8045697974814354, + "learning_rate": 1.5517241379310346e-05, + "loss": 1.8691, + "step": 15 + }, + { + "epoch": 0.08080808080808081, + "grad_norm": 0.8617824816372583, + "learning_rate": 1.6551724137931033e-05, + "loss": 1.9208, + "step": 16 + }, + { + "epoch": 0.08585858585858586, + "grad_norm": 0.8474189127719305, + "learning_rate": 1.7586206896551724e-05, + "loss": 1.971, + "step": 17 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 0.7802161578292038, + "learning_rate": 1.8620689655172415e-05, + "loss": 1.9364, + "step": 18 + }, + { + "epoch": 0.09595959595959595, + "grad_norm": 0.7670275224618359, + "learning_rate": 1.9655172413793102e-05, + "loss": 1.8924, + "step": 19 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 0.8871142093374335, + "learning_rate": 2.0689655172413797e-05, + "loss": 1.8666, + "step": 20 + }, + { + "epoch": 0.10606060606060606, + "grad_norm": 0.8537219904851733, + "learning_rate": 2.1724137931034484e-05, + "loss": 1.903, + "step": 21 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.7103385523680423, + "learning_rate": 2.275862068965517e-05, + "loss": 1.8842, + "step": 22 + }, + { + "epoch": 0.11616161616161616, + "grad_norm": 0.7687652688491076, + "learning_rate": 2.3793103448275862e-05, + "loss": 1.9043, + "step": 23 + }, + { + "epoch": 0.12121212121212122, + "grad_norm": 0.8618641553034271, + "learning_rate": 2.4827586206896553e-05, + "loss": 1.9657, + "step": 24 + }, + { + "epoch": 0.12626262626262627, + "grad_norm": 0.6952586490192444, + "learning_rate": 2.586206896551724e-05, + "loss": 1.8744, + "step": 25 + }, + { + "epoch": 0.13131313131313133, + "grad_norm": 0.7879308602246046, + "learning_rate": 2.689655172413793e-05, + "loss": 1.9088, + "step": 26 + }, + { + "epoch": 0.13636363636363635, + "grad_norm": 0.6286619656859633, + "learning_rate": 2.793103448275862e-05, + "loss": 1.8769, + "step": 27 + }, + { + "epoch": 0.1414141414141414, + "grad_norm": 0.6368137307786529, + "learning_rate": 2.8965517241379313e-05, + "loss": 1.9609, + "step": 28 + }, + { + "epoch": 0.14646464646464646, + "grad_norm": 0.6997236575440393, + "learning_rate": 3e-05, + "loss": 1.8216, + "step": 29 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 0.7017124666486956, + "learning_rate": 2.999976812015884e-05, + "loss": 1.8589, + "step": 30 + }, + { + "epoch": 0.15656565656565657, + "grad_norm": 0.6473032006817286, + "learning_rate": 2.999907248780446e-05, + "loss": 1.8554, + "step": 31 + }, + { + "epoch": 0.16161616161616163, + "grad_norm": 0.5625959900004289, + "learning_rate": 2.9997913124443945e-05, + "loss": 1.8885, + "step": 32 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.5416656043411677, + "learning_rate": 2.9996290065921693e-05, + "loss": 1.8011, + "step": 33 + }, + { + "epoch": 0.1717171717171717, + "grad_norm": 0.6288840271054257, + "learning_rate": 2.9994203362418313e-05, + "loss": 1.8941, + "step": 34 + }, + { + "epoch": 0.17676767676767677, + "grad_norm": 0.46613508572576606, + "learning_rate": 2.9991653078449062e-05, + "loss": 1.8755, + "step": 35 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 0.483380275460621, + "learning_rate": 2.998863929286187e-05, + "loss": 1.7971, + "step": 36 + }, + { + "epoch": 0.18686868686868688, + "grad_norm": 0.5435079019515928, + "learning_rate": 2.9985162098834886e-05, + "loss": 1.8561, + "step": 37 + }, + { + "epoch": 0.1919191919191919, + "grad_norm": 0.39935840982328513, + "learning_rate": 2.9981221603873608e-05, + "loss": 1.8447, + "step": 38 + }, + { + "epoch": 0.19696969696969696, + "grad_norm": 0.4860381910271567, + "learning_rate": 2.9976817929807542e-05, + "loss": 1.8571, + "step": 39 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.4704297980261362, + "learning_rate": 2.9971951212786453e-05, + "loss": 1.8953, + "step": 40 + }, + { + "epoch": 0.20707070707070707, + "grad_norm": 0.45313219770954005, + "learning_rate": 2.996662160327616e-05, + "loss": 1.8858, + "step": 41 + }, + { + "epoch": 0.21212121212121213, + "grad_norm": 0.4849523903159801, + "learning_rate": 2.9960829266053854e-05, + "loss": 1.8558, + "step": 42 + }, + { + "epoch": 0.21717171717171718, + "grad_norm": 0.5296271329297302, + "learning_rate": 2.9954574380203036e-05, + "loss": 1.8819, + "step": 43 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.42138253356228195, + "learning_rate": 2.9947857139107964e-05, + "loss": 1.8218, + "step": 44 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 0.54778300283403, + "learning_rate": 2.994067775044768e-05, + "loss": 1.8718, + "step": 45 + }, + { + "epoch": 0.23232323232323232, + "grad_norm": 0.5171055027316829, + "learning_rate": 2.9933036436189582e-05, + "loss": 1.8202, + "step": 46 + }, + { + "epoch": 0.23737373737373738, + "grad_norm": 0.42755509411252407, + "learning_rate": 2.992493343258257e-05, + "loss": 1.7941, + "step": 47 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 0.5864597089331413, + "learning_rate": 2.9916368990149738e-05, + "loss": 1.8177, + "step": 48 + }, + { + "epoch": 0.2474747474747475, + "grad_norm": 0.5015977034643889, + "learning_rate": 2.990734337368062e-05, + "loss": 1.8441, + "step": 49 + }, + { + "epoch": 0.25252525252525254, + "grad_norm": 0.4223650022421803, + "learning_rate": 2.989785686222302e-05, + "loss": 1.8235, + "step": 50 + }, + { + "epoch": 0.25252525252525254, + "eval_loss": 1.5191664695739746, + "eval_runtime": 228.7897, + "eval_samples_per_second": 6.255, + "eval_steps_per_second": 0.782, + "step": 50 + }, + { + "epoch": 0.25757575757575757, + "grad_norm": 0.5549092161535342, + "learning_rate": 2.9887909749074373e-05, + "loss": 1.8724, + "step": 51 + }, + { + "epoch": 0.26262626262626265, + "grad_norm": 0.5808532765316217, + "learning_rate": 2.9877502341772687e-05, + "loss": 1.8668, + "step": 52 + }, + { + "epoch": 0.2676767676767677, + "grad_norm": 0.42037362125882416, + "learning_rate": 2.9866634962087014e-05, + "loss": 1.8149, + "step": 53 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 0.510701064713823, + "learning_rate": 2.9855307946007532e-05, + "loss": 1.8388, + "step": 54 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 0.4740044982938013, + "learning_rate": 2.984352164373513e-05, + "loss": 1.8898, + "step": 55 + }, + { + "epoch": 0.2828282828282828, + "grad_norm": 0.42600738162835966, + "learning_rate": 2.9831276419670593e-05, + "loss": 1.7645, + "step": 56 + }, + { + "epoch": 0.2878787878787879, + "grad_norm": 0.523957827857094, + "learning_rate": 2.9818572652403336e-05, + "loss": 1.8688, + "step": 57 + }, + { + "epoch": 0.29292929292929293, + "grad_norm": 0.4739264160838728, + "learning_rate": 2.9805410734699694e-05, + "loss": 1.8253, + "step": 58 + }, + { + "epoch": 0.29797979797979796, + "grad_norm": 0.4030312937735394, + "learning_rate": 2.9791791073490795e-05, + "loss": 1.8799, + "step": 59 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.4287715829219747, + "learning_rate": 2.9777714089859946e-05, + "loss": 1.8283, + "step": 60 + }, + { + "epoch": 0.30808080808080807, + "grad_norm": 0.41789218942140666, + "learning_rate": 2.976318021902965e-05, + "loss": 1.8135, + "step": 61 + }, + { + "epoch": 0.31313131313131315, + "grad_norm": 0.4587324787560187, + "learning_rate": 2.9748189910348122e-05, + "loss": 1.8821, + "step": 62 + }, + { + "epoch": 0.3181818181818182, + "grad_norm": 0.4460155322546901, + "learning_rate": 2.9732743627275428e-05, + "loss": 1.8889, + "step": 63 + }, + { + "epoch": 0.32323232323232326, + "grad_norm": 0.4157211867080571, + "learning_rate": 2.9716841847369106e-05, + "loss": 1.8345, + "step": 64 + }, + { + "epoch": 0.3282828282828283, + "grad_norm": 0.41844440976522496, + "learning_rate": 2.9700485062269465e-05, + "loss": 1.7642, + "step": 65 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.4520630042741615, + "learning_rate": 2.968367377768432e-05, + "loss": 1.8485, + "step": 66 + }, + { + "epoch": 0.3383838383838384, + "grad_norm": 0.4368200901934031, + "learning_rate": 2.966640851337342e-05, + "loss": 1.8617, + "step": 67 + }, + { + "epoch": 0.3434343434343434, + "grad_norm": 0.38212187200746967, + "learning_rate": 2.964868980313232e-05, + "loss": 1.7493, + "step": 68 + }, + { + "epoch": 0.3484848484848485, + "grad_norm": 0.4173981057054639, + "learning_rate": 2.963051819477592e-05, + "loss": 1.836, + "step": 69 + }, + { + "epoch": 0.35353535353535354, + "grad_norm": 0.4067519509663366, + "learning_rate": 2.9611894250121508e-05, + "loss": 1.81, + "step": 70 + }, + { + "epoch": 0.35858585858585856, + "grad_norm": 0.40355345502721407, + "learning_rate": 2.9592818544971394e-05, + "loss": 1.8126, + "step": 71 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 0.4205716149104244, + "learning_rate": 2.9573291669095112e-05, + "loss": 1.725, + "step": 72 + }, + { + "epoch": 0.3686868686868687, + "grad_norm": 0.42333778099845615, + "learning_rate": 2.955331422621117e-05, + "loss": 1.8448, + "step": 73 + }, + { + "epoch": 0.37373737373737376, + "grad_norm": 0.39602223830054895, + "learning_rate": 2.953288683396841e-05, + "loss": 1.8518, + "step": 74 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 0.42272756108809484, + "learning_rate": 2.95120101239269e-05, + "loss": 1.846, + "step": 75 + }, + { + "epoch": 0.3838383838383838, + "grad_norm": 0.38458304927883635, + "learning_rate": 2.9490684741538394e-05, + "loss": 1.7287, + "step": 76 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 0.4515217019544044, + "learning_rate": 2.9468911346126395e-05, + "loss": 1.9222, + "step": 77 + }, + { + "epoch": 0.3939393939393939, + "grad_norm": 0.3902694265684303, + "learning_rate": 2.9446690610865775e-05, + "loss": 1.8075, + "step": 78 + }, + { + "epoch": 0.398989898989899, + "grad_norm": 0.4621030399414333, + "learning_rate": 2.9424023222761938e-05, + "loss": 1.8307, + "step": 79 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.4164626377683268, + "learning_rate": 2.9400909882629595e-05, + "loss": 1.826, + "step": 80 + }, + { + "epoch": 0.4090909090909091, + "grad_norm": 0.4989467879911725, + "learning_rate": 2.9377351305071097e-05, + "loss": 1.7694, + "step": 81 + }, + { + "epoch": 0.41414141414141414, + "grad_norm": 0.38502918266185665, + "learning_rate": 2.935334821845434e-05, + "loss": 1.8558, + "step": 82 + }, + { + "epoch": 0.41919191919191917, + "grad_norm": 0.43876163502583143, + "learning_rate": 2.9328901364890253e-05, + "loss": 1.9031, + "step": 83 + }, + { + "epoch": 0.42424242424242425, + "grad_norm": 0.44274327469566893, + "learning_rate": 2.930401150020983e-05, + "loss": 1.8486, + "step": 84 + }, + { + "epoch": 0.4292929292929293, + "grad_norm": 0.42306669374100425, + "learning_rate": 2.9278679393940794e-05, + "loss": 1.8095, + "step": 85 + }, + { + "epoch": 0.43434343434343436, + "grad_norm": 0.43069541482459783, + "learning_rate": 2.9252905829283778e-05, + "loss": 1.8091, + "step": 86 + }, + { + "epoch": 0.4393939393939394, + "grad_norm": 0.524368528081938, + "learning_rate": 2.9226691603088124e-05, + "loss": 1.8407, + "step": 87 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.4306578292725837, + "learning_rate": 2.9200037525827255e-05, + "loss": 1.8075, + "step": 88 + }, + { + "epoch": 0.4494949494949495, + "grad_norm": 0.401793293233711, + "learning_rate": 2.9172944421573587e-05, + "loss": 1.8405, + "step": 89 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.48084582939870474, + "learning_rate": 2.9145413127973085e-05, + "loss": 1.8596, + "step": 90 + }, + { + "epoch": 0.4595959595959596, + "grad_norm": 0.4259765113516244, + "learning_rate": 2.911744449621935e-05, + "loss": 1.7931, + "step": 91 + }, + { + "epoch": 0.46464646464646464, + "grad_norm": 0.41686644545343865, + "learning_rate": 2.90890393910273e-05, + "loss": 1.8693, + "step": 92 + }, + { + "epoch": 0.4696969696969697, + "grad_norm": 0.49645836560093654, + "learning_rate": 2.9060198690606438e-05, + "loss": 1.8567, + "step": 93 + }, + { + "epoch": 0.47474747474747475, + "grad_norm": 0.4122894943723907, + "learning_rate": 2.9030923286633703e-05, + "loss": 1.7979, + "step": 94 + }, + { + "epoch": 0.4797979797979798, + "grad_norm": 0.41291814598709514, + "learning_rate": 2.9001214084225898e-05, + "loss": 1.8409, + "step": 95 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 0.4346649991553492, + "learning_rate": 2.89710720019117e-05, + "loss": 1.8904, + "step": 96 + }, + { + "epoch": 0.4898989898989899, + "grad_norm": 0.41818341954882543, + "learning_rate": 2.8940497971603288e-05, + "loss": 1.8695, + "step": 97 + }, + { + "epoch": 0.494949494949495, + "grad_norm": 0.4420144131423374, + "learning_rate": 2.890949293856749e-05, + "loss": 1.8021, + "step": 98 + }, + { + "epoch": 0.5, + "grad_norm": 0.43880248651128445, + "learning_rate": 2.8878057861396606e-05, + "loss": 1.8329, + "step": 99 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 0.40444053098152477, + "learning_rate": 2.8846193711978717e-05, + "loss": 1.8201, + "step": 100 + }, + { + "epoch": 0.5050505050505051, + "eval_loss": 1.4966636896133423, + "eval_runtime": 229.0804, + "eval_samples_per_second": 6.247, + "eval_steps_per_second": 0.781, + "step": 100 + }, + { + "epoch": 0.51010101010101, + "grad_norm": 0.45960433784465665, + "learning_rate": 2.881390147546768e-05, + "loss": 1.8183, + "step": 101 + }, + { + "epoch": 0.5151515151515151, + "grad_norm": 0.38763847079359476, + "learning_rate": 2.878118215025265e-05, + "loss": 1.8208, + "step": 102 + }, + { + "epoch": 0.5202020202020202, + "grad_norm": 0.41520378471101016, + "learning_rate": 2.874803674792722e-05, + "loss": 1.8714, + "step": 103 + }, + { + "epoch": 0.5252525252525253, + "grad_norm": 0.40206269424525176, + "learning_rate": 2.8714466293258142e-05, + "loss": 1.7878, + "step": 104 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 0.4373758839964535, + "learning_rate": 2.868047182415364e-05, + "loss": 1.8093, + "step": 105 + }, + { + "epoch": 0.5353535353535354, + "grad_norm": 0.3875079810746361, + "learning_rate": 2.864605439163133e-05, + "loss": 1.8606, + "step": 106 + }, + { + "epoch": 0.5404040404040404, + "grad_norm": 0.4303262268327845, + "learning_rate": 2.8611215059785706e-05, + "loss": 1.8233, + "step": 107 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 0.45649878897921986, + "learning_rate": 2.8575954905755278e-05, + "loss": 1.8817, + "step": 108 + }, + { + "epoch": 0.5505050505050505, + "grad_norm": 0.44045126098579024, + "learning_rate": 2.8540275019689237e-05, + "loss": 1.7945, + "step": 109 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.43144466038537, + "learning_rate": 2.8504176504713745e-05, + "loss": 1.7419, + "step": 110 + }, + { + "epoch": 0.5606060606060606, + "grad_norm": 0.38271501972768057, + "learning_rate": 2.846766047689787e-05, + "loss": 1.8557, + "step": 111 + }, + { + "epoch": 0.5656565656565656, + "grad_norm": 0.4193952465924727, + "learning_rate": 2.8430728065219035e-05, + "loss": 1.8245, + "step": 112 + }, + { + "epoch": 0.5707070707070707, + "grad_norm": 0.41096403730883174, + "learning_rate": 2.839338041152814e-05, + "loss": 1.8117, + "step": 113 + }, + { + "epoch": 0.5757575757575758, + "grad_norm": 0.41746315731568373, + "learning_rate": 2.835561867051426e-05, + "loss": 1.7859, + "step": 114 + }, + { + "epoch": 0.5808080808080808, + "grad_norm": 0.35796806338294784, + "learning_rate": 2.8317444009668916e-05, + "loss": 1.8125, + "step": 115 + }, + { + "epoch": 0.5858585858585859, + "grad_norm": 0.3782772183418781, + "learning_rate": 2.8278857609250033e-05, + "loss": 1.8551, + "step": 116 + }, + { + "epoch": 0.5909090909090909, + "grad_norm": 0.4563811308324174, + "learning_rate": 2.823986066224538e-05, + "loss": 1.8711, + "step": 117 + }, + { + "epoch": 0.5959595959595959, + "grad_norm": 0.42499726957188555, + "learning_rate": 2.820045437433575e-05, + "loss": 1.8827, + "step": 118 + }, + { + "epoch": 0.601010101010101, + "grad_norm": 0.35463970926383165, + "learning_rate": 2.816063996385765e-05, + "loss": 1.8511, + "step": 119 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.39828700116760574, + "learning_rate": 2.8120418661765624e-05, + "loss": 1.8386, + "step": 120 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 0.406542351713585, + "learning_rate": 2.8079791711594236e-05, + "loss": 1.8035, + "step": 121 + }, + { + "epoch": 0.6161616161616161, + "grad_norm": 0.36243471155718693, + "learning_rate": 2.8038760369419583e-05, + "loss": 1.8014, + "step": 122 + }, + { + "epoch": 0.6212121212121212, + "grad_norm": 0.3845741639511099, + "learning_rate": 2.7997325903820478e-05, + "loss": 1.9082, + "step": 123 + }, + { + "epoch": 0.6262626262626263, + "grad_norm": 0.4004906560962447, + "learning_rate": 2.7955489595839228e-05, + "loss": 1.828, + "step": 124 + }, + { + "epoch": 0.6313131313131313, + "grad_norm": 0.40745184195706274, + "learning_rate": 2.7913252738942027e-05, + "loss": 1.8285, + "step": 125 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 0.4701862294606233, + "learning_rate": 2.787061663897896e-05, + "loss": 1.7612, + "step": 126 + }, + { + "epoch": 0.6414141414141414, + "grad_norm": 0.3845304191870889, + "learning_rate": 2.782758261414365e-05, + "loss": 1.9005, + "step": 127 + }, + { + "epoch": 0.6464646464646465, + "grad_norm": 0.45387333682470754, + "learning_rate": 2.7784151994932462e-05, + "loss": 1.8369, + "step": 128 + }, + { + "epoch": 0.6515151515151515, + "grad_norm": 0.402744943135056, + "learning_rate": 2.7740326124103416e-05, + "loss": 1.8382, + "step": 129 + }, + { + "epoch": 0.6565656565656566, + "grad_norm": 0.39184387630637835, + "learning_rate": 2.7696106356634637e-05, + "loss": 1.8659, + "step": 130 + }, + { + "epoch": 0.6616161616161617, + "grad_norm": 0.4626435038267262, + "learning_rate": 2.7651494059682485e-05, + "loss": 1.8667, + "step": 131 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.40353571201115057, + "learning_rate": 2.7606490612539262e-05, + "loss": 1.8209, + "step": 132 + }, + { + "epoch": 0.6717171717171717, + "grad_norm": 0.4162695579854718, + "learning_rate": 2.7561097406590595e-05, + "loss": 1.8002, + "step": 133 + }, + { + "epoch": 0.6767676767676768, + "grad_norm": 0.3909561466126897, + "learning_rate": 2.751531584527241e-05, + "loss": 1.8076, + "step": 134 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 0.41533705192980835, + "learning_rate": 2.746914734402752e-05, + "loss": 1.9099, + "step": 135 + }, + { + "epoch": 0.6868686868686869, + "grad_norm": 0.38207500499538627, + "learning_rate": 2.7422593330261888e-05, + "loss": 1.7688, + "step": 136 + }, + { + "epoch": 0.6919191919191919, + "grad_norm": 0.35742242270800784, + "learning_rate": 2.7375655243300493e-05, + "loss": 1.8651, + "step": 137 + }, + { + "epoch": 0.696969696969697, + "grad_norm": 0.41421312272061606, + "learning_rate": 2.7328334534342827e-05, + "loss": 1.8785, + "step": 138 + }, + { + "epoch": 0.702020202020202, + "grad_norm": 0.4171193053836604, + "learning_rate": 2.7280632666418013e-05, + "loss": 1.8781, + "step": 139 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 0.40782157651436396, + "learning_rate": 2.723255111433959e-05, + "loss": 1.8247, + "step": 140 + }, + { + "epoch": 0.7121212121212122, + "grad_norm": 0.4281506276367188, + "learning_rate": 2.7184091364659923e-05, + "loss": 1.733, + "step": 141 + }, + { + "epoch": 0.7171717171717171, + "grad_norm": 0.39479511690560315, + "learning_rate": 2.7135254915624213e-05, + "loss": 1.8401, + "step": 142 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.42100536218468343, + "learning_rate": 2.70860432771242e-05, + "loss": 1.8484, + "step": 143 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.3855161042933354, + "learning_rate": 2.703645797065147e-05, + "loss": 1.8197, + "step": 144 + }, + { + "epoch": 0.7323232323232324, + "grad_norm": 0.3920745691095182, + "learning_rate": 2.6986500529250427e-05, + "loss": 1.8345, + "step": 145 + }, + { + "epoch": 0.7373737373737373, + "grad_norm": 0.42204478492867153, + "learning_rate": 2.6936172497470874e-05, + "loss": 1.8368, + "step": 146 + }, + { + "epoch": 0.7424242424242424, + "grad_norm": 0.40405492226523393, + "learning_rate": 2.688547543132029e-05, + "loss": 1.8182, + "step": 147 + }, + { + "epoch": 0.7474747474747475, + "grad_norm": 0.36686029807824033, + "learning_rate": 2.6834410898215688e-05, + "loss": 1.8581, + "step": 148 + }, + { + "epoch": 0.7525252525252525, + "grad_norm": 0.36859567357522177, + "learning_rate": 2.678298047693518e-05, + "loss": 1.7276, + "step": 149 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 0.37761559772659303, + "learning_rate": 2.6731185757569153e-05, + "loss": 1.8321, + "step": 150 + }, + { + "epoch": 0.7575757575757576, + "eval_loss": 1.484434962272644, + "eval_runtime": 229.048, + "eval_samples_per_second": 6.248, + "eval_steps_per_second": 0.781, + "step": 150 + }, + { + "epoch": 0.7626262626262627, + "grad_norm": 0.40657370322541503, + "learning_rate": 2.6679028341471114e-05, + "loss": 1.8364, + "step": 151 + }, + { + "epoch": 0.7676767676767676, + "grad_norm": 0.3852020973392934, + "learning_rate": 2.6626509841208177e-05, + "loss": 1.8723, + "step": 152 + }, + { + "epoch": 0.7727272727272727, + "grad_norm": 0.4471085311517623, + "learning_rate": 2.6573631880511214e-05, + "loss": 1.9017, + "step": 153 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.3858532089668969, + "learning_rate": 2.652039609422463e-05, + "loss": 1.7849, + "step": 154 + }, + { + "epoch": 0.7828282828282829, + "grad_norm": 0.427216562758674, + "learning_rate": 2.6466804128255865e-05, + "loss": 1.7743, + "step": 155 + }, + { + "epoch": 0.7878787878787878, + "grad_norm": 0.38828532121415243, + "learning_rate": 2.6412857639524442e-05, + "loss": 1.8294, + "step": 156 + }, + { + "epoch": 0.7929292929292929, + "grad_norm": 0.40962967785771864, + "learning_rate": 2.6358558295910805e-05, + "loss": 1.8314, + "step": 157 + }, + { + "epoch": 0.797979797979798, + "grad_norm": 0.7209413951413858, + "learning_rate": 2.6303907776204706e-05, + "loss": 1.8388, + "step": 158 + }, + { + "epoch": 0.803030303030303, + "grad_norm": 0.3753158677183495, + "learning_rate": 2.624890777005332e-05, + "loss": 1.9189, + "step": 159 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.3683405444328328, + "learning_rate": 2.6193559977909008e-05, + "loss": 1.7511, + "step": 160 + }, + { + "epoch": 0.8131313131313131, + "grad_norm": 0.420103866946048, + "learning_rate": 2.6137866110976742e-05, + "loss": 1.8447, + "step": 161 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 0.4025395036734605, + "learning_rate": 2.608182789116118e-05, + "loss": 1.7567, + "step": 162 + }, + { + "epoch": 0.8232323232323232, + "grad_norm": 0.41152519566999235, + "learning_rate": 2.6025447051013466e-05, + "loss": 1.8844, + "step": 163 + }, + { + "epoch": 0.8282828282828283, + "grad_norm": 0.37456667248254777, + "learning_rate": 2.5968725333677628e-05, + "loss": 1.8316, + "step": 164 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.3989107911221168, + "learning_rate": 2.5911664492836714e-05, + "loss": 1.8336, + "step": 165 + }, + { + "epoch": 0.8383838383838383, + "grad_norm": 0.4192570186399055, + "learning_rate": 2.585426629265854e-05, + "loss": 1.8368, + "step": 166 + }, + { + "epoch": 0.8434343434343434, + "grad_norm": 0.4039992281383353, + "learning_rate": 2.579653250774119e-05, + "loss": 1.8474, + "step": 167 + }, + { + "epoch": 0.8484848484848485, + "grad_norm": 0.3860934737365767, + "learning_rate": 2.5738464923058118e-05, + "loss": 1.9136, + "step": 168 + }, + { + "epoch": 0.8535353535353535, + "grad_norm": 0.3766141715707985, + "learning_rate": 2.568006533390295e-05, + "loss": 1.7905, + "step": 169 + }, + { + "epoch": 0.8585858585858586, + "grad_norm": 0.43692452251495456, + "learning_rate": 2.562133554583402e-05, + "loss": 1.8692, + "step": 170 + }, + { + "epoch": 0.8636363636363636, + "grad_norm": 0.38506171479168916, + "learning_rate": 2.556227737461852e-05, + "loss": 1.8802, + "step": 171 + }, + { + "epoch": 0.8686868686868687, + "grad_norm": 0.4139103090309799, + "learning_rate": 2.5502892646176364e-05, + "loss": 1.9145, + "step": 172 + }, + { + "epoch": 0.8737373737373737, + "grad_norm": 0.41202711286858845, + "learning_rate": 2.5443183196523744e-05, + "loss": 1.8799, + "step": 173 + }, + { + "epoch": 0.8787878787878788, + "grad_norm": 0.441012649010044, + "learning_rate": 2.5383150871716342e-05, + "loss": 1.7964, + "step": 174 + }, + { + "epoch": 0.8838383838383839, + "grad_norm": 0.3924859380226668, + "learning_rate": 2.5322797527792297e-05, + "loss": 1.8196, + "step": 175 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.484116113199706, + "learning_rate": 2.526212503071477e-05, + "loss": 1.8211, + "step": 176 + }, + { + "epoch": 0.8939393939393939, + "grad_norm": 0.4214268259949748, + "learning_rate": 2.52011352563143e-05, + "loss": 1.8756, + "step": 177 + }, + { + "epoch": 0.898989898989899, + "grad_norm": 0.3761454305197214, + "learning_rate": 2.5139830090230776e-05, + "loss": 1.7887, + "step": 178 + }, + { + "epoch": 0.9040404040404041, + "grad_norm": 0.38472045617953676, + "learning_rate": 2.507821142785516e-05, + "loss": 1.9109, + "step": 179 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.4437224782187781, + "learning_rate": 2.501628117427087e-05, + "loss": 1.8024, + "step": 180 + }, + { + "epoch": 0.9141414141414141, + "grad_norm": 0.3928636586609788, + "learning_rate": 2.4954041244194883e-05, + "loss": 1.8177, + "step": 181 + }, + { + "epoch": 0.9191919191919192, + "grad_norm": 0.40978302308222725, + "learning_rate": 2.4891493561918545e-05, + "loss": 1.8238, + "step": 182 + }, + { + "epoch": 0.9242424242424242, + "grad_norm": 0.41578094468387805, + "learning_rate": 2.482864006124808e-05, + "loss": 1.7558, + "step": 183 + }, + { + "epoch": 0.9292929292929293, + "grad_norm": 0.47459329104171105, + "learning_rate": 2.4765482685444786e-05, + "loss": 1.8982, + "step": 184 + }, + { + "epoch": 0.9343434343434344, + "grad_norm": 0.4584785873115325, + "learning_rate": 2.470202338716497e-05, + "loss": 1.8925, + "step": 185 + }, + { + "epoch": 0.9393939393939394, + "grad_norm": 0.39624647908908, + "learning_rate": 2.4638264128399555e-05, + "loss": 1.8032, + "step": 186 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 0.38536269925326294, + "learning_rate": 2.457420688041345e-05, + "loss": 1.7927, + "step": 187 + }, + { + "epoch": 0.9494949494949495, + "grad_norm": 0.44911468803951493, + "learning_rate": 2.4509853623684598e-05, + "loss": 1.8635, + "step": 188 + }, + { + "epoch": 0.9545454545454546, + "grad_norm": 0.38483613716276227, + "learning_rate": 2.4445206347842714e-05, + "loss": 1.828, + "step": 189 + }, + { + "epoch": 0.9595959595959596, + "grad_norm": 0.4134516508007333, + "learning_rate": 2.43802670516078e-05, + "loss": 1.7766, + "step": 190 + }, + { + "epoch": 0.9646464646464646, + "grad_norm": 0.4205305313105495, + "learning_rate": 2.4315037742728366e-05, + "loss": 1.8162, + "step": 191 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 0.4019754805551098, + "learning_rate": 2.4249520437919307e-05, + "loss": 1.7429, + "step": 192 + }, + { + "epoch": 0.9747474747474747, + "grad_norm": 0.38403144394408584, + "learning_rate": 2.4183717162799587e-05, + "loss": 1.731, + "step": 193 + }, + { + "epoch": 0.9797979797979798, + "grad_norm": 0.3807338127364377, + "learning_rate": 2.4117629951829602e-05, + "loss": 1.8942, + "step": 194 + }, + { + "epoch": 0.9848484848484849, + "grad_norm": 0.44761141331014614, + "learning_rate": 2.4051260848248286e-05, + "loss": 1.8313, + "step": 195 + }, + { + "epoch": 0.98989898989899, + "grad_norm": 0.38554593665783987, + "learning_rate": 2.398461190400993e-05, + "loss": 1.8394, + "step": 196 + }, + { + "epoch": 0.9949494949494949, + "grad_norm": 0.403968521425789, + "learning_rate": 2.3917685179720752e-05, + "loss": 1.8287, + "step": 197 + }, + { + "epoch": 1.0, + "grad_norm": 0.36274397859542107, + "learning_rate": 2.3850482744575177e-05, + "loss": 1.7637, + "step": 198 + }, + { + "epoch": 1.005050505050505, + "grad_norm": 0.5408632160019302, + "learning_rate": 2.3783006676291866e-05, + "loss": 1.6493, + "step": 199 + }, + { + "epoch": 1.0101010101010102, + "grad_norm": 0.4273958454403075, + "learning_rate": 2.3715259061049487e-05, + "loss": 1.5513, + "step": 200 + }, + { + "epoch": 1.0101010101010102, + "eval_loss": 1.4980653524398804, + "eval_runtime": 229.1221, + "eval_samples_per_second": 6.246, + "eval_steps_per_second": 0.781, + "step": 200 + }, + { + "epoch": 1.0151515151515151, + "grad_norm": 0.9048049507315692, + "learning_rate": 2.3647241993422208e-05, + "loss": 1.6547, + "step": 201 + }, + { + "epoch": 1.02020202020202, + "grad_norm": 0.3848882038249155, + "learning_rate": 2.3578957576314944e-05, + "loss": 1.5279, + "step": 202 + }, + { + "epoch": 1.0252525252525253, + "grad_norm": 0.40317657177333205, + "learning_rate": 2.3510407920898327e-05, + "loss": 1.6036, + "step": 203 + }, + { + "epoch": 1.0303030303030303, + "grad_norm": 0.43727100501923616, + "learning_rate": 2.3441595146543458e-05, + "loss": 1.6581, + "step": 204 + }, + { + "epoch": 1.0353535353535352, + "grad_norm": 0.3912845389083466, + "learning_rate": 2.337252138075636e-05, + "loss": 1.5399, + "step": 205 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.3961929212943775, + "learning_rate": 2.3303188759112213e-05, + "loss": 1.6788, + "step": 206 + }, + { + "epoch": 1.0454545454545454, + "grad_norm": 0.4307375964156506, + "learning_rate": 2.3233599425189317e-05, + "loss": 1.6081, + "step": 207 + }, + { + "epoch": 1.0505050505050506, + "grad_norm": 0.3912142497081324, + "learning_rate": 2.316375553050284e-05, + "loss": 1.5678, + "step": 208 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 0.3763414411950308, + "learning_rate": 2.3093659234438266e-05, + "loss": 1.5547, + "step": 209 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 0.36800317307295144, + "learning_rate": 2.3023312704184676e-05, + "loss": 1.4771, + "step": 210 + }, + { + "epoch": 1.0656565656565657, + "grad_norm": 0.3910877207089002, + "learning_rate": 2.295271811466769e-05, + "loss": 1.6495, + "step": 211 + }, + { + "epoch": 1.0707070707070707, + "grad_norm": 0.39199841864196705, + "learning_rate": 2.2881877648482274e-05, + "loss": 1.6182, + "step": 212 + }, + { + "epoch": 1.0757575757575757, + "grad_norm": 0.4188630635601797, + "learning_rate": 2.281079349582524e-05, + "loss": 1.5539, + "step": 213 + }, + { + "epoch": 1.0808080808080809, + "grad_norm": 0.4321942331098238, + "learning_rate": 2.2739467854427512e-05, + "loss": 1.7017, + "step": 214 + }, + { + "epoch": 1.0858585858585859, + "grad_norm": 0.34896226538754127, + "learning_rate": 2.266790292948622e-05, + "loss": 1.6232, + "step": 215 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 0.39588469087020545, + "learning_rate": 2.2596100933596498e-05, + "loss": 1.633, + "step": 216 + }, + { + "epoch": 1.095959595959596, + "grad_norm": 0.3680904345345009, + "learning_rate": 2.252406408668304e-05, + "loss": 1.5771, + "step": 217 + }, + { + "epoch": 1.101010101010101, + "grad_norm": 0.3635291050534616, + "learning_rate": 2.2451794615931542e-05, + "loss": 1.5251, + "step": 218 + }, + { + "epoch": 1.106060606060606, + "grad_norm": 0.39495821190617536, + "learning_rate": 2.237929475571979e-05, + "loss": 1.6461, + "step": 219 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.3741651687847982, + "learning_rate": 2.2306566747548604e-05, + "loss": 1.5491, + "step": 220 + }, + { + "epoch": 1.1161616161616161, + "grad_norm": 0.3482093972061835, + "learning_rate": 2.2233612839972497e-05, + "loss": 1.4896, + "step": 221 + }, + { + "epoch": 1.121212121212121, + "grad_norm": 0.39618420354024975, + "learning_rate": 2.2160435288530208e-05, + "loss": 1.6567, + "step": 222 + }, + { + "epoch": 1.1262626262626263, + "grad_norm": 0.3660259252156906, + "learning_rate": 2.2087036355674947e-05, + "loss": 1.5476, + "step": 223 + }, + { + "epoch": 1.1313131313131313, + "grad_norm": 0.39507488097490023, + "learning_rate": 2.2013418310704422e-05, + "loss": 1.5749, + "step": 224 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 0.4312941458010281, + "learning_rate": 2.1939583429690716e-05, + "loss": 1.5716, + "step": 225 + }, + { + "epoch": 1.1414141414141414, + "grad_norm": 0.35158839668333186, + "learning_rate": 2.1865533995409887e-05, + "loss": 1.5532, + "step": 226 + }, + { + "epoch": 1.1464646464646464, + "grad_norm": 0.3641625201369256, + "learning_rate": 2.1791272297271416e-05, + "loss": 1.5972, + "step": 227 + }, + { + "epoch": 1.1515151515151516, + "grad_norm": 0.4250517041163231, + "learning_rate": 2.1716800631247403e-05, + "loss": 1.6134, + "step": 228 + }, + { + "epoch": 1.1565656565656566, + "grad_norm": 0.4357327884434021, + "learning_rate": 2.1642121299801594e-05, + "loss": 1.6299, + "step": 229 + }, + { + "epoch": 1.1616161616161615, + "grad_norm": 0.36361246043809753, + "learning_rate": 2.1567236611818187e-05, + "loss": 1.6186, + "step": 230 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.38940692200146193, + "learning_rate": 2.149214888253046e-05, + "loss": 1.5636, + "step": 231 + }, + { + "epoch": 1.1717171717171717, + "grad_norm": 0.4479735326446513, + "learning_rate": 2.1416860433449177e-05, + "loss": 1.6342, + "step": 232 + }, + { + "epoch": 1.1767676767676767, + "grad_norm": 0.36500909652831554, + "learning_rate": 2.1341373592290822e-05, + "loss": 1.545, + "step": 233 + }, + { + "epoch": 1.1818181818181819, + "grad_norm": 0.41140999886470425, + "learning_rate": 2.126569069290562e-05, + "loss": 1.6267, + "step": 234 + }, + { + "epoch": 1.1868686868686869, + "grad_norm": 0.37095591557372914, + "learning_rate": 2.1189814075205406e-05, + "loss": 1.5176, + "step": 235 + }, + { + "epoch": 1.1919191919191918, + "grad_norm": 0.3754937123457338, + "learning_rate": 2.1113746085091246e-05, + "loss": 1.6441, + "step": 236 + }, + { + "epoch": 1.196969696969697, + "grad_norm": 0.42464228140478444, + "learning_rate": 2.1037489074380934e-05, + "loss": 1.574, + "step": 237 + }, + { + "epoch": 1.202020202020202, + "grad_norm": 0.3732085771180313, + "learning_rate": 2.0961045400736286e-05, + "loss": 1.5687, + "step": 238 + }, + { + "epoch": 1.2070707070707072, + "grad_norm": 0.4174666906233983, + "learning_rate": 2.0884417427590217e-05, + "loss": 1.5579, + "step": 239 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 0.414246878313963, + "learning_rate": 2.0807607524073684e-05, + "loss": 1.6069, + "step": 240 + }, + { + "epoch": 1.2171717171717171, + "grad_norm": 0.35489448104499416, + "learning_rate": 2.073061806494246e-05, + "loss": 1.5943, + "step": 241 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.3961867107004295, + "learning_rate": 2.0653451430503686e-05, + "loss": 1.5851, + "step": 242 + }, + { + "epoch": 1.2272727272727273, + "grad_norm": 0.4264411067539511, + "learning_rate": 2.0576110006542278e-05, + "loss": 1.5436, + "step": 243 + }, + { + "epoch": 1.2323232323232323, + "grad_norm": 0.43088202335465686, + "learning_rate": 2.0498596184247196e-05, + "loss": 1.642, + "step": 244 + }, + { + "epoch": 1.2373737373737375, + "grad_norm": 0.36469200409321884, + "learning_rate": 2.0420912360137466e-05, + "loss": 1.6031, + "step": 245 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.4153699560933353, + "learning_rate": 2.0343060935988136e-05, + "loss": 1.6385, + "step": 246 + }, + { + "epoch": 1.2474747474747474, + "grad_norm": 0.39846520447706185, + "learning_rate": 2.0265044318755988e-05, + "loss": 1.5288, + "step": 247 + }, + { + "epoch": 1.2525252525252526, + "grad_norm": 0.35709349697159576, + "learning_rate": 2.018686492050513e-05, + "loss": 1.6124, + "step": 248 + }, + { + "epoch": 1.2575757575757576, + "grad_norm": 0.38319549330931446, + "learning_rate": 2.010852515833242e-05, + "loss": 1.535, + "step": 249 + }, + { + "epoch": 1.2626262626262625, + "grad_norm": 0.46317037418448487, + "learning_rate": 2.003002745429274e-05, + "loss": 1.6523, + "step": 250 + }, + { + "epoch": 1.2626262626262625, + "eval_loss": 1.498921275138855, + "eval_runtime": 229.2608, + "eval_samples_per_second": 6.242, + "eval_steps_per_second": 0.781, + "step": 250 + }, + { + "epoch": 1.2676767676767677, + "grad_norm": 0.386123708626963, + "learning_rate": 1.9951374235324105e-05, + "loss": 1.6125, + "step": 251 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 0.3970356471812719, + "learning_rate": 1.9872567933172647e-05, + "loss": 1.586, + "step": 252 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 0.4033780506493091, + "learning_rate": 1.97936109843174e-05, + "loss": 1.5574, + "step": 253 + }, + { + "epoch": 1.2828282828282829, + "grad_norm": 0.39482348113655924, + "learning_rate": 1.9714505829895004e-05, + "loss": 1.6136, + "step": 254 + }, + { + "epoch": 1.2878787878787878, + "grad_norm": 0.45725538920837333, + "learning_rate": 1.963525491562421e-05, + "loss": 1.6221, + "step": 255 + }, + { + "epoch": 1.2929292929292928, + "grad_norm": 0.4259870746825829, + "learning_rate": 1.9555860691730277e-05, + "loss": 1.5543, + "step": 256 + }, + { + "epoch": 1.297979797979798, + "grad_norm": 0.38172206665482866, + "learning_rate": 1.9476325612869202e-05, + "loss": 1.4709, + "step": 257 + }, + { + "epoch": 1.303030303030303, + "grad_norm": 0.3996792794666037, + "learning_rate": 1.9396652138051844e-05, + "loss": 1.6108, + "step": 258 + }, + { + "epoch": 1.308080808080808, + "grad_norm": 0.4433312059533111, + "learning_rate": 1.9316842730567902e-05, + "loss": 1.627, + "step": 259 + }, + { + "epoch": 1.3131313131313131, + "grad_norm": 0.36892676364281046, + "learning_rate": 1.923689985790974e-05, + "loss": 1.4977, + "step": 260 + }, + { + "epoch": 1.3181818181818181, + "grad_norm": 0.3785785048742356, + "learning_rate": 1.9156825991696096e-05, + "loss": 1.5503, + "step": 261 + }, + { + "epoch": 1.3232323232323233, + "grad_norm": 0.4339005608639186, + "learning_rate": 1.9076623607595696e-05, + "loss": 1.5747, + "step": 262 + }, + { + "epoch": 1.3282828282828283, + "grad_norm": 0.37171089149936926, + "learning_rate": 1.8996295185250682e-05, + "loss": 1.4574, + "step": 263 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.383278000804476, + "learning_rate": 1.8915843208199967e-05, + "loss": 1.6036, + "step": 264 + }, + { + "epoch": 1.3383838383838385, + "grad_norm": 0.41536599243507505, + "learning_rate": 1.8835270163802433e-05, + "loss": 1.6381, + "step": 265 + }, + { + "epoch": 1.3434343434343434, + "grad_norm": 0.38161228496015287, + "learning_rate": 1.8754578543160045e-05, + "loss": 1.5011, + "step": 266 + }, + { + "epoch": 1.3484848484848486, + "grad_norm": 0.3744640963542385, + "learning_rate": 1.867377084104083e-05, + "loss": 1.6644, + "step": 267 + }, + { + "epoch": 1.3535353535353536, + "grad_norm": 0.3733108852557401, + "learning_rate": 1.8592849555801746e-05, + "loss": 1.6011, + "step": 268 + }, + { + "epoch": 1.3585858585858586, + "grad_norm": 0.4411122207695828, + "learning_rate": 1.851181718931141e-05, + "loss": 1.5586, + "step": 269 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.38080346450070857, + "learning_rate": 1.84306762468728e-05, + "loss": 1.5985, + "step": 270 + }, + { + "epoch": 1.3686868686868687, + "grad_norm": 0.3491871213017396, + "learning_rate": 1.8349429237145776e-05, + "loss": 1.5688, + "step": 271 + }, + { + "epoch": 1.3737373737373737, + "grad_norm": 0.37833218710396194, + "learning_rate": 1.8268078672069478e-05, + "loss": 1.5583, + "step": 272 + }, + { + "epoch": 1.378787878787879, + "grad_norm": 0.5250217329172516, + "learning_rate": 1.818662706678473e-05, + "loss": 1.5692, + "step": 273 + }, + { + "epoch": 1.3838383838383839, + "grad_norm": 0.37119240386882224, + "learning_rate": 1.8105076939556238e-05, + "loss": 1.5941, + "step": 274 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.3439611330263627, + "learning_rate": 1.8023430811694746e-05, + "loss": 1.6779, + "step": 275 + }, + { + "epoch": 1.393939393939394, + "grad_norm": 0.3327777793630774, + "learning_rate": 1.7941691207479067e-05, + "loss": 1.6037, + "step": 276 + }, + { + "epoch": 1.398989898989899, + "grad_norm": 0.34323800459831283, + "learning_rate": 1.7859860654078065e-05, + "loss": 1.6266, + "step": 277 + }, + { + "epoch": 1.404040404040404, + "grad_norm": 0.3647890764031767, + "learning_rate": 1.77779416814725e-05, + "loss": 1.5636, + "step": 278 + }, + { + "epoch": 1.4090909090909092, + "grad_norm": 0.3686098374668375, + "learning_rate": 1.769593682237682e-05, + "loss": 1.6069, + "step": 279 + }, + { + "epoch": 1.4141414141414141, + "grad_norm": 0.33500372009414925, + "learning_rate": 1.7613848612160857e-05, + "loss": 1.5403, + "step": 280 + }, + { + "epoch": 1.4191919191919191, + "grad_norm": 0.35011148506577505, + "learning_rate": 1.753167958877143e-05, + "loss": 1.6803, + "step": 281 + }, + { + "epoch": 1.4242424242424243, + "grad_norm": 0.3903537969599135, + "learning_rate": 1.7449432292653875e-05, + "loss": 1.5327, + "step": 282 + }, + { + "epoch": 1.4292929292929293, + "grad_norm": 0.3511906044729009, + "learning_rate": 1.736710926667352e-05, + "loss": 1.5245, + "step": 283 + }, + { + "epoch": 1.4343434343434343, + "grad_norm": 0.36248283366049505, + "learning_rate": 1.7284713056037074e-05, + "loss": 1.6522, + "step": 284 + }, + { + "epoch": 1.4393939393939394, + "grad_norm": 0.3698906007038963, + "learning_rate": 1.720224620821389e-05, + "loss": 1.612, + "step": 285 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.36917069694620513, + "learning_rate": 1.7119711272857242e-05, + "loss": 1.6716, + "step": 286 + }, + { + "epoch": 1.4494949494949494, + "grad_norm": 0.3504632139356775, + "learning_rate": 1.7037110801725498e-05, + "loss": 1.567, + "step": 287 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 0.38102242984213275, + "learning_rate": 1.69544473486032e-05, + "loss": 1.5831, + "step": 288 + }, + { + "epoch": 1.4595959595959596, + "grad_norm": 0.3490442364729326, + "learning_rate": 1.687172346922213e-05, + "loss": 1.5999, + "step": 289 + }, + { + "epoch": 1.4646464646464645, + "grad_norm": 0.36706394830835837, + "learning_rate": 1.67889417211823e-05, + "loss": 1.6264, + "step": 290 + }, + { + "epoch": 1.4696969696969697, + "grad_norm": 0.330869571421232, + "learning_rate": 1.670610466387285e-05, + "loss": 1.5905, + "step": 291 + }, + { + "epoch": 1.4747474747474747, + "grad_norm": 0.35481611568449467, + "learning_rate": 1.662321485839294e-05, + "loss": 1.5783, + "step": 292 + }, + { + "epoch": 1.4797979797979797, + "grad_norm": 0.420284177177337, + "learning_rate": 1.6540274867472554e-05, + "loss": 1.5323, + "step": 293 + }, + { + "epoch": 1.4848484848484849, + "grad_norm": 0.3733856344040281, + "learning_rate": 1.645728725539329e-05, + "loss": 1.5342, + "step": 294 + }, + { + "epoch": 1.4898989898989898, + "grad_norm": 0.35109601539946006, + "learning_rate": 1.637425458790905e-05, + "loss": 1.5483, + "step": 295 + }, + { + "epoch": 1.494949494949495, + "grad_norm": 0.341135998502388, + "learning_rate": 1.6291179432166737e-05, + "loss": 1.6149, + "step": 296 + }, + { + "epoch": 1.5, + "grad_norm": 0.35535095307312015, + "learning_rate": 1.620806435662687e-05, + "loss": 1.5373, + "step": 297 + }, + { + "epoch": 1.5050505050505052, + "grad_norm": 0.36290854386190635, + "learning_rate": 1.612491193098419e-05, + "loss": 1.6085, + "step": 298 + }, + { + "epoch": 1.51010101010101, + "grad_norm": 0.3571923405167369, + "learning_rate": 1.6041724726088187e-05, + "loss": 1.5928, + "step": 299 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 0.36396420706496496, + "learning_rate": 1.5958505313863654e-05, + "loss": 1.5806, + "step": 300 + }, + { + "epoch": 1.5151515151515151, + "eval_loss": 1.4940327405929565, + "eval_runtime": 229.2657, + "eval_samples_per_second": 6.242, + "eval_steps_per_second": 0.781, + "step": 300 + }, + { + "epoch": 1.5202020202020203, + "grad_norm": 0.3431930051075577, + "learning_rate": 1.587525626723113e-05, + "loss": 1.5698, + "step": 301 + }, + { + "epoch": 1.5252525252525253, + "grad_norm": 0.3620485249342895, + "learning_rate": 1.5791980160027376e-05, + "loss": 1.5985, + "step": 302 + }, + { + "epoch": 1.5303030303030303, + "grad_norm": 0.34146120936201213, + "learning_rate": 1.570867956692579e-05, + "loss": 1.5825, + "step": 303 + }, + { + "epoch": 1.5353535353535355, + "grad_norm": 0.3545235530221104, + "learning_rate": 1.5625357063356825e-05, + "loss": 1.5798, + "step": 304 + }, + { + "epoch": 1.5404040404040404, + "grad_norm": 0.34094408848239416, + "learning_rate": 1.5542015225428314e-05, + "loss": 1.6249, + "step": 305 + }, + { + "epoch": 1.5454545454545454, + "grad_norm": 0.3574679152501359, + "learning_rate": 1.545865662984589e-05, + "loss": 1.6048, + "step": 306 + }, + { + "epoch": 1.5505050505050506, + "grad_norm": 0.3581945699930548, + "learning_rate": 1.5375283853833272e-05, + "loss": 1.7071, + "step": 307 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.34332771314631755, + "learning_rate": 1.5291899475052596e-05, + "loss": 1.563, + "step": 308 + }, + { + "epoch": 1.5606060606060606, + "grad_norm": 0.3585885575803054, + "learning_rate": 1.5208506071524727e-05, + "loss": 1.5551, + "step": 309 + }, + { + "epoch": 1.5656565656565657, + "grad_norm": 0.3478032073365952, + "learning_rate": 1.5125106221549567e-05, + "loss": 1.6227, + "step": 310 + }, + { + "epoch": 1.5707070707070707, + "grad_norm": 0.34451168320759784, + "learning_rate": 1.5041702503626296e-05, + "loss": 1.5625, + "step": 311 + }, + { + "epoch": 1.5757575757575757, + "grad_norm": 0.3625884247911658, + "learning_rate": 1.4958297496373708e-05, + "loss": 1.622, + "step": 312 + }, + { + "epoch": 1.5808080808080809, + "grad_norm": 0.3390738782534803, + "learning_rate": 1.4874893778450436e-05, + "loss": 1.5539, + "step": 313 + }, + { + "epoch": 1.5858585858585859, + "grad_norm": 0.36415276028038074, + "learning_rate": 1.4791493928475275e-05, + "loss": 1.5946, + "step": 314 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 0.3447466909227647, + "learning_rate": 1.4708100524947413e-05, + "loss": 1.6482, + "step": 315 + }, + { + "epoch": 1.595959595959596, + "grad_norm": 0.3573366580845515, + "learning_rate": 1.4624716146166734e-05, + "loss": 1.6493, + "step": 316 + }, + { + "epoch": 1.601010101010101, + "grad_norm": 0.35959249143609046, + "learning_rate": 1.4541343370154115e-05, + "loss": 1.6135, + "step": 317 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 0.3323927118670595, + "learning_rate": 1.4457984774571692e-05, + "loss": 1.6435, + "step": 318 + }, + { + "epoch": 1.6111111111111112, + "grad_norm": 0.34075152367107814, + "learning_rate": 1.437464293664318e-05, + "loss": 1.6634, + "step": 319 + }, + { + "epoch": 1.6161616161616161, + "grad_norm": 0.3399955824391449, + "learning_rate": 1.4291320433074213e-05, + "loss": 1.6254, + "step": 320 + }, + { + "epoch": 1.621212121212121, + "grad_norm": 0.3427914203743951, + "learning_rate": 1.4208019839972627e-05, + "loss": 1.5906, + "step": 321 + }, + { + "epoch": 1.6262626262626263, + "grad_norm": 0.3394828693597387, + "learning_rate": 1.4124743732768873e-05, + "loss": 1.6156, + "step": 322 + }, + { + "epoch": 1.6313131313131313, + "grad_norm": 0.3854380971355761, + "learning_rate": 1.4041494686136348e-05, + "loss": 1.6758, + "step": 323 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 0.3391843715071689, + "learning_rate": 1.3958275273911812e-05, + "loss": 1.6073, + "step": 324 + }, + { + "epoch": 1.6414141414141414, + "grad_norm": 0.34941530909401375, + "learning_rate": 1.3875088069015815e-05, + "loss": 1.7296, + "step": 325 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.3645855591600974, + "learning_rate": 1.3791935643373133e-05, + "loss": 1.5878, + "step": 326 + }, + { + "epoch": 1.6515151515151514, + "grad_norm": 0.3818786490819524, + "learning_rate": 1.3708820567833266e-05, + "loss": 1.6895, + "step": 327 + }, + { + "epoch": 1.6565656565656566, + "grad_norm": 0.3918329456403931, + "learning_rate": 1.3625745412090953e-05, + "loss": 1.5491, + "step": 328 + }, + { + "epoch": 1.6616161616161618, + "grad_norm": 0.3685269378673387, + "learning_rate": 1.3542712744606712e-05, + "loss": 1.5422, + "step": 329 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.3784489076280344, + "learning_rate": 1.3459725132527448e-05, + "loss": 1.516, + "step": 330 + }, + { + "epoch": 1.6717171717171717, + "grad_norm": 0.35908318066713363, + "learning_rate": 1.3376785141607067e-05, + "loss": 1.5082, + "step": 331 + }, + { + "epoch": 1.676767676767677, + "grad_norm": 0.3600676479254767, + "learning_rate": 1.329389533612715e-05, + "loss": 1.6618, + "step": 332 + }, + { + "epoch": 1.6818181818181817, + "grad_norm": 0.3408967547242199, + "learning_rate": 1.3211058278817705e-05, + "loss": 1.5369, + "step": 333 + }, + { + "epoch": 1.6868686868686869, + "grad_norm": 0.3397113886527206, + "learning_rate": 1.3128276530777874e-05, + "loss": 1.5655, + "step": 334 + }, + { + "epoch": 1.691919191919192, + "grad_norm": 0.3552202693938084, + "learning_rate": 1.3045552651396805e-05, + "loss": 1.5913, + "step": 335 + }, + { + "epoch": 1.696969696969697, + "grad_norm": 0.3744742946582953, + "learning_rate": 1.2962889198274506e-05, + "loss": 1.5468, + "step": 336 + }, + { + "epoch": 1.702020202020202, + "grad_norm": 0.34653332317369845, + "learning_rate": 1.2880288727142757e-05, + "loss": 1.6004, + "step": 337 + }, + { + "epoch": 1.7070707070707072, + "grad_norm": 0.31916935496978377, + "learning_rate": 1.2797753791786112e-05, + "loss": 1.5545, + "step": 338 + }, + { + "epoch": 1.7121212121212122, + "grad_norm": 0.34268191483789684, + "learning_rate": 1.2715286943962925e-05, + "loss": 1.5286, + "step": 339 + }, + { + "epoch": 1.7171717171717171, + "grad_norm": 0.34934463675899574, + "learning_rate": 1.2632890733326475e-05, + "loss": 1.6462, + "step": 340 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 0.35309417659426007, + "learning_rate": 1.255056770734613e-05, + "loss": 1.6459, + "step": 341 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 0.36011757823593665, + "learning_rate": 1.2468320411228579e-05, + "loss": 1.5716, + "step": 342 + }, + { + "epoch": 1.7323232323232323, + "grad_norm": 0.31144567675642815, + "learning_rate": 1.2386151387839145e-05, + "loss": 1.4455, + "step": 343 + }, + { + "epoch": 1.7373737373737375, + "grad_norm": 0.3723102104349624, + "learning_rate": 1.2304063177623182e-05, + "loss": 1.6399, + "step": 344 + }, + { + "epoch": 1.7424242424242424, + "grad_norm": 0.38916725203253794, + "learning_rate": 1.2222058318527502e-05, + "loss": 1.553, + "step": 345 + }, + { + "epoch": 1.7474747474747474, + "grad_norm": 0.3322536259081415, + "learning_rate": 1.214013934592194e-05, + "loss": 1.492, + "step": 346 + }, + { + "epoch": 1.7525252525252526, + "grad_norm": 0.37090482924070856, + "learning_rate": 1.2058308792520937e-05, + "loss": 1.5703, + "step": 347 + }, + { + "epoch": 1.7575757575757576, + "grad_norm": 0.38587618815415825, + "learning_rate": 1.1976569188305255e-05, + "loss": 1.6146, + "step": 348 + }, + { + "epoch": 1.7626262626262625, + "grad_norm": 0.37194117916485847, + "learning_rate": 1.1894923060443763e-05, + "loss": 1.5854, + "step": 349 + }, + { + "epoch": 1.7676767676767677, + "grad_norm": 0.39535022296888556, + "learning_rate": 1.1813372933215274e-05, + "loss": 1.5918, + "step": 350 + }, + { + "epoch": 1.7676767676767677, + "eval_loss": 1.4877514839172363, + "eval_runtime": 229.0648, + "eval_samples_per_second": 6.247, + "eval_steps_per_second": 0.781, + "step": 350 + }, + { + "epoch": 1.7727272727272727, + "grad_norm": 0.35051004852162915, + "learning_rate": 1.1731921327930523e-05, + "loss": 1.6095, + "step": 351 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.34675700639977675, + "learning_rate": 1.165057076285423e-05, + "loss": 1.5565, + "step": 352 + }, + { + "epoch": 1.7828282828282829, + "grad_norm": 0.325905557218622, + "learning_rate": 1.1569323753127196e-05, + "loss": 1.5533, + "step": 353 + }, + { + "epoch": 1.7878787878787878, + "grad_norm": 0.3713917306740112, + "learning_rate": 1.1488182810688593e-05, + "loss": 1.5455, + "step": 354 + }, + { + "epoch": 1.7929292929292928, + "grad_norm": 0.405573404381939, + "learning_rate": 1.1407150444198262e-05, + "loss": 1.6257, + "step": 355 + }, + { + "epoch": 1.797979797979798, + "grad_norm": 0.34148070848500733, + "learning_rate": 1.132622915895917e-05, + "loss": 1.6975, + "step": 356 + }, + { + "epoch": 1.803030303030303, + "grad_norm": 0.34138980212543457, + "learning_rate": 1.1245421456839954e-05, + "loss": 1.5417, + "step": 357 + }, + { + "epoch": 1.808080808080808, + "grad_norm": 0.3458043789729876, + "learning_rate": 1.116472983619757e-05, + "loss": 1.6212, + "step": 358 + }, + { + "epoch": 1.8131313131313131, + "grad_norm": 0.34821285941193825, + "learning_rate": 1.1084156791800036e-05, + "loss": 1.6329, + "step": 359 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.35583032341592974, + "learning_rate": 1.100370481474932e-05, + "loss": 1.6058, + "step": 360 + }, + { + "epoch": 1.823232323232323, + "grad_norm": 0.3347559695941822, + "learning_rate": 1.0923376392404302e-05, + "loss": 1.5184, + "step": 361 + }, + { + "epoch": 1.8282828282828283, + "grad_norm": 0.3914463684785098, + "learning_rate": 1.0843174008303908e-05, + "loss": 1.6334, + "step": 362 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.3665474032055003, + "learning_rate": 1.0763100142090267e-05, + "loss": 1.6412, + "step": 363 + }, + { + "epoch": 1.8383838383838382, + "grad_norm": 0.3342017672784236, + "learning_rate": 1.0683157269432097e-05, + "loss": 1.6167, + "step": 364 + }, + { + "epoch": 1.8434343434343434, + "grad_norm": 0.3300768200411088, + "learning_rate": 1.0603347861948155e-05, + "loss": 1.6253, + "step": 365 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.3573589534814461, + "learning_rate": 1.0523674387130806e-05, + "loss": 1.5936, + "step": 366 + }, + { + "epoch": 1.8535353535353534, + "grad_norm": 0.3476694128643829, + "learning_rate": 1.0444139308269725e-05, + "loss": 1.6644, + "step": 367 + }, + { + "epoch": 1.8585858585858586, + "grad_norm": 0.38035554483291945, + "learning_rate": 1.036474508437579e-05, + "loss": 1.6168, + "step": 368 + }, + { + "epoch": 1.8636363636363638, + "grad_norm": 0.3334234919513792, + "learning_rate": 1.0285494170104996e-05, + "loss": 1.6633, + "step": 369 + }, + { + "epoch": 1.8686868686868687, + "grad_norm": 0.3265228255184419, + "learning_rate": 1.0206389015682601e-05, + "loss": 1.5896, + "step": 370 + }, + { + "epoch": 1.8737373737373737, + "grad_norm": 0.3691401309714038, + "learning_rate": 1.0127432066827357e-05, + "loss": 1.5916, + "step": 371 + }, + { + "epoch": 1.878787878787879, + "grad_norm": 0.33985591597069215, + "learning_rate": 1.0048625764675896e-05, + "loss": 1.6083, + "step": 372 + }, + { + "epoch": 1.8838383838383839, + "grad_norm": 0.3654336922923449, + "learning_rate": 9.969972545707266e-06, + "loss": 1.6688, + "step": 373 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.3474815931456802, + "learning_rate": 9.891474841667585e-06, + "loss": 1.6077, + "step": 374 + }, + { + "epoch": 1.893939393939394, + "grad_norm": 0.35105409408446747, + "learning_rate": 9.813135079494872e-06, + "loss": 1.6133, + "step": 375 + }, + { + "epoch": 1.898989898989899, + "grad_norm": 0.3412933734315999, + "learning_rate": 9.734955681244016e-06, + "loss": 1.6069, + "step": 376 + }, + { + "epoch": 1.904040404040404, + "grad_norm": 0.32783066972147346, + "learning_rate": 9.656939064011861e-06, + "loss": 1.4873, + "step": 377 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 0.3478856883264074, + "learning_rate": 9.579087639862538e-06, + "loss": 1.5366, + "step": 378 + }, + { + "epoch": 1.9141414141414141, + "grad_norm": 0.3539395477762644, + "learning_rate": 9.501403815752813e-06, + "loss": 1.5213, + "step": 379 + }, + { + "epoch": 1.9191919191919191, + "grad_norm": 0.3581622590288963, + "learning_rate": 9.423889993457721e-06, + "loss": 1.6273, + "step": 380 + }, + { + "epoch": 1.9242424242424243, + "grad_norm": 0.3335926440610502, + "learning_rate": 9.346548569496318e-06, + "loss": 1.6772, + "step": 381 + }, + { + "epoch": 1.9292929292929293, + "grad_norm": 0.3413862529782368, + "learning_rate": 9.269381935057546e-06, + "loss": 1.5744, + "step": 382 + }, + { + "epoch": 1.9343434343434343, + "grad_norm": 0.378369060255768, + "learning_rate": 9.192392475926315e-06, + "loss": 1.6305, + "step": 383 + }, + { + "epoch": 1.9393939393939394, + "grad_norm": 0.3508043937268879, + "learning_rate": 9.115582572409789e-06, + "loss": 1.6425, + "step": 384 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.3566255219446304, + "learning_rate": 9.038954599263713e-06, + "loss": 1.5535, + "step": 385 + }, + { + "epoch": 1.9494949494949494, + "grad_norm": 0.536928554074425, + "learning_rate": 8.962510925619065e-06, + "loss": 1.5138, + "step": 386 + }, + { + "epoch": 1.9545454545454546, + "grad_norm": 0.37533231697996555, + "learning_rate": 8.88625391490876e-06, + "loss": 1.5625, + "step": 387 + }, + { + "epoch": 1.9595959595959596, + "grad_norm": 0.3431698476036675, + "learning_rate": 8.8101859247946e-06, + "loss": 1.5796, + "step": 388 + }, + { + "epoch": 1.9646464646464645, + "grad_norm": 0.38914902758530917, + "learning_rate": 8.734309307094382e-06, + "loss": 1.6072, + "step": 389 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 0.3184503763537262, + "learning_rate": 8.658626407709182e-06, + "loss": 1.5288, + "step": 390 + }, + { + "epoch": 1.9747474747474747, + "grad_norm": 0.32890220573653406, + "learning_rate": 8.583139566550827e-06, + "loss": 1.592, + "step": 391 + }, + { + "epoch": 1.9797979797979797, + "grad_norm": 0.3395800990661379, + "learning_rate": 8.507851117469546e-06, + "loss": 1.5736, + "step": 392 + }, + { + "epoch": 1.9848484848484849, + "grad_norm": 0.34733690681577234, + "learning_rate": 8.432763388181812e-06, + "loss": 1.5791, + "step": 393 + }, + { + "epoch": 1.98989898989899, + "grad_norm": 0.37942192437254635, + "learning_rate": 8.357878700198407e-06, + "loss": 1.5927, + "step": 394 + }, + { + "epoch": 1.9949494949494948, + "grad_norm": 0.3285509965028684, + "learning_rate": 8.283199368752598e-06, + "loss": 1.5848, + "step": 395 + }, + { + "epoch": 2.0, + "grad_norm": 0.3237504993576643, + "learning_rate": 8.208727702728586e-06, + "loss": 1.5468, + "step": 396 + }, + { + "epoch": 2.005050505050505, + "grad_norm": 0.6489959146891373, + "learning_rate": 8.134466004590116e-06, + "loss": 1.3495, + "step": 397 + }, + { + "epoch": 2.01010101010101, + "grad_norm": 0.5603926475365507, + "learning_rate": 8.060416570309291e-06, + "loss": 1.3804, + "step": 398 + }, + { + "epoch": 2.015151515151515, + "grad_norm": 0.49584097590048715, + "learning_rate": 7.986581689295577e-06, + "loss": 1.3215, + "step": 399 + }, + { + "epoch": 2.0202020202020203, + "grad_norm": 0.4392542856654279, + "learning_rate": 7.912963644325057e-06, + "loss": 1.3675, + "step": 400 + }, + { + "epoch": 2.0202020202020203, + "eval_loss": 1.5459927320480347, + "eval_runtime": 229.9347, + "eval_samples_per_second": 6.224, + "eval_steps_per_second": 0.778, + "step": 400 + }, + { + "epoch": 2.025252525252525, + "grad_norm": 0.5550727689847038, + "learning_rate": 7.839564711469786e-06, + "loss": 1.3907, + "step": 401 + }, + { + "epoch": 2.0303030303030303, + "grad_norm": 0.6935079495968718, + "learning_rate": 7.766387160027504e-06, + "loss": 1.4597, + "step": 402 + }, + { + "epoch": 2.0353535353535355, + "grad_norm": 0.5919993820181615, + "learning_rate": 7.693433252451404e-06, + "loss": 1.3068, + "step": 403 + }, + { + "epoch": 2.04040404040404, + "grad_norm": 0.4444393538098348, + "learning_rate": 7.620705244280208e-06, + "loss": 1.3455, + "step": 404 + }, + { + "epoch": 2.0454545454545454, + "grad_norm": 0.44018034109403575, + "learning_rate": 7.548205384068461e-06, + "loss": 1.4318, + "step": 405 + }, + { + "epoch": 2.0505050505050506, + "grad_norm": 0.47554567734474, + "learning_rate": 7.475935913316967e-06, + "loss": 1.3565, + "step": 406 + }, + { + "epoch": 2.0555555555555554, + "grad_norm": 0.4485832265306745, + "learning_rate": 7.403899066403506e-06, + "loss": 1.3344, + "step": 407 + }, + { + "epoch": 2.0606060606060606, + "grad_norm": 0.4155215847745916, + "learning_rate": 7.332097070513776e-06, + "loss": 1.3103, + "step": 408 + }, + { + "epoch": 2.0656565656565657, + "grad_norm": 0.3796385213739081, + "learning_rate": 7.260532145572487e-06, + "loss": 1.3414, + "step": 409 + }, + { + "epoch": 2.0707070707070705, + "grad_norm": 0.40547334324608847, + "learning_rate": 7.1892065041747645e-06, + "loss": 1.3815, + "step": 410 + }, + { + "epoch": 2.0757575757575757, + "grad_norm": 0.39981507933068694, + "learning_rate": 7.118122351517729e-06, + "loss": 1.3698, + "step": 411 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 0.380701129157713, + "learning_rate": 7.047281885332311e-06, + "loss": 1.348, + "step": 412 + }, + { + "epoch": 2.0858585858585856, + "grad_norm": 0.3836974581541541, + "learning_rate": 6.976687295815327e-06, + "loss": 1.3143, + "step": 413 + }, + { + "epoch": 2.090909090909091, + "grad_norm": 0.3778921933523212, + "learning_rate": 6.906340765561734e-06, + "loss": 1.3396, + "step": 414 + }, + { + "epoch": 2.095959595959596, + "grad_norm": 0.38922397224846605, + "learning_rate": 6.836244469497159e-06, + "loss": 1.4238, + "step": 415 + }, + { + "epoch": 2.101010101010101, + "grad_norm": 0.35783821916056857, + "learning_rate": 6.7664005748106875e-06, + "loss": 1.3582, + "step": 416 + }, + { + "epoch": 2.106060606060606, + "grad_norm": 0.35939456738266723, + "learning_rate": 6.69681124088779e-06, + "loss": 1.3783, + "step": 417 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.3498463091785103, + "learning_rate": 6.627478619243642e-06, + "loss": 1.3671, + "step": 418 + }, + { + "epoch": 2.1161616161616164, + "grad_norm": 0.35085992990906917, + "learning_rate": 6.558404853456545e-06, + "loss": 1.2929, + "step": 419 + }, + { + "epoch": 2.121212121212121, + "grad_norm": 0.36844923629881426, + "learning_rate": 6.489592079101671e-06, + "loss": 1.422, + "step": 420 + }, + { + "epoch": 2.1262626262626263, + "grad_norm": 0.3459330366665105, + "learning_rate": 6.421042423685059e-06, + "loss": 1.3818, + "step": 421 + }, + { + "epoch": 2.1313131313131315, + "grad_norm": 0.34989994293169857, + "learning_rate": 6.352758006577794e-06, + "loss": 1.4177, + "step": 422 + }, + { + "epoch": 2.1363636363636362, + "grad_norm": 0.3267726260945404, + "learning_rate": 6.284740938950517e-06, + "loss": 1.3085, + "step": 423 + }, + { + "epoch": 2.1414141414141414, + "grad_norm": 0.3513152908110021, + "learning_rate": 6.2169933237081386e-06, + "loss": 1.4144, + "step": 424 + }, + { + "epoch": 2.1464646464646466, + "grad_norm": 0.35199489451607285, + "learning_rate": 6.149517255424823e-06, + "loss": 1.3202, + "step": 425 + }, + { + "epoch": 2.1515151515151514, + "grad_norm": 0.35307815632276385, + "learning_rate": 6.0823148202792474e-06, + "loss": 1.3623, + "step": 426 + }, + { + "epoch": 2.1565656565656566, + "grad_norm": 0.3470711355464423, + "learning_rate": 6.015388095990069e-06, + "loss": 1.3437, + "step": 427 + }, + { + "epoch": 2.1616161616161618, + "grad_norm": 0.3362602033819937, + "learning_rate": 5.948739151751716e-06, + "loss": 1.3541, + "step": 428 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.34033946687573235, + "learning_rate": 5.882370048170403e-06, + "loss": 1.4119, + "step": 429 + }, + { + "epoch": 2.1717171717171717, + "grad_norm": 0.3532405636078066, + "learning_rate": 5.81628283720042e-06, + "loss": 1.4835, + "step": 430 + }, + { + "epoch": 2.176767676767677, + "grad_norm": 0.32887202826818873, + "learning_rate": 5.7504795620806964e-06, + "loss": 1.3296, + "step": 431 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 0.3411420041143714, + "learning_rate": 5.684962257271636e-06, + "loss": 1.3661, + "step": 432 + }, + { + "epoch": 2.186868686868687, + "grad_norm": 0.31237309569104227, + "learning_rate": 5.619732948392197e-06, + "loss": 1.2338, + "step": 433 + }, + { + "epoch": 2.191919191919192, + "grad_norm": 0.3413488930865825, + "learning_rate": 5.55479365215729e-06, + "loss": 1.3423, + "step": 434 + }, + { + "epoch": 2.196969696969697, + "grad_norm": 0.3569265566015804, + "learning_rate": 5.490146376315405e-06, + "loss": 1.3593, + "step": 435 + }, + { + "epoch": 2.202020202020202, + "grad_norm": 0.36112224488349237, + "learning_rate": 5.42579311958655e-06, + "loss": 1.302, + "step": 436 + }, + { + "epoch": 2.207070707070707, + "grad_norm": 0.3276471583119553, + "learning_rate": 5.361735871600451e-06, + "loss": 1.3311, + "step": 437 + }, + { + "epoch": 2.212121212121212, + "grad_norm": 0.3156487456237961, + "learning_rate": 5.297976612835038e-06, + "loss": 1.3156, + "step": 438 + }, + { + "epoch": 2.217171717171717, + "grad_norm": 0.36001007809681823, + "learning_rate": 5.234517314555213e-06, + "loss": 1.2172, + "step": 439 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.35044457797610973, + "learning_rate": 5.171359938751919e-06, + "loss": 1.3586, + "step": 440 + }, + { + "epoch": 2.227272727272727, + "grad_norm": 0.3408481018485263, + "learning_rate": 5.108506438081457e-06, + "loss": 1.386, + "step": 441 + }, + { + "epoch": 2.2323232323232323, + "grad_norm": 0.5110825954105327, + "learning_rate": 5.045958755805123e-06, + "loss": 1.3849, + "step": 442 + }, + { + "epoch": 2.2373737373737375, + "grad_norm": 0.33198011548599254, + "learning_rate": 4.983718825729138e-06, + "loss": 1.3877, + "step": 443 + }, + { + "epoch": 2.242424242424242, + "grad_norm": 0.338000968148193, + "learning_rate": 4.921788572144841e-06, + "loss": 1.328, + "step": 444 + }, + { + "epoch": 2.2474747474747474, + "grad_norm": 0.3381841613090944, + "learning_rate": 4.860169909769223e-06, + "loss": 1.3864, + "step": 445 + }, + { + "epoch": 2.2525252525252526, + "grad_norm": 0.33987310454638214, + "learning_rate": 4.798864743685704e-06, + "loss": 1.3087, + "step": 446 + }, + { + "epoch": 2.257575757575758, + "grad_norm": 0.3312906048392972, + "learning_rate": 4.73787496928523e-06, + "loss": 1.3784, + "step": 447 + }, + { + "epoch": 2.2626262626262625, + "grad_norm": 0.3471224148442727, + "learning_rate": 4.677202472207706e-06, + "loss": 1.3469, + "step": 448 + }, + { + "epoch": 2.2676767676767677, + "grad_norm": 0.33785032384606045, + "learning_rate": 4.616849128283658e-06, + "loss": 1.3877, + "step": 449 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.34074480809570873, + "learning_rate": 4.556816803476261e-06, + "loss": 1.3675, + "step": 450 + }, + { + "epoch": 2.2727272727272725, + "eval_loss": 1.5548166036605835, + "eval_runtime": 230.2364, + "eval_samples_per_second": 6.215, + "eval_steps_per_second": 0.777, + "step": 450 + }, + { + "epoch": 2.2777777777777777, + "grad_norm": 0.32624573777354227, + "learning_rate": 4.497107353823638e-06, + "loss": 1.3828, + "step": 451 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 0.3369163688557624, + "learning_rate": 4.437722625381481e-06, + "loss": 1.3295, + "step": 452 + }, + { + "epoch": 2.287878787878788, + "grad_norm": 0.3355454228013588, + "learning_rate": 4.378664454165983e-06, + "loss": 1.3544, + "step": 453 + }, + { + "epoch": 2.292929292929293, + "grad_norm": 0.348354298966465, + "learning_rate": 4.319934666097055e-06, + "loss": 1.3835, + "step": 454 + }, + { + "epoch": 2.297979797979798, + "grad_norm": 0.3305159849956163, + "learning_rate": 4.261535076941888e-06, + "loss": 1.2923, + "step": 455 + }, + { + "epoch": 2.303030303030303, + "grad_norm": 0.3240680403742593, + "learning_rate": 4.20346749225881e-06, + "loss": 1.3421, + "step": 456 + }, + { + "epoch": 2.308080808080808, + "grad_norm": 0.35833405093271425, + "learning_rate": 4.145733707341457e-06, + "loss": 1.4357, + "step": 457 + }, + { + "epoch": 2.313131313131313, + "grad_norm": 0.3454656348650401, + "learning_rate": 4.08833550716329e-06, + "loss": 1.3921, + "step": 458 + }, + { + "epoch": 2.3181818181818183, + "grad_norm": 0.32930254998399916, + "learning_rate": 4.031274666322372e-06, + "loss": 1.3532, + "step": 459 + }, + { + "epoch": 2.323232323232323, + "grad_norm": 0.35468933367330896, + "learning_rate": 3.9745529489865325e-06, + "loss": 1.3409, + "step": 460 + }, + { + "epoch": 2.3282828282828283, + "grad_norm": 0.3407135461627606, + "learning_rate": 3.918172108838819e-06, + "loss": 1.3504, + "step": 461 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.3174290975875319, + "learning_rate": 3.862133889023264e-06, + "loss": 1.2801, + "step": 462 + }, + { + "epoch": 2.3383838383838382, + "grad_norm": 0.33257974936544793, + "learning_rate": 3.806440022090991e-06, + "loss": 1.3476, + "step": 463 + }, + { + "epoch": 2.3434343434343434, + "grad_norm": 0.34433935520925646, + "learning_rate": 3.7510922299466818e-06, + "loss": 1.4109, + "step": 464 + }, + { + "epoch": 2.3484848484848486, + "grad_norm": 0.3329914479718482, + "learning_rate": 3.696092223795293e-06, + "loss": 1.3107, + "step": 465 + }, + { + "epoch": 2.3535353535353534, + "grad_norm": 0.34511424002581903, + "learning_rate": 3.641441704089195e-06, + "loss": 1.344, + "step": 466 + }, + { + "epoch": 2.3585858585858586, + "grad_norm": 0.3225246247472432, + "learning_rate": 3.5871423604755576e-06, + "loss": 1.379, + "step": 467 + }, + { + "epoch": 2.3636363636363638, + "grad_norm": 0.3314079832160016, + "learning_rate": 3.53319587174414e-06, + "loss": 1.3605, + "step": 468 + }, + { + "epoch": 2.3686868686868685, + "grad_norm": 0.33155417660439823, + "learning_rate": 3.4796039057753703e-06, + "loss": 1.3788, + "step": 469 + }, + { + "epoch": 2.3737373737373737, + "grad_norm": 0.3217080508265444, + "learning_rate": 3.4263681194887926e-06, + "loss": 1.2229, + "step": 470 + }, + { + "epoch": 2.378787878787879, + "grad_norm": 0.32856611753986226, + "learning_rate": 3.3734901587918234e-06, + "loss": 1.3069, + "step": 471 + }, + { + "epoch": 2.3838383838383836, + "grad_norm": 0.33453934067183627, + "learning_rate": 3.320971658528889e-06, + "loss": 1.3685, + "step": 472 + }, + { + "epoch": 2.388888888888889, + "grad_norm": 0.3195781522323383, + "learning_rate": 3.2688142424308464e-06, + "loss": 1.3335, + "step": 473 + }, + { + "epoch": 2.393939393939394, + "grad_norm": 0.33384250654885655, + "learning_rate": 3.217019523064825e-06, + "loss": 1.3149, + "step": 474 + }, + { + "epoch": 2.398989898989899, + "grad_norm": 0.3417470523103563, + "learning_rate": 3.1655891017843152e-06, + "loss": 1.4034, + "step": 475 + }, + { + "epoch": 2.404040404040404, + "grad_norm": 0.3192940536310456, + "learning_rate": 3.1145245686797107e-06, + "loss": 1.3257, + "step": 476 + }, + { + "epoch": 2.409090909090909, + "grad_norm": 0.33117280629763984, + "learning_rate": 3.063827502529125e-06, + "loss": 1.3705, + "step": 477 + }, + { + "epoch": 2.4141414141414144, + "grad_norm": 0.31577233152801576, + "learning_rate": 3.0134994707495776e-06, + "loss": 1.3051, + "step": 478 + }, + { + "epoch": 2.419191919191919, + "grad_norm": 0.32359860321254813, + "learning_rate": 2.96354202934853e-06, + "loss": 1.3105, + "step": 479 + }, + { + "epoch": 2.4242424242424243, + "grad_norm": 0.3323495077715317, + "learning_rate": 2.913956722875803e-06, + "loss": 1.3609, + "step": 480 + }, + { + "epoch": 2.429292929292929, + "grad_norm": 0.34559349440125164, + "learning_rate": 2.86474508437579e-06, + "loss": 1.3739, + "step": 481 + }, + { + "epoch": 2.4343434343434343, + "grad_norm": 0.3229886400387147, + "learning_rate": 2.8159086353400787e-06, + "loss": 1.2993, + "step": 482 + }, + { + "epoch": 2.4393939393939394, + "grad_norm": 0.3151626607633294, + "learning_rate": 2.7674488856604106e-06, + "loss": 1.2994, + "step": 483 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.33940231127513143, + "learning_rate": 2.7193673335819893e-06, + "loss": 1.3887, + "step": 484 + }, + { + "epoch": 2.4494949494949494, + "grad_norm": 0.3314650305858189, + "learning_rate": 2.6716654656571766e-06, + "loss": 1.2696, + "step": 485 + }, + { + "epoch": 2.4545454545454546, + "grad_norm": 0.3283054013523294, + "learning_rate": 2.6243447566995075e-06, + "loss": 1.3248, + "step": 486 + }, + { + "epoch": 2.45959595959596, + "grad_norm": 0.3294385291650391, + "learning_rate": 2.5774066697381173e-06, + "loss": 1.3744, + "step": 487 + }, + { + "epoch": 2.4646464646464645, + "grad_norm": 0.32701883167697327, + "learning_rate": 2.530852655972487e-06, + "loss": 1.4138, + "step": 488 + }, + { + "epoch": 2.4696969696969697, + "grad_norm": 0.334931690168335, + "learning_rate": 2.4846841547275916e-06, + "loss": 1.3611, + "step": 489 + }, + { + "epoch": 2.474747474747475, + "grad_norm": 0.33546755336530826, + "learning_rate": 2.4389025934094045e-06, + "loss": 1.268, + "step": 490 + }, + { + "epoch": 2.4797979797979797, + "grad_norm": 0.31726122931536993, + "learning_rate": 2.39350938746074e-06, + "loss": 1.3288, + "step": 491 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.3287627054478962, + "learning_rate": 2.348505940317516e-06, + "loss": 1.4282, + "step": 492 + }, + { + "epoch": 2.48989898989899, + "grad_norm": 0.32239418628280586, + "learning_rate": 2.3038936433653617e-06, + "loss": 1.3585, + "step": 493 + }, + { + "epoch": 2.494949494949495, + "grad_norm": 0.31867368014934067, + "learning_rate": 2.259673875896585e-06, + "loss": 1.279, + "step": 494 + }, + { + "epoch": 2.5, + "grad_norm": 0.34355794868397255, + "learning_rate": 2.2158480050675407e-06, + "loss": 1.3742, + "step": 495 + }, + { + "epoch": 2.505050505050505, + "grad_norm": 0.32224758819615007, + "learning_rate": 2.1724173858563546e-06, + "loss": 1.2032, + "step": 496 + }, + { + "epoch": 2.51010101010101, + "grad_norm": 0.338860108565893, + "learning_rate": 2.129383361021038e-06, + "loss": 1.3843, + "step": 497 + }, + { + "epoch": 2.515151515151515, + "grad_norm": 0.3358553477580223, + "learning_rate": 2.0867472610579753e-06, + "loss": 1.3089, + "step": 498 + }, + { + "epoch": 2.5202020202020203, + "grad_norm": 0.3343623935360769, + "learning_rate": 2.044510404160774e-06, + "loss": 1.3487, + "step": 499 + }, + { + "epoch": 2.525252525252525, + "grad_norm": 0.3546274911997211, + "learning_rate": 2.002674096179525e-06, + "loss": 1.3862, + "step": 500 + }, + { + "epoch": 2.525252525252525, + "eval_loss": 1.5534101724624634, + "eval_runtime": 230.0945, + "eval_samples_per_second": 6.219, + "eval_steps_per_second": 0.778, + "step": 500 + }, + { + "epoch": 2.5303030303030303, + "grad_norm": 0.33054033498676905, + "learning_rate": 1.961239630580419e-06, + "loss": 1.3379, + "step": 501 + }, + { + "epoch": 2.5353535353535355, + "grad_norm": 0.3243578841256009, + "learning_rate": 1.9202082884057663e-06, + "loss": 1.361, + "step": 502 + }, + { + "epoch": 2.5404040404040407, + "grad_norm": 0.3453871510381904, + "learning_rate": 1.8795813382343757e-06, + "loss": 1.3128, + "step": 503 + }, + { + "epoch": 2.5454545454545454, + "grad_norm": 0.3240177636684765, + "learning_rate": 1.8393600361423534e-06, + "loss": 1.2967, + "step": 504 + }, + { + "epoch": 2.5505050505050506, + "grad_norm": 0.32198961074160476, + "learning_rate": 1.7995456256642467e-06, + "loss": 1.3792, + "step": 505 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.3323091802115782, + "learning_rate": 1.7601393377546189e-06, + "loss": 1.3401, + "step": 506 + }, + { + "epoch": 2.5606060606060606, + "grad_norm": 0.33782378441397226, + "learning_rate": 1.7211423907499729e-06, + "loss": 1.39, + "step": 507 + }, + { + "epoch": 2.5656565656565657, + "grad_norm": 0.3173070512019593, + "learning_rate": 1.6825559903310827e-06, + "loss": 1.3419, + "step": 508 + }, + { + "epoch": 2.570707070707071, + "grad_norm": 0.328610068013649, + "learning_rate": 1.6443813294857452e-06, + "loss": 1.3739, + "step": 509 + }, + { + "epoch": 2.5757575757575757, + "grad_norm": 0.3275374111063948, + "learning_rate": 1.6066195884718631e-06, + "loss": 1.2777, + "step": 510 + }, + { + "epoch": 2.580808080808081, + "grad_norm": 0.35141551632003326, + "learning_rate": 1.5692719347809686e-06, + "loss": 1.3861, + "step": 511 + }, + { + "epoch": 2.5858585858585856, + "grad_norm": 0.31546480930066206, + "learning_rate": 1.5323395231021325e-06, + "loss": 1.2477, + "step": 512 + }, + { + "epoch": 2.590909090909091, + "grad_norm": 0.33265355608600455, + "learning_rate": 1.4958234952862553e-06, + "loss": 1.4034, + "step": 513 + }, + { + "epoch": 2.595959595959596, + "grad_norm": 0.35507158066405115, + "learning_rate": 1.459724980310767e-06, + "loss": 1.3902, + "step": 514 + }, + { + "epoch": 2.601010101010101, + "grad_norm": 0.3295175897866719, + "learning_rate": 1.4240450942447213e-06, + "loss": 1.373, + "step": 515 + }, + { + "epoch": 2.606060606060606, + "grad_norm": 0.32252779319703984, + "learning_rate": 1.388784940214292e-06, + "loss": 1.3497, + "step": 516 + }, + { + "epoch": 2.611111111111111, + "grad_norm": 0.3402791161335901, + "learning_rate": 1.3539456083686736e-06, + "loss": 1.3999, + "step": 517 + }, + { + "epoch": 2.616161616161616, + "grad_norm": 0.3279145522242951, + "learning_rate": 1.3195281758463624e-06, + "loss": 1.3533, + "step": 518 + }, + { + "epoch": 2.621212121212121, + "grad_norm": 0.32392571532268805, + "learning_rate": 1.2855337067418576e-06, + "loss": 1.3212, + "step": 519 + }, + { + "epoch": 2.6262626262626263, + "grad_norm": 0.325724061463982, + "learning_rate": 1.2519632520727825e-06, + "loss": 1.3995, + "step": 520 + }, + { + "epoch": 2.6313131313131315, + "grad_norm": 0.33044089457213144, + "learning_rate": 1.2188178497473513e-06, + "loss": 1.3527, + "step": 521 + }, + { + "epoch": 2.6363636363636362, + "grad_norm": 0.3249288237340594, + "learning_rate": 1.1860985245323237e-06, + "loss": 1.4036, + "step": 522 + }, + { + "epoch": 2.6414141414141414, + "grad_norm": 0.3316367696024126, + "learning_rate": 1.1538062880212868e-06, + "loss": 1.4117, + "step": 523 + }, + { + "epoch": 2.6464646464646466, + "grad_norm": 0.330308972754837, + "learning_rate": 1.1219421386033958e-06, + "loss": 1.3664, + "step": 524 + }, + { + "epoch": 2.6515151515151514, + "grad_norm": 0.32101693883494814, + "learning_rate": 1.090507061432507e-06, + "loss": 1.3609, + "step": 525 + }, + { + "epoch": 2.6565656565656566, + "grad_norm": 0.31721696490080514, + "learning_rate": 1.059502028396714e-06, + "loss": 1.4163, + "step": 526 + }, + { + "epoch": 2.6616161616161618, + "grad_norm": 0.33100041836983457, + "learning_rate": 1.028927998088302e-06, + "loss": 1.373, + "step": 527 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.3087065560652621, + "learning_rate": 9.987859157741063e-07, + "loss": 1.3462, + "step": 528 + }, + { + "epoch": 2.6717171717171717, + "grad_norm": 0.3228221395167405, + "learning_rate": 9.690767133662976e-07, + "loss": 1.2952, + "step": 529 + }, + { + "epoch": 2.676767676767677, + "grad_norm": 0.34478777480046996, + "learning_rate": 9.398013093935604e-07, + "loss": 1.3755, + "step": 530 + }, + { + "epoch": 2.6818181818181817, + "grad_norm": 0.32420291241316956, + "learning_rate": 9.10960608972698e-07, + "loss": 1.3832, + "step": 531 + }, + { + "epoch": 2.686868686868687, + "grad_norm": 0.33022446961420726, + "learning_rate": 8.825555037806493e-07, + "loss": 1.347, + "step": 532 + }, + { + "epoch": 2.691919191919192, + "grad_norm": 0.33958181683593025, + "learning_rate": 8.545868720269145e-07, + "loss": 1.3724, + "step": 533 + }, + { + "epoch": 2.6969696969696972, + "grad_norm": 0.33725010658515825, + "learning_rate": 8.270555784264167e-07, + "loss": 1.2772, + "step": 534 + }, + { + "epoch": 2.702020202020202, + "grad_norm": 0.3515586496897235, + "learning_rate": 7.999624741727479e-07, + "loss": 1.4157, + "step": 535 + }, + { + "epoch": 2.707070707070707, + "grad_norm": 0.3315645750045216, + "learning_rate": 7.733083969118759e-07, + "loss": 1.3699, + "step": 536 + }, + { + "epoch": 2.712121212121212, + "grad_norm": 0.3265957901814612, + "learning_rate": 7.470941707162226e-07, + "loss": 1.3797, + "step": 537 + }, + { + "epoch": 2.717171717171717, + "grad_norm": 0.3209125002172067, + "learning_rate": 7.213206060592064e-07, + "loss": 1.3694, + "step": 538 + }, + { + "epoch": 2.7222222222222223, + "grad_norm": 0.32513518265436614, + "learning_rate": 6.959884997901705e-07, + "loss": 1.2818, + "step": 539 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.31979488074104667, + "learning_rate": 6.710986351097493e-07, + "loss": 1.2585, + "step": 540 + }, + { + "epoch": 2.7323232323232323, + "grad_norm": 0.323915220461446, + "learning_rate": 6.466517815456607e-07, + "loss": 1.2766, + "step": 541 + }, + { + "epoch": 2.7373737373737375, + "grad_norm": 0.3272707694127074, + "learning_rate": 6.226486949289079e-07, + "loss": 1.2811, + "step": 542 + }, + { + "epoch": 2.742424242424242, + "grad_norm": 0.32467673338245945, + "learning_rate": 5.990901173704083e-07, + "loss": 1.3697, + "step": 543 + }, + { + "epoch": 2.7474747474747474, + "grad_norm": 0.331433447678859, + "learning_rate": 5.759767772380648e-07, + "loss": 1.4064, + "step": 544 + }, + { + "epoch": 2.7525252525252526, + "grad_norm": 0.3439032539794961, + "learning_rate": 5.533093891342262e-07, + "loss": 1.3781, + "step": 545 + }, + { + "epoch": 2.757575757575758, + "grad_norm": 0.3223680031666151, + "learning_rate": 5.310886538736037e-07, + "loss": 1.3459, + "step": 546 + }, + { + "epoch": 2.7626262626262625, + "grad_norm": 0.3219629931268615, + "learning_rate": 5.093152584616101e-07, + "loss": 1.3199, + "step": 547 + }, + { + "epoch": 2.7676767676767677, + "grad_norm": 0.30885242518385303, + "learning_rate": 4.879898760731028e-07, + "loss": 1.3183, + "step": 548 + }, + { + "epoch": 2.7727272727272725, + "grad_norm": 0.3671027111648377, + "learning_rate": 4.6711316603159084e-07, + "loss": 1.3361, + "step": 549 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.31733325019140657, + "learning_rate": 4.46685773788833e-07, + "loss": 1.3552, + "step": 550 + }, + { + "epoch": 2.7777777777777777, + "eval_loss": 1.5522303581237793, + "eval_runtime": 230.1843, + "eval_samples_per_second": 6.217, + "eval_steps_per_second": 0.778, + "step": 550 + }, + { + "epoch": 2.782828282828283, + "grad_norm": 0.3200707645533683, + "learning_rate": 4.2670833090489193e-07, + "loss": 1.2946, + "step": 551 + }, + { + "epoch": 2.787878787878788, + "grad_norm": 0.331980900223843, + "learning_rate": 4.0718145502860627e-07, + "loss": 1.3345, + "step": 552 + }, + { + "epoch": 2.792929292929293, + "grad_norm": 0.3282199495755412, + "learning_rate": 3.8810574987849126e-07, + "loss": 1.3024, + "step": 553 + }, + { + "epoch": 2.797979797979798, + "grad_norm": 0.3175787008542941, + "learning_rate": 3.6948180522408006e-07, + "loss": 1.3115, + "step": 554 + }, + { + "epoch": 2.8030303030303028, + "grad_norm": 0.3179435007333598, + "learning_rate": 3.513101968676802e-07, + "loss": 1.3098, + "step": 555 + }, + { + "epoch": 2.808080808080808, + "grad_norm": 0.33169394245798245, + "learning_rate": 3.3359148662658103e-07, + "loss": 1.3372, + "step": 556 + }, + { + "epoch": 2.813131313131313, + "grad_norm": 0.33115578745917224, + "learning_rate": 3.163262223156793e-07, + "loss": 1.3867, + "step": 557 + }, + { + "epoch": 2.8181818181818183, + "grad_norm": 0.31264293668678106, + "learning_rate": 2.9951493773053953e-07, + "loss": 1.3085, + "step": 558 + }, + { + "epoch": 2.823232323232323, + "grad_norm": 0.3401717880353158, + "learning_rate": 2.831581526308935e-07, + "loss": 1.3873, + "step": 559 + }, + { + "epoch": 2.8282828282828283, + "grad_norm": 0.33495812017470433, + "learning_rate": 2.6725637272457527e-07, + "loss": 1.4253, + "step": 560 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.33801260984820947, + "learning_rate": 2.518100896518766e-07, + "loss": 1.3547, + "step": 561 + }, + { + "epoch": 2.8383838383838382, + "grad_norm": 0.3348385881354705, + "learning_rate": 2.3681978097035263e-07, + "loss": 1.4229, + "step": 562 + }, + { + "epoch": 2.8434343434343434, + "grad_norm": 0.31719634542323716, + "learning_rate": 2.222859101400554e-07, + "loss": 1.2818, + "step": 563 + }, + { + "epoch": 2.8484848484848486, + "grad_norm": 0.3183857279402652, + "learning_rate": 2.0820892650920686e-07, + "loss": 1.3834, + "step": 564 + }, + { + "epoch": 2.8535353535353534, + "grad_norm": 0.37637626910255345, + "learning_rate": 1.9458926530030675e-07, + "loss": 1.3882, + "step": 565 + }, + { + "epoch": 2.8585858585858586, + "grad_norm": 0.30878369912934317, + "learning_rate": 1.8142734759666823e-07, + "loss": 1.2498, + "step": 566 + }, + { + "epoch": 2.8636363636363638, + "grad_norm": 0.328503940896062, + "learning_rate": 1.6872358032941014e-07, + "loss": 1.3391, + "step": 567 + }, + { + "epoch": 2.8686868686868685, + "grad_norm": 0.3185916122047658, + "learning_rate": 1.5647835626487194e-07, + "loss": 1.3697, + "step": 568 + }, + { + "epoch": 2.8737373737373737, + "grad_norm": 0.333090152305754, + "learning_rate": 1.4469205399246844e-07, + "loss": 1.2437, + "step": 569 + }, + { + "epoch": 2.878787878787879, + "grad_norm": 0.31288406908685207, + "learning_rate": 1.3336503791298582e-07, + "loss": 1.2924, + "step": 570 + }, + { + "epoch": 2.883838383838384, + "grad_norm": 0.3096586778222418, + "learning_rate": 1.2249765822731574e-07, + "loss": 1.3853, + "step": 571 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.3310184954270413, + "learning_rate": 1.1209025092562719e-07, + "loss": 1.3591, + "step": 572 + }, + { + "epoch": 2.893939393939394, + "grad_norm": 0.312664849919579, + "learning_rate": 1.0214313777698325e-07, + "loss": 1.3503, + "step": 573 + }, + { + "epoch": 2.898989898989899, + "grad_norm": 0.3232002984438332, + "learning_rate": 9.265662631938399e-08, + "loss": 1.323, + "step": 574 + }, + { + "epoch": 2.904040404040404, + "grad_norm": 0.32862385769715274, + "learning_rate": 8.363100985026406e-08, + "loss": 1.3899, + "step": 575 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 0.31778651266022473, + "learning_rate": 7.506656741743001e-08, + "loss": 1.3727, + "step": 576 + }, + { + "epoch": 2.9141414141414144, + "grad_norm": 0.31945025164654717, + "learning_rate": 6.696356381041712e-08, + "loss": 1.3538, + "step": 577 + }, + { + "epoch": 2.919191919191919, + "grad_norm": 0.32448621680923295, + "learning_rate": 5.932224955231935e-08, + "loss": 1.2681, + "step": 578 + }, + { + "epoch": 2.9242424242424243, + "grad_norm": 0.3255830375698755, + "learning_rate": 5.214286089203546e-08, + "loss": 1.2082, + "step": 579 + }, + { + "epoch": 2.929292929292929, + "grad_norm": 0.3230864932191129, + "learning_rate": 4.542561979696491e-08, + "loss": 1.3078, + "step": 580 + }, + { + "epoch": 2.9343434343434343, + "grad_norm": 0.32044440900847904, + "learning_rate": 3.917073394614667e-08, + "loss": 1.2221, + "step": 581 + }, + { + "epoch": 2.9393939393939394, + "grad_norm": 0.33325632897727475, + "learning_rate": 3.337839672384102e-08, + "loss": 1.4102, + "step": 582 + }, + { + "epoch": 2.9444444444444446, + "grad_norm": 0.3376927329733297, + "learning_rate": 2.8048787213544315e-08, + "loss": 1.3118, + "step": 583 + }, + { + "epoch": 2.9494949494949494, + "grad_norm": 0.33092086824129646, + "learning_rate": 2.3182070192460104e-08, + "loss": 1.3261, + "step": 584 + }, + { + "epoch": 2.9545454545454546, + "grad_norm": 0.3317204876226306, + "learning_rate": 1.8778396126393206e-08, + "loss": 1.2283, + "step": 585 + }, + { + "epoch": 2.9595959595959593, + "grad_norm": 0.32504734717078215, + "learning_rate": 1.483790116511341e-08, + "loss": 1.3745, + "step": 586 + }, + { + "epoch": 2.9646464646464645, + "grad_norm": 0.3417117505937822, + "learning_rate": 1.1360707138130532e-08, + "loss": 1.3584, + "step": 587 + }, + { + "epoch": 2.9696969696969697, + "grad_norm": 0.3322967272336344, + "learning_rate": 8.346921550939079e-09, + "loss": 1.3358, + "step": 588 + }, + { + "epoch": 2.974747474747475, + "grad_norm": 0.32207157482687426, + "learning_rate": 5.796637581689246e-09, + "loss": 1.3833, + "step": 589 + }, + { + "epoch": 2.9797979797979797, + "grad_norm": 0.3127525864356668, + "learning_rate": 3.709934078307553e-09, + "loss": 1.3835, + "step": 590 + }, + { + "epoch": 2.984848484848485, + "grad_norm": 0.32045284467637863, + "learning_rate": 2.0868755560538023e-09, + "loss": 1.3423, + "step": 591 + }, + { + "epoch": 2.98989898989899, + "grad_norm": 0.3197154903249698, + "learning_rate": 9.275121955393262e-10, + "loss": 1.2623, + "step": 592 + }, + { + "epoch": 2.994949494949495, + "grad_norm": 0.33061919914451243, + "learning_rate": 2.318798411599099e-10, + "loss": 1.2716, + "step": 593 + }, + { + "epoch": 3.0, + "grad_norm": 0.3257190208394002, + "learning_rate": 0.0, + "loss": 1.4233, + "step": 594 + } + ], + "logging_steps": 1, + "max_steps": 594, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.5937910249441198e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}