{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4284490145672665, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000856898029134533, "grad_norm": 0.4453125, "learning_rate": 5e-05, "loss": 3.6595, "step": 1 }, { "epoch": 0.001713796058269066, "grad_norm": 0.4296875, "learning_rate": 0.0001, "loss": 3.6973, "step": 2 }, { "epoch": 0.002570694087403599, "grad_norm": 0.453125, "learning_rate": 0.00015, "loss": 3.6343, "step": 3 }, { "epoch": 0.003427592116538132, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 3.3538, "step": 4 }, { "epoch": 0.004284490145672665, "grad_norm": 0.4140625, "learning_rate": 0.00025, "loss": 3.2142, "step": 5 }, { "epoch": 0.005141388174807198, "grad_norm": 0.384765625, "learning_rate": 0.0003, "loss": 2.9167, "step": 6 }, { "epoch": 0.005998286203941731, "grad_norm": 0.349609375, "learning_rate": 0.00035, "loss": 2.7017, "step": 7 }, { "epoch": 0.006855184233076264, "grad_norm": 0.306640625, "learning_rate": 0.0004, "loss": 2.4232, "step": 8 }, { "epoch": 0.007712082262210797, "grad_norm": 0.25390625, "learning_rate": 0.00045000000000000004, "loss": 2.1348, "step": 9 }, { "epoch": 0.00856898029134533, "grad_norm": 0.24609375, "learning_rate": 0.0005, "loss": 2.0355, "step": 10 }, { "epoch": 0.009425878320479864, "grad_norm": 0.28515625, "learning_rate": 0.00055, "loss": 2.0537, "step": 11 }, { "epoch": 0.010282776349614395, "grad_norm": 0.322265625, "learning_rate": 0.0006, "loss": 2.0642, "step": 12 }, { "epoch": 0.011139674378748929, "grad_norm": 0.3203125, "learning_rate": 0.0006500000000000001, "loss": 2.0116, "step": 13 }, { "epoch": 0.011996572407883462, "grad_norm": 0.3125, "learning_rate": 0.0007, "loss": 2.0132, "step": 14 }, { "epoch": 0.012853470437017995, "grad_norm": 0.283203125, "learning_rate": 0.00075, "loss": 1.9433, "step": 15 }, { "epoch": 0.013710368466152529, "grad_norm": 0.2578125, "learning_rate": 0.0008, "loss": 1.774, "step": 16 }, { "epoch": 0.01456726649528706, "grad_norm": 0.236328125, "learning_rate": 0.00085, "loss": 1.7933, "step": 17 }, { "epoch": 0.015424164524421594, "grad_norm": 0.240234375, "learning_rate": 0.0009000000000000001, "loss": 1.6388, "step": 18 }, { "epoch": 0.016281062553556127, "grad_norm": 0.240234375, "learning_rate": 0.00095, "loss": 1.6048, "step": 19 }, { "epoch": 0.01713796058269066, "grad_norm": 0.23046875, "learning_rate": 0.001, "loss": 1.5744, "step": 20 }, { "epoch": 0.017994858611825194, "grad_norm": 0.232421875, "learning_rate": 0.0009999892908320648, "loss": 1.4603, "step": 21 }, { "epoch": 0.018851756640959727, "grad_norm": 0.201171875, "learning_rate": 0.0009999571637870036, "loss": 1.4607, "step": 22 }, { "epoch": 0.01970865467009426, "grad_norm": 0.2021484375, "learning_rate": 0.0009999036202410325, "loss": 1.3065, "step": 23 }, { "epoch": 0.02056555269922879, "grad_norm": 0.185546875, "learning_rate": 0.0009998286624877785, "loss": 1.2436, "step": 24 }, { "epoch": 0.021422450728363324, "grad_norm": 0.1787109375, "learning_rate": 0.0009997322937381828, "loss": 1.1979, "step": 25 }, { "epoch": 0.022279348757497857, "grad_norm": 0.1767578125, "learning_rate": 0.0009996145181203615, "loss": 1.1425, "step": 26 }, { "epoch": 0.02313624678663239, "grad_norm": 0.154296875, "learning_rate": 0.00099947534067943, "loss": 1.0843, "step": 27 }, { "epoch": 0.023993144815766924, "grad_norm": 0.1591796875, "learning_rate": 0.0009993147673772868, "loss": 1.12, "step": 28 }, { "epoch": 0.024850042844901457, "grad_norm": 0.154296875, "learning_rate": 0.000999132805092358, "loss": 1.0019, "step": 29 }, { "epoch": 0.02570694087403599, "grad_norm": 0.142578125, "learning_rate": 0.0009989294616193018, "loss": 1.0082, "step": 30 }, { "epoch": 0.026563838903170524, "grad_norm": 0.1376953125, "learning_rate": 0.000998704745668676, "loss": 0.9653, "step": 31 }, { "epoch": 0.027420736932305057, "grad_norm": 0.1337890625, "learning_rate": 0.000998458666866564, "loss": 0.9137, "step": 32 }, { "epoch": 0.028277634961439587, "grad_norm": 0.130859375, "learning_rate": 0.0009981912357541628, "loss": 0.8958, "step": 33 }, { "epoch": 0.02913453299057412, "grad_norm": 0.125, "learning_rate": 0.0009979024637873308, "loss": 0.8628, "step": 34 }, { "epoch": 0.029991431019708654, "grad_norm": 0.12060546875, "learning_rate": 0.0009975923633360985, "loss": 0.8233, "step": 35 }, { "epoch": 0.030848329048843187, "grad_norm": 0.1083984375, "learning_rate": 0.0009972609476841367, "loss": 0.8045, "step": 36 }, { "epoch": 0.031705227077977724, "grad_norm": 0.1171875, "learning_rate": 0.0009969082310281891, "loss": 0.7961, "step": 37 }, { "epoch": 0.032562125107112254, "grad_norm": 0.10791015625, "learning_rate": 0.0009965342284774632, "loss": 0.7864, "step": 38 }, { "epoch": 0.033419023136246784, "grad_norm": 0.103515625, "learning_rate": 0.0009961389560529835, "loss": 0.7664, "step": 39 }, { "epoch": 0.03427592116538132, "grad_norm": 0.1015625, "learning_rate": 0.0009957224306869053, "loss": 0.7723, "step": 40 }, { "epoch": 0.03513281919451585, "grad_norm": 0.09765625, "learning_rate": 0.0009952846702217886, "loss": 0.7501, "step": 41 }, { "epoch": 0.03598971722365039, "grad_norm": 0.09228515625, "learning_rate": 0.0009948256934098352, "loss": 0.6932, "step": 42 }, { "epoch": 0.03684661525278492, "grad_norm": 0.09130859375, "learning_rate": 0.0009943455199120836, "loss": 0.675, "step": 43 }, { "epoch": 0.037703513281919454, "grad_norm": 0.09033203125, "learning_rate": 0.0009938441702975688, "loss": 0.6838, "step": 44 }, { "epoch": 0.038560411311053984, "grad_norm": 0.08935546875, "learning_rate": 0.0009933216660424397, "loss": 0.6546, "step": 45 }, { "epoch": 0.03941730934018852, "grad_norm": 0.08349609375, "learning_rate": 0.0009927780295290389, "loss": 0.6443, "step": 46 }, { "epoch": 0.04027420736932305, "grad_norm": 0.0791015625, "learning_rate": 0.0009922132840449458, "loss": 0.6705, "step": 47 }, { "epoch": 0.04113110539845758, "grad_norm": 0.08251953125, "learning_rate": 0.0009916274537819774, "loss": 0.6176, "step": 48 }, { "epoch": 0.04198800342759212, "grad_norm": 0.07568359375, "learning_rate": 0.000991020563835152, "loss": 0.683, "step": 49 }, { "epoch": 0.04284490145672665, "grad_norm": 0.07861328125, "learning_rate": 0.0009903926402016153, "loss": 0.5799, "step": 50 }, { "epoch": 0.043701799485861184, "grad_norm": 0.07470703125, "learning_rate": 0.0009897437097795257, "loss": 0.6293, "step": 51 }, { "epoch": 0.044558697514995714, "grad_norm": 0.0693359375, "learning_rate": 0.0009890738003669028, "loss": 0.5864, "step": 52 }, { "epoch": 0.04541559554413025, "grad_norm": 0.0673828125, "learning_rate": 0.0009883829406604362, "loss": 0.5672, "step": 53 }, { "epoch": 0.04627249357326478, "grad_norm": 0.06884765625, "learning_rate": 0.0009876711602542563, "loss": 0.607, "step": 54 }, { "epoch": 0.04712939160239932, "grad_norm": 0.08251953125, "learning_rate": 0.0009869384896386668, "loss": 0.6006, "step": 55 }, { "epoch": 0.04798628963153385, "grad_norm": 0.0625, "learning_rate": 0.0009861849601988384, "loss": 0.536, "step": 56 }, { "epoch": 0.04884318766066838, "grad_norm": 0.05810546875, "learning_rate": 0.0009854106042134641, "loss": 0.5153, "step": 57 }, { "epoch": 0.049700085689802914, "grad_norm": 0.0615234375, "learning_rate": 0.0009846154548533773, "loss": 0.5317, "step": 58 }, { "epoch": 0.050556983718937444, "grad_norm": 0.07861328125, "learning_rate": 0.0009837995461801298, "loss": 0.5354, "step": 59 }, { "epoch": 0.05141388174807198, "grad_norm": 0.06005859375, "learning_rate": 0.0009829629131445341, "loss": 0.5109, "step": 60 }, { "epoch": 0.05227077977720651, "grad_norm": 0.0634765625, "learning_rate": 0.0009821055915851646, "loss": 0.5122, "step": 61 }, { "epoch": 0.05312767780634105, "grad_norm": 0.0634765625, "learning_rate": 0.0009812276182268236, "loss": 0.5057, "step": 62 }, { "epoch": 0.05398457583547558, "grad_norm": 0.058349609375, "learning_rate": 0.0009803290306789677, "loss": 0.4955, "step": 63 }, { "epoch": 0.054841473864610114, "grad_norm": 0.056396484375, "learning_rate": 0.0009794098674340967, "loss": 0.4997, "step": 64 }, { "epoch": 0.055698371893744644, "grad_norm": 0.054931640625, "learning_rate": 0.0009784701678661044, "loss": 0.4673, "step": 65 }, { "epoch": 0.056555269922879174, "grad_norm": 0.058837890625, "learning_rate": 0.0009775099722285933, "loss": 0.4822, "step": 66 }, { "epoch": 0.05741216795201371, "grad_norm": 0.060546875, "learning_rate": 0.0009765293216531485, "loss": 0.4716, "step": 67 }, { "epoch": 0.05826906598114824, "grad_norm": 0.05078125, "learning_rate": 0.0009755282581475768, "loss": 0.463, "step": 68 }, { "epoch": 0.05912596401028278, "grad_norm": 0.052734375, "learning_rate": 0.000974506824594107, "loss": 0.461, "step": 69 }, { "epoch": 0.05998286203941731, "grad_norm": 0.06396484375, "learning_rate": 0.0009734650647475529, "loss": 0.4503, "step": 70 }, { "epoch": 0.060839760068551844, "grad_norm": 0.0478515625, "learning_rate": 0.0009724030232334391, "loss": 0.4586, "step": 71 }, { "epoch": 0.061696658097686374, "grad_norm": 0.047119140625, "learning_rate": 0.0009713207455460893, "loss": 0.4326, "step": 72 }, { "epoch": 0.06255355612682091, "grad_norm": 0.0478515625, "learning_rate": 0.0009702182780466775, "loss": 0.4312, "step": 73 }, { "epoch": 0.06341045415595545, "grad_norm": 0.0458984375, "learning_rate": 0.0009690956679612422, "loss": 0.4472, "step": 74 }, { "epoch": 0.06426735218508997, "grad_norm": 0.0556640625, "learning_rate": 0.0009679529633786629, "loss": 0.4427, "step": 75 }, { "epoch": 0.06512425021422451, "grad_norm": 0.04736328125, "learning_rate": 0.0009667902132486009, "loss": 0.4308, "step": 76 }, { "epoch": 0.06598114824335904, "grad_norm": 0.052490234375, "learning_rate": 0.0009656074673794017, "loss": 0.431, "step": 77 }, { "epoch": 0.06683804627249357, "grad_norm": 0.0458984375, "learning_rate": 0.0009644047764359622, "loss": 0.4219, "step": 78 }, { "epoch": 0.0676949443016281, "grad_norm": 0.0478515625, "learning_rate": 0.0009631821919375591, "loss": 0.413, "step": 79 }, { "epoch": 0.06855184233076264, "grad_norm": 0.042724609375, "learning_rate": 0.0009619397662556434, "loss": 0.4065, "step": 80 }, { "epoch": 0.06940874035989718, "grad_norm": 0.046875, "learning_rate": 0.0009606775526115963, "loss": 0.447, "step": 81 }, { "epoch": 0.0702656383890317, "grad_norm": 0.04638671875, "learning_rate": 0.0009593956050744492, "loss": 0.4243, "step": 82 }, { "epoch": 0.07112253641816624, "grad_norm": 0.0439453125, "learning_rate": 0.0009580939785585681, "loss": 0.4003, "step": 83 }, { "epoch": 0.07197943444730077, "grad_norm": 0.046630859375, "learning_rate": 0.0009567727288213005, "loss": 0.4098, "step": 84 }, { "epoch": 0.0728363324764353, "grad_norm": 0.041015625, "learning_rate": 0.000955431912460588, "loss": 0.415, "step": 85 }, { "epoch": 0.07369323050556983, "grad_norm": 0.040771484375, "learning_rate": 0.0009540715869125407, "loss": 0.4239, "step": 86 }, { "epoch": 0.07455012853470437, "grad_norm": 0.043701171875, "learning_rate": 0.0009526918104489777, "loss": 0.4058, "step": 87 }, { "epoch": 0.07540702656383891, "grad_norm": 0.043701171875, "learning_rate": 0.0009512926421749304, "loss": 0.3894, "step": 88 }, { "epoch": 0.07626392459297343, "grad_norm": 0.0390625, "learning_rate": 0.0009498741420261108, "loss": 0.389, "step": 89 }, { "epoch": 0.07712082262210797, "grad_norm": 0.04736328125, "learning_rate": 0.0009484363707663442, "loss": 0.3865, "step": 90 }, { "epoch": 0.0779777206512425, "grad_norm": 0.048828125, "learning_rate": 0.0009469793899849661, "loss": 0.3823, "step": 91 }, { "epoch": 0.07883461868037704, "grad_norm": 0.041748046875, "learning_rate": 0.0009455032620941839, "loss": 0.3963, "step": 92 }, { "epoch": 0.07969151670951156, "grad_norm": 0.052001953125, "learning_rate": 0.0009440080503264037, "loss": 0.382, "step": 93 }, { "epoch": 0.0805484147386461, "grad_norm": 0.04296875, "learning_rate": 0.0009424938187315209, "loss": 0.3723, "step": 94 }, { "epoch": 0.08140531276778064, "grad_norm": 0.038818359375, "learning_rate": 0.0009409606321741775, "loss": 0.3766, "step": 95 }, { "epoch": 0.08226221079691516, "grad_norm": 0.038330078125, "learning_rate": 0.0009394085563309827, "loss": 0.3798, "step": 96 }, { "epoch": 0.0831191088260497, "grad_norm": 0.053466796875, "learning_rate": 0.0009378376576876999, "loss": 0.386, "step": 97 }, { "epoch": 0.08397600685518423, "grad_norm": 0.03759765625, "learning_rate": 0.0009362480035363986, "loss": 0.4009, "step": 98 }, { "epoch": 0.08483290488431877, "grad_norm": 0.05029296875, "learning_rate": 0.0009346396619725719, "loss": 0.3651, "step": 99 }, { "epoch": 0.0856898029134533, "grad_norm": 0.03759765625, "learning_rate": 0.0009330127018922195, "loss": 0.3922, "step": 100 }, { "epoch": 0.08654670094258783, "grad_norm": 0.036865234375, "learning_rate": 0.0009313671929888959, "loss": 0.3604, "step": 101 }, { "epoch": 0.08740359897172237, "grad_norm": 0.037353515625, "learning_rate": 0.0009297032057507264, "loss": 0.3547, "step": 102 }, { "epoch": 0.08826049700085689, "grad_norm": 0.04541015625, "learning_rate": 0.0009280208114573858, "loss": 0.3611, "step": 103 }, { "epoch": 0.08911739502999143, "grad_norm": 0.036376953125, "learning_rate": 0.0009263200821770461, "loss": 0.3789, "step": 104 }, { "epoch": 0.08997429305912596, "grad_norm": 0.035400390625, "learning_rate": 0.0009246010907632895, "loss": 0.3512, "step": 105 }, { "epoch": 0.0908311910882605, "grad_norm": 0.035400390625, "learning_rate": 0.0009228639108519867, "loss": 0.3634, "step": 106 }, { "epoch": 0.09168808911739502, "grad_norm": 0.03759765625, "learning_rate": 0.0009211086168581433, "loss": 0.3509, "step": 107 }, { "epoch": 0.09254498714652956, "grad_norm": 0.03759765625, "learning_rate": 0.0009193352839727121, "loss": 0.3474, "step": 108 }, { "epoch": 0.0934018851756641, "grad_norm": 0.036376953125, "learning_rate": 0.0009175439881593715, "loss": 0.3742, "step": 109 }, { "epoch": 0.09425878320479864, "grad_norm": 0.033447265625, "learning_rate": 0.0009157348061512727, "loss": 0.3422, "step": 110 }, { "epoch": 0.09511568123393316, "grad_norm": 0.043212890625, "learning_rate": 0.0009139078154477511, "loss": 0.3379, "step": 111 }, { "epoch": 0.0959725792630677, "grad_norm": 0.03662109375, "learning_rate": 0.0009120630943110077, "loss": 0.3374, "step": 112 }, { "epoch": 0.09682947729220223, "grad_norm": 0.03125, "learning_rate": 0.0009102007217627568, "loss": 0.3629, "step": 113 }, { "epoch": 0.09768637532133675, "grad_norm": 0.043701171875, "learning_rate": 0.0009083207775808396, "loss": 0.3537, "step": 114 }, { "epoch": 0.09854327335047129, "grad_norm": 0.0439453125, "learning_rate": 0.0009064233422958076, "loss": 0.3473, "step": 115 }, { "epoch": 0.09940017137960583, "grad_norm": 0.035888671875, "learning_rate": 0.0009045084971874737, "loss": 0.3549, "step": 116 }, { "epoch": 0.10025706940874037, "grad_norm": 0.03271484375, "learning_rate": 0.0009025763242814291, "loss": 0.3407, "step": 117 }, { "epoch": 0.10111396743787489, "grad_norm": 0.03271484375, "learning_rate": 0.0009006269063455304, "loss": 0.3304, "step": 118 }, { "epoch": 0.10197086546700942, "grad_norm": 0.033935546875, "learning_rate": 0.0008986603268863536, "loss": 0.3473, "step": 119 }, { "epoch": 0.10282776349614396, "grad_norm": 0.03857421875, "learning_rate": 0.0008966766701456176, "loss": 0.3376, "step": 120 }, { "epoch": 0.1036846615252785, "grad_norm": 0.03466796875, "learning_rate": 0.000894676021096575, "loss": 0.3262, "step": 121 }, { "epoch": 0.10454155955441302, "grad_norm": 0.03857421875, "learning_rate": 0.0008926584654403724, "loss": 0.3222, "step": 122 }, { "epoch": 0.10539845758354756, "grad_norm": 0.03515625, "learning_rate": 0.0008906240896023794, "loss": 0.3278, "step": 123 }, { "epoch": 0.1062553556126821, "grad_norm": 0.035888671875, "learning_rate": 0.0008885729807284854, "loss": 0.3251, "step": 124 }, { "epoch": 0.10711225364181662, "grad_norm": 0.033447265625, "learning_rate": 0.0008865052266813684, "loss": 0.3267, "step": 125 }, { "epoch": 0.10796915167095116, "grad_norm": 0.034912109375, "learning_rate": 0.0008844209160367298, "loss": 0.3176, "step": 126 }, { "epoch": 0.10882604970008569, "grad_norm": 0.040283203125, "learning_rate": 0.0008823201380795002, "loss": 0.3374, "step": 127 }, { "epoch": 0.10968294772922023, "grad_norm": 0.032470703125, "learning_rate": 0.0008802029828000156, "loss": 0.314, "step": 128 }, { "epoch": 0.11053984575835475, "grad_norm": 0.033935546875, "learning_rate": 0.0008780695408901613, "loss": 0.324, "step": 129 }, { "epoch": 0.11139674378748929, "grad_norm": 0.032958984375, "learning_rate": 0.0008759199037394887, "loss": 0.3199, "step": 130 }, { "epoch": 0.11225364181662383, "grad_norm": 0.031494140625, "learning_rate": 0.0008737541634312985, "loss": 0.3034, "step": 131 }, { "epoch": 0.11311053984575835, "grad_norm": 0.0322265625, "learning_rate": 0.0008715724127386971, "loss": 0.3153, "step": 132 }, { "epoch": 0.11396743787489289, "grad_norm": 0.034423828125, "learning_rate": 0.0008693747451206231, "loss": 0.3202, "step": 133 }, { "epoch": 0.11482433590402742, "grad_norm": 0.03369140625, "learning_rate": 0.0008671612547178428, "loss": 0.3325, "step": 134 }, { "epoch": 0.11568123393316196, "grad_norm": 0.043701171875, "learning_rate": 0.0008649320363489178, "loss": 0.3207, "step": 135 }, { "epoch": 0.11653813196229648, "grad_norm": 0.031982421875, "learning_rate": 0.0008626871855061438, "loss": 0.3279, "step": 136 }, { "epoch": 0.11739502999143102, "grad_norm": 0.033935546875, "learning_rate": 0.0008604267983514594, "loss": 0.3236, "step": 137 }, { "epoch": 0.11825192802056556, "grad_norm": 0.03076171875, "learning_rate": 0.0008581509717123273, "loss": 0.315, "step": 138 }, { "epoch": 0.11910882604970009, "grad_norm": 0.032958984375, "learning_rate": 0.0008558598030775857, "loss": 0.3124, "step": 139 }, { "epoch": 0.11996572407883462, "grad_norm": 0.04052734375, "learning_rate": 0.0008535533905932737, "loss": 0.3064, "step": 140 }, { "epoch": 0.12082262210796915, "grad_norm": 0.031494140625, "learning_rate": 0.0008512318330584259, "loss": 0.3055, "step": 141 }, { "epoch": 0.12167952013710369, "grad_norm": 0.03515625, "learning_rate": 0.0008488952299208401, "loss": 0.2951, "step": 142 }, { "epoch": 0.12253641816623821, "grad_norm": 0.032958984375, "learning_rate": 0.000846543681272818, "loss": 0.3288, "step": 143 }, { "epoch": 0.12339331619537275, "grad_norm": 0.032470703125, "learning_rate": 0.000844177287846877, "loss": 0.3015, "step": 144 }, { "epoch": 0.12425021422450729, "grad_norm": 0.033935546875, "learning_rate": 0.0008417961510114356, "loss": 0.3013, "step": 145 }, { "epoch": 0.12510711225364182, "grad_norm": 0.0341796875, "learning_rate": 0.0008394003727664709, "loss": 0.2914, "step": 146 }, { "epoch": 0.12596401028277635, "grad_norm": 0.0306396484375, "learning_rate": 0.000836990055739149, "loss": 0.3018, "step": 147 }, { "epoch": 0.1268209083119109, "grad_norm": 0.039794921875, "learning_rate": 0.0008345653031794292, "loss": 0.3074, "step": 148 }, { "epoch": 0.12767780634104542, "grad_norm": 0.03955078125, "learning_rate": 0.0008321262189556409, "loss": 0.3094, "step": 149 }, { "epoch": 0.12853470437017994, "grad_norm": 0.0311279296875, "learning_rate": 0.0008296729075500344, "loss": 0.2971, "step": 150 }, { "epoch": 0.1293916023993145, "grad_norm": 0.03125, "learning_rate": 0.0008272054740543053, "loss": 0.307, "step": 151 }, { "epoch": 0.13024850042844902, "grad_norm": 0.0390625, "learning_rate": 0.0008247240241650918, "loss": 0.2955, "step": 152 }, { "epoch": 0.13110539845758354, "grad_norm": 0.029541015625, "learning_rate": 0.0008222286641794488, "loss": 0.2955, "step": 153 }, { "epoch": 0.1319622964867181, "grad_norm": 0.0306396484375, "learning_rate": 0.0008197195009902923, "loss": 0.2904, "step": 154 }, { "epoch": 0.1328191945158526, "grad_norm": 0.0341796875, "learning_rate": 0.0008171966420818228, "loss": 0.3027, "step": 155 }, { "epoch": 0.13367609254498714, "grad_norm": 0.03515625, "learning_rate": 0.0008146601955249188, "loss": 0.2864, "step": 156 }, { "epoch": 0.13453299057412169, "grad_norm": 0.0361328125, "learning_rate": 0.0008121102699725089, "loss": 0.2965, "step": 157 }, { "epoch": 0.1353898886032562, "grad_norm": 0.041748046875, "learning_rate": 0.0008095469746549171, "loss": 0.3123, "step": 158 }, { "epoch": 0.13624678663239073, "grad_norm": 0.03125, "learning_rate": 0.0008069704193751832, "loss": 0.2912, "step": 159 }, { "epoch": 0.13710368466152528, "grad_norm": 0.033203125, "learning_rate": 0.0008043807145043603, "loss": 0.309, "step": 160 }, { "epoch": 0.1379605826906598, "grad_norm": 0.0517578125, "learning_rate": 0.0008017779709767858, "loss": 0.2938, "step": 161 }, { "epoch": 0.13881748071979436, "grad_norm": 0.031982421875, "learning_rate": 0.0007991623002853296, "loss": 0.2923, "step": 162 }, { "epoch": 0.13967437874892888, "grad_norm": 0.038330078125, "learning_rate": 0.0007965338144766185, "loss": 0.3003, "step": 163 }, { "epoch": 0.1405312767780634, "grad_norm": 0.033447265625, "learning_rate": 0.0007938926261462366, "loss": 0.2923, "step": 164 }, { "epoch": 0.14138817480719795, "grad_norm": 0.042236328125, "learning_rate": 0.0007912388484339011, "loss": 0.2892, "step": 165 }, { "epoch": 0.14224507283633248, "grad_norm": 0.032958984375, "learning_rate": 0.0007885725950186169, "loss": 0.3198, "step": 166 }, { "epoch": 0.143101970865467, "grad_norm": 0.037109375, "learning_rate": 0.000785893980113806, "loss": 0.2814, "step": 167 }, { "epoch": 0.14395886889460155, "grad_norm": 0.04833984375, "learning_rate": 0.0007832031184624164, "loss": 0.2911, "step": 168 }, { "epoch": 0.14481576692373607, "grad_norm": 0.0341796875, "learning_rate": 0.000780500125332005, "loss": 0.2893, "step": 169 }, { "epoch": 0.1456726649528706, "grad_norm": 0.034912109375, "learning_rate": 0.0007777851165098011, "loss": 0.2884, "step": 170 }, { "epoch": 0.14652956298200515, "grad_norm": 0.038818359375, "learning_rate": 0.0007750582082977468, "loss": 0.3052, "step": 171 }, { "epoch": 0.14738646101113967, "grad_norm": 0.0419921875, "learning_rate": 0.0007723195175075137, "loss": 0.2833, "step": 172 }, { "epoch": 0.14824335904027422, "grad_norm": 0.040283203125, "learning_rate": 0.0007695691614555002, "loss": 0.2795, "step": 173 }, { "epoch": 0.14910025706940874, "grad_norm": 0.038330078125, "learning_rate": 0.0007668072579578058, "loss": 0.3104, "step": 174 }, { "epoch": 0.14995715509854327, "grad_norm": 0.0322265625, "learning_rate": 0.000764033925325184, "loss": 0.2931, "step": 175 }, { "epoch": 0.15081405312767782, "grad_norm": 0.0361328125, "learning_rate": 0.0007612492823579744, "loss": 0.2867, "step": 176 }, { "epoch": 0.15167095115681234, "grad_norm": 0.0291748046875, "learning_rate": 0.0007584534483410137, "loss": 0.3051, "step": 177 }, { "epoch": 0.15252784918594686, "grad_norm": 0.0341796875, "learning_rate": 0.0007556465430385259, "loss": 0.2852, "step": 178 }, { "epoch": 0.1533847472150814, "grad_norm": 0.036865234375, "learning_rate": 0.0007528286866889924, "loss": 0.2919, "step": 179 }, { "epoch": 0.15424164524421594, "grad_norm": 0.0294189453125, "learning_rate": 0.00075, "loss": 0.2707, "step": 180 }, { "epoch": 0.15509854327335046, "grad_norm": 0.0277099609375, "learning_rate": 0.0007471606041430723, "loss": 0.275, "step": 181 }, { "epoch": 0.155955441302485, "grad_norm": 0.0361328125, "learning_rate": 0.0007443106207484776, "loss": 0.2718, "step": 182 }, { "epoch": 0.15681233933161953, "grad_norm": 0.031005859375, "learning_rate": 0.0007414501719000186, "loss": 0.2869, "step": 183 }, { "epoch": 0.15766923736075408, "grad_norm": 0.033203125, "learning_rate": 0.0007385793801298042, "loss": 0.275, "step": 184 }, { "epoch": 0.1585261353898886, "grad_norm": 0.0277099609375, "learning_rate": 0.000735698368412999, "loss": 0.2852, "step": 185 }, { "epoch": 0.15938303341902313, "grad_norm": 0.031005859375, "learning_rate": 0.0007328072601625557, "loss": 0.2959, "step": 186 }, { "epoch": 0.16023993144815768, "grad_norm": 0.045654296875, "learning_rate": 0.00072990617922393, "loss": 0.2681, "step": 187 }, { "epoch": 0.1610968294772922, "grad_norm": 0.034423828125, "learning_rate": 0.0007269952498697733, "loss": 0.2897, "step": 188 }, { "epoch": 0.16195372750642673, "grad_norm": 0.0281982421875, "learning_rate": 0.0007240745967946113, "loss": 0.2775, "step": 189 }, { "epoch": 0.16281062553556128, "grad_norm": 0.03564453125, "learning_rate": 0.0007211443451095007, "loss": 0.2692, "step": 190 }, { "epoch": 0.1636675235646958, "grad_norm": 0.03125, "learning_rate": 0.000718204620336671, "loss": 0.2847, "step": 191 }, { "epoch": 0.16452442159383032, "grad_norm": 0.03662109375, "learning_rate": 0.0007152555484041476, "loss": 0.2859, "step": 192 }, { "epoch": 0.16538131962296487, "grad_norm": 0.0299072265625, "learning_rate": 0.0007122972556403566, "loss": 0.2784, "step": 193 }, { "epoch": 0.1662382176520994, "grad_norm": 0.029052734375, "learning_rate": 0.0007093298687687141, "loss": 0.2801, "step": 194 }, { "epoch": 0.16709511568123395, "grad_norm": 0.027587890625, "learning_rate": 0.0007063535149021973, "loss": 0.2787, "step": 195 }, { "epoch": 0.16795201371036847, "grad_norm": 0.03173828125, "learning_rate": 0.0007033683215379002, "loss": 0.2796, "step": 196 }, { "epoch": 0.168808911739503, "grad_norm": 0.0291748046875, "learning_rate": 0.0007003744165515704, "loss": 0.2817, "step": 197 }, { "epoch": 0.16966580976863754, "grad_norm": 0.037109375, "learning_rate": 0.0006973719281921336, "loss": 0.2648, "step": 198 }, { "epoch": 0.17052270779777207, "grad_norm": 0.03173828125, "learning_rate": 0.0006943609850761978, "loss": 0.2822, "step": 199 }, { "epoch": 0.1713796058269066, "grad_norm": 0.029541015625, "learning_rate": 0.000691341716182545, "loss": 0.2867, "step": 200 }, { "epoch": 0.17223650385604114, "grad_norm": 0.03564453125, "learning_rate": 0.0006883142508466054, "loss": 0.2901, "step": 201 }, { "epoch": 0.17309340188517566, "grad_norm": 0.027099609375, "learning_rate": 0.0006852787187549182, "loss": 0.2644, "step": 202 }, { "epoch": 0.17395029991431019, "grad_norm": 0.037353515625, "learning_rate": 0.000682235249939575, "loss": 0.277, "step": 203 }, { "epoch": 0.17480719794344474, "grad_norm": 0.038818359375, "learning_rate": 0.0006791839747726501, "loss": 0.2932, "step": 204 }, { "epoch": 0.17566409597257926, "grad_norm": 0.03466796875, "learning_rate": 0.0006761250239606168, "loss": 0.2822, "step": 205 }, { "epoch": 0.17652099400171378, "grad_norm": 0.05322265625, "learning_rate": 0.0006730585285387465, "loss": 0.3618, "step": 206 }, { "epoch": 0.17737789203084833, "grad_norm": 0.0289306640625, "learning_rate": 0.000669984619865497, "loss": 0.2766, "step": 207 }, { "epoch": 0.17823479005998286, "grad_norm": 0.03759765625, "learning_rate": 0.0006669034296168854, "loss": 0.2795, "step": 208 }, { "epoch": 0.1790916880891174, "grad_norm": 0.03369140625, "learning_rate": 0.0006638150897808468, "loss": 0.2788, "step": 209 }, { "epoch": 0.17994858611825193, "grad_norm": 0.0322265625, "learning_rate": 0.0006607197326515808, "loss": 0.2795, "step": 210 }, { "epoch": 0.18080548414738645, "grad_norm": 0.027099609375, "learning_rate": 0.0006576174908238849, "loss": 0.2742, "step": 211 }, { "epoch": 0.181662382176521, "grad_norm": 0.029541015625, "learning_rate": 0.0006545084971874737, "loss": 0.2704, "step": 212 }, { "epoch": 0.18251928020565553, "grad_norm": 0.02734375, "learning_rate": 0.0006513928849212874, "loss": 0.2725, "step": 213 }, { "epoch": 0.18337617823479005, "grad_norm": 0.0400390625, "learning_rate": 0.0006482707874877854, "loss": 0.2742, "step": 214 }, { "epoch": 0.1842330762639246, "grad_norm": 0.0272216796875, "learning_rate": 0.0006451423386272311, "loss": 0.268, "step": 215 }, { "epoch": 0.18508997429305912, "grad_norm": 0.0308837890625, "learning_rate": 0.0006420076723519614, "loss": 0.2617, "step": 216 }, { "epoch": 0.18594687232219365, "grad_norm": 0.0264892578125, "learning_rate": 0.0006388669229406462, "loss": 0.2629, "step": 217 }, { "epoch": 0.1868037703513282, "grad_norm": 0.0286865234375, "learning_rate": 0.0006357202249325371, "loss": 0.2812, "step": 218 }, { "epoch": 0.18766066838046272, "grad_norm": 0.02685546875, "learning_rate": 0.000632567713121704, "loss": 0.2766, "step": 219 }, { "epoch": 0.18851756640959727, "grad_norm": 0.0291748046875, "learning_rate": 0.0006294095225512603, "loss": 0.2816, "step": 220 }, { "epoch": 0.1893744644387318, "grad_norm": 0.027099609375, "learning_rate": 0.000626245788507579, "loss": 0.2744, "step": 221 }, { "epoch": 0.19023136246786632, "grad_norm": 0.0281982421875, "learning_rate": 0.0006230766465144965, "loss": 0.2777, "step": 222 }, { "epoch": 0.19108826049700087, "grad_norm": 0.0341796875, "learning_rate": 0.0006199022323275083, "loss": 0.2632, "step": 223 }, { "epoch": 0.1919451585261354, "grad_norm": 0.0274658203125, "learning_rate": 0.0006167226819279528, "loss": 0.2759, "step": 224 }, { "epoch": 0.1928020565552699, "grad_norm": 0.026611328125, "learning_rate": 0.0006135381315171866, "loss": 0.2926, "step": 225 }, { "epoch": 0.19365895458440446, "grad_norm": 0.031494140625, "learning_rate": 0.0006103487175107507, "loss": 0.2759, "step": 226 }, { "epoch": 0.194515852613539, "grad_norm": 0.0274658203125, "learning_rate": 0.0006071545765325253, "loss": 0.2706, "step": 227 }, { "epoch": 0.1953727506426735, "grad_norm": 0.0390625, "learning_rate": 0.0006039558454088796, "loss": 0.2816, "step": 228 }, { "epoch": 0.19622964867180806, "grad_norm": 0.0272216796875, "learning_rate": 0.0006007526611628086, "loss": 0.2698, "step": 229 }, { "epoch": 0.19708654670094258, "grad_norm": 0.0272216796875, "learning_rate": 0.0005975451610080642, "loss": 0.2719, "step": 230 }, { "epoch": 0.19794344473007713, "grad_norm": 0.0361328125, "learning_rate": 0.0005943334823432777, "loss": 0.2647, "step": 231 }, { "epoch": 0.19880034275921166, "grad_norm": 0.029052734375, "learning_rate": 0.0005911177627460738, "loss": 0.2688, "step": 232 }, { "epoch": 0.19965724078834618, "grad_norm": 0.032470703125, "learning_rate": 0.0005878981399671774, "loss": 0.2762, "step": 233 }, { "epoch": 0.20051413881748073, "grad_norm": 0.029541015625, "learning_rate": 0.0005846747519245122, "loss": 0.2664, "step": 234 }, { "epoch": 0.20137103684661525, "grad_norm": 0.033203125, "learning_rate": 0.0005814477366972944, "loss": 0.2715, "step": 235 }, { "epoch": 0.20222793487574978, "grad_norm": 0.03955078125, "learning_rate": 0.0005782172325201155, "loss": 0.2728, "step": 236 }, { "epoch": 0.20308483290488433, "grad_norm": 0.0272216796875, "learning_rate": 0.0005749833777770225, "loss": 0.2638, "step": 237 }, { "epoch": 0.20394173093401885, "grad_norm": 0.026611328125, "learning_rate": 0.0005717463109955896, "loss": 0.271, "step": 238 }, { "epoch": 0.20479862896315337, "grad_norm": 0.042236328125, "learning_rate": 0.0005685061708409841, "loss": 0.2682, "step": 239 }, { "epoch": 0.20565552699228792, "grad_norm": 0.0274658203125, "learning_rate": 0.000565263096110026, "loss": 0.2635, "step": 240 }, { "epoch": 0.20651242502142245, "grad_norm": 0.02685546875, "learning_rate": 0.0005620172257252427, "loss": 0.2513, "step": 241 }, { "epoch": 0.207369323050557, "grad_norm": 0.02734375, "learning_rate": 0.0005587686987289189, "loss": 0.2672, "step": 242 }, { "epoch": 0.20822622107969152, "grad_norm": 0.0311279296875, "learning_rate": 0.0005555176542771388, "loss": 0.2777, "step": 243 }, { "epoch": 0.20908311910882604, "grad_norm": 0.0255126953125, "learning_rate": 0.0005522642316338268, "loss": 0.2669, "step": 244 }, { "epoch": 0.2099400171379606, "grad_norm": 0.0286865234375, "learning_rate": 0.0005490085701647804, "loss": 0.2708, "step": 245 }, { "epoch": 0.21079691516709512, "grad_norm": 0.0260009765625, "learning_rate": 0.0005457508093317013, "loss": 0.2727, "step": 246 }, { "epoch": 0.21165381319622964, "grad_norm": 0.02490234375, "learning_rate": 0.0005424910886862209, "loss": 0.2751, "step": 247 }, { "epoch": 0.2125107112253642, "grad_norm": 0.023193359375, "learning_rate": 0.0005392295478639225, "loss": 0.2649, "step": 248 }, { "epoch": 0.2133676092544987, "grad_norm": 0.023681640625, "learning_rate": 0.0005359663265783598, "loss": 0.2647, "step": 249 }, { "epoch": 0.21422450728363324, "grad_norm": 0.0283203125, "learning_rate": 0.0005327015646150716, "loss": 0.2594, "step": 250 }, { "epoch": 0.2150814053127678, "grad_norm": 0.026611328125, "learning_rate": 0.0005294354018255945, "loss": 0.2944, "step": 251 }, { "epoch": 0.2159383033419023, "grad_norm": 0.0257568359375, "learning_rate": 0.000526167978121472, "loss": 0.2886, "step": 252 }, { "epoch": 0.21679520137103683, "grad_norm": 0.0281982421875, "learning_rate": 0.0005228994334682604, "loss": 0.2558, "step": 253 }, { "epoch": 0.21765209940017138, "grad_norm": 0.025634765625, "learning_rate": 0.0005196299078795343, "loss": 0.2571, "step": 254 }, { "epoch": 0.2185089974293059, "grad_norm": 0.02685546875, "learning_rate": 0.0005163595414108881, "loss": 0.2524, "step": 255 }, { "epoch": 0.21936589545844046, "grad_norm": 0.0250244140625, "learning_rate": 0.0005130884741539367, "loss": 0.2698, "step": 256 }, { "epoch": 0.22022279348757498, "grad_norm": 0.024658203125, "learning_rate": 0.0005098168462303141, "loss": 0.2716, "step": 257 }, { "epoch": 0.2210796915167095, "grad_norm": 0.0238037109375, "learning_rate": 0.0005065447977856722, "loss": 0.2605, "step": 258 }, { "epoch": 0.22193658954584405, "grad_norm": 0.025146484375, "learning_rate": 0.0005032724689836759, "loss": 0.2584, "step": 259 }, { "epoch": 0.22279348757497858, "grad_norm": 0.025146484375, "learning_rate": 0.0005, "loss": 0.2618, "step": 260 }, { "epoch": 0.2236503856041131, "grad_norm": 0.0289306640625, "learning_rate": 0.0004967275310163241, "loss": 0.2602, "step": 261 }, { "epoch": 0.22450728363324765, "grad_norm": 0.031494140625, "learning_rate": 0.0004934552022143279, "loss": 0.2744, "step": 262 }, { "epoch": 0.22536418166238217, "grad_norm": 0.036865234375, "learning_rate": 0.0004901831537696859, "loss": 0.2598, "step": 263 }, { "epoch": 0.2262210796915167, "grad_norm": 0.0264892578125, "learning_rate": 0.0004869115258460635, "loss": 0.2629, "step": 264 }, { "epoch": 0.22707797772065125, "grad_norm": 0.0245361328125, "learning_rate": 0.00048364045858911197, "loss": 0.2601, "step": 265 }, { "epoch": 0.22793487574978577, "grad_norm": 0.035888671875, "learning_rate": 0.00048037009212046586, "loss": 0.261, "step": 266 }, { "epoch": 0.22879177377892032, "grad_norm": 0.03076171875, "learning_rate": 0.0004771005665317397, "loss": 0.2531, "step": 267 }, { "epoch": 0.22964867180805484, "grad_norm": 0.0250244140625, "learning_rate": 0.0004738320218785281, "loss": 0.2707, "step": 268 }, { "epoch": 0.23050556983718937, "grad_norm": 0.0240478515625, "learning_rate": 0.00047056459817440544, "loss": 0.2636, "step": 269 }, { "epoch": 0.23136246786632392, "grad_norm": 0.0264892578125, "learning_rate": 0.00046729843538492847, "loss": 0.2606, "step": 270 }, { "epoch": 0.23221936589545844, "grad_norm": 0.0286865234375, "learning_rate": 0.00046403367342164026, "loss": 0.257, "step": 271 }, { "epoch": 0.23307626392459296, "grad_norm": 0.028564453125, "learning_rate": 0.0004607704521360776, "loss": 0.2646, "step": 272 }, { "epoch": 0.23393316195372751, "grad_norm": 0.028076171875, "learning_rate": 0.0004575089113137792, "loss": 0.2735, "step": 273 }, { "epoch": 0.23479005998286204, "grad_norm": 0.0257568359375, "learning_rate": 0.00045424919066829885, "loss": 0.272, "step": 274 }, { "epoch": 0.23564695801199656, "grad_norm": 0.02587890625, "learning_rate": 0.0004509914298352197, "loss": 0.266, "step": 275 }, { "epoch": 0.2365038560411311, "grad_norm": 0.0242919921875, "learning_rate": 0.00044773576836617336, "loss": 0.2607, "step": 276 }, { "epoch": 0.23736075407026563, "grad_norm": 0.0245361328125, "learning_rate": 0.0004444823457228612, "loss": 0.2696, "step": 277 }, { "epoch": 0.23821765209940018, "grad_norm": 0.024658203125, "learning_rate": 0.00044123130127108126, "loss": 0.2598, "step": 278 }, { "epoch": 0.2390745501285347, "grad_norm": 0.0301513671875, "learning_rate": 0.0004379827742747575, "loss": 0.2581, "step": 279 }, { "epoch": 0.23993144815766923, "grad_norm": 0.0252685546875, "learning_rate": 0.00043473690388997434, "loss": 0.2652, "step": 280 }, { "epoch": 0.24078834618680378, "grad_norm": 0.024658203125, "learning_rate": 0.0004314938291590161, "loss": 0.2635, "step": 281 }, { "epoch": 0.2416452442159383, "grad_norm": 0.0223388671875, "learning_rate": 0.0004282536890044104, "loss": 0.2546, "step": 282 }, { "epoch": 0.24250214224507283, "grad_norm": 0.0234375, "learning_rate": 0.0004250166222229774, "loss": 0.2533, "step": 283 }, { "epoch": 0.24335904027420738, "grad_norm": 0.026123046875, "learning_rate": 0.0004217827674798845, "loss": 0.2712, "step": 284 }, { "epoch": 0.2442159383033419, "grad_norm": 0.024658203125, "learning_rate": 0.0004185522633027057, "loss": 0.2658, "step": 285 }, { "epoch": 0.24507283633247642, "grad_norm": 0.02587890625, "learning_rate": 0.0004153252480754877, "loss": 0.2588, "step": 286 }, { "epoch": 0.24592973436161097, "grad_norm": 0.029541015625, "learning_rate": 0.00041210186003282274, "loss": 0.2671, "step": 287 }, { "epoch": 0.2467866323907455, "grad_norm": 0.0245361328125, "learning_rate": 0.00040888223725392626, "loss": 0.2741, "step": 288 }, { "epoch": 0.24764353041988005, "grad_norm": 0.0242919921875, "learning_rate": 0.00040566651765672245, "loss": 0.27, "step": 289 }, { "epoch": 0.24850042844901457, "grad_norm": 0.02197265625, "learning_rate": 0.00040245483899193594, "loss": 0.2679, "step": 290 }, { "epoch": 0.2493573264781491, "grad_norm": 0.0224609375, "learning_rate": 0.00039924733883719147, "loss": 0.2685, "step": 291 }, { "epoch": 0.25021422450728364, "grad_norm": 0.0322265625, "learning_rate": 0.0003960441545911204, "loss": 0.2687, "step": 292 }, { "epoch": 0.25107112253641817, "grad_norm": 0.0281982421875, "learning_rate": 0.0003928454234674747, "loss": 0.2554, "step": 293 }, { "epoch": 0.2519280205655527, "grad_norm": 0.031494140625, "learning_rate": 0.0003896512824892495, "loss": 0.268, "step": 294 }, { "epoch": 0.2527849185946872, "grad_norm": 0.0296630859375, "learning_rate": 0.00038646186848281344, "loss": 0.2694, "step": 295 }, { "epoch": 0.2536418166238218, "grad_norm": 0.0283203125, "learning_rate": 0.00038327731807204744, "loss": 0.2585, "step": 296 }, { "epoch": 0.2544987146529563, "grad_norm": 0.0341796875, "learning_rate": 0.0003800977676724919, "loss": 0.2764, "step": 297 }, { "epoch": 0.25535561268209084, "grad_norm": 0.024658203125, "learning_rate": 0.0003769233534855035, "loss": 0.2688, "step": 298 }, { "epoch": 0.25621251071122536, "grad_norm": 0.0277099609375, "learning_rate": 0.00037375421149242103, "loss": 0.2561, "step": 299 }, { "epoch": 0.2570694087403599, "grad_norm": 0.0269775390625, "learning_rate": 0.0003705904774487396, "loss": 0.2564, "step": 300 }, { "epoch": 0.2579263067694944, "grad_norm": 0.0247802734375, "learning_rate": 0.0003674322868782959, "loss": 0.2543, "step": 301 }, { "epoch": 0.258783204798629, "grad_norm": 0.0255126953125, "learning_rate": 0.0003642797750674629, "loss": 0.2586, "step": 302 }, { "epoch": 0.2596401028277635, "grad_norm": 0.0228271484375, "learning_rate": 0.00036113307705935393, "loss": 0.2624, "step": 303 }, { "epoch": 0.26049700085689803, "grad_norm": 0.02197265625, "learning_rate": 0.0003579923276480387, "loss": 0.2658, "step": 304 }, { "epoch": 0.26135389888603255, "grad_norm": 0.0245361328125, "learning_rate": 0.0003548576613727689, "loss": 0.2793, "step": 305 }, { "epoch": 0.2622107969151671, "grad_norm": 0.031494140625, "learning_rate": 0.0003517292125122146, "loss": 0.2605, "step": 306 }, { "epoch": 0.26306769494430166, "grad_norm": 0.0341796875, "learning_rate": 0.0003486071150787128, "loss": 0.2654, "step": 307 }, { "epoch": 0.2639245929734362, "grad_norm": 0.0260009765625, "learning_rate": 0.00034549150281252633, "loss": 0.2711, "step": 308 }, { "epoch": 0.2647814910025707, "grad_norm": 0.0233154296875, "learning_rate": 0.0003423825091761153, "loss": 0.2686, "step": 309 }, { "epoch": 0.2656383890317052, "grad_norm": 0.0260009765625, "learning_rate": 0.0003392802673484193, "loss": 0.2539, "step": 310 }, { "epoch": 0.26649528706083975, "grad_norm": 0.023193359375, "learning_rate": 0.0003361849102191533, "loss": 0.2706, "step": 311 }, { "epoch": 0.26735218508997427, "grad_norm": 0.0260009765625, "learning_rate": 0.00033309657038311456, "loss": 0.2854, "step": 312 }, { "epoch": 0.26820908311910885, "grad_norm": 0.0235595703125, "learning_rate": 0.00033001538013450283, "loss": 0.2714, "step": 313 }, { "epoch": 0.26906598114824337, "grad_norm": 0.0213623046875, "learning_rate": 0.0003269414714612534, "loss": 0.2624, "step": 314 }, { "epoch": 0.2699228791773779, "grad_norm": 0.0224609375, "learning_rate": 0.00032387497603938325, "loss": 0.264, "step": 315 }, { "epoch": 0.2707797772065124, "grad_norm": 0.022705078125, "learning_rate": 0.00032081602522734986, "loss": 0.2611, "step": 316 }, { "epoch": 0.27163667523564694, "grad_norm": 0.0262451171875, "learning_rate": 0.0003177647500604252, "loss": 0.2697, "step": 317 }, { "epoch": 0.27249357326478146, "grad_norm": 0.024169921875, "learning_rate": 0.00031472128124508187, "loss": 0.2684, "step": 318 }, { "epoch": 0.27335047129391604, "grad_norm": 0.0289306640625, "learning_rate": 0.00031168574915339467, "loss": 0.2627, "step": 319 }, { "epoch": 0.27420736932305056, "grad_norm": 0.02197265625, "learning_rate": 0.0003086582838174551, "loss": 0.2661, "step": 320 }, { "epoch": 0.2750642673521851, "grad_norm": 0.023193359375, "learning_rate": 0.0003056390149238022, "loss": 0.2733, "step": 321 }, { "epoch": 0.2759211653813196, "grad_norm": 0.0240478515625, "learning_rate": 0.00030262807180786645, "loss": 0.2619, "step": 322 }, { "epoch": 0.27677806341045413, "grad_norm": 0.0218505859375, "learning_rate": 0.00029962558344842963, "loss": 0.2607, "step": 323 }, { "epoch": 0.2776349614395887, "grad_norm": 0.034912109375, "learning_rate": 0.0002966316784621, "loss": 0.2662, "step": 324 }, { "epoch": 0.27849185946872324, "grad_norm": 0.029052734375, "learning_rate": 0.0002936464850978027, "loss": 0.2581, "step": 325 }, { "epoch": 0.27934875749785776, "grad_norm": 0.0223388671875, "learning_rate": 0.0002906701312312861, "loss": 0.2662, "step": 326 }, { "epoch": 0.2802056555269923, "grad_norm": 0.023681640625, "learning_rate": 0.00028770274435964356, "loss": 0.26, "step": 327 }, { "epoch": 0.2810625535561268, "grad_norm": 0.0272216796875, "learning_rate": 0.0002847444515958523, "loss": 0.2645, "step": 328 }, { "epoch": 0.2819194515852613, "grad_norm": 0.025146484375, "learning_rate": 0.0002817953796633289, "loss": 0.2635, "step": 329 }, { "epoch": 0.2827763496143959, "grad_norm": 0.0233154296875, "learning_rate": 0.00027885565489049947, "loss": 0.2619, "step": 330 }, { "epoch": 0.28363324764353043, "grad_norm": 0.0213623046875, "learning_rate": 0.0002759254032053888, "loss": 0.2668, "step": 331 }, { "epoch": 0.28449014567266495, "grad_norm": 0.0216064453125, "learning_rate": 0.00027300475013022663, "loss": 0.2553, "step": 332 }, { "epoch": 0.2853470437017995, "grad_norm": 0.0228271484375, "learning_rate": 0.0002700938207760701, "loss": 0.2614, "step": 333 }, { "epoch": 0.286203941730934, "grad_norm": 0.02587890625, "learning_rate": 0.0002671927398374443, "loss": 0.2541, "step": 334 }, { "epoch": 0.2870608397600686, "grad_norm": 0.022216796875, "learning_rate": 0.00026430163158700117, "loss": 0.256, "step": 335 }, { "epoch": 0.2879177377892031, "grad_norm": 0.024169921875, "learning_rate": 0.00026142061987019576, "loss": 0.2675, "step": 336 }, { "epoch": 0.2887746358183376, "grad_norm": 0.0284423828125, "learning_rate": 0.0002585498280999815, "loss": 0.2666, "step": 337 }, { "epoch": 0.28963153384747214, "grad_norm": 0.02490234375, "learning_rate": 0.0002556893792515227, "loss": 0.2888, "step": 338 }, { "epoch": 0.29048843187660667, "grad_norm": 0.030029296875, "learning_rate": 0.00025283939585692784, "loss": 0.2674, "step": 339 }, { "epoch": 0.2913453299057412, "grad_norm": 0.0245361328125, "learning_rate": 0.0002500000000000001, "loss": 0.2624, "step": 340 }, { "epoch": 0.29220222793487577, "grad_norm": 0.0205078125, "learning_rate": 0.0002471713133110078, "loss": 0.2457, "step": 341 }, { "epoch": 0.2930591259640103, "grad_norm": 0.0244140625, "learning_rate": 0.00024435345696147403, "loss": 0.2567, "step": 342 }, { "epoch": 0.2939160239931448, "grad_norm": 0.026123046875, "learning_rate": 0.00024154655165898627, "loss": 0.2569, "step": 343 }, { "epoch": 0.29477292202227934, "grad_norm": 0.0244140625, "learning_rate": 0.00023875071764202561, "loss": 0.2583, "step": 344 }, { "epoch": 0.29562982005141386, "grad_norm": 0.0230712890625, "learning_rate": 0.00023596607467481602, "loss": 0.2549, "step": 345 }, { "epoch": 0.29648671808054844, "grad_norm": 0.030517578125, "learning_rate": 0.00023319274204219425, "loss": 0.2647, "step": 346 }, { "epoch": 0.29734361610968296, "grad_norm": 0.0284423828125, "learning_rate": 0.00023043083854449987, "loss": 0.2848, "step": 347 }, { "epoch": 0.2982005141388175, "grad_norm": 0.026123046875, "learning_rate": 0.00022768048249248646, "loss": 0.2724, "step": 348 }, { "epoch": 0.299057412167952, "grad_norm": 0.027587890625, "learning_rate": 0.00022494179170225333, "loss": 0.2684, "step": 349 }, { "epoch": 0.29991431019708653, "grad_norm": 0.0255126953125, "learning_rate": 0.00022221488349019903, "loss": 0.2623, "step": 350 }, { "epoch": 0.30077120822622105, "grad_norm": 0.0269775390625, "learning_rate": 0.0002194998746679952, "loss": 0.2608, "step": 351 }, { "epoch": 0.30162810625535563, "grad_norm": 0.03662109375, "learning_rate": 0.0002167968815375837, "loss": 0.2671, "step": 352 }, { "epoch": 0.30248500428449016, "grad_norm": 0.031494140625, "learning_rate": 0.00021410601988619394, "loss": 0.2583, "step": 353 }, { "epoch": 0.3033419023136247, "grad_norm": 0.02490234375, "learning_rate": 0.00021142740498138323, "loss": 0.2617, "step": 354 }, { "epoch": 0.3041988003427592, "grad_norm": 0.022216796875, "learning_rate": 0.000208761151566099, "loss": 0.2569, "step": 355 }, { "epoch": 0.3050556983718937, "grad_norm": 0.0250244140625, "learning_rate": 0.00020610737385376348, "loss": 0.2612, "step": 356 }, { "epoch": 0.3059125964010283, "grad_norm": 0.02783203125, "learning_rate": 0.00020346618552338148, "loss": 0.2629, "step": 357 }, { "epoch": 0.3067694944301628, "grad_norm": 0.02197265625, "learning_rate": 0.00020083769971467047, "loss": 0.2629, "step": 358 }, { "epoch": 0.30762639245929735, "grad_norm": 0.024169921875, "learning_rate": 0.0001982220290232143, "loss": 0.2847, "step": 359 }, { "epoch": 0.30848329048843187, "grad_norm": 0.026611328125, "learning_rate": 0.00019561928549563967, "loss": 0.266, "step": 360 }, { "epoch": 0.3093401885175664, "grad_norm": 0.0272216796875, "learning_rate": 0.00019302958062481672, "loss": 0.2563, "step": 361 }, { "epoch": 0.3101970865467009, "grad_norm": 0.031005859375, "learning_rate": 0.00019045302534508295, "loss": 0.2696, "step": 362 }, { "epoch": 0.3110539845758355, "grad_norm": 0.0234375, "learning_rate": 0.0001878897300274911, "loss": 0.2636, "step": 363 }, { "epoch": 0.31191088260497, "grad_norm": 0.0220947265625, "learning_rate": 0.00018533980447508135, "loss": 0.258, "step": 364 }, { "epoch": 0.31276778063410454, "grad_norm": 0.022705078125, "learning_rate": 0.00018280335791817732, "loss": 0.2534, "step": 365 }, { "epoch": 0.31362467866323906, "grad_norm": 0.0211181640625, "learning_rate": 0.00018028049900970766, "loss": 0.2709, "step": 366 }, { "epoch": 0.3144815766923736, "grad_norm": 0.021728515625, "learning_rate": 0.0001777713358205514, "loss": 0.2708, "step": 367 }, { "epoch": 0.31533847472150817, "grad_norm": 0.0205078125, "learning_rate": 0.00017527597583490823, "loss": 0.2587, "step": 368 }, { "epoch": 0.3161953727506427, "grad_norm": 0.020263671875, "learning_rate": 0.00017279452594569483, "loss": 0.2597, "step": 369 }, { "epoch": 0.3170522707797772, "grad_norm": 0.0242919921875, "learning_rate": 0.00017032709244996558, "loss": 0.2611, "step": 370 }, { "epoch": 0.31790916880891174, "grad_norm": 0.021484375, "learning_rate": 0.00016787378104435928, "loss": 0.2697, "step": 371 }, { "epoch": 0.31876606683804626, "grad_norm": 0.020751953125, "learning_rate": 0.00016543469682057105, "loss": 0.2641, "step": 372 }, { "epoch": 0.3196229648671808, "grad_norm": 0.022216796875, "learning_rate": 0.00016300994426085103, "loss": 0.2658, "step": 373 }, { "epoch": 0.32047986289631536, "grad_norm": 0.0213623046875, "learning_rate": 0.0001605996272335291, "loss": 0.2641, "step": 374 }, { "epoch": 0.3213367609254499, "grad_norm": 0.0184326171875, "learning_rate": 0.00015820384898856434, "loss": 0.2651, "step": 375 }, { "epoch": 0.3221936589545844, "grad_norm": 0.0262451171875, "learning_rate": 0.00015582271215312294, "loss": 0.2559, "step": 376 }, { "epoch": 0.32305055698371893, "grad_norm": 0.023681640625, "learning_rate": 0.00015345631872718213, "loss": 0.2558, "step": 377 }, { "epoch": 0.32390745501285345, "grad_norm": 0.0252685546875, "learning_rate": 0.00015110477007916002, "loss": 0.2537, "step": 378 }, { "epoch": 0.32476435304198803, "grad_norm": 0.019775390625, "learning_rate": 0.0001487681669415742, "loss": 0.2565, "step": 379 }, { "epoch": 0.32562125107112255, "grad_norm": 0.019775390625, "learning_rate": 0.00014644660940672628, "loss": 0.2562, "step": 380 }, { "epoch": 0.3264781491002571, "grad_norm": 0.0301513671875, "learning_rate": 0.00014414019692241437, "loss": 0.2644, "step": 381 }, { "epoch": 0.3273350471293916, "grad_norm": 0.019287109375, "learning_rate": 0.00014184902828767287, "loss": 0.2671, "step": 382 }, { "epoch": 0.3281919451585261, "grad_norm": 0.0262451171875, "learning_rate": 0.0001395732016485406, "loss": 0.249, "step": 383 }, { "epoch": 0.32904884318766064, "grad_norm": 0.0198974609375, "learning_rate": 0.0001373128144938563, "loss": 0.2558, "step": 384 }, { "epoch": 0.3299057412167952, "grad_norm": 0.021728515625, "learning_rate": 0.00013506796365108232, "loss": 0.2693, "step": 385 }, { "epoch": 0.33076263924592975, "grad_norm": 0.021484375, "learning_rate": 0.00013283874528215734, "loss": 0.2686, "step": 386 }, { "epoch": 0.33161953727506427, "grad_norm": 0.02587890625, "learning_rate": 0.00013062525487937698, "loss": 0.2711, "step": 387 }, { "epoch": 0.3324764353041988, "grad_norm": 0.018798828125, "learning_rate": 0.00012842758726130281, "loss": 0.2559, "step": 388 }, { "epoch": 0.3333333333333333, "grad_norm": 0.0189208984375, "learning_rate": 0.00012624583656870153, "loss": 0.2639, "step": 389 }, { "epoch": 0.3341902313624679, "grad_norm": 0.0196533203125, "learning_rate": 0.00012408009626051135, "loss": 0.2681, "step": 390 }, { "epoch": 0.3350471293916024, "grad_norm": 0.02001953125, "learning_rate": 0.00012193045910983863, "loss": 0.2629, "step": 391 }, { "epoch": 0.33590402742073694, "grad_norm": 0.019775390625, "learning_rate": 0.00011979701719998454, "loss": 0.2671, "step": 392 }, { "epoch": 0.33676092544987146, "grad_norm": 0.0240478515625, "learning_rate": 0.00011767986192049984, "loss": 0.2651, "step": 393 }, { "epoch": 0.337617823479006, "grad_norm": 0.0201416015625, "learning_rate": 0.00011557908396327027, "loss": 0.2646, "step": 394 }, { "epoch": 0.3384747215081405, "grad_norm": 0.0257568359375, "learning_rate": 0.00011349477331863151, "loss": 0.2723, "step": 395 }, { "epoch": 0.3393316195372751, "grad_norm": 0.023681640625, "learning_rate": 0.00011142701927151455, "loss": 0.2775, "step": 396 }, { "epoch": 0.3401885175664096, "grad_norm": 0.032470703125, "learning_rate": 0.00010937591039762085, "loss": 0.265, "step": 397 }, { "epoch": 0.34104541559554413, "grad_norm": 0.0198974609375, "learning_rate": 0.00010734153455962764, "loss": 0.2661, "step": 398 }, { "epoch": 0.34190231362467866, "grad_norm": 0.0184326171875, "learning_rate": 0.00010532397890342504, "loss": 0.2526, "step": 399 }, { "epoch": 0.3427592116538132, "grad_norm": 0.0181884765625, "learning_rate": 0.00010332332985438247, "loss": 0.2583, "step": 400 }, { "epoch": 0.34361610968294776, "grad_norm": 0.0216064453125, "learning_rate": 0.0001013396731136465, "loss": 0.2544, "step": 401 }, { "epoch": 0.3444730077120823, "grad_norm": 0.0247802734375, "learning_rate": 9.937309365446973e-05, "loss": 0.2796, "step": 402 }, { "epoch": 0.3453299057412168, "grad_norm": 0.0189208984375, "learning_rate": 9.742367571857092e-05, "loss": 0.2611, "step": 403 }, { "epoch": 0.3461868037703513, "grad_norm": 0.0225830078125, "learning_rate": 9.549150281252633e-05, "loss": 0.2568, "step": 404 }, { "epoch": 0.34704370179948585, "grad_norm": 0.0233154296875, "learning_rate": 9.357665770419243e-05, "loss": 0.2661, "step": 405 }, { "epoch": 0.34790059982862037, "grad_norm": 0.0245361328125, "learning_rate": 9.167922241916055e-05, "loss": 0.27, "step": 406 }, { "epoch": 0.34875749785775495, "grad_norm": 0.0198974609375, "learning_rate": 8.979927823724321e-05, "loss": 0.2665, "step": 407 }, { "epoch": 0.3496143958868895, "grad_norm": 0.0252685546875, "learning_rate": 8.793690568899215e-05, "loss": 0.26, "step": 408 }, { "epoch": 0.350471293916024, "grad_norm": 0.029052734375, "learning_rate": 8.609218455224893e-05, "loss": 0.2625, "step": 409 }, { "epoch": 0.3513281919451585, "grad_norm": 0.019287109375, "learning_rate": 8.426519384872733e-05, "loss": 0.2581, "step": 410 }, { "epoch": 0.35218508997429304, "grad_norm": 0.02490234375, "learning_rate": 8.24560118406285e-05, "loss": 0.2629, "step": 411 }, { "epoch": 0.35304198800342756, "grad_norm": 0.020263671875, "learning_rate": 8.066471602728804e-05, "loss": 0.2522, "step": 412 }, { "epoch": 0.35389888603256214, "grad_norm": 0.0245361328125, "learning_rate": 7.889138314185678e-05, "loss": 0.2648, "step": 413 }, { "epoch": 0.35475578406169667, "grad_norm": 0.0191650390625, "learning_rate": 7.71360891480134e-05, "loss": 0.2633, "step": 414 }, { "epoch": 0.3556126820908312, "grad_norm": 0.02099609375, "learning_rate": 7.53989092367106e-05, "loss": 0.2681, "step": 415 }, { "epoch": 0.3564695801199657, "grad_norm": 0.0294189453125, "learning_rate": 7.367991782295391e-05, "loss": 0.2681, "step": 416 }, { "epoch": 0.35732647814910024, "grad_norm": 0.020751953125, "learning_rate": 7.197918854261431e-05, "loss": 0.2531, "step": 417 }, { "epoch": 0.3581833761782348, "grad_norm": 0.021728515625, "learning_rate": 7.029679424927366e-05, "loss": 0.2607, "step": 418 }, { "epoch": 0.35904027420736934, "grad_norm": 0.029052734375, "learning_rate": 6.863280701110408e-05, "loss": 0.2623, "step": 419 }, { "epoch": 0.35989717223650386, "grad_norm": 0.0211181640625, "learning_rate": 6.698729810778065e-05, "loss": 0.2641, "step": 420 }, { "epoch": 0.3607540702656384, "grad_norm": 0.019775390625, "learning_rate": 6.536033802742814e-05, "loss": 0.2809, "step": 421 }, { "epoch": 0.3616109682947729, "grad_norm": 0.0181884765625, "learning_rate": 6.375199646360142e-05, "loss": 0.2679, "step": 422 }, { "epoch": 0.36246786632390743, "grad_norm": 0.056396484375, "learning_rate": 6.21623423123001e-05, "loss": 0.3452, "step": 423 }, { "epoch": 0.363324764353042, "grad_norm": 0.0211181640625, "learning_rate": 6.059144366901737e-05, "loss": 0.2508, "step": 424 }, { "epoch": 0.36418166238217653, "grad_norm": 0.01806640625, "learning_rate": 5.903936782582253e-05, "loss": 0.2516, "step": 425 }, { "epoch": 0.36503856041131105, "grad_norm": 0.0203857421875, "learning_rate": 5.750618126847912e-05, "loss": 0.2633, "step": 426 }, { "epoch": 0.3658954584404456, "grad_norm": 0.0186767578125, "learning_rate": 5.599194967359639e-05, "loss": 0.263, "step": 427 }, { "epoch": 0.3667523564695801, "grad_norm": 0.0257568359375, "learning_rate": 5.449673790581611e-05, "loss": 0.2754, "step": 428 }, { "epoch": 0.3676092544987147, "grad_norm": 0.023193359375, "learning_rate": 5.3020610015033946e-05, "loss": 0.2628, "step": 429 }, { "epoch": 0.3684661525278492, "grad_norm": 0.0223388671875, "learning_rate": 5.1563629233655876e-05, "loss": 0.2775, "step": 430 }, { "epoch": 0.3693230505569837, "grad_norm": 0.0213623046875, "learning_rate": 5.0125857973889355e-05, "loss": 0.2529, "step": 431 }, { "epoch": 0.37017994858611825, "grad_norm": 0.0189208984375, "learning_rate": 4.87073578250698e-05, "loss": 0.2672, "step": 432 }, { "epoch": 0.37103684661525277, "grad_norm": 0.023193359375, "learning_rate": 4.730818955102234e-05, "loss": 0.2576, "step": 433 }, { "epoch": 0.3718937446443873, "grad_norm": 0.027587890625, "learning_rate": 4.592841308745932e-05, "loss": 0.2575, "step": 434 }, { "epoch": 0.37275064267352187, "grad_norm": 0.025146484375, "learning_rate": 4.456808753941205e-05, "loss": 0.257, "step": 435 }, { "epoch": 0.3736075407026564, "grad_norm": 0.0201416015625, "learning_rate": 4.322727117869951e-05, "loss": 0.2661, "step": 436 }, { "epoch": 0.3744644387317909, "grad_norm": 0.0302734375, "learning_rate": 4.190602144143207e-05, "loss": 0.278, "step": 437 }, { "epoch": 0.37532133676092544, "grad_norm": 0.0250244140625, "learning_rate": 4.06043949255509e-05, "loss": 0.2695, "step": 438 }, { "epoch": 0.37617823479005996, "grad_norm": 0.0216064453125, "learning_rate": 3.932244738840379e-05, "loss": 0.2559, "step": 439 }, { "epoch": 0.37703513281919454, "grad_norm": 0.020751953125, "learning_rate": 3.806023374435663e-05, "loss": 0.2721, "step": 440 }, { "epoch": 0.37789203084832906, "grad_norm": 0.025146484375, "learning_rate": 3.681780806244095e-05, "loss": 0.2479, "step": 441 }, { "epoch": 0.3787489288774636, "grad_norm": 0.0223388671875, "learning_rate": 3.559522356403788e-05, "loss": 0.2686, "step": 442 }, { "epoch": 0.3796058269065981, "grad_norm": 0.018798828125, "learning_rate": 3.439253262059822e-05, "loss": 0.2404, "step": 443 }, { "epoch": 0.38046272493573263, "grad_norm": 0.021240234375, "learning_rate": 3.3209786751399184e-05, "loss": 0.2702, "step": 444 }, { "epoch": 0.38131962296486716, "grad_norm": 0.0194091796875, "learning_rate": 3.2047036621337236e-05, "loss": 0.2568, "step": 445 }, { "epoch": 0.38217652099400173, "grad_norm": 0.022216796875, "learning_rate": 3.0904332038757974e-05, "loss": 0.2586, "step": 446 }, { "epoch": 0.38303341902313626, "grad_norm": 0.0205078125, "learning_rate": 2.9781721953322627e-05, "loss": 0.2557, "step": 447 }, { "epoch": 0.3838903170522708, "grad_norm": 0.018798828125, "learning_rate": 2.8679254453910786e-05, "loss": 0.2515, "step": 448 }, { "epoch": 0.3847472150814053, "grad_norm": 0.0186767578125, "learning_rate": 2.7596976766560976e-05, "loss": 0.2532, "step": 449 }, { "epoch": 0.3856041131105398, "grad_norm": 0.0181884765625, "learning_rate": 2.653493525244721e-05, "loss": 0.2555, "step": 450 }, { "epoch": 0.3864610111396744, "grad_norm": 0.0218505859375, "learning_rate": 2.5493175405893076e-05, "loss": 0.2469, "step": 451 }, { "epoch": 0.3873179091688089, "grad_norm": 0.0242919921875, "learning_rate": 2.4471741852423235e-05, "loss": 0.2566, "step": 452 }, { "epoch": 0.38817480719794345, "grad_norm": 0.020751953125, "learning_rate": 2.3470678346851513e-05, "loss": 0.273, "step": 453 }, { "epoch": 0.389031705227078, "grad_norm": 0.01904296875, "learning_rate": 2.2490027771406685e-05, "loss": 0.2599, "step": 454 }, { "epoch": 0.3898886032562125, "grad_norm": 0.021728515625, "learning_rate": 2.152983213389559e-05, "loss": 0.2591, "step": 455 }, { "epoch": 0.390745501285347, "grad_norm": 0.01953125, "learning_rate": 2.0590132565903473e-05, "loss": 0.2733, "step": 456 }, { "epoch": 0.3916023993144816, "grad_norm": 0.019287109375, "learning_rate": 1.9670969321032406e-05, "loss": 0.2603, "step": 457 }, { "epoch": 0.3924592973436161, "grad_norm": 0.0233154296875, "learning_rate": 1.8772381773176416e-05, "loss": 0.2568, "step": 458 }, { "epoch": 0.39331619537275064, "grad_norm": 0.022216796875, "learning_rate": 1.7894408414835363e-05, "loss": 0.2858, "step": 459 }, { "epoch": 0.39417309340188517, "grad_norm": 0.0198974609375, "learning_rate": 1.70370868554659e-05, "loss": 0.2589, "step": 460 }, { "epoch": 0.3950299914310197, "grad_norm": 0.01904296875, "learning_rate": 1.620045381987012e-05, "loss": 0.2503, "step": 461 }, { "epoch": 0.39588688946015427, "grad_norm": 0.0205078125, "learning_rate": 1.538454514662285e-05, "loss": 0.2695, "step": 462 }, { "epoch": 0.3967437874892888, "grad_norm": 0.0211181640625, "learning_rate": 1.4589395786535953e-05, "loss": 0.2616, "step": 463 }, { "epoch": 0.3976006855184233, "grad_norm": 0.019287109375, "learning_rate": 1.3815039801161721e-05, "loss": 0.2542, "step": 464 }, { "epoch": 0.39845758354755784, "grad_norm": 0.0244140625, "learning_rate": 1.3061510361333184e-05, "loss": 0.254, "step": 465 }, { "epoch": 0.39931448157669236, "grad_norm": 0.0216064453125, "learning_rate": 1.232883974574367e-05, "loss": 0.2671, "step": 466 }, { "epoch": 0.4001713796058269, "grad_norm": 0.0184326171875, "learning_rate": 1.1617059339563806e-05, "loss": 0.2515, "step": 467 }, { "epoch": 0.40102827763496146, "grad_norm": 0.0201416015625, "learning_rate": 1.0926199633097156e-05, "loss": 0.2528, "step": 468 }, { "epoch": 0.401885175664096, "grad_norm": 0.02001953125, "learning_rate": 1.0256290220474307e-05, "loss": 0.2661, "step": 469 }, { "epoch": 0.4027420736932305, "grad_norm": 0.01953125, "learning_rate": 9.607359798384786e-06, "loss": 0.2616, "step": 470 }, { "epoch": 0.40359897172236503, "grad_norm": 0.0208740234375, "learning_rate": 8.979436164848088e-06, "loss": 0.2668, "step": 471 }, { "epoch": 0.40445586975149955, "grad_norm": 0.0196533203125, "learning_rate": 8.372546218022748e-06, "loss": 0.2446, "step": 472 }, { "epoch": 0.40531276778063413, "grad_norm": 0.0181884765625, "learning_rate": 7.786715955054202e-06, "loss": 0.2594, "step": 473 }, { "epoch": 0.40616966580976865, "grad_norm": 0.019775390625, "learning_rate": 7.221970470961125e-06, "loss": 0.2543, "step": 474 }, { "epoch": 0.4070265638389032, "grad_norm": 0.01904296875, "learning_rate": 6.678333957560512e-06, "loss": 0.267, "step": 475 }, { "epoch": 0.4078834618680377, "grad_norm": 0.02099609375, "learning_rate": 6.15582970243117e-06, "loss": 0.2606, "step": 476 }, { "epoch": 0.4087403598971722, "grad_norm": 0.024169921875, "learning_rate": 5.6544800879163026e-06, "loss": 0.2652, "step": 477 }, { "epoch": 0.40959725792630675, "grad_norm": 0.0201416015625, "learning_rate": 5.174306590164879e-06, "loss": 0.2613, "step": 478 }, { "epoch": 0.4104541559554413, "grad_norm": 0.0223388671875, "learning_rate": 4.715329778211374e-06, "loss": 0.2791, "step": 479 }, { "epoch": 0.41131105398457585, "grad_norm": 0.0194091796875, "learning_rate": 4.277569313094809e-06, "loss": 0.2666, "step": 480 }, { "epoch": 0.41216795201371037, "grad_norm": 0.0213623046875, "learning_rate": 3.861043947016474e-06, "loss": 0.2592, "step": 481 }, { "epoch": 0.4130248500428449, "grad_norm": 0.02294921875, "learning_rate": 3.4657715225368535e-06, "loss": 0.2629, "step": 482 }, { "epoch": 0.4138817480719794, "grad_norm": 0.0211181640625, "learning_rate": 3.09176897181096e-06, "loss": 0.2624, "step": 483 }, { "epoch": 0.414738646101114, "grad_norm": 0.017578125, "learning_rate": 2.739052315863355e-06, "loss": 0.2556, "step": 484 }, { "epoch": 0.4155955441302485, "grad_norm": 0.02099609375, "learning_rate": 2.4076366639015913e-06, "loss": 0.2665, "step": 485 }, { "epoch": 0.41645244215938304, "grad_norm": 0.01904296875, "learning_rate": 2.097536212669171e-06, "loss": 0.2584, "step": 486 }, { "epoch": 0.41730934018851756, "grad_norm": 0.0240478515625, "learning_rate": 1.8087642458373132e-06, "loss": 0.263, "step": 487 }, { "epoch": 0.4181662382176521, "grad_norm": 0.0218505859375, "learning_rate": 1.541333133436018e-06, "loss": 0.2611, "step": 488 }, { "epoch": 0.4190231362467866, "grad_norm": 0.01806640625, "learning_rate": 1.2952543313240472e-06, "loss": 0.255, "step": 489 }, { "epoch": 0.4198800342759212, "grad_norm": 0.0194091796875, "learning_rate": 1.0705383806982606e-06, "loss": 0.2719, "step": 490 }, { "epoch": 0.4207369323050557, "grad_norm": 0.0206298828125, "learning_rate": 8.671949076420882e-07, "loss": 0.2695, "step": 491 }, { "epoch": 0.42159383033419023, "grad_norm": 0.0198974609375, "learning_rate": 6.852326227130834e-07, "loss": 0.2709, "step": 492 }, { "epoch": 0.42245072836332476, "grad_norm": 0.0272216796875, "learning_rate": 5.246593205699424e-07, "loss": 0.2517, "step": 493 }, { "epoch": 0.4233076263924593, "grad_norm": 0.0211181640625, "learning_rate": 3.854818796385495e-07, "loss": 0.2614, "step": 494 }, { "epoch": 0.4241645244215938, "grad_norm": 0.0208740234375, "learning_rate": 2.677062618171577e-07, "loss": 0.2542, "step": 495 }, { "epoch": 0.4250214224507284, "grad_norm": 0.0194091796875, "learning_rate": 1.7133751222137007e-07, "loss": 0.2673, "step": 496 }, { "epoch": 0.4258783204798629, "grad_norm": 0.0223388671875, "learning_rate": 9.637975896759077e-08, "loss": 0.2686, "step": 497 }, { "epoch": 0.4267352185089974, "grad_norm": 0.0184326171875, "learning_rate": 4.283621299649987e-08, "loss": 0.2779, "step": 498 }, { "epoch": 0.42759211653813195, "grad_norm": 0.0191650390625, "learning_rate": 1.0709167935385456e-08, "loss": 0.2736, "step": 499 }, { "epoch": 0.4284490145672665, "grad_norm": 0.0223388671875, "learning_rate": 0.0, "loss": 0.2556, "step": 500 }, { "epoch": 0.4284490145672665, "step": 500, "total_flos": 4.430379024908288e+19, "train_loss": 0.41707064187526705, "train_runtime": 21021.7192, "train_samples_per_second": 0.381, "train_steps_per_second": 0.024 } ], "logging_steps": 1.0, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.430379024908288e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }