{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.4284490145672665,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000856898029134533,
"grad_norm": 0.4453125,
"learning_rate": 5e-05,
"loss": 3.6595,
"step": 1
},
{
"epoch": 0.001713796058269066,
"grad_norm": 0.4296875,
"learning_rate": 0.0001,
"loss": 3.6973,
"step": 2
},
{
"epoch": 0.002570694087403599,
"grad_norm": 0.453125,
"learning_rate": 0.00015,
"loss": 3.6343,
"step": 3
},
{
"epoch": 0.003427592116538132,
"grad_norm": 0.419921875,
"learning_rate": 0.0002,
"loss": 3.3538,
"step": 4
},
{
"epoch": 0.004284490145672665,
"grad_norm": 0.4140625,
"learning_rate": 0.00025,
"loss": 3.2142,
"step": 5
},
{
"epoch": 0.005141388174807198,
"grad_norm": 0.384765625,
"learning_rate": 0.0003,
"loss": 2.9167,
"step": 6
},
{
"epoch": 0.005998286203941731,
"grad_norm": 0.349609375,
"learning_rate": 0.00035,
"loss": 2.7017,
"step": 7
},
{
"epoch": 0.006855184233076264,
"grad_norm": 0.306640625,
"learning_rate": 0.0004,
"loss": 2.4232,
"step": 8
},
{
"epoch": 0.007712082262210797,
"grad_norm": 0.25390625,
"learning_rate": 0.00045000000000000004,
"loss": 2.1348,
"step": 9
},
{
"epoch": 0.00856898029134533,
"grad_norm": 0.24609375,
"learning_rate": 0.0005,
"loss": 2.0355,
"step": 10
},
{
"epoch": 0.009425878320479864,
"grad_norm": 0.28515625,
"learning_rate": 0.00055,
"loss": 2.0537,
"step": 11
},
{
"epoch": 0.010282776349614395,
"grad_norm": 0.322265625,
"learning_rate": 0.0006,
"loss": 2.0642,
"step": 12
},
{
"epoch": 0.011139674378748929,
"grad_norm": 0.3203125,
"learning_rate": 0.0006500000000000001,
"loss": 2.0116,
"step": 13
},
{
"epoch": 0.011996572407883462,
"grad_norm": 0.3125,
"learning_rate": 0.0007,
"loss": 2.0132,
"step": 14
},
{
"epoch": 0.012853470437017995,
"grad_norm": 0.283203125,
"learning_rate": 0.00075,
"loss": 1.9433,
"step": 15
},
{
"epoch": 0.013710368466152529,
"grad_norm": 0.2578125,
"learning_rate": 0.0008,
"loss": 1.774,
"step": 16
},
{
"epoch": 0.01456726649528706,
"grad_norm": 0.236328125,
"learning_rate": 0.00085,
"loss": 1.7933,
"step": 17
},
{
"epoch": 0.015424164524421594,
"grad_norm": 0.240234375,
"learning_rate": 0.0009000000000000001,
"loss": 1.6388,
"step": 18
},
{
"epoch": 0.016281062553556127,
"grad_norm": 0.240234375,
"learning_rate": 0.00095,
"loss": 1.6048,
"step": 19
},
{
"epoch": 0.01713796058269066,
"grad_norm": 0.23046875,
"learning_rate": 0.001,
"loss": 1.5744,
"step": 20
},
{
"epoch": 0.017994858611825194,
"grad_norm": 0.232421875,
"learning_rate": 0.0009999892908320648,
"loss": 1.4603,
"step": 21
},
{
"epoch": 0.018851756640959727,
"grad_norm": 0.201171875,
"learning_rate": 0.0009999571637870036,
"loss": 1.4607,
"step": 22
},
{
"epoch": 0.01970865467009426,
"grad_norm": 0.2021484375,
"learning_rate": 0.0009999036202410325,
"loss": 1.3065,
"step": 23
},
{
"epoch": 0.02056555269922879,
"grad_norm": 0.185546875,
"learning_rate": 0.0009998286624877785,
"loss": 1.2436,
"step": 24
},
{
"epoch": 0.021422450728363324,
"grad_norm": 0.1787109375,
"learning_rate": 0.0009997322937381828,
"loss": 1.1979,
"step": 25
},
{
"epoch": 0.022279348757497857,
"grad_norm": 0.1767578125,
"learning_rate": 0.0009996145181203615,
"loss": 1.1425,
"step": 26
},
{
"epoch": 0.02313624678663239,
"grad_norm": 0.154296875,
"learning_rate": 0.00099947534067943,
"loss": 1.0843,
"step": 27
},
{
"epoch": 0.023993144815766924,
"grad_norm": 0.1591796875,
"learning_rate": 0.0009993147673772868,
"loss": 1.12,
"step": 28
},
{
"epoch": 0.024850042844901457,
"grad_norm": 0.154296875,
"learning_rate": 0.000999132805092358,
"loss": 1.0019,
"step": 29
},
{
"epoch": 0.02570694087403599,
"grad_norm": 0.142578125,
"learning_rate": 0.0009989294616193018,
"loss": 1.0082,
"step": 30
},
{
"epoch": 0.026563838903170524,
"grad_norm": 0.1376953125,
"learning_rate": 0.000998704745668676,
"loss": 0.9653,
"step": 31
},
{
"epoch": 0.027420736932305057,
"grad_norm": 0.1337890625,
"learning_rate": 0.000998458666866564,
"loss": 0.9137,
"step": 32
},
{
"epoch": 0.028277634961439587,
"grad_norm": 0.130859375,
"learning_rate": 0.0009981912357541628,
"loss": 0.8958,
"step": 33
},
{
"epoch": 0.02913453299057412,
"grad_norm": 0.125,
"learning_rate": 0.0009979024637873308,
"loss": 0.8628,
"step": 34
},
{
"epoch": 0.029991431019708654,
"grad_norm": 0.12060546875,
"learning_rate": 0.0009975923633360985,
"loss": 0.8233,
"step": 35
},
{
"epoch": 0.030848329048843187,
"grad_norm": 0.1083984375,
"learning_rate": 0.0009972609476841367,
"loss": 0.8045,
"step": 36
},
{
"epoch": 0.031705227077977724,
"grad_norm": 0.1171875,
"learning_rate": 0.0009969082310281891,
"loss": 0.7961,
"step": 37
},
{
"epoch": 0.032562125107112254,
"grad_norm": 0.10791015625,
"learning_rate": 0.0009965342284774632,
"loss": 0.7864,
"step": 38
},
{
"epoch": 0.033419023136246784,
"grad_norm": 0.103515625,
"learning_rate": 0.0009961389560529835,
"loss": 0.7664,
"step": 39
},
{
"epoch": 0.03427592116538132,
"grad_norm": 0.1015625,
"learning_rate": 0.0009957224306869053,
"loss": 0.7723,
"step": 40
},
{
"epoch": 0.03513281919451585,
"grad_norm": 0.09765625,
"learning_rate": 0.0009952846702217886,
"loss": 0.7501,
"step": 41
},
{
"epoch": 0.03598971722365039,
"grad_norm": 0.09228515625,
"learning_rate": 0.0009948256934098352,
"loss": 0.6932,
"step": 42
},
{
"epoch": 0.03684661525278492,
"grad_norm": 0.09130859375,
"learning_rate": 0.0009943455199120836,
"loss": 0.675,
"step": 43
},
{
"epoch": 0.037703513281919454,
"grad_norm": 0.09033203125,
"learning_rate": 0.0009938441702975688,
"loss": 0.6838,
"step": 44
},
{
"epoch": 0.038560411311053984,
"grad_norm": 0.08935546875,
"learning_rate": 0.0009933216660424397,
"loss": 0.6546,
"step": 45
},
{
"epoch": 0.03941730934018852,
"grad_norm": 0.08349609375,
"learning_rate": 0.0009927780295290389,
"loss": 0.6443,
"step": 46
},
{
"epoch": 0.04027420736932305,
"grad_norm": 0.0791015625,
"learning_rate": 0.0009922132840449458,
"loss": 0.6705,
"step": 47
},
{
"epoch": 0.04113110539845758,
"grad_norm": 0.08251953125,
"learning_rate": 0.0009916274537819774,
"loss": 0.6176,
"step": 48
},
{
"epoch": 0.04198800342759212,
"grad_norm": 0.07568359375,
"learning_rate": 0.000991020563835152,
"loss": 0.683,
"step": 49
},
{
"epoch": 0.04284490145672665,
"grad_norm": 0.07861328125,
"learning_rate": 0.0009903926402016153,
"loss": 0.5799,
"step": 50
},
{
"epoch": 0.043701799485861184,
"grad_norm": 0.07470703125,
"learning_rate": 0.0009897437097795257,
"loss": 0.6293,
"step": 51
},
{
"epoch": 0.044558697514995714,
"grad_norm": 0.0693359375,
"learning_rate": 0.0009890738003669028,
"loss": 0.5864,
"step": 52
},
{
"epoch": 0.04541559554413025,
"grad_norm": 0.0673828125,
"learning_rate": 0.0009883829406604362,
"loss": 0.5672,
"step": 53
},
{
"epoch": 0.04627249357326478,
"grad_norm": 0.06884765625,
"learning_rate": 0.0009876711602542563,
"loss": 0.607,
"step": 54
},
{
"epoch": 0.04712939160239932,
"grad_norm": 0.08251953125,
"learning_rate": 0.0009869384896386668,
"loss": 0.6006,
"step": 55
},
{
"epoch": 0.04798628963153385,
"grad_norm": 0.0625,
"learning_rate": 0.0009861849601988384,
"loss": 0.536,
"step": 56
},
{
"epoch": 0.04884318766066838,
"grad_norm": 0.05810546875,
"learning_rate": 0.0009854106042134641,
"loss": 0.5153,
"step": 57
},
{
"epoch": 0.049700085689802914,
"grad_norm": 0.0615234375,
"learning_rate": 0.0009846154548533773,
"loss": 0.5317,
"step": 58
},
{
"epoch": 0.050556983718937444,
"grad_norm": 0.07861328125,
"learning_rate": 0.0009837995461801298,
"loss": 0.5354,
"step": 59
},
{
"epoch": 0.05141388174807198,
"grad_norm": 0.06005859375,
"learning_rate": 0.0009829629131445341,
"loss": 0.5109,
"step": 60
},
{
"epoch": 0.05227077977720651,
"grad_norm": 0.0634765625,
"learning_rate": 0.0009821055915851646,
"loss": 0.5122,
"step": 61
},
{
"epoch": 0.05312767780634105,
"grad_norm": 0.0634765625,
"learning_rate": 0.0009812276182268236,
"loss": 0.5057,
"step": 62
},
{
"epoch": 0.05398457583547558,
"grad_norm": 0.058349609375,
"learning_rate": 0.0009803290306789677,
"loss": 0.4955,
"step": 63
},
{
"epoch": 0.054841473864610114,
"grad_norm": 0.056396484375,
"learning_rate": 0.0009794098674340967,
"loss": 0.4997,
"step": 64
},
{
"epoch": 0.055698371893744644,
"grad_norm": 0.054931640625,
"learning_rate": 0.0009784701678661044,
"loss": 0.4673,
"step": 65
},
{
"epoch": 0.056555269922879174,
"grad_norm": 0.058837890625,
"learning_rate": 0.0009775099722285933,
"loss": 0.4822,
"step": 66
},
{
"epoch": 0.05741216795201371,
"grad_norm": 0.060546875,
"learning_rate": 0.0009765293216531485,
"loss": 0.4716,
"step": 67
},
{
"epoch": 0.05826906598114824,
"grad_norm": 0.05078125,
"learning_rate": 0.0009755282581475768,
"loss": 0.463,
"step": 68
},
{
"epoch": 0.05912596401028278,
"grad_norm": 0.052734375,
"learning_rate": 0.000974506824594107,
"loss": 0.461,
"step": 69
},
{
"epoch": 0.05998286203941731,
"grad_norm": 0.06396484375,
"learning_rate": 0.0009734650647475529,
"loss": 0.4503,
"step": 70
},
{
"epoch": 0.060839760068551844,
"grad_norm": 0.0478515625,
"learning_rate": 0.0009724030232334391,
"loss": 0.4586,
"step": 71
},
{
"epoch": 0.061696658097686374,
"grad_norm": 0.047119140625,
"learning_rate": 0.0009713207455460893,
"loss": 0.4326,
"step": 72
},
{
"epoch": 0.06255355612682091,
"grad_norm": 0.0478515625,
"learning_rate": 0.0009702182780466775,
"loss": 0.4312,
"step": 73
},
{
"epoch": 0.06341045415595545,
"grad_norm": 0.0458984375,
"learning_rate": 0.0009690956679612422,
"loss": 0.4472,
"step": 74
},
{
"epoch": 0.06426735218508997,
"grad_norm": 0.0556640625,
"learning_rate": 0.0009679529633786629,
"loss": 0.4427,
"step": 75
},
{
"epoch": 0.06512425021422451,
"grad_norm": 0.04736328125,
"learning_rate": 0.0009667902132486009,
"loss": 0.4308,
"step": 76
},
{
"epoch": 0.06598114824335904,
"grad_norm": 0.052490234375,
"learning_rate": 0.0009656074673794017,
"loss": 0.431,
"step": 77
},
{
"epoch": 0.06683804627249357,
"grad_norm": 0.0458984375,
"learning_rate": 0.0009644047764359622,
"loss": 0.4219,
"step": 78
},
{
"epoch": 0.0676949443016281,
"grad_norm": 0.0478515625,
"learning_rate": 0.0009631821919375591,
"loss": 0.413,
"step": 79
},
{
"epoch": 0.06855184233076264,
"grad_norm": 0.042724609375,
"learning_rate": 0.0009619397662556434,
"loss": 0.4065,
"step": 80
},
{
"epoch": 0.06940874035989718,
"grad_norm": 0.046875,
"learning_rate": 0.0009606775526115963,
"loss": 0.447,
"step": 81
},
{
"epoch": 0.0702656383890317,
"grad_norm": 0.04638671875,
"learning_rate": 0.0009593956050744492,
"loss": 0.4243,
"step": 82
},
{
"epoch": 0.07112253641816624,
"grad_norm": 0.0439453125,
"learning_rate": 0.0009580939785585681,
"loss": 0.4003,
"step": 83
},
{
"epoch": 0.07197943444730077,
"grad_norm": 0.046630859375,
"learning_rate": 0.0009567727288213005,
"loss": 0.4098,
"step": 84
},
{
"epoch": 0.0728363324764353,
"grad_norm": 0.041015625,
"learning_rate": 0.000955431912460588,
"loss": 0.415,
"step": 85
},
{
"epoch": 0.07369323050556983,
"grad_norm": 0.040771484375,
"learning_rate": 0.0009540715869125407,
"loss": 0.4239,
"step": 86
},
{
"epoch": 0.07455012853470437,
"grad_norm": 0.043701171875,
"learning_rate": 0.0009526918104489777,
"loss": 0.4058,
"step": 87
},
{
"epoch": 0.07540702656383891,
"grad_norm": 0.043701171875,
"learning_rate": 0.0009512926421749304,
"loss": 0.3894,
"step": 88
},
{
"epoch": 0.07626392459297343,
"grad_norm": 0.0390625,
"learning_rate": 0.0009498741420261108,
"loss": 0.389,
"step": 89
},
{
"epoch": 0.07712082262210797,
"grad_norm": 0.04736328125,
"learning_rate": 0.0009484363707663442,
"loss": 0.3865,
"step": 90
},
{
"epoch": 0.0779777206512425,
"grad_norm": 0.048828125,
"learning_rate": 0.0009469793899849661,
"loss": 0.3823,
"step": 91
},
{
"epoch": 0.07883461868037704,
"grad_norm": 0.041748046875,
"learning_rate": 0.0009455032620941839,
"loss": 0.3963,
"step": 92
},
{
"epoch": 0.07969151670951156,
"grad_norm": 0.052001953125,
"learning_rate": 0.0009440080503264037,
"loss": 0.382,
"step": 93
},
{
"epoch": 0.0805484147386461,
"grad_norm": 0.04296875,
"learning_rate": 0.0009424938187315209,
"loss": 0.3723,
"step": 94
},
{
"epoch": 0.08140531276778064,
"grad_norm": 0.038818359375,
"learning_rate": 0.0009409606321741775,
"loss": 0.3766,
"step": 95
},
{
"epoch": 0.08226221079691516,
"grad_norm": 0.038330078125,
"learning_rate": 0.0009394085563309827,
"loss": 0.3798,
"step": 96
},
{
"epoch": 0.0831191088260497,
"grad_norm": 0.053466796875,
"learning_rate": 0.0009378376576876999,
"loss": 0.386,
"step": 97
},
{
"epoch": 0.08397600685518423,
"grad_norm": 0.03759765625,
"learning_rate": 0.0009362480035363986,
"loss": 0.4009,
"step": 98
},
{
"epoch": 0.08483290488431877,
"grad_norm": 0.05029296875,
"learning_rate": 0.0009346396619725719,
"loss": 0.3651,
"step": 99
},
{
"epoch": 0.0856898029134533,
"grad_norm": 0.03759765625,
"learning_rate": 0.0009330127018922195,
"loss": 0.3922,
"step": 100
},
{
"epoch": 0.08654670094258783,
"grad_norm": 0.036865234375,
"learning_rate": 0.0009313671929888959,
"loss": 0.3604,
"step": 101
},
{
"epoch": 0.08740359897172237,
"grad_norm": 0.037353515625,
"learning_rate": 0.0009297032057507264,
"loss": 0.3547,
"step": 102
},
{
"epoch": 0.08826049700085689,
"grad_norm": 0.04541015625,
"learning_rate": 0.0009280208114573858,
"loss": 0.3611,
"step": 103
},
{
"epoch": 0.08911739502999143,
"grad_norm": 0.036376953125,
"learning_rate": 0.0009263200821770461,
"loss": 0.3789,
"step": 104
},
{
"epoch": 0.08997429305912596,
"grad_norm": 0.035400390625,
"learning_rate": 0.0009246010907632895,
"loss": 0.3512,
"step": 105
},
{
"epoch": 0.0908311910882605,
"grad_norm": 0.035400390625,
"learning_rate": 0.0009228639108519867,
"loss": 0.3634,
"step": 106
},
{
"epoch": 0.09168808911739502,
"grad_norm": 0.03759765625,
"learning_rate": 0.0009211086168581433,
"loss": 0.3509,
"step": 107
},
{
"epoch": 0.09254498714652956,
"grad_norm": 0.03759765625,
"learning_rate": 0.0009193352839727121,
"loss": 0.3474,
"step": 108
},
{
"epoch": 0.0934018851756641,
"grad_norm": 0.036376953125,
"learning_rate": 0.0009175439881593715,
"loss": 0.3742,
"step": 109
},
{
"epoch": 0.09425878320479864,
"grad_norm": 0.033447265625,
"learning_rate": 0.0009157348061512727,
"loss": 0.3422,
"step": 110
},
{
"epoch": 0.09511568123393316,
"grad_norm": 0.043212890625,
"learning_rate": 0.0009139078154477511,
"loss": 0.3379,
"step": 111
},
{
"epoch": 0.0959725792630677,
"grad_norm": 0.03662109375,
"learning_rate": 0.0009120630943110077,
"loss": 0.3374,
"step": 112
},
{
"epoch": 0.09682947729220223,
"grad_norm": 0.03125,
"learning_rate": 0.0009102007217627568,
"loss": 0.3629,
"step": 113
},
{
"epoch": 0.09768637532133675,
"grad_norm": 0.043701171875,
"learning_rate": 0.0009083207775808396,
"loss": 0.3537,
"step": 114
},
{
"epoch": 0.09854327335047129,
"grad_norm": 0.0439453125,
"learning_rate": 0.0009064233422958076,
"loss": 0.3473,
"step": 115
},
{
"epoch": 0.09940017137960583,
"grad_norm": 0.035888671875,
"learning_rate": 0.0009045084971874737,
"loss": 0.3549,
"step": 116
},
{
"epoch": 0.10025706940874037,
"grad_norm": 0.03271484375,
"learning_rate": 0.0009025763242814291,
"loss": 0.3407,
"step": 117
},
{
"epoch": 0.10111396743787489,
"grad_norm": 0.03271484375,
"learning_rate": 0.0009006269063455304,
"loss": 0.3304,
"step": 118
},
{
"epoch": 0.10197086546700942,
"grad_norm": 0.033935546875,
"learning_rate": 0.0008986603268863536,
"loss": 0.3473,
"step": 119
},
{
"epoch": 0.10282776349614396,
"grad_norm": 0.03857421875,
"learning_rate": 0.0008966766701456176,
"loss": 0.3376,
"step": 120
},
{
"epoch": 0.1036846615252785,
"grad_norm": 0.03466796875,
"learning_rate": 0.000894676021096575,
"loss": 0.3262,
"step": 121
},
{
"epoch": 0.10454155955441302,
"grad_norm": 0.03857421875,
"learning_rate": 0.0008926584654403724,
"loss": 0.3222,
"step": 122
},
{
"epoch": 0.10539845758354756,
"grad_norm": 0.03515625,
"learning_rate": 0.0008906240896023794,
"loss": 0.3278,
"step": 123
},
{
"epoch": 0.1062553556126821,
"grad_norm": 0.035888671875,
"learning_rate": 0.0008885729807284854,
"loss": 0.3251,
"step": 124
},
{
"epoch": 0.10711225364181662,
"grad_norm": 0.033447265625,
"learning_rate": 0.0008865052266813684,
"loss": 0.3267,
"step": 125
},
{
"epoch": 0.10796915167095116,
"grad_norm": 0.034912109375,
"learning_rate": 0.0008844209160367298,
"loss": 0.3176,
"step": 126
},
{
"epoch": 0.10882604970008569,
"grad_norm": 0.040283203125,
"learning_rate": 0.0008823201380795002,
"loss": 0.3374,
"step": 127
},
{
"epoch": 0.10968294772922023,
"grad_norm": 0.032470703125,
"learning_rate": 0.0008802029828000156,
"loss": 0.314,
"step": 128
},
{
"epoch": 0.11053984575835475,
"grad_norm": 0.033935546875,
"learning_rate": 0.0008780695408901613,
"loss": 0.324,
"step": 129
},
{
"epoch": 0.11139674378748929,
"grad_norm": 0.032958984375,
"learning_rate": 0.0008759199037394887,
"loss": 0.3199,
"step": 130
},
{
"epoch": 0.11225364181662383,
"grad_norm": 0.031494140625,
"learning_rate": 0.0008737541634312985,
"loss": 0.3034,
"step": 131
},
{
"epoch": 0.11311053984575835,
"grad_norm": 0.0322265625,
"learning_rate": 0.0008715724127386971,
"loss": 0.3153,
"step": 132
},
{
"epoch": 0.11396743787489289,
"grad_norm": 0.034423828125,
"learning_rate": 0.0008693747451206231,
"loss": 0.3202,
"step": 133
},
{
"epoch": 0.11482433590402742,
"grad_norm": 0.03369140625,
"learning_rate": 0.0008671612547178428,
"loss": 0.3325,
"step": 134
},
{
"epoch": 0.11568123393316196,
"grad_norm": 0.043701171875,
"learning_rate": 0.0008649320363489178,
"loss": 0.3207,
"step": 135
},
{
"epoch": 0.11653813196229648,
"grad_norm": 0.031982421875,
"learning_rate": 0.0008626871855061438,
"loss": 0.3279,
"step": 136
},
{
"epoch": 0.11739502999143102,
"grad_norm": 0.033935546875,
"learning_rate": 0.0008604267983514594,
"loss": 0.3236,
"step": 137
},
{
"epoch": 0.11825192802056556,
"grad_norm": 0.03076171875,
"learning_rate": 0.0008581509717123273,
"loss": 0.315,
"step": 138
},
{
"epoch": 0.11910882604970009,
"grad_norm": 0.032958984375,
"learning_rate": 0.0008558598030775857,
"loss": 0.3124,
"step": 139
},
{
"epoch": 0.11996572407883462,
"grad_norm": 0.04052734375,
"learning_rate": 0.0008535533905932737,
"loss": 0.3064,
"step": 140
},
{
"epoch": 0.12082262210796915,
"grad_norm": 0.031494140625,
"learning_rate": 0.0008512318330584259,
"loss": 0.3055,
"step": 141
},
{
"epoch": 0.12167952013710369,
"grad_norm": 0.03515625,
"learning_rate": 0.0008488952299208401,
"loss": 0.2951,
"step": 142
},
{
"epoch": 0.12253641816623821,
"grad_norm": 0.032958984375,
"learning_rate": 0.000846543681272818,
"loss": 0.3288,
"step": 143
},
{
"epoch": 0.12339331619537275,
"grad_norm": 0.032470703125,
"learning_rate": 0.000844177287846877,
"loss": 0.3015,
"step": 144
},
{
"epoch": 0.12425021422450729,
"grad_norm": 0.033935546875,
"learning_rate": 0.0008417961510114356,
"loss": 0.3013,
"step": 145
},
{
"epoch": 0.12510711225364182,
"grad_norm": 0.0341796875,
"learning_rate": 0.0008394003727664709,
"loss": 0.2914,
"step": 146
},
{
"epoch": 0.12596401028277635,
"grad_norm": 0.0306396484375,
"learning_rate": 0.000836990055739149,
"loss": 0.3018,
"step": 147
},
{
"epoch": 0.1268209083119109,
"grad_norm": 0.039794921875,
"learning_rate": 0.0008345653031794292,
"loss": 0.3074,
"step": 148
},
{
"epoch": 0.12767780634104542,
"grad_norm": 0.03955078125,
"learning_rate": 0.0008321262189556409,
"loss": 0.3094,
"step": 149
},
{
"epoch": 0.12853470437017994,
"grad_norm": 0.0311279296875,
"learning_rate": 0.0008296729075500344,
"loss": 0.2971,
"step": 150
},
{
"epoch": 0.1293916023993145,
"grad_norm": 0.03125,
"learning_rate": 0.0008272054740543053,
"loss": 0.307,
"step": 151
},
{
"epoch": 0.13024850042844902,
"grad_norm": 0.0390625,
"learning_rate": 0.0008247240241650918,
"loss": 0.2955,
"step": 152
},
{
"epoch": 0.13110539845758354,
"grad_norm": 0.029541015625,
"learning_rate": 0.0008222286641794488,
"loss": 0.2955,
"step": 153
},
{
"epoch": 0.1319622964867181,
"grad_norm": 0.0306396484375,
"learning_rate": 0.0008197195009902923,
"loss": 0.2904,
"step": 154
},
{
"epoch": 0.1328191945158526,
"grad_norm": 0.0341796875,
"learning_rate": 0.0008171966420818228,
"loss": 0.3027,
"step": 155
},
{
"epoch": 0.13367609254498714,
"grad_norm": 0.03515625,
"learning_rate": 0.0008146601955249188,
"loss": 0.2864,
"step": 156
},
{
"epoch": 0.13453299057412169,
"grad_norm": 0.0361328125,
"learning_rate": 0.0008121102699725089,
"loss": 0.2965,
"step": 157
},
{
"epoch": 0.1353898886032562,
"grad_norm": 0.041748046875,
"learning_rate": 0.0008095469746549171,
"loss": 0.3123,
"step": 158
},
{
"epoch": 0.13624678663239073,
"grad_norm": 0.03125,
"learning_rate": 0.0008069704193751832,
"loss": 0.2912,
"step": 159
},
{
"epoch": 0.13710368466152528,
"grad_norm": 0.033203125,
"learning_rate": 0.0008043807145043603,
"loss": 0.309,
"step": 160
},
{
"epoch": 0.1379605826906598,
"grad_norm": 0.0517578125,
"learning_rate": 0.0008017779709767858,
"loss": 0.2938,
"step": 161
},
{
"epoch": 0.13881748071979436,
"grad_norm": 0.031982421875,
"learning_rate": 0.0007991623002853296,
"loss": 0.2923,
"step": 162
},
{
"epoch": 0.13967437874892888,
"grad_norm": 0.038330078125,
"learning_rate": 0.0007965338144766185,
"loss": 0.3003,
"step": 163
},
{
"epoch": 0.1405312767780634,
"grad_norm": 0.033447265625,
"learning_rate": 0.0007938926261462366,
"loss": 0.2923,
"step": 164
},
{
"epoch": 0.14138817480719795,
"grad_norm": 0.042236328125,
"learning_rate": 0.0007912388484339011,
"loss": 0.2892,
"step": 165
},
{
"epoch": 0.14224507283633248,
"grad_norm": 0.032958984375,
"learning_rate": 0.0007885725950186169,
"loss": 0.3198,
"step": 166
},
{
"epoch": 0.143101970865467,
"grad_norm": 0.037109375,
"learning_rate": 0.000785893980113806,
"loss": 0.2814,
"step": 167
},
{
"epoch": 0.14395886889460155,
"grad_norm": 0.04833984375,
"learning_rate": 0.0007832031184624164,
"loss": 0.2911,
"step": 168
},
{
"epoch": 0.14481576692373607,
"grad_norm": 0.0341796875,
"learning_rate": 0.000780500125332005,
"loss": 0.2893,
"step": 169
},
{
"epoch": 0.1456726649528706,
"grad_norm": 0.034912109375,
"learning_rate": 0.0007777851165098011,
"loss": 0.2884,
"step": 170
},
{
"epoch": 0.14652956298200515,
"grad_norm": 0.038818359375,
"learning_rate": 0.0007750582082977468,
"loss": 0.3052,
"step": 171
},
{
"epoch": 0.14738646101113967,
"grad_norm": 0.0419921875,
"learning_rate": 0.0007723195175075137,
"loss": 0.2833,
"step": 172
},
{
"epoch": 0.14824335904027422,
"grad_norm": 0.040283203125,
"learning_rate": 0.0007695691614555002,
"loss": 0.2795,
"step": 173
},
{
"epoch": 0.14910025706940874,
"grad_norm": 0.038330078125,
"learning_rate": 0.0007668072579578058,
"loss": 0.3104,
"step": 174
},
{
"epoch": 0.14995715509854327,
"grad_norm": 0.0322265625,
"learning_rate": 0.000764033925325184,
"loss": 0.2931,
"step": 175
},
{
"epoch": 0.15081405312767782,
"grad_norm": 0.0361328125,
"learning_rate": 0.0007612492823579744,
"loss": 0.2867,
"step": 176
},
{
"epoch": 0.15167095115681234,
"grad_norm": 0.0291748046875,
"learning_rate": 0.0007584534483410137,
"loss": 0.3051,
"step": 177
},
{
"epoch": 0.15252784918594686,
"grad_norm": 0.0341796875,
"learning_rate": 0.0007556465430385259,
"loss": 0.2852,
"step": 178
},
{
"epoch": 0.1533847472150814,
"grad_norm": 0.036865234375,
"learning_rate": 0.0007528286866889924,
"loss": 0.2919,
"step": 179
},
{
"epoch": 0.15424164524421594,
"grad_norm": 0.0294189453125,
"learning_rate": 0.00075,
"loss": 0.2707,
"step": 180
},
{
"epoch": 0.15509854327335046,
"grad_norm": 0.0277099609375,
"learning_rate": 0.0007471606041430723,
"loss": 0.275,
"step": 181
},
{
"epoch": 0.155955441302485,
"grad_norm": 0.0361328125,
"learning_rate": 0.0007443106207484776,
"loss": 0.2718,
"step": 182
},
{
"epoch": 0.15681233933161953,
"grad_norm": 0.031005859375,
"learning_rate": 0.0007414501719000186,
"loss": 0.2869,
"step": 183
},
{
"epoch": 0.15766923736075408,
"grad_norm": 0.033203125,
"learning_rate": 0.0007385793801298042,
"loss": 0.275,
"step": 184
},
{
"epoch": 0.1585261353898886,
"grad_norm": 0.0277099609375,
"learning_rate": 0.000735698368412999,
"loss": 0.2852,
"step": 185
},
{
"epoch": 0.15938303341902313,
"grad_norm": 0.031005859375,
"learning_rate": 0.0007328072601625557,
"loss": 0.2959,
"step": 186
},
{
"epoch": 0.16023993144815768,
"grad_norm": 0.045654296875,
"learning_rate": 0.00072990617922393,
"loss": 0.2681,
"step": 187
},
{
"epoch": 0.1610968294772922,
"grad_norm": 0.034423828125,
"learning_rate": 0.0007269952498697733,
"loss": 0.2897,
"step": 188
},
{
"epoch": 0.16195372750642673,
"grad_norm": 0.0281982421875,
"learning_rate": 0.0007240745967946113,
"loss": 0.2775,
"step": 189
},
{
"epoch": 0.16281062553556128,
"grad_norm": 0.03564453125,
"learning_rate": 0.0007211443451095007,
"loss": 0.2692,
"step": 190
},
{
"epoch": 0.1636675235646958,
"grad_norm": 0.03125,
"learning_rate": 0.000718204620336671,
"loss": 0.2847,
"step": 191
},
{
"epoch": 0.16452442159383032,
"grad_norm": 0.03662109375,
"learning_rate": 0.0007152555484041476,
"loss": 0.2859,
"step": 192
},
{
"epoch": 0.16538131962296487,
"grad_norm": 0.0299072265625,
"learning_rate": 0.0007122972556403566,
"loss": 0.2784,
"step": 193
},
{
"epoch": 0.1662382176520994,
"grad_norm": 0.029052734375,
"learning_rate": 0.0007093298687687141,
"loss": 0.2801,
"step": 194
},
{
"epoch": 0.16709511568123395,
"grad_norm": 0.027587890625,
"learning_rate": 0.0007063535149021973,
"loss": 0.2787,
"step": 195
},
{
"epoch": 0.16795201371036847,
"grad_norm": 0.03173828125,
"learning_rate": 0.0007033683215379002,
"loss": 0.2796,
"step": 196
},
{
"epoch": 0.168808911739503,
"grad_norm": 0.0291748046875,
"learning_rate": 0.0007003744165515704,
"loss": 0.2817,
"step": 197
},
{
"epoch": 0.16966580976863754,
"grad_norm": 0.037109375,
"learning_rate": 0.0006973719281921336,
"loss": 0.2648,
"step": 198
},
{
"epoch": 0.17052270779777207,
"grad_norm": 0.03173828125,
"learning_rate": 0.0006943609850761978,
"loss": 0.2822,
"step": 199
},
{
"epoch": 0.1713796058269066,
"grad_norm": 0.029541015625,
"learning_rate": 0.000691341716182545,
"loss": 0.2867,
"step": 200
},
{
"epoch": 0.17223650385604114,
"grad_norm": 0.03564453125,
"learning_rate": 0.0006883142508466054,
"loss": 0.2901,
"step": 201
},
{
"epoch": 0.17309340188517566,
"grad_norm": 0.027099609375,
"learning_rate": 0.0006852787187549182,
"loss": 0.2644,
"step": 202
},
{
"epoch": 0.17395029991431019,
"grad_norm": 0.037353515625,
"learning_rate": 0.000682235249939575,
"loss": 0.277,
"step": 203
},
{
"epoch": 0.17480719794344474,
"grad_norm": 0.038818359375,
"learning_rate": 0.0006791839747726501,
"loss": 0.2932,
"step": 204
},
{
"epoch": 0.17566409597257926,
"grad_norm": 0.03466796875,
"learning_rate": 0.0006761250239606168,
"loss": 0.2822,
"step": 205
},
{
"epoch": 0.17652099400171378,
"grad_norm": 0.05322265625,
"learning_rate": 0.0006730585285387465,
"loss": 0.3618,
"step": 206
},
{
"epoch": 0.17737789203084833,
"grad_norm": 0.0289306640625,
"learning_rate": 0.000669984619865497,
"loss": 0.2766,
"step": 207
},
{
"epoch": 0.17823479005998286,
"grad_norm": 0.03759765625,
"learning_rate": 0.0006669034296168854,
"loss": 0.2795,
"step": 208
},
{
"epoch": 0.1790916880891174,
"grad_norm": 0.03369140625,
"learning_rate": 0.0006638150897808468,
"loss": 0.2788,
"step": 209
},
{
"epoch": 0.17994858611825193,
"grad_norm": 0.0322265625,
"learning_rate": 0.0006607197326515808,
"loss": 0.2795,
"step": 210
},
{
"epoch": 0.18080548414738645,
"grad_norm": 0.027099609375,
"learning_rate": 0.0006576174908238849,
"loss": 0.2742,
"step": 211
},
{
"epoch": 0.181662382176521,
"grad_norm": 0.029541015625,
"learning_rate": 0.0006545084971874737,
"loss": 0.2704,
"step": 212
},
{
"epoch": 0.18251928020565553,
"grad_norm": 0.02734375,
"learning_rate": 0.0006513928849212874,
"loss": 0.2725,
"step": 213
},
{
"epoch": 0.18337617823479005,
"grad_norm": 0.0400390625,
"learning_rate": 0.0006482707874877854,
"loss": 0.2742,
"step": 214
},
{
"epoch": 0.1842330762639246,
"grad_norm": 0.0272216796875,
"learning_rate": 0.0006451423386272311,
"loss": 0.268,
"step": 215
},
{
"epoch": 0.18508997429305912,
"grad_norm": 0.0308837890625,
"learning_rate": 0.0006420076723519614,
"loss": 0.2617,
"step": 216
},
{
"epoch": 0.18594687232219365,
"grad_norm": 0.0264892578125,
"learning_rate": 0.0006388669229406462,
"loss": 0.2629,
"step": 217
},
{
"epoch": 0.1868037703513282,
"grad_norm": 0.0286865234375,
"learning_rate": 0.0006357202249325371,
"loss": 0.2812,
"step": 218
},
{
"epoch": 0.18766066838046272,
"grad_norm": 0.02685546875,
"learning_rate": 0.000632567713121704,
"loss": 0.2766,
"step": 219
},
{
"epoch": 0.18851756640959727,
"grad_norm": 0.0291748046875,
"learning_rate": 0.0006294095225512603,
"loss": 0.2816,
"step": 220
},
{
"epoch": 0.1893744644387318,
"grad_norm": 0.027099609375,
"learning_rate": 0.000626245788507579,
"loss": 0.2744,
"step": 221
},
{
"epoch": 0.19023136246786632,
"grad_norm": 0.0281982421875,
"learning_rate": 0.0006230766465144965,
"loss": 0.2777,
"step": 222
},
{
"epoch": 0.19108826049700087,
"grad_norm": 0.0341796875,
"learning_rate": 0.0006199022323275083,
"loss": 0.2632,
"step": 223
},
{
"epoch": 0.1919451585261354,
"grad_norm": 0.0274658203125,
"learning_rate": 0.0006167226819279528,
"loss": 0.2759,
"step": 224
},
{
"epoch": 0.1928020565552699,
"grad_norm": 0.026611328125,
"learning_rate": 0.0006135381315171866,
"loss": 0.2926,
"step": 225
},
{
"epoch": 0.19365895458440446,
"grad_norm": 0.031494140625,
"learning_rate": 0.0006103487175107507,
"loss": 0.2759,
"step": 226
},
{
"epoch": 0.194515852613539,
"grad_norm": 0.0274658203125,
"learning_rate": 0.0006071545765325253,
"loss": 0.2706,
"step": 227
},
{
"epoch": 0.1953727506426735,
"grad_norm": 0.0390625,
"learning_rate": 0.0006039558454088796,
"loss": 0.2816,
"step": 228
},
{
"epoch": 0.19622964867180806,
"grad_norm": 0.0272216796875,
"learning_rate": 0.0006007526611628086,
"loss": 0.2698,
"step": 229
},
{
"epoch": 0.19708654670094258,
"grad_norm": 0.0272216796875,
"learning_rate": 0.0005975451610080642,
"loss": 0.2719,
"step": 230
},
{
"epoch": 0.19794344473007713,
"grad_norm": 0.0361328125,
"learning_rate": 0.0005943334823432777,
"loss": 0.2647,
"step": 231
},
{
"epoch": 0.19880034275921166,
"grad_norm": 0.029052734375,
"learning_rate": 0.0005911177627460738,
"loss": 0.2688,
"step": 232
},
{
"epoch": 0.19965724078834618,
"grad_norm": 0.032470703125,
"learning_rate": 0.0005878981399671774,
"loss": 0.2762,
"step": 233
},
{
"epoch": 0.20051413881748073,
"grad_norm": 0.029541015625,
"learning_rate": 0.0005846747519245122,
"loss": 0.2664,
"step": 234
},
{
"epoch": 0.20137103684661525,
"grad_norm": 0.033203125,
"learning_rate": 0.0005814477366972944,
"loss": 0.2715,
"step": 235
},
{
"epoch": 0.20222793487574978,
"grad_norm": 0.03955078125,
"learning_rate": 0.0005782172325201155,
"loss": 0.2728,
"step": 236
},
{
"epoch": 0.20308483290488433,
"grad_norm": 0.0272216796875,
"learning_rate": 0.0005749833777770225,
"loss": 0.2638,
"step": 237
},
{
"epoch": 0.20394173093401885,
"grad_norm": 0.026611328125,
"learning_rate": 0.0005717463109955896,
"loss": 0.271,
"step": 238
},
{
"epoch": 0.20479862896315337,
"grad_norm": 0.042236328125,
"learning_rate": 0.0005685061708409841,
"loss": 0.2682,
"step": 239
},
{
"epoch": 0.20565552699228792,
"grad_norm": 0.0274658203125,
"learning_rate": 0.000565263096110026,
"loss": 0.2635,
"step": 240
},
{
"epoch": 0.20651242502142245,
"grad_norm": 0.02685546875,
"learning_rate": 0.0005620172257252427,
"loss": 0.2513,
"step": 241
},
{
"epoch": 0.207369323050557,
"grad_norm": 0.02734375,
"learning_rate": 0.0005587686987289189,
"loss": 0.2672,
"step": 242
},
{
"epoch": 0.20822622107969152,
"grad_norm": 0.0311279296875,
"learning_rate": 0.0005555176542771388,
"loss": 0.2777,
"step": 243
},
{
"epoch": 0.20908311910882604,
"grad_norm": 0.0255126953125,
"learning_rate": 0.0005522642316338268,
"loss": 0.2669,
"step": 244
},
{
"epoch": 0.2099400171379606,
"grad_norm": 0.0286865234375,
"learning_rate": 0.0005490085701647804,
"loss": 0.2708,
"step": 245
},
{
"epoch": 0.21079691516709512,
"grad_norm": 0.0260009765625,
"learning_rate": 0.0005457508093317013,
"loss": 0.2727,
"step": 246
},
{
"epoch": 0.21165381319622964,
"grad_norm": 0.02490234375,
"learning_rate": 0.0005424910886862209,
"loss": 0.2751,
"step": 247
},
{
"epoch": 0.2125107112253642,
"grad_norm": 0.023193359375,
"learning_rate": 0.0005392295478639225,
"loss": 0.2649,
"step": 248
},
{
"epoch": 0.2133676092544987,
"grad_norm": 0.023681640625,
"learning_rate": 0.0005359663265783598,
"loss": 0.2647,
"step": 249
},
{
"epoch": 0.21422450728363324,
"grad_norm": 0.0283203125,
"learning_rate": 0.0005327015646150716,
"loss": 0.2594,
"step": 250
},
{
"epoch": 0.2150814053127678,
"grad_norm": 0.026611328125,
"learning_rate": 0.0005294354018255945,
"loss": 0.2944,
"step": 251
},
{
"epoch": 0.2159383033419023,
"grad_norm": 0.0257568359375,
"learning_rate": 0.000526167978121472,
"loss": 0.2886,
"step": 252
},
{
"epoch": 0.21679520137103683,
"grad_norm": 0.0281982421875,
"learning_rate": 0.0005228994334682604,
"loss": 0.2558,
"step": 253
},
{
"epoch": 0.21765209940017138,
"grad_norm": 0.025634765625,
"learning_rate": 0.0005196299078795343,
"loss": 0.2571,
"step": 254
},
{
"epoch": 0.2185089974293059,
"grad_norm": 0.02685546875,
"learning_rate": 0.0005163595414108881,
"loss": 0.2524,
"step": 255
},
{
"epoch": 0.21936589545844046,
"grad_norm": 0.0250244140625,
"learning_rate": 0.0005130884741539367,
"loss": 0.2698,
"step": 256
},
{
"epoch": 0.22022279348757498,
"grad_norm": 0.024658203125,
"learning_rate": 0.0005098168462303141,
"loss": 0.2716,
"step": 257
},
{
"epoch": 0.2210796915167095,
"grad_norm": 0.0238037109375,
"learning_rate": 0.0005065447977856722,
"loss": 0.2605,
"step": 258
},
{
"epoch": 0.22193658954584405,
"grad_norm": 0.025146484375,
"learning_rate": 0.0005032724689836759,
"loss": 0.2584,
"step": 259
},
{
"epoch": 0.22279348757497858,
"grad_norm": 0.025146484375,
"learning_rate": 0.0005,
"loss": 0.2618,
"step": 260
},
{
"epoch": 0.2236503856041131,
"grad_norm": 0.0289306640625,
"learning_rate": 0.0004967275310163241,
"loss": 0.2602,
"step": 261
},
{
"epoch": 0.22450728363324765,
"grad_norm": 0.031494140625,
"learning_rate": 0.0004934552022143279,
"loss": 0.2744,
"step": 262
},
{
"epoch": 0.22536418166238217,
"grad_norm": 0.036865234375,
"learning_rate": 0.0004901831537696859,
"loss": 0.2598,
"step": 263
},
{
"epoch": 0.2262210796915167,
"grad_norm": 0.0264892578125,
"learning_rate": 0.0004869115258460635,
"loss": 0.2629,
"step": 264
},
{
"epoch": 0.22707797772065125,
"grad_norm": 0.0245361328125,
"learning_rate": 0.00048364045858911197,
"loss": 0.2601,
"step": 265
},
{
"epoch": 0.22793487574978577,
"grad_norm": 0.035888671875,
"learning_rate": 0.00048037009212046586,
"loss": 0.261,
"step": 266
},
{
"epoch": 0.22879177377892032,
"grad_norm": 0.03076171875,
"learning_rate": 0.0004771005665317397,
"loss": 0.2531,
"step": 267
},
{
"epoch": 0.22964867180805484,
"grad_norm": 0.0250244140625,
"learning_rate": 0.0004738320218785281,
"loss": 0.2707,
"step": 268
},
{
"epoch": 0.23050556983718937,
"grad_norm": 0.0240478515625,
"learning_rate": 0.00047056459817440544,
"loss": 0.2636,
"step": 269
},
{
"epoch": 0.23136246786632392,
"grad_norm": 0.0264892578125,
"learning_rate": 0.00046729843538492847,
"loss": 0.2606,
"step": 270
},
{
"epoch": 0.23221936589545844,
"grad_norm": 0.0286865234375,
"learning_rate": 0.00046403367342164026,
"loss": 0.257,
"step": 271
},
{
"epoch": 0.23307626392459296,
"grad_norm": 0.028564453125,
"learning_rate": 0.0004607704521360776,
"loss": 0.2646,
"step": 272
},
{
"epoch": 0.23393316195372751,
"grad_norm": 0.028076171875,
"learning_rate": 0.0004575089113137792,
"loss": 0.2735,
"step": 273
},
{
"epoch": 0.23479005998286204,
"grad_norm": 0.0257568359375,
"learning_rate": 0.00045424919066829885,
"loss": 0.272,
"step": 274
},
{
"epoch": 0.23564695801199656,
"grad_norm": 0.02587890625,
"learning_rate": 0.0004509914298352197,
"loss": 0.266,
"step": 275
},
{
"epoch": 0.2365038560411311,
"grad_norm": 0.0242919921875,
"learning_rate": 0.00044773576836617336,
"loss": 0.2607,
"step": 276
},
{
"epoch": 0.23736075407026563,
"grad_norm": 0.0245361328125,
"learning_rate": 0.0004444823457228612,
"loss": 0.2696,
"step": 277
},
{
"epoch": 0.23821765209940018,
"grad_norm": 0.024658203125,
"learning_rate": 0.00044123130127108126,
"loss": 0.2598,
"step": 278
},
{
"epoch": 0.2390745501285347,
"grad_norm": 0.0301513671875,
"learning_rate": 0.0004379827742747575,
"loss": 0.2581,
"step": 279
},
{
"epoch": 0.23993144815766923,
"grad_norm": 0.0252685546875,
"learning_rate": 0.00043473690388997434,
"loss": 0.2652,
"step": 280
},
{
"epoch": 0.24078834618680378,
"grad_norm": 0.024658203125,
"learning_rate": 0.0004314938291590161,
"loss": 0.2635,
"step": 281
},
{
"epoch": 0.2416452442159383,
"grad_norm": 0.0223388671875,
"learning_rate": 0.0004282536890044104,
"loss": 0.2546,
"step": 282
},
{
"epoch": 0.24250214224507283,
"grad_norm": 0.0234375,
"learning_rate": 0.0004250166222229774,
"loss": 0.2533,
"step": 283
},
{
"epoch": 0.24335904027420738,
"grad_norm": 0.026123046875,
"learning_rate": 0.0004217827674798845,
"loss": 0.2712,
"step": 284
},
{
"epoch": 0.2442159383033419,
"grad_norm": 0.024658203125,
"learning_rate": 0.0004185522633027057,
"loss": 0.2658,
"step": 285
},
{
"epoch": 0.24507283633247642,
"grad_norm": 0.02587890625,
"learning_rate": 0.0004153252480754877,
"loss": 0.2588,
"step": 286
},
{
"epoch": 0.24592973436161097,
"grad_norm": 0.029541015625,
"learning_rate": 0.00041210186003282274,
"loss": 0.2671,
"step": 287
},
{
"epoch": 0.2467866323907455,
"grad_norm": 0.0245361328125,
"learning_rate": 0.00040888223725392626,
"loss": 0.2741,
"step": 288
},
{
"epoch": 0.24764353041988005,
"grad_norm": 0.0242919921875,
"learning_rate": 0.00040566651765672245,
"loss": 0.27,
"step": 289
},
{
"epoch": 0.24850042844901457,
"grad_norm": 0.02197265625,
"learning_rate": 0.00040245483899193594,
"loss": 0.2679,
"step": 290
},
{
"epoch": 0.2493573264781491,
"grad_norm": 0.0224609375,
"learning_rate": 0.00039924733883719147,
"loss": 0.2685,
"step": 291
},
{
"epoch": 0.25021422450728364,
"grad_norm": 0.0322265625,
"learning_rate": 0.0003960441545911204,
"loss": 0.2687,
"step": 292
},
{
"epoch": 0.25107112253641817,
"grad_norm": 0.0281982421875,
"learning_rate": 0.0003928454234674747,
"loss": 0.2554,
"step": 293
},
{
"epoch": 0.2519280205655527,
"grad_norm": 0.031494140625,
"learning_rate": 0.0003896512824892495,
"loss": 0.268,
"step": 294
},
{
"epoch": 0.2527849185946872,
"grad_norm": 0.0296630859375,
"learning_rate": 0.00038646186848281344,
"loss": 0.2694,
"step": 295
},
{
"epoch": 0.2536418166238218,
"grad_norm": 0.0283203125,
"learning_rate": 0.00038327731807204744,
"loss": 0.2585,
"step": 296
},
{
"epoch": 0.2544987146529563,
"grad_norm": 0.0341796875,
"learning_rate": 0.0003800977676724919,
"loss": 0.2764,
"step": 297
},
{
"epoch": 0.25535561268209084,
"grad_norm": 0.024658203125,
"learning_rate": 0.0003769233534855035,
"loss": 0.2688,
"step": 298
},
{
"epoch": 0.25621251071122536,
"grad_norm": 0.0277099609375,
"learning_rate": 0.00037375421149242103,
"loss": 0.2561,
"step": 299
},
{
"epoch": 0.2570694087403599,
"grad_norm": 0.0269775390625,
"learning_rate": 0.0003705904774487396,
"loss": 0.2564,
"step": 300
},
{
"epoch": 0.2579263067694944,
"grad_norm": 0.0247802734375,
"learning_rate": 0.0003674322868782959,
"loss": 0.2543,
"step": 301
},
{
"epoch": 0.258783204798629,
"grad_norm": 0.0255126953125,
"learning_rate": 0.0003642797750674629,
"loss": 0.2586,
"step": 302
},
{
"epoch": 0.2596401028277635,
"grad_norm": 0.0228271484375,
"learning_rate": 0.00036113307705935393,
"loss": 0.2624,
"step": 303
},
{
"epoch": 0.26049700085689803,
"grad_norm": 0.02197265625,
"learning_rate": 0.0003579923276480387,
"loss": 0.2658,
"step": 304
},
{
"epoch": 0.26135389888603255,
"grad_norm": 0.0245361328125,
"learning_rate": 0.0003548576613727689,
"loss": 0.2793,
"step": 305
},
{
"epoch": 0.2622107969151671,
"grad_norm": 0.031494140625,
"learning_rate": 0.0003517292125122146,
"loss": 0.2605,
"step": 306
},
{
"epoch": 0.26306769494430166,
"grad_norm": 0.0341796875,
"learning_rate": 0.0003486071150787128,
"loss": 0.2654,
"step": 307
},
{
"epoch": 0.2639245929734362,
"grad_norm": 0.0260009765625,
"learning_rate": 0.00034549150281252633,
"loss": 0.2711,
"step": 308
},
{
"epoch": 0.2647814910025707,
"grad_norm": 0.0233154296875,
"learning_rate": 0.0003423825091761153,
"loss": 0.2686,
"step": 309
},
{
"epoch": 0.2656383890317052,
"grad_norm": 0.0260009765625,
"learning_rate": 0.0003392802673484193,
"loss": 0.2539,
"step": 310
},
{
"epoch": 0.26649528706083975,
"grad_norm": 0.023193359375,
"learning_rate": 0.0003361849102191533,
"loss": 0.2706,
"step": 311
},
{
"epoch": 0.26735218508997427,
"grad_norm": 0.0260009765625,
"learning_rate": 0.00033309657038311456,
"loss": 0.2854,
"step": 312
},
{
"epoch": 0.26820908311910885,
"grad_norm": 0.0235595703125,
"learning_rate": 0.00033001538013450283,
"loss": 0.2714,
"step": 313
},
{
"epoch": 0.26906598114824337,
"grad_norm": 0.0213623046875,
"learning_rate": 0.0003269414714612534,
"loss": 0.2624,
"step": 314
},
{
"epoch": 0.2699228791773779,
"grad_norm": 0.0224609375,
"learning_rate": 0.00032387497603938325,
"loss": 0.264,
"step": 315
},
{
"epoch": 0.2707797772065124,
"grad_norm": 0.022705078125,
"learning_rate": 0.00032081602522734986,
"loss": 0.2611,
"step": 316
},
{
"epoch": 0.27163667523564694,
"grad_norm": 0.0262451171875,
"learning_rate": 0.0003177647500604252,
"loss": 0.2697,
"step": 317
},
{
"epoch": 0.27249357326478146,
"grad_norm": 0.024169921875,
"learning_rate": 0.00031472128124508187,
"loss": 0.2684,
"step": 318
},
{
"epoch": 0.27335047129391604,
"grad_norm": 0.0289306640625,
"learning_rate": 0.00031168574915339467,
"loss": 0.2627,
"step": 319
},
{
"epoch": 0.27420736932305056,
"grad_norm": 0.02197265625,
"learning_rate": 0.0003086582838174551,
"loss": 0.2661,
"step": 320
},
{
"epoch": 0.2750642673521851,
"grad_norm": 0.023193359375,
"learning_rate": 0.0003056390149238022,
"loss": 0.2733,
"step": 321
},
{
"epoch": 0.2759211653813196,
"grad_norm": 0.0240478515625,
"learning_rate": 0.00030262807180786645,
"loss": 0.2619,
"step": 322
},
{
"epoch": 0.27677806341045413,
"grad_norm": 0.0218505859375,
"learning_rate": 0.00029962558344842963,
"loss": 0.2607,
"step": 323
},
{
"epoch": 0.2776349614395887,
"grad_norm": 0.034912109375,
"learning_rate": 0.0002966316784621,
"loss": 0.2662,
"step": 324
},
{
"epoch": 0.27849185946872324,
"grad_norm": 0.029052734375,
"learning_rate": 0.0002936464850978027,
"loss": 0.2581,
"step": 325
},
{
"epoch": 0.27934875749785776,
"grad_norm": 0.0223388671875,
"learning_rate": 0.0002906701312312861,
"loss": 0.2662,
"step": 326
},
{
"epoch": 0.2802056555269923,
"grad_norm": 0.023681640625,
"learning_rate": 0.00028770274435964356,
"loss": 0.26,
"step": 327
},
{
"epoch": 0.2810625535561268,
"grad_norm": 0.0272216796875,
"learning_rate": 0.0002847444515958523,
"loss": 0.2645,
"step": 328
},
{
"epoch": 0.2819194515852613,
"grad_norm": 0.025146484375,
"learning_rate": 0.0002817953796633289,
"loss": 0.2635,
"step": 329
},
{
"epoch": 0.2827763496143959,
"grad_norm": 0.0233154296875,
"learning_rate": 0.00027885565489049947,
"loss": 0.2619,
"step": 330
},
{
"epoch": 0.28363324764353043,
"grad_norm": 0.0213623046875,
"learning_rate": 0.0002759254032053888,
"loss": 0.2668,
"step": 331
},
{
"epoch": 0.28449014567266495,
"grad_norm": 0.0216064453125,
"learning_rate": 0.00027300475013022663,
"loss": 0.2553,
"step": 332
},
{
"epoch": 0.2853470437017995,
"grad_norm": 0.0228271484375,
"learning_rate": 0.0002700938207760701,
"loss": 0.2614,
"step": 333
},
{
"epoch": 0.286203941730934,
"grad_norm": 0.02587890625,
"learning_rate": 0.0002671927398374443,
"loss": 0.2541,
"step": 334
},
{
"epoch": 0.2870608397600686,
"grad_norm": 0.022216796875,
"learning_rate": 0.00026430163158700117,
"loss": 0.256,
"step": 335
},
{
"epoch": 0.2879177377892031,
"grad_norm": 0.024169921875,
"learning_rate": 0.00026142061987019576,
"loss": 0.2675,
"step": 336
},
{
"epoch": 0.2887746358183376,
"grad_norm": 0.0284423828125,
"learning_rate": 0.0002585498280999815,
"loss": 0.2666,
"step": 337
},
{
"epoch": 0.28963153384747214,
"grad_norm": 0.02490234375,
"learning_rate": 0.0002556893792515227,
"loss": 0.2888,
"step": 338
},
{
"epoch": 0.29048843187660667,
"grad_norm": 0.030029296875,
"learning_rate": 0.00025283939585692784,
"loss": 0.2674,
"step": 339
},
{
"epoch": 0.2913453299057412,
"grad_norm": 0.0245361328125,
"learning_rate": 0.0002500000000000001,
"loss": 0.2624,
"step": 340
},
{
"epoch": 0.29220222793487577,
"grad_norm": 0.0205078125,
"learning_rate": 0.0002471713133110078,
"loss": 0.2457,
"step": 341
},
{
"epoch": 0.2930591259640103,
"grad_norm": 0.0244140625,
"learning_rate": 0.00024435345696147403,
"loss": 0.2567,
"step": 342
},
{
"epoch": 0.2939160239931448,
"grad_norm": 0.026123046875,
"learning_rate": 0.00024154655165898627,
"loss": 0.2569,
"step": 343
},
{
"epoch": 0.29477292202227934,
"grad_norm": 0.0244140625,
"learning_rate": 0.00023875071764202561,
"loss": 0.2583,
"step": 344
},
{
"epoch": 0.29562982005141386,
"grad_norm": 0.0230712890625,
"learning_rate": 0.00023596607467481602,
"loss": 0.2549,
"step": 345
},
{
"epoch": 0.29648671808054844,
"grad_norm": 0.030517578125,
"learning_rate": 0.00023319274204219425,
"loss": 0.2647,
"step": 346
},
{
"epoch": 0.29734361610968296,
"grad_norm": 0.0284423828125,
"learning_rate": 0.00023043083854449987,
"loss": 0.2848,
"step": 347
},
{
"epoch": 0.2982005141388175,
"grad_norm": 0.026123046875,
"learning_rate": 0.00022768048249248646,
"loss": 0.2724,
"step": 348
},
{
"epoch": 0.299057412167952,
"grad_norm": 0.027587890625,
"learning_rate": 0.00022494179170225333,
"loss": 0.2684,
"step": 349
},
{
"epoch": 0.29991431019708653,
"grad_norm": 0.0255126953125,
"learning_rate": 0.00022221488349019903,
"loss": 0.2623,
"step": 350
},
{
"epoch": 0.30077120822622105,
"grad_norm": 0.0269775390625,
"learning_rate": 0.0002194998746679952,
"loss": 0.2608,
"step": 351
},
{
"epoch": 0.30162810625535563,
"grad_norm": 0.03662109375,
"learning_rate": 0.0002167968815375837,
"loss": 0.2671,
"step": 352
},
{
"epoch": 0.30248500428449016,
"grad_norm": 0.031494140625,
"learning_rate": 0.00021410601988619394,
"loss": 0.2583,
"step": 353
},
{
"epoch": 0.3033419023136247,
"grad_norm": 0.02490234375,
"learning_rate": 0.00021142740498138323,
"loss": 0.2617,
"step": 354
},
{
"epoch": 0.3041988003427592,
"grad_norm": 0.022216796875,
"learning_rate": 0.000208761151566099,
"loss": 0.2569,
"step": 355
},
{
"epoch": 0.3050556983718937,
"grad_norm": 0.0250244140625,
"learning_rate": 0.00020610737385376348,
"loss": 0.2612,
"step": 356
},
{
"epoch": 0.3059125964010283,
"grad_norm": 0.02783203125,
"learning_rate": 0.00020346618552338148,
"loss": 0.2629,
"step": 357
},
{
"epoch": 0.3067694944301628,
"grad_norm": 0.02197265625,
"learning_rate": 0.00020083769971467047,
"loss": 0.2629,
"step": 358
},
{
"epoch": 0.30762639245929735,
"grad_norm": 0.024169921875,
"learning_rate": 0.0001982220290232143,
"loss": 0.2847,
"step": 359
},
{
"epoch": 0.30848329048843187,
"grad_norm": 0.026611328125,
"learning_rate": 0.00019561928549563967,
"loss": 0.266,
"step": 360
},
{
"epoch": 0.3093401885175664,
"grad_norm": 0.0272216796875,
"learning_rate": 0.00019302958062481672,
"loss": 0.2563,
"step": 361
},
{
"epoch": 0.3101970865467009,
"grad_norm": 0.031005859375,
"learning_rate": 0.00019045302534508295,
"loss": 0.2696,
"step": 362
},
{
"epoch": 0.3110539845758355,
"grad_norm": 0.0234375,
"learning_rate": 0.0001878897300274911,
"loss": 0.2636,
"step": 363
},
{
"epoch": 0.31191088260497,
"grad_norm": 0.0220947265625,
"learning_rate": 0.00018533980447508135,
"loss": 0.258,
"step": 364
},
{
"epoch": 0.31276778063410454,
"grad_norm": 0.022705078125,
"learning_rate": 0.00018280335791817732,
"loss": 0.2534,
"step": 365
},
{
"epoch": 0.31362467866323906,
"grad_norm": 0.0211181640625,
"learning_rate": 0.00018028049900970766,
"loss": 0.2709,
"step": 366
},
{
"epoch": 0.3144815766923736,
"grad_norm": 0.021728515625,
"learning_rate": 0.0001777713358205514,
"loss": 0.2708,
"step": 367
},
{
"epoch": 0.31533847472150817,
"grad_norm": 0.0205078125,
"learning_rate": 0.00017527597583490823,
"loss": 0.2587,
"step": 368
},
{
"epoch": 0.3161953727506427,
"grad_norm": 0.020263671875,
"learning_rate": 0.00017279452594569483,
"loss": 0.2597,
"step": 369
},
{
"epoch": 0.3170522707797772,
"grad_norm": 0.0242919921875,
"learning_rate": 0.00017032709244996558,
"loss": 0.2611,
"step": 370
},
{
"epoch": 0.31790916880891174,
"grad_norm": 0.021484375,
"learning_rate": 0.00016787378104435928,
"loss": 0.2697,
"step": 371
},
{
"epoch": 0.31876606683804626,
"grad_norm": 0.020751953125,
"learning_rate": 0.00016543469682057105,
"loss": 0.2641,
"step": 372
},
{
"epoch": 0.3196229648671808,
"grad_norm": 0.022216796875,
"learning_rate": 0.00016300994426085103,
"loss": 0.2658,
"step": 373
},
{
"epoch": 0.32047986289631536,
"grad_norm": 0.0213623046875,
"learning_rate": 0.0001605996272335291,
"loss": 0.2641,
"step": 374
},
{
"epoch": 0.3213367609254499,
"grad_norm": 0.0184326171875,
"learning_rate": 0.00015820384898856434,
"loss": 0.2651,
"step": 375
},
{
"epoch": 0.3221936589545844,
"grad_norm": 0.0262451171875,
"learning_rate": 0.00015582271215312294,
"loss": 0.2559,
"step": 376
},
{
"epoch": 0.32305055698371893,
"grad_norm": 0.023681640625,
"learning_rate": 0.00015345631872718213,
"loss": 0.2558,
"step": 377
},
{
"epoch": 0.32390745501285345,
"grad_norm": 0.0252685546875,
"learning_rate": 0.00015110477007916002,
"loss": 0.2537,
"step": 378
},
{
"epoch": 0.32476435304198803,
"grad_norm": 0.019775390625,
"learning_rate": 0.0001487681669415742,
"loss": 0.2565,
"step": 379
},
{
"epoch": 0.32562125107112255,
"grad_norm": 0.019775390625,
"learning_rate": 0.00014644660940672628,
"loss": 0.2562,
"step": 380
},
{
"epoch": 0.3264781491002571,
"grad_norm": 0.0301513671875,
"learning_rate": 0.00014414019692241437,
"loss": 0.2644,
"step": 381
},
{
"epoch": 0.3273350471293916,
"grad_norm": 0.019287109375,
"learning_rate": 0.00014184902828767287,
"loss": 0.2671,
"step": 382
},
{
"epoch": 0.3281919451585261,
"grad_norm": 0.0262451171875,
"learning_rate": 0.0001395732016485406,
"loss": 0.249,
"step": 383
},
{
"epoch": 0.32904884318766064,
"grad_norm": 0.0198974609375,
"learning_rate": 0.0001373128144938563,
"loss": 0.2558,
"step": 384
},
{
"epoch": 0.3299057412167952,
"grad_norm": 0.021728515625,
"learning_rate": 0.00013506796365108232,
"loss": 0.2693,
"step": 385
},
{
"epoch": 0.33076263924592975,
"grad_norm": 0.021484375,
"learning_rate": 0.00013283874528215734,
"loss": 0.2686,
"step": 386
},
{
"epoch": 0.33161953727506427,
"grad_norm": 0.02587890625,
"learning_rate": 0.00013062525487937698,
"loss": 0.2711,
"step": 387
},
{
"epoch": 0.3324764353041988,
"grad_norm": 0.018798828125,
"learning_rate": 0.00012842758726130281,
"loss": 0.2559,
"step": 388
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.0189208984375,
"learning_rate": 0.00012624583656870153,
"loss": 0.2639,
"step": 389
},
{
"epoch": 0.3341902313624679,
"grad_norm": 0.0196533203125,
"learning_rate": 0.00012408009626051135,
"loss": 0.2681,
"step": 390
},
{
"epoch": 0.3350471293916024,
"grad_norm": 0.02001953125,
"learning_rate": 0.00012193045910983863,
"loss": 0.2629,
"step": 391
},
{
"epoch": 0.33590402742073694,
"grad_norm": 0.019775390625,
"learning_rate": 0.00011979701719998454,
"loss": 0.2671,
"step": 392
},
{
"epoch": 0.33676092544987146,
"grad_norm": 0.0240478515625,
"learning_rate": 0.00011767986192049984,
"loss": 0.2651,
"step": 393
},
{
"epoch": 0.337617823479006,
"grad_norm": 0.0201416015625,
"learning_rate": 0.00011557908396327027,
"loss": 0.2646,
"step": 394
},
{
"epoch": 0.3384747215081405,
"grad_norm": 0.0257568359375,
"learning_rate": 0.00011349477331863151,
"loss": 0.2723,
"step": 395
},
{
"epoch": 0.3393316195372751,
"grad_norm": 0.023681640625,
"learning_rate": 0.00011142701927151455,
"loss": 0.2775,
"step": 396
},
{
"epoch": 0.3401885175664096,
"grad_norm": 0.032470703125,
"learning_rate": 0.00010937591039762085,
"loss": 0.265,
"step": 397
},
{
"epoch": 0.34104541559554413,
"grad_norm": 0.0198974609375,
"learning_rate": 0.00010734153455962764,
"loss": 0.2661,
"step": 398
},
{
"epoch": 0.34190231362467866,
"grad_norm": 0.0184326171875,
"learning_rate": 0.00010532397890342504,
"loss": 0.2526,
"step": 399
},
{
"epoch": 0.3427592116538132,
"grad_norm": 0.0181884765625,
"learning_rate": 0.00010332332985438247,
"loss": 0.2583,
"step": 400
},
{
"epoch": 0.34361610968294776,
"grad_norm": 0.0216064453125,
"learning_rate": 0.0001013396731136465,
"loss": 0.2544,
"step": 401
},
{
"epoch": 0.3444730077120823,
"grad_norm": 0.0247802734375,
"learning_rate": 9.937309365446973e-05,
"loss": 0.2796,
"step": 402
},
{
"epoch": 0.3453299057412168,
"grad_norm": 0.0189208984375,
"learning_rate": 9.742367571857092e-05,
"loss": 0.2611,
"step": 403
},
{
"epoch": 0.3461868037703513,
"grad_norm": 0.0225830078125,
"learning_rate": 9.549150281252633e-05,
"loss": 0.2568,
"step": 404
},
{
"epoch": 0.34704370179948585,
"grad_norm": 0.0233154296875,
"learning_rate": 9.357665770419243e-05,
"loss": 0.2661,
"step": 405
},
{
"epoch": 0.34790059982862037,
"grad_norm": 0.0245361328125,
"learning_rate": 9.167922241916055e-05,
"loss": 0.27,
"step": 406
},
{
"epoch": 0.34875749785775495,
"grad_norm": 0.0198974609375,
"learning_rate": 8.979927823724321e-05,
"loss": 0.2665,
"step": 407
},
{
"epoch": 0.3496143958868895,
"grad_norm": 0.0252685546875,
"learning_rate": 8.793690568899215e-05,
"loss": 0.26,
"step": 408
},
{
"epoch": 0.350471293916024,
"grad_norm": 0.029052734375,
"learning_rate": 8.609218455224893e-05,
"loss": 0.2625,
"step": 409
},
{
"epoch": 0.3513281919451585,
"grad_norm": 0.019287109375,
"learning_rate": 8.426519384872733e-05,
"loss": 0.2581,
"step": 410
},
{
"epoch": 0.35218508997429304,
"grad_norm": 0.02490234375,
"learning_rate": 8.24560118406285e-05,
"loss": 0.2629,
"step": 411
},
{
"epoch": 0.35304198800342756,
"grad_norm": 0.020263671875,
"learning_rate": 8.066471602728804e-05,
"loss": 0.2522,
"step": 412
},
{
"epoch": 0.35389888603256214,
"grad_norm": 0.0245361328125,
"learning_rate": 7.889138314185678e-05,
"loss": 0.2648,
"step": 413
},
{
"epoch": 0.35475578406169667,
"grad_norm": 0.0191650390625,
"learning_rate": 7.71360891480134e-05,
"loss": 0.2633,
"step": 414
},
{
"epoch": 0.3556126820908312,
"grad_norm": 0.02099609375,
"learning_rate": 7.53989092367106e-05,
"loss": 0.2681,
"step": 415
},
{
"epoch": 0.3564695801199657,
"grad_norm": 0.0294189453125,
"learning_rate": 7.367991782295391e-05,
"loss": 0.2681,
"step": 416
},
{
"epoch": 0.35732647814910024,
"grad_norm": 0.020751953125,
"learning_rate": 7.197918854261431e-05,
"loss": 0.2531,
"step": 417
},
{
"epoch": 0.3581833761782348,
"grad_norm": 0.021728515625,
"learning_rate": 7.029679424927366e-05,
"loss": 0.2607,
"step": 418
},
{
"epoch": 0.35904027420736934,
"grad_norm": 0.029052734375,
"learning_rate": 6.863280701110408e-05,
"loss": 0.2623,
"step": 419
},
{
"epoch": 0.35989717223650386,
"grad_norm": 0.0211181640625,
"learning_rate": 6.698729810778065e-05,
"loss": 0.2641,
"step": 420
},
{
"epoch": 0.3607540702656384,
"grad_norm": 0.019775390625,
"learning_rate": 6.536033802742814e-05,
"loss": 0.2809,
"step": 421
},
{
"epoch": 0.3616109682947729,
"grad_norm": 0.0181884765625,
"learning_rate": 6.375199646360142e-05,
"loss": 0.2679,
"step": 422
},
{
"epoch": 0.36246786632390743,
"grad_norm": 0.056396484375,
"learning_rate": 6.21623423123001e-05,
"loss": 0.3452,
"step": 423
},
{
"epoch": 0.363324764353042,
"grad_norm": 0.0211181640625,
"learning_rate": 6.059144366901737e-05,
"loss": 0.2508,
"step": 424
},
{
"epoch": 0.36418166238217653,
"grad_norm": 0.01806640625,
"learning_rate": 5.903936782582253e-05,
"loss": 0.2516,
"step": 425
},
{
"epoch": 0.36503856041131105,
"grad_norm": 0.0203857421875,
"learning_rate": 5.750618126847912e-05,
"loss": 0.2633,
"step": 426
},
{
"epoch": 0.3658954584404456,
"grad_norm": 0.0186767578125,
"learning_rate": 5.599194967359639e-05,
"loss": 0.263,
"step": 427
},
{
"epoch": 0.3667523564695801,
"grad_norm": 0.0257568359375,
"learning_rate": 5.449673790581611e-05,
"loss": 0.2754,
"step": 428
},
{
"epoch": 0.3676092544987147,
"grad_norm": 0.023193359375,
"learning_rate": 5.3020610015033946e-05,
"loss": 0.2628,
"step": 429
},
{
"epoch": 0.3684661525278492,
"grad_norm": 0.0223388671875,
"learning_rate": 5.1563629233655876e-05,
"loss": 0.2775,
"step": 430
},
{
"epoch": 0.3693230505569837,
"grad_norm": 0.0213623046875,
"learning_rate": 5.0125857973889355e-05,
"loss": 0.2529,
"step": 431
},
{
"epoch": 0.37017994858611825,
"grad_norm": 0.0189208984375,
"learning_rate": 4.87073578250698e-05,
"loss": 0.2672,
"step": 432
},
{
"epoch": 0.37103684661525277,
"grad_norm": 0.023193359375,
"learning_rate": 4.730818955102234e-05,
"loss": 0.2576,
"step": 433
},
{
"epoch": 0.3718937446443873,
"grad_norm": 0.027587890625,
"learning_rate": 4.592841308745932e-05,
"loss": 0.2575,
"step": 434
},
{
"epoch": 0.37275064267352187,
"grad_norm": 0.025146484375,
"learning_rate": 4.456808753941205e-05,
"loss": 0.257,
"step": 435
},
{
"epoch": 0.3736075407026564,
"grad_norm": 0.0201416015625,
"learning_rate": 4.322727117869951e-05,
"loss": 0.2661,
"step": 436
},
{
"epoch": 0.3744644387317909,
"grad_norm": 0.0302734375,
"learning_rate": 4.190602144143207e-05,
"loss": 0.278,
"step": 437
},
{
"epoch": 0.37532133676092544,
"grad_norm": 0.0250244140625,
"learning_rate": 4.06043949255509e-05,
"loss": 0.2695,
"step": 438
},
{
"epoch": 0.37617823479005996,
"grad_norm": 0.0216064453125,
"learning_rate": 3.932244738840379e-05,
"loss": 0.2559,
"step": 439
},
{
"epoch": 0.37703513281919454,
"grad_norm": 0.020751953125,
"learning_rate": 3.806023374435663e-05,
"loss": 0.2721,
"step": 440
},
{
"epoch": 0.37789203084832906,
"grad_norm": 0.025146484375,
"learning_rate": 3.681780806244095e-05,
"loss": 0.2479,
"step": 441
},
{
"epoch": 0.3787489288774636,
"grad_norm": 0.0223388671875,
"learning_rate": 3.559522356403788e-05,
"loss": 0.2686,
"step": 442
},
{
"epoch": 0.3796058269065981,
"grad_norm": 0.018798828125,
"learning_rate": 3.439253262059822e-05,
"loss": 0.2404,
"step": 443
},
{
"epoch": 0.38046272493573263,
"grad_norm": 0.021240234375,
"learning_rate": 3.3209786751399184e-05,
"loss": 0.2702,
"step": 444
},
{
"epoch": 0.38131962296486716,
"grad_norm": 0.0194091796875,
"learning_rate": 3.2047036621337236e-05,
"loss": 0.2568,
"step": 445
},
{
"epoch": 0.38217652099400173,
"grad_norm": 0.022216796875,
"learning_rate": 3.0904332038757974e-05,
"loss": 0.2586,
"step": 446
},
{
"epoch": 0.38303341902313626,
"grad_norm": 0.0205078125,
"learning_rate": 2.9781721953322627e-05,
"loss": 0.2557,
"step": 447
},
{
"epoch": 0.3838903170522708,
"grad_norm": 0.018798828125,
"learning_rate": 2.8679254453910786e-05,
"loss": 0.2515,
"step": 448
},
{
"epoch": 0.3847472150814053,
"grad_norm": 0.0186767578125,
"learning_rate": 2.7596976766560976e-05,
"loss": 0.2532,
"step": 449
},
{
"epoch": 0.3856041131105398,
"grad_norm": 0.0181884765625,
"learning_rate": 2.653493525244721e-05,
"loss": 0.2555,
"step": 450
},
{
"epoch": 0.3864610111396744,
"grad_norm": 0.0218505859375,
"learning_rate": 2.5493175405893076e-05,
"loss": 0.2469,
"step": 451
},
{
"epoch": 0.3873179091688089,
"grad_norm": 0.0242919921875,
"learning_rate": 2.4471741852423235e-05,
"loss": 0.2566,
"step": 452
},
{
"epoch": 0.38817480719794345,
"grad_norm": 0.020751953125,
"learning_rate": 2.3470678346851513e-05,
"loss": 0.273,
"step": 453
},
{
"epoch": 0.389031705227078,
"grad_norm": 0.01904296875,
"learning_rate": 2.2490027771406685e-05,
"loss": 0.2599,
"step": 454
},
{
"epoch": 0.3898886032562125,
"grad_norm": 0.021728515625,
"learning_rate": 2.152983213389559e-05,
"loss": 0.2591,
"step": 455
},
{
"epoch": 0.390745501285347,
"grad_norm": 0.01953125,
"learning_rate": 2.0590132565903473e-05,
"loss": 0.2733,
"step": 456
},
{
"epoch": 0.3916023993144816,
"grad_norm": 0.019287109375,
"learning_rate": 1.9670969321032406e-05,
"loss": 0.2603,
"step": 457
},
{
"epoch": 0.3924592973436161,
"grad_norm": 0.0233154296875,
"learning_rate": 1.8772381773176416e-05,
"loss": 0.2568,
"step": 458
},
{
"epoch": 0.39331619537275064,
"grad_norm": 0.022216796875,
"learning_rate": 1.7894408414835363e-05,
"loss": 0.2858,
"step": 459
},
{
"epoch": 0.39417309340188517,
"grad_norm": 0.0198974609375,
"learning_rate": 1.70370868554659e-05,
"loss": 0.2589,
"step": 460
},
{
"epoch": 0.3950299914310197,
"grad_norm": 0.01904296875,
"learning_rate": 1.620045381987012e-05,
"loss": 0.2503,
"step": 461
},
{
"epoch": 0.39588688946015427,
"grad_norm": 0.0205078125,
"learning_rate": 1.538454514662285e-05,
"loss": 0.2695,
"step": 462
},
{
"epoch": 0.3967437874892888,
"grad_norm": 0.0211181640625,
"learning_rate": 1.4589395786535953e-05,
"loss": 0.2616,
"step": 463
},
{
"epoch": 0.3976006855184233,
"grad_norm": 0.019287109375,
"learning_rate": 1.3815039801161721e-05,
"loss": 0.2542,
"step": 464
},
{
"epoch": 0.39845758354755784,
"grad_norm": 0.0244140625,
"learning_rate": 1.3061510361333184e-05,
"loss": 0.254,
"step": 465
},
{
"epoch": 0.39931448157669236,
"grad_norm": 0.0216064453125,
"learning_rate": 1.232883974574367e-05,
"loss": 0.2671,
"step": 466
},
{
"epoch": 0.4001713796058269,
"grad_norm": 0.0184326171875,
"learning_rate": 1.1617059339563806e-05,
"loss": 0.2515,
"step": 467
},
{
"epoch": 0.40102827763496146,
"grad_norm": 0.0201416015625,
"learning_rate": 1.0926199633097156e-05,
"loss": 0.2528,
"step": 468
},
{
"epoch": 0.401885175664096,
"grad_norm": 0.02001953125,
"learning_rate": 1.0256290220474307e-05,
"loss": 0.2661,
"step": 469
},
{
"epoch": 0.4027420736932305,
"grad_norm": 0.01953125,
"learning_rate": 9.607359798384786e-06,
"loss": 0.2616,
"step": 470
},
{
"epoch": 0.40359897172236503,
"grad_norm": 0.0208740234375,
"learning_rate": 8.979436164848088e-06,
"loss": 0.2668,
"step": 471
},
{
"epoch": 0.40445586975149955,
"grad_norm": 0.0196533203125,
"learning_rate": 8.372546218022748e-06,
"loss": 0.2446,
"step": 472
},
{
"epoch": 0.40531276778063413,
"grad_norm": 0.0181884765625,
"learning_rate": 7.786715955054202e-06,
"loss": 0.2594,
"step": 473
},
{
"epoch": 0.40616966580976865,
"grad_norm": 0.019775390625,
"learning_rate": 7.221970470961125e-06,
"loss": 0.2543,
"step": 474
},
{
"epoch": 0.4070265638389032,
"grad_norm": 0.01904296875,
"learning_rate": 6.678333957560512e-06,
"loss": 0.267,
"step": 475
},
{
"epoch": 0.4078834618680377,
"grad_norm": 0.02099609375,
"learning_rate": 6.15582970243117e-06,
"loss": 0.2606,
"step": 476
},
{
"epoch": 0.4087403598971722,
"grad_norm": 0.024169921875,
"learning_rate": 5.6544800879163026e-06,
"loss": 0.2652,
"step": 477
},
{
"epoch": 0.40959725792630675,
"grad_norm": 0.0201416015625,
"learning_rate": 5.174306590164879e-06,
"loss": 0.2613,
"step": 478
},
{
"epoch": 0.4104541559554413,
"grad_norm": 0.0223388671875,
"learning_rate": 4.715329778211374e-06,
"loss": 0.2791,
"step": 479
},
{
"epoch": 0.41131105398457585,
"grad_norm": 0.0194091796875,
"learning_rate": 4.277569313094809e-06,
"loss": 0.2666,
"step": 480
},
{
"epoch": 0.41216795201371037,
"grad_norm": 0.0213623046875,
"learning_rate": 3.861043947016474e-06,
"loss": 0.2592,
"step": 481
},
{
"epoch": 0.4130248500428449,
"grad_norm": 0.02294921875,
"learning_rate": 3.4657715225368535e-06,
"loss": 0.2629,
"step": 482
},
{
"epoch": 0.4138817480719794,
"grad_norm": 0.0211181640625,
"learning_rate": 3.09176897181096e-06,
"loss": 0.2624,
"step": 483
},
{
"epoch": 0.414738646101114,
"grad_norm": 0.017578125,
"learning_rate": 2.739052315863355e-06,
"loss": 0.2556,
"step": 484
},
{
"epoch": 0.4155955441302485,
"grad_norm": 0.02099609375,
"learning_rate": 2.4076366639015913e-06,
"loss": 0.2665,
"step": 485
},
{
"epoch": 0.41645244215938304,
"grad_norm": 0.01904296875,
"learning_rate": 2.097536212669171e-06,
"loss": 0.2584,
"step": 486
},
{
"epoch": 0.41730934018851756,
"grad_norm": 0.0240478515625,
"learning_rate": 1.8087642458373132e-06,
"loss": 0.263,
"step": 487
},
{
"epoch": 0.4181662382176521,
"grad_norm": 0.0218505859375,
"learning_rate": 1.541333133436018e-06,
"loss": 0.2611,
"step": 488
},
{
"epoch": 0.4190231362467866,
"grad_norm": 0.01806640625,
"learning_rate": 1.2952543313240472e-06,
"loss": 0.255,
"step": 489
},
{
"epoch": 0.4198800342759212,
"grad_norm": 0.0194091796875,
"learning_rate": 1.0705383806982606e-06,
"loss": 0.2719,
"step": 490
},
{
"epoch": 0.4207369323050557,
"grad_norm": 0.0206298828125,
"learning_rate": 8.671949076420882e-07,
"loss": 0.2695,
"step": 491
},
{
"epoch": 0.42159383033419023,
"grad_norm": 0.0198974609375,
"learning_rate": 6.852326227130834e-07,
"loss": 0.2709,
"step": 492
},
{
"epoch": 0.42245072836332476,
"grad_norm": 0.0272216796875,
"learning_rate": 5.246593205699424e-07,
"loss": 0.2517,
"step": 493
},
{
"epoch": 0.4233076263924593,
"grad_norm": 0.0211181640625,
"learning_rate": 3.854818796385495e-07,
"loss": 0.2614,
"step": 494
},
{
"epoch": 0.4241645244215938,
"grad_norm": 0.0208740234375,
"learning_rate": 2.677062618171577e-07,
"loss": 0.2542,
"step": 495
},
{
"epoch": 0.4250214224507284,
"grad_norm": 0.0194091796875,
"learning_rate": 1.7133751222137007e-07,
"loss": 0.2673,
"step": 496
},
{
"epoch": 0.4258783204798629,
"grad_norm": 0.0223388671875,
"learning_rate": 9.637975896759077e-08,
"loss": 0.2686,
"step": 497
},
{
"epoch": 0.4267352185089974,
"grad_norm": 0.0184326171875,
"learning_rate": 4.283621299649987e-08,
"loss": 0.2779,
"step": 498
},
{
"epoch": 0.42759211653813195,
"grad_norm": 0.0191650390625,
"learning_rate": 1.0709167935385456e-08,
"loss": 0.2736,
"step": 499
},
{
"epoch": 0.4284490145672665,
"grad_norm": 0.0223388671875,
"learning_rate": 0.0,
"loss": 0.2556,
"step": 500
},
{
"epoch": 0.4284490145672665,
"step": 500,
"total_flos": 4.430379024908288e+19,
"train_loss": 0.41707064187526705,
"train_runtime": 21021.7192,
"train_samples_per_second": 0.381,
"train_steps_per_second": 0.024
}
],
"logging_steps": 1.0,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.430379024908288e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}