{
"best_metric": 0.6811222434043884,
"best_model_checkpoint": "outputs/checkpoint-550",
"epoch": 0.3440725680325305,
"eval_steps": 25,
"global_step": 550,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006255864873318737,
"grad_norm": 3.302262783050537,
"learning_rate": 4e-05,
"loss": 1.7639,
"step": 1
},
{
"epoch": 0.0012511729746637473,
"grad_norm": 3.7728819847106934,
"learning_rate": 8e-05,
"loss": 2.3471,
"step": 2
},
{
"epoch": 0.001876759461995621,
"grad_norm": 3.575211763381958,
"learning_rate": 0.00012,
"loss": 1.274,
"step": 3
},
{
"epoch": 0.0025023459493274947,
"grad_norm": 4.3921918869018555,
"learning_rate": 0.00016,
"loss": 1.8361,
"step": 4
},
{
"epoch": 0.0031279324366593683,
"grad_norm": 3.215696096420288,
"learning_rate": 0.0002,
"loss": 2.8766,
"step": 5
},
{
"epoch": 0.003753518923991242,
"grad_norm": 4.060017108917236,
"learning_rate": 0.0001996638655462185,
"loss": 1.4329,
"step": 6
},
{
"epoch": 0.004379105411323116,
"grad_norm": 2.7935523986816406,
"learning_rate": 0.00019932773109243698,
"loss": 1.2844,
"step": 7
},
{
"epoch": 0.005004691898654989,
"grad_norm": 2.312218189239502,
"learning_rate": 0.00019899159663865548,
"loss": 1.8112,
"step": 8
},
{
"epoch": 0.005630278385986863,
"grad_norm": 3.5389914512634277,
"learning_rate": 0.00019865546218487395,
"loss": 2.0504,
"step": 9
},
{
"epoch": 0.006255864873318737,
"grad_norm": 2.913029432296753,
"learning_rate": 0.00019831932773109245,
"loss": 1.9101,
"step": 10
},
{
"epoch": 0.00688145136065061,
"grad_norm": 3.6916606426239014,
"learning_rate": 0.00019798319327731095,
"loss": 1.9899,
"step": 11
},
{
"epoch": 0.007507037847982484,
"grad_norm": 3.002810478210449,
"learning_rate": 0.00019764705882352942,
"loss": 1.3224,
"step": 12
},
{
"epoch": 0.008132624335314358,
"grad_norm": 1.657835602760315,
"learning_rate": 0.00019731092436974792,
"loss": 1.2208,
"step": 13
},
{
"epoch": 0.008758210822646231,
"grad_norm": 2.414161443710327,
"learning_rate": 0.00019697478991596642,
"loss": 1.5375,
"step": 14
},
{
"epoch": 0.009383797309978105,
"grad_norm": 1.9695100784301758,
"learning_rate": 0.00019663865546218486,
"loss": 0.9218,
"step": 15
},
{
"epoch": 0.010009383797309979,
"grad_norm": 3.9755845069885254,
"learning_rate": 0.00019630252100840336,
"loss": 1.3608,
"step": 16
},
{
"epoch": 0.010634970284641852,
"grad_norm": 6.843455791473389,
"learning_rate": 0.00019596638655462186,
"loss": 1.2168,
"step": 17
},
{
"epoch": 0.011260556771973726,
"grad_norm": 3.8736443519592285,
"learning_rate": 0.00019563025210084033,
"loss": 0.7392,
"step": 18
},
{
"epoch": 0.0118861432593056,
"grad_norm": 1.7369539737701416,
"learning_rate": 0.00019529411764705883,
"loss": 1.0495,
"step": 19
},
{
"epoch": 0.012511729746637473,
"grad_norm": 1.1708225011825562,
"learning_rate": 0.0001949579831932773,
"loss": 1.2266,
"step": 20
},
{
"epoch": 0.013137316233969347,
"grad_norm": 1.4693603515625,
"learning_rate": 0.0001946218487394958,
"loss": 1.1364,
"step": 21
},
{
"epoch": 0.01376290272130122,
"grad_norm": 0.8484959602355957,
"learning_rate": 0.0001942857142857143,
"loss": 0.6253,
"step": 22
},
{
"epoch": 0.014388489208633094,
"grad_norm": 2.7237887382507324,
"learning_rate": 0.00019394957983193278,
"loss": 1.2932,
"step": 23
},
{
"epoch": 0.015014075695964968,
"grad_norm": 1.1654947996139526,
"learning_rate": 0.00019361344537815127,
"loss": 0.5659,
"step": 24
},
{
"epoch": 0.01563966218329684,
"grad_norm": 1.7193485498428345,
"learning_rate": 0.00019327731092436975,
"loss": 1.3627,
"step": 25
},
{
"epoch": 0.01563966218329684,
"eval_loss": 1.0974555015563965,
"eval_runtime": 46.8133,
"eval_samples_per_second": 5.469,
"eval_steps_per_second": 2.734,
"step": 25
},
{
"epoch": 0.016265248670628715,
"grad_norm": 2.883988380432129,
"learning_rate": 0.00019294117647058825,
"loss": 0.6257,
"step": 26
},
{
"epoch": 0.01689083515796059,
"grad_norm": 1.4707483053207397,
"learning_rate": 0.00019260504201680674,
"loss": 0.879,
"step": 27
},
{
"epoch": 0.017516421645292463,
"grad_norm": 1.3346422910690308,
"learning_rate": 0.00019226890756302522,
"loss": 1.0058,
"step": 28
},
{
"epoch": 0.018142008132624336,
"grad_norm": 0.5815519094467163,
"learning_rate": 0.00019193277310924372,
"loss": 0.3475,
"step": 29
},
{
"epoch": 0.01876759461995621,
"grad_norm": 0.8800593018531799,
"learning_rate": 0.00019159663865546221,
"loss": 0.5426,
"step": 30
},
{
"epoch": 0.019393181107288084,
"grad_norm": 8.196944236755371,
"learning_rate": 0.0001912605042016807,
"loss": 1.0088,
"step": 31
},
{
"epoch": 0.020018767594619957,
"grad_norm": 3.264193296432495,
"learning_rate": 0.00019092436974789919,
"loss": 0.9319,
"step": 32
},
{
"epoch": 0.02064435408195183,
"grad_norm": 1.1047834157943726,
"learning_rate": 0.00019058823529411766,
"loss": 0.9262,
"step": 33
},
{
"epoch": 0.021269940569283705,
"grad_norm": 1.982783555984497,
"learning_rate": 0.00019025210084033613,
"loss": 1.2904,
"step": 34
},
{
"epoch": 0.021895527056615578,
"grad_norm": 2.6765289306640625,
"learning_rate": 0.00018991596638655463,
"loss": 1.0785,
"step": 35
},
{
"epoch": 0.022521113543947452,
"grad_norm": 4.674818992614746,
"learning_rate": 0.0001895798319327731,
"loss": 0.9822,
"step": 36
},
{
"epoch": 0.023146700031279326,
"grad_norm": 1.6232353448867798,
"learning_rate": 0.0001892436974789916,
"loss": 0.6441,
"step": 37
},
{
"epoch": 0.0237722865186112,
"grad_norm": 2.623237371444702,
"learning_rate": 0.0001889075630252101,
"loss": 0.8874,
"step": 38
},
{
"epoch": 0.024397873005943073,
"grad_norm": 1.4366761445999146,
"learning_rate": 0.00018857142857142857,
"loss": 0.4596,
"step": 39
},
{
"epoch": 0.025023459493274947,
"grad_norm": 1.8809682130813599,
"learning_rate": 0.00018823529411764707,
"loss": 0.887,
"step": 40
},
{
"epoch": 0.02564904598060682,
"grad_norm": 1.081438660621643,
"learning_rate": 0.00018789915966386554,
"loss": 0.4735,
"step": 41
},
{
"epoch": 0.026274632467938694,
"grad_norm": 2.1302649974823,
"learning_rate": 0.00018756302521008404,
"loss": 0.795,
"step": 42
},
{
"epoch": 0.026900218955270568,
"grad_norm": 2.005425453186035,
"learning_rate": 0.00018722689075630254,
"loss": 0.8891,
"step": 43
},
{
"epoch": 0.02752580544260244,
"grad_norm": 1.7256505489349365,
"learning_rate": 0.000186890756302521,
"loss": 0.5993,
"step": 44
},
{
"epoch": 0.028151391929934315,
"grad_norm": 0.927653968334198,
"learning_rate": 0.0001865546218487395,
"loss": 0.6185,
"step": 45
},
{
"epoch": 0.02877697841726619,
"grad_norm": 1.5710850954055786,
"learning_rate": 0.000186218487394958,
"loss": 0.5258,
"step": 46
},
{
"epoch": 0.029402564904598062,
"grad_norm": 1.8794296979904175,
"learning_rate": 0.00018588235294117648,
"loss": 0.8168,
"step": 47
},
{
"epoch": 0.030028151391929936,
"grad_norm": 0.9695333242416382,
"learning_rate": 0.00018554621848739498,
"loss": 0.5458,
"step": 48
},
{
"epoch": 0.03065373787926181,
"grad_norm": 3.7846665382385254,
"learning_rate": 0.00018521008403361345,
"loss": 0.943,
"step": 49
},
{
"epoch": 0.03127932436659368,
"grad_norm": 1.9213052988052368,
"learning_rate": 0.00018487394957983195,
"loss": 0.5069,
"step": 50
},
{
"epoch": 0.03127932436659368,
"eval_loss": 0.9765783548355103,
"eval_runtime": 43.502,
"eval_samples_per_second": 5.885,
"eval_steps_per_second": 2.942,
"step": 50
},
{
"epoch": 0.03190491085392556,
"grad_norm": 2.0580382347106934,
"learning_rate": 0.00018453781512605045,
"loss": 0.9423,
"step": 51
},
{
"epoch": 0.03253049734125743,
"grad_norm": 2.063591957092285,
"learning_rate": 0.0001842016806722689,
"loss": 0.7054,
"step": 52
},
{
"epoch": 0.033156083828589304,
"grad_norm": 1.2656595706939697,
"learning_rate": 0.0001838655462184874,
"loss": 0.401,
"step": 53
},
{
"epoch": 0.03378167031592118,
"grad_norm": 1.2392399311065674,
"learning_rate": 0.0001835294117647059,
"loss": 0.6077,
"step": 54
},
{
"epoch": 0.03440725680325305,
"grad_norm": 0.99504154920578,
"learning_rate": 0.00018319327731092437,
"loss": 0.6313,
"step": 55
},
{
"epoch": 0.035032843290584925,
"grad_norm": 2.0478012561798096,
"learning_rate": 0.00018285714285714286,
"loss": 1.2652,
"step": 56
},
{
"epoch": 0.0356584297779168,
"grad_norm": 0.9636131525039673,
"learning_rate": 0.00018252100840336134,
"loss": 0.7561,
"step": 57
},
{
"epoch": 0.03628401626524867,
"grad_norm": 0.874576210975647,
"learning_rate": 0.00018218487394957984,
"loss": 0.7461,
"step": 58
},
{
"epoch": 0.036909602752580546,
"grad_norm": 1.3745896816253662,
"learning_rate": 0.00018184873949579833,
"loss": 1.2856,
"step": 59
},
{
"epoch": 0.03753518923991242,
"grad_norm": 2.4839162826538086,
"learning_rate": 0.0001815126050420168,
"loss": 1.0574,
"step": 60
},
{
"epoch": 0.038160775727244294,
"grad_norm": 1.2671383619308472,
"learning_rate": 0.0001811764705882353,
"loss": 0.6177,
"step": 61
},
{
"epoch": 0.03878636221457617,
"grad_norm": 1.1862553358078003,
"learning_rate": 0.0001808403361344538,
"loss": 1.1169,
"step": 62
},
{
"epoch": 0.03941194870190804,
"grad_norm": 1.1347297430038452,
"learning_rate": 0.00018050420168067228,
"loss": 1.3303,
"step": 63
},
{
"epoch": 0.040037535189239915,
"grad_norm": 2.1583523750305176,
"learning_rate": 0.00018016806722689078,
"loss": 0.7941,
"step": 64
},
{
"epoch": 0.04066312167657179,
"grad_norm": 1.2432655096054077,
"learning_rate": 0.00017983193277310925,
"loss": 0.7848,
"step": 65
},
{
"epoch": 0.04128870816390366,
"grad_norm": 1.3345468044281006,
"learning_rate": 0.00017949579831932775,
"loss": 0.8953,
"step": 66
},
{
"epoch": 0.041914294651235535,
"grad_norm": 0.6861767768859863,
"learning_rate": 0.00017915966386554625,
"loss": 0.4162,
"step": 67
},
{
"epoch": 0.04253988113856741,
"grad_norm": 0.85309898853302,
"learning_rate": 0.00017882352941176472,
"loss": 0.6606,
"step": 68
},
{
"epoch": 0.04316546762589928,
"grad_norm": 1.0247780084609985,
"learning_rate": 0.00017848739495798322,
"loss": 0.5271,
"step": 69
},
{
"epoch": 0.043791054113231156,
"grad_norm": 1.3019441366195679,
"learning_rate": 0.0001781512605042017,
"loss": 0.5605,
"step": 70
},
{
"epoch": 0.04441664060056303,
"grad_norm": 1.1024900674819946,
"learning_rate": 0.00017781512605042016,
"loss": 0.9303,
"step": 71
},
{
"epoch": 0.045042227087894904,
"grad_norm": 1.079655408859253,
"learning_rate": 0.00017747899159663866,
"loss": 1.0138,
"step": 72
},
{
"epoch": 0.04566781357522678,
"grad_norm": 1.1078468561172485,
"learning_rate": 0.00017714285714285713,
"loss": 0.9861,
"step": 73
},
{
"epoch": 0.04629340006255865,
"grad_norm": 1.8648931980133057,
"learning_rate": 0.00017680672268907563,
"loss": 0.6756,
"step": 74
},
{
"epoch": 0.046918986549890525,
"grad_norm": 0.8588104248046875,
"learning_rate": 0.00017647058823529413,
"loss": 0.4867,
"step": 75
},
{
"epoch": 0.046918986549890525,
"eval_loss": 0.9139823913574219,
"eval_runtime": 43.5635,
"eval_samples_per_second": 5.876,
"eval_steps_per_second": 2.938,
"step": 75
},
{
"epoch": 0.0475445730372224,
"grad_norm": 1.6970480680465698,
"learning_rate": 0.0001761344537815126,
"loss": 0.5523,
"step": 76
},
{
"epoch": 0.04817015952455427,
"grad_norm": 0.8562026023864746,
"learning_rate": 0.0001757983193277311,
"loss": 0.4084,
"step": 77
},
{
"epoch": 0.048795746011886146,
"grad_norm": 0.9487925171852112,
"learning_rate": 0.0001754621848739496,
"loss": 0.6204,
"step": 78
},
{
"epoch": 0.04942133249921802,
"grad_norm": 11.929024696350098,
"learning_rate": 0.00017512605042016807,
"loss": 1.1662,
"step": 79
},
{
"epoch": 0.05004691898654989,
"grad_norm": 1.3468140363693237,
"learning_rate": 0.00017478991596638657,
"loss": 0.8037,
"step": 80
},
{
"epoch": 0.05067250547388177,
"grad_norm": 0.7379503846168518,
"learning_rate": 0.00017445378151260504,
"loss": 0.6564,
"step": 81
},
{
"epoch": 0.05129809196121364,
"grad_norm": 1.0315027236938477,
"learning_rate": 0.00017411764705882354,
"loss": 0.6377,
"step": 82
},
{
"epoch": 0.051923678448545514,
"grad_norm": 0.5900093913078308,
"learning_rate": 0.00017378151260504204,
"loss": 0.5122,
"step": 83
},
{
"epoch": 0.05254926493587739,
"grad_norm": 1.5138239860534668,
"learning_rate": 0.0001734453781512605,
"loss": 0.4769,
"step": 84
},
{
"epoch": 0.05317485142320926,
"grad_norm": 1.016790747642517,
"learning_rate": 0.000173109243697479,
"loss": 0.6654,
"step": 85
},
{
"epoch": 0.053800437910541135,
"grad_norm": 1.1964718103408813,
"learning_rate": 0.00017277310924369748,
"loss": 0.6334,
"step": 86
},
{
"epoch": 0.05442602439787301,
"grad_norm": 1.102842092514038,
"learning_rate": 0.00017243697478991598,
"loss": 0.832,
"step": 87
},
{
"epoch": 0.05505161088520488,
"grad_norm": 6.609305381774902,
"learning_rate": 0.00017210084033613448,
"loss": 0.6112,
"step": 88
},
{
"epoch": 0.055677197372536756,
"grad_norm": 2.6627745628356934,
"learning_rate": 0.00017176470588235293,
"loss": 1.032,
"step": 89
},
{
"epoch": 0.05630278385986863,
"grad_norm": 2.114955425262451,
"learning_rate": 0.00017142857142857143,
"loss": 0.6116,
"step": 90
},
{
"epoch": 0.0569283703472005,
"grad_norm": 1.7707552909851074,
"learning_rate": 0.00017109243697478992,
"loss": 0.4766,
"step": 91
},
{
"epoch": 0.05755395683453238,
"grad_norm": 0.9983264803886414,
"learning_rate": 0.0001707563025210084,
"loss": 0.5397,
"step": 92
},
{
"epoch": 0.05817954332186425,
"grad_norm": 8.190524101257324,
"learning_rate": 0.0001704201680672269,
"loss": 0.9531,
"step": 93
},
{
"epoch": 0.058805129809196124,
"grad_norm": 1.9920661449432373,
"learning_rate": 0.0001700840336134454,
"loss": 1.3801,
"step": 94
},
{
"epoch": 0.059430716296528,
"grad_norm": 0.8791856169700623,
"learning_rate": 0.00016974789915966387,
"loss": 0.6218,
"step": 95
},
{
"epoch": 0.06005630278385987,
"grad_norm": 1.0745537281036377,
"learning_rate": 0.00016941176470588237,
"loss": 0.5578,
"step": 96
},
{
"epoch": 0.060681889271191745,
"grad_norm": 1.4266705513000488,
"learning_rate": 0.00016907563025210084,
"loss": 1.5821,
"step": 97
},
{
"epoch": 0.06130747575852362,
"grad_norm": 1.1001832485198975,
"learning_rate": 0.00016873949579831934,
"loss": 0.5972,
"step": 98
},
{
"epoch": 0.06193306224585549,
"grad_norm": 1.3168463706970215,
"learning_rate": 0.00016840336134453784,
"loss": 0.5794,
"step": 99
},
{
"epoch": 0.06255864873318737,
"grad_norm": 1.0342196226119995,
"learning_rate": 0.0001680672268907563,
"loss": 0.6827,
"step": 100
},
{
"epoch": 0.06255864873318737,
"eval_loss": 0.8885337114334106,
"eval_runtime": 43.4886,
"eval_samples_per_second": 5.887,
"eval_steps_per_second": 2.943,
"step": 100
},
{
"epoch": 0.06318423522051923,
"grad_norm": 2.2497031688690186,
"learning_rate": 0.0001677310924369748,
"loss": 0.6468,
"step": 101
},
{
"epoch": 0.06380982170785111,
"grad_norm": 0.8061516284942627,
"learning_rate": 0.00016739495798319328,
"loss": 0.5388,
"step": 102
},
{
"epoch": 0.06443540819518298,
"grad_norm": 0.6954531669616699,
"learning_rate": 0.00016705882352941178,
"loss": 0.3191,
"step": 103
},
{
"epoch": 0.06506099468251486,
"grad_norm": 1.3721911907196045,
"learning_rate": 0.00016672268907563028,
"loss": 0.9,
"step": 104
},
{
"epoch": 0.06568658116984673,
"grad_norm": 1.084492564201355,
"learning_rate": 0.00016638655462184875,
"loss": 0.6144,
"step": 105
},
{
"epoch": 0.06631216765717861,
"grad_norm": 3.317697525024414,
"learning_rate": 0.00016605042016806725,
"loss": 0.634,
"step": 106
},
{
"epoch": 0.06693775414451048,
"grad_norm": 2.5598530769348145,
"learning_rate": 0.00016571428571428575,
"loss": 0.8931,
"step": 107
},
{
"epoch": 0.06756334063184236,
"grad_norm": 3.6414177417755127,
"learning_rate": 0.0001653781512605042,
"loss": 0.7226,
"step": 108
},
{
"epoch": 0.06818892711917422,
"grad_norm": 2.2443768978118896,
"learning_rate": 0.0001650420168067227,
"loss": 0.8862,
"step": 109
},
{
"epoch": 0.0688145136065061,
"grad_norm": 0.6285691857337952,
"learning_rate": 0.0001647058823529412,
"loss": 0.3766,
"step": 110
},
{
"epoch": 0.06944010009383797,
"grad_norm": 0.6171959042549133,
"learning_rate": 0.00016436974789915966,
"loss": 0.2821,
"step": 111
},
{
"epoch": 0.07006568658116985,
"grad_norm": 1.0057804584503174,
"learning_rate": 0.00016403361344537816,
"loss": 0.6293,
"step": 112
},
{
"epoch": 0.07069127306850172,
"grad_norm": 1.3190034627914429,
"learning_rate": 0.00016369747899159663,
"loss": 0.5547,
"step": 113
},
{
"epoch": 0.0713168595558336,
"grad_norm": 0.518517017364502,
"learning_rate": 0.00016336134453781513,
"loss": 0.1951,
"step": 114
},
{
"epoch": 0.07194244604316546,
"grad_norm": 0.848175048828125,
"learning_rate": 0.00016302521008403363,
"loss": 0.5091,
"step": 115
},
{
"epoch": 0.07256803253049735,
"grad_norm": 0.7387409806251526,
"learning_rate": 0.0001626890756302521,
"loss": 0.3872,
"step": 116
},
{
"epoch": 0.07319361901782921,
"grad_norm": 2.828091859817505,
"learning_rate": 0.0001623529411764706,
"loss": 1.2046,
"step": 117
},
{
"epoch": 0.07381920550516109,
"grad_norm": 1.7653822898864746,
"learning_rate": 0.00016201680672268907,
"loss": 1.8133,
"step": 118
},
{
"epoch": 0.07444479199249296,
"grad_norm": 3.5097360610961914,
"learning_rate": 0.00016168067226890757,
"loss": 0.6837,
"step": 119
},
{
"epoch": 0.07507037847982484,
"grad_norm": 1.3884797096252441,
"learning_rate": 0.00016134453781512607,
"loss": 0.8846,
"step": 120
},
{
"epoch": 0.0756959649671567,
"grad_norm": 22.705190658569336,
"learning_rate": 0.00016100840336134454,
"loss": 0.7281,
"step": 121
},
{
"epoch": 0.07632155145448859,
"grad_norm": 3.1223599910736084,
"learning_rate": 0.00016067226890756304,
"loss": 0.6254,
"step": 122
},
{
"epoch": 0.07694713794182045,
"grad_norm": 0.530583381652832,
"learning_rate": 0.00016033613445378154,
"loss": 0.3292,
"step": 123
},
{
"epoch": 0.07757272442915233,
"grad_norm": 1.4720183610916138,
"learning_rate": 0.00016,
"loss": 0.8192,
"step": 124
},
{
"epoch": 0.0781983109164842,
"grad_norm": 0.6448870301246643,
"learning_rate": 0.0001596638655462185,
"loss": 0.2431,
"step": 125
},
{
"epoch": 0.0781983109164842,
"eval_loss": 0.890012800693512,
"eval_runtime": 43.5059,
"eval_samples_per_second": 5.884,
"eval_steps_per_second": 2.942,
"step": 125
},
{
"epoch": 0.07882389740381608,
"grad_norm": 1.803906798362732,
"learning_rate": 0.00015932773109243698,
"loss": 0.8937,
"step": 126
},
{
"epoch": 0.07944948389114795,
"grad_norm": 2.2447054386138916,
"learning_rate": 0.00015899159663865546,
"loss": 0.6993,
"step": 127
},
{
"epoch": 0.08007507037847983,
"grad_norm": 0.6667381525039673,
"learning_rate": 0.00015865546218487396,
"loss": 0.4266,
"step": 128
},
{
"epoch": 0.0807006568658117,
"grad_norm": 1.1449408531188965,
"learning_rate": 0.00015831932773109243,
"loss": 0.5557,
"step": 129
},
{
"epoch": 0.08132624335314358,
"grad_norm": 1.399849534034729,
"learning_rate": 0.00015798319327731093,
"loss": 0.6761,
"step": 130
},
{
"epoch": 0.08195182984047544,
"grad_norm": 0.745627760887146,
"learning_rate": 0.00015764705882352943,
"loss": 0.5323,
"step": 131
},
{
"epoch": 0.08257741632780732,
"grad_norm": 1.162428379058838,
"learning_rate": 0.0001573109243697479,
"loss": 0.8231,
"step": 132
},
{
"epoch": 0.08320300281513919,
"grad_norm": 1.0329734086990356,
"learning_rate": 0.0001569747899159664,
"loss": 0.6179,
"step": 133
},
{
"epoch": 0.08382858930247107,
"grad_norm": 0.5739912986755371,
"learning_rate": 0.00015663865546218487,
"loss": 0.2515,
"step": 134
},
{
"epoch": 0.08445417578980294,
"grad_norm": 1.2065409421920776,
"learning_rate": 0.00015630252100840337,
"loss": 0.6161,
"step": 135
},
{
"epoch": 0.08507976227713482,
"grad_norm": 1.1025582551956177,
"learning_rate": 0.00015596638655462187,
"loss": 0.5926,
"step": 136
},
{
"epoch": 0.08570534876446669,
"grad_norm": 0.78680020570755,
"learning_rate": 0.00015563025210084034,
"loss": 0.9987,
"step": 137
},
{
"epoch": 0.08633093525179857,
"grad_norm": 0.6232782006263733,
"learning_rate": 0.00015529411764705884,
"loss": 0.4952,
"step": 138
},
{
"epoch": 0.08695652173913043,
"grad_norm": 3.347989559173584,
"learning_rate": 0.00015495798319327734,
"loss": 1.0787,
"step": 139
},
{
"epoch": 0.08758210822646231,
"grad_norm": 0.9020625352859497,
"learning_rate": 0.0001546218487394958,
"loss": 0.354,
"step": 140
},
{
"epoch": 0.08820769471379418,
"grad_norm": 1.8955539464950562,
"learning_rate": 0.0001542857142857143,
"loss": 0.5515,
"step": 141
},
{
"epoch": 0.08883328120112606,
"grad_norm": 5.194116115570068,
"learning_rate": 0.00015394957983193278,
"loss": 0.6843,
"step": 142
},
{
"epoch": 0.08945886768845793,
"grad_norm": 1.4467953443527222,
"learning_rate": 0.00015361344537815128,
"loss": 0.4236,
"step": 143
},
{
"epoch": 0.09008445417578981,
"grad_norm": 0.523921012878418,
"learning_rate": 0.00015327731092436978,
"loss": 0.2165,
"step": 144
},
{
"epoch": 0.09071004066312167,
"grad_norm": 1.653648018836975,
"learning_rate": 0.00015294117647058822,
"loss": 1.0643,
"step": 145
},
{
"epoch": 0.09133562715045355,
"grad_norm": 0.6991509199142456,
"learning_rate": 0.00015260504201680672,
"loss": 0.4398,
"step": 146
},
{
"epoch": 0.09196121363778542,
"grad_norm": 1.3986660242080688,
"learning_rate": 0.00015226890756302522,
"loss": 0.8488,
"step": 147
},
{
"epoch": 0.0925868001251173,
"grad_norm": 1.2424954175949097,
"learning_rate": 0.0001519327731092437,
"loss": 0.9516,
"step": 148
},
{
"epoch": 0.09321238661244917,
"grad_norm": 0.8900560140609741,
"learning_rate": 0.0001515966386554622,
"loss": 0.767,
"step": 149
},
{
"epoch": 0.09383797309978105,
"grad_norm": 40.042503356933594,
"learning_rate": 0.00015126050420168066,
"loss": 0.9691,
"step": 150
},
{
"epoch": 0.09383797309978105,
"eval_loss": 0.8660734295845032,
"eval_runtime": 43.5102,
"eval_samples_per_second": 5.884,
"eval_steps_per_second": 2.942,
"step": 150
},
{
"epoch": 0.09446355958711292,
"grad_norm": 2.816359519958496,
"learning_rate": 0.00015092436974789916,
"loss": 1.4959,
"step": 151
},
{
"epoch": 0.0950891460744448,
"grad_norm": 1.9332157373428345,
"learning_rate": 0.00015058823529411766,
"loss": 0.6786,
"step": 152
},
{
"epoch": 0.09571473256177666,
"grad_norm": 1.2608965635299683,
"learning_rate": 0.00015025210084033613,
"loss": 1.1282,
"step": 153
},
{
"epoch": 0.09634031904910854,
"grad_norm": 1.0167793035507202,
"learning_rate": 0.00014991596638655463,
"loss": 0.4932,
"step": 154
},
{
"epoch": 0.09696590553644041,
"grad_norm": 1.6121408939361572,
"learning_rate": 0.00014957983193277313,
"loss": 0.7193,
"step": 155
},
{
"epoch": 0.09759149202377229,
"grad_norm": 2.4104394912719727,
"learning_rate": 0.0001492436974789916,
"loss": 0.4472,
"step": 156
},
{
"epoch": 0.09821707851110416,
"grad_norm": 1.1095707416534424,
"learning_rate": 0.0001489075630252101,
"loss": 0.7595,
"step": 157
},
{
"epoch": 0.09884266499843604,
"grad_norm": 1.686458945274353,
"learning_rate": 0.00014857142857142857,
"loss": 0.5686,
"step": 158
},
{
"epoch": 0.0994682514857679,
"grad_norm": 3.2238378524780273,
"learning_rate": 0.00014823529411764707,
"loss": 0.4236,
"step": 159
},
{
"epoch": 0.10009383797309979,
"grad_norm": 1.800552248954773,
"learning_rate": 0.00014789915966386557,
"loss": 0.9519,
"step": 160
},
{
"epoch": 0.10071942446043165,
"grad_norm": 0.6441445350646973,
"learning_rate": 0.00014756302521008404,
"loss": 0.4119,
"step": 161
},
{
"epoch": 0.10134501094776353,
"grad_norm": 0.5892903804779053,
"learning_rate": 0.00014722689075630254,
"loss": 0.2956,
"step": 162
},
{
"epoch": 0.1019705974350954,
"grad_norm": 0.8733301758766174,
"learning_rate": 0.00014689075630252101,
"loss": 0.5749,
"step": 163
},
{
"epoch": 0.10259618392242728,
"grad_norm": 1.0460662841796875,
"learning_rate": 0.0001465546218487395,
"loss": 0.8167,
"step": 164
},
{
"epoch": 0.10322177040975915,
"grad_norm": 0.8178017735481262,
"learning_rate": 0.00014621848739495799,
"loss": 0.9027,
"step": 165
},
{
"epoch": 0.10384735689709103,
"grad_norm": 0.5698068737983704,
"learning_rate": 0.00014588235294117646,
"loss": 0.1829,
"step": 166
},
{
"epoch": 0.1044729433844229,
"grad_norm": 1.0011018514633179,
"learning_rate": 0.00014554621848739496,
"loss": 0.8985,
"step": 167
},
{
"epoch": 0.10509852987175478,
"grad_norm": 1.189772367477417,
"learning_rate": 0.00014521008403361346,
"loss": 0.5547,
"step": 168
},
{
"epoch": 0.10572411635908664,
"grad_norm": 0.7990069389343262,
"learning_rate": 0.00014487394957983193,
"loss": 0.6222,
"step": 169
},
{
"epoch": 0.10634970284641852,
"grad_norm": 0.6419771313667297,
"learning_rate": 0.00014453781512605043,
"loss": 0.3225,
"step": 170
},
{
"epoch": 0.10697528933375039,
"grad_norm": 0.8978354930877686,
"learning_rate": 0.00014420168067226893,
"loss": 0.4567,
"step": 171
},
{
"epoch": 0.10760087582108227,
"grad_norm": 0.7193794250488281,
"learning_rate": 0.0001438655462184874,
"loss": 0.4793,
"step": 172
},
{
"epoch": 0.10822646230841414,
"grad_norm": 0.9533759355545044,
"learning_rate": 0.0001435294117647059,
"loss": 1.4397,
"step": 173
},
{
"epoch": 0.10885204879574602,
"grad_norm": 0.48348739743232727,
"learning_rate": 0.00014319327731092437,
"loss": 0.3398,
"step": 174
},
{
"epoch": 0.10947763528307788,
"grad_norm": 0.7699019312858582,
"learning_rate": 0.00014285714285714287,
"loss": 0.7491,
"step": 175
},
{
"epoch": 0.10947763528307788,
"eval_loss": 0.8425782322883606,
"eval_runtime": 43.5013,
"eval_samples_per_second": 5.885,
"eval_steps_per_second": 2.942,
"step": 175
},
{
"epoch": 0.11010322177040976,
"grad_norm": 0.9201186895370483,
"learning_rate": 0.00014252100840336137,
"loss": 0.6919,
"step": 176
},
{
"epoch": 0.11072880825774163,
"grad_norm": 0.8190593123435974,
"learning_rate": 0.00014218487394957984,
"loss": 0.6262,
"step": 177
},
{
"epoch": 0.11135439474507351,
"grad_norm": 0.9715782403945923,
"learning_rate": 0.00014184873949579834,
"loss": 0.8364,
"step": 178
},
{
"epoch": 0.11197998123240538,
"grad_norm": 0.6699782609939575,
"learning_rate": 0.0001415126050420168,
"loss": 0.4898,
"step": 179
},
{
"epoch": 0.11260556771973726,
"grad_norm": 1.8386518955230713,
"learning_rate": 0.0001411764705882353,
"loss": 0.7812,
"step": 180
},
{
"epoch": 0.11323115420706913,
"grad_norm": 0.7240263819694519,
"learning_rate": 0.0001408403361344538,
"loss": 0.5508,
"step": 181
},
{
"epoch": 0.113856740694401,
"grad_norm": 0.6068630814552307,
"learning_rate": 0.00014050420168067225,
"loss": 0.5151,
"step": 182
},
{
"epoch": 0.11448232718173287,
"grad_norm": 1.6705517768859863,
"learning_rate": 0.00014016806722689075,
"loss": 1.2281,
"step": 183
},
{
"epoch": 0.11510791366906475,
"grad_norm": 1.6179956197738647,
"learning_rate": 0.00013983193277310925,
"loss": 0.7365,
"step": 184
},
{
"epoch": 0.11573350015639662,
"grad_norm": 1.5741758346557617,
"learning_rate": 0.00013949579831932772,
"loss": 1.0039,
"step": 185
},
{
"epoch": 0.1163590866437285,
"grad_norm": 0.9270511865615845,
"learning_rate": 0.00013915966386554622,
"loss": 0.5768,
"step": 186
},
{
"epoch": 0.11698467313106037,
"grad_norm": 1.3651914596557617,
"learning_rate": 0.00013882352941176472,
"loss": 0.7715,
"step": 187
},
{
"epoch": 0.11761025961839225,
"grad_norm": 1.4330601692199707,
"learning_rate": 0.0001384873949579832,
"loss": 0.4462,
"step": 188
},
{
"epoch": 0.11823584610572412,
"grad_norm": 0.9181672930717468,
"learning_rate": 0.0001381512605042017,
"loss": 0.3901,
"step": 189
},
{
"epoch": 0.118861432593056,
"grad_norm": 0.5304622650146484,
"learning_rate": 0.00013781512605042016,
"loss": 0.1718,
"step": 190
},
{
"epoch": 0.11948701908038786,
"grad_norm": 0.7475191354751587,
"learning_rate": 0.00013747899159663866,
"loss": 0.3602,
"step": 191
},
{
"epoch": 0.12011260556771974,
"grad_norm": 1.2558002471923828,
"learning_rate": 0.00013714285714285716,
"loss": 0.8558,
"step": 192
},
{
"epoch": 0.12073819205505161,
"grad_norm": 0.9859037399291992,
"learning_rate": 0.00013680672268907563,
"loss": 0.7155,
"step": 193
},
{
"epoch": 0.12136377854238349,
"grad_norm": 0.6028466820716858,
"learning_rate": 0.00013647058823529413,
"loss": 0.9596,
"step": 194
},
{
"epoch": 0.12198936502971536,
"grad_norm": 0.5713469386100769,
"learning_rate": 0.0001361344537815126,
"loss": 0.3442,
"step": 195
},
{
"epoch": 0.12261495151704724,
"grad_norm": 1.0781211853027344,
"learning_rate": 0.0001357983193277311,
"loss": 0.5569,
"step": 196
},
{
"epoch": 0.1232405380043791,
"grad_norm": 0.7850176095962524,
"learning_rate": 0.0001354621848739496,
"loss": 0.5853,
"step": 197
},
{
"epoch": 0.12386612449171099,
"grad_norm": 0.8100555539131165,
"learning_rate": 0.00013512605042016807,
"loss": 0.8285,
"step": 198
},
{
"epoch": 0.12449171097904285,
"grad_norm": 1.106834888458252,
"learning_rate": 0.00013478991596638657,
"loss": 0.9521,
"step": 199
},
{
"epoch": 0.12511729746637473,
"grad_norm": 1.4412230253219604,
"learning_rate": 0.00013445378151260507,
"loss": 0.6478,
"step": 200
},
{
"epoch": 0.12511729746637473,
"eval_loss": 0.8300326466560364,
"eval_runtime": 43.5102,
"eval_samples_per_second": 5.884,
"eval_steps_per_second": 2.942,
"step": 200
},
{
"epoch": 0.1257428839537066,
"grad_norm": 1.7852795124053955,
"learning_rate": 0.00013411764705882352,
"loss": 0.5687,
"step": 201
},
{
"epoch": 0.12636847044103847,
"grad_norm": 2.423583745956421,
"learning_rate": 0.00013378151260504202,
"loss": 0.9082,
"step": 202
},
{
"epoch": 0.12699405692837035,
"grad_norm": 1.538001298904419,
"learning_rate": 0.00013344537815126052,
"loss": 0.7143,
"step": 203
},
{
"epoch": 0.12761964341570223,
"grad_norm": 1.7380592823028564,
"learning_rate": 0.000133109243697479,
"loss": 0.8296,
"step": 204
},
{
"epoch": 0.1282452299030341,
"grad_norm": 0.8279218673706055,
"learning_rate": 0.0001327731092436975,
"loss": 0.6719,
"step": 205
},
{
"epoch": 0.12887081639036596,
"grad_norm": 0.7059926986694336,
"learning_rate": 0.00013243697478991596,
"loss": 0.4785,
"step": 206
},
{
"epoch": 0.12949640287769784,
"grad_norm": 0.6946935653686523,
"learning_rate": 0.00013210084033613446,
"loss": 0.4578,
"step": 207
},
{
"epoch": 0.13012198936502972,
"grad_norm": 0.9800712466239929,
"learning_rate": 0.00013176470588235296,
"loss": 1.4369,
"step": 208
},
{
"epoch": 0.1307475758523616,
"grad_norm": 0.708831787109375,
"learning_rate": 0.00013142857142857143,
"loss": 0.5071,
"step": 209
},
{
"epoch": 0.13137316233969346,
"grad_norm": 1.0098780393600464,
"learning_rate": 0.00013109243697478993,
"loss": 0.9155,
"step": 210
},
{
"epoch": 0.13199874882702534,
"grad_norm": 1.1598243713378906,
"learning_rate": 0.0001307563025210084,
"loss": 0.3757,
"step": 211
},
{
"epoch": 0.13262433531435722,
"grad_norm": 0.7583935260772705,
"learning_rate": 0.0001304201680672269,
"loss": 0.3365,
"step": 212
},
{
"epoch": 0.1332499218016891,
"grad_norm": 1.0866564512252808,
"learning_rate": 0.0001300840336134454,
"loss": 0.6398,
"step": 213
},
{
"epoch": 0.13387550828902095,
"grad_norm": 1.4322006702423096,
"learning_rate": 0.00012974789915966387,
"loss": 0.6427,
"step": 214
},
{
"epoch": 0.13450109477635283,
"grad_norm": 1.600325345993042,
"learning_rate": 0.00012941176470588237,
"loss": 0.6884,
"step": 215
},
{
"epoch": 0.1351266812636847,
"grad_norm": 1.0634167194366455,
"learning_rate": 0.00012907563025210087,
"loss": 1.0343,
"step": 216
},
{
"epoch": 0.13575226775101656,
"grad_norm": 0.9889366626739502,
"learning_rate": 0.00012873949579831934,
"loss": 0.717,
"step": 217
},
{
"epoch": 0.13637785423834844,
"grad_norm": 2.0635392665863037,
"learning_rate": 0.00012840336134453784,
"loss": 0.5965,
"step": 218
},
{
"epoch": 0.13700344072568033,
"grad_norm": 0.8937773704528809,
"learning_rate": 0.0001280672268907563,
"loss": 0.7281,
"step": 219
},
{
"epoch": 0.1376290272130122,
"grad_norm": 0.9768427014350891,
"learning_rate": 0.00012773109243697478,
"loss": 0.5687,
"step": 220
},
{
"epoch": 0.13825461370034406,
"grad_norm": 1.3913767337799072,
"learning_rate": 0.00012739495798319328,
"loss": 0.3984,
"step": 221
},
{
"epoch": 0.13888020018767594,
"grad_norm": 1.4933342933654785,
"learning_rate": 0.00012705882352941175,
"loss": 1.2441,
"step": 222
},
{
"epoch": 0.13950578667500782,
"grad_norm": 1.0846196413040161,
"learning_rate": 0.00012672268907563025,
"loss": 0.9013,
"step": 223
},
{
"epoch": 0.1401313731623397,
"grad_norm": 0.7788563370704651,
"learning_rate": 0.00012638655462184875,
"loss": 0.4674,
"step": 224
},
{
"epoch": 0.14075695964967155,
"grad_norm": 0.7341142296791077,
"learning_rate": 0.00012605042016806722,
"loss": 1.3271,
"step": 225
},
{
"epoch": 0.14075695964967155,
"eval_loss": 0.8179877996444702,
"eval_runtime": 43.5514,
"eval_samples_per_second": 5.878,
"eval_steps_per_second": 2.939,
"step": 225
},
{
"epoch": 0.14138254613700343,
"grad_norm": 6.473598480224609,
"learning_rate": 0.00012571428571428572,
"loss": 0.6219,
"step": 226
},
{
"epoch": 0.14200813262433531,
"grad_norm": 0.9846400022506714,
"learning_rate": 0.0001253781512605042,
"loss": 0.4407,
"step": 227
},
{
"epoch": 0.1426337191116672,
"grad_norm": 0.7880604267120361,
"learning_rate": 0.0001250420168067227,
"loss": 0.3927,
"step": 228
},
{
"epoch": 0.14325930559899905,
"grad_norm": 1.5999399423599243,
"learning_rate": 0.0001247058823529412,
"loss": 0.6917,
"step": 229
},
{
"epoch": 0.14388489208633093,
"grad_norm": 0.8072729110717773,
"learning_rate": 0.00012436974789915966,
"loss": 0.4909,
"step": 230
},
{
"epoch": 0.1445104785736628,
"grad_norm": 2.2560601234436035,
"learning_rate": 0.00012403361344537816,
"loss": 0.3355,
"step": 231
},
{
"epoch": 0.1451360650609947,
"grad_norm": 0.9964832663536072,
"learning_rate": 0.00012369747899159666,
"loss": 0.4436,
"step": 232
},
{
"epoch": 0.14576165154832654,
"grad_norm": 1.1081007719039917,
"learning_rate": 0.00012336134453781513,
"loss": 0.6582,
"step": 233
},
{
"epoch": 0.14638723803565842,
"grad_norm": 0.9722908735275269,
"learning_rate": 0.00012302521008403363,
"loss": 0.7412,
"step": 234
},
{
"epoch": 0.1470128245229903,
"grad_norm": 0.7456592917442322,
"learning_rate": 0.0001226890756302521,
"loss": 0.4303,
"step": 235
},
{
"epoch": 0.14763841101032218,
"grad_norm": 1.0428457260131836,
"learning_rate": 0.0001223529411764706,
"loss": 1.0538,
"step": 236
},
{
"epoch": 0.14826399749765404,
"grad_norm": 0.9209719896316528,
"learning_rate": 0.00012201680672268909,
"loss": 0.5864,
"step": 237
},
{
"epoch": 0.14888958398498592,
"grad_norm": 0.990292489528656,
"learning_rate": 0.00012168067226890756,
"loss": 0.5929,
"step": 238
},
{
"epoch": 0.1495151704723178,
"grad_norm": 0.6086494326591492,
"learning_rate": 0.00012134453781512605,
"loss": 0.4436,
"step": 239
},
{
"epoch": 0.15014075695964968,
"grad_norm": 1.429149866104126,
"learning_rate": 0.00012100840336134453,
"loss": 0.246,
"step": 240
},
{
"epoch": 0.15076634344698153,
"grad_norm": 1.8170491456985474,
"learning_rate": 0.00012067226890756302,
"loss": 0.6574,
"step": 241
},
{
"epoch": 0.1513919299343134,
"grad_norm": 1.1577768325805664,
"learning_rate": 0.00012033613445378152,
"loss": 0.5706,
"step": 242
},
{
"epoch": 0.1520175164216453,
"grad_norm": 0.7442137598991394,
"learning_rate": 0.00012,
"loss": 0.2772,
"step": 243
},
{
"epoch": 0.15264310290897717,
"grad_norm": 1.1375997066497803,
"learning_rate": 0.00011966386554621849,
"loss": 0.397,
"step": 244
},
{
"epoch": 0.15326868939630903,
"grad_norm": 0.8451513648033142,
"learning_rate": 0.00011932773109243697,
"loss": 0.5425,
"step": 245
},
{
"epoch": 0.1538942758836409,
"grad_norm": 0.7176560163497925,
"learning_rate": 0.00011899159663865547,
"loss": 0.4398,
"step": 246
},
{
"epoch": 0.1545198623709728,
"grad_norm": 1.049872875213623,
"learning_rate": 0.00011865546218487396,
"loss": 0.6479,
"step": 247
},
{
"epoch": 0.15514544885830467,
"grad_norm": 0.6093642115592957,
"learning_rate": 0.00011831932773109244,
"loss": 0.6125,
"step": 248
},
{
"epoch": 0.15577103534563652,
"grad_norm": 0.9963379502296448,
"learning_rate": 0.00011798319327731093,
"loss": 0.3768,
"step": 249
},
{
"epoch": 0.1563966218329684,
"grad_norm": 3.4668896198272705,
"learning_rate": 0.00011764705882352942,
"loss": 0.3744,
"step": 250
},
{
"epoch": 0.1563966218329684,
"eval_loss": 0.8456696271896362,
"eval_runtime": 43.5223,
"eval_samples_per_second": 5.882,
"eval_steps_per_second": 2.941,
"step": 250
},
{
"epoch": 0.15702220832030028,
"grad_norm": 0.6826130747795105,
"learning_rate": 0.00011731092436974791,
"loss": 0.4877,
"step": 251
},
{
"epoch": 0.15764779480763216,
"grad_norm": 1.8045300245285034,
"learning_rate": 0.0001169747899159664,
"loss": 0.9699,
"step": 252
},
{
"epoch": 0.15827338129496402,
"grad_norm": 0.7311923503875732,
"learning_rate": 0.00011663865546218489,
"loss": 0.4648,
"step": 253
},
{
"epoch": 0.1588989677822959,
"grad_norm": 1.7481943368911743,
"learning_rate": 0.00011630252100840337,
"loss": 0.8871,
"step": 254
},
{
"epoch": 0.15952455426962778,
"grad_norm": 2.6331326961517334,
"learning_rate": 0.00011596638655462187,
"loss": 0.8109,
"step": 255
},
{
"epoch": 0.16015014075695966,
"grad_norm": 0.899364709854126,
"learning_rate": 0.00011563025210084036,
"loss": 0.5021,
"step": 256
},
{
"epoch": 0.1607757272442915,
"grad_norm": 0.922218918800354,
"learning_rate": 0.00011529411764705881,
"loss": 0.5741,
"step": 257
},
{
"epoch": 0.1614013137316234,
"grad_norm": 5.335756301879883,
"learning_rate": 0.00011495798319327731,
"loss": 0.842,
"step": 258
},
{
"epoch": 0.16202690021895527,
"grad_norm": 0.8632665872573853,
"learning_rate": 0.0001146218487394958,
"loss": 0.4208,
"step": 259
},
{
"epoch": 0.16265248670628715,
"grad_norm": 4.576591968536377,
"learning_rate": 0.00011428571428571428,
"loss": 0.8813,
"step": 260
},
{
"epoch": 0.163278073193619,
"grad_norm": 0.907714307308197,
"learning_rate": 0.00011394957983193277,
"loss": 0.7204,
"step": 261
},
{
"epoch": 0.16390365968095089,
"grad_norm": 0.8328534960746765,
"learning_rate": 0.00011361344537815127,
"loss": 0.7552,
"step": 262
},
{
"epoch": 0.16452924616828277,
"grad_norm": 1.0882028341293335,
"learning_rate": 0.00011327731092436975,
"loss": 0.9079,
"step": 263
},
{
"epoch": 0.16515483265561465,
"grad_norm": 1.0093358755111694,
"learning_rate": 0.00011294117647058824,
"loss": 0.6284,
"step": 264
},
{
"epoch": 0.1657804191429465,
"grad_norm": 0.853907585144043,
"learning_rate": 0.00011260504201680672,
"loss": 0.508,
"step": 265
},
{
"epoch": 0.16640600563027838,
"grad_norm": 1.0016460418701172,
"learning_rate": 0.00011226890756302521,
"loss": 0.597,
"step": 266
},
{
"epoch": 0.16703159211761026,
"grad_norm": 1.0138968229293823,
"learning_rate": 0.00011193277310924371,
"loss": 0.9238,
"step": 267
},
{
"epoch": 0.16765717860494214,
"grad_norm": 1.1728049516677856,
"learning_rate": 0.0001115966386554622,
"loss": 0.9152,
"step": 268
},
{
"epoch": 0.168282765092274,
"grad_norm": 1.2228264808654785,
"learning_rate": 0.00011126050420168068,
"loss": 0.7483,
"step": 269
},
{
"epoch": 0.16890835157960588,
"grad_norm": 0.6260212659835815,
"learning_rate": 0.00011092436974789917,
"loss": 0.5566,
"step": 270
},
{
"epoch": 0.16953393806693776,
"grad_norm": 0.7589625716209412,
"learning_rate": 0.00011058823529411766,
"loss": 0.6242,
"step": 271
},
{
"epoch": 0.17015952455426964,
"grad_norm": 1.1016935110092163,
"learning_rate": 0.00011025210084033615,
"loss": 0.4419,
"step": 272
},
{
"epoch": 0.1707851110416015,
"grad_norm": 0.8092851042747498,
"learning_rate": 0.00010991596638655464,
"loss": 0.5168,
"step": 273
},
{
"epoch": 0.17141069752893337,
"grad_norm": 1.012885332107544,
"learning_rate": 0.00010957983193277312,
"loss": 0.4334,
"step": 274
},
{
"epoch": 0.17203628401626525,
"grad_norm": 2.6073336601257324,
"learning_rate": 0.00010924369747899159,
"loss": 0.5262,
"step": 275
},
{
"epoch": 0.17203628401626525,
"eval_loss": 0.8115787506103516,
"eval_runtime": 43.4931,
"eval_samples_per_second": 5.886,
"eval_steps_per_second": 2.943,
"step": 275
},
{
"epoch": 0.17266187050359713,
"grad_norm": 5.577237606048584,
"learning_rate": 0.00010890756302521008,
"loss": 1.0595,
"step": 276
},
{
"epoch": 0.17328745699092898,
"grad_norm": 1.1434190273284912,
"learning_rate": 0.00010857142857142856,
"loss": 0.4401,
"step": 277
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.951992928981781,
"learning_rate": 0.00010823529411764706,
"loss": 0.4393,
"step": 278
},
{
"epoch": 0.17453862996559275,
"grad_norm": 0.6695138216018677,
"learning_rate": 0.00010789915966386555,
"loss": 0.314,
"step": 279
},
{
"epoch": 0.17516421645292463,
"grad_norm": 0.40990278124809265,
"learning_rate": 0.00010756302521008403,
"loss": 0.192,
"step": 280
},
{
"epoch": 0.17578980294025648,
"grad_norm": 0.9555610418319702,
"learning_rate": 0.00010722689075630252,
"loss": 0.3646,
"step": 281
},
{
"epoch": 0.17641538942758836,
"grad_norm": 0.7370548844337463,
"learning_rate": 0.000106890756302521,
"loss": 0.8997,
"step": 282
},
{
"epoch": 0.17704097591492024,
"grad_norm": 1.0178982019424438,
"learning_rate": 0.0001065546218487395,
"loss": 0.986,
"step": 283
},
{
"epoch": 0.17766656240225212,
"grad_norm": 0.41388389468193054,
"learning_rate": 0.00010621848739495799,
"loss": 0.2069,
"step": 284
},
{
"epoch": 0.17829214888958397,
"grad_norm": 0.7140624523162842,
"learning_rate": 0.00010588235294117647,
"loss": 0.4852,
"step": 285
},
{
"epoch": 0.17891773537691585,
"grad_norm": 0.7758356332778931,
"learning_rate": 0.00010554621848739496,
"loss": 0.3943,
"step": 286
},
{
"epoch": 0.17954332186424773,
"grad_norm": 1.4193260669708252,
"learning_rate": 0.00010521008403361346,
"loss": 0.6412,
"step": 287
},
{
"epoch": 0.18016890835157962,
"grad_norm": 0.7264838814735413,
"learning_rate": 0.00010487394957983194,
"loss": 0.7834,
"step": 288
},
{
"epoch": 0.18079449483891147,
"grad_norm": 2.4300973415374756,
"learning_rate": 0.00010453781512605043,
"loss": 0.7462,
"step": 289
},
{
"epoch": 0.18142008132624335,
"grad_norm": 1.033916711807251,
"learning_rate": 0.00010420168067226892,
"loss": 0.5241,
"step": 290
},
{
"epoch": 0.18204566781357523,
"grad_norm": 0.5583767294883728,
"learning_rate": 0.00010386554621848741,
"loss": 0.7815,
"step": 291
},
{
"epoch": 0.1826712543009071,
"grad_norm": 0.7440481781959534,
"learning_rate": 0.0001035294117647059,
"loss": 0.4674,
"step": 292
},
{
"epoch": 0.18329684078823896,
"grad_norm": 4.230656147003174,
"learning_rate": 0.00010319327731092439,
"loss": 0.5219,
"step": 293
},
{
"epoch": 0.18392242727557084,
"grad_norm": 0.6165269017219543,
"learning_rate": 0.00010285714285714286,
"loss": 0.3274,
"step": 294
},
{
"epoch": 0.18454801376290272,
"grad_norm": 0.5844498872756958,
"learning_rate": 0.00010252100840336134,
"loss": 0.3719,
"step": 295
},
{
"epoch": 0.1851736002502346,
"grad_norm": 0.9936206936836243,
"learning_rate": 0.00010218487394957983,
"loss": 1.0453,
"step": 296
},
{
"epoch": 0.18579918673756646,
"grad_norm": 1.749831199645996,
"learning_rate": 0.00010184873949579831,
"loss": 0.6634,
"step": 297
},
{
"epoch": 0.18642477322489834,
"grad_norm": 0.4740132689476013,
"learning_rate": 0.0001015126050420168,
"loss": 0.2901,
"step": 298
},
{
"epoch": 0.18705035971223022,
"grad_norm": 0.664300262928009,
"learning_rate": 0.0001011764705882353,
"loss": 0.5869,
"step": 299
},
{
"epoch": 0.1876759461995621,
"grad_norm": 0.7400941252708435,
"learning_rate": 0.00010084033613445378,
"loss": 0.7881,
"step": 300
},
{
"epoch": 0.1876759461995621,
"eval_loss": 0.7877693772315979,
"eval_runtime": 43.5162,
"eval_samples_per_second": 5.883,
"eval_steps_per_second": 2.941,
"step": 300
},
{
"epoch": 0.18830153268689395,
"grad_norm": 0.6142858862876892,
"learning_rate": 0.00010050420168067227,
"loss": 0.3808,
"step": 301
},
{
"epoch": 0.18892711917422583,
"grad_norm": 1.991969347000122,
"learning_rate": 0.00010016806722689076,
"loss": 0.7035,
"step": 302
},
{
"epoch": 0.1895527056615577,
"grad_norm": 0.6220730543136597,
"learning_rate": 9.983193277310925e-05,
"loss": 0.2548,
"step": 303
},
{
"epoch": 0.1901782921488896,
"grad_norm": 0.6476833820343018,
"learning_rate": 9.949579831932774e-05,
"loss": 0.3569,
"step": 304
},
{
"epoch": 0.19080387863622145,
"grad_norm": 0.7133951783180237,
"learning_rate": 9.915966386554623e-05,
"loss": 0.4744,
"step": 305
},
{
"epoch": 0.19142946512355333,
"grad_norm": 0.6500736474990845,
"learning_rate": 9.882352941176471e-05,
"loss": 0.4653,
"step": 306
},
{
"epoch": 0.1920550516108852,
"grad_norm": 1.1231927871704102,
"learning_rate": 9.848739495798321e-05,
"loss": 0.818,
"step": 307
},
{
"epoch": 0.1926806380982171,
"grad_norm": 0.8654798865318298,
"learning_rate": 9.815126050420168e-05,
"loss": 0.7065,
"step": 308
},
{
"epoch": 0.19330622458554894,
"grad_norm": 0.45660969614982605,
"learning_rate": 9.781512605042017e-05,
"loss": 0.2412,
"step": 309
},
{
"epoch": 0.19393181107288082,
"grad_norm": 0.9538519978523254,
"learning_rate": 9.747899159663865e-05,
"loss": 1.3428,
"step": 310
},
{
"epoch": 0.1945573975602127,
"grad_norm": 0.596633791923523,
"learning_rate": 9.714285714285715e-05,
"loss": 0.5119,
"step": 311
},
{
"epoch": 0.19518298404754458,
"grad_norm": 0.5247074365615845,
"learning_rate": 9.680672268907564e-05,
"loss": 0.6413,
"step": 312
},
{
"epoch": 0.19580857053487644,
"grad_norm": 0.7713050246238708,
"learning_rate": 9.647058823529412e-05,
"loss": 0.49,
"step": 313
},
{
"epoch": 0.19643415702220832,
"grad_norm": 0.6971513628959656,
"learning_rate": 9.613445378151261e-05,
"loss": 0.6505,
"step": 314
},
{
"epoch": 0.1970597435095402,
"grad_norm": 0.5454917550086975,
"learning_rate": 9.579831932773111e-05,
"loss": 0.7018,
"step": 315
},
{
"epoch": 0.19768532999687208,
"grad_norm": 0.8349499702453613,
"learning_rate": 9.546218487394959e-05,
"loss": 0.3179,
"step": 316
},
{
"epoch": 0.19831091648420393,
"grad_norm": 0.5682560801506042,
"learning_rate": 9.512605042016806e-05,
"loss": 0.4003,
"step": 317
},
{
"epoch": 0.1989365029715358,
"grad_norm": 0.5094739198684692,
"learning_rate": 9.478991596638655e-05,
"loss": 0.313,
"step": 318
},
{
"epoch": 0.1995620894588677,
"grad_norm": 1.7074236869812012,
"learning_rate": 9.445378151260505e-05,
"loss": 0.9912,
"step": 319
},
{
"epoch": 0.20018767594619957,
"grad_norm": 1.1477283239364624,
"learning_rate": 9.411764705882353e-05,
"loss": 0.851,
"step": 320
},
{
"epoch": 0.20081326243353143,
"grad_norm": 0.6616579294204712,
"learning_rate": 9.378151260504202e-05,
"loss": 0.4844,
"step": 321
},
{
"epoch": 0.2014388489208633,
"grad_norm": 1.0401920080184937,
"learning_rate": 9.34453781512605e-05,
"loss": 0.5421,
"step": 322
},
{
"epoch": 0.2020644354081952,
"grad_norm": 0.729664146900177,
"learning_rate": 9.3109243697479e-05,
"loss": 0.6632,
"step": 323
},
{
"epoch": 0.20269002189552707,
"grad_norm": 0.6752575635910034,
"learning_rate": 9.277310924369749e-05,
"loss": 0.4352,
"step": 324
},
{
"epoch": 0.20331560838285892,
"grad_norm": 0.7963948249816895,
"learning_rate": 9.243697478991598e-05,
"loss": 0.7614,
"step": 325
},
{
"epoch": 0.20331560838285892,
"eval_loss": 0.771190881729126,
"eval_runtime": 43.551,
"eval_samples_per_second": 5.878,
"eval_steps_per_second": 2.939,
"step": 325
},
{
"epoch": 0.2039411948701908,
"grad_norm": 0.7778791189193726,
"learning_rate": 9.210084033613445e-05,
"loss": 0.7251,
"step": 326
},
{
"epoch": 0.20456678135752268,
"grad_norm": 3.0929737091064453,
"learning_rate": 9.176470588235295e-05,
"loss": 0.5375,
"step": 327
},
{
"epoch": 0.20519236784485456,
"grad_norm": 0.6188391447067261,
"learning_rate": 9.142857142857143e-05,
"loss": 0.4007,
"step": 328
},
{
"epoch": 0.20581795433218641,
"grad_norm": 0.9423925876617432,
"learning_rate": 9.109243697478992e-05,
"loss": 0.5059,
"step": 329
},
{
"epoch": 0.2064435408195183,
"grad_norm": 0.506572425365448,
"learning_rate": 9.07563025210084e-05,
"loss": 0.2794,
"step": 330
},
{
"epoch": 0.20706912730685018,
"grad_norm": 1.7139545679092407,
"learning_rate": 9.04201680672269e-05,
"loss": 0.5984,
"step": 331
},
{
"epoch": 0.20769471379418206,
"grad_norm": 0.5540574789047241,
"learning_rate": 9.008403361344539e-05,
"loss": 0.323,
"step": 332
},
{
"epoch": 0.2083203002815139,
"grad_norm": 0.6909454464912415,
"learning_rate": 8.974789915966387e-05,
"loss": 0.5399,
"step": 333
},
{
"epoch": 0.2089458867688458,
"grad_norm": 0.7409022450447083,
"learning_rate": 8.941176470588236e-05,
"loss": 0.4251,
"step": 334
},
{
"epoch": 0.20957147325617767,
"grad_norm": 0.6636312007904053,
"learning_rate": 8.907563025210084e-05,
"loss": 0.4021,
"step": 335
},
{
"epoch": 0.21019705974350955,
"grad_norm": 0.5426271557807922,
"learning_rate": 8.873949579831933e-05,
"loss": 0.2095,
"step": 336
},
{
"epoch": 0.2108226462308414,
"grad_norm": 0.8870647549629211,
"learning_rate": 8.840336134453782e-05,
"loss": 0.5773,
"step": 337
},
{
"epoch": 0.21144823271817328,
"grad_norm": 0.5508524179458618,
"learning_rate": 8.80672268907563e-05,
"loss": 0.6744,
"step": 338
},
{
"epoch": 0.21207381920550517,
"grad_norm": 1.6577738523483276,
"learning_rate": 8.77310924369748e-05,
"loss": 1.1134,
"step": 339
},
{
"epoch": 0.21269940569283705,
"grad_norm": 3.218395233154297,
"learning_rate": 8.739495798319329e-05,
"loss": 0.5932,
"step": 340
},
{
"epoch": 0.2133249921801689,
"grad_norm": 0.5119672417640686,
"learning_rate": 8.705882352941177e-05,
"loss": 0.1831,
"step": 341
},
{
"epoch": 0.21395057866750078,
"grad_norm": 0.4874535799026489,
"learning_rate": 8.672268907563026e-05,
"loss": 0.485,
"step": 342
},
{
"epoch": 0.21457616515483266,
"grad_norm": 0.6597093939781189,
"learning_rate": 8.638655462184874e-05,
"loss": 0.3588,
"step": 343
},
{
"epoch": 0.21520175164216454,
"grad_norm": 1.1764620542526245,
"learning_rate": 8.605042016806724e-05,
"loss": 1.8765,
"step": 344
},
{
"epoch": 0.2158273381294964,
"grad_norm": 0.6894935369491577,
"learning_rate": 8.571428571428571e-05,
"loss": 0.5355,
"step": 345
},
{
"epoch": 0.21645292461682827,
"grad_norm": 0.5896294116973877,
"learning_rate": 8.53781512605042e-05,
"loss": 0.494,
"step": 346
},
{
"epoch": 0.21707851110416015,
"grad_norm": 0.6212694048881531,
"learning_rate": 8.50420168067227e-05,
"loss": 0.5721,
"step": 347
},
{
"epoch": 0.21770409759149204,
"grad_norm": 0.5058571100234985,
"learning_rate": 8.470588235294118e-05,
"loss": 0.5051,
"step": 348
},
{
"epoch": 0.2183296840788239,
"grad_norm": 0.5089401006698608,
"learning_rate": 8.436974789915967e-05,
"loss": 0.3794,
"step": 349
},
{
"epoch": 0.21895527056615577,
"grad_norm": 6.416032314300537,
"learning_rate": 8.403361344537815e-05,
"loss": 0.5026,
"step": 350
},
{
"epoch": 0.21895527056615577,
"eval_loss": 0.7647964954376221,
"eval_runtime": 43.4854,
"eval_samples_per_second": 5.887,
"eval_steps_per_second": 2.944,
"step": 350
},
{
"epoch": 0.21958085705348765,
"grad_norm": 0.8862031698226929,
"learning_rate": 8.369747899159664e-05,
"loss": 0.5003,
"step": 351
},
{
"epoch": 0.22020644354081953,
"grad_norm": 1.3196977376937866,
"learning_rate": 8.336134453781514e-05,
"loss": 0.4573,
"step": 352
},
{
"epoch": 0.22083203002815138,
"grad_norm": 0.4763634204864502,
"learning_rate": 8.302521008403362e-05,
"loss": 0.3987,
"step": 353
},
{
"epoch": 0.22145761651548326,
"grad_norm": 0.45634883642196655,
"learning_rate": 8.26890756302521e-05,
"loss": 0.2064,
"step": 354
},
{
"epoch": 0.22208320300281514,
"grad_norm": 0.443393737077713,
"learning_rate": 8.23529411764706e-05,
"loss": 0.2308,
"step": 355
},
{
"epoch": 0.22270878949014702,
"grad_norm": 1.135941505432129,
"learning_rate": 8.201680672268908e-05,
"loss": 0.8731,
"step": 356
},
{
"epoch": 0.22333437597747888,
"grad_norm": 0.6853610873222351,
"learning_rate": 8.168067226890757e-05,
"loss": 0.5563,
"step": 357
},
{
"epoch": 0.22395996246481076,
"grad_norm": 0.6356902718544006,
"learning_rate": 8.134453781512605e-05,
"loss": 0.4265,
"step": 358
},
{
"epoch": 0.22458554895214264,
"grad_norm": 0.6331340074539185,
"learning_rate": 8.100840336134454e-05,
"loss": 0.3293,
"step": 359
},
{
"epoch": 0.22521113543947452,
"grad_norm": 0.8068905472755432,
"learning_rate": 8.067226890756304e-05,
"loss": 1.2136,
"step": 360
},
{
"epoch": 0.22583672192680637,
"grad_norm": 0.6827020049095154,
"learning_rate": 8.033613445378152e-05,
"loss": 0.8519,
"step": 361
},
{
"epoch": 0.22646230841413825,
"grad_norm": 0.829730749130249,
"learning_rate": 8e-05,
"loss": 0.6237,
"step": 362
},
{
"epoch": 0.22708789490147013,
"grad_norm": 0.5221096873283386,
"learning_rate": 7.966386554621849e-05,
"loss": 0.3099,
"step": 363
},
{
"epoch": 0.227713481388802,
"grad_norm": 0.6234191060066223,
"learning_rate": 7.932773109243698e-05,
"loss": 0.9356,
"step": 364
},
{
"epoch": 0.22833906787613387,
"grad_norm": 0.5766564607620239,
"learning_rate": 7.899159663865546e-05,
"loss": 0.2529,
"step": 365
},
{
"epoch": 0.22896465436346575,
"grad_norm": 0.758171558380127,
"learning_rate": 7.865546218487395e-05,
"loss": 0.7577,
"step": 366
},
{
"epoch": 0.22959024085079763,
"grad_norm": 0.6313957571983337,
"learning_rate": 7.831932773109243e-05,
"loss": 0.6219,
"step": 367
},
{
"epoch": 0.2302158273381295,
"grad_norm": 0.7843011617660522,
"learning_rate": 7.798319327731093e-05,
"loss": 0.6617,
"step": 368
},
{
"epoch": 0.23084141382546136,
"grad_norm": 0.9671229124069214,
"learning_rate": 7.764705882352942e-05,
"loss": 0.7156,
"step": 369
},
{
"epoch": 0.23146700031279324,
"grad_norm": 0.663546085357666,
"learning_rate": 7.73109243697479e-05,
"loss": 0.4462,
"step": 370
},
{
"epoch": 0.23209258680012512,
"grad_norm": 0.6233255863189697,
"learning_rate": 7.697478991596639e-05,
"loss": 0.5534,
"step": 371
},
{
"epoch": 0.232718173287457,
"grad_norm": 2.0895440578460693,
"learning_rate": 7.663865546218489e-05,
"loss": 0.7289,
"step": 372
},
{
"epoch": 0.23334375977478886,
"grad_norm": 0.6122156381607056,
"learning_rate": 7.630252100840336e-05,
"loss": 0.3891,
"step": 373
},
{
"epoch": 0.23396934626212074,
"grad_norm": 0.5940058827400208,
"learning_rate": 7.596638655462185e-05,
"loss": 0.3038,
"step": 374
},
{
"epoch": 0.23459493274945262,
"grad_norm": 0.35755977034568787,
"learning_rate": 7.563025210084033e-05,
"loss": 0.2848,
"step": 375
},
{
"epoch": 0.23459493274945262,
"eval_loss": 0.7521110773086548,
"eval_runtime": 43.5299,
"eval_samples_per_second": 5.881,
"eval_steps_per_second": 2.941,
"step": 375
},
{
"epoch": 0.2352205192367845,
"grad_norm": 0.8450719118118286,
"learning_rate": 7.529411764705883e-05,
"loss": 0.5348,
"step": 376
},
{
"epoch": 0.23584610572411635,
"grad_norm": 0.9100202918052673,
"learning_rate": 7.495798319327732e-05,
"loss": 0.8178,
"step": 377
},
{
"epoch": 0.23647169221144823,
"grad_norm": 0.5748711228370667,
"learning_rate": 7.46218487394958e-05,
"loss": 0.778,
"step": 378
},
{
"epoch": 0.2370972786987801,
"grad_norm": 0.5675060153007507,
"learning_rate": 7.428571428571429e-05,
"loss": 0.8042,
"step": 379
},
{
"epoch": 0.237722865186112,
"grad_norm": 6.2747392654418945,
"learning_rate": 7.394957983193279e-05,
"loss": 1.4173,
"step": 380
},
{
"epoch": 0.23834845167344385,
"grad_norm": 0.6252509355545044,
"learning_rate": 7.361344537815127e-05,
"loss": 0.5095,
"step": 381
},
{
"epoch": 0.23897403816077573,
"grad_norm": 1.0525410175323486,
"learning_rate": 7.327731092436974e-05,
"loss": 1.1774,
"step": 382
},
{
"epoch": 0.2395996246481076,
"grad_norm": 0.505670428276062,
"learning_rate": 7.294117647058823e-05,
"loss": 0.7603,
"step": 383
},
{
"epoch": 0.2402252111354395,
"grad_norm": 0.5476568341255188,
"learning_rate": 7.260504201680673e-05,
"loss": 0.3354,
"step": 384
},
{
"epoch": 0.24085079762277134,
"grad_norm": 0.687854528427124,
"learning_rate": 7.226890756302521e-05,
"loss": 0.6306,
"step": 385
},
{
"epoch": 0.24147638411010322,
"grad_norm": 1.3373991250991821,
"learning_rate": 7.19327731092437e-05,
"loss": 0.5736,
"step": 386
},
{
"epoch": 0.2421019705974351,
"grad_norm": 0.5465985536575317,
"learning_rate": 7.159663865546218e-05,
"loss": 0.2859,
"step": 387
},
{
"epoch": 0.24272755708476698,
"grad_norm": 0.6637946963310242,
"learning_rate": 7.126050420168068e-05,
"loss": 0.5948,
"step": 388
},
{
"epoch": 0.24335314357209883,
"grad_norm": 0.637915313243866,
"learning_rate": 7.092436974789917e-05,
"loss": 0.3947,
"step": 389
},
{
"epoch": 0.24397873005943072,
"grad_norm": 0.8073198795318604,
"learning_rate": 7.058823529411765e-05,
"loss": 0.5006,
"step": 390
},
{
"epoch": 0.2446043165467626,
"grad_norm": 0.7423315644264221,
"learning_rate": 7.025210084033613e-05,
"loss": 0.5209,
"step": 391
},
{
"epoch": 0.24522990303409448,
"grad_norm": 0.6418082118034363,
"learning_rate": 6.991596638655463e-05,
"loss": 0.3793,
"step": 392
},
{
"epoch": 0.24585548952142633,
"grad_norm": 1.072240948677063,
"learning_rate": 6.957983193277311e-05,
"loss": 0.3656,
"step": 393
},
{
"epoch": 0.2464810760087582,
"grad_norm": 1.3167545795440674,
"learning_rate": 6.92436974789916e-05,
"loss": 0.9145,
"step": 394
},
{
"epoch": 0.2471066624960901,
"grad_norm": 0.6734040379524231,
"learning_rate": 6.890756302521008e-05,
"loss": 0.4063,
"step": 395
},
{
"epoch": 0.24773224898342197,
"grad_norm": 0.48195910453796387,
"learning_rate": 6.857142857142858e-05,
"loss": 0.4146,
"step": 396
},
{
"epoch": 0.24835783547075382,
"grad_norm": 1.2620956897735596,
"learning_rate": 6.823529411764707e-05,
"loss": 0.4289,
"step": 397
},
{
"epoch": 0.2489834219580857,
"grad_norm": 0.6438835859298706,
"learning_rate": 6.789915966386555e-05,
"loss": 0.6204,
"step": 398
},
{
"epoch": 0.24960900844541759,
"grad_norm": 1.6006457805633545,
"learning_rate": 6.756302521008404e-05,
"loss": 0.8828,
"step": 399
},
{
"epoch": 0.25023459493274947,
"grad_norm": 3.7350921630859375,
"learning_rate": 6.722689075630254e-05,
"loss": 0.5608,
"step": 400
},
{
"epoch": 0.25023459493274947,
"eval_loss": 0.7404398322105408,
"eval_runtime": 43.5267,
"eval_samples_per_second": 5.881,
"eval_steps_per_second": 2.941,
"step": 400
},
{
"epoch": 0.2508601814200813,
"grad_norm": 0.776977002620697,
"learning_rate": 6.689075630252101e-05,
"loss": 0.709,
"step": 401
},
{
"epoch": 0.2514857679074132,
"grad_norm": 0.547192394733429,
"learning_rate": 6.65546218487395e-05,
"loss": 0.2832,
"step": 402
},
{
"epoch": 0.2521113543947451,
"grad_norm": 1.2148370742797852,
"learning_rate": 6.621848739495798e-05,
"loss": 0.9248,
"step": 403
},
{
"epoch": 0.25273694088207693,
"grad_norm": 0.5215961337089539,
"learning_rate": 6.588235294117648e-05,
"loss": 0.5407,
"step": 404
},
{
"epoch": 0.25336252736940884,
"grad_norm": 0.32982224225997925,
"learning_rate": 6.554621848739496e-05,
"loss": 0.1891,
"step": 405
},
{
"epoch": 0.2539881138567407,
"grad_norm": 0.707619309425354,
"learning_rate": 6.521008403361345e-05,
"loss": 0.6929,
"step": 406
},
{
"epoch": 0.25461370034407255,
"grad_norm": 1.87132728099823,
"learning_rate": 6.487394957983193e-05,
"loss": 0.6022,
"step": 407
},
{
"epoch": 0.25523928683140445,
"grad_norm": 0.5033402442932129,
"learning_rate": 6.453781512605043e-05,
"loss": 0.2833,
"step": 408
},
{
"epoch": 0.2558648733187363,
"grad_norm": 1.0010263919830322,
"learning_rate": 6.420168067226892e-05,
"loss": 0.2809,
"step": 409
},
{
"epoch": 0.2564904598060682,
"grad_norm": 0.9624127745628357,
"learning_rate": 6.386554621848739e-05,
"loss": 0.5303,
"step": 410
},
{
"epoch": 0.25711604629340007,
"grad_norm": 1.2495983839035034,
"learning_rate": 6.352941176470588e-05,
"loss": 0.5994,
"step": 411
},
{
"epoch": 0.2577416327807319,
"grad_norm": 0.7493329048156738,
"learning_rate": 6.319327731092438e-05,
"loss": 0.4795,
"step": 412
},
{
"epoch": 0.25836721926806383,
"grad_norm": 1.0783026218414307,
"learning_rate": 6.285714285714286e-05,
"loss": 1.13,
"step": 413
},
{
"epoch": 0.2589928057553957,
"grad_norm": 0.6462905406951904,
"learning_rate": 6.252100840336135e-05,
"loss": 0.4111,
"step": 414
},
{
"epoch": 0.25961839224272754,
"grad_norm": 0.4357486665248871,
"learning_rate": 6.218487394957983e-05,
"loss": 0.2122,
"step": 415
},
{
"epoch": 0.26024397873005944,
"grad_norm": 0.42553481459617615,
"learning_rate": 6.184873949579833e-05,
"loss": 0.2509,
"step": 416
},
{
"epoch": 0.2608695652173913,
"grad_norm": 0.8176494836807251,
"learning_rate": 6.151260504201682e-05,
"loss": 1.157,
"step": 417
},
{
"epoch": 0.2614951517047232,
"grad_norm": 0.527748703956604,
"learning_rate": 6.11764705882353e-05,
"loss": 0.3804,
"step": 418
},
{
"epoch": 0.26212073819205506,
"grad_norm": 0.9033327102661133,
"learning_rate": 6.084033613445378e-05,
"loss": 0.6555,
"step": 419
},
{
"epoch": 0.2627463246793869,
"grad_norm": 0.7106732130050659,
"learning_rate": 6.0504201680672267e-05,
"loss": 0.4358,
"step": 420
},
{
"epoch": 0.2633719111667188,
"grad_norm": 1.1655712127685547,
"learning_rate": 6.016806722689076e-05,
"loss": 0.5233,
"step": 421
},
{
"epoch": 0.2639974976540507,
"grad_norm": 0.7053611874580383,
"learning_rate": 5.9831932773109244e-05,
"loss": 0.428,
"step": 422
},
{
"epoch": 0.2646230841413825,
"grad_norm": 0.7588666677474976,
"learning_rate": 5.9495798319327737e-05,
"loss": 0.5904,
"step": 423
},
{
"epoch": 0.26524867062871443,
"grad_norm": 0.6778993010520935,
"learning_rate": 5.915966386554622e-05,
"loss": 0.7834,
"step": 424
},
{
"epoch": 0.2658742571160463,
"grad_norm": 0.5685262084007263,
"learning_rate": 5.882352941176471e-05,
"loss": 0.7958,
"step": 425
},
{
"epoch": 0.2658742571160463,
"eval_loss": 0.7321073412895203,
"eval_runtime": 43.5595,
"eval_samples_per_second": 5.877,
"eval_steps_per_second": 2.939,
"step": 425
},
{
"epoch": 0.2664998436033782,
"grad_norm": 0.41137516498565674,
"learning_rate": 5.84873949579832e-05,
"loss": 0.255,
"step": 426
},
{
"epoch": 0.26712543009071005,
"grad_norm": 0.48806631565093994,
"learning_rate": 5.8151260504201685e-05,
"loss": 0.2586,
"step": 427
},
{
"epoch": 0.2677510165780419,
"grad_norm": 0.877154529094696,
"learning_rate": 5.781512605042018e-05,
"loss": 0.5605,
"step": 428
},
{
"epoch": 0.2683766030653738,
"grad_norm": 1.1426063776016235,
"learning_rate": 5.7478991596638656e-05,
"loss": 0.6767,
"step": 429
},
{
"epoch": 0.26900218955270566,
"grad_norm": 0.7325838208198547,
"learning_rate": 5.714285714285714e-05,
"loss": 0.4384,
"step": 430
},
{
"epoch": 0.2696277760400375,
"grad_norm": 0.815000593662262,
"learning_rate": 5.6806722689075634e-05,
"loss": 0.5198,
"step": 431
},
{
"epoch": 0.2702533625273694,
"grad_norm": 0.582699716091156,
"learning_rate": 5.647058823529412e-05,
"loss": 0.5345,
"step": 432
},
{
"epoch": 0.2708789490147013,
"grad_norm": 0.6257805228233337,
"learning_rate": 5.6134453781512605e-05,
"loss": 0.4172,
"step": 433
},
{
"epoch": 0.27150453550203313,
"grad_norm": 0.8166823983192444,
"learning_rate": 5.57983193277311e-05,
"loss": 0.7274,
"step": 434
},
{
"epoch": 0.27213012198936504,
"grad_norm": 0.6732988953590393,
"learning_rate": 5.546218487394958e-05,
"loss": 0.781,
"step": 435
},
{
"epoch": 0.2727557084766969,
"grad_norm": 0.6230109930038452,
"learning_rate": 5.5126050420168075e-05,
"loss": 0.6356,
"step": 436
},
{
"epoch": 0.2733812949640288,
"grad_norm": 0.6590014696121216,
"learning_rate": 5.478991596638656e-05,
"loss": 0.9665,
"step": 437
},
{
"epoch": 0.27400688145136065,
"grad_norm": 0.3651019036769867,
"learning_rate": 5.445378151260504e-05,
"loss": 0.3858,
"step": 438
},
{
"epoch": 0.2746324679386925,
"grad_norm": 0.6834749579429626,
"learning_rate": 5.411764705882353e-05,
"loss": 0.5685,
"step": 439
},
{
"epoch": 0.2752580544260244,
"grad_norm": 0.46671655774116516,
"learning_rate": 5.378151260504202e-05,
"loss": 0.4953,
"step": 440
},
{
"epoch": 0.27588364091335627,
"grad_norm": 0.6245185732841492,
"learning_rate": 5.34453781512605e-05,
"loss": 0.7836,
"step": 441
},
{
"epoch": 0.2765092274006881,
"grad_norm": 0.5942935943603516,
"learning_rate": 5.3109243697478995e-05,
"loss": 0.3441,
"step": 442
},
{
"epoch": 0.27713481388802,
"grad_norm": 0.7539409399032593,
"learning_rate": 5.277310924369748e-05,
"loss": 0.7533,
"step": 443
},
{
"epoch": 0.2777604003753519,
"grad_norm": 0.40587514638900757,
"learning_rate": 5.243697478991597e-05,
"loss": 0.3298,
"step": 444
},
{
"epoch": 0.2783859868626838,
"grad_norm": 0.5237724184989929,
"learning_rate": 5.210084033613446e-05,
"loss": 0.65,
"step": 445
},
{
"epoch": 0.27901157335001564,
"grad_norm": 0.6571043133735657,
"learning_rate": 5.176470588235295e-05,
"loss": 0.5741,
"step": 446
},
{
"epoch": 0.2796371598373475,
"grad_norm": 0.4717683792114258,
"learning_rate": 5.142857142857143e-05,
"loss": 0.3073,
"step": 447
},
{
"epoch": 0.2802627463246794,
"grad_norm": 0.4331720173358917,
"learning_rate": 5.1092436974789914e-05,
"loss": 0.401,
"step": 448
},
{
"epoch": 0.28088833281201125,
"grad_norm": 0.6984372138977051,
"learning_rate": 5.07563025210084e-05,
"loss": 0.5573,
"step": 449
},
{
"epoch": 0.2815139192993431,
"grad_norm": 0.556936502456665,
"learning_rate": 5.042016806722689e-05,
"loss": 0.4267,
"step": 450
},
{
"epoch": 0.2815139192993431,
"eval_loss": 0.7238953709602356,
"eval_runtime": 43.5441,
"eval_samples_per_second": 5.879,
"eval_steps_per_second": 2.94,
"step": 450
},
{
"epoch": 0.282139505786675,
"grad_norm": 0.5527558922767639,
"learning_rate": 5.008403361344538e-05,
"loss": 0.3417,
"step": 451
},
{
"epoch": 0.28276509227400687,
"grad_norm": 0.6538864374160767,
"learning_rate": 4.974789915966387e-05,
"loss": 0.2658,
"step": 452
},
{
"epoch": 0.2833906787613388,
"grad_norm": 0.8087690472602844,
"learning_rate": 4.9411764705882355e-05,
"loss": 0.6919,
"step": 453
},
{
"epoch": 0.28401626524867063,
"grad_norm": 0.8165414333343506,
"learning_rate": 4.907563025210084e-05,
"loss": 0.575,
"step": 454
},
{
"epoch": 0.2846418517360025,
"grad_norm": 0.8950768113136292,
"learning_rate": 4.8739495798319326e-05,
"loss": 1.0045,
"step": 455
},
{
"epoch": 0.2852674382233344,
"grad_norm": 0.6484183073043823,
"learning_rate": 4.840336134453782e-05,
"loss": 0.5661,
"step": 456
},
{
"epoch": 0.28589302471066624,
"grad_norm": 2.4603426456451416,
"learning_rate": 4.8067226890756304e-05,
"loss": 0.5769,
"step": 457
},
{
"epoch": 0.2865186111979981,
"grad_norm": 0.28350579738616943,
"learning_rate": 4.7731092436974796e-05,
"loss": 0.1358,
"step": 458
},
{
"epoch": 0.28714419768533,
"grad_norm": 0.5952170491218567,
"learning_rate": 4.7394957983193275e-05,
"loss": 0.7661,
"step": 459
},
{
"epoch": 0.28776978417266186,
"grad_norm": 0.6130964756011963,
"learning_rate": 4.705882352941177e-05,
"loss": 0.6188,
"step": 460
},
{
"epoch": 0.28839537065999377,
"grad_norm": 2.7420737743377686,
"learning_rate": 4.672268907563025e-05,
"loss": 0.5851,
"step": 461
},
{
"epoch": 0.2890209571473256,
"grad_norm": 0.6672143340110779,
"learning_rate": 4.6386554621848745e-05,
"loss": 0.6167,
"step": 462
},
{
"epoch": 0.28964654363465747,
"grad_norm": 0.5279991030693054,
"learning_rate": 4.6050420168067224e-05,
"loss": 0.5681,
"step": 463
},
{
"epoch": 0.2902721301219894,
"grad_norm": 1.2923184633255005,
"learning_rate": 4.5714285714285716e-05,
"loss": 0.6798,
"step": 464
},
{
"epoch": 0.29089771660932123,
"grad_norm": 0.718950629234314,
"learning_rate": 4.53781512605042e-05,
"loss": 0.7855,
"step": 465
},
{
"epoch": 0.2915233030966531,
"grad_norm": 0.6185110807418823,
"learning_rate": 4.5042016806722694e-05,
"loss": 0.3669,
"step": 466
},
{
"epoch": 0.292148889583985,
"grad_norm": 0.7038689255714417,
"learning_rate": 4.470588235294118e-05,
"loss": 0.534,
"step": 467
},
{
"epoch": 0.29277447607131685,
"grad_norm": 0.9318879246711731,
"learning_rate": 4.4369747899159665e-05,
"loss": 0.8676,
"step": 468
},
{
"epoch": 0.29340006255864876,
"grad_norm": 0.5326322317123413,
"learning_rate": 4.403361344537815e-05,
"loss": 0.511,
"step": 469
},
{
"epoch": 0.2940256490459806,
"grad_norm": 0.6265137195587158,
"learning_rate": 4.369747899159664e-05,
"loss": 0.3251,
"step": 470
},
{
"epoch": 0.29465123553331246,
"grad_norm": 0.4834919273853302,
"learning_rate": 4.336134453781513e-05,
"loss": 0.3329,
"step": 471
},
{
"epoch": 0.29527682202064437,
"grad_norm": 0.5931064486503601,
"learning_rate": 4.302521008403362e-05,
"loss": 0.3942,
"step": 472
},
{
"epoch": 0.2959024085079762,
"grad_norm": 0.23674030601978302,
"learning_rate": 4.26890756302521e-05,
"loss": 0.2589,
"step": 473
},
{
"epoch": 0.2965279949953081,
"grad_norm": 0.5993025302886963,
"learning_rate": 4.235294117647059e-05,
"loss": 0.4007,
"step": 474
},
{
"epoch": 0.29715358148264,
"grad_norm": 0.944987416267395,
"learning_rate": 4.201680672268908e-05,
"loss": 0.6389,
"step": 475
},
{
"epoch": 0.29715358148264,
"eval_loss": 0.7186967134475708,
"eval_runtime": 43.5187,
"eval_samples_per_second": 5.883,
"eval_steps_per_second": 2.941,
"step": 475
},
{
"epoch": 0.29777916796997184,
"grad_norm": 0.5033174157142639,
"learning_rate": 4.168067226890757e-05,
"loss": 0.3001,
"step": 476
},
{
"epoch": 0.29840475445730374,
"grad_norm": 0.55152827501297,
"learning_rate": 4.134453781512605e-05,
"loss": 0.3029,
"step": 477
},
{
"epoch": 0.2990303409446356,
"grad_norm": 0.3445724546909332,
"learning_rate": 4.100840336134454e-05,
"loss": 0.2868,
"step": 478
},
{
"epoch": 0.29965592743196745,
"grad_norm": 0.7697100639343262,
"learning_rate": 4.0672268907563026e-05,
"loss": 0.4365,
"step": 479
},
{
"epoch": 0.30028151391929936,
"grad_norm": 0.8685904145240784,
"learning_rate": 4.033613445378152e-05,
"loss": 0.5848,
"step": 480
},
{
"epoch": 0.3009071004066312,
"grad_norm": 0.9457945227622986,
"learning_rate": 4e-05,
"loss": 0.8947,
"step": 481
},
{
"epoch": 0.30153268689396306,
"grad_norm": 0.6347827911376953,
"learning_rate": 3.966386554621849e-05,
"loss": 0.4328,
"step": 482
},
{
"epoch": 0.302158273381295,
"grad_norm": 0.47420793771743774,
"learning_rate": 3.9327731092436974e-05,
"loss": 0.3031,
"step": 483
},
{
"epoch": 0.3027838598686268,
"grad_norm": 0.6944661736488342,
"learning_rate": 3.8991596638655467e-05,
"loss": 0.3967,
"step": 484
},
{
"epoch": 0.30340944635595873,
"grad_norm": 0.5811148881912231,
"learning_rate": 3.865546218487395e-05,
"loss": 0.4092,
"step": 485
},
{
"epoch": 0.3040350328432906,
"grad_norm": 0.8307628631591797,
"learning_rate": 3.8319327731092444e-05,
"loss": 0.4091,
"step": 486
},
{
"epoch": 0.30466061933062244,
"grad_norm": 0.53610759973526,
"learning_rate": 3.798319327731092e-05,
"loss": 0.2789,
"step": 487
},
{
"epoch": 0.30528620581795435,
"grad_norm": 0.7043578028678894,
"learning_rate": 3.7647058823529415e-05,
"loss": 0.4689,
"step": 488
},
{
"epoch": 0.3059117923052862,
"grad_norm": 0.6058409810066223,
"learning_rate": 3.73109243697479e-05,
"loss": 0.4195,
"step": 489
},
{
"epoch": 0.30653737879261805,
"grad_norm": 0.6879023313522339,
"learning_rate": 3.697478991596639e-05,
"loss": 0.8699,
"step": 490
},
{
"epoch": 0.30716296527994996,
"grad_norm": 0.9076145887374878,
"learning_rate": 3.663865546218487e-05,
"loss": 0.4701,
"step": 491
},
{
"epoch": 0.3077885517672818,
"grad_norm": 0.7418045401573181,
"learning_rate": 3.6302521008403364e-05,
"loss": 0.618,
"step": 492
},
{
"epoch": 0.3084141382546137,
"grad_norm": 0.7602983713150024,
"learning_rate": 3.596638655462185e-05,
"loss": 0.4609,
"step": 493
},
{
"epoch": 0.3090397247419456,
"grad_norm": 0.776549220085144,
"learning_rate": 3.563025210084034e-05,
"loss": 0.422,
"step": 494
},
{
"epoch": 0.30966531122927743,
"grad_norm": 0.6375955939292908,
"learning_rate": 3.529411764705883e-05,
"loss": 0.6409,
"step": 495
},
{
"epoch": 0.31029089771660934,
"grad_norm": 1.038455843925476,
"learning_rate": 3.495798319327731e-05,
"loss": 0.6265,
"step": 496
},
{
"epoch": 0.3109164842039412,
"grad_norm": 0.7373848557472229,
"learning_rate": 3.46218487394958e-05,
"loss": 0.5249,
"step": 497
},
{
"epoch": 0.31154207069127304,
"grad_norm": 0.5265942215919495,
"learning_rate": 3.428571428571429e-05,
"loss": 0.2448,
"step": 498
},
{
"epoch": 0.31216765717860495,
"grad_norm": 0.6590535640716553,
"learning_rate": 3.3949579831932776e-05,
"loss": 0.6337,
"step": 499
},
{
"epoch": 0.3127932436659368,
"grad_norm": 1.0600816011428833,
"learning_rate": 3.361344537815127e-05,
"loss": 0.6247,
"step": 500
},
{
"epoch": 0.3127932436659368,
"eval_loss": 0.6991736888885498,
"eval_runtime": 43.4877,
"eval_samples_per_second": 5.887,
"eval_steps_per_second": 2.943,
"step": 500
},
{
"epoch": 0.3134188301532687,
"grad_norm": 0.6913841366767883,
"learning_rate": 3.327731092436975e-05,
"loss": 0.4161,
"step": 501
},
{
"epoch": 0.31404441664060057,
"grad_norm": 0.627515971660614,
"learning_rate": 3.294117647058824e-05,
"loss": 0.418,
"step": 502
},
{
"epoch": 0.3146700031279324,
"grad_norm": 0.6809556484222412,
"learning_rate": 3.2605042016806725e-05,
"loss": 0.4784,
"step": 503
},
{
"epoch": 0.3152955896152643,
"grad_norm": 0.6119003891944885,
"learning_rate": 3.226890756302522e-05,
"loss": 0.4283,
"step": 504
},
{
"epoch": 0.3159211761025962,
"grad_norm": 0.44398972392082214,
"learning_rate": 3.1932773109243696e-05,
"loss": 0.2422,
"step": 505
},
{
"epoch": 0.31654676258992803,
"grad_norm": 0.607389509677887,
"learning_rate": 3.159663865546219e-05,
"loss": 0.4103,
"step": 506
},
{
"epoch": 0.31717234907725994,
"grad_norm": 0.5077054500579834,
"learning_rate": 3.1260504201680673e-05,
"loss": 0.444,
"step": 507
},
{
"epoch": 0.3177979355645918,
"grad_norm": 0.6008110642433167,
"learning_rate": 3.0924369747899166e-05,
"loss": 0.29,
"step": 508
},
{
"epoch": 0.3184235220519237,
"grad_norm": 0.5967888832092285,
"learning_rate": 3.058823529411765e-05,
"loss": 0.3271,
"step": 509
},
{
"epoch": 0.31904910853925555,
"grad_norm": 0.5713754296302795,
"learning_rate": 3.0252100840336133e-05,
"loss": 0.7984,
"step": 510
},
{
"epoch": 0.3196746950265874,
"grad_norm": 0.520448625087738,
"learning_rate": 2.9915966386554622e-05,
"loss": 0.6564,
"step": 511
},
{
"epoch": 0.3203002815139193,
"grad_norm": 0.7720608115196228,
"learning_rate": 2.957983193277311e-05,
"loss": 0.7506,
"step": 512
},
{
"epoch": 0.32092586800125117,
"grad_norm": 0.6118083000183105,
"learning_rate": 2.92436974789916e-05,
"loss": 0.5653,
"step": 513
},
{
"epoch": 0.321551454488583,
"grad_norm": 0.8836561441421509,
"learning_rate": 2.890756302521009e-05,
"loss": 0.5534,
"step": 514
},
{
"epoch": 0.32217704097591493,
"grad_norm": 0.7553030848503113,
"learning_rate": 2.857142857142857e-05,
"loss": 0.7212,
"step": 515
},
{
"epoch": 0.3228026274632468,
"grad_norm": 0.6662014722824097,
"learning_rate": 2.823529411764706e-05,
"loss": 0.8303,
"step": 516
},
{
"epoch": 0.3234282139505787,
"grad_norm": 0.7432876825332642,
"learning_rate": 2.789915966386555e-05,
"loss": 0.7016,
"step": 517
},
{
"epoch": 0.32405380043791054,
"grad_norm": 0.6568440198898315,
"learning_rate": 2.7563025210084037e-05,
"loss": 0.3467,
"step": 518
},
{
"epoch": 0.3246793869252424,
"grad_norm": 0.5214548110961914,
"learning_rate": 2.722689075630252e-05,
"loss": 0.2693,
"step": 519
},
{
"epoch": 0.3253049734125743,
"grad_norm": 0.6171154379844666,
"learning_rate": 2.689075630252101e-05,
"loss": 0.8636,
"step": 520
},
{
"epoch": 0.32593055989990616,
"grad_norm": 0.528217613697052,
"learning_rate": 2.6554621848739497e-05,
"loss": 0.3577,
"step": 521
},
{
"epoch": 0.326556146387238,
"grad_norm": 0.4560914635658264,
"learning_rate": 2.6218487394957986e-05,
"loss": 0.2316,
"step": 522
},
{
"epoch": 0.3271817328745699,
"grad_norm": 0.4513761103153229,
"learning_rate": 2.5882352941176475e-05,
"loss": 0.2614,
"step": 523
},
{
"epoch": 0.32780731936190177,
"grad_norm": 0.7243900299072266,
"learning_rate": 2.5546218487394957e-05,
"loss": 0.6181,
"step": 524
},
{
"epoch": 0.3284329058492337,
"grad_norm": 1.2595303058624268,
"learning_rate": 2.5210084033613446e-05,
"loss": 0.6232,
"step": 525
},
{
"epoch": 0.3284329058492337,
"eval_loss": 0.6902230381965637,
"eval_runtime": 43.5369,
"eval_samples_per_second": 5.88,
"eval_steps_per_second": 2.94,
"step": 525
},
{
"epoch": 0.32905849233656553,
"grad_norm": 0.8221277594566345,
"learning_rate": 2.4873949579831935e-05,
"loss": 0.5717,
"step": 526
},
{
"epoch": 0.3296840788238974,
"grad_norm": 0.79567551612854,
"learning_rate": 2.453781512605042e-05,
"loss": 0.9611,
"step": 527
},
{
"epoch": 0.3303096653112293,
"grad_norm": 0.7971245050430298,
"learning_rate": 2.420168067226891e-05,
"loss": 0.3147,
"step": 528
},
{
"epoch": 0.33093525179856115,
"grad_norm": 0.5613353848457336,
"learning_rate": 2.3865546218487398e-05,
"loss": 0.2808,
"step": 529
},
{
"epoch": 0.331560838285893,
"grad_norm": 0.5482297539710999,
"learning_rate": 2.3529411764705884e-05,
"loss": 0.3648,
"step": 530
},
{
"epoch": 0.3321864247732249,
"grad_norm": 0.8833631873130798,
"learning_rate": 2.3193277310924373e-05,
"loss": 0.5692,
"step": 531
},
{
"epoch": 0.33281201126055676,
"grad_norm": 0.9152381420135498,
"learning_rate": 2.2857142857142858e-05,
"loss": 0.657,
"step": 532
},
{
"epoch": 0.33343759774788867,
"grad_norm": 4.917063236236572,
"learning_rate": 2.2521008403361347e-05,
"loss": 0.6422,
"step": 533
},
{
"epoch": 0.3340631842352205,
"grad_norm": 0.466827392578125,
"learning_rate": 2.2184873949579832e-05,
"loss": 0.3138,
"step": 534
},
{
"epoch": 0.3346887707225524,
"grad_norm": 0.4326834976673126,
"learning_rate": 2.184873949579832e-05,
"loss": 0.2392,
"step": 535
},
{
"epoch": 0.3353143572098843,
"grad_norm": 0.9225325584411621,
"learning_rate": 2.151260504201681e-05,
"loss": 0.9698,
"step": 536
},
{
"epoch": 0.33593994369721614,
"grad_norm": 51.696407318115234,
"learning_rate": 2.1176470588235296e-05,
"loss": 1.3961,
"step": 537
},
{
"epoch": 0.336565530184548,
"grad_norm": 0.7156023383140564,
"learning_rate": 2.0840336134453785e-05,
"loss": 0.6503,
"step": 538
},
{
"epoch": 0.3371911166718799,
"grad_norm": 0.46554067730903625,
"learning_rate": 2.050420168067227e-05,
"loss": 0.3789,
"step": 539
},
{
"epoch": 0.33781670315921175,
"grad_norm": 1.5252093076705933,
"learning_rate": 2.016806722689076e-05,
"loss": 1.3196,
"step": 540
},
{
"epoch": 0.33844228964654366,
"grad_norm": 0.836153507232666,
"learning_rate": 1.9831932773109244e-05,
"loss": 0.4832,
"step": 541
},
{
"epoch": 0.3390678761338755,
"grad_norm": 0.6036570072174072,
"learning_rate": 1.9495798319327733e-05,
"loss": 0.4105,
"step": 542
},
{
"epoch": 0.33969346262120736,
"grad_norm": 0.8396046161651611,
"learning_rate": 1.9159663865546222e-05,
"loss": 0.6624,
"step": 543
},
{
"epoch": 0.3403190491085393,
"grad_norm": 1.1757038831710815,
"learning_rate": 1.8823529411764708e-05,
"loss": 0.4706,
"step": 544
},
{
"epoch": 0.3409446355958711,
"grad_norm": 0.3689747154712677,
"learning_rate": 1.8487394957983196e-05,
"loss": 0.3568,
"step": 545
},
{
"epoch": 0.341570222083203,
"grad_norm": 0.5986055135726929,
"learning_rate": 1.8151260504201682e-05,
"loss": 0.6308,
"step": 546
},
{
"epoch": 0.3421958085705349,
"grad_norm": 0.4873492419719696,
"learning_rate": 1.781512605042017e-05,
"loss": 0.3524,
"step": 547
},
{
"epoch": 0.34282139505786674,
"grad_norm": 0.7307869791984558,
"learning_rate": 1.7478991596638656e-05,
"loss": 1.0777,
"step": 548
},
{
"epoch": 0.34344698154519865,
"grad_norm": 0.6755880117416382,
"learning_rate": 1.7142857142857145e-05,
"loss": 0.3718,
"step": 549
},
{
"epoch": 0.3440725680325305,
"grad_norm": 0.8727423548698425,
"learning_rate": 1.6806722689075634e-05,
"loss": 0.6502,
"step": 550
},
{
"epoch": 0.3440725680325305,
"eval_loss": 0.6811222434043884,
"eval_runtime": 43.5707,
"eval_samples_per_second": 5.876,
"eval_steps_per_second": 2.938,
"step": 550
}
],
"logging_steps": 1,
"max_steps": 600,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.137904338467881e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}