hp_project_1 / checkpoint-350 /trainer_state.json
rr4433's picture
Training in progress, step 350, checkpoint
131e7a5 verified
raw
history blame
64.6 kB
{
"best_metric": 0.7647964954376221,
"best_model_checkpoint": "outputs/checkpoint-350",
"epoch": 0.21895527056615577,
"eval_steps": 25,
"global_step": 350,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006255864873318737,
"grad_norm": 3.302262783050537,
"learning_rate": 4e-05,
"loss": 1.7639,
"step": 1
},
{
"epoch": 0.0012511729746637473,
"grad_norm": 3.7728819847106934,
"learning_rate": 8e-05,
"loss": 2.3471,
"step": 2
},
{
"epoch": 0.001876759461995621,
"grad_norm": 3.575211763381958,
"learning_rate": 0.00012,
"loss": 1.274,
"step": 3
},
{
"epoch": 0.0025023459493274947,
"grad_norm": 4.3921918869018555,
"learning_rate": 0.00016,
"loss": 1.8361,
"step": 4
},
{
"epoch": 0.0031279324366593683,
"grad_norm": 3.215696096420288,
"learning_rate": 0.0002,
"loss": 2.8766,
"step": 5
},
{
"epoch": 0.003753518923991242,
"grad_norm": 4.060017108917236,
"learning_rate": 0.0001996638655462185,
"loss": 1.4329,
"step": 6
},
{
"epoch": 0.004379105411323116,
"grad_norm": 2.7935523986816406,
"learning_rate": 0.00019932773109243698,
"loss": 1.2844,
"step": 7
},
{
"epoch": 0.005004691898654989,
"grad_norm": 2.312218189239502,
"learning_rate": 0.00019899159663865548,
"loss": 1.8112,
"step": 8
},
{
"epoch": 0.005630278385986863,
"grad_norm": 3.5389914512634277,
"learning_rate": 0.00019865546218487395,
"loss": 2.0504,
"step": 9
},
{
"epoch": 0.006255864873318737,
"grad_norm": 2.913029432296753,
"learning_rate": 0.00019831932773109245,
"loss": 1.9101,
"step": 10
},
{
"epoch": 0.00688145136065061,
"grad_norm": 3.6916606426239014,
"learning_rate": 0.00019798319327731095,
"loss": 1.9899,
"step": 11
},
{
"epoch": 0.007507037847982484,
"grad_norm": 3.002810478210449,
"learning_rate": 0.00019764705882352942,
"loss": 1.3224,
"step": 12
},
{
"epoch": 0.008132624335314358,
"grad_norm": 1.657835602760315,
"learning_rate": 0.00019731092436974792,
"loss": 1.2208,
"step": 13
},
{
"epoch": 0.008758210822646231,
"grad_norm": 2.414161443710327,
"learning_rate": 0.00019697478991596642,
"loss": 1.5375,
"step": 14
},
{
"epoch": 0.009383797309978105,
"grad_norm": 1.9695100784301758,
"learning_rate": 0.00019663865546218486,
"loss": 0.9218,
"step": 15
},
{
"epoch": 0.010009383797309979,
"grad_norm": 3.9755845069885254,
"learning_rate": 0.00019630252100840336,
"loss": 1.3608,
"step": 16
},
{
"epoch": 0.010634970284641852,
"grad_norm": 6.843455791473389,
"learning_rate": 0.00019596638655462186,
"loss": 1.2168,
"step": 17
},
{
"epoch": 0.011260556771973726,
"grad_norm": 3.8736443519592285,
"learning_rate": 0.00019563025210084033,
"loss": 0.7392,
"step": 18
},
{
"epoch": 0.0118861432593056,
"grad_norm": 1.7369539737701416,
"learning_rate": 0.00019529411764705883,
"loss": 1.0495,
"step": 19
},
{
"epoch": 0.012511729746637473,
"grad_norm": 1.1708225011825562,
"learning_rate": 0.0001949579831932773,
"loss": 1.2266,
"step": 20
},
{
"epoch": 0.013137316233969347,
"grad_norm": 1.4693603515625,
"learning_rate": 0.0001946218487394958,
"loss": 1.1364,
"step": 21
},
{
"epoch": 0.01376290272130122,
"grad_norm": 0.8484959602355957,
"learning_rate": 0.0001942857142857143,
"loss": 0.6253,
"step": 22
},
{
"epoch": 0.014388489208633094,
"grad_norm": 2.7237887382507324,
"learning_rate": 0.00019394957983193278,
"loss": 1.2932,
"step": 23
},
{
"epoch": 0.015014075695964968,
"grad_norm": 1.1654947996139526,
"learning_rate": 0.00019361344537815127,
"loss": 0.5659,
"step": 24
},
{
"epoch": 0.01563966218329684,
"grad_norm": 1.7193485498428345,
"learning_rate": 0.00019327731092436975,
"loss": 1.3627,
"step": 25
},
{
"epoch": 0.01563966218329684,
"eval_loss": 1.0974555015563965,
"eval_runtime": 46.8133,
"eval_samples_per_second": 5.469,
"eval_steps_per_second": 2.734,
"step": 25
},
{
"epoch": 0.016265248670628715,
"grad_norm": 2.883988380432129,
"learning_rate": 0.00019294117647058825,
"loss": 0.6257,
"step": 26
},
{
"epoch": 0.01689083515796059,
"grad_norm": 1.4707483053207397,
"learning_rate": 0.00019260504201680674,
"loss": 0.879,
"step": 27
},
{
"epoch": 0.017516421645292463,
"grad_norm": 1.3346422910690308,
"learning_rate": 0.00019226890756302522,
"loss": 1.0058,
"step": 28
},
{
"epoch": 0.018142008132624336,
"grad_norm": 0.5815519094467163,
"learning_rate": 0.00019193277310924372,
"loss": 0.3475,
"step": 29
},
{
"epoch": 0.01876759461995621,
"grad_norm": 0.8800593018531799,
"learning_rate": 0.00019159663865546221,
"loss": 0.5426,
"step": 30
},
{
"epoch": 0.019393181107288084,
"grad_norm": 8.196944236755371,
"learning_rate": 0.0001912605042016807,
"loss": 1.0088,
"step": 31
},
{
"epoch": 0.020018767594619957,
"grad_norm": 3.264193296432495,
"learning_rate": 0.00019092436974789919,
"loss": 0.9319,
"step": 32
},
{
"epoch": 0.02064435408195183,
"grad_norm": 1.1047834157943726,
"learning_rate": 0.00019058823529411766,
"loss": 0.9262,
"step": 33
},
{
"epoch": 0.021269940569283705,
"grad_norm": 1.982783555984497,
"learning_rate": 0.00019025210084033613,
"loss": 1.2904,
"step": 34
},
{
"epoch": 0.021895527056615578,
"grad_norm": 2.6765289306640625,
"learning_rate": 0.00018991596638655463,
"loss": 1.0785,
"step": 35
},
{
"epoch": 0.022521113543947452,
"grad_norm": 4.674818992614746,
"learning_rate": 0.0001895798319327731,
"loss": 0.9822,
"step": 36
},
{
"epoch": 0.023146700031279326,
"grad_norm": 1.6232353448867798,
"learning_rate": 0.0001892436974789916,
"loss": 0.6441,
"step": 37
},
{
"epoch": 0.0237722865186112,
"grad_norm": 2.623237371444702,
"learning_rate": 0.0001889075630252101,
"loss": 0.8874,
"step": 38
},
{
"epoch": 0.024397873005943073,
"grad_norm": 1.4366761445999146,
"learning_rate": 0.00018857142857142857,
"loss": 0.4596,
"step": 39
},
{
"epoch": 0.025023459493274947,
"grad_norm": 1.8809682130813599,
"learning_rate": 0.00018823529411764707,
"loss": 0.887,
"step": 40
},
{
"epoch": 0.02564904598060682,
"grad_norm": 1.081438660621643,
"learning_rate": 0.00018789915966386554,
"loss": 0.4735,
"step": 41
},
{
"epoch": 0.026274632467938694,
"grad_norm": 2.1302649974823,
"learning_rate": 0.00018756302521008404,
"loss": 0.795,
"step": 42
},
{
"epoch": 0.026900218955270568,
"grad_norm": 2.005425453186035,
"learning_rate": 0.00018722689075630254,
"loss": 0.8891,
"step": 43
},
{
"epoch": 0.02752580544260244,
"grad_norm": 1.7256505489349365,
"learning_rate": 0.000186890756302521,
"loss": 0.5993,
"step": 44
},
{
"epoch": 0.028151391929934315,
"grad_norm": 0.927653968334198,
"learning_rate": 0.0001865546218487395,
"loss": 0.6185,
"step": 45
},
{
"epoch": 0.02877697841726619,
"grad_norm": 1.5710850954055786,
"learning_rate": 0.000186218487394958,
"loss": 0.5258,
"step": 46
},
{
"epoch": 0.029402564904598062,
"grad_norm": 1.8794296979904175,
"learning_rate": 0.00018588235294117648,
"loss": 0.8168,
"step": 47
},
{
"epoch": 0.030028151391929936,
"grad_norm": 0.9695333242416382,
"learning_rate": 0.00018554621848739498,
"loss": 0.5458,
"step": 48
},
{
"epoch": 0.03065373787926181,
"grad_norm": 3.7846665382385254,
"learning_rate": 0.00018521008403361345,
"loss": 0.943,
"step": 49
},
{
"epoch": 0.03127932436659368,
"grad_norm": 1.9213052988052368,
"learning_rate": 0.00018487394957983195,
"loss": 0.5069,
"step": 50
},
{
"epoch": 0.03127932436659368,
"eval_loss": 0.9765783548355103,
"eval_runtime": 43.502,
"eval_samples_per_second": 5.885,
"eval_steps_per_second": 2.942,
"step": 50
},
{
"epoch": 0.03190491085392556,
"grad_norm": 2.0580382347106934,
"learning_rate": 0.00018453781512605045,
"loss": 0.9423,
"step": 51
},
{
"epoch": 0.03253049734125743,
"grad_norm": 2.063591957092285,
"learning_rate": 0.0001842016806722689,
"loss": 0.7054,
"step": 52
},
{
"epoch": 0.033156083828589304,
"grad_norm": 1.2656595706939697,
"learning_rate": 0.0001838655462184874,
"loss": 0.401,
"step": 53
},
{
"epoch": 0.03378167031592118,
"grad_norm": 1.2392399311065674,
"learning_rate": 0.0001835294117647059,
"loss": 0.6077,
"step": 54
},
{
"epoch": 0.03440725680325305,
"grad_norm": 0.99504154920578,
"learning_rate": 0.00018319327731092437,
"loss": 0.6313,
"step": 55
},
{
"epoch": 0.035032843290584925,
"grad_norm": 2.0478012561798096,
"learning_rate": 0.00018285714285714286,
"loss": 1.2652,
"step": 56
},
{
"epoch": 0.0356584297779168,
"grad_norm": 0.9636131525039673,
"learning_rate": 0.00018252100840336134,
"loss": 0.7561,
"step": 57
},
{
"epoch": 0.03628401626524867,
"grad_norm": 0.874576210975647,
"learning_rate": 0.00018218487394957984,
"loss": 0.7461,
"step": 58
},
{
"epoch": 0.036909602752580546,
"grad_norm": 1.3745896816253662,
"learning_rate": 0.00018184873949579833,
"loss": 1.2856,
"step": 59
},
{
"epoch": 0.03753518923991242,
"grad_norm": 2.4839162826538086,
"learning_rate": 0.0001815126050420168,
"loss": 1.0574,
"step": 60
},
{
"epoch": 0.038160775727244294,
"grad_norm": 1.2671383619308472,
"learning_rate": 0.0001811764705882353,
"loss": 0.6177,
"step": 61
},
{
"epoch": 0.03878636221457617,
"grad_norm": 1.1862553358078003,
"learning_rate": 0.0001808403361344538,
"loss": 1.1169,
"step": 62
},
{
"epoch": 0.03941194870190804,
"grad_norm": 1.1347297430038452,
"learning_rate": 0.00018050420168067228,
"loss": 1.3303,
"step": 63
},
{
"epoch": 0.040037535189239915,
"grad_norm": 2.1583523750305176,
"learning_rate": 0.00018016806722689078,
"loss": 0.7941,
"step": 64
},
{
"epoch": 0.04066312167657179,
"grad_norm": 1.2432655096054077,
"learning_rate": 0.00017983193277310925,
"loss": 0.7848,
"step": 65
},
{
"epoch": 0.04128870816390366,
"grad_norm": 1.3345468044281006,
"learning_rate": 0.00017949579831932775,
"loss": 0.8953,
"step": 66
},
{
"epoch": 0.041914294651235535,
"grad_norm": 0.6861767768859863,
"learning_rate": 0.00017915966386554625,
"loss": 0.4162,
"step": 67
},
{
"epoch": 0.04253988113856741,
"grad_norm": 0.85309898853302,
"learning_rate": 0.00017882352941176472,
"loss": 0.6606,
"step": 68
},
{
"epoch": 0.04316546762589928,
"grad_norm": 1.0247780084609985,
"learning_rate": 0.00017848739495798322,
"loss": 0.5271,
"step": 69
},
{
"epoch": 0.043791054113231156,
"grad_norm": 1.3019441366195679,
"learning_rate": 0.0001781512605042017,
"loss": 0.5605,
"step": 70
},
{
"epoch": 0.04441664060056303,
"grad_norm": 1.1024900674819946,
"learning_rate": 0.00017781512605042016,
"loss": 0.9303,
"step": 71
},
{
"epoch": 0.045042227087894904,
"grad_norm": 1.079655408859253,
"learning_rate": 0.00017747899159663866,
"loss": 1.0138,
"step": 72
},
{
"epoch": 0.04566781357522678,
"grad_norm": 1.1078468561172485,
"learning_rate": 0.00017714285714285713,
"loss": 0.9861,
"step": 73
},
{
"epoch": 0.04629340006255865,
"grad_norm": 1.8648931980133057,
"learning_rate": 0.00017680672268907563,
"loss": 0.6756,
"step": 74
},
{
"epoch": 0.046918986549890525,
"grad_norm": 0.8588104248046875,
"learning_rate": 0.00017647058823529413,
"loss": 0.4867,
"step": 75
},
{
"epoch": 0.046918986549890525,
"eval_loss": 0.9139823913574219,
"eval_runtime": 43.5635,
"eval_samples_per_second": 5.876,
"eval_steps_per_second": 2.938,
"step": 75
},
{
"epoch": 0.0475445730372224,
"grad_norm": 1.6970480680465698,
"learning_rate": 0.0001761344537815126,
"loss": 0.5523,
"step": 76
},
{
"epoch": 0.04817015952455427,
"grad_norm": 0.8562026023864746,
"learning_rate": 0.0001757983193277311,
"loss": 0.4084,
"step": 77
},
{
"epoch": 0.048795746011886146,
"grad_norm": 0.9487925171852112,
"learning_rate": 0.0001754621848739496,
"loss": 0.6204,
"step": 78
},
{
"epoch": 0.04942133249921802,
"grad_norm": 11.929024696350098,
"learning_rate": 0.00017512605042016807,
"loss": 1.1662,
"step": 79
},
{
"epoch": 0.05004691898654989,
"grad_norm": 1.3468140363693237,
"learning_rate": 0.00017478991596638657,
"loss": 0.8037,
"step": 80
},
{
"epoch": 0.05067250547388177,
"grad_norm": 0.7379503846168518,
"learning_rate": 0.00017445378151260504,
"loss": 0.6564,
"step": 81
},
{
"epoch": 0.05129809196121364,
"grad_norm": 1.0315027236938477,
"learning_rate": 0.00017411764705882354,
"loss": 0.6377,
"step": 82
},
{
"epoch": 0.051923678448545514,
"grad_norm": 0.5900093913078308,
"learning_rate": 0.00017378151260504204,
"loss": 0.5122,
"step": 83
},
{
"epoch": 0.05254926493587739,
"grad_norm": 1.5138239860534668,
"learning_rate": 0.0001734453781512605,
"loss": 0.4769,
"step": 84
},
{
"epoch": 0.05317485142320926,
"grad_norm": 1.016790747642517,
"learning_rate": 0.000173109243697479,
"loss": 0.6654,
"step": 85
},
{
"epoch": 0.053800437910541135,
"grad_norm": 1.1964718103408813,
"learning_rate": 0.00017277310924369748,
"loss": 0.6334,
"step": 86
},
{
"epoch": 0.05442602439787301,
"grad_norm": 1.102842092514038,
"learning_rate": 0.00017243697478991598,
"loss": 0.832,
"step": 87
},
{
"epoch": 0.05505161088520488,
"grad_norm": 6.609305381774902,
"learning_rate": 0.00017210084033613448,
"loss": 0.6112,
"step": 88
},
{
"epoch": 0.055677197372536756,
"grad_norm": 2.6627745628356934,
"learning_rate": 0.00017176470588235293,
"loss": 1.032,
"step": 89
},
{
"epoch": 0.05630278385986863,
"grad_norm": 2.114955425262451,
"learning_rate": 0.00017142857142857143,
"loss": 0.6116,
"step": 90
},
{
"epoch": 0.0569283703472005,
"grad_norm": 1.7707552909851074,
"learning_rate": 0.00017109243697478992,
"loss": 0.4766,
"step": 91
},
{
"epoch": 0.05755395683453238,
"grad_norm": 0.9983264803886414,
"learning_rate": 0.0001707563025210084,
"loss": 0.5397,
"step": 92
},
{
"epoch": 0.05817954332186425,
"grad_norm": 8.190524101257324,
"learning_rate": 0.0001704201680672269,
"loss": 0.9531,
"step": 93
},
{
"epoch": 0.058805129809196124,
"grad_norm": 1.9920661449432373,
"learning_rate": 0.0001700840336134454,
"loss": 1.3801,
"step": 94
},
{
"epoch": 0.059430716296528,
"grad_norm": 0.8791856169700623,
"learning_rate": 0.00016974789915966387,
"loss": 0.6218,
"step": 95
},
{
"epoch": 0.06005630278385987,
"grad_norm": 1.0745537281036377,
"learning_rate": 0.00016941176470588237,
"loss": 0.5578,
"step": 96
},
{
"epoch": 0.060681889271191745,
"grad_norm": 1.4266705513000488,
"learning_rate": 0.00016907563025210084,
"loss": 1.5821,
"step": 97
},
{
"epoch": 0.06130747575852362,
"grad_norm": 1.1001832485198975,
"learning_rate": 0.00016873949579831934,
"loss": 0.5972,
"step": 98
},
{
"epoch": 0.06193306224585549,
"grad_norm": 1.3168463706970215,
"learning_rate": 0.00016840336134453784,
"loss": 0.5794,
"step": 99
},
{
"epoch": 0.06255864873318737,
"grad_norm": 1.0342196226119995,
"learning_rate": 0.0001680672268907563,
"loss": 0.6827,
"step": 100
},
{
"epoch": 0.06255864873318737,
"eval_loss": 0.8885337114334106,
"eval_runtime": 43.4886,
"eval_samples_per_second": 5.887,
"eval_steps_per_second": 2.943,
"step": 100
},
{
"epoch": 0.06318423522051923,
"grad_norm": 2.2497031688690186,
"learning_rate": 0.0001677310924369748,
"loss": 0.6468,
"step": 101
},
{
"epoch": 0.06380982170785111,
"grad_norm": 0.8061516284942627,
"learning_rate": 0.00016739495798319328,
"loss": 0.5388,
"step": 102
},
{
"epoch": 0.06443540819518298,
"grad_norm": 0.6954531669616699,
"learning_rate": 0.00016705882352941178,
"loss": 0.3191,
"step": 103
},
{
"epoch": 0.06506099468251486,
"grad_norm": 1.3721911907196045,
"learning_rate": 0.00016672268907563028,
"loss": 0.9,
"step": 104
},
{
"epoch": 0.06568658116984673,
"grad_norm": 1.084492564201355,
"learning_rate": 0.00016638655462184875,
"loss": 0.6144,
"step": 105
},
{
"epoch": 0.06631216765717861,
"grad_norm": 3.317697525024414,
"learning_rate": 0.00016605042016806725,
"loss": 0.634,
"step": 106
},
{
"epoch": 0.06693775414451048,
"grad_norm": 2.5598530769348145,
"learning_rate": 0.00016571428571428575,
"loss": 0.8931,
"step": 107
},
{
"epoch": 0.06756334063184236,
"grad_norm": 3.6414177417755127,
"learning_rate": 0.0001653781512605042,
"loss": 0.7226,
"step": 108
},
{
"epoch": 0.06818892711917422,
"grad_norm": 2.2443768978118896,
"learning_rate": 0.0001650420168067227,
"loss": 0.8862,
"step": 109
},
{
"epoch": 0.0688145136065061,
"grad_norm": 0.6285691857337952,
"learning_rate": 0.0001647058823529412,
"loss": 0.3766,
"step": 110
},
{
"epoch": 0.06944010009383797,
"grad_norm": 0.6171959042549133,
"learning_rate": 0.00016436974789915966,
"loss": 0.2821,
"step": 111
},
{
"epoch": 0.07006568658116985,
"grad_norm": 1.0057804584503174,
"learning_rate": 0.00016403361344537816,
"loss": 0.6293,
"step": 112
},
{
"epoch": 0.07069127306850172,
"grad_norm": 1.3190034627914429,
"learning_rate": 0.00016369747899159663,
"loss": 0.5547,
"step": 113
},
{
"epoch": 0.0713168595558336,
"grad_norm": 0.518517017364502,
"learning_rate": 0.00016336134453781513,
"loss": 0.1951,
"step": 114
},
{
"epoch": 0.07194244604316546,
"grad_norm": 0.848175048828125,
"learning_rate": 0.00016302521008403363,
"loss": 0.5091,
"step": 115
},
{
"epoch": 0.07256803253049735,
"grad_norm": 0.7387409806251526,
"learning_rate": 0.0001626890756302521,
"loss": 0.3872,
"step": 116
},
{
"epoch": 0.07319361901782921,
"grad_norm": 2.828091859817505,
"learning_rate": 0.0001623529411764706,
"loss": 1.2046,
"step": 117
},
{
"epoch": 0.07381920550516109,
"grad_norm": 1.7653822898864746,
"learning_rate": 0.00016201680672268907,
"loss": 1.8133,
"step": 118
},
{
"epoch": 0.07444479199249296,
"grad_norm": 3.5097360610961914,
"learning_rate": 0.00016168067226890757,
"loss": 0.6837,
"step": 119
},
{
"epoch": 0.07507037847982484,
"grad_norm": 1.3884797096252441,
"learning_rate": 0.00016134453781512607,
"loss": 0.8846,
"step": 120
},
{
"epoch": 0.0756959649671567,
"grad_norm": 22.705190658569336,
"learning_rate": 0.00016100840336134454,
"loss": 0.7281,
"step": 121
},
{
"epoch": 0.07632155145448859,
"grad_norm": 3.1223599910736084,
"learning_rate": 0.00016067226890756304,
"loss": 0.6254,
"step": 122
},
{
"epoch": 0.07694713794182045,
"grad_norm": 0.530583381652832,
"learning_rate": 0.00016033613445378154,
"loss": 0.3292,
"step": 123
},
{
"epoch": 0.07757272442915233,
"grad_norm": 1.4720183610916138,
"learning_rate": 0.00016,
"loss": 0.8192,
"step": 124
},
{
"epoch": 0.0781983109164842,
"grad_norm": 0.6448870301246643,
"learning_rate": 0.0001596638655462185,
"loss": 0.2431,
"step": 125
},
{
"epoch": 0.0781983109164842,
"eval_loss": 0.890012800693512,
"eval_runtime": 43.5059,
"eval_samples_per_second": 5.884,
"eval_steps_per_second": 2.942,
"step": 125
},
{
"epoch": 0.07882389740381608,
"grad_norm": 1.803906798362732,
"learning_rate": 0.00015932773109243698,
"loss": 0.8937,
"step": 126
},
{
"epoch": 0.07944948389114795,
"grad_norm": 2.2447054386138916,
"learning_rate": 0.00015899159663865546,
"loss": 0.6993,
"step": 127
},
{
"epoch": 0.08007507037847983,
"grad_norm": 0.6667381525039673,
"learning_rate": 0.00015865546218487396,
"loss": 0.4266,
"step": 128
},
{
"epoch": 0.0807006568658117,
"grad_norm": 1.1449408531188965,
"learning_rate": 0.00015831932773109243,
"loss": 0.5557,
"step": 129
},
{
"epoch": 0.08132624335314358,
"grad_norm": 1.399849534034729,
"learning_rate": 0.00015798319327731093,
"loss": 0.6761,
"step": 130
},
{
"epoch": 0.08195182984047544,
"grad_norm": 0.745627760887146,
"learning_rate": 0.00015764705882352943,
"loss": 0.5323,
"step": 131
},
{
"epoch": 0.08257741632780732,
"grad_norm": 1.162428379058838,
"learning_rate": 0.0001573109243697479,
"loss": 0.8231,
"step": 132
},
{
"epoch": 0.08320300281513919,
"grad_norm": 1.0329734086990356,
"learning_rate": 0.0001569747899159664,
"loss": 0.6179,
"step": 133
},
{
"epoch": 0.08382858930247107,
"grad_norm": 0.5739912986755371,
"learning_rate": 0.00015663865546218487,
"loss": 0.2515,
"step": 134
},
{
"epoch": 0.08445417578980294,
"grad_norm": 1.2065409421920776,
"learning_rate": 0.00015630252100840337,
"loss": 0.6161,
"step": 135
},
{
"epoch": 0.08507976227713482,
"grad_norm": 1.1025582551956177,
"learning_rate": 0.00015596638655462187,
"loss": 0.5926,
"step": 136
},
{
"epoch": 0.08570534876446669,
"grad_norm": 0.78680020570755,
"learning_rate": 0.00015563025210084034,
"loss": 0.9987,
"step": 137
},
{
"epoch": 0.08633093525179857,
"grad_norm": 0.6232782006263733,
"learning_rate": 0.00015529411764705884,
"loss": 0.4952,
"step": 138
},
{
"epoch": 0.08695652173913043,
"grad_norm": 3.347989559173584,
"learning_rate": 0.00015495798319327734,
"loss": 1.0787,
"step": 139
},
{
"epoch": 0.08758210822646231,
"grad_norm": 0.9020625352859497,
"learning_rate": 0.0001546218487394958,
"loss": 0.354,
"step": 140
},
{
"epoch": 0.08820769471379418,
"grad_norm": 1.8955539464950562,
"learning_rate": 0.0001542857142857143,
"loss": 0.5515,
"step": 141
},
{
"epoch": 0.08883328120112606,
"grad_norm": 5.194116115570068,
"learning_rate": 0.00015394957983193278,
"loss": 0.6843,
"step": 142
},
{
"epoch": 0.08945886768845793,
"grad_norm": 1.4467953443527222,
"learning_rate": 0.00015361344537815128,
"loss": 0.4236,
"step": 143
},
{
"epoch": 0.09008445417578981,
"grad_norm": 0.523921012878418,
"learning_rate": 0.00015327731092436978,
"loss": 0.2165,
"step": 144
},
{
"epoch": 0.09071004066312167,
"grad_norm": 1.653648018836975,
"learning_rate": 0.00015294117647058822,
"loss": 1.0643,
"step": 145
},
{
"epoch": 0.09133562715045355,
"grad_norm": 0.6991509199142456,
"learning_rate": 0.00015260504201680672,
"loss": 0.4398,
"step": 146
},
{
"epoch": 0.09196121363778542,
"grad_norm": 1.3986660242080688,
"learning_rate": 0.00015226890756302522,
"loss": 0.8488,
"step": 147
},
{
"epoch": 0.0925868001251173,
"grad_norm": 1.2424954175949097,
"learning_rate": 0.0001519327731092437,
"loss": 0.9516,
"step": 148
},
{
"epoch": 0.09321238661244917,
"grad_norm": 0.8900560140609741,
"learning_rate": 0.0001515966386554622,
"loss": 0.767,
"step": 149
},
{
"epoch": 0.09383797309978105,
"grad_norm": 40.042503356933594,
"learning_rate": 0.00015126050420168066,
"loss": 0.9691,
"step": 150
},
{
"epoch": 0.09383797309978105,
"eval_loss": 0.8660734295845032,
"eval_runtime": 43.5102,
"eval_samples_per_second": 5.884,
"eval_steps_per_second": 2.942,
"step": 150
},
{
"epoch": 0.09446355958711292,
"grad_norm": 2.816359519958496,
"learning_rate": 0.00015092436974789916,
"loss": 1.4959,
"step": 151
},
{
"epoch": 0.0950891460744448,
"grad_norm": 1.9332157373428345,
"learning_rate": 0.00015058823529411766,
"loss": 0.6786,
"step": 152
},
{
"epoch": 0.09571473256177666,
"grad_norm": 1.2608965635299683,
"learning_rate": 0.00015025210084033613,
"loss": 1.1282,
"step": 153
},
{
"epoch": 0.09634031904910854,
"grad_norm": 1.0167793035507202,
"learning_rate": 0.00014991596638655463,
"loss": 0.4932,
"step": 154
},
{
"epoch": 0.09696590553644041,
"grad_norm": 1.6121408939361572,
"learning_rate": 0.00014957983193277313,
"loss": 0.7193,
"step": 155
},
{
"epoch": 0.09759149202377229,
"grad_norm": 2.4104394912719727,
"learning_rate": 0.0001492436974789916,
"loss": 0.4472,
"step": 156
},
{
"epoch": 0.09821707851110416,
"grad_norm": 1.1095707416534424,
"learning_rate": 0.0001489075630252101,
"loss": 0.7595,
"step": 157
},
{
"epoch": 0.09884266499843604,
"grad_norm": 1.686458945274353,
"learning_rate": 0.00014857142857142857,
"loss": 0.5686,
"step": 158
},
{
"epoch": 0.0994682514857679,
"grad_norm": 3.2238378524780273,
"learning_rate": 0.00014823529411764707,
"loss": 0.4236,
"step": 159
},
{
"epoch": 0.10009383797309979,
"grad_norm": 1.800552248954773,
"learning_rate": 0.00014789915966386557,
"loss": 0.9519,
"step": 160
},
{
"epoch": 0.10071942446043165,
"grad_norm": 0.6441445350646973,
"learning_rate": 0.00014756302521008404,
"loss": 0.4119,
"step": 161
},
{
"epoch": 0.10134501094776353,
"grad_norm": 0.5892903804779053,
"learning_rate": 0.00014722689075630254,
"loss": 0.2956,
"step": 162
},
{
"epoch": 0.1019705974350954,
"grad_norm": 0.8733301758766174,
"learning_rate": 0.00014689075630252101,
"loss": 0.5749,
"step": 163
},
{
"epoch": 0.10259618392242728,
"grad_norm": 1.0460662841796875,
"learning_rate": 0.0001465546218487395,
"loss": 0.8167,
"step": 164
},
{
"epoch": 0.10322177040975915,
"grad_norm": 0.8178017735481262,
"learning_rate": 0.00014621848739495799,
"loss": 0.9027,
"step": 165
},
{
"epoch": 0.10384735689709103,
"grad_norm": 0.5698068737983704,
"learning_rate": 0.00014588235294117646,
"loss": 0.1829,
"step": 166
},
{
"epoch": 0.1044729433844229,
"grad_norm": 1.0011018514633179,
"learning_rate": 0.00014554621848739496,
"loss": 0.8985,
"step": 167
},
{
"epoch": 0.10509852987175478,
"grad_norm": 1.189772367477417,
"learning_rate": 0.00014521008403361346,
"loss": 0.5547,
"step": 168
},
{
"epoch": 0.10572411635908664,
"grad_norm": 0.7990069389343262,
"learning_rate": 0.00014487394957983193,
"loss": 0.6222,
"step": 169
},
{
"epoch": 0.10634970284641852,
"grad_norm": 0.6419771313667297,
"learning_rate": 0.00014453781512605043,
"loss": 0.3225,
"step": 170
},
{
"epoch": 0.10697528933375039,
"grad_norm": 0.8978354930877686,
"learning_rate": 0.00014420168067226893,
"loss": 0.4567,
"step": 171
},
{
"epoch": 0.10760087582108227,
"grad_norm": 0.7193794250488281,
"learning_rate": 0.0001438655462184874,
"loss": 0.4793,
"step": 172
},
{
"epoch": 0.10822646230841414,
"grad_norm": 0.9533759355545044,
"learning_rate": 0.0001435294117647059,
"loss": 1.4397,
"step": 173
},
{
"epoch": 0.10885204879574602,
"grad_norm": 0.48348739743232727,
"learning_rate": 0.00014319327731092437,
"loss": 0.3398,
"step": 174
},
{
"epoch": 0.10947763528307788,
"grad_norm": 0.7699019312858582,
"learning_rate": 0.00014285714285714287,
"loss": 0.7491,
"step": 175
},
{
"epoch": 0.10947763528307788,
"eval_loss": 0.8425782322883606,
"eval_runtime": 43.5013,
"eval_samples_per_second": 5.885,
"eval_steps_per_second": 2.942,
"step": 175
},
{
"epoch": 0.11010322177040976,
"grad_norm": 0.9201186895370483,
"learning_rate": 0.00014252100840336137,
"loss": 0.6919,
"step": 176
},
{
"epoch": 0.11072880825774163,
"grad_norm": 0.8190593123435974,
"learning_rate": 0.00014218487394957984,
"loss": 0.6262,
"step": 177
},
{
"epoch": 0.11135439474507351,
"grad_norm": 0.9715782403945923,
"learning_rate": 0.00014184873949579834,
"loss": 0.8364,
"step": 178
},
{
"epoch": 0.11197998123240538,
"grad_norm": 0.6699782609939575,
"learning_rate": 0.0001415126050420168,
"loss": 0.4898,
"step": 179
},
{
"epoch": 0.11260556771973726,
"grad_norm": 1.8386518955230713,
"learning_rate": 0.0001411764705882353,
"loss": 0.7812,
"step": 180
},
{
"epoch": 0.11323115420706913,
"grad_norm": 0.7240263819694519,
"learning_rate": 0.0001408403361344538,
"loss": 0.5508,
"step": 181
},
{
"epoch": 0.113856740694401,
"grad_norm": 0.6068630814552307,
"learning_rate": 0.00014050420168067225,
"loss": 0.5151,
"step": 182
},
{
"epoch": 0.11448232718173287,
"grad_norm": 1.6705517768859863,
"learning_rate": 0.00014016806722689075,
"loss": 1.2281,
"step": 183
},
{
"epoch": 0.11510791366906475,
"grad_norm": 1.6179956197738647,
"learning_rate": 0.00013983193277310925,
"loss": 0.7365,
"step": 184
},
{
"epoch": 0.11573350015639662,
"grad_norm": 1.5741758346557617,
"learning_rate": 0.00013949579831932772,
"loss": 1.0039,
"step": 185
},
{
"epoch": 0.1163590866437285,
"grad_norm": 0.9270511865615845,
"learning_rate": 0.00013915966386554622,
"loss": 0.5768,
"step": 186
},
{
"epoch": 0.11698467313106037,
"grad_norm": 1.3651914596557617,
"learning_rate": 0.00013882352941176472,
"loss": 0.7715,
"step": 187
},
{
"epoch": 0.11761025961839225,
"grad_norm": 1.4330601692199707,
"learning_rate": 0.0001384873949579832,
"loss": 0.4462,
"step": 188
},
{
"epoch": 0.11823584610572412,
"grad_norm": 0.9181672930717468,
"learning_rate": 0.0001381512605042017,
"loss": 0.3901,
"step": 189
},
{
"epoch": 0.118861432593056,
"grad_norm": 0.5304622650146484,
"learning_rate": 0.00013781512605042016,
"loss": 0.1718,
"step": 190
},
{
"epoch": 0.11948701908038786,
"grad_norm": 0.7475191354751587,
"learning_rate": 0.00013747899159663866,
"loss": 0.3602,
"step": 191
},
{
"epoch": 0.12011260556771974,
"grad_norm": 1.2558002471923828,
"learning_rate": 0.00013714285714285716,
"loss": 0.8558,
"step": 192
},
{
"epoch": 0.12073819205505161,
"grad_norm": 0.9859037399291992,
"learning_rate": 0.00013680672268907563,
"loss": 0.7155,
"step": 193
},
{
"epoch": 0.12136377854238349,
"grad_norm": 0.6028466820716858,
"learning_rate": 0.00013647058823529413,
"loss": 0.9596,
"step": 194
},
{
"epoch": 0.12198936502971536,
"grad_norm": 0.5713469386100769,
"learning_rate": 0.0001361344537815126,
"loss": 0.3442,
"step": 195
},
{
"epoch": 0.12261495151704724,
"grad_norm": 1.0781211853027344,
"learning_rate": 0.0001357983193277311,
"loss": 0.5569,
"step": 196
},
{
"epoch": 0.1232405380043791,
"grad_norm": 0.7850176095962524,
"learning_rate": 0.0001354621848739496,
"loss": 0.5853,
"step": 197
},
{
"epoch": 0.12386612449171099,
"grad_norm": 0.8100555539131165,
"learning_rate": 0.00013512605042016807,
"loss": 0.8285,
"step": 198
},
{
"epoch": 0.12449171097904285,
"grad_norm": 1.106834888458252,
"learning_rate": 0.00013478991596638657,
"loss": 0.9521,
"step": 199
},
{
"epoch": 0.12511729746637473,
"grad_norm": 1.4412230253219604,
"learning_rate": 0.00013445378151260507,
"loss": 0.6478,
"step": 200
},
{
"epoch": 0.12511729746637473,
"eval_loss": 0.8300326466560364,
"eval_runtime": 43.5102,
"eval_samples_per_second": 5.884,
"eval_steps_per_second": 2.942,
"step": 200
},
{
"epoch": 0.1257428839537066,
"grad_norm": 1.7852795124053955,
"learning_rate": 0.00013411764705882352,
"loss": 0.5687,
"step": 201
},
{
"epoch": 0.12636847044103847,
"grad_norm": 2.423583745956421,
"learning_rate": 0.00013378151260504202,
"loss": 0.9082,
"step": 202
},
{
"epoch": 0.12699405692837035,
"grad_norm": 1.538001298904419,
"learning_rate": 0.00013344537815126052,
"loss": 0.7143,
"step": 203
},
{
"epoch": 0.12761964341570223,
"grad_norm": 1.7380592823028564,
"learning_rate": 0.000133109243697479,
"loss": 0.8296,
"step": 204
},
{
"epoch": 0.1282452299030341,
"grad_norm": 0.8279218673706055,
"learning_rate": 0.0001327731092436975,
"loss": 0.6719,
"step": 205
},
{
"epoch": 0.12887081639036596,
"grad_norm": 0.7059926986694336,
"learning_rate": 0.00013243697478991596,
"loss": 0.4785,
"step": 206
},
{
"epoch": 0.12949640287769784,
"grad_norm": 0.6946935653686523,
"learning_rate": 0.00013210084033613446,
"loss": 0.4578,
"step": 207
},
{
"epoch": 0.13012198936502972,
"grad_norm": 0.9800712466239929,
"learning_rate": 0.00013176470588235296,
"loss": 1.4369,
"step": 208
},
{
"epoch": 0.1307475758523616,
"grad_norm": 0.708831787109375,
"learning_rate": 0.00013142857142857143,
"loss": 0.5071,
"step": 209
},
{
"epoch": 0.13137316233969346,
"grad_norm": 1.0098780393600464,
"learning_rate": 0.00013109243697478993,
"loss": 0.9155,
"step": 210
},
{
"epoch": 0.13199874882702534,
"grad_norm": 1.1598243713378906,
"learning_rate": 0.0001307563025210084,
"loss": 0.3757,
"step": 211
},
{
"epoch": 0.13262433531435722,
"grad_norm": 0.7583935260772705,
"learning_rate": 0.0001304201680672269,
"loss": 0.3365,
"step": 212
},
{
"epoch": 0.1332499218016891,
"grad_norm": 1.0866564512252808,
"learning_rate": 0.0001300840336134454,
"loss": 0.6398,
"step": 213
},
{
"epoch": 0.13387550828902095,
"grad_norm": 1.4322006702423096,
"learning_rate": 0.00012974789915966387,
"loss": 0.6427,
"step": 214
},
{
"epoch": 0.13450109477635283,
"grad_norm": 1.600325345993042,
"learning_rate": 0.00012941176470588237,
"loss": 0.6884,
"step": 215
},
{
"epoch": 0.1351266812636847,
"grad_norm": 1.0634167194366455,
"learning_rate": 0.00012907563025210087,
"loss": 1.0343,
"step": 216
},
{
"epoch": 0.13575226775101656,
"grad_norm": 0.9889366626739502,
"learning_rate": 0.00012873949579831934,
"loss": 0.717,
"step": 217
},
{
"epoch": 0.13637785423834844,
"grad_norm": 2.0635392665863037,
"learning_rate": 0.00012840336134453784,
"loss": 0.5965,
"step": 218
},
{
"epoch": 0.13700344072568033,
"grad_norm": 0.8937773704528809,
"learning_rate": 0.0001280672268907563,
"loss": 0.7281,
"step": 219
},
{
"epoch": 0.1376290272130122,
"grad_norm": 0.9768427014350891,
"learning_rate": 0.00012773109243697478,
"loss": 0.5687,
"step": 220
},
{
"epoch": 0.13825461370034406,
"grad_norm": 1.3913767337799072,
"learning_rate": 0.00012739495798319328,
"loss": 0.3984,
"step": 221
},
{
"epoch": 0.13888020018767594,
"grad_norm": 1.4933342933654785,
"learning_rate": 0.00012705882352941175,
"loss": 1.2441,
"step": 222
},
{
"epoch": 0.13950578667500782,
"grad_norm": 1.0846196413040161,
"learning_rate": 0.00012672268907563025,
"loss": 0.9013,
"step": 223
},
{
"epoch": 0.1401313731623397,
"grad_norm": 0.7788563370704651,
"learning_rate": 0.00012638655462184875,
"loss": 0.4674,
"step": 224
},
{
"epoch": 0.14075695964967155,
"grad_norm": 0.7341142296791077,
"learning_rate": 0.00012605042016806722,
"loss": 1.3271,
"step": 225
},
{
"epoch": 0.14075695964967155,
"eval_loss": 0.8179877996444702,
"eval_runtime": 43.5514,
"eval_samples_per_second": 5.878,
"eval_steps_per_second": 2.939,
"step": 225
},
{
"epoch": 0.14138254613700343,
"grad_norm": 6.473598480224609,
"learning_rate": 0.00012571428571428572,
"loss": 0.6219,
"step": 226
},
{
"epoch": 0.14200813262433531,
"grad_norm": 0.9846400022506714,
"learning_rate": 0.0001253781512605042,
"loss": 0.4407,
"step": 227
},
{
"epoch": 0.1426337191116672,
"grad_norm": 0.7880604267120361,
"learning_rate": 0.0001250420168067227,
"loss": 0.3927,
"step": 228
},
{
"epoch": 0.14325930559899905,
"grad_norm": 1.5999399423599243,
"learning_rate": 0.0001247058823529412,
"loss": 0.6917,
"step": 229
},
{
"epoch": 0.14388489208633093,
"grad_norm": 0.8072729110717773,
"learning_rate": 0.00012436974789915966,
"loss": 0.4909,
"step": 230
},
{
"epoch": 0.1445104785736628,
"grad_norm": 2.2560601234436035,
"learning_rate": 0.00012403361344537816,
"loss": 0.3355,
"step": 231
},
{
"epoch": 0.1451360650609947,
"grad_norm": 0.9964832663536072,
"learning_rate": 0.00012369747899159666,
"loss": 0.4436,
"step": 232
},
{
"epoch": 0.14576165154832654,
"grad_norm": 1.1081007719039917,
"learning_rate": 0.00012336134453781513,
"loss": 0.6582,
"step": 233
},
{
"epoch": 0.14638723803565842,
"grad_norm": 0.9722908735275269,
"learning_rate": 0.00012302521008403363,
"loss": 0.7412,
"step": 234
},
{
"epoch": 0.1470128245229903,
"grad_norm": 0.7456592917442322,
"learning_rate": 0.0001226890756302521,
"loss": 0.4303,
"step": 235
},
{
"epoch": 0.14763841101032218,
"grad_norm": 1.0428457260131836,
"learning_rate": 0.0001223529411764706,
"loss": 1.0538,
"step": 236
},
{
"epoch": 0.14826399749765404,
"grad_norm": 0.9209719896316528,
"learning_rate": 0.00012201680672268909,
"loss": 0.5864,
"step": 237
},
{
"epoch": 0.14888958398498592,
"grad_norm": 0.990292489528656,
"learning_rate": 0.00012168067226890756,
"loss": 0.5929,
"step": 238
},
{
"epoch": 0.1495151704723178,
"grad_norm": 0.6086494326591492,
"learning_rate": 0.00012134453781512605,
"loss": 0.4436,
"step": 239
},
{
"epoch": 0.15014075695964968,
"grad_norm": 1.429149866104126,
"learning_rate": 0.00012100840336134453,
"loss": 0.246,
"step": 240
},
{
"epoch": 0.15076634344698153,
"grad_norm": 1.8170491456985474,
"learning_rate": 0.00012067226890756302,
"loss": 0.6574,
"step": 241
},
{
"epoch": 0.1513919299343134,
"grad_norm": 1.1577768325805664,
"learning_rate": 0.00012033613445378152,
"loss": 0.5706,
"step": 242
},
{
"epoch": 0.1520175164216453,
"grad_norm": 0.7442137598991394,
"learning_rate": 0.00012,
"loss": 0.2772,
"step": 243
},
{
"epoch": 0.15264310290897717,
"grad_norm": 1.1375997066497803,
"learning_rate": 0.00011966386554621849,
"loss": 0.397,
"step": 244
},
{
"epoch": 0.15326868939630903,
"grad_norm": 0.8451513648033142,
"learning_rate": 0.00011932773109243697,
"loss": 0.5425,
"step": 245
},
{
"epoch": 0.1538942758836409,
"grad_norm": 0.7176560163497925,
"learning_rate": 0.00011899159663865547,
"loss": 0.4398,
"step": 246
},
{
"epoch": 0.1545198623709728,
"grad_norm": 1.049872875213623,
"learning_rate": 0.00011865546218487396,
"loss": 0.6479,
"step": 247
},
{
"epoch": 0.15514544885830467,
"grad_norm": 0.6093642115592957,
"learning_rate": 0.00011831932773109244,
"loss": 0.6125,
"step": 248
},
{
"epoch": 0.15577103534563652,
"grad_norm": 0.9963379502296448,
"learning_rate": 0.00011798319327731093,
"loss": 0.3768,
"step": 249
},
{
"epoch": 0.1563966218329684,
"grad_norm": 3.4668896198272705,
"learning_rate": 0.00011764705882352942,
"loss": 0.3744,
"step": 250
},
{
"epoch": 0.1563966218329684,
"eval_loss": 0.8456696271896362,
"eval_runtime": 43.5223,
"eval_samples_per_second": 5.882,
"eval_steps_per_second": 2.941,
"step": 250
},
{
"epoch": 0.15702220832030028,
"grad_norm": 0.6826130747795105,
"learning_rate": 0.00011731092436974791,
"loss": 0.4877,
"step": 251
},
{
"epoch": 0.15764779480763216,
"grad_norm": 1.8045300245285034,
"learning_rate": 0.0001169747899159664,
"loss": 0.9699,
"step": 252
},
{
"epoch": 0.15827338129496402,
"grad_norm": 0.7311923503875732,
"learning_rate": 0.00011663865546218489,
"loss": 0.4648,
"step": 253
},
{
"epoch": 0.1588989677822959,
"grad_norm": 1.7481943368911743,
"learning_rate": 0.00011630252100840337,
"loss": 0.8871,
"step": 254
},
{
"epoch": 0.15952455426962778,
"grad_norm": 2.6331326961517334,
"learning_rate": 0.00011596638655462187,
"loss": 0.8109,
"step": 255
},
{
"epoch": 0.16015014075695966,
"grad_norm": 0.899364709854126,
"learning_rate": 0.00011563025210084036,
"loss": 0.5021,
"step": 256
},
{
"epoch": 0.1607757272442915,
"grad_norm": 0.922218918800354,
"learning_rate": 0.00011529411764705881,
"loss": 0.5741,
"step": 257
},
{
"epoch": 0.1614013137316234,
"grad_norm": 5.335756301879883,
"learning_rate": 0.00011495798319327731,
"loss": 0.842,
"step": 258
},
{
"epoch": 0.16202690021895527,
"grad_norm": 0.8632665872573853,
"learning_rate": 0.0001146218487394958,
"loss": 0.4208,
"step": 259
},
{
"epoch": 0.16265248670628715,
"grad_norm": 4.576591968536377,
"learning_rate": 0.00011428571428571428,
"loss": 0.8813,
"step": 260
},
{
"epoch": 0.163278073193619,
"grad_norm": 0.907714307308197,
"learning_rate": 0.00011394957983193277,
"loss": 0.7204,
"step": 261
},
{
"epoch": 0.16390365968095089,
"grad_norm": 0.8328534960746765,
"learning_rate": 0.00011361344537815127,
"loss": 0.7552,
"step": 262
},
{
"epoch": 0.16452924616828277,
"grad_norm": 1.0882028341293335,
"learning_rate": 0.00011327731092436975,
"loss": 0.9079,
"step": 263
},
{
"epoch": 0.16515483265561465,
"grad_norm": 1.0093358755111694,
"learning_rate": 0.00011294117647058824,
"loss": 0.6284,
"step": 264
},
{
"epoch": 0.1657804191429465,
"grad_norm": 0.853907585144043,
"learning_rate": 0.00011260504201680672,
"loss": 0.508,
"step": 265
},
{
"epoch": 0.16640600563027838,
"grad_norm": 1.0016460418701172,
"learning_rate": 0.00011226890756302521,
"loss": 0.597,
"step": 266
},
{
"epoch": 0.16703159211761026,
"grad_norm": 1.0138968229293823,
"learning_rate": 0.00011193277310924371,
"loss": 0.9238,
"step": 267
},
{
"epoch": 0.16765717860494214,
"grad_norm": 1.1728049516677856,
"learning_rate": 0.0001115966386554622,
"loss": 0.9152,
"step": 268
},
{
"epoch": 0.168282765092274,
"grad_norm": 1.2228264808654785,
"learning_rate": 0.00011126050420168068,
"loss": 0.7483,
"step": 269
},
{
"epoch": 0.16890835157960588,
"grad_norm": 0.6260212659835815,
"learning_rate": 0.00011092436974789917,
"loss": 0.5566,
"step": 270
},
{
"epoch": 0.16953393806693776,
"grad_norm": 0.7589625716209412,
"learning_rate": 0.00011058823529411766,
"loss": 0.6242,
"step": 271
},
{
"epoch": 0.17015952455426964,
"grad_norm": 1.1016935110092163,
"learning_rate": 0.00011025210084033615,
"loss": 0.4419,
"step": 272
},
{
"epoch": 0.1707851110416015,
"grad_norm": 0.8092851042747498,
"learning_rate": 0.00010991596638655464,
"loss": 0.5168,
"step": 273
},
{
"epoch": 0.17141069752893337,
"grad_norm": 1.012885332107544,
"learning_rate": 0.00010957983193277312,
"loss": 0.4334,
"step": 274
},
{
"epoch": 0.17203628401626525,
"grad_norm": 2.6073336601257324,
"learning_rate": 0.00010924369747899159,
"loss": 0.5262,
"step": 275
},
{
"epoch": 0.17203628401626525,
"eval_loss": 0.8115787506103516,
"eval_runtime": 43.4931,
"eval_samples_per_second": 5.886,
"eval_steps_per_second": 2.943,
"step": 275
},
{
"epoch": 0.17266187050359713,
"grad_norm": 5.577237606048584,
"learning_rate": 0.00010890756302521008,
"loss": 1.0595,
"step": 276
},
{
"epoch": 0.17328745699092898,
"grad_norm": 1.1434190273284912,
"learning_rate": 0.00010857142857142856,
"loss": 0.4401,
"step": 277
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.951992928981781,
"learning_rate": 0.00010823529411764706,
"loss": 0.4393,
"step": 278
},
{
"epoch": 0.17453862996559275,
"grad_norm": 0.6695138216018677,
"learning_rate": 0.00010789915966386555,
"loss": 0.314,
"step": 279
},
{
"epoch": 0.17516421645292463,
"grad_norm": 0.40990278124809265,
"learning_rate": 0.00010756302521008403,
"loss": 0.192,
"step": 280
},
{
"epoch": 0.17578980294025648,
"grad_norm": 0.9555610418319702,
"learning_rate": 0.00010722689075630252,
"loss": 0.3646,
"step": 281
},
{
"epoch": 0.17641538942758836,
"grad_norm": 0.7370548844337463,
"learning_rate": 0.000106890756302521,
"loss": 0.8997,
"step": 282
},
{
"epoch": 0.17704097591492024,
"grad_norm": 1.0178982019424438,
"learning_rate": 0.0001065546218487395,
"loss": 0.986,
"step": 283
},
{
"epoch": 0.17766656240225212,
"grad_norm": 0.41388389468193054,
"learning_rate": 0.00010621848739495799,
"loss": 0.2069,
"step": 284
},
{
"epoch": 0.17829214888958397,
"grad_norm": 0.7140624523162842,
"learning_rate": 0.00010588235294117647,
"loss": 0.4852,
"step": 285
},
{
"epoch": 0.17891773537691585,
"grad_norm": 0.7758356332778931,
"learning_rate": 0.00010554621848739496,
"loss": 0.3943,
"step": 286
},
{
"epoch": 0.17954332186424773,
"grad_norm": 1.4193260669708252,
"learning_rate": 0.00010521008403361346,
"loss": 0.6412,
"step": 287
},
{
"epoch": 0.18016890835157962,
"grad_norm": 0.7264838814735413,
"learning_rate": 0.00010487394957983194,
"loss": 0.7834,
"step": 288
},
{
"epoch": 0.18079449483891147,
"grad_norm": 2.4300973415374756,
"learning_rate": 0.00010453781512605043,
"loss": 0.7462,
"step": 289
},
{
"epoch": 0.18142008132624335,
"grad_norm": 1.033916711807251,
"learning_rate": 0.00010420168067226892,
"loss": 0.5241,
"step": 290
},
{
"epoch": 0.18204566781357523,
"grad_norm": 0.5583767294883728,
"learning_rate": 0.00010386554621848741,
"loss": 0.7815,
"step": 291
},
{
"epoch": 0.1826712543009071,
"grad_norm": 0.7440481781959534,
"learning_rate": 0.0001035294117647059,
"loss": 0.4674,
"step": 292
},
{
"epoch": 0.18329684078823896,
"grad_norm": 4.230656147003174,
"learning_rate": 0.00010319327731092439,
"loss": 0.5219,
"step": 293
},
{
"epoch": 0.18392242727557084,
"grad_norm": 0.6165269017219543,
"learning_rate": 0.00010285714285714286,
"loss": 0.3274,
"step": 294
},
{
"epoch": 0.18454801376290272,
"grad_norm": 0.5844498872756958,
"learning_rate": 0.00010252100840336134,
"loss": 0.3719,
"step": 295
},
{
"epoch": 0.1851736002502346,
"grad_norm": 0.9936206936836243,
"learning_rate": 0.00010218487394957983,
"loss": 1.0453,
"step": 296
},
{
"epoch": 0.18579918673756646,
"grad_norm": 1.749831199645996,
"learning_rate": 0.00010184873949579831,
"loss": 0.6634,
"step": 297
},
{
"epoch": 0.18642477322489834,
"grad_norm": 0.4740132689476013,
"learning_rate": 0.0001015126050420168,
"loss": 0.2901,
"step": 298
},
{
"epoch": 0.18705035971223022,
"grad_norm": 0.664300262928009,
"learning_rate": 0.0001011764705882353,
"loss": 0.5869,
"step": 299
},
{
"epoch": 0.1876759461995621,
"grad_norm": 0.7400941252708435,
"learning_rate": 0.00010084033613445378,
"loss": 0.7881,
"step": 300
},
{
"epoch": 0.1876759461995621,
"eval_loss": 0.7877693772315979,
"eval_runtime": 43.5162,
"eval_samples_per_second": 5.883,
"eval_steps_per_second": 2.941,
"step": 300
},
{
"epoch": 0.18830153268689395,
"grad_norm": 0.6142858862876892,
"learning_rate": 0.00010050420168067227,
"loss": 0.3808,
"step": 301
},
{
"epoch": 0.18892711917422583,
"grad_norm": 1.991969347000122,
"learning_rate": 0.00010016806722689076,
"loss": 0.7035,
"step": 302
},
{
"epoch": 0.1895527056615577,
"grad_norm": 0.6220730543136597,
"learning_rate": 9.983193277310925e-05,
"loss": 0.2548,
"step": 303
},
{
"epoch": 0.1901782921488896,
"grad_norm": 0.6476833820343018,
"learning_rate": 9.949579831932774e-05,
"loss": 0.3569,
"step": 304
},
{
"epoch": 0.19080387863622145,
"grad_norm": 0.7133951783180237,
"learning_rate": 9.915966386554623e-05,
"loss": 0.4744,
"step": 305
},
{
"epoch": 0.19142946512355333,
"grad_norm": 0.6500736474990845,
"learning_rate": 9.882352941176471e-05,
"loss": 0.4653,
"step": 306
},
{
"epoch": 0.1920550516108852,
"grad_norm": 1.1231927871704102,
"learning_rate": 9.848739495798321e-05,
"loss": 0.818,
"step": 307
},
{
"epoch": 0.1926806380982171,
"grad_norm": 0.8654798865318298,
"learning_rate": 9.815126050420168e-05,
"loss": 0.7065,
"step": 308
},
{
"epoch": 0.19330622458554894,
"grad_norm": 0.45660969614982605,
"learning_rate": 9.781512605042017e-05,
"loss": 0.2412,
"step": 309
},
{
"epoch": 0.19393181107288082,
"grad_norm": 0.9538519978523254,
"learning_rate": 9.747899159663865e-05,
"loss": 1.3428,
"step": 310
},
{
"epoch": 0.1945573975602127,
"grad_norm": 0.596633791923523,
"learning_rate": 9.714285714285715e-05,
"loss": 0.5119,
"step": 311
},
{
"epoch": 0.19518298404754458,
"grad_norm": 0.5247074365615845,
"learning_rate": 9.680672268907564e-05,
"loss": 0.6413,
"step": 312
},
{
"epoch": 0.19580857053487644,
"grad_norm": 0.7713050246238708,
"learning_rate": 9.647058823529412e-05,
"loss": 0.49,
"step": 313
},
{
"epoch": 0.19643415702220832,
"grad_norm": 0.6971513628959656,
"learning_rate": 9.613445378151261e-05,
"loss": 0.6505,
"step": 314
},
{
"epoch": 0.1970597435095402,
"grad_norm": 0.5454917550086975,
"learning_rate": 9.579831932773111e-05,
"loss": 0.7018,
"step": 315
},
{
"epoch": 0.19768532999687208,
"grad_norm": 0.8349499702453613,
"learning_rate": 9.546218487394959e-05,
"loss": 0.3179,
"step": 316
},
{
"epoch": 0.19831091648420393,
"grad_norm": 0.5682560801506042,
"learning_rate": 9.512605042016806e-05,
"loss": 0.4003,
"step": 317
},
{
"epoch": 0.1989365029715358,
"grad_norm": 0.5094739198684692,
"learning_rate": 9.478991596638655e-05,
"loss": 0.313,
"step": 318
},
{
"epoch": 0.1995620894588677,
"grad_norm": 1.7074236869812012,
"learning_rate": 9.445378151260505e-05,
"loss": 0.9912,
"step": 319
},
{
"epoch": 0.20018767594619957,
"grad_norm": 1.1477283239364624,
"learning_rate": 9.411764705882353e-05,
"loss": 0.851,
"step": 320
},
{
"epoch": 0.20081326243353143,
"grad_norm": 0.6616579294204712,
"learning_rate": 9.378151260504202e-05,
"loss": 0.4844,
"step": 321
},
{
"epoch": 0.2014388489208633,
"grad_norm": 1.0401920080184937,
"learning_rate": 9.34453781512605e-05,
"loss": 0.5421,
"step": 322
},
{
"epoch": 0.2020644354081952,
"grad_norm": 0.729664146900177,
"learning_rate": 9.3109243697479e-05,
"loss": 0.6632,
"step": 323
},
{
"epoch": 0.20269002189552707,
"grad_norm": 0.6752575635910034,
"learning_rate": 9.277310924369749e-05,
"loss": 0.4352,
"step": 324
},
{
"epoch": 0.20331560838285892,
"grad_norm": 0.7963948249816895,
"learning_rate": 9.243697478991598e-05,
"loss": 0.7614,
"step": 325
},
{
"epoch": 0.20331560838285892,
"eval_loss": 0.771190881729126,
"eval_runtime": 43.551,
"eval_samples_per_second": 5.878,
"eval_steps_per_second": 2.939,
"step": 325
},
{
"epoch": 0.2039411948701908,
"grad_norm": 0.7778791189193726,
"learning_rate": 9.210084033613445e-05,
"loss": 0.7251,
"step": 326
},
{
"epoch": 0.20456678135752268,
"grad_norm": 3.0929737091064453,
"learning_rate": 9.176470588235295e-05,
"loss": 0.5375,
"step": 327
},
{
"epoch": 0.20519236784485456,
"grad_norm": 0.6188391447067261,
"learning_rate": 9.142857142857143e-05,
"loss": 0.4007,
"step": 328
},
{
"epoch": 0.20581795433218641,
"grad_norm": 0.9423925876617432,
"learning_rate": 9.109243697478992e-05,
"loss": 0.5059,
"step": 329
},
{
"epoch": 0.2064435408195183,
"grad_norm": 0.506572425365448,
"learning_rate": 9.07563025210084e-05,
"loss": 0.2794,
"step": 330
},
{
"epoch": 0.20706912730685018,
"grad_norm": 1.7139545679092407,
"learning_rate": 9.04201680672269e-05,
"loss": 0.5984,
"step": 331
},
{
"epoch": 0.20769471379418206,
"grad_norm": 0.5540574789047241,
"learning_rate": 9.008403361344539e-05,
"loss": 0.323,
"step": 332
},
{
"epoch": 0.2083203002815139,
"grad_norm": 0.6909454464912415,
"learning_rate": 8.974789915966387e-05,
"loss": 0.5399,
"step": 333
},
{
"epoch": 0.2089458867688458,
"grad_norm": 0.7409022450447083,
"learning_rate": 8.941176470588236e-05,
"loss": 0.4251,
"step": 334
},
{
"epoch": 0.20957147325617767,
"grad_norm": 0.6636312007904053,
"learning_rate": 8.907563025210084e-05,
"loss": 0.4021,
"step": 335
},
{
"epoch": 0.21019705974350955,
"grad_norm": 0.5426271557807922,
"learning_rate": 8.873949579831933e-05,
"loss": 0.2095,
"step": 336
},
{
"epoch": 0.2108226462308414,
"grad_norm": 0.8870647549629211,
"learning_rate": 8.840336134453782e-05,
"loss": 0.5773,
"step": 337
},
{
"epoch": 0.21144823271817328,
"grad_norm": 0.5508524179458618,
"learning_rate": 8.80672268907563e-05,
"loss": 0.6744,
"step": 338
},
{
"epoch": 0.21207381920550517,
"grad_norm": 1.6577738523483276,
"learning_rate": 8.77310924369748e-05,
"loss": 1.1134,
"step": 339
},
{
"epoch": 0.21269940569283705,
"grad_norm": 3.218395233154297,
"learning_rate": 8.739495798319329e-05,
"loss": 0.5932,
"step": 340
},
{
"epoch": 0.2133249921801689,
"grad_norm": 0.5119672417640686,
"learning_rate": 8.705882352941177e-05,
"loss": 0.1831,
"step": 341
},
{
"epoch": 0.21395057866750078,
"grad_norm": 0.4874535799026489,
"learning_rate": 8.672268907563026e-05,
"loss": 0.485,
"step": 342
},
{
"epoch": 0.21457616515483266,
"grad_norm": 0.6597093939781189,
"learning_rate": 8.638655462184874e-05,
"loss": 0.3588,
"step": 343
},
{
"epoch": 0.21520175164216454,
"grad_norm": 1.1764620542526245,
"learning_rate": 8.605042016806724e-05,
"loss": 1.8765,
"step": 344
},
{
"epoch": 0.2158273381294964,
"grad_norm": 0.6894935369491577,
"learning_rate": 8.571428571428571e-05,
"loss": 0.5355,
"step": 345
},
{
"epoch": 0.21645292461682827,
"grad_norm": 0.5896294116973877,
"learning_rate": 8.53781512605042e-05,
"loss": 0.494,
"step": 346
},
{
"epoch": 0.21707851110416015,
"grad_norm": 0.6212694048881531,
"learning_rate": 8.50420168067227e-05,
"loss": 0.5721,
"step": 347
},
{
"epoch": 0.21770409759149204,
"grad_norm": 0.5058571100234985,
"learning_rate": 8.470588235294118e-05,
"loss": 0.5051,
"step": 348
},
{
"epoch": 0.2183296840788239,
"grad_norm": 0.5089401006698608,
"learning_rate": 8.436974789915967e-05,
"loss": 0.3794,
"step": 349
},
{
"epoch": 0.21895527056615577,
"grad_norm": 6.416032314300537,
"learning_rate": 8.403361344537815e-05,
"loss": 0.5026,
"step": 350
},
{
"epoch": 0.21895527056615577,
"eval_loss": 0.7647964954376221,
"eval_runtime": 43.4854,
"eval_samples_per_second": 5.887,
"eval_steps_per_second": 2.944,
"step": 350
}
],
"logging_steps": 1,
"max_steps": 600,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.5558583235916595e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}