zitongyang's picture
Upload folder using huggingface_hub
aff0303 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9971523493118177,
"eval_steps": 500,
"global_step": 526,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0037968675842429997,
"grad_norm": 190.1852569580078,
"learning_rate": 1.8518518518518518e-07,
"loss": 2.1743,
"step": 1
},
{
"epoch": 0.007593735168485999,
"grad_norm": 318.4629211425781,
"learning_rate": 3.7037037037037036e-07,
"loss": 2.1888,
"step": 2
},
{
"epoch": 0.011390602752728999,
"grad_norm": 4.048186302185059,
"learning_rate": 5.555555555555555e-07,
"loss": 1.8374,
"step": 3
},
{
"epoch": 0.015187470336971999,
"grad_norm": 149.984375,
"learning_rate": 7.407407407407407e-07,
"loss": 2.2082,
"step": 4
},
{
"epoch": 0.018984337921214997,
"grad_norm": 140.36471557617188,
"learning_rate": 9.259259259259259e-07,
"loss": 2.5169,
"step": 5
},
{
"epoch": 0.022781205505457997,
"grad_norm": 81.98410034179688,
"learning_rate": 1.111111111111111e-06,
"loss": 2.0747,
"step": 6
},
{
"epoch": 0.026578073089700997,
"grad_norm": 3.9009580612182617,
"learning_rate": 1.2962962962962962e-06,
"loss": 1.8328,
"step": 7
},
{
"epoch": 0.030374940673943997,
"grad_norm": 239.54661560058594,
"learning_rate": 1.4814814814814815e-06,
"loss": 2.1877,
"step": 8
},
{
"epoch": 0.034171808258187,
"grad_norm": 181.9974822998047,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.9126,
"step": 9
},
{
"epoch": 0.037968675842429994,
"grad_norm": 71.94468688964844,
"learning_rate": 1.8518518518518519e-06,
"loss": 1.9289,
"step": 10
},
{
"epoch": 0.041765543426673,
"grad_norm": 327.50091552734375,
"learning_rate": 2.037037037037037e-06,
"loss": 1.8236,
"step": 11
},
{
"epoch": 0.045562411010915994,
"grad_norm": 3.223841667175293,
"learning_rate": 2.222222222222222e-06,
"loss": 1.7878,
"step": 12
},
{
"epoch": 0.04935927859515899,
"grad_norm": 230.49221801757812,
"learning_rate": 2.4074074074074075e-06,
"loss": 1.6871,
"step": 13
},
{
"epoch": 0.053156146179401995,
"grad_norm": 18558.3125,
"learning_rate": 2.5925925925925925e-06,
"loss": 1.6958,
"step": 14
},
{
"epoch": 0.05695301376364499,
"grad_norm": 2.5916926860809326,
"learning_rate": 2.7777777777777783e-06,
"loss": 1.7525,
"step": 15
},
{
"epoch": 0.060749881347887995,
"grad_norm": 2.61940598487854,
"learning_rate": 2.962962962962963e-06,
"loss": 1.7431,
"step": 16
},
{
"epoch": 0.064546748932131,
"grad_norm": 175.40200805664062,
"learning_rate": 3.1481481481481483e-06,
"loss": 1.6176,
"step": 17
},
{
"epoch": 0.068343616516374,
"grad_norm": 2.723275899887085,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.7091,
"step": 18
},
{
"epoch": 0.07214048410061699,
"grad_norm": 575.9302368164062,
"learning_rate": 3.5185185185185187e-06,
"loss": 1.4155,
"step": 19
},
{
"epoch": 0.07593735168485999,
"grad_norm": 1.5204131603240967,
"learning_rate": 3.7037037037037037e-06,
"loss": 1.668,
"step": 20
},
{
"epoch": 0.07973421926910298,
"grad_norm": 172.04342651367188,
"learning_rate": 3.88888888888889e-06,
"loss": 1.5175,
"step": 21
},
{
"epoch": 0.083531086853346,
"grad_norm": 36.41116714477539,
"learning_rate": 4.074074074074074e-06,
"loss": 1.3265,
"step": 22
},
{
"epoch": 0.08732795443758899,
"grad_norm": 1.3005378246307373,
"learning_rate": 4.2592592592592596e-06,
"loss": 1.6097,
"step": 23
},
{
"epoch": 0.09112482202183199,
"grad_norm": 23.33233642578125,
"learning_rate": 4.444444444444444e-06,
"loss": 1.4055,
"step": 24
},
{
"epoch": 0.09492168960607499,
"grad_norm": 1.2947165966033936,
"learning_rate": 4.62962962962963e-06,
"loss": 1.552,
"step": 25
},
{
"epoch": 0.09871855719031798,
"grad_norm": 62.264190673828125,
"learning_rate": 4.814814814814815e-06,
"loss": 1.198,
"step": 26
},
{
"epoch": 0.10251542477456099,
"grad_norm": 16.471799850463867,
"learning_rate": 5e-06,
"loss": 1.3374,
"step": 27
},
{
"epoch": 0.10631229235880399,
"grad_norm": 9.552218437194824,
"learning_rate": 4.999950454155801e-06,
"loss": 1.3352,
"step": 28
},
{
"epoch": 0.11010915994304699,
"grad_norm": 1.3561033010482788,
"learning_rate": 4.999801818587036e-06,
"loss": 1.5152,
"step": 29
},
{
"epoch": 0.11390602752728998,
"grad_norm": 3.8642690181732178,
"learning_rate": 4.999554099185124e-06,
"loss": 1.3093,
"step": 30
},
{
"epoch": 0.11770289511153298,
"grad_norm": 0.9432722926139832,
"learning_rate": 4.999207305768841e-06,
"loss": 1.3048,
"step": 31
},
{
"epoch": 0.12149976269577599,
"grad_norm": 0.9486730694770813,
"learning_rate": 4.998761452083922e-06,
"loss": 1.4368,
"step": 32
},
{
"epoch": 0.12529663028001897,
"grad_norm": 0.7894783020019531,
"learning_rate": 4.998216555802526e-06,
"loss": 1.2765,
"step": 33
},
{
"epoch": 0.129093497864262,
"grad_norm": 0.8032355308532715,
"learning_rate": 4.997572638522531e-06,
"loss": 1.425,
"step": 34
},
{
"epoch": 0.132890365448505,
"grad_norm": 0.6962498426437378,
"learning_rate": 4.996829725766676e-06,
"loss": 1.4005,
"step": 35
},
{
"epoch": 0.136687233032748,
"grad_norm": 0.6125718355178833,
"learning_rate": 4.995987846981554e-06,
"loss": 1.0826,
"step": 36
},
{
"epoch": 0.140484100616991,
"grad_norm": 0.6794630885124207,
"learning_rate": 4.995047035536439e-06,
"loss": 1.0644,
"step": 37
},
{
"epoch": 0.14428096820123398,
"grad_norm": 0.6797907948493958,
"learning_rate": 4.9940073287219705e-06,
"loss": 1.0821,
"step": 38
},
{
"epoch": 0.14807783578547698,
"grad_norm": 0.6790012121200562,
"learning_rate": 4.992868767748669e-06,
"loss": 1.2284,
"step": 39
},
{
"epoch": 0.15187470336971998,
"grad_norm": 0.6396064758300781,
"learning_rate": 4.991631397745307e-06,
"loss": 1.197,
"step": 40
},
{
"epoch": 0.15567157095396297,
"grad_norm": 0.6216674447059631,
"learning_rate": 4.990295267757117e-06,
"loss": 1.3547,
"step": 41
},
{
"epoch": 0.15946843853820597,
"grad_norm": 0.49298611283302307,
"learning_rate": 4.98886043074385e-06,
"loss": 0.8974,
"step": 42
},
{
"epoch": 0.16326530612244897,
"grad_norm": 0.5648188591003418,
"learning_rate": 4.987326943577675e-06,
"loss": 1.1868,
"step": 43
},
{
"epoch": 0.167062173706692,
"grad_norm": 0.5617629885673523,
"learning_rate": 4.985694867040924e-06,
"loss": 1.197,
"step": 44
},
{
"epoch": 0.170859041290935,
"grad_norm": 0.5470258593559265,
"learning_rate": 4.983964265823687e-06,
"loss": 1.0242,
"step": 45
},
{
"epoch": 0.17465590887517798,
"grad_norm": 0.5692163705825806,
"learning_rate": 4.98213520852124e-06,
"loss": 1.3244,
"step": 46
},
{
"epoch": 0.17845277645942098,
"grad_norm": 0.4028306007385254,
"learning_rate": 4.980207767631335e-06,
"loss": 0.7001,
"step": 47
},
{
"epoch": 0.18224964404366398,
"grad_norm": 0.49932053685188293,
"learning_rate": 4.978182019551321e-06,
"loss": 1.1597,
"step": 48
},
{
"epoch": 0.18604651162790697,
"grad_norm": 0.5206290483474731,
"learning_rate": 4.976058044575116e-06,
"loss": 1.3185,
"step": 49
},
{
"epoch": 0.18984337921214997,
"grad_norm": 0.534298300743103,
"learning_rate": 4.973835926890027e-06,
"loss": 1.3182,
"step": 50
},
{
"epoch": 0.19364024679639297,
"grad_norm": 0.4649883210659027,
"learning_rate": 4.9715157545734124e-06,
"loss": 1.1734,
"step": 51
},
{
"epoch": 0.19743711438063596,
"grad_norm": 0.45419999957084656,
"learning_rate": 4.969097619589187e-06,
"loss": 1.1433,
"step": 52
},
{
"epoch": 0.201233981964879,
"grad_norm": 0.4962867498397827,
"learning_rate": 4.9665816177841845e-06,
"loss": 1.1749,
"step": 53
},
{
"epoch": 0.20503084954912199,
"grad_norm": 0.4966491162776947,
"learning_rate": 4.963967848884349e-06,
"loss": 1.1595,
"step": 54
},
{
"epoch": 0.20882771713336498,
"grad_norm": 0.4114384055137634,
"learning_rate": 4.961256416490793e-06,
"loss": 0.8368,
"step": 55
},
{
"epoch": 0.21262458471760798,
"grad_norm": 0.438700407743454,
"learning_rate": 4.9584474280756785e-06,
"loss": 1.1412,
"step": 56
},
{
"epoch": 0.21642145230185098,
"grad_norm": 0.45085862278938293,
"learning_rate": 4.9555409949779695e-06,
"loss": 1.1312,
"step": 57
},
{
"epoch": 0.22021831988609397,
"grad_norm": 0.4020615220069885,
"learning_rate": 4.952537232399012e-06,
"loss": 0.9898,
"step": 58
},
{
"epoch": 0.22401518747033697,
"grad_norm": 0.43643948435783386,
"learning_rate": 4.9494362593979665e-06,
"loss": 1.1352,
"step": 59
},
{
"epoch": 0.22781205505457996,
"grad_norm": 0.40290120244026184,
"learning_rate": 4.946238198887093e-06,
"loss": 0.9822,
"step": 60
},
{
"epoch": 0.23160892263882296,
"grad_norm": 0.4434157609939575,
"learning_rate": 4.942943177626879e-06,
"loss": 1.1303,
"step": 61
},
{
"epoch": 0.23540579022306596,
"grad_norm": 0.48527973890304565,
"learning_rate": 4.939551326221012e-06,
"loss": 1.2812,
"step": 62
},
{
"epoch": 0.23920265780730898,
"grad_norm": 0.4416787028312683,
"learning_rate": 4.936062779111205e-06,
"loss": 1.1381,
"step": 63
},
{
"epoch": 0.24299952539155198,
"grad_norm": 0.4510328769683838,
"learning_rate": 4.932477674571867e-06,
"loss": 1.2696,
"step": 64
},
{
"epoch": 0.24679639297579498,
"grad_norm": 0.38020116090774536,
"learning_rate": 4.928796154704623e-06,
"loss": 0.9767,
"step": 65
},
{
"epoch": 0.25059326056003794,
"grad_norm": 0.4880014657974243,
"learning_rate": 4.925018365432681e-06,
"loss": 1.2679,
"step": 66
},
{
"epoch": 0.25439012814428097,
"grad_norm": 0.4506332576274872,
"learning_rate": 4.921144456495048e-06,
"loss": 1.1344,
"step": 67
},
{
"epoch": 0.258186995728524,
"grad_norm": 0.45424792170524597,
"learning_rate": 4.9171745814405945e-06,
"loss": 1.2797,
"step": 68
},
{
"epoch": 0.26198386331276696,
"grad_norm": 0.4969100058078766,
"learning_rate": 4.9131088976219695e-06,
"loss": 1.2685,
"step": 69
},
{
"epoch": 0.26578073089701,
"grad_norm": 0.4142298400402069,
"learning_rate": 4.908947566189362e-06,
"loss": 1.1168,
"step": 70
},
{
"epoch": 0.26957759848125296,
"grad_norm": 0.4250011444091797,
"learning_rate": 4.904690752084117e-06,
"loss": 1.1161,
"step": 71
},
{
"epoch": 0.273374466065496,
"grad_norm": 0.4570682644844055,
"learning_rate": 4.900338624032191e-06,
"loss": 1.2663,
"step": 72
},
{
"epoch": 0.27717133364973895,
"grad_norm": 0.43025946617126465,
"learning_rate": 4.895891354537472e-06,
"loss": 1.2616,
"step": 73
},
{
"epoch": 0.280968201233982,
"grad_norm": 0.42482641339302063,
"learning_rate": 4.891349119874936e-06,
"loss": 1.1215,
"step": 74
},
{
"epoch": 0.28476506881822494,
"grad_norm": 0.4691850244998932,
"learning_rate": 4.886712100083664e-06,
"loss": 1.1117,
"step": 75
},
{
"epoch": 0.28856193640246797,
"grad_norm": 0.4500940442085266,
"learning_rate": 4.881980478959707e-06,
"loss": 1.1082,
"step": 76
},
{
"epoch": 0.292358803986711,
"grad_norm": 0.3645106256008148,
"learning_rate": 4.877154444048792e-06,
"loss": 0.9683,
"step": 77
},
{
"epoch": 0.29615567157095396,
"grad_norm": 0.42706671357154846,
"learning_rate": 4.872234186638898e-06,
"loss": 1.106,
"step": 78
},
{
"epoch": 0.299952539155197,
"grad_norm": 0.416423499584198,
"learning_rate": 4.8672199017526725e-06,
"loss": 1.1109,
"step": 79
},
{
"epoch": 0.30374940673943995,
"grad_norm": 0.46140867471694946,
"learning_rate": 4.862111788139697e-06,
"loss": 1.2646,
"step": 80
},
{
"epoch": 0.307546274323683,
"grad_norm": 0.4429333508014679,
"learning_rate": 4.856910048268613e-06,
"loss": 1.1331,
"step": 81
},
{
"epoch": 0.31134314190792595,
"grad_norm": 0.42622944712638855,
"learning_rate": 4.851614888319093e-06,
"loss": 1.0966,
"step": 82
},
{
"epoch": 0.31514000949216897,
"grad_norm": 0.5261926054954529,
"learning_rate": 4.846226518173676e-06,
"loss": 1.2528,
"step": 83
},
{
"epoch": 0.31893687707641194,
"grad_norm": 0.4338201880455017,
"learning_rate": 4.840745151409437e-06,
"loss": 1.2542,
"step": 84
},
{
"epoch": 0.32273374466065496,
"grad_norm": 0.44151878356933594,
"learning_rate": 4.835171005289533e-06,
"loss": 1.1135,
"step": 85
},
{
"epoch": 0.32653061224489793,
"grad_norm": 0.4812638461589813,
"learning_rate": 4.8295043007545836e-06,
"loss": 1.2653,
"step": 86
},
{
"epoch": 0.33032747982914096,
"grad_norm": 0.562940239906311,
"learning_rate": 4.823745262413917e-06,
"loss": 1.2494,
"step": 87
},
{
"epoch": 0.334124347413384,
"grad_norm": 0.4242718517780304,
"learning_rate": 4.817894118536667e-06,
"loss": 1.1076,
"step": 88
},
{
"epoch": 0.33792121499762695,
"grad_norm": 0.45141878724098206,
"learning_rate": 4.811951101042722e-06,
"loss": 1.1108,
"step": 89
},
{
"epoch": 0.34171808258187,
"grad_norm": 0.5067852139472961,
"learning_rate": 4.805916445493538e-06,
"loss": 1.2272,
"step": 90
},
{
"epoch": 0.34551495016611294,
"grad_norm": 0.4855363965034485,
"learning_rate": 4.799790391082799e-06,
"loss": 1.1,
"step": 91
},
{
"epoch": 0.34931181775035597,
"grad_norm": 0.47762539982795715,
"learning_rate": 4.793573180626934e-06,
"loss": 1.2453,
"step": 92
},
{
"epoch": 0.35310868533459894,
"grad_norm": 0.5055530071258545,
"learning_rate": 4.787265060555495e-06,
"loss": 1.2437,
"step": 93
},
{
"epoch": 0.35690555291884196,
"grad_norm": 0.4677724838256836,
"learning_rate": 4.7808662809013895e-06,
"loss": 1.223,
"step": 94
},
{
"epoch": 0.36070242050308493,
"grad_norm": 0.47210943698883057,
"learning_rate": 4.774377095290969e-06,
"loss": 1.2287,
"step": 95
},
{
"epoch": 0.36449928808732796,
"grad_norm": 0.48820409178733826,
"learning_rate": 4.76779776093398e-06,
"loss": 1.0997,
"step": 96
},
{
"epoch": 0.368296155671571,
"grad_norm": 0.509981632232666,
"learning_rate": 4.761128538613359e-06,
"loss": 1.2303,
"step": 97
},
{
"epoch": 0.37209302325581395,
"grad_norm": 0.46266815066337585,
"learning_rate": 4.754369692674906e-06,
"loss": 1.2366,
"step": 98
},
{
"epoch": 0.375889890840057,
"grad_norm": 0.4689420759677887,
"learning_rate": 4.747521491016805e-06,
"loss": 1.2379,
"step": 99
},
{
"epoch": 0.37968675842429994,
"grad_norm": 0.454712837934494,
"learning_rate": 4.740584205079002e-06,
"loss": 1.1078,
"step": 100
},
{
"epoch": 0.38348362600854297,
"grad_norm": 0.5033725500106812,
"learning_rate": 4.7335581098324465e-06,
"loss": 1.2319,
"step": 101
},
{
"epoch": 0.38728049359278593,
"grad_norm": 0.5276666283607483,
"learning_rate": 4.726443483768195e-06,
"loss": 1.2373,
"step": 102
},
{
"epoch": 0.39107736117702896,
"grad_norm": 0.4005415737628937,
"learning_rate": 4.719240608886372e-06,
"loss": 0.9373,
"step": 103
},
{
"epoch": 0.39487422876127193,
"grad_norm": 0.456887423992157,
"learning_rate": 4.711949770684989e-06,
"loss": 1.2402,
"step": 104
},
{
"epoch": 0.39867109634551495,
"grad_norm": 0.4958035349845886,
"learning_rate": 4.704571258148634e-06,
"loss": 1.2327,
"step": 105
},
{
"epoch": 0.402467963929758,
"grad_norm": 0.40182697772979736,
"learning_rate": 4.697105363737015e-06,
"loss": 0.9684,
"step": 106
},
{
"epoch": 0.40626483151400095,
"grad_norm": 0.43226033449172974,
"learning_rate": 4.689552383373362e-06,
"loss": 1.0848,
"step": 107
},
{
"epoch": 0.41006169909824397,
"grad_norm": 0.4296877682209015,
"learning_rate": 4.681912616432707e-06,
"loss": 0.9568,
"step": 108
},
{
"epoch": 0.41385856668248694,
"grad_norm": 0.5179736018180847,
"learning_rate": 4.674186365730012e-06,
"loss": 1.2286,
"step": 109
},
{
"epoch": 0.41765543426672996,
"grad_norm": 0.4191986620426178,
"learning_rate": 4.666373937508166e-06,
"loss": 0.9606,
"step": 110
},
{
"epoch": 0.42145230185097293,
"grad_norm": 0.412567675113678,
"learning_rate": 4.658475641425854e-06,
"loss": 1.0793,
"step": 111
},
{
"epoch": 0.42524916943521596,
"grad_norm": 0.4645300507545471,
"learning_rate": 4.6504917905452705e-06,
"loss": 1.0835,
"step": 112
},
{
"epoch": 0.4290460370194589,
"grad_norm": 0.3561910092830658,
"learning_rate": 4.6424227013197235e-06,
"loss": 0.7952,
"step": 113
},
{
"epoch": 0.43284290460370195,
"grad_norm": 0.5064642429351807,
"learning_rate": 4.6342686935810795e-06,
"loss": 1.2319,
"step": 114
},
{
"epoch": 0.4366397721879449,
"grad_norm": 0.4808673858642578,
"learning_rate": 4.6260300905271e-06,
"loss": 1.223,
"step": 115
},
{
"epoch": 0.44043663977218794,
"grad_norm": 0.4152892529964447,
"learning_rate": 4.617707218708617e-06,
"loss": 1.0765,
"step": 116
},
{
"epoch": 0.44423350735643097,
"grad_norm": 0.49487704038619995,
"learning_rate": 4.6093004080166e-06,
"loss": 1.2205,
"step": 117
},
{
"epoch": 0.44803037494067394,
"grad_norm": 0.4736645817756653,
"learning_rate": 4.600809991669076e-06,
"loss": 1.0811,
"step": 118
},
{
"epoch": 0.45182724252491696,
"grad_norm": 0.46936193108558655,
"learning_rate": 4.59223630619792e-06,
"loss": 1.0809,
"step": 119
},
{
"epoch": 0.45562411010915993,
"grad_norm": 0.4555075466632843,
"learning_rate": 4.5835796914355195e-06,
"loss": 1.1008,
"step": 120
},
{
"epoch": 0.45942097769340295,
"grad_norm": 0.43824076652526855,
"learning_rate": 4.5748404905013045e-06,
"loss": 1.1092,
"step": 121
},
{
"epoch": 0.4632178452776459,
"grad_norm": 0.4486856162548065,
"learning_rate": 4.5660190497881455e-06,
"loss": 1.0784,
"step": 122
},
{
"epoch": 0.46701471286188895,
"grad_norm": 0.4244840145111084,
"learning_rate": 4.557115718948622e-06,
"loss": 0.958,
"step": 123
},
{
"epoch": 0.4708115804461319,
"grad_norm": 0.4413922429084778,
"learning_rate": 4.548130850881171e-06,
"loss": 1.2196,
"step": 124
},
{
"epoch": 0.47460844803037494,
"grad_norm": 0.4678729772567749,
"learning_rate": 4.53906480171609e-06,
"loss": 1.1124,
"step": 125
},
{
"epoch": 0.47840531561461797,
"grad_norm": 0.48969003558158875,
"learning_rate": 4.529917930801427e-06,
"loss": 1.2281,
"step": 126
},
{
"epoch": 0.48220218319886093,
"grad_norm": 0.4379821717739105,
"learning_rate": 4.520690600688734e-06,
"loss": 1.0849,
"step": 127
},
{
"epoch": 0.48599905078310396,
"grad_norm": 0.4471890330314636,
"learning_rate": 4.5113831771187e-06,
"loss": 1.0791,
"step": 128
},
{
"epoch": 0.4897959183673469,
"grad_norm": 0.49659088253974915,
"learning_rate": 4.501996029006651e-06,
"loss": 1.097,
"step": 129
},
{
"epoch": 0.49359278595158995,
"grad_norm": 0.4094451665878296,
"learning_rate": 4.492529528427929e-06,
"loss": 0.972,
"step": 130
},
{
"epoch": 0.4973896535358329,
"grad_norm": 0.4491405487060547,
"learning_rate": 4.4829840506031455e-06,
"loss": 1.0826,
"step": 131
},
{
"epoch": 0.5011865211200759,
"grad_norm": 0.42457208037376404,
"learning_rate": 4.473359973883305e-06,
"loss": 1.1066,
"step": 132
},
{
"epoch": 0.5049833887043189,
"grad_norm": 0.4856266379356384,
"learning_rate": 4.463657679734813e-06,
"loss": 1.225,
"step": 133
},
{
"epoch": 0.5087802562885619,
"grad_norm": 0.47458940744400024,
"learning_rate": 4.453877552724352e-06,
"loss": 1.2234,
"step": 134
},
{
"epoch": 0.512577123872805,
"grad_norm": 0.36578264832496643,
"learning_rate": 4.444019980503641e-06,
"loss": 0.9276,
"step": 135
},
{
"epoch": 0.516373991457048,
"grad_norm": 0.4141775965690613,
"learning_rate": 4.4340853537940715e-06,
"loss": 1.0895,
"step": 136
},
{
"epoch": 0.5201708590412909,
"grad_norm": 0.4604395031929016,
"learning_rate": 4.424074066371216e-06,
"loss": 1.2279,
"step": 137
},
{
"epoch": 0.5239677266255339,
"grad_norm": 0.4443908929824829,
"learning_rate": 4.4139865150492235e-06,
"loss": 1.0862,
"step": 138
},
{
"epoch": 0.527764594209777,
"grad_norm": 0.4457070529460907,
"learning_rate": 4.403823099665093e-06,
"loss": 1.2267,
"step": 139
},
{
"epoch": 0.53156146179402,
"grad_norm": 0.4152843952178955,
"learning_rate": 4.393584223062819e-06,
"loss": 1.0827,
"step": 140
},
{
"epoch": 0.5353583293782629,
"grad_norm": 0.42452365159988403,
"learning_rate": 4.38327029107743e-06,
"loss": 1.0743,
"step": 141
},
{
"epoch": 0.5391551969625059,
"grad_norm": 0.3921717703342438,
"learning_rate": 4.372881712518898e-06,
"loss": 0.9454,
"step": 142
},
{
"epoch": 0.5429520645467489,
"grad_norm": 0.5372737646102905,
"learning_rate": 4.362418899155941e-06,
"loss": 1.227,
"step": 143
},
{
"epoch": 0.546748932130992,
"grad_norm": 0.5399878621101379,
"learning_rate": 4.351882265699696e-06,
"loss": 1.2155,
"step": 144
},
{
"epoch": 0.550545799715235,
"grad_norm": 0.44373106956481934,
"learning_rate": 4.341272229787281e-06,
"loss": 1.0809,
"step": 145
},
{
"epoch": 0.5543426672994779,
"grad_norm": 0.4626406133174896,
"learning_rate": 4.330589211965246e-06,
"loss": 1.2145,
"step": 146
},
{
"epoch": 0.5581395348837209,
"grad_norm": 0.4238491356372833,
"learning_rate": 4.319833635672899e-06,
"loss": 1.0835,
"step": 147
},
{
"epoch": 0.561936402467964,
"grad_norm": 0.5245795249938965,
"learning_rate": 4.309005927225528e-06,
"loss": 1.222,
"step": 148
},
{
"epoch": 0.565733270052207,
"grad_norm": 0.43987414240837097,
"learning_rate": 4.2981065157974955e-06,
"loss": 0.9477,
"step": 149
},
{
"epoch": 0.5695301376364499,
"grad_norm": 0.4053569734096527,
"learning_rate": 4.287135833405235e-06,
"loss": 1.0717,
"step": 150
},
{
"epoch": 0.5733270052206929,
"grad_norm": 0.411527544260025,
"learning_rate": 4.276094314890122e-06,
"loss": 0.9402,
"step": 151
},
{
"epoch": 0.5771238728049359,
"grad_norm": 0.5262433290481567,
"learning_rate": 4.2649823979012424e-06,
"loss": 1.2261,
"step": 152
},
{
"epoch": 0.580920740389179,
"grad_norm": 0.44064462184906006,
"learning_rate": 4.253800522878043e-06,
"loss": 1.0823,
"step": 153
},
{
"epoch": 0.584717607973422,
"grad_norm": 0.4491995871067047,
"learning_rate": 4.242549133032872e-06,
"loss": 1.2246,
"step": 154
},
{
"epoch": 0.5885144755576649,
"grad_norm": 0.44037237763404846,
"learning_rate": 4.2312286743334174e-06,
"loss": 1.08,
"step": 155
},
{
"epoch": 0.5923113431419079,
"grad_norm": 0.4179958701133728,
"learning_rate": 4.219839595485026e-06,
"loss": 1.0792,
"step": 156
},
{
"epoch": 0.5961082107261509,
"grad_norm": 0.4985540509223938,
"learning_rate": 4.2083823479129175e-06,
"loss": 1.2162,
"step": 157
},
{
"epoch": 0.599905078310394,
"grad_norm": 0.43255966901779175,
"learning_rate": 4.196857385744295e-06,
"loss": 1.0821,
"step": 158
},
{
"epoch": 0.6037019458946369,
"grad_norm": 0.38958489894866943,
"learning_rate": 4.185265165790343e-06,
"loss": 0.93,
"step": 159
},
{
"epoch": 0.6074988134788799,
"grad_norm": 0.43625640869140625,
"learning_rate": 4.17360614752812e-06,
"loss": 1.0673,
"step": 160
},
{
"epoch": 0.6112956810631229,
"grad_norm": 0.3636965751647949,
"learning_rate": 4.161880793082348e-06,
"loss": 0.9311,
"step": 161
},
{
"epoch": 0.615092548647366,
"grad_norm": 0.41958415508270264,
"learning_rate": 4.150089567207094e-06,
"loss": 0.9723,
"step": 162
},
{
"epoch": 0.6188894162316089,
"grad_norm": 0.4178905189037323,
"learning_rate": 4.138232937267351e-06,
"loss": 0.9489,
"step": 163
},
{
"epoch": 0.6226862838158519,
"grad_norm": 0.4551599621772766,
"learning_rate": 4.126311373220511e-06,
"loss": 1.0803,
"step": 164
},
{
"epoch": 0.6264831514000949,
"grad_norm": 0.3745613992214203,
"learning_rate": 4.114325347597736e-06,
"loss": 0.9532,
"step": 165
},
{
"epoch": 0.6302800189843379,
"grad_norm": 0.4355705976486206,
"learning_rate": 4.102275335485234e-06,
"loss": 1.2055,
"step": 166
},
{
"epoch": 0.634076886568581,
"grad_norm": 0.43142765760421753,
"learning_rate": 4.0901618145054246e-06,
"loss": 1.216,
"step": 167
},
{
"epoch": 0.6378737541528239,
"grad_norm": 0.3976465165615082,
"learning_rate": 4.077985264798004e-06,
"loss": 0.9956,
"step": 168
},
{
"epoch": 0.6416706217370669,
"grad_norm": 0.4292868375778198,
"learning_rate": 4.06574616900092e-06,
"loss": 1.0555,
"step": 169
},
{
"epoch": 0.6454674893213099,
"grad_norm": 0.5177546143531799,
"learning_rate": 4.053445012231241e-06,
"loss": 1.2066,
"step": 170
},
{
"epoch": 0.649264356905553,
"grad_norm": 0.41628462076187134,
"learning_rate": 4.041082282065922e-06,
"loss": 1.21,
"step": 171
},
{
"epoch": 0.6530612244897959,
"grad_norm": 0.44618961215019226,
"learning_rate": 4.028658468522489e-06,
"loss": 1.2193,
"step": 172
},
{
"epoch": 0.6568580920740389,
"grad_norm": 0.44253483414649963,
"learning_rate": 4.016174064039602e-06,
"loss": 1.2155,
"step": 173
},
{
"epoch": 0.6606549596582819,
"grad_norm": 0.3929227292537689,
"learning_rate": 4.003629563457551e-06,
"loss": 1.0683,
"step": 174
},
{
"epoch": 0.6644518272425249,
"grad_norm": 0.5715161561965942,
"learning_rate": 3.991025463998632e-06,
"loss": 1.2264,
"step": 175
},
{
"epoch": 0.668248694826768,
"grad_norm": 0.4098718762397766,
"learning_rate": 3.978362265247444e-06,
"loss": 1.0723,
"step": 176
},
{
"epoch": 0.6720455624110109,
"grad_norm": 0.4266994595527649,
"learning_rate": 3.965640469131084e-06,
"loss": 1.0663,
"step": 177
},
{
"epoch": 0.6758424299952539,
"grad_norm": 0.379220187664032,
"learning_rate": 3.952860579899257e-06,
"loss": 0.9418,
"step": 178
},
{
"epoch": 0.6796392975794969,
"grad_norm": 0.3946760594844818,
"learning_rate": 3.940023104104281e-06,
"loss": 1.0605,
"step": 179
},
{
"epoch": 0.68343616516374,
"grad_norm": 0.47356459498405457,
"learning_rate": 3.9271285505810185e-06,
"loss": 1.0615,
"step": 180
},
{
"epoch": 0.6872330327479829,
"grad_norm": 0.4356970489025116,
"learning_rate": 3.9141774304267e-06,
"loss": 1.0714,
"step": 181
},
{
"epoch": 0.6910299003322259,
"grad_norm": 0.4237845540046692,
"learning_rate": 3.9011702569806716e-06,
"loss": 1.0862,
"step": 182
},
{
"epoch": 0.6948267679164689,
"grad_norm": 0.4926426410675049,
"learning_rate": 3.888107545804043e-06,
"loss": 1.2091,
"step": 183
},
{
"epoch": 0.6986236355007119,
"grad_norm": 0.43669024109840393,
"learning_rate": 3.874989814659258e-06,
"loss": 1.0805,
"step": 184
},
{
"epoch": 0.702420503084955,
"grad_norm": 0.435596227645874,
"learning_rate": 3.861817583489566e-06,
"loss": 1.2199,
"step": 185
},
{
"epoch": 0.7062173706691979,
"grad_norm": 0.407825231552124,
"learning_rate": 3.848591374398421e-06,
"loss": 1.1038,
"step": 186
},
{
"epoch": 0.7100142382534409,
"grad_norm": 0.41674646735191345,
"learning_rate": 3.835311711628774e-06,
"loss": 1.0718,
"step": 187
},
{
"epoch": 0.7138111058376839,
"grad_norm": 0.4335026144981384,
"learning_rate": 3.82197912154231e-06,
"loss": 1.0727,
"step": 188
},
{
"epoch": 0.717607973421927,
"grad_norm": 0.47680917382240295,
"learning_rate": 3.808594132598574e-06,
"loss": 1.2056,
"step": 189
},
{
"epoch": 0.7214048410061699,
"grad_norm": 0.38044315576553345,
"learning_rate": 3.7951572753340273e-06,
"loss": 0.9217,
"step": 190
},
{
"epoch": 0.7252017085904129,
"grad_norm": 0.4642685353755951,
"learning_rate": 3.781669082341018e-06,
"loss": 1.071,
"step": 191
},
{
"epoch": 0.7289985761746559,
"grad_norm": 0.4392843544483185,
"learning_rate": 3.768130088246674e-06,
"loss": 1.2135,
"step": 192
},
{
"epoch": 0.7327954437588989,
"grad_norm": 0.4412919580936432,
"learning_rate": 3.7545408296917087e-06,
"loss": 1.2025,
"step": 193
},
{
"epoch": 0.736592311343142,
"grad_norm": 0.4227074682712555,
"learning_rate": 3.740901845309152e-06,
"loss": 1.0674,
"step": 194
},
{
"epoch": 0.7403891789273849,
"grad_norm": 0.45686036348342896,
"learning_rate": 3.727213675703e-06,
"loss": 1.2151,
"step": 195
},
{
"epoch": 0.7441860465116279,
"grad_norm": 0.4000316262245178,
"learning_rate": 3.713476863426787e-06,
"loss": 1.0748,
"step": 196
},
{
"epoch": 0.7479829140958709,
"grad_norm": 0.42302843928337097,
"learning_rate": 3.699691952962083e-06,
"loss": 1.0892,
"step": 197
},
{
"epoch": 0.751779781680114,
"grad_norm": 0.4179258644580841,
"learning_rate": 3.6858594906969073e-06,
"loss": 1.0626,
"step": 198
},
{
"epoch": 0.7555766492643569,
"grad_norm": 0.4555260241031647,
"learning_rate": 3.6719800249040778e-06,
"loss": 1.2077,
"step": 199
},
{
"epoch": 0.7593735168485999,
"grad_norm": 0.36426669359207153,
"learning_rate": 3.6580541057194728e-06,
"loss": 0.944,
"step": 200
},
{
"epoch": 0.7631703844328429,
"grad_norm": 0.3654433786869049,
"learning_rate": 3.6440822851202312e-06,
"loss": 0.9322,
"step": 201
},
{
"epoch": 0.7669672520170859,
"grad_norm": 0.49382278323173523,
"learning_rate": 3.63006511690287e-06,
"loss": 1.2109,
"step": 202
},
{
"epoch": 0.770764119601329,
"grad_norm": 0.4325428307056427,
"learning_rate": 3.616003156661334e-06,
"loss": 1.0696,
"step": 203
},
{
"epoch": 0.7745609871855719,
"grad_norm": 0.4158753454685211,
"learning_rate": 3.6018969617649784e-06,
"loss": 1.0642,
"step": 204
},
{
"epoch": 0.7783578547698149,
"grad_norm": 0.4324316382408142,
"learning_rate": 3.5877470913364697e-06,
"loss": 1.2037,
"step": 205
},
{
"epoch": 0.7821547223540579,
"grad_norm": 0.46198171377182007,
"learning_rate": 3.5735541062296287e-06,
"loss": 1.0685,
"step": 206
},
{
"epoch": 0.7859515899383009,
"grad_norm": 0.43740782141685486,
"learning_rate": 3.559318569007198e-06,
"loss": 0.9377,
"step": 207
},
{
"epoch": 0.7897484575225439,
"grad_norm": 0.4424276351928711,
"learning_rate": 3.545041043918546e-06,
"loss": 1.2036,
"step": 208
},
{
"epoch": 0.7935453251067869,
"grad_norm": 0.4607738256454468,
"learning_rate": 3.5307220968772983e-06,
"loss": 1.2091,
"step": 209
},
{
"epoch": 0.7973421926910299,
"grad_norm": 0.4169575273990631,
"learning_rate": 3.516362295438911e-06,
"loss": 1.0704,
"step": 210
},
{
"epoch": 0.8011390602752729,
"grad_norm": 0.5177940130233765,
"learning_rate": 3.501962208778172e-06,
"loss": 1.2059,
"step": 211
},
{
"epoch": 0.804935927859516,
"grad_norm": 0.4142232835292816,
"learning_rate": 3.487522407666641e-06,
"loss": 1.0769,
"step": 212
},
{
"epoch": 0.8087327954437589,
"grad_norm": 0.4076462984085083,
"learning_rate": 3.473043464450027e-06,
"loss": 1.0677,
"step": 213
},
{
"epoch": 0.8125296630280019,
"grad_norm": 0.5051140785217285,
"learning_rate": 3.458525953025503e-06,
"loss": 1.0704,
"step": 214
},
{
"epoch": 0.8163265306122449,
"grad_norm": 0.37081989645957947,
"learning_rate": 3.443970448818954e-06,
"loss": 0.9524,
"step": 215
},
{
"epoch": 0.8201233981964879,
"grad_norm": 0.3888426423072815,
"learning_rate": 3.429377528762177e-06,
"loss": 1.0749,
"step": 216
},
{
"epoch": 0.8239202657807309,
"grad_norm": 0.49155759811401367,
"learning_rate": 3.414747771270007e-06,
"loss": 1.2125,
"step": 217
},
{
"epoch": 0.8277171333649739,
"grad_norm": 0.38015004992485046,
"learning_rate": 3.40008175621739e-06,
"loss": 1.0571,
"step": 218
},
{
"epoch": 0.8315140009492169,
"grad_norm": 0.4071613848209381,
"learning_rate": 3.3853800649164053e-06,
"loss": 1.053,
"step": 219
},
{
"epoch": 0.8353108685334599,
"grad_norm": 0.42250776290893555,
"learning_rate": 3.3706432800932184e-06,
"loss": 1.0685,
"step": 220
},
{
"epoch": 0.8391077361177028,
"grad_norm": 0.431573748588562,
"learning_rate": 3.3558719858649835e-06,
"loss": 1.0752,
"step": 221
},
{
"epoch": 0.8429046037019459,
"grad_norm": 0.46120485663414,
"learning_rate": 3.341066767716697e-06,
"loss": 1.2063,
"step": 222
},
{
"epoch": 0.8467014712861889,
"grad_norm": 0.4283529818058014,
"learning_rate": 3.3262282124779823e-06,
"loss": 1.0799,
"step": 223
},
{
"epoch": 0.8504983388704319,
"grad_norm": 0.37570565938949585,
"learning_rate": 3.3113569082998367e-06,
"loss": 1.0655,
"step": 224
},
{
"epoch": 0.8542952064546749,
"grad_norm": 0.2539384663105011,
"learning_rate": 3.2964534446313163e-06,
"loss": 0.4994,
"step": 225
},
{
"epoch": 0.8580920740389179,
"grad_norm": 0.4729446470737457,
"learning_rate": 3.2815184121961725e-06,
"loss": 1.2129,
"step": 226
},
{
"epoch": 0.8618889416231609,
"grad_norm": 0.3342674672603607,
"learning_rate": 3.266552402969437e-06,
"loss": 0.7854,
"step": 227
},
{
"epoch": 0.8656858092074039,
"grad_norm": 0.45071637630462646,
"learning_rate": 3.251556010153958e-06,
"loss": 1.2061,
"step": 228
},
{
"epoch": 0.8694826767916469,
"grad_norm": 0.4373473525047302,
"learning_rate": 3.2365298281568913e-06,
"loss": 1.0659,
"step": 229
},
{
"epoch": 0.8732795443758898,
"grad_norm": 0.42635422945022583,
"learning_rate": 3.2214744525661336e-06,
"loss": 1.2066,
"step": 230
},
{
"epoch": 0.8770764119601329,
"grad_norm": 0.45856305956840515,
"learning_rate": 3.2063904801267184e-06,
"loss": 1.1959,
"step": 231
},
{
"epoch": 0.8808732795443759,
"grad_norm": 0.4771096408367157,
"learning_rate": 3.191278508717166e-06,
"loss": 1.2025,
"step": 232
},
{
"epoch": 0.8846701471286189,
"grad_norm": 0.4655088484287262,
"learning_rate": 3.176139137325781e-06,
"loss": 1.2094,
"step": 233
},
{
"epoch": 0.8884670147128619,
"grad_norm": 0.39348939061164856,
"learning_rate": 3.1609729660269114e-06,
"loss": 1.0672,
"step": 234
},
{
"epoch": 0.8922638822971048,
"grad_norm": 0.445089727640152,
"learning_rate": 3.1457805959571663e-06,
"loss": 1.1994,
"step": 235
},
{
"epoch": 0.8960607498813479,
"grad_norm": 0.3999580144882202,
"learning_rate": 3.130562629291586e-06,
"loss": 1.0561,
"step": 236
},
{
"epoch": 0.8998576174655909,
"grad_norm": 0.4471660554409027,
"learning_rate": 3.1153196692197747e-06,
"loss": 1.0674,
"step": 237
},
{
"epoch": 0.9036544850498339,
"grad_norm": 0.4131118655204773,
"learning_rate": 3.100052319921992e-06,
"loss": 0.9328,
"step": 238
},
{
"epoch": 0.9074513526340768,
"grad_norm": 0.4657612144947052,
"learning_rate": 3.0847611865452064e-06,
"loss": 1.2085,
"step": 239
},
{
"epoch": 0.9112482202183199,
"grad_norm": 0.4662039577960968,
"learning_rate": 3.069446875179106e-06,
"loss": 1.2029,
"step": 240
},
{
"epoch": 0.9150450878025629,
"grad_norm": 0.41623374819755554,
"learning_rate": 3.0541099928320806e-06,
"loss": 0.9324,
"step": 241
},
{
"epoch": 0.9188419553868059,
"grad_norm": 0.4404604434967041,
"learning_rate": 3.0387511474071556e-06,
"loss": 1.0648,
"step": 242
},
{
"epoch": 0.9226388229710489,
"grad_norm": 0.3622177243232727,
"learning_rate": 3.023370947677901e-06,
"loss": 0.9238,
"step": 243
},
{
"epoch": 0.9264356905552918,
"grad_norm": 0.4089031517505646,
"learning_rate": 3.007970003264301e-06,
"loss": 1.0869,
"step": 244
},
{
"epoch": 0.9302325581395349,
"grad_norm": 0.4477076232433319,
"learning_rate": 2.99254892460859e-06,
"loss": 1.1983,
"step": 245
},
{
"epoch": 0.9340294257237779,
"grad_norm": 0.46868517994880676,
"learning_rate": 2.9771083229510543e-06,
"loss": 1.1963,
"step": 246
},
{
"epoch": 0.9378262933080209,
"grad_norm": 0.3464803397655487,
"learning_rate": 2.9616488103058115e-06,
"loss": 0.9272,
"step": 247
},
{
"epoch": 0.9416231608922638,
"grad_norm": 0.45595675706863403,
"learning_rate": 2.9461709994365445e-06,
"loss": 1.2056,
"step": 248
},
{
"epoch": 0.9454200284765069,
"grad_norm": 0.39231353998184204,
"learning_rate": 2.930675503832217e-06,
"loss": 1.0618,
"step": 249
},
{
"epoch": 0.9492168960607499,
"grad_norm": 0.3708343505859375,
"learning_rate": 2.91516293768276e-06,
"loss": 0.9102,
"step": 250
},
{
"epoch": 0.9530137636449929,
"grad_norm": 0.4427671432495117,
"learning_rate": 2.899633915854721e-06,
"loss": 1.2012,
"step": 251
},
{
"epoch": 0.9568106312292359,
"grad_norm": 0.33957725763320923,
"learning_rate": 2.8840890538668955e-06,
"loss": 0.8041,
"step": 252
},
{
"epoch": 0.9606074988134788,
"grad_norm": 0.3556421995162964,
"learning_rate": 2.868528967865934e-06,
"loss": 0.9198,
"step": 253
},
{
"epoch": 0.9644043663977219,
"grad_norm": 0.4399990439414978,
"learning_rate": 2.8529542746019118e-06,
"loss": 1.1998,
"step": 254
},
{
"epoch": 0.9682012339819649,
"grad_norm": 0.4560842514038086,
"learning_rate": 2.8373655914038907e-06,
"loss": 1.2007,
"step": 255
},
{
"epoch": 0.9719981015662079,
"grad_norm": 0.38949334621429443,
"learning_rate": 2.821763536155446e-06,
"loss": 1.0533,
"step": 256
},
{
"epoch": 0.9757949691504508,
"grad_norm": 0.40262502431869507,
"learning_rate": 2.806148727270176e-06,
"loss": 1.0585,
"step": 257
},
{
"epoch": 0.9795918367346939,
"grad_norm": 0.4113542139530182,
"learning_rate": 2.7905217836671915e-06,
"loss": 1.0601,
"step": 258
},
{
"epoch": 0.9833887043189369,
"grad_norm": 0.39467111229896545,
"learning_rate": 2.774883324746583e-06,
"loss": 1.0627,
"step": 259
},
{
"epoch": 0.9871855719031799,
"grad_norm": 0.4230286180973053,
"learning_rate": 2.7592339703648696e-06,
"loss": 1.2039,
"step": 260
},
{
"epoch": 0.9909824394874229,
"grad_norm": 0.3624156415462494,
"learning_rate": 2.743574340810431e-06,
"loss": 0.9253,
"step": 261
},
{
"epoch": 0.9947793070716658,
"grad_norm": 0.45184388756752014,
"learning_rate": 2.7279050567789195e-06,
"loss": 1.1943,
"step": 262
},
{
"epoch": 0.9985761746559089,
"grad_norm": 0.43259841203689575,
"learning_rate": 2.7122267393486605e-06,
"loss": 1.2086,
"step": 263
},
{
"epoch": 1.0023730422401518,
"grad_norm": 0.7363314628601074,
"learning_rate": 2.6965400099560305e-06,
"loss": 1.9312,
"step": 264
},
{
"epoch": 1.0061699098243948,
"grad_norm": 0.42088714241981506,
"learning_rate": 2.6808454903708313e-06,
"loss": 1.0671,
"step": 265
},
{
"epoch": 1.0099667774086378,
"grad_norm": 0.47700321674346924,
"learning_rate": 2.66514380267164e-06,
"loss": 1.2008,
"step": 266
},
{
"epoch": 1.0137636449928809,
"grad_norm": 0.40473899245262146,
"learning_rate": 2.6494355692211537e-06,
"loss": 1.0598,
"step": 267
},
{
"epoch": 1.0175605125771239,
"grad_norm": 0.40599343180656433,
"learning_rate": 2.6337214126415237e-06,
"loss": 1.0582,
"step": 268
},
{
"epoch": 1.021357380161367,
"grad_norm": 0.49867045879364014,
"learning_rate": 2.6180019557896725e-06,
"loss": 1.1879,
"step": 269
},
{
"epoch": 1.02515424774561,
"grad_norm": 0.432919442653656,
"learning_rate": 2.6022778217326077e-06,
"loss": 1.1843,
"step": 270
},
{
"epoch": 1.028951115329853,
"grad_norm": 0.40411439538002014,
"learning_rate": 2.586549633722726e-06,
"loss": 1.2015,
"step": 271
},
{
"epoch": 1.032747982914096,
"grad_norm": 0.4091154932975769,
"learning_rate": 2.5708180151731105e-06,
"loss": 1.0366,
"step": 272
},
{
"epoch": 1.0365448504983388,
"grad_norm": 0.41487744450569153,
"learning_rate": 2.555083589632818e-06,
"loss": 1.0597,
"step": 273
},
{
"epoch": 1.0403417180825818,
"grad_norm": 0.43414047360420227,
"learning_rate": 2.5393469807621646e-06,
"loss": 1.1722,
"step": 274
},
{
"epoch": 1.0441385856668248,
"grad_norm": 0.43517452478408813,
"learning_rate": 2.523608812308009e-06,
"loss": 1.1897,
"step": 275
},
{
"epoch": 1.0479354532510678,
"grad_norm": 0.41618216037750244,
"learning_rate": 2.5078697080790248e-06,
"loss": 1.0384,
"step": 276
},
{
"epoch": 1.0517323208353109,
"grad_norm": 0.3956233561038971,
"learning_rate": 2.4921302919209765e-06,
"loss": 1.0523,
"step": 277
},
{
"epoch": 1.055529188419554,
"grad_norm": 0.3869052231311798,
"learning_rate": 2.476391187691992e-06,
"loss": 1.0695,
"step": 278
},
{
"epoch": 1.059326056003797,
"grad_norm": 0.38108527660369873,
"learning_rate": 2.4606530192378358e-06,
"loss": 0.9268,
"step": 279
},
{
"epoch": 1.06312292358804,
"grad_norm": 0.40593841671943665,
"learning_rate": 2.4449164103671834e-06,
"loss": 1.0487,
"step": 280
},
{
"epoch": 1.0669197911722827,
"grad_norm": 0.35500654578208923,
"learning_rate": 2.4291819848268908e-06,
"loss": 0.9251,
"step": 281
},
{
"epoch": 1.0707166587565258,
"grad_norm": 0.40299785137176514,
"learning_rate": 2.4134503662772754e-06,
"loss": 1.0533,
"step": 282
},
{
"epoch": 1.0745135263407688,
"grad_norm": 0.43219995498657227,
"learning_rate": 2.3977221782673936e-06,
"loss": 1.1866,
"step": 283
},
{
"epoch": 1.0783103939250118,
"grad_norm": 0.45522886514663696,
"learning_rate": 2.3819980442103288e-06,
"loss": 1.2028,
"step": 284
},
{
"epoch": 1.0821072615092548,
"grad_norm": 0.4167502522468567,
"learning_rate": 2.3662785873584775e-06,
"loss": 1.0328,
"step": 285
},
{
"epoch": 1.0859041290934979,
"grad_norm": 0.402190238237381,
"learning_rate": 2.350564430778847e-06,
"loss": 1.0613,
"step": 286
},
{
"epoch": 1.089700996677741,
"grad_norm": 0.36273133754730225,
"learning_rate": 2.3348561973283613e-06,
"loss": 0.9201,
"step": 287
},
{
"epoch": 1.093497864261984,
"grad_norm": 0.41140493750572205,
"learning_rate": 2.31915450962917e-06,
"loss": 1.0598,
"step": 288
},
{
"epoch": 1.097294731846227,
"grad_norm": 0.3959132730960846,
"learning_rate": 2.3034599900439703e-06,
"loss": 1.0547,
"step": 289
},
{
"epoch": 1.10109159943047,
"grad_norm": 0.43085983395576477,
"learning_rate": 2.2877732606513407e-06,
"loss": 1.1836,
"step": 290
},
{
"epoch": 1.1048884670147128,
"grad_norm": 0.37919700145721436,
"learning_rate": 2.2720949432210813e-06,
"loss": 1.0431,
"step": 291
},
{
"epoch": 1.1086853345989558,
"grad_norm": 0.3869677186012268,
"learning_rate": 2.2564256591895695e-06,
"loss": 0.9199,
"step": 292
},
{
"epoch": 1.1124822021831988,
"grad_norm": 0.40192729234695435,
"learning_rate": 2.2407660296351313e-06,
"loss": 0.9259,
"step": 293
},
{
"epoch": 1.1162790697674418,
"grad_norm": 0.41164687275886536,
"learning_rate": 2.225116675253418e-06,
"loss": 1.0489,
"step": 294
},
{
"epoch": 1.1200759373516849,
"grad_norm": 0.3917531371116638,
"learning_rate": 2.209478216332809e-06,
"loss": 1.0493,
"step": 295
},
{
"epoch": 1.123872804935928,
"grad_norm": 0.3413256108760834,
"learning_rate": 2.193851272729825e-06,
"loss": 0.9091,
"step": 296
},
{
"epoch": 1.127669672520171,
"grad_norm": 0.4337766170501709,
"learning_rate": 2.1782364638445545e-06,
"loss": 1.191,
"step": 297
},
{
"epoch": 1.131466540104414,
"grad_norm": 0.42421215772628784,
"learning_rate": 2.16263440859611e-06,
"loss": 1.1943,
"step": 298
},
{
"epoch": 1.1352634076886567,
"grad_norm": 0.38704779744148254,
"learning_rate": 2.1470457253980887e-06,
"loss": 1.1828,
"step": 299
},
{
"epoch": 1.1390602752728998,
"grad_norm": 0.38762617111206055,
"learning_rate": 2.131471032134067e-06,
"loss": 1.0479,
"step": 300
},
{
"epoch": 1.1428571428571428,
"grad_norm": 0.3805724084377289,
"learning_rate": 2.115910946133105e-06,
"loss": 1.049,
"step": 301
},
{
"epoch": 1.1466540104413858,
"grad_norm": 0.38618871569633484,
"learning_rate": 2.10036608414528e-06,
"loss": 1.0566,
"step": 302
},
{
"epoch": 1.1504508780256288,
"grad_norm": 0.32153603434562683,
"learning_rate": 2.084837062317241e-06,
"loss": 0.7648,
"step": 303
},
{
"epoch": 1.1542477456098719,
"grad_norm": 0.4316171109676361,
"learning_rate": 2.0693244961677834e-06,
"loss": 1.183,
"step": 304
},
{
"epoch": 1.158044613194115,
"grad_norm": 0.3866140842437744,
"learning_rate": 2.0538290005634564e-06,
"loss": 1.1986,
"step": 305
},
{
"epoch": 1.161841480778358,
"grad_norm": 0.41327404975891113,
"learning_rate": 2.0383511896941894e-06,
"loss": 1.1974,
"step": 306
},
{
"epoch": 1.165638348362601,
"grad_norm": 0.37834978103637695,
"learning_rate": 2.0228916770489466e-06,
"loss": 1.0443,
"step": 307
},
{
"epoch": 1.169435215946844,
"grad_norm": 0.41474854946136475,
"learning_rate": 2.0074510753914113e-06,
"loss": 1.1803,
"step": 308
},
{
"epoch": 1.1732320835310868,
"grad_norm": 0.3727324306964874,
"learning_rate": 1.9920299967356995e-06,
"loss": 1.055,
"step": 309
},
{
"epoch": 1.1770289511153298,
"grad_norm": 0.40379658341407776,
"learning_rate": 1.9766290523220997e-06,
"loss": 1.2038,
"step": 310
},
{
"epoch": 1.1808258186995728,
"grad_norm": 0.39679378271102905,
"learning_rate": 1.9612488525928453e-06,
"loss": 1.1826,
"step": 311
},
{
"epoch": 1.1846226862838158,
"grad_norm": 0.4498043954372406,
"learning_rate": 1.94589000716792e-06,
"loss": 1.1875,
"step": 312
},
{
"epoch": 1.1884195538680589,
"grad_norm": 0.41798946261405945,
"learning_rate": 1.9305531248208948e-06,
"loss": 1.1847,
"step": 313
},
{
"epoch": 1.1922164214523019,
"grad_norm": 0.42096462845802307,
"learning_rate": 1.9152388134547944e-06,
"loss": 1.059,
"step": 314
},
{
"epoch": 1.196013289036545,
"grad_norm": 0.3939502537250519,
"learning_rate": 1.8999476800780087e-06,
"loss": 1.0457,
"step": 315
},
{
"epoch": 1.199810156620788,
"grad_norm": 0.44725000858306885,
"learning_rate": 1.8846803307802263e-06,
"loss": 1.1939,
"step": 316
},
{
"epoch": 1.2036070242050307,
"grad_norm": 0.3450300097465515,
"learning_rate": 1.8694373707084148e-06,
"loss": 0.9045,
"step": 317
},
{
"epoch": 1.2074038917892738,
"grad_norm": 0.4713125228881836,
"learning_rate": 1.854219404042834e-06,
"loss": 1.1848,
"step": 318
},
{
"epoch": 1.2112007593735168,
"grad_norm": 0.33273351192474365,
"learning_rate": 1.8390270339730892e-06,
"loss": 0.7687,
"step": 319
},
{
"epoch": 1.2149976269577598,
"grad_norm": 0.35393235087394714,
"learning_rate": 1.82386086267422e-06,
"loss": 0.9687,
"step": 320
},
{
"epoch": 1.2187944945420028,
"grad_norm": 0.35949307680130005,
"learning_rate": 1.8087214912828343e-06,
"loss": 0.9033,
"step": 321
},
{
"epoch": 1.2225913621262459,
"grad_norm": 0.42038631439208984,
"learning_rate": 1.793609519873282e-06,
"loss": 1.1888,
"step": 322
},
{
"epoch": 1.2263882297104889,
"grad_norm": 0.4522581696510315,
"learning_rate": 1.7785255474338675e-06,
"loss": 1.188,
"step": 323
},
{
"epoch": 1.230185097294732,
"grad_norm": 0.3686380386352539,
"learning_rate": 1.763470171843109e-06,
"loss": 1.0683,
"step": 324
},
{
"epoch": 1.233981964878975,
"grad_norm": 0.4130842685699463,
"learning_rate": 1.748443989846042e-06,
"loss": 1.1772,
"step": 325
},
{
"epoch": 1.237778832463218,
"grad_norm": 0.3096576929092407,
"learning_rate": 1.7334475970305639e-06,
"loss": 0.767,
"step": 326
},
{
"epoch": 1.2415757000474608,
"grad_norm": 0.3671686351299286,
"learning_rate": 1.718481587803828e-06,
"loss": 0.9131,
"step": 327
},
{
"epoch": 1.2453725676317038,
"grad_norm": 0.4213862717151642,
"learning_rate": 1.7035465553686839e-06,
"loss": 1.1863,
"step": 328
},
{
"epoch": 1.2491694352159468,
"grad_norm": 0.3827097713947296,
"learning_rate": 1.688643091700164e-06,
"loss": 1.0456,
"step": 329
},
{
"epoch": 1.2529663028001898,
"grad_norm": 0.3981434106826782,
"learning_rate": 1.6737717875220177e-06,
"loss": 1.0414,
"step": 330
},
{
"epoch": 1.2567631703844329,
"grad_norm": 0.3808799982070923,
"learning_rate": 1.6589332322833035e-06,
"loss": 1.0457,
"step": 331
},
{
"epoch": 1.2605600379686759,
"grad_norm": 0.39812132716178894,
"learning_rate": 1.6441280141350169e-06,
"loss": 1.0624,
"step": 332
},
{
"epoch": 1.264356905552919,
"grad_norm": 0.3926783502101898,
"learning_rate": 1.6293567199067827e-06,
"loss": 1.1868,
"step": 333
},
{
"epoch": 1.2681537731371617,
"grad_norm": 0.42180249094963074,
"learning_rate": 1.6146199350835956e-06,
"loss": 1.1785,
"step": 334
},
{
"epoch": 1.2719506407214047,
"grad_norm": 0.3862990736961365,
"learning_rate": 1.5999182437826111e-06,
"loss": 1.0557,
"step": 335
},
{
"epoch": 1.2757475083056478,
"grad_norm": 0.33035123348236084,
"learning_rate": 1.5852522287299946e-06,
"loss": 0.9202,
"step": 336
},
{
"epoch": 1.2795443758898908,
"grad_norm": 0.3912082612514496,
"learning_rate": 1.5706224712378237e-06,
"loss": 1.0578,
"step": 337
},
{
"epoch": 1.2833412434741338,
"grad_norm": 0.368190199136734,
"learning_rate": 1.5560295511810465e-06,
"loss": 1.0456,
"step": 338
},
{
"epoch": 1.2871381110583768,
"grad_norm": 0.3697574734687805,
"learning_rate": 1.5414740469744986e-06,
"loss": 1.0412,
"step": 339
},
{
"epoch": 1.2909349786426199,
"grad_norm": 0.3395715057849884,
"learning_rate": 1.5269565355499738e-06,
"loss": 0.9436,
"step": 340
},
{
"epoch": 1.2947318462268629,
"grad_norm": 0.38877978920936584,
"learning_rate": 1.5124775923333604e-06,
"loss": 1.0565,
"step": 341
},
{
"epoch": 1.298528713811106,
"grad_norm": 0.3653126060962677,
"learning_rate": 1.4980377912218291e-06,
"loss": 1.0534,
"step": 342
},
{
"epoch": 1.302325581395349,
"grad_norm": 0.403150737285614,
"learning_rate": 1.4836377045610903e-06,
"loss": 1.063,
"step": 343
},
{
"epoch": 1.306122448979592,
"grad_norm": 0.3969637155532837,
"learning_rate": 1.4692779031227032e-06,
"loss": 1.1949,
"step": 344
},
{
"epoch": 1.3099193165638348,
"grad_norm": 0.3859919607639313,
"learning_rate": 1.4549589560814558e-06,
"loss": 1.0345,
"step": 345
},
{
"epoch": 1.3137161841480778,
"grad_norm": 0.3501088321208954,
"learning_rate": 1.440681430992803e-06,
"loss": 1.0387,
"step": 346
},
{
"epoch": 1.3175130517323208,
"grad_norm": 0.38995981216430664,
"learning_rate": 1.4264458937703717e-06,
"loss": 1.0632,
"step": 347
},
{
"epoch": 1.3213099193165638,
"grad_norm": 0.3388945460319519,
"learning_rate": 1.4122529086635312e-06,
"loss": 0.9119,
"step": 348
},
{
"epoch": 1.3251067869008069,
"grad_norm": 0.37815216183662415,
"learning_rate": 1.398103038235022e-06,
"loss": 1.1801,
"step": 349
},
{
"epoch": 1.3289036544850499,
"grad_norm": 0.3603959381580353,
"learning_rate": 1.3839968433386659e-06,
"loss": 1.0401,
"step": 350
},
{
"epoch": 1.332700522069293,
"grad_norm": 0.4169382154941559,
"learning_rate": 1.3699348830971316e-06,
"loss": 1.1889,
"step": 351
},
{
"epoch": 1.3364973896535357,
"grad_norm": 0.4009808599948883,
"learning_rate": 1.3559177148797698e-06,
"loss": 1.1802,
"step": 352
},
{
"epoch": 1.3402942572377787,
"grad_norm": 0.3702283203601837,
"learning_rate": 1.3419458942805274e-06,
"loss": 0.9253,
"step": 353
},
{
"epoch": 1.3440911248220218,
"grad_norm": 0.3447802662849426,
"learning_rate": 1.3280199750959233e-06,
"loss": 0.9254,
"step": 354
},
{
"epoch": 1.3478879924062648,
"grad_norm": 0.40221476554870605,
"learning_rate": 1.314140509303093e-06,
"loss": 1.0729,
"step": 355
},
{
"epoch": 1.3516848599905078,
"grad_norm": 0.40008315443992615,
"learning_rate": 1.3003080470379176e-06,
"loss": 1.1839,
"step": 356
},
{
"epoch": 1.3554817275747508,
"grad_norm": 0.4559074640274048,
"learning_rate": 1.2865231365732136e-06,
"loss": 1.184,
"step": 357
},
{
"epoch": 1.3592785951589939,
"grad_norm": 0.3520359694957733,
"learning_rate": 1.2727863242970007e-06,
"loss": 0.9056,
"step": 358
},
{
"epoch": 1.3630754627432369,
"grad_norm": 0.39236965775489807,
"learning_rate": 1.2590981546908481e-06,
"loss": 1.1798,
"step": 359
},
{
"epoch": 1.36687233032748,
"grad_norm": 0.39921608567237854,
"learning_rate": 1.245459170308292e-06,
"loss": 1.1876,
"step": 360
},
{
"epoch": 1.370669197911723,
"grad_norm": 0.32645294070243835,
"learning_rate": 1.2318699117533264e-06,
"loss": 0.9214,
"step": 361
},
{
"epoch": 1.374466065495966,
"grad_norm": 0.3884054720401764,
"learning_rate": 1.2183309176589822e-06,
"loss": 1.1828,
"step": 362
},
{
"epoch": 1.3782629330802088,
"grad_norm": 0.3472171425819397,
"learning_rate": 1.2048427246659738e-06,
"loss": 0.9014,
"step": 363
},
{
"epoch": 1.3820598006644518,
"grad_norm": 0.34238362312316895,
"learning_rate": 1.1914058674014264e-06,
"loss": 0.9166,
"step": 364
},
{
"epoch": 1.3858566682486948,
"grad_norm": 0.4093136489391327,
"learning_rate": 1.17802087845769e-06,
"loss": 1.1757,
"step": 365
},
{
"epoch": 1.3896535358329378,
"grad_norm": 0.4089086949825287,
"learning_rate": 1.164688288371227e-06,
"loss": 1.1898,
"step": 366
},
{
"epoch": 1.3934504034171808,
"grad_norm": 0.36652591824531555,
"learning_rate": 1.1514086256015803e-06,
"loss": 1.0574,
"step": 367
},
{
"epoch": 1.3972472710014239,
"grad_norm": 0.3458910584449768,
"learning_rate": 1.138182416510434e-06,
"loss": 1.0404,
"step": 368
},
{
"epoch": 1.401044138585667,
"grad_norm": 0.403898149728775,
"learning_rate": 1.1250101853407427e-06,
"loss": 1.185,
"step": 369
},
{
"epoch": 1.4048410061699097,
"grad_norm": 0.38953447341918945,
"learning_rate": 1.1118924541959573e-06,
"loss": 1.1767,
"step": 370
},
{
"epoch": 1.4086378737541527,
"grad_norm": 0.41569021344184875,
"learning_rate": 1.09882974301933e-06,
"loss": 1.1867,
"step": 371
},
{
"epoch": 1.4124347413383957,
"grad_norm": 0.3661198019981384,
"learning_rate": 1.0858225695733006e-06,
"loss": 1.0516,
"step": 372
},
{
"epoch": 1.4162316089226388,
"grad_norm": 0.40183043479919434,
"learning_rate": 1.072871449418982e-06,
"loss": 1.0376,
"step": 373
},
{
"epoch": 1.4200284765068818,
"grad_norm": 0.3806290030479431,
"learning_rate": 1.0599768958957193e-06,
"loss": 1.0473,
"step": 374
},
{
"epoch": 1.4238253440911248,
"grad_norm": 0.3974708616733551,
"learning_rate": 1.0471394201007435e-06,
"loss": 1.1799,
"step": 375
},
{
"epoch": 1.4276222116753678,
"grad_norm": 0.37168920040130615,
"learning_rate": 1.0343595308689156e-06,
"loss": 1.0478,
"step": 376
},
{
"epoch": 1.4314190792596109,
"grad_norm": 0.36702632904052734,
"learning_rate": 1.021637734752557e-06,
"loss": 1.0368,
"step": 377
},
{
"epoch": 1.435215946843854,
"grad_norm": 0.36013931035995483,
"learning_rate": 1.0089745360013685e-06,
"loss": 1.0488,
"step": 378
},
{
"epoch": 1.439012814428097,
"grad_norm": 0.39848053455352783,
"learning_rate": 9.963704365424494e-07,
"loss": 1.0501,
"step": 379
},
{
"epoch": 1.44280968201234,
"grad_norm": 0.3630136549472809,
"learning_rate": 9.838259359603987e-07,
"loss": 1.0455,
"step": 380
},
{
"epoch": 1.4466065495965827,
"grad_norm": 0.38877931237220764,
"learning_rate": 9.713415314775122e-07,
"loss": 1.1887,
"step": 381
},
{
"epoch": 1.4504034171808258,
"grad_norm": 0.3823590874671936,
"learning_rate": 9.589177179340775e-07,
"loss": 1.1795,
"step": 382
},
{
"epoch": 1.4542002847650688,
"grad_norm": 0.31407251954078674,
"learning_rate": 9.465549877687602e-07,
"loss": 0.8088,
"step": 383
},
{
"epoch": 1.4579971523493118,
"grad_norm": 0.3785543143749237,
"learning_rate": 9.342538309990804e-07,
"loss": 1.0545,
"step": 384
},
{
"epoch": 1.4617940199335548,
"grad_norm": 0.3719753921031952,
"learning_rate": 9.220147352019965e-07,
"loss": 1.1924,
"step": 385
},
{
"epoch": 1.4655908875177979,
"grad_norm": 0.3974050283432007,
"learning_rate": 9.098381854945762e-07,
"loss": 1.0512,
"step": 386
},
{
"epoch": 1.469387755102041,
"grad_norm": 0.380188912153244,
"learning_rate": 8.977246645147655e-07,
"loss": 1.1842,
"step": 387
},
{
"epoch": 1.4731846226862837,
"grad_norm": 0.33522146940231323,
"learning_rate": 8.856746524022647e-07,
"loss": 0.9293,
"step": 388
},
{
"epoch": 1.4769814902705267,
"grad_norm": 0.34948912262916565,
"learning_rate": 8.736886267794911e-07,
"loss": 1.0576,
"step": 389
},
{
"epoch": 1.4807783578547697,
"grad_norm": 0.3386431336402893,
"learning_rate": 8.617670627326503e-07,
"loss": 0.9029,
"step": 390
},
{
"epoch": 1.4845752254390128,
"grad_norm": 0.32754087448120117,
"learning_rate": 8.49910432792907e-07,
"loss": 0.9201,
"step": 391
},
{
"epoch": 1.4883720930232558,
"grad_norm": 0.38169023394584656,
"learning_rate": 8.381192069176539e-07,
"loss": 1.1824,
"step": 392
},
{
"epoch": 1.4921689606074988,
"grad_norm": 0.3721928596496582,
"learning_rate": 8.263938524718812e-07,
"loss": 1.0526,
"step": 393
},
{
"epoch": 1.4959658281917418,
"grad_norm": 0.3896506130695343,
"learning_rate": 8.147348342096579e-07,
"loss": 1.1811,
"step": 394
},
{
"epoch": 1.4997626957759849,
"grad_norm": 0.3741244375705719,
"learning_rate": 8.031426142557061e-07,
"loss": 1.0512,
"step": 395
},
{
"epoch": 1.503559563360228,
"grad_norm": 0.3273400366306305,
"learning_rate": 7.916176520870836e-07,
"loss": 0.905,
"step": 396
},
{
"epoch": 1.507356430944471,
"grad_norm": 0.3622443675994873,
"learning_rate": 7.80160404514975e-07,
"loss": 1.0561,
"step": 397
},
{
"epoch": 1.511153298528714,
"grad_norm": 0.3936561048030853,
"learning_rate": 7.687713256665835e-07,
"loss": 1.1887,
"step": 398
},
{
"epoch": 1.514950166112957,
"grad_norm": 0.37885287404060364,
"learning_rate": 7.574508669671288e-07,
"loss": 1.185,
"step": 399
},
{
"epoch": 1.5187470336971998,
"grad_norm": 0.36686670780181885,
"learning_rate": 7.46199477121958e-07,
"loss": 1.1886,
"step": 400
},
{
"epoch": 1.5225439012814428,
"grad_norm": 0.3786951005458832,
"learning_rate": 7.350176020987585e-07,
"loss": 1.1841,
"step": 401
},
{
"epoch": 1.5263407688656858,
"grad_norm": 0.3833267390727997,
"learning_rate": 7.239056851098785e-07,
"loss": 1.191,
"step": 402
},
{
"epoch": 1.5301376364499288,
"grad_norm": 0.2992788553237915,
"learning_rate": 7.128641665947658e-07,
"loss": 0.7744,
"step": 403
},
{
"epoch": 1.5339345040341716,
"grad_norm": 0.379607230424881,
"learning_rate": 7.018934842025058e-07,
"loss": 1.1837,
"step": 404
},
{
"epoch": 1.5377313716184147,
"grad_norm": 0.3761638104915619,
"learning_rate": 6.90994072774473e-07,
"loss": 1.0384,
"step": 405
},
{
"epoch": 1.5415282392026577,
"grad_norm": 0.405567467212677,
"learning_rate": 6.801663643271012e-07,
"loss": 1.1853,
"step": 406
},
{
"epoch": 1.5453251067869007,
"grad_norm": 0.3884119987487793,
"learning_rate": 6.69410788034755e-07,
"loss": 1.1932,
"step": 407
},
{
"epoch": 1.5491219743711437,
"grad_norm": 0.35525161027908325,
"learning_rate": 6.587277702127196e-07,
"loss": 1.0483,
"step": 408
},
{
"epoch": 1.5529188419553868,
"grad_norm": 0.3150971829891205,
"learning_rate": 6.481177343003043e-07,
"loss": 0.9127,
"step": 409
},
{
"epoch": 1.5567157095396298,
"grad_norm": 0.39407217502593994,
"learning_rate": 6.375811008440591e-07,
"loss": 1.179,
"step": 410
},
{
"epoch": 1.5605125771238728,
"grad_norm": 0.402775377035141,
"learning_rate": 6.271182874811024e-07,
"loss": 1.0475,
"step": 411
},
{
"epoch": 1.5643094447081158,
"grad_norm": 0.3489883840084076,
"learning_rate": 6.167297089225713e-07,
"loss": 0.9048,
"step": 412
},
{
"epoch": 1.5681063122923589,
"grad_norm": 0.35091614723205566,
"learning_rate": 6.064157769371823e-07,
"loss": 1.0413,
"step": 413
},
{
"epoch": 1.5719031798766019,
"grad_norm": 0.35318344831466675,
"learning_rate": 5.961769003349077e-07,
"loss": 1.0326,
"step": 414
},
{
"epoch": 1.575700047460845,
"grad_norm": 0.3418586850166321,
"learning_rate": 5.860134849507765e-07,
"loss": 0.9169,
"step": 415
},
{
"epoch": 1.579496915045088,
"grad_norm": 0.35626721382141113,
"learning_rate": 5.759259336287851e-07,
"loss": 1.0404,
"step": 416
},
{
"epoch": 1.583293782629331,
"grad_norm": 0.3962875306606293,
"learning_rate": 5.659146462059292e-07,
"loss": 1.1874,
"step": 417
},
{
"epoch": 1.5870906502135738,
"grad_norm": 0.3754862844944,
"learning_rate": 5.559800194963591e-07,
"loss": 1.1858,
"step": 418
},
{
"epoch": 1.5908875177978168,
"grad_norm": 0.37421101331710815,
"learning_rate": 5.46122447275649e-07,
"loss": 1.1813,
"step": 419
},
{
"epoch": 1.5946843853820598,
"grad_norm": 0.3670285642147064,
"learning_rate": 5.363423202651876e-07,
"loss": 1.0544,
"step": 420
},
{
"epoch": 1.5984812529663028,
"grad_norm": 0.42034921050071716,
"learning_rate": 5.266400261166951e-07,
"loss": 1.1884,
"step": 421
},
{
"epoch": 1.6022781205505456,
"grad_norm": 0.3546159565448761,
"learning_rate": 5.170159493968549e-07,
"loss": 1.1814,
"step": 422
},
{
"epoch": 1.6060749881347887,
"grad_norm": 0.3211246728897095,
"learning_rate": 5.074704715720711e-07,
"loss": 0.9178,
"step": 423
},
{
"epoch": 1.6098718557190317,
"grad_norm": 0.3514634370803833,
"learning_rate": 4.980039709933492e-07,
"loss": 0.913,
"step": 424
},
{
"epoch": 1.6136687233032747,
"grad_norm": 0.4017420709133148,
"learning_rate": 4.886168228813007e-07,
"loss": 1.1828,
"step": 425
},
{
"epoch": 1.6174655908875177,
"grad_norm": 0.3313949406147003,
"learning_rate": 4.793093993112663e-07,
"loss": 0.9082,
"step": 426
},
{
"epoch": 1.6212624584717608,
"grad_norm": 0.33639460802078247,
"learning_rate": 4.700820691985739e-07,
"loss": 0.9069,
"step": 427
},
{
"epoch": 1.6250593260560038,
"grad_norm": 0.3237963616847992,
"learning_rate": 4.6093519828391025e-07,
"loss": 0.894,
"step": 428
},
{
"epoch": 1.6288561936402468,
"grad_norm": 0.3815336525440216,
"learning_rate": 4.51869149118829e-07,
"loss": 1.1764,
"step": 429
},
{
"epoch": 1.6326530612244898,
"grad_norm": 0.36702704429626465,
"learning_rate": 4.428842810513784e-07,
"loss": 1.1678,
"step": 430
},
{
"epoch": 1.6364499288087329,
"grad_norm": 0.3530699610710144,
"learning_rate": 4.3398095021185557e-07,
"loss": 1.1782,
"step": 431
},
{
"epoch": 1.6402467963929759,
"grad_norm": 0.33500173687934875,
"learning_rate": 4.251595094986957e-07,
"loss": 1.0625,
"step": 432
},
{
"epoch": 1.644043663977219,
"grad_norm": 0.37163975834846497,
"learning_rate": 4.1642030856448104e-07,
"loss": 1.1739,
"step": 433
},
{
"epoch": 1.647840531561462,
"grad_norm": 0.37387996912002563,
"learning_rate": 4.077636938020807e-07,
"loss": 1.0592,
"step": 434
},
{
"epoch": 1.651637399145705,
"grad_norm": 0.3629416823387146,
"learning_rate": 3.991900083309241e-07,
"loss": 1.1718,
"step": 435
},
{
"epoch": 1.6554342667299478,
"grad_norm": 0.3603303134441376,
"learning_rate": 3.906995919833997e-07,
"loss": 1.0432,
"step": 436
},
{
"epoch": 1.6592311343141908,
"grad_norm": 0.31616950035095215,
"learning_rate": 3.8229278129138293e-07,
"loss": 0.9182,
"step": 437
},
{
"epoch": 1.6630280018984338,
"grad_norm": 0.3574075698852539,
"learning_rate": 3.739699094729002e-07,
"loss": 1.0423,
"step": 438
},
{
"epoch": 1.6668248694826768,
"grad_norm": 0.33015620708465576,
"learning_rate": 3.6573130641892053e-07,
"loss": 0.8927,
"step": 439
},
{
"epoch": 1.6706217370669196,
"grad_norm": 0.32390230894088745,
"learning_rate": 3.575772986802775e-07,
"loss": 0.9272,
"step": 440
},
{
"epoch": 1.6744186046511627,
"grad_norm": 0.37338370084762573,
"learning_rate": 3.4950820945472945e-07,
"loss": 1.1717,
"step": 441
},
{
"epoch": 1.6782154722354057,
"grad_norm": 0.35633939504623413,
"learning_rate": 3.4152435857414676e-07,
"loss": 1.1792,
"step": 442
},
{
"epoch": 1.6820123398196487,
"grad_norm": 0.35473427176475525,
"learning_rate": 3.3362606249183446e-07,
"loss": 1.0391,
"step": 443
},
{
"epoch": 1.6858092074038917,
"grad_norm": 0.36706259846687317,
"learning_rate": 3.2581363426998966e-07,
"loss": 1.0463,
"step": 444
},
{
"epoch": 1.6896060749881348,
"grad_norm": 0.3562609553337097,
"learning_rate": 3.18087383567294e-07,
"loss": 1.0515,
"step": 445
},
{
"epoch": 1.6934029425723778,
"grad_norm": 0.35783901810646057,
"learning_rate": 3.1044761662663933e-07,
"loss": 1.0449,
"step": 446
},
{
"epoch": 1.6971998101566208,
"grad_norm": 0.3588060736656189,
"learning_rate": 3.0289463626298585e-07,
"loss": 1.1823,
"step": 447
},
{
"epoch": 1.7009966777408638,
"grad_norm": 0.35698550939559937,
"learning_rate": 2.9542874185136545e-07,
"loss": 1.1792,
"step": 448
},
{
"epoch": 1.7047935453251069,
"grad_norm": 0.37660661339759827,
"learning_rate": 2.880502293150117e-07,
"loss": 1.1868,
"step": 449
},
{
"epoch": 1.7085904129093499,
"grad_norm": 0.38808876276016235,
"learning_rate": 2.8075939111362915e-07,
"loss": 1.1823,
"step": 450
},
{
"epoch": 1.712387280493593,
"grad_norm": 0.35739246010780334,
"learning_rate": 2.7355651623180574e-07,
"loss": 1.1793,
"step": 451
},
{
"epoch": 1.716184148077836,
"grad_norm": 0.3609924018383026,
"learning_rate": 2.6644189016755415e-07,
"loss": 1.0662,
"step": 452
},
{
"epoch": 1.719981015662079,
"grad_norm": 0.3684343099594116,
"learning_rate": 2.5941579492099853e-07,
"loss": 1.1712,
"step": 453
},
{
"epoch": 1.7237778832463218,
"grad_norm": 0.3490484952926636,
"learning_rate": 2.524785089831955e-07,
"loss": 1.0555,
"step": 454
},
{
"epoch": 1.7275747508305648,
"grad_norm": 0.32522618770599365,
"learning_rate": 2.456303073250943e-07,
"loss": 0.9226,
"step": 455
},
{
"epoch": 1.7313716184148078,
"grad_norm": 0.38961344957351685,
"learning_rate": 2.388714613866422e-07,
"loss": 1.1842,
"step": 456
},
{
"epoch": 1.7351684859990508,
"grad_norm": 0.35974937677383423,
"learning_rate": 2.3220223906602113e-07,
"loss": 1.0626,
"step": 457
},
{
"epoch": 1.7389653535832936,
"grad_norm": 0.3246191143989563,
"learning_rate": 2.2562290470903082e-07,
"loss": 0.9113,
"step": 458
},
{
"epoch": 1.7427622211675367,
"grad_norm": 0.3299662172794342,
"learning_rate": 2.191337190986112e-07,
"loss": 0.9153,
"step": 459
},
{
"epoch": 1.7465590887517797,
"grad_norm": 0.38010498881340027,
"learning_rate": 2.1273493944450634e-07,
"loss": 1.1955,
"step": 460
},
{
"epoch": 1.7503559563360227,
"grad_norm": 0.38703298568725586,
"learning_rate": 2.06426819373067e-07,
"loss": 1.1757,
"step": 461
},
{
"epoch": 1.7541528239202657,
"grad_norm": 0.294664204120636,
"learning_rate": 2.0020960891720147e-07,
"loss": 0.7584,
"step": 462
},
{
"epoch": 1.7579496915045087,
"grad_norm": 0.3676691949367523,
"learning_rate": 1.9408355450646234e-07,
"loss": 1.1826,
"step": 463
},
{
"epoch": 1.7617465590887518,
"grad_norm": 0.36037302017211914,
"learning_rate": 1.8804889895727872e-07,
"loss": 1.046,
"step": 464
},
{
"epoch": 1.7655434266729948,
"grad_norm": 0.3599529266357422,
"learning_rate": 1.821058814633339e-07,
"loss": 1.0328,
"step": 465
},
{
"epoch": 1.7693402942572378,
"grad_norm": 0.32030507922172546,
"learning_rate": 1.762547375860832e-07,
"loss": 0.9193,
"step": 466
},
{
"epoch": 1.7731371618414808,
"grad_norm": 0.4202202558517456,
"learning_rate": 1.7049569924541653e-07,
"loss": 1.1895,
"step": 467
},
{
"epoch": 1.7769340294257239,
"grad_norm": 0.35407963395118713,
"learning_rate": 1.6482899471046726e-07,
"loss": 1.0546,
"step": 468
},
{
"epoch": 1.780730897009967,
"grad_norm": 0.3341258466243744,
"learning_rate": 1.5925484859056372e-07,
"loss": 1.0506,
"step": 469
},
{
"epoch": 1.78452776459421,
"grad_norm": 0.3770039677619934,
"learning_rate": 1.5377348182632536e-07,
"loss": 1.1839,
"step": 470
},
{
"epoch": 1.7883246321784527,
"grad_norm": 0.3668918311595917,
"learning_rate": 1.4838511168090707e-07,
"loss": 1.1741,
"step": 471
},
{
"epoch": 1.7921214997626957,
"grad_norm": 0.3690701425075531,
"learning_rate": 1.4308995173138828e-07,
"loss": 1.1862,
"step": 472
},
{
"epoch": 1.7959183673469388,
"grad_norm": 0.33050721883773804,
"learning_rate": 1.3788821186030338e-07,
"loss": 1.0628,
"step": 473
},
{
"epoch": 1.7997152349311818,
"grad_norm": 0.35489219427108765,
"learning_rate": 1.3278009824732763e-07,
"loss": 1.0405,
"step": 474
},
{
"epoch": 1.8035121025154248,
"grad_norm": 0.3431392014026642,
"learning_rate": 1.2776581336110234e-07,
"loss": 1.0425,
"step": 475
},
{
"epoch": 1.8073089700996676,
"grad_norm": 0.35951146483421326,
"learning_rate": 1.2284555595120901e-07,
"loss": 1.192,
"step": 476
},
{
"epoch": 1.8111058376839106,
"grad_norm": 0.34095799922943115,
"learning_rate": 1.1801952104029347e-07,
"loss": 0.9311,
"step": 477
},
{
"epoch": 1.8149027052681537,
"grad_norm": 0.36434096097946167,
"learning_rate": 1.1328789991633532e-07,
"loss": 1.1809,
"step": 478
},
{
"epoch": 1.8186995728523967,
"grad_norm": 0.35194143652915955,
"learning_rate": 1.0865088012506408e-07,
"loss": 1.0538,
"step": 479
},
{
"epoch": 1.8224964404366397,
"grad_norm": 0.3786377012729645,
"learning_rate": 1.0410864546252841e-07,
"loss": 1.1839,
"step": 480
},
{
"epoch": 1.8262933080208827,
"grad_norm": 0.3330764174461365,
"learning_rate": 9.966137596780945e-08,
"loss": 1.0354,
"step": 481
},
{
"epoch": 1.8300901756051258,
"grad_norm": 0.33103281259536743,
"learning_rate": 9.530924791588319e-08,
"loss": 0.956,
"step": 482
},
{
"epoch": 1.8338870431893688,
"grad_norm": 0.34974756836891174,
"learning_rate": 9.10524338106375e-08,
"loss": 1.1761,
"step": 483
},
{
"epoch": 1.8376839107736118,
"grad_norm": 0.3340107202529907,
"learning_rate": 8.689110237803056e-08,
"loss": 1.0462,
"step": 484
},
{
"epoch": 1.8414807783578548,
"grad_norm": 0.337950199842453,
"learning_rate": 8.282541855940546e-08,
"loss": 1.0587,
"step": 485
},
{
"epoch": 1.8452776459420979,
"grad_norm": 0.34110939502716064,
"learning_rate": 7.885554350495206e-08,
"loss": 1.1823,
"step": 486
},
{
"epoch": 1.849074513526341,
"grad_norm": 0.3682970702648163,
"learning_rate": 7.498163456731878e-08,
"loss": 1.1863,
"step": 487
},
{
"epoch": 1.852871381110584,
"grad_norm": 0.3614114820957184,
"learning_rate": 7.120384529537672e-08,
"loss": 1.1923,
"step": 488
},
{
"epoch": 1.8566682486948267,
"grad_norm": 1.5183669328689575,
"learning_rate": 6.752232542813319e-08,
"loss": 0.5159,
"step": 489
},
{
"epoch": 1.8604651162790697,
"grad_norm": 1.4683269262313843,
"learning_rate": 6.393722088879534e-08,
"loss": 0.504,
"step": 490
},
{
"epoch": 1.8642619838633128,
"grad_norm": 1.4609661102294922,
"learning_rate": 6.044867377898806e-08,
"loss": 0.5112,
"step": 491
},
{
"epoch": 1.8680588514475558,
"grad_norm": 1.6301960945129395,
"learning_rate": 5.7056822373121324e-08,
"loss": 0.5463,
"step": 492
},
{
"epoch": 1.8718557190317988,
"grad_norm": 1.419601321220398,
"learning_rate": 5.3761801112907356e-08,
"loss": 0.4816,
"step": 493
},
{
"epoch": 1.8756525866160416,
"grad_norm": 1.1993861198425293,
"learning_rate": 5.0563740602034284e-08,
"loss": 0.418,
"step": 494
},
{
"epoch": 1.8794494542002846,
"grad_norm": 1.2279670238494873,
"learning_rate": 4.746276760098867e-08,
"loss": 0.4118,
"step": 495
},
{
"epoch": 1.8832463217845277,
"grad_norm": 0.9904822111129761,
"learning_rate": 4.44590050220306e-08,
"loss": 0.3383,
"step": 496
},
{
"epoch": 1.8870431893687707,
"grad_norm": 1.4921010732650757,
"learning_rate": 4.155257192432205e-08,
"loss": 0.5024,
"step": 497
},
{
"epoch": 1.8908400569530137,
"grad_norm": 1.2175933122634888,
"learning_rate": 3.874358350920843e-08,
"loss": 0.4521,
"step": 498
},
{
"epoch": 1.8946369245372567,
"grad_norm": 0.9808973670005798,
"learning_rate": 3.603215111565139e-08,
"loss": 0.3847,
"step": 499
},
{
"epoch": 1.8984337921214998,
"grad_norm": 1.0177325010299683,
"learning_rate": 3.341838221581656e-08,
"loss": 0.433,
"step": 500
},
{
"epoch": 1.9022306597057428,
"grad_norm": 1.1320708990097046,
"learning_rate": 3.090238041081328e-08,
"loss": 0.4865,
"step": 501
},
{
"epoch": 1.9060275272899858,
"grad_norm": 0.8693588972091675,
"learning_rate": 2.848424542658823e-08,
"loss": 0.3843,
"step": 502
},
{
"epoch": 1.9098243948742288,
"grad_norm": 1.0695801973342896,
"learning_rate": 2.6164073109972986e-08,
"loss": 0.4924,
"step": 503
},
{
"epoch": 1.9136212624584719,
"grad_norm": 1.0257716178894043,
"learning_rate": 2.3941955424884312e-08,
"loss": 0.4785,
"step": 504
},
{
"epoch": 1.9174181300427149,
"grad_norm": 0.8927969932556152,
"learning_rate": 2.1817980448679553e-08,
"loss": 0.4477,
"step": 505
},
{
"epoch": 1.921214997626958,
"grad_norm": 0.960476815700531,
"learning_rate": 1.979223236866501e-08,
"loss": 0.484,
"step": 506
},
{
"epoch": 1.9250118652112007,
"grad_norm": 0.8209058046340942,
"learning_rate": 1.7864791478760245e-08,
"loss": 0.4217,
"step": 507
},
{
"epoch": 1.9288087327954437,
"grad_norm": 0.815679669380188,
"learning_rate": 1.603573417631371e-08,
"loss": 0.4293,
"step": 508
},
{
"epoch": 1.9326056003796868,
"grad_norm": 0.9071650505065918,
"learning_rate": 1.4305132959075706e-08,
"loss": 0.4821,
"step": 509
},
{
"epoch": 1.9364024679639298,
"grad_norm": 0.9293763637542725,
"learning_rate": 1.2673056422325413e-08,
"loss": 0.4737,
"step": 510
},
{
"epoch": 1.9401993355481728,
"grad_norm": 0.8001239895820618,
"learning_rate": 1.1139569256150285e-08,
"loss": 0.4284,
"step": 511
},
{
"epoch": 1.9439962031324156,
"grad_norm": 0.7981712222099304,
"learning_rate": 9.704732242883374e-09,
"loss": 0.4344,
"step": 512
},
{
"epoch": 1.9477930707166586,
"grad_norm": 0.5713729858398438,
"learning_rate": 8.368602254693603e-09,
"loss": 0.3397,
"step": 513
},
{
"epoch": 1.9515899383009017,
"grad_norm": 0.7548425197601318,
"learning_rate": 7.131232251331721e-09,
"loss": 0.4207,
"step": 514
},
{
"epoch": 1.9553868058851447,
"grad_norm": 0.6806543469429016,
"learning_rate": 5.992671278030327e-09,
"loss": 0.4099,
"step": 515
},
{
"epoch": 1.9591836734693877,
"grad_norm": 0.836120069026947,
"learning_rate": 4.952964463561805e-09,
"loss": 0.4671,
"step": 516
},
{
"epoch": 1.9629805410536307,
"grad_norm": 0.662464439868927,
"learning_rate": 4.012153018446984e-09,
"loss": 0.3776,
"step": 517
},
{
"epoch": 1.9667774086378738,
"grad_norm": 0.7680605053901672,
"learning_rate": 3.170274233324222e-09,
"loss": 0.4336,
"step": 518
},
{
"epoch": 1.9705742762221168,
"grad_norm": 0.7456918954849243,
"learning_rate": 2.4273614774691923e-09,
"loss": 0.4209,
"step": 519
},
{
"epoch": 1.9743711438063598,
"grad_norm": 0.7245385050773621,
"learning_rate": 1.7834441974740047e-09,
"loss": 0.4,
"step": 520
},
{
"epoch": 1.9781680113906028,
"grad_norm": 0.6544315814971924,
"learning_rate": 1.2385479160784141e-09,
"loss": 0.3741,
"step": 521
},
{
"epoch": 1.9819648789748459,
"grad_norm": 0.721889317035675,
"learning_rate": 7.926942311597962e-10,
"loss": 0.4156,
"step": 522
},
{
"epoch": 1.9857617465590889,
"grad_norm": 0.8772999048233032,
"learning_rate": 4.4590081487577706e-10,
"loss": 0.4664,
"step": 523
},
{
"epoch": 1.989558614143332,
"grad_norm": 0.8888481855392456,
"learning_rate": 1.9818141296451544e-10,
"loss": 0.4626,
"step": 524
},
{
"epoch": 1.9933554817275747,
"grad_norm": 0.7575893998146057,
"learning_rate": 4.954584419930575e-11,
"loss": 0.4245,
"step": 525
},
{
"epoch": 1.9971523493118177,
"grad_norm": 0.878999650478363,
"learning_rate": 0.0,
"loss": 0.4739,
"step": 526
}
],
"logging_steps": 1.0,
"max_steps": 526,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.6538932878717747e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}