{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9814814814814814,
"eval_steps": 68,
"global_step": 540,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003703703703703704,
"grad_norm": 0.40625306963920593,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.3427,
"step": 1
},
{
"epoch": 0.003703703703703704,
"eval_loss": 1.3691776990890503,
"eval_runtime": 80.493,
"eval_samples_per_second": 3.615,
"eval_steps_per_second": 0.46,
"step": 1
},
{
"epoch": 0.007407407407407408,
"grad_norm": 0.6216382384300232,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.3914,
"step": 2
},
{
"epoch": 0.011111111111111112,
"grad_norm": 0.4203539788722992,
"learning_rate": 3e-06,
"loss": 1.3421,
"step": 3
},
{
"epoch": 0.014814814814814815,
"grad_norm": 0.48187777400016785,
"learning_rate": 4.000000000000001e-06,
"loss": 1.3913,
"step": 4
},
{
"epoch": 0.018518518518518517,
"grad_norm": 0.4104997515678406,
"learning_rate": 5e-06,
"loss": 1.3264,
"step": 5
},
{
"epoch": 0.022222222222222223,
"grad_norm": 0.5217423439025879,
"learning_rate": 6e-06,
"loss": 1.3418,
"step": 6
},
{
"epoch": 0.025925925925925925,
"grad_norm": 0.40824779868125916,
"learning_rate": 7e-06,
"loss": 1.3761,
"step": 7
},
{
"epoch": 0.02962962962962963,
"grad_norm": 0.41881611943244934,
"learning_rate": 8.000000000000001e-06,
"loss": 1.3631,
"step": 8
},
{
"epoch": 0.03333333333333333,
"grad_norm": 0.43708905577659607,
"learning_rate": 9e-06,
"loss": 1.3911,
"step": 9
},
{
"epoch": 0.037037037037037035,
"grad_norm": 0.48373478651046753,
"learning_rate": 1e-05,
"loss": 1.3813,
"step": 10
},
{
"epoch": 0.040740740740740744,
"grad_norm": 0.428241491317749,
"learning_rate": 9.999912161129377e-06,
"loss": 1.3825,
"step": 11
},
{
"epoch": 0.044444444444444446,
"grad_norm": 0.4543517827987671,
"learning_rate": 9.999648647603774e-06,
"loss": 1.3413,
"step": 12
},
{
"epoch": 0.04814814814814815,
"grad_norm": 0.48931288719177246,
"learning_rate": 9.999209468681885e-06,
"loss": 1.4078,
"step": 13
},
{
"epoch": 0.05185185185185185,
"grad_norm": 0.47361329197883606,
"learning_rate": 9.998594639794502e-06,
"loss": 1.3926,
"step": 14
},
{
"epoch": 0.05555555555555555,
"grad_norm": 0.46920689940452576,
"learning_rate": 9.997804182543973e-06,
"loss": 1.3043,
"step": 15
},
{
"epoch": 0.05925925925925926,
"grad_norm": 0.44550788402557373,
"learning_rate": 9.996838124703448e-06,
"loss": 1.3535,
"step": 16
},
{
"epoch": 0.06296296296296296,
"grad_norm": 0.4951707720756531,
"learning_rate": 9.995696500215899e-06,
"loss": 1.3355,
"step": 17
},
{
"epoch": 0.06666666666666667,
"grad_norm": 0.5006001591682434,
"learning_rate": 9.994379349192927e-06,
"loss": 1.3064,
"step": 18
},
{
"epoch": 0.07037037037037037,
"grad_norm": 0.45947596430778503,
"learning_rate": 9.992886717913358e-06,
"loss": 1.394,
"step": 19
},
{
"epoch": 0.07407407407407407,
"grad_norm": 0.49364641308784485,
"learning_rate": 9.991218658821609e-06,
"loss": 1.3043,
"step": 20
},
{
"epoch": 0.07777777777777778,
"grad_norm": 0.47694772481918335,
"learning_rate": 9.989375230525849e-06,
"loss": 1.3287,
"step": 21
},
{
"epoch": 0.08148148148148149,
"grad_norm": 0.5253634452819824,
"learning_rate": 9.987356497795944e-06,
"loss": 1.3046,
"step": 22
},
{
"epoch": 0.08518518518518518,
"grad_norm": 0.5501742362976074,
"learning_rate": 9.985162531561174e-06,
"loss": 1.3499,
"step": 23
},
{
"epoch": 0.08888888888888889,
"grad_norm": 0.5258708000183105,
"learning_rate": 9.982793408907747e-06,
"loss": 1.2779,
"step": 24
},
{
"epoch": 0.09259259259259259,
"grad_norm": 0.4966470003128052,
"learning_rate": 9.980249213076085e-06,
"loss": 1.2702,
"step": 25
},
{
"epoch": 0.0962962962962963,
"grad_norm": 0.4991610050201416,
"learning_rate": 9.977530033457906e-06,
"loss": 1.3286,
"step": 26
},
{
"epoch": 0.1,
"grad_norm": 0.5212219953536987,
"learning_rate": 9.97463596559307e-06,
"loss": 1.2978,
"step": 27
},
{
"epoch": 0.1037037037037037,
"grad_norm": 0.4977610409259796,
"learning_rate": 9.971567111166246e-06,
"loss": 1.3247,
"step": 28
},
{
"epoch": 0.10740740740740741,
"grad_norm": 0.5000190734863281,
"learning_rate": 9.968323578003312e-06,
"loss": 1.3017,
"step": 29
},
{
"epoch": 0.1111111111111111,
"grad_norm": 0.476797878742218,
"learning_rate": 9.964905480067585e-06,
"loss": 1.2287,
"step": 30
},
{
"epoch": 0.11481481481481481,
"grad_norm": 0.5062195062637329,
"learning_rate": 9.961312937455812e-06,
"loss": 1.2521,
"step": 31
},
{
"epoch": 0.11851851851851852,
"grad_norm": 0.5346536636352539,
"learning_rate": 9.957546076393944e-06,
"loss": 1.2907,
"step": 32
},
{
"epoch": 0.12222222222222222,
"grad_norm": 0.5018014311790466,
"learning_rate": 9.95360502923271e-06,
"loss": 1.273,
"step": 33
},
{
"epoch": 0.1259259259259259,
"grad_norm": 0.4412826895713806,
"learning_rate": 9.949489934442966e-06,
"loss": 1.202,
"step": 34
},
{
"epoch": 0.12962962962962962,
"grad_norm": 0.47726863622665405,
"learning_rate": 9.945200936610821e-06,
"loss": 1.1432,
"step": 35
},
{
"epoch": 0.13333333333333333,
"grad_norm": 0.4887215197086334,
"learning_rate": 9.940738186432565e-06,
"loss": 1.1524,
"step": 36
},
{
"epoch": 0.13703703703703704,
"grad_norm": 0.4492252469062805,
"learning_rate": 9.936101840709373e-06,
"loss": 1.1903,
"step": 37
},
{
"epoch": 0.14074074074074075,
"grad_norm": 0.43920594453811646,
"learning_rate": 9.931292062341793e-06,
"loss": 1.1942,
"step": 38
},
{
"epoch": 0.14444444444444443,
"grad_norm": 0.44488102197647095,
"learning_rate": 9.926309020324025e-06,
"loss": 1.1919,
"step": 39
},
{
"epoch": 0.14814814814814814,
"grad_norm": 0.5044857263565063,
"learning_rate": 9.921152889737985e-06,
"loss": 1.1351,
"step": 40
},
{
"epoch": 0.15185185185185185,
"grad_norm": 0.45221227407455444,
"learning_rate": 9.915823851747143e-06,
"loss": 1.1624,
"step": 41
},
{
"epoch": 0.15555555555555556,
"grad_norm": 0.5037719011306763,
"learning_rate": 9.910322093590177e-06,
"loss": 1.1718,
"step": 42
},
{
"epoch": 0.15925925925925927,
"grad_norm": 0.44602254033088684,
"learning_rate": 9.90464780857437e-06,
"loss": 1.1546,
"step": 43
},
{
"epoch": 0.16296296296296298,
"grad_norm": 0.44312745332717896,
"learning_rate": 9.898801196068839e-06,
"loss": 1.2048,
"step": 44
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.5689204931259155,
"learning_rate": 9.892782461497521e-06,
"loss": 1.2042,
"step": 45
},
{
"epoch": 0.17037037037037037,
"grad_norm": 0.47574153542518616,
"learning_rate": 9.886591816331953e-06,
"loss": 1.072,
"step": 46
},
{
"epoch": 0.17407407407407408,
"grad_norm": 0.5947781801223755,
"learning_rate": 9.880229478083849e-06,
"loss": 1.1788,
"step": 47
},
{
"epoch": 0.17777777777777778,
"grad_norm": 0.45822006464004517,
"learning_rate": 9.87369567029745e-06,
"loss": 1.1901,
"step": 48
},
{
"epoch": 0.1814814814814815,
"grad_norm": 0.4415622055530548,
"learning_rate": 9.866990622541677e-06,
"loss": 1.1071,
"step": 49
},
{
"epoch": 0.18518518518518517,
"grad_norm": 0.49463754892349243,
"learning_rate": 9.860114570402055e-06,
"loss": 1.1492,
"step": 50
},
{
"epoch": 0.18888888888888888,
"grad_norm": 0.5251724720001221,
"learning_rate": 9.853067755472447e-06,
"loss": 1.102,
"step": 51
},
{
"epoch": 0.1925925925925926,
"grad_norm": 0.4823416471481323,
"learning_rate": 9.845850425346563e-06,
"loss": 1.1561,
"step": 52
},
{
"epoch": 0.1962962962962963,
"grad_norm": 0.5142261385917664,
"learning_rate": 9.838462833609249e-06,
"loss": 1.2041,
"step": 53
},
{
"epoch": 0.2,
"grad_norm": 0.5137107372283936,
"learning_rate": 9.830905239827592e-06,
"loss": 1.0813,
"step": 54
},
{
"epoch": 0.2037037037037037,
"grad_norm": 0.41644176840782166,
"learning_rate": 9.823177909541795e-06,
"loss": 1.0974,
"step": 55
},
{
"epoch": 0.2074074074074074,
"grad_norm": 0.40043726563453674,
"learning_rate": 9.815281114255841e-06,
"loss": 1.1076,
"step": 56
},
{
"epoch": 0.2111111111111111,
"grad_norm": 0.43805867433547974,
"learning_rate": 9.807215131427966e-06,
"loss": 1.0959,
"step": 57
},
{
"epoch": 0.21481481481481482,
"grad_norm": 0.5732157230377197,
"learning_rate": 9.798980244460892e-06,
"loss": 1.0742,
"step": 58
},
{
"epoch": 0.21851851851851853,
"grad_norm": 0.44811880588531494,
"learning_rate": 9.790576742691895e-06,
"loss": 1.0058,
"step": 59
},
{
"epoch": 0.2222222222222222,
"grad_norm": 0.44900447130203247,
"learning_rate": 9.782004921382612e-06,
"loss": 1.0982,
"step": 60
},
{
"epoch": 0.22592592592592592,
"grad_norm": 0.521683394908905,
"learning_rate": 9.773265081708687e-06,
"loss": 1.1294,
"step": 61
},
{
"epoch": 0.22962962962962963,
"grad_norm": 0.48734819889068604,
"learning_rate": 9.764357530749178e-06,
"loss": 1.0575,
"step": 62
},
{
"epoch": 0.23333333333333334,
"grad_norm": 0.47888699173927307,
"learning_rate": 9.755282581475769e-06,
"loss": 1.0333,
"step": 63
},
{
"epoch": 0.23703703703703705,
"grad_norm": 0.45292389392852783,
"learning_rate": 9.74604055274178e-06,
"loss": 1.0786,
"step": 64
},
{
"epoch": 0.24074074074074073,
"grad_norm": 0.46524283289909363,
"learning_rate": 9.736631769270958e-06,
"loss": 1.0708,
"step": 65
},
{
"epoch": 0.24444444444444444,
"grad_norm": 0.4456775486469269,
"learning_rate": 9.727056561646067e-06,
"loss": 1.0512,
"step": 66
},
{
"epoch": 0.24814814814814815,
"grad_norm": 0.461698055267334,
"learning_rate": 9.717315266297277e-06,
"loss": 1.1535,
"step": 67
},
{
"epoch": 0.2518518518518518,
"grad_norm": 0.5552849173545837,
"learning_rate": 9.707408225490343e-06,
"loss": 1.1064,
"step": 68
},
{
"epoch": 0.2518518518518518,
"eval_loss": 1.065529227256775,
"eval_runtime": 80.9702,
"eval_samples_per_second": 3.594,
"eval_steps_per_second": 0.457,
"step": 68
},
{
"epoch": 0.25555555555555554,
"grad_norm": 0.541875422000885,
"learning_rate": 9.697335787314573e-06,
"loss": 1.0527,
"step": 69
},
{
"epoch": 0.25925925925925924,
"grad_norm": 0.4617699384689331,
"learning_rate": 9.687098305670606e-06,
"loss": 1.0456,
"step": 70
},
{
"epoch": 0.26296296296296295,
"grad_norm": 0.4448198080062866,
"learning_rate": 9.676696140257969e-06,
"loss": 1.0364,
"step": 71
},
{
"epoch": 0.26666666666666666,
"grad_norm": 0.3684210479259491,
"learning_rate": 9.66612965656245e-06,
"loss": 1.1162,
"step": 72
},
{
"epoch": 0.27037037037037037,
"grad_norm": 0.6021161079406738,
"learning_rate": 9.655399225843244e-06,
"loss": 1.0799,
"step": 73
},
{
"epoch": 0.2740740740740741,
"grad_norm": 0.575809895992279,
"learning_rate": 9.644505225119922e-06,
"loss": 1.0222,
"step": 74
},
{
"epoch": 0.2777777777777778,
"grad_norm": 0.5453614592552185,
"learning_rate": 9.633448037159167e-06,
"loss": 1.0339,
"step": 75
},
{
"epoch": 0.2814814814814815,
"grad_norm": 0.5681980848312378,
"learning_rate": 9.622228050461345e-06,
"loss": 1.0622,
"step": 76
},
{
"epoch": 0.2851851851851852,
"grad_norm": 0.4109339714050293,
"learning_rate": 9.610845659246833e-06,
"loss": 1.0395,
"step": 77
},
{
"epoch": 0.28888888888888886,
"grad_norm": 0.4249359667301178,
"learning_rate": 9.599301263442194e-06,
"loss": 1.0346,
"step": 78
},
{
"epoch": 0.29259259259259257,
"grad_norm": 0.5109196901321411,
"learning_rate": 9.587595268666099e-06,
"loss": 1.0834,
"step": 79
},
{
"epoch": 0.2962962962962963,
"grad_norm": 0.512137770652771,
"learning_rate": 9.575728086215093e-06,
"loss": 1.0438,
"step": 80
},
{
"epoch": 0.3,
"grad_norm": 0.5844932198524475,
"learning_rate": 9.56370013304914e-06,
"loss": 0.9966,
"step": 81
},
{
"epoch": 0.3037037037037037,
"grad_norm": 0.4886794984340668,
"learning_rate": 9.551511831776966e-06,
"loss": 1.0461,
"step": 82
},
{
"epoch": 0.3074074074074074,
"grad_norm": 0.4917876124382019,
"learning_rate": 9.53916361064122e-06,
"loss": 1.0121,
"step": 83
},
{
"epoch": 0.3111111111111111,
"grad_norm": 0.48174771666526794,
"learning_rate": 9.526655903503423e-06,
"loss": 1.0579,
"step": 84
},
{
"epoch": 0.3148148148148148,
"grad_norm": 0.5147380232810974,
"learning_rate": 9.513989149828718e-06,
"loss": 1.0065,
"step": 85
},
{
"epoch": 0.31851851851851853,
"grad_norm": 0.4484403431415558,
"learning_rate": 9.501163794670445e-06,
"loss": 1.0089,
"step": 86
},
{
"epoch": 0.32222222222222224,
"grad_norm": 0.45849668979644775,
"learning_rate": 9.488180288654485e-06,
"loss": 1.0262,
"step": 87
},
{
"epoch": 0.32592592592592595,
"grad_norm": 0.571622908115387,
"learning_rate": 9.475039087963443e-06,
"loss": 1.0129,
"step": 88
},
{
"epoch": 0.3296296296296296,
"grad_norm": 0.5279180407524109,
"learning_rate": 9.461740654320608e-06,
"loss": 1.03,
"step": 89
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.47328171133995056,
"learning_rate": 9.448285454973739e-06,
"loss": 0.9805,
"step": 90
},
{
"epoch": 0.337037037037037,
"grad_norm": 0.4972725212574005,
"learning_rate": 9.434673962678638e-06,
"loss": 0.976,
"step": 91
},
{
"epoch": 0.34074074074074073,
"grad_norm": 0.5977814793586731,
"learning_rate": 9.420906655682553e-06,
"loss": 0.989,
"step": 92
},
{
"epoch": 0.34444444444444444,
"grad_norm": 0.5420663356781006,
"learning_rate": 9.40698401770736e-06,
"loss": 1.0225,
"step": 93
},
{
"epoch": 0.34814814814814815,
"grad_norm": 0.410198450088501,
"learning_rate": 9.392906537932582e-06,
"loss": 1.0393,
"step": 94
},
{
"epoch": 0.35185185185185186,
"grad_norm": 0.5001354217529297,
"learning_rate": 9.378674710978185e-06,
"loss": 0.9712,
"step": 95
},
{
"epoch": 0.35555555555555557,
"grad_norm": 0.5929519534111023,
"learning_rate": 9.364289036887214e-06,
"loss": 1.0759,
"step": 96
},
{
"epoch": 0.3592592592592593,
"grad_norm": 0.5323709845542908,
"learning_rate": 9.349750021108212e-06,
"loss": 1.0619,
"step": 97
},
{
"epoch": 0.362962962962963,
"grad_norm": 0.5360124707221985,
"learning_rate": 9.335058174477472e-06,
"loss": 0.9957,
"step": 98
},
{
"epoch": 0.36666666666666664,
"grad_norm": 0.5704509019851685,
"learning_rate": 9.320214013201079e-06,
"loss": 1.0591,
"step": 99
},
{
"epoch": 0.37037037037037035,
"grad_norm": 0.4351862967014313,
"learning_rate": 9.305218058836778e-06,
"loss": 1.014,
"step": 100
},
{
"epoch": 0.37407407407407406,
"grad_norm": 0.48397883772850037,
"learning_rate": 9.290070838275649e-06,
"loss": 1.0094,
"step": 101
},
{
"epoch": 0.37777777777777777,
"grad_norm": 0.5487049221992493,
"learning_rate": 9.274772883723587e-06,
"loss": 0.9604,
"step": 102
},
{
"epoch": 0.3814814814814815,
"grad_norm": 0.4735201895236969,
"learning_rate": 9.259324732682615e-06,
"loss": 0.9577,
"step": 103
},
{
"epoch": 0.3851851851851852,
"grad_norm": 0.5162625312805176,
"learning_rate": 9.24372692793199e-06,
"loss": 1.0095,
"step": 104
},
{
"epoch": 0.3888888888888889,
"grad_norm": 0.4944085478782654,
"learning_rate": 9.22798001750913e-06,
"loss": 1.0086,
"step": 105
},
{
"epoch": 0.3925925925925926,
"grad_norm": 0.5985198616981506,
"learning_rate": 9.21208455469037e-06,
"loss": 0.9878,
"step": 106
},
{
"epoch": 0.3962962962962963,
"grad_norm": 0.6551868915557861,
"learning_rate": 9.196041097971509e-06,
"loss": 1.0079,
"step": 107
},
{
"epoch": 0.4,
"grad_norm": 0.4953964352607727,
"learning_rate": 9.179850211048193e-06,
"loss": 1.0403,
"step": 108
},
{
"epoch": 0.40370370370370373,
"grad_norm": 0.46935591101646423,
"learning_rate": 9.163512462796113e-06,
"loss": 1.0443,
"step": 109
},
{
"epoch": 0.4074074074074074,
"grad_norm": 0.48214173316955566,
"learning_rate": 9.14702842725101e-06,
"loss": 0.9952,
"step": 110
},
{
"epoch": 0.4111111111111111,
"grad_norm": 0.5411708354949951,
"learning_rate": 9.13039868358851e-06,
"loss": 1.0634,
"step": 111
},
{
"epoch": 0.4148148148148148,
"grad_norm": 0.68564373254776,
"learning_rate": 9.113623816103775e-06,
"loss": 0.9307,
"step": 112
},
{
"epoch": 0.4185185185185185,
"grad_norm": 0.536626398563385,
"learning_rate": 9.09670441419097e-06,
"loss": 1.0535,
"step": 113
},
{
"epoch": 0.4222222222222222,
"grad_norm": 0.485929012298584,
"learning_rate": 9.079641072322555e-06,
"loss": 1.0176,
"step": 114
},
{
"epoch": 0.42592592592592593,
"grad_norm": 0.5539782047271729,
"learning_rate": 9.062434390028407e-06,
"loss": 0.9906,
"step": 115
},
{
"epoch": 0.42962962962962964,
"grad_norm": 0.49939635396003723,
"learning_rate": 9.045084971874738e-06,
"loss": 0.9586,
"step": 116
},
{
"epoch": 0.43333333333333335,
"grad_norm": 0.48620209097862244,
"learning_rate": 9.027593427442867e-06,
"loss": 1.0209,
"step": 117
},
{
"epoch": 0.43703703703703706,
"grad_norm": 0.4806266725063324,
"learning_rate": 9.009960371307798e-06,
"loss": 1.0185,
"step": 118
},
{
"epoch": 0.44074074074074077,
"grad_norm": 0.6763521432876587,
"learning_rate": 8.992186423016626e-06,
"loss": 1.0247,
"step": 119
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.5310172438621521,
"learning_rate": 8.974272207066767e-06,
"loss": 1.006,
"step": 120
},
{
"epoch": 0.44814814814814813,
"grad_norm": 0.5065312385559082,
"learning_rate": 8.956218352884022e-06,
"loss": 0.9535,
"step": 121
},
{
"epoch": 0.45185185185185184,
"grad_norm": 0.5911722183227539,
"learning_rate": 8.938025494800454e-06,
"loss": 0.9698,
"step": 122
},
{
"epoch": 0.45555555555555555,
"grad_norm": 0.60561203956604,
"learning_rate": 8.919694272032108e-06,
"loss": 1.0081,
"step": 123
},
{
"epoch": 0.45925925925925926,
"grad_norm": 0.5998137593269348,
"learning_rate": 8.901225328656543e-06,
"loss": 1.0332,
"step": 124
},
{
"epoch": 0.46296296296296297,
"grad_norm": 0.6571759581565857,
"learning_rate": 8.882619313590212e-06,
"loss": 1.0501,
"step": 125
},
{
"epoch": 0.4666666666666667,
"grad_norm": 0.5181518793106079,
"learning_rate": 8.863876880565656e-06,
"loss": 0.9653,
"step": 126
},
{
"epoch": 0.4703703703703704,
"grad_norm": 0.5412523746490479,
"learning_rate": 8.844998688108535e-06,
"loss": 0.999,
"step": 127
},
{
"epoch": 0.4740740740740741,
"grad_norm": 0.5652058124542236,
"learning_rate": 8.825985399514488e-06,
"loss": 0.9647,
"step": 128
},
{
"epoch": 0.4777777777777778,
"grad_norm": 0.52536940574646,
"learning_rate": 8.806837682825835e-06,
"loss": 0.9694,
"step": 129
},
{
"epoch": 0.48148148148148145,
"grad_norm": 0.6217904686927795,
"learning_rate": 8.787556210808101e-06,
"loss": 1.0241,
"step": 130
},
{
"epoch": 0.48518518518518516,
"grad_norm": 0.43509605526924133,
"learning_rate": 8.768141660926375e-06,
"loss": 0.9598,
"step": 131
},
{
"epoch": 0.4888888888888889,
"grad_norm": 0.5001434087753296,
"learning_rate": 8.748594715321512e-06,
"loss": 0.9697,
"step": 132
},
{
"epoch": 0.4925925925925926,
"grad_norm": 0.6269538402557373,
"learning_rate": 8.728916060786162e-06,
"loss": 1.0074,
"step": 133
},
{
"epoch": 0.4962962962962963,
"grad_norm": 0.6777300834655762,
"learning_rate": 8.70910638874064e-06,
"loss": 0.9968,
"step": 134
},
{
"epoch": 0.5,
"grad_norm": 0.5371289849281311,
"learning_rate": 8.689166395208638e-06,
"loss": 0.9684,
"step": 135
},
{
"epoch": 0.5037037037037037,
"grad_norm": 0.6136884093284607,
"learning_rate": 8.669096780792754e-06,
"loss": 1.0297,
"step": 136
},
{
"epoch": 0.5037037037037037,
"eval_loss": 0.9753141403198242,
"eval_runtime": 81.1717,
"eval_samples_per_second": 3.585,
"eval_steps_per_second": 0.456,
"step": 136
},
{
"epoch": 0.5074074074074074,
"grad_norm": 0.5171265602111816,
"learning_rate": 8.6488982506499e-06,
"loss": 0.962,
"step": 137
},
{
"epoch": 0.5111111111111111,
"grad_norm": 0.6454190611839294,
"learning_rate": 8.628571514466502e-06,
"loss": 0.9555,
"step": 138
},
{
"epoch": 0.5148148148148148,
"grad_norm": 0.5578838586807251,
"learning_rate": 8.608117286433583e-06,
"loss": 0.9079,
"step": 139
},
{
"epoch": 0.5185185185185185,
"grad_norm": 0.5714731216430664,
"learning_rate": 8.587536285221656e-06,
"loss": 0.9894,
"step": 140
},
{
"epoch": 0.5222222222222223,
"grad_norm": 0.5244677066802979,
"learning_rate": 8.566829233955484e-06,
"loss": 0.9735,
"step": 141
},
{
"epoch": 0.5259259259259259,
"grad_norm": 0.4161701798439026,
"learning_rate": 8.545996860188668e-06,
"loss": 0.9945,
"step": 142
},
{
"epoch": 0.5296296296296297,
"grad_norm": 0.6657142639160156,
"learning_rate": 8.525039895878078e-06,
"loss": 0.982,
"step": 143
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.7206271886825562,
"learning_rate": 8.503959077358143e-06,
"loss": 0.9977,
"step": 144
},
{
"epoch": 0.5370370370370371,
"grad_norm": 0.7977305054664612,
"learning_rate": 8.482755145314987e-06,
"loss": 0.9605,
"step": 145
},
{
"epoch": 0.5407407407407407,
"grad_norm": 0.8049225211143494,
"learning_rate": 8.46142884476038e-06,
"loss": 0.999,
"step": 146
},
{
"epoch": 0.5444444444444444,
"grad_norm": 0.49984222650527954,
"learning_rate": 8.439980925005587e-06,
"loss": 0.9595,
"step": 147
},
{
"epoch": 0.5481481481481482,
"grad_norm": 0.48655927181243896,
"learning_rate": 8.418412139635026e-06,
"loss": 0.9481,
"step": 148
},
{
"epoch": 0.5518518518518518,
"grad_norm": 0.5527738332748413,
"learning_rate": 8.396723246479798e-06,
"loss": 0.9665,
"step": 149
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.6328939199447632,
"learning_rate": 8.374915007591053e-06,
"loss": 1.0021,
"step": 150
},
{
"epoch": 0.5592592592592592,
"grad_norm": 0.6932883262634277,
"learning_rate": 8.352988189213223e-06,
"loss": 0.9991,
"step": 151
},
{
"epoch": 0.562962962962963,
"grad_norm": 0.5916227698326111,
"learning_rate": 8.330943561757092e-06,
"loss": 0.9661,
"step": 152
},
{
"epoch": 0.5666666666666667,
"grad_norm": 0.471822589635849,
"learning_rate": 8.308781899772731e-06,
"loss": 0.9396,
"step": 153
},
{
"epoch": 0.5703703703703704,
"grad_norm": 0.5403897166252136,
"learning_rate": 8.286503981922284e-06,
"loss": 0.9444,
"step": 154
},
{
"epoch": 0.5740740740740741,
"grad_norm": 0.5560125708580017,
"learning_rate": 8.264110590952609e-06,
"loss": 0.9487,
"step": 155
},
{
"epoch": 0.5777777777777777,
"grad_norm": 0.6282420754432678,
"learning_rate": 8.241602513667775e-06,
"loss": 1.0124,
"step": 156
},
{
"epoch": 0.5814814814814815,
"grad_norm": 0.4911057949066162,
"learning_rate": 8.218980540901417e-06,
"loss": 0.971,
"step": 157
},
{
"epoch": 0.5851851851851851,
"grad_norm": 0.6368396878242493,
"learning_rate": 8.19624546748895e-06,
"loss": 1.0181,
"step": 158
},
{
"epoch": 0.5888888888888889,
"grad_norm": 0.6642744541168213,
"learning_rate": 8.173398092239647e-06,
"loss": 1.0051,
"step": 159
},
{
"epoch": 0.5925925925925926,
"grad_norm": 0.554905116558075,
"learning_rate": 8.150439217908557e-06,
"loss": 0.9329,
"step": 160
},
{
"epoch": 0.5962962962962963,
"grad_norm": 0.5215203762054443,
"learning_rate": 8.12736965116832e-06,
"loss": 0.9506,
"step": 161
},
{
"epoch": 0.6,
"grad_norm": 0.4904837906360626,
"learning_rate": 8.104190202580811e-06,
"loss": 0.9864,
"step": 162
},
{
"epoch": 0.6037037037037037,
"grad_norm": 0.570766806602478,
"learning_rate": 8.080901686568664e-06,
"loss": 0.9379,
"step": 163
},
{
"epoch": 0.6074074074074074,
"grad_norm": 0.687227725982666,
"learning_rate": 8.057504921386661e-06,
"loss": 0.9714,
"step": 164
},
{
"epoch": 0.6111111111111112,
"grad_norm": 0.6017288565635681,
"learning_rate": 8.034000729092967e-06,
"loss": 0.9709,
"step": 165
},
{
"epoch": 0.6148148148148148,
"grad_norm": 0.6062106490135193,
"learning_rate": 8.010389935520269e-06,
"loss": 1.0362,
"step": 166
},
{
"epoch": 0.6185185185185185,
"grad_norm": 0.5548331141471863,
"learning_rate": 7.986673370246743e-06,
"loss": 0.9581,
"step": 167
},
{
"epoch": 0.6222222222222222,
"grad_norm": 0.5252346396446228,
"learning_rate": 7.962851866566912e-06,
"loss": 0.9669,
"step": 168
},
{
"epoch": 0.6259259259259259,
"grad_norm": 0.7005597352981567,
"learning_rate": 7.938926261462366e-06,
"loss": 0.987,
"step": 169
},
{
"epoch": 0.6296296296296297,
"grad_norm": 0.5916934609413147,
"learning_rate": 7.914897395572362e-06,
"loss": 0.9433,
"step": 170
},
{
"epoch": 0.6333333333333333,
"grad_norm": 0.6202555298805237,
"learning_rate": 7.890766113164272e-06,
"loss": 0.9833,
"step": 171
},
{
"epoch": 0.6370370370370371,
"grad_norm": 0.5578716397285461,
"learning_rate": 7.866533262103937e-06,
"loss": 0.9479,
"step": 172
},
{
"epoch": 0.6407407407407407,
"grad_norm": 0.6666351556777954,
"learning_rate": 7.842199693825863e-06,
"loss": 0.9383,
"step": 173
},
{
"epoch": 0.6444444444444445,
"grad_norm": 0.5507566332817078,
"learning_rate": 7.817766263303312e-06,
"loss": 0.9767,
"step": 174
},
{
"epoch": 0.6481481481481481,
"grad_norm": 0.6183774471282959,
"learning_rate": 7.793233829018263e-06,
"loss": 0.9078,
"step": 175
},
{
"epoch": 0.6518518518518519,
"grad_norm": 0.499009370803833,
"learning_rate": 7.768603252931243e-06,
"loss": 0.9563,
"step": 176
},
{
"epoch": 0.6555555555555556,
"grad_norm": 0.629336416721344,
"learning_rate": 7.743875400451047e-06,
"loss": 0.911,
"step": 177
},
{
"epoch": 0.6592592592592592,
"grad_norm": 0.5423790812492371,
"learning_rate": 7.719051140404327e-06,
"loss": 0.9434,
"step": 178
},
{
"epoch": 0.662962962962963,
"grad_norm": 0.6060659289360046,
"learning_rate": 7.69413134500507e-06,
"loss": 0.95,
"step": 179
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.5223778486251831,
"learning_rate": 7.669116889823955e-06,
"loss": 0.9474,
"step": 180
},
{
"epoch": 0.6703703703703704,
"grad_norm": 0.6271294355392456,
"learning_rate": 7.644008653757571e-06,
"loss": 0.9652,
"step": 181
},
{
"epoch": 0.674074074074074,
"grad_norm": 0.5973348617553711,
"learning_rate": 7.6188075189975644e-06,
"loss": 0.9333,
"step": 182
},
{
"epoch": 0.6777777777777778,
"grad_norm": 0.5119736790657043,
"learning_rate": 7.593514370999617e-06,
"loss": 0.9253,
"step": 183
},
{
"epoch": 0.6814814814814815,
"grad_norm": 0.6887508630752563,
"learning_rate": 7.568130098452352e-06,
"loss": 0.9344,
"step": 184
},
{
"epoch": 0.6851851851851852,
"grad_norm": 0.5387381911277771,
"learning_rate": 7.542655593246103e-06,
"loss": 0.9645,
"step": 185
},
{
"epoch": 0.6888888888888889,
"grad_norm": 0.5810338854789734,
"learning_rate": 7.517091750441576e-06,
"loss": 0.9406,
"step": 186
},
{
"epoch": 0.6925925925925925,
"grad_norm": 0.6561952829360962,
"learning_rate": 7.491439468238404e-06,
"loss": 0.9363,
"step": 187
},
{
"epoch": 0.6962962962962963,
"grad_norm": 0.7444878220558167,
"learning_rate": 7.465699647943586e-06,
"loss": 0.945,
"step": 188
},
{
"epoch": 0.7,
"grad_norm": 0.6265509724617004,
"learning_rate": 7.43987319393982e-06,
"loss": 0.9576,
"step": 189
},
{
"epoch": 0.7037037037037037,
"grad_norm": 0.6139175295829773,
"learning_rate": 7.413961013653725e-06,
"loss": 0.9697,
"step": 190
},
{
"epoch": 0.7074074074074074,
"grad_norm": 0.5767727494239807,
"learning_rate": 7.387964017523964e-06,
"loss": 0.9721,
"step": 191
},
{
"epoch": 0.7111111111111111,
"grad_norm": 0.757271945476532,
"learning_rate": 7.361883118969248e-06,
"loss": 1.0013,
"step": 192
},
{
"epoch": 0.7148148148148148,
"grad_norm": 0.6246291995048523,
"learning_rate": 7.335719234356245e-06,
"loss": 0.9418,
"step": 193
},
{
"epoch": 0.7185185185185186,
"grad_norm": 0.4833630621433258,
"learning_rate": 7.309473282967387e-06,
"loss": 0.9435,
"step": 194
},
{
"epoch": 0.7222222222222222,
"grad_norm": 0.5289487242698669,
"learning_rate": 7.283146186968566e-06,
"loss": 0.9617,
"step": 195
},
{
"epoch": 0.725925925925926,
"grad_norm": 0.6008256673812866,
"learning_rate": 7.256738871376733e-06,
"loss": 0.8983,
"step": 196
},
{
"epoch": 0.7296296296296296,
"grad_norm": 0.5227617621421814,
"learning_rate": 7.230252264027398e-06,
"loss": 0.8768,
"step": 197
},
{
"epoch": 0.7333333333333333,
"grad_norm": 0.6785119771957397,
"learning_rate": 7.203687295542032e-06,
"loss": 0.9393,
"step": 198
},
{
"epoch": 0.737037037037037,
"grad_norm": 0.6053286790847778,
"learning_rate": 7.1770448992953676e-06,
"loss": 0.9125,
"step": 199
},
{
"epoch": 0.7407407407407407,
"grad_norm": 0.7238445281982422,
"learning_rate": 7.1503260113826035e-06,
"loss": 0.9305,
"step": 200
},
{
"epoch": 0.7444444444444445,
"grad_norm": 0.6719542741775513,
"learning_rate": 7.123531570586515e-06,
"loss": 0.9643,
"step": 201
},
{
"epoch": 0.7481481481481481,
"grad_norm": 0.5546441674232483,
"learning_rate": 7.09666251834447e-06,
"loss": 0.9663,
"step": 202
},
{
"epoch": 0.7518518518518519,
"grad_norm": 0.5350282192230225,
"learning_rate": 7.069719798715347e-06,
"loss": 0.9041,
"step": 203
},
{
"epoch": 0.7555555555555555,
"grad_norm": 0.5801582932472229,
"learning_rate": 7.042704358346375e-06,
"loss": 0.9444,
"step": 204
},
{
"epoch": 0.7555555555555555,
"eval_loss": 0.9426867961883545,
"eval_runtime": 81.1055,
"eval_samples_per_second": 3.588,
"eval_steps_per_second": 0.456,
"step": 204
},
{
"epoch": 0.7592592592592593,
"grad_norm": 0.7228114008903503,
"learning_rate": 7.015617146439863e-06,
"loss": 0.931,
"step": 205
},
{
"epoch": 0.762962962962963,
"grad_norm": 0.5295515656471252,
"learning_rate": 6.988459114719849e-06,
"loss": 0.9457,
"step": 206
},
{
"epoch": 0.7666666666666667,
"grad_norm": 0.5533620119094849,
"learning_rate": 6.9612312173986675e-06,
"loss": 0.9407,
"step": 207
},
{
"epoch": 0.7703703703703704,
"grad_norm": 0.6508337259292603,
"learning_rate": 6.933934411143419e-06,
"loss": 0.9176,
"step": 208
},
{
"epoch": 0.774074074074074,
"grad_norm": 0.644389808177948,
"learning_rate": 6.906569655042357e-06,
"loss": 0.9796,
"step": 209
},
{
"epoch": 0.7777777777777778,
"grad_norm": 0.5943438410758972,
"learning_rate": 6.879137910571191e-06,
"loss": 0.9508,
"step": 210
},
{
"epoch": 0.7814814814814814,
"grad_norm": 0.5512163639068604,
"learning_rate": 6.8516401415593005e-06,
"loss": 0.9066,
"step": 211
},
{
"epoch": 0.7851851851851852,
"grad_norm": 0.5512770414352417,
"learning_rate": 6.824077314155877e-06,
"loss": 0.9169,
"step": 212
},
{
"epoch": 0.7888888888888889,
"grad_norm": 0.7245272397994995,
"learning_rate": 6.7964503967959705e-06,
"loss": 0.9563,
"step": 213
},
{
"epoch": 0.7925925925925926,
"grad_norm": 0.704143762588501,
"learning_rate": 6.768760360166471e-06,
"loss": 0.9662,
"step": 214
},
{
"epoch": 0.7962962962962963,
"grad_norm": 0.5439050197601318,
"learning_rate": 6.741008177171995e-06,
"loss": 0.9609,
"step": 215
},
{
"epoch": 0.8,
"grad_norm": 0.6104442477226257,
"learning_rate": 6.713194822900707e-06,
"loss": 0.9313,
"step": 216
},
{
"epoch": 0.8037037037037037,
"grad_norm": 0.7294436693191528,
"learning_rate": 6.6853212745900585e-06,
"loss": 0.933,
"step": 217
},
{
"epoch": 0.8074074074074075,
"grad_norm": 0.5400619506835938,
"learning_rate": 6.657388511592453e-06,
"loss": 0.9367,
"step": 218
},
{
"epoch": 0.8111111111111111,
"grad_norm": 0.8623405694961548,
"learning_rate": 6.62939751534083e-06,
"loss": 0.9719,
"step": 219
},
{
"epoch": 0.8148148148148148,
"grad_norm": 0.6787410378456116,
"learning_rate": 6.601349269314188e-06,
"loss": 0.9882,
"step": 220
},
{
"epoch": 0.8185185185185185,
"grad_norm": 0.6689869165420532,
"learning_rate": 6.573244759003033e-06,
"loss": 0.9445,
"step": 221
},
{
"epoch": 0.8222222222222222,
"grad_norm": 0.7502297759056091,
"learning_rate": 6.545084971874738e-06,
"loss": 0.9276,
"step": 222
},
{
"epoch": 0.825925925925926,
"grad_norm": 2.460090160369873,
"learning_rate": 6.516870897338864e-06,
"loss": 0.9684,
"step": 223
},
{
"epoch": 0.8296296296296296,
"grad_norm": 0.8110550045967102,
"learning_rate": 6.488603526712391e-06,
"loss": 0.9212,
"step": 224
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.5253615975379944,
"learning_rate": 6.46028385318488e-06,
"loss": 0.9385,
"step": 225
},
{
"epoch": 0.837037037037037,
"grad_norm": 0.5551905632019043,
"learning_rate": 6.431912871783587e-06,
"loss": 0.9331,
"step": 226
},
{
"epoch": 0.8407407407407408,
"grad_norm": 0.6484084129333496,
"learning_rate": 6.4034915793385e-06,
"loss": 0.9936,
"step": 227
},
{
"epoch": 0.8444444444444444,
"grad_norm": 0.6521108746528625,
"learning_rate": 6.3750209744473105e-06,
"loss": 0.974,
"step": 228
},
{
"epoch": 0.8481481481481481,
"grad_norm": 0.7478381395339966,
"learning_rate": 6.346502057440327e-06,
"loss": 0.9569,
"step": 229
},
{
"epoch": 0.8518518518518519,
"grad_norm": 0.6053647398948669,
"learning_rate": 6.3179358303453386e-06,
"loss": 0.8928,
"step": 230
},
{
"epoch": 0.8555555555555555,
"grad_norm": 0.7461119890213013,
"learning_rate": 6.289323296852393e-06,
"loss": 0.9121,
"step": 231
},
{
"epoch": 0.8592592592592593,
"grad_norm": 0.6154372692108154,
"learning_rate": 6.260665462278544e-06,
"loss": 0.952,
"step": 232
},
{
"epoch": 0.8629629629629629,
"grad_norm": 0.710970938205719,
"learning_rate": 6.231963333532516e-06,
"loss": 0.9365,
"step": 233
},
{
"epoch": 0.8666666666666667,
"grad_norm": 0.6357712149620056,
"learning_rate": 6.203217919079343e-06,
"loss": 0.8836,
"step": 234
},
{
"epoch": 0.8703703703703703,
"grad_norm": 0.6976805329322815,
"learning_rate": 6.17443022890492e-06,
"loss": 0.9757,
"step": 235
},
{
"epoch": 0.8740740740740741,
"grad_norm": 0.6872934699058533,
"learning_rate": 6.145601274480521e-06,
"loss": 0.9814,
"step": 236
},
{
"epoch": 0.8777777777777778,
"grad_norm": 0.7947030663490295,
"learning_rate": 6.116732068727271e-06,
"loss": 0.9016,
"step": 237
},
{
"epoch": 0.8814814814814815,
"grad_norm": 0.63334721326828,
"learning_rate": 6.08782362598054e-06,
"loss": 0.9679,
"step": 238
},
{
"epoch": 0.8851851851851852,
"grad_norm": 0.5451921820640564,
"learning_rate": 6.058876961954308e-06,
"loss": 0.9511,
"step": 239
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.5797951221466064,
"learning_rate": 6.029893093705492e-06,
"loss": 0.9553,
"step": 240
},
{
"epoch": 0.8925925925925926,
"grad_norm": 0.5836870074272156,
"learning_rate": 6.0008730395981905e-06,
"loss": 0.9562,
"step": 241
},
{
"epoch": 0.8962962962962963,
"grad_norm": 0.6153254508972168,
"learning_rate": 5.971817819267914e-06,
"loss": 0.9199,
"step": 242
},
{
"epoch": 0.9,
"grad_norm": 0.6756653785705566,
"learning_rate": 5.9427284535857585e-06,
"loss": 0.9599,
"step": 243
},
{
"epoch": 0.9037037037037037,
"grad_norm": 0.6547468304634094,
"learning_rate": 5.9136059646225375e-06,
"loss": 0.9485,
"step": 244
},
{
"epoch": 0.9074074074074074,
"grad_norm": 0.7384520769119263,
"learning_rate": 5.884451375612865e-06,
"loss": 0.927,
"step": 245
},
{
"epoch": 0.9111111111111111,
"grad_norm": 0.6480386853218079,
"learning_rate": 5.855265710919211e-06,
"loss": 1.0039,
"step": 246
},
{
"epoch": 0.9148148148148149,
"grad_norm": 0.5494263768196106,
"learning_rate": 5.826049995995905e-06,
"loss": 0.9706,
"step": 247
},
{
"epoch": 0.9185185185185185,
"grad_norm": 0.5438244342803955,
"learning_rate": 5.796805257353109e-06,
"loss": 0.963,
"step": 248
},
{
"epoch": 0.9222222222222223,
"grad_norm": 0.6168299317359924,
"learning_rate": 5.767532522520746e-06,
"loss": 0.9594,
"step": 249
},
{
"epoch": 0.9259259259259259,
"grad_norm": 0.6753399968147278,
"learning_rate": 5.738232820012407e-06,
"loss": 0.9181,
"step": 250
},
{
"epoch": 0.9296296296296296,
"grad_norm": 0.5123042464256287,
"learning_rate": 5.7089071792892e-06,
"loss": 0.9216,
"step": 251
},
{
"epoch": 0.9333333333333333,
"grad_norm": 0.7598656415939331,
"learning_rate": 5.679556630723592e-06,
"loss": 0.9725,
"step": 252
},
{
"epoch": 0.937037037037037,
"grad_norm": 0.6306942701339722,
"learning_rate": 5.6501822055631976e-06,
"loss": 0.9041,
"step": 253
},
{
"epoch": 0.9407407407407408,
"grad_norm": 0.7515453696250916,
"learning_rate": 5.620784935894548e-06,
"loss": 0.9192,
"step": 254
},
{
"epoch": 0.9444444444444444,
"grad_norm": 0.6113058924674988,
"learning_rate": 5.591365854606829e-06,
"loss": 0.949,
"step": 255
},
{
"epoch": 0.9481481481481482,
"grad_norm": 0.6589618921279907,
"learning_rate": 5.561925995355595e-06,
"loss": 0.9384,
"step": 256
},
{
"epoch": 0.9518518518518518,
"grad_norm": 0.7518366575241089,
"learning_rate": 5.532466392526439e-06,
"loss": 0.8959,
"step": 257
},
{
"epoch": 0.9555555555555556,
"grad_norm": 0.5112090110778809,
"learning_rate": 5.5029880811986546e-06,
"loss": 0.9214,
"step": 258
},
{
"epoch": 0.9592592592592593,
"grad_norm": 0.6436278820037842,
"learning_rate": 5.4734920971088766e-06,
"loss": 0.9165,
"step": 259
},
{
"epoch": 0.9629629629629629,
"grad_norm": 0.685821533203125,
"learning_rate": 5.443979476614674e-06,
"loss": 0.9114,
"step": 260
},
{
"epoch": 0.9666666666666667,
"grad_norm": 0.5555897951126099,
"learning_rate": 5.4144512566581495e-06,
"loss": 0.9791,
"step": 261
},
{
"epoch": 0.9703703703703703,
"grad_norm": 0.6167283058166504,
"learning_rate": 5.384908474729501e-06,
"loss": 0.9029,
"step": 262
},
{
"epoch": 0.9740740740740741,
"grad_norm": 0.6644378304481506,
"learning_rate": 5.3553521688305655e-06,
"loss": 0.9659,
"step": 263
},
{
"epoch": 0.9777777777777777,
"grad_norm": 0.6106395721435547,
"learning_rate": 5.325783377438357e-06,
"loss": 0.9161,
"step": 264
},
{
"epoch": 0.9814814814814815,
"grad_norm": 0.6115413904190063,
"learning_rate": 5.296203139468572e-06,
"loss": 0.8719,
"step": 265
},
{
"epoch": 0.9851851851851852,
"grad_norm": 0.8100462555885315,
"learning_rate": 5.266612494239088e-06,
"loss": 0.9013,
"step": 266
},
{
"epoch": 0.9888888888888889,
"grad_norm": 0.7386695742607117,
"learning_rate": 5.23701248143345e-06,
"loss": 0.9151,
"step": 267
},
{
"epoch": 0.9925925925925926,
"grad_norm": 0.5981118679046631,
"learning_rate": 5.207404141064334e-06,
"loss": 0.9077,
"step": 268
},
{
"epoch": 0.9962962962962963,
"grad_norm": 0.5839532613754272,
"learning_rate": 5.177788513437013e-06,
"loss": 0.9564,
"step": 269
},
{
"epoch": 1.0,
"grad_norm": 0.4894520938396454,
"learning_rate": 5.148166639112799e-06,
"loss": 0.9273,
"step": 270
},
{
"epoch": 1.0037037037037038,
"grad_norm": 0.6211138963699341,
"learning_rate": 5.118539558872489e-06,
"loss": 0.9478,
"step": 271
},
{
"epoch": 1.0074074074074073,
"grad_norm": 0.7439696192741394,
"learning_rate": 5.088908313679788e-06,
"loss": 0.9341,
"step": 272
},
{
"epoch": 1.0074074074074073,
"eval_loss": 0.9261184930801392,
"eval_runtime": 80.9898,
"eval_samples_per_second": 3.593,
"eval_steps_per_second": 0.457,
"step": 272
},
{
"epoch": 1.011111111111111,
"grad_norm": 0.6589562296867371,
"learning_rate": 5.059273944644742e-06,
"loss": 0.9316,
"step": 273
},
{
"epoch": 1.0148148148148148,
"grad_norm": 0.5672058463096619,
"learning_rate": 5.029637492987153e-06,
"loss": 0.9235,
"step": 274
},
{
"epoch": 1.0185185185185186,
"grad_norm": 0.6068680882453918,
"learning_rate": 5e-06,
"loss": 0.9136,
"step": 275
},
{
"epoch": 1.0037037037037038,
"grad_norm": 0.7259117960929871,
"learning_rate": 4.970362507012848e-06,
"loss": 0.8627,
"step": 276
},
{
"epoch": 1.0074074074074073,
"grad_norm": 0.665239691734314,
"learning_rate": 4.940726055355259e-06,
"loss": 0.9385,
"step": 277
},
{
"epoch": 1.011111111111111,
"grad_norm": 0.71152263879776,
"learning_rate": 4.911091686320213e-06,
"loss": 0.9532,
"step": 278
},
{
"epoch": 1.0148148148148148,
"grad_norm": 0.7714909911155701,
"learning_rate": 4.881460441127513e-06,
"loss": 0.8689,
"step": 279
},
{
"epoch": 1.0185185185185186,
"grad_norm": 0.6783362030982971,
"learning_rate": 4.8518333608872015e-06,
"loss": 0.948,
"step": 280
},
{
"epoch": 1.0222222222222221,
"grad_norm": 0.5598512291908264,
"learning_rate": 4.822211486562989e-06,
"loss": 0.953,
"step": 281
},
{
"epoch": 1.025925925925926,
"grad_norm": 0.7532334327697754,
"learning_rate": 4.792595858935668e-06,
"loss": 0.9703,
"step": 282
},
{
"epoch": 1.0296296296296297,
"grad_norm": 0.7283293604850769,
"learning_rate": 4.7629875185665505e-06,
"loss": 0.9526,
"step": 283
},
{
"epoch": 1.0333333333333334,
"grad_norm": 0.6575984358787537,
"learning_rate": 4.733387505760913e-06,
"loss": 0.9042,
"step": 284
},
{
"epoch": 1.037037037037037,
"grad_norm": 0.5753719210624695,
"learning_rate": 4.703796860531429e-06,
"loss": 0.9009,
"step": 285
},
{
"epoch": 1.0407407407407407,
"grad_norm": 0.7370662689208984,
"learning_rate": 4.674216622561645e-06,
"loss": 0.8645,
"step": 286
},
{
"epoch": 1.0444444444444445,
"grad_norm": 0.602418839931488,
"learning_rate": 4.644647831169435e-06,
"loss": 0.9141,
"step": 287
},
{
"epoch": 1.048148148148148,
"grad_norm": 0.7609613537788391,
"learning_rate": 4.6150915252705005e-06,
"loss": 0.8668,
"step": 288
},
{
"epoch": 1.0518518518518518,
"grad_norm": 0.8010672330856323,
"learning_rate": 4.585548743341851e-06,
"loss": 0.9242,
"step": 289
},
{
"epoch": 1.0555555555555556,
"grad_norm": 0.6908420324325562,
"learning_rate": 4.556020523385326e-06,
"loss": 0.9566,
"step": 290
},
{
"epoch": 1.0592592592592593,
"grad_norm": 0.7219347357749939,
"learning_rate": 4.526507902891124e-06,
"loss": 0.8987,
"step": 291
},
{
"epoch": 1.0629629629629629,
"grad_norm": 0.5726153254508972,
"learning_rate": 4.497011918801347e-06,
"loss": 0.9259,
"step": 292
},
{
"epoch": 1.0666666666666667,
"grad_norm": 0.7002944350242615,
"learning_rate": 4.467533607473563e-06,
"loss": 0.9171,
"step": 293
},
{
"epoch": 1.0703703703703704,
"grad_norm": 0.7401637434959412,
"learning_rate": 4.438074004644407e-06,
"loss": 0.9147,
"step": 294
},
{
"epoch": 1.074074074074074,
"grad_norm": 0.7317702770233154,
"learning_rate": 4.408634145393172e-06,
"loss": 0.8777,
"step": 295
},
{
"epoch": 1.0777777777777777,
"grad_norm": 0.586495041847229,
"learning_rate": 4.379215064105454e-06,
"loss": 0.8734,
"step": 296
},
{
"epoch": 1.0814814814814815,
"grad_norm": 0.7603331804275513,
"learning_rate": 4.349817794436805e-06,
"loss": 0.9757,
"step": 297
},
{
"epoch": 1.0851851851851853,
"grad_norm": 0.7039903402328491,
"learning_rate": 4.32044336927641e-06,
"loss": 0.9117,
"step": 298
},
{
"epoch": 1.0888888888888888,
"grad_norm": 0.7265645265579224,
"learning_rate": 4.2910928207108005e-06,
"loss": 0.9547,
"step": 299
},
{
"epoch": 1.0925925925925926,
"grad_norm": 0.5854629278182983,
"learning_rate": 4.261767179987595e-06,
"loss": 0.9309,
"step": 300
},
{
"epoch": 1.0962962962962963,
"grad_norm": 0.7084276676177979,
"learning_rate": 4.232467477479255e-06,
"loss": 0.9414,
"step": 301
},
{
"epoch": 1.1,
"grad_norm": 0.7032147645950317,
"learning_rate": 4.203194742646893e-06,
"loss": 0.846,
"step": 302
},
{
"epoch": 1.1037037037037036,
"grad_norm": 0.7182865142822266,
"learning_rate": 4.173950004004097e-06,
"loss": 0.9737,
"step": 303
},
{
"epoch": 1.1074074074074074,
"grad_norm": 0.6024776697158813,
"learning_rate": 4.1447342890807905e-06,
"loss": 0.8605,
"step": 304
},
{
"epoch": 1.1111111111111112,
"grad_norm": 0.717693567276001,
"learning_rate": 4.115548624387136e-06,
"loss": 0.8731,
"step": 305
},
{
"epoch": 1.1148148148148147,
"grad_norm": 0.8089867830276489,
"learning_rate": 4.086394035377463e-06,
"loss": 0.9019,
"step": 306
},
{
"epoch": 1.1185185185185185,
"grad_norm": 0.5785974860191345,
"learning_rate": 4.057271546414242e-06,
"loss": 0.9574,
"step": 307
},
{
"epoch": 1.1222222222222222,
"grad_norm": 0.7001700401306152,
"learning_rate": 4.028182180732088e-06,
"loss": 0.8993,
"step": 308
},
{
"epoch": 1.125925925925926,
"grad_norm": 0.7361912131309509,
"learning_rate": 3.99912696040181e-06,
"loss": 0.9711,
"step": 309
},
{
"epoch": 1.1296296296296295,
"grad_norm": 0.7708266973495483,
"learning_rate": 3.970106906294509e-06,
"loss": 0.9195,
"step": 310
},
{
"epoch": 1.1333333333333333,
"grad_norm": 0.5702573657035828,
"learning_rate": 3.9411230380456925e-06,
"loss": 0.9393,
"step": 311
},
{
"epoch": 1.137037037037037,
"grad_norm": 0.6527413725852966,
"learning_rate": 3.912176374019462e-06,
"loss": 0.9125,
"step": 312
},
{
"epoch": 1.1407407407407408,
"grad_norm": 0.6216891407966614,
"learning_rate": 3.88326793127273e-06,
"loss": 0.8595,
"step": 313
},
{
"epoch": 1.1444444444444444,
"grad_norm": 0.7108457684516907,
"learning_rate": 3.85439872551948e-06,
"loss": 0.945,
"step": 314
},
{
"epoch": 1.1481481481481481,
"grad_norm": 0.564195990562439,
"learning_rate": 3.825569771095082e-06,
"loss": 0.9172,
"step": 315
},
{
"epoch": 1.151851851851852,
"grad_norm": 0.7456059455871582,
"learning_rate": 3.796782080920659e-06,
"loss": 0.9229,
"step": 316
},
{
"epoch": 1.1555555555555554,
"grad_norm": 0.6403030157089233,
"learning_rate": 3.768036666467486e-06,
"loss": 1.0,
"step": 317
},
{
"epoch": 1.1592592592592592,
"grad_norm": 0.6477362513542175,
"learning_rate": 3.7393345377214584e-06,
"loss": 0.9649,
"step": 318
},
{
"epoch": 1.162962962962963,
"grad_norm": 0.7265921831130981,
"learning_rate": 3.7106767031476075e-06,
"loss": 0.9558,
"step": 319
},
{
"epoch": 1.1666666666666667,
"grad_norm": 0.6614460349082947,
"learning_rate": 3.682064169654663e-06,
"loss": 0.9338,
"step": 320
},
{
"epoch": 1.1703703703703703,
"grad_norm": 0.8571596145629883,
"learning_rate": 3.6534979425596747e-06,
"loss": 0.8639,
"step": 321
},
{
"epoch": 1.174074074074074,
"grad_norm": 0.7662659883499146,
"learning_rate": 3.6249790255526916e-06,
"loss": 0.9099,
"step": 322
},
{
"epoch": 1.1777777777777778,
"grad_norm": 0.6332697868347168,
"learning_rate": 3.5965084206615012e-06,
"loss": 0.966,
"step": 323
},
{
"epoch": 1.1814814814814816,
"grad_norm": 0.5719053149223328,
"learning_rate": 3.568087128216414e-06,
"loss": 0.9005,
"step": 324
},
{
"epoch": 1.1851851851851851,
"grad_norm": 0.7472560405731201,
"learning_rate": 3.539716146815122e-06,
"loss": 0.8842,
"step": 325
},
{
"epoch": 1.1888888888888889,
"grad_norm": 0.661870002746582,
"learning_rate": 3.511396473287611e-06,
"loss": 0.9547,
"step": 326
},
{
"epoch": 1.1925925925925926,
"grad_norm": 0.8332524299621582,
"learning_rate": 3.483129102661137e-06,
"loss": 1.0097,
"step": 327
},
{
"epoch": 1.1962962962962962,
"grad_norm": 0.7124307155609131,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.9479,
"step": 328
},
{
"epoch": 1.2,
"grad_norm": 0.6653727889060974,
"learning_rate": 3.4267552409969694e-06,
"loss": 0.9566,
"step": 329
},
{
"epoch": 1.2037037037037037,
"grad_norm": 0.7246274948120117,
"learning_rate": 3.398650730685813e-06,
"loss": 0.8731,
"step": 330
},
{
"epoch": 1.2074074074074075,
"grad_norm": 0.7679101824760437,
"learning_rate": 3.3706024846591717e-06,
"loss": 0.8851,
"step": 331
},
{
"epoch": 1.211111111111111,
"grad_norm": 0.6830713152885437,
"learning_rate": 3.3426114884075488e-06,
"loss": 0.9429,
"step": 332
},
{
"epoch": 1.2148148148148148,
"grad_norm": 0.686126172542572,
"learning_rate": 3.3146787254099424e-06,
"loss": 0.9363,
"step": 333
},
{
"epoch": 1.2185185185185186,
"grad_norm": 1.0472172498703003,
"learning_rate": 3.2868051770992935e-06,
"loss": 0.8628,
"step": 334
},
{
"epoch": 1.2222222222222223,
"grad_norm": 0.6732811331748962,
"learning_rate": 3.258991822828007e-06,
"loss": 0.9343,
"step": 335
},
{
"epoch": 1.2259259259259259,
"grad_norm": 0.6006411910057068,
"learning_rate": 3.2312396398335312e-06,
"loss": 0.8932,
"step": 336
},
{
"epoch": 1.2296296296296296,
"grad_norm": 0.6865448355674744,
"learning_rate": 3.2035496032040303e-06,
"loss": 0.9113,
"step": 337
},
{
"epoch": 1.2333333333333334,
"grad_norm": 0.7750067114830017,
"learning_rate": 3.175922685844125e-06,
"loss": 0.8964,
"step": 338
},
{
"epoch": 1.237037037037037,
"grad_norm": 0.6137946248054504,
"learning_rate": 3.1483598584407003e-06,
"loss": 0.9198,
"step": 339
},
{
"epoch": 1.2407407407407407,
"grad_norm": 0.5940172672271729,
"learning_rate": 3.1208620894288105e-06,
"loss": 0.8925,
"step": 340
},
{
"epoch": 1.2407407407407407,
"eval_loss": 0.9176353812217712,
"eval_runtime": 80.9941,
"eval_samples_per_second": 3.593,
"eval_steps_per_second": 0.457,
"step": 340
},
{
"epoch": 1.2444444444444445,
"grad_norm": 0.8746694326400757,
"learning_rate": 3.093430344957643e-06,
"loss": 0.9542,
"step": 341
},
{
"epoch": 1.2481481481481482,
"grad_norm": 0.7152467370033264,
"learning_rate": 3.0660655888565827e-06,
"loss": 0.9122,
"step": 342
},
{
"epoch": 1.2518518518518518,
"grad_norm": 0.665104866027832,
"learning_rate": 3.038768782601335e-06,
"loss": 0.8695,
"step": 343
},
{
"epoch": 1.2555555555555555,
"grad_norm": 0.6397359371185303,
"learning_rate": 3.0115408852801535e-06,
"loss": 0.9026,
"step": 344
},
{
"epoch": 1.2592592592592593,
"grad_norm": 0.6641426682472229,
"learning_rate": 2.98438285356014e-06,
"loss": 0.9131,
"step": 345
},
{
"epoch": 1.262962962962963,
"grad_norm": 0.7378568053245544,
"learning_rate": 2.9572956416536267e-06,
"loss": 0.9778,
"step": 346
},
{
"epoch": 1.2666666666666666,
"grad_norm": 0.7851204872131348,
"learning_rate": 2.930280201284654e-06,
"loss": 0.9295,
"step": 347
},
{
"epoch": 1.2703703703703704,
"grad_norm": 0.7360734939575195,
"learning_rate": 2.9033374816555338e-06,
"loss": 0.8333,
"step": 348
},
{
"epoch": 1.2740740740740741,
"grad_norm": 0.5486617088317871,
"learning_rate": 2.8764684294134872e-06,
"loss": 0.8636,
"step": 349
},
{
"epoch": 1.2777777777777777,
"grad_norm": 0.6200026273727417,
"learning_rate": 2.8496739886173994e-06,
"loss": 0.9163,
"step": 350
},
{
"epoch": 1.2814814814814814,
"grad_norm": 0.7656910419464111,
"learning_rate": 2.822955100704634e-06,
"loss": 0.8811,
"step": 351
},
{
"epoch": 1.2851851851851852,
"grad_norm": 0.8108608722686768,
"learning_rate": 2.7963127044579697e-06,
"loss": 0.9206,
"step": 352
},
{
"epoch": 1.2888888888888888,
"grad_norm": 0.7808861136436462,
"learning_rate": 2.769747735972605e-06,
"loss": 0.9116,
"step": 353
},
{
"epoch": 1.2925925925925925,
"grad_norm": 0.6127861142158508,
"learning_rate": 2.743261128623269e-06,
"loss": 0.8986,
"step": 354
},
{
"epoch": 1.2962962962962963,
"grad_norm": 0.8103310465812683,
"learning_rate": 2.716853813031435e-06,
"loss": 0.8832,
"step": 355
},
{
"epoch": 1.3,
"grad_norm": 0.658495306968689,
"learning_rate": 2.6905267170326143e-06,
"loss": 0.9457,
"step": 356
},
{
"epoch": 1.3037037037037038,
"grad_norm": 0.6721301078796387,
"learning_rate": 2.6642807656437565e-06,
"loss": 0.9182,
"step": 357
},
{
"epoch": 1.3074074074074074,
"grad_norm": 0.6494591236114502,
"learning_rate": 2.6381168810307536e-06,
"loss": 0.9245,
"step": 358
},
{
"epoch": 1.3111111111111111,
"grad_norm": 0.6653662919998169,
"learning_rate": 2.612035982476039e-06,
"loss": 0.9654,
"step": 359
},
{
"epoch": 1.3148148148148149,
"grad_norm": 0.6556596159934998,
"learning_rate": 2.5860389863462765e-06,
"loss": 0.9552,
"step": 360
},
{
"epoch": 1.3185185185185184,
"grad_norm": 0.7767282724380493,
"learning_rate": 2.5601268060601816e-06,
"loss": 0.901,
"step": 361
},
{
"epoch": 1.3222222222222222,
"grad_norm": 0.6174845099449158,
"learning_rate": 2.534300352056416e-06,
"loss": 0.8979,
"step": 362
},
{
"epoch": 1.325925925925926,
"grad_norm": 0.5829298496246338,
"learning_rate": 2.508560531761597e-06,
"loss": 0.9774,
"step": 363
},
{
"epoch": 1.3296296296296295,
"grad_norm": 0.6260789632797241,
"learning_rate": 2.4829082495584244e-06,
"loss": 0.9693,
"step": 364
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.6920310854911804,
"learning_rate": 2.457344406753899e-06,
"loss": 0.9345,
"step": 365
},
{
"epoch": 1.337037037037037,
"grad_norm": 0.6177324652671814,
"learning_rate": 2.4318699015476495e-06,
"loss": 0.9274,
"step": 366
},
{
"epoch": 1.3407407407407408,
"grad_norm": 0.6547250151634216,
"learning_rate": 2.4064856290003863e-06,
"loss": 0.9309,
"step": 367
},
{
"epoch": 1.3444444444444446,
"grad_norm": 0.7775738835334778,
"learning_rate": 2.3811924810024385e-06,
"loss": 0.9607,
"step": 368
},
{
"epoch": 1.348148148148148,
"grad_norm": 0.8030884861946106,
"learning_rate": 2.35599134624243e-06,
"loss": 0.9343,
"step": 369
},
{
"epoch": 1.3518518518518519,
"grad_norm": 0.5836490988731384,
"learning_rate": 2.330883110176049e-06,
"loss": 0.9088,
"step": 370
},
{
"epoch": 1.3555555555555556,
"grad_norm": 0.7231821417808533,
"learning_rate": 2.3058686549949306e-06,
"loss": 0.8505,
"step": 371
},
{
"epoch": 1.3592592592592592,
"grad_norm": 0.7363606095314026,
"learning_rate": 2.2809488595956746e-06,
"loss": 0.9975,
"step": 372
},
{
"epoch": 1.362962962962963,
"grad_norm": 0.7326072454452515,
"learning_rate": 2.256124599548957e-06,
"loss": 0.9272,
"step": 373
},
{
"epoch": 1.3666666666666667,
"grad_norm": 0.6802873015403748,
"learning_rate": 2.2313967470687593e-06,
"loss": 0.9038,
"step": 374
},
{
"epoch": 1.3703703703703702,
"grad_norm": 0.6956616640090942,
"learning_rate": 2.2067661709817384e-06,
"loss": 0.9062,
"step": 375
},
{
"epoch": 1.374074074074074,
"grad_norm": 0.5167267322540283,
"learning_rate": 2.18223373669669e-06,
"loss": 0.8963,
"step": 376
},
{
"epoch": 1.3777777777777778,
"grad_norm": 0.5965335965156555,
"learning_rate": 2.157800306174139e-06,
"loss": 0.9253,
"step": 377
},
{
"epoch": 1.3814814814814815,
"grad_norm": 0.6725478768348694,
"learning_rate": 2.1334667378960642e-06,
"loss": 0.9349,
"step": 378
},
{
"epoch": 1.3851851851851853,
"grad_norm": 0.6209405064582825,
"learning_rate": 2.1092338868357305e-06,
"loss": 0.9129,
"step": 379
},
{
"epoch": 1.3888888888888888,
"grad_norm": 0.7127699255943298,
"learning_rate": 2.0851026044276405e-06,
"loss": 0.9502,
"step": 380
},
{
"epoch": 1.3925925925925926,
"grad_norm": 0.7374861836433411,
"learning_rate": 2.061073738537635e-06,
"loss": 0.913,
"step": 381
},
{
"epoch": 1.3962962962962964,
"grad_norm": 0.6551845669746399,
"learning_rate": 2.0371481334330913e-06,
"loss": 0.9493,
"step": 382
},
{
"epoch": 1.4,
"grad_norm": 0.7371388077735901,
"learning_rate": 2.013326629753259e-06,
"loss": 0.9285,
"step": 383
},
{
"epoch": 1.4037037037037037,
"grad_norm": 0.7790321111679077,
"learning_rate": 1.9896100644797316e-06,
"loss": 0.8598,
"step": 384
},
{
"epoch": 1.4074074074074074,
"grad_norm": 0.6032355427742004,
"learning_rate": 1.9659992709070346e-06,
"loss": 0.9298,
"step": 385
},
{
"epoch": 1.411111111111111,
"grad_norm": 0.7076795101165771,
"learning_rate": 1.9424950786133414e-06,
"loss": 0.92,
"step": 386
},
{
"epoch": 1.4148148148148147,
"grad_norm": 0.8255873322486877,
"learning_rate": 1.919098313431335e-06,
"loss": 0.8778,
"step": 387
},
{
"epoch": 1.4185185185185185,
"grad_norm": 0.7901713848114014,
"learning_rate": 1.8958097974191909e-06,
"loss": 0.8844,
"step": 388
},
{
"epoch": 1.4222222222222223,
"grad_norm": 0.7087342739105225,
"learning_rate": 1.8726303488316822e-06,
"loss": 0.9575,
"step": 389
},
{
"epoch": 1.425925925925926,
"grad_norm": 0.7450980544090271,
"learning_rate": 1.8495607820914451e-06,
"loss": 0.811,
"step": 390
},
{
"epoch": 1.4296296296296296,
"grad_norm": 0.5872119665145874,
"learning_rate": 1.826601907760357e-06,
"loss": 0.9075,
"step": 391
},
{
"epoch": 1.4333333333333333,
"grad_norm": 0.7893672585487366,
"learning_rate": 1.8037545325110506e-06,
"loss": 0.9549,
"step": 392
},
{
"epoch": 1.4370370370370371,
"grad_norm": 0.7330056428909302,
"learning_rate": 1.781019459098584e-06,
"loss": 0.9366,
"step": 393
},
{
"epoch": 1.4407407407407407,
"grad_norm": 0.6859740614891052,
"learning_rate": 1.7583974863322272e-06,
"loss": 0.9284,
"step": 394
},
{
"epoch": 1.4444444444444444,
"grad_norm": 0.7870423793792725,
"learning_rate": 1.7358894090473928e-06,
"loss": 0.8698,
"step": 395
},
{
"epoch": 1.4481481481481482,
"grad_norm": 0.6686491370201111,
"learning_rate": 1.7134960180777171e-06,
"loss": 0.9149,
"step": 396
},
{
"epoch": 1.4518518518518517,
"grad_norm": 0.7235841751098633,
"learning_rate": 1.6912181002272714e-06,
"loss": 0.9068,
"step": 397
},
{
"epoch": 1.4555555555555555,
"grad_norm": 0.6145541667938232,
"learning_rate": 1.6690564382429104e-06,
"loss": 0.8985,
"step": 398
},
{
"epoch": 1.4592592592592593,
"grad_norm": 0.5945561528205872,
"learning_rate": 1.6470118107867777e-06,
"loss": 0.9318,
"step": 399
},
{
"epoch": 1.462962962962963,
"grad_norm": 0.6769958734512329,
"learning_rate": 1.6250849924089485e-06,
"loss": 0.9207,
"step": 400
},
{
"epoch": 1.4666666666666668,
"grad_norm": 0.6360178589820862,
"learning_rate": 1.6032767535202042e-06,
"loss": 0.9344,
"step": 401
},
{
"epoch": 1.4703703703703703,
"grad_norm": 0.6406002640724182,
"learning_rate": 1.581587860364977e-06,
"loss": 0.8903,
"step": 402
},
{
"epoch": 1.474074074074074,
"grad_norm": 0.7576456069946289,
"learning_rate": 1.560019074994416e-06,
"loss": 0.8949,
"step": 403
},
{
"epoch": 1.4777777777777779,
"grad_norm": 0.8080588579177856,
"learning_rate": 1.5385711552396227e-06,
"loss": 0.9252,
"step": 404
},
{
"epoch": 1.4814814814814814,
"grad_norm": 0.6083511114120483,
"learning_rate": 1.5172448546850166e-06,
"loss": 0.9096,
"step": 405
},
{
"epoch": 1.4851851851851852,
"grad_norm": 0.7563885450363159,
"learning_rate": 1.4960409226418576e-06,
"loss": 0.966,
"step": 406
},
{
"epoch": 1.488888888888889,
"grad_norm": 0.8337453603744507,
"learning_rate": 1.4749601041219246e-06,
"loss": 0.933,
"step": 407
},
{
"epoch": 1.4925925925925925,
"grad_norm": 0.5826141238212585,
"learning_rate": 1.4540031398113335e-06,
"loss": 0.896,
"step": 408
},
{
"epoch": 1.4925925925925925,
"eval_loss": 0.9128310084342957,
"eval_runtime": 80.6954,
"eval_samples_per_second": 3.606,
"eval_steps_per_second": 0.459,
"step": 408
},
{
"epoch": 1.4962962962962962,
"grad_norm": 0.64705890417099,
"learning_rate": 1.4331707660445155e-06,
"loss": 0.8723,
"step": 409
},
{
"epoch": 1.5,
"grad_norm": 0.6053957939147949,
"learning_rate": 1.4124637147783431e-06,
"loss": 0.8731,
"step": 410
},
{
"epoch": 1.5037037037037035,
"grad_norm": 0.8161659836769104,
"learning_rate": 1.3918827135664186e-06,
"loss": 0.9542,
"step": 411
},
{
"epoch": 1.5074074074074075,
"grad_norm": 0.8037695288658142,
"learning_rate": 1.371428485533498e-06,
"loss": 0.9263,
"step": 412
},
{
"epoch": 1.511111111111111,
"grad_norm": 0.6966097950935364,
"learning_rate": 1.3511017493501005e-06,
"loss": 0.9611,
"step": 413
},
{
"epoch": 1.5148148148148148,
"grad_norm": 0.7274028062820435,
"learning_rate": 1.3309032192072463e-06,
"loss": 0.8968,
"step": 414
},
{
"epoch": 1.5185185185185186,
"grad_norm": 0.657966136932373,
"learning_rate": 1.3108336047913633e-06,
"loss": 0.9025,
"step": 415
},
{
"epoch": 1.5222222222222221,
"grad_norm": 0.7330816388130188,
"learning_rate": 1.29089361125936e-06,
"loss": 0.9422,
"step": 416
},
{
"epoch": 1.525925925925926,
"grad_norm": 0.6839099526405334,
"learning_rate": 1.2710839392138386e-06,
"loss": 0.9604,
"step": 417
},
{
"epoch": 1.5296296296296297,
"grad_norm": 0.7069361805915833,
"learning_rate": 1.251405284678488e-06,
"loss": 0.9125,
"step": 418
},
{
"epoch": 1.5333333333333332,
"grad_norm": 0.6964563131332397,
"learning_rate": 1.2318583390736256e-06,
"loss": 0.899,
"step": 419
},
{
"epoch": 1.5370370370370372,
"grad_norm": 0.6390750408172607,
"learning_rate": 1.2124437891918995e-06,
"loss": 0.8699,
"step": 420
},
{
"epoch": 1.5407407407407407,
"grad_norm": 0.7302431464195251,
"learning_rate": 1.1931623171741653e-06,
"loss": 0.9124,
"step": 421
},
{
"epoch": 1.5444444444444443,
"grad_norm": 0.6087197065353394,
"learning_rate": 1.1740146004855141e-06,
"loss": 0.8754,
"step": 422
},
{
"epoch": 1.5481481481481483,
"grad_norm": 0.768294632434845,
"learning_rate": 1.1550013118914665e-06,
"loss": 0.9578,
"step": 423
},
{
"epoch": 1.5518518518518518,
"grad_norm": 0.6789171099662781,
"learning_rate": 1.1361231194343436e-06,
"loss": 0.9235,
"step": 424
},
{
"epoch": 1.5555555555555556,
"grad_norm": 0.9581536054611206,
"learning_rate": 1.1173806864097885e-06,
"loss": 0.8839,
"step": 425
},
{
"epoch": 1.5592592592592593,
"grad_norm": 0.5669332146644592,
"learning_rate": 1.0987746713434578e-06,
"loss": 0.9126,
"step": 426
},
{
"epoch": 1.5629629629629629,
"grad_norm": 0.8494063019752502,
"learning_rate": 1.080305727967893e-06,
"loss": 0.8771,
"step": 427
},
{
"epoch": 1.5666666666666667,
"grad_norm": 0.7187017202377319,
"learning_rate": 1.0619745051995473e-06,
"loss": 0.9504,
"step": 428
},
{
"epoch": 1.5703703703703704,
"grad_norm": 0.5970302820205688,
"learning_rate": 1.043781647115979e-06,
"loss": 0.8693,
"step": 429
},
{
"epoch": 1.574074074074074,
"grad_norm": 0.6068917512893677,
"learning_rate": 1.0257277929332332e-06,
"loss": 0.9556,
"step": 430
},
{
"epoch": 1.5777777777777777,
"grad_norm": 0.5248574018478394,
"learning_rate": 1.0078135769833758e-06,
"loss": 0.9034,
"step": 431
},
{
"epoch": 1.5814814814814815,
"grad_norm": 0.593900740146637,
"learning_rate": 9.900396286922025e-07,
"loss": 0.9028,
"step": 432
},
{
"epoch": 1.585185185185185,
"grad_norm": 0.6070235371589661,
"learning_rate": 9.72406572557133e-07,
"loss": 0.8641,
"step": 433
},
{
"epoch": 1.588888888888889,
"grad_norm": 0.6419976353645325,
"learning_rate": 9.549150281252633e-07,
"loss": 0.9095,
"step": 434
},
{
"epoch": 1.5925925925925926,
"grad_norm": 0.8620632290840149,
"learning_rate": 9.375656099715935e-07,
"loss": 0.974,
"step": 435
},
{
"epoch": 1.5962962962962963,
"grad_norm": 0.6101662516593933,
"learning_rate": 9.203589276774438e-07,
"loss": 0.8868,
"step": 436
},
{
"epoch": 1.6,
"grad_norm": 0.6099417209625244,
"learning_rate": 9.032955858090319e-07,
"loss": 0.8978,
"step": 437
},
{
"epoch": 1.6037037037037036,
"grad_norm": 0.8809055685997009,
"learning_rate": 8.86376183896226e-07,
"loss": 0.8905,
"step": 438
},
{
"epoch": 1.6074074074074074,
"grad_norm": 0.5985243320465088,
"learning_rate": 8.696013164114902e-07,
"loss": 0.8914,
"step": 439
},
{
"epoch": 1.6111111111111112,
"grad_norm": 0.6197245121002197,
"learning_rate": 8.529715727489912e-07,
"loss": 0.8962,
"step": 440
},
{
"epoch": 1.6148148148148147,
"grad_norm": 0.6981231570243835,
"learning_rate": 8.364875372038878e-07,
"loss": 0.9588,
"step": 441
},
{
"epoch": 1.6185185185185185,
"grad_norm": 0.9516739249229431,
"learning_rate": 8.201497889518073e-07,
"loss": 0.894,
"step": 442
},
{
"epoch": 1.6222222222222222,
"grad_norm": 0.7582102417945862,
"learning_rate": 8.039589020284926e-07,
"loss": 0.8848,
"step": 443
},
{
"epoch": 1.6259259259259258,
"grad_norm": 0.7806898355484009,
"learning_rate": 7.879154453096305e-07,
"loss": 0.9589,
"step": 444
},
{
"epoch": 1.6296296296296298,
"grad_norm": 0.7488991022109985,
"learning_rate": 7.720199824908692e-07,
"loss": 0.876,
"step": 445
},
{
"epoch": 1.6333333333333333,
"grad_norm": 0.7136239409446716,
"learning_rate": 7.562730720680111e-07,
"loss": 0.9316,
"step": 446
},
{
"epoch": 1.637037037037037,
"grad_norm": 0.7333770990371704,
"learning_rate": 7.406752673173851e-07,
"loss": 0.8628,
"step": 447
},
{
"epoch": 1.6407407407407408,
"grad_norm": 0.8281632661819458,
"learning_rate": 7.25227116276413e-07,
"loss": 0.9111,
"step": 448
},
{
"epoch": 1.6444444444444444,
"grad_norm": 0.742244303226471,
"learning_rate": 7.099291617243526e-07,
"loss": 1.0076,
"step": 449
},
{
"epoch": 1.6481481481481481,
"grad_norm": 0.9548508524894714,
"learning_rate": 6.947819411632223e-07,
"loss": 0.8449,
"step": 450
},
{
"epoch": 1.651851851851852,
"grad_norm": 0.6826335787773132,
"learning_rate": 6.797859867989226e-07,
"loss": 0.9281,
"step": 451
},
{
"epoch": 1.6555555555555554,
"grad_norm": 0.7542606592178345,
"learning_rate": 6.649418255225298e-07,
"loss": 0.9343,
"step": 452
},
{
"epoch": 1.6592592592592592,
"grad_norm": 0.6692271828651428,
"learning_rate": 6.502499788917893e-07,
"loss": 0.9217,
"step": 453
},
{
"epoch": 1.662962962962963,
"grad_norm": 0.6740605235099792,
"learning_rate": 6.357109631127889e-07,
"loss": 0.9304,
"step": 454
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.7319871187210083,
"learning_rate": 6.213252890218163e-07,
"loss": 0.937,
"step": 455
},
{
"epoch": 1.6703703703703705,
"grad_norm": 0.5654203295707703,
"learning_rate": 6.07093462067419e-07,
"loss": 0.8914,
"step": 456
},
{
"epoch": 1.674074074074074,
"grad_norm": 0.5713678002357483,
"learning_rate": 5.930159822926407e-07,
"loss": 0.9257,
"step": 457
},
{
"epoch": 1.6777777777777778,
"grad_norm": 0.838940441608429,
"learning_rate": 5.79093344317449e-07,
"loss": 0.9104,
"step": 458
},
{
"epoch": 1.6814814814814816,
"grad_norm": 0.6449588537216187,
"learning_rate": 5.653260373213632e-07,
"loss": 0.8805,
"step": 459
},
{
"epoch": 1.6851851851851851,
"grad_norm": 0.7371458411216736,
"learning_rate": 5.517145450262639e-07,
"loss": 0.8835,
"step": 460
},
{
"epoch": 1.6888888888888889,
"grad_norm": 0.679885983467102,
"learning_rate": 5.382593456793933e-07,
"loss": 0.9306,
"step": 461
},
{
"epoch": 1.6925925925925926,
"grad_norm": 0.5046345591545105,
"learning_rate": 5.249609120365579e-07,
"loss": 0.8928,
"step": 462
},
{
"epoch": 1.6962962962962962,
"grad_norm": 0.8400017023086548,
"learning_rate": 5.118197113455164e-07,
"loss": 0.9142,
"step": 463
},
{
"epoch": 1.7,
"grad_norm": 0.6406440734863281,
"learning_rate": 4.988362053295564e-07,
"loss": 0.8868,
"step": 464
},
{
"epoch": 1.7037037037037037,
"grad_norm": 0.633682906627655,
"learning_rate": 4.860108501712824e-07,
"loss": 0.913,
"step": 465
},
{
"epoch": 1.7074074074074073,
"grad_norm": 0.5694250464439392,
"learning_rate": 4.733440964965791e-07,
"loss": 0.9226,
"step": 466
},
{
"epoch": 1.7111111111111112,
"grad_norm": 0.7359141707420349,
"learning_rate": 4.6083638935878025e-07,
"loss": 0.9148,
"step": 467
},
{
"epoch": 1.7148148148148148,
"grad_norm": 0.669040322303772,
"learning_rate": 4.484881682230341e-07,
"loss": 0.8438,
"step": 468
},
{
"epoch": 1.7185185185185186,
"grad_norm": 0.6235198974609375,
"learning_rate": 4.3629986695086166e-07,
"loss": 0.9097,
"step": 469
},
{
"epoch": 1.7222222222222223,
"grad_norm": 0.617144763469696,
"learning_rate": 4.242719137849077e-07,
"loss": 0.8961,
"step": 470
},
{
"epoch": 1.7259259259259259,
"grad_norm": 0.6972277164459229,
"learning_rate": 4.124047313339025e-07,
"loss": 0.8856,
"step": 471
},
{
"epoch": 1.7296296296296296,
"grad_norm": 0.7551361322402954,
"learning_rate": 4.00698736557808e-07,
"loss": 0.9143,
"step": 472
},
{
"epoch": 1.7333333333333334,
"grad_norm": 0.7269377708435059,
"learning_rate": 3.891543407531673e-07,
"loss": 0.9145,
"step": 473
},
{
"epoch": 1.737037037037037,
"grad_norm": 0.7635181546211243,
"learning_rate": 3.777719495386567e-07,
"loss": 0.8855,
"step": 474
},
{
"epoch": 1.7407407407407407,
"grad_norm": 0.745471715927124,
"learning_rate": 3.665519628408332e-07,
"loss": 0.9049,
"step": 475
},
{
"epoch": 1.7444444444444445,
"grad_norm": 0.983215868473053,
"learning_rate": 3.5549477488007853e-07,
"loss": 0.8904,
"step": 476
},
{
"epoch": 1.7444444444444445,
"eval_loss": 0.910611093044281,
"eval_runtime": 80.7891,
"eval_samples_per_second": 3.602,
"eval_steps_per_second": 0.458,
"step": 476
},
{
"epoch": 1.748148148148148,
"grad_norm": 0.645391047000885,
"learning_rate": 3.4460077415675473e-07,
"loss": 0.9156,
"step": 477
},
{
"epoch": 1.751851851851852,
"grad_norm": 0.7084014415740967,
"learning_rate": 3.3387034343755063e-07,
"loss": 0.9417,
"step": 478
},
{
"epoch": 1.7555555555555555,
"grad_norm": 0.6383021473884583,
"learning_rate": 3.2330385974203184e-07,
"loss": 0.9339,
"step": 479
},
{
"epoch": 1.7592592592592593,
"grad_norm": 0.6533625721931458,
"learning_rate": 3.1290169432939556e-07,
"loss": 0.9548,
"step": 480
},
{
"epoch": 1.762962962962963,
"grad_norm": 0.5707213878631592,
"learning_rate": 3.0266421268542734e-07,
"loss": 0.9544,
"step": 481
},
{
"epoch": 1.7666666666666666,
"grad_norm": 0.6648502945899963,
"learning_rate": 2.925917745096568e-07,
"loss": 0.8525,
"step": 482
},
{
"epoch": 1.7703703703703704,
"grad_norm": 0.6798570156097412,
"learning_rate": 2.826847337027222e-07,
"loss": 0.9217,
"step": 483
},
{
"epoch": 1.7740740740740741,
"grad_norm": 0.709642231464386,
"learning_rate": 2.7294343835393366e-07,
"loss": 0.8996,
"step": 484
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.7643037438392639,
"learning_rate": 2.6336823072904305e-07,
"loss": 0.8625,
"step": 485
},
{
"epoch": 1.7814814814814814,
"grad_norm": 0.7004448771476746,
"learning_rate": 2.539594472582213e-07,
"loss": 0.9118,
"step": 486
},
{
"epoch": 1.7851851851851852,
"grad_norm": 0.6062957048416138,
"learning_rate": 2.447174185242324e-07,
"loss": 0.8849,
"step": 487
},
{
"epoch": 1.7888888888888888,
"grad_norm": 0.6031033396720886,
"learning_rate": 2.3564246925082358e-07,
"loss": 0.924,
"step": 488
},
{
"epoch": 1.7925925925925927,
"grad_norm": 0.6818556189537048,
"learning_rate": 2.2673491829131365e-07,
"loss": 0.9206,
"step": 489
},
{
"epoch": 1.7962962962962963,
"grad_norm": 0.7448561191558838,
"learning_rate": 2.179950786173879e-07,
"loss": 0.8549,
"step": 490
},
{
"epoch": 1.8,
"grad_norm": 0.603404700756073,
"learning_rate": 2.0942325730810565e-07,
"loss": 0.919,
"step": 491
},
{
"epoch": 1.8037037037037038,
"grad_norm": 0.7165398001670837,
"learning_rate": 2.01019755539108e-07,
"loss": 0.9021,
"step": 492
},
{
"epoch": 1.8074074074074074,
"grad_norm": 0.7593845725059509,
"learning_rate": 1.9278486857203683e-07,
"loss": 0.9153,
"step": 493
},
{
"epoch": 1.8111111111111111,
"grad_norm": 0.6313470602035522,
"learning_rate": 1.8471888574415953e-07,
"loss": 0.9106,
"step": 494
},
{
"epoch": 1.8148148148148149,
"grad_norm": 0.6173641085624695,
"learning_rate": 1.7682209045820687e-07,
"loss": 0.8888,
"step": 495
},
{
"epoch": 1.8185185185185184,
"grad_norm": 0.76194167137146,
"learning_rate": 1.690947601724091e-07,
"loss": 0.9064,
"step": 496
},
{
"epoch": 1.8222222222222222,
"grad_norm": 0.675755500793457,
"learning_rate": 1.6153716639075223e-07,
"loss": 0.9266,
"step": 497
},
{
"epoch": 1.825925925925926,
"grad_norm": 0.7498816847801208,
"learning_rate": 1.5414957465343883e-07,
"loss": 0.8579,
"step": 498
},
{
"epoch": 1.8296296296296295,
"grad_norm": 0.656910240650177,
"learning_rate": 1.4693224452755284e-07,
"loss": 0.821,
"step": 499
},
{
"epoch": 1.8333333333333335,
"grad_norm": 0.6735762357711792,
"learning_rate": 1.3988542959794627e-07,
"loss": 0.8731,
"step": 500
},
{
"epoch": 1.837037037037037,
"grad_norm": 0.6537667512893677,
"learning_rate": 1.330093774583252e-07,
"loss": 0.9249,
"step": 501
},
{
"epoch": 1.8407407407407408,
"grad_norm": 0.6112355589866638,
"learning_rate": 1.2630432970255014e-07,
"loss": 0.8936,
"step": 502
},
{
"epoch": 1.8444444444444446,
"grad_norm": 0.7084822058677673,
"learning_rate": 1.1977052191615158e-07,
"loss": 0.9221,
"step": 503
},
{
"epoch": 1.848148148148148,
"grad_norm": 0.652979850769043,
"learning_rate": 1.1340818366804728e-07,
"loss": 0.9073,
"step": 504
},
{
"epoch": 1.8518518518518519,
"grad_norm": 0.6980672478675842,
"learning_rate": 1.0721753850247984e-07,
"loss": 0.9294,
"step": 505
},
{
"epoch": 1.8555555555555556,
"grad_norm": 0.7224528789520264,
"learning_rate": 1.0119880393116177e-07,
"loss": 0.8842,
"step": 506
},
{
"epoch": 1.8592592592592592,
"grad_norm": 0.6340327262878418,
"learning_rate": 9.535219142563168e-08,
"loss": 0.9598,
"step": 507
},
{
"epoch": 1.862962962962963,
"grad_norm": 0.686582624912262,
"learning_rate": 8.967790640982466e-08,
"loss": 0.9344,
"step": 508
},
{
"epoch": 1.8666666666666667,
"grad_norm": 0.5959629416465759,
"learning_rate": 8.417614825285636e-08,
"loss": 0.9026,
"step": 509
},
{
"epoch": 1.8703703703703702,
"grad_norm": 0.6575542092323303,
"learning_rate": 7.884711026201586e-08,
"loss": 0.9288,
"step": 510
},
{
"epoch": 1.8740740740740742,
"grad_norm": 0.5704898238182068,
"learning_rate": 7.369097967597493e-08,
"loss": 0.8636,
"step": 511
},
{
"epoch": 1.8777777777777778,
"grad_norm": 0.6155747771263123,
"learning_rate": 6.870793765820783e-08,
"loss": 0.8362,
"step": 512
},
{
"epoch": 1.8814814814814815,
"grad_norm": 0.6208741664886475,
"learning_rate": 6.389815929062848e-08,
"loss": 0.9179,
"step": 513
},
{
"epoch": 1.8851851851851853,
"grad_norm": 0.7014544010162354,
"learning_rate": 5.92618135674361e-08,
"loss": 0.9333,
"step": 514
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.814078152179718,
"learning_rate": 5.479906338917984e-08,
"loss": 0.9186,
"step": 515
},
{
"epoch": 1.8925925925925926,
"grad_norm": 0.7297834753990173,
"learning_rate": 5.0510065557034526e-08,
"loss": 0.8992,
"step": 516
},
{
"epoch": 1.8962962962962964,
"grad_norm": 0.6444849371910095,
"learning_rate": 4.639497076728949e-08,
"loss": 0.94,
"step": 517
},
{
"epoch": 1.9,
"grad_norm": 0.5319604873657227,
"learning_rate": 4.245392360605727e-08,
"loss": 0.9396,
"step": 518
},
{
"epoch": 1.9037037037037037,
"grad_norm": 0.9374611973762512,
"learning_rate": 3.86870625441893e-08,
"loss": 0.9718,
"step": 519
},
{
"epoch": 1.9074074074074074,
"grad_norm": 0.5568841695785522,
"learning_rate": 3.5094519932415417e-08,
"loss": 0.89,
"step": 520
},
{
"epoch": 1.911111111111111,
"grad_norm": 0.9112274646759033,
"learning_rate": 3.167642199668863e-08,
"loss": 0.925,
"step": 521
},
{
"epoch": 1.914814814814815,
"grad_norm": 0.655830979347229,
"learning_rate": 2.843288883375539e-08,
"loss": 0.9135,
"step": 522
},
{
"epoch": 1.9185185185185185,
"grad_norm": 0.5499829649925232,
"learning_rate": 2.5364034406930026e-08,
"loss": 0.8902,
"step": 523
},
{
"epoch": 1.9222222222222223,
"grad_norm": 0.9093420505523682,
"learning_rate": 2.2469966542096323e-08,
"loss": 0.971,
"step": 524
},
{
"epoch": 1.925925925925926,
"grad_norm": 0.8075233101844788,
"learning_rate": 1.975078692391552e-08,
"loss": 0.9288,
"step": 525
},
{
"epoch": 1.9296296296296296,
"grad_norm": 0.6721240282058716,
"learning_rate": 1.7206591092253642e-08,
"loss": 0.8983,
"step": 526
},
{
"epoch": 1.9333333333333333,
"grad_norm": 0.6682837605476379,
"learning_rate": 1.4837468438826385e-08,
"loss": 0.9423,
"step": 527
},
{
"epoch": 1.9370370370370371,
"grad_norm": 0.653581440448761,
"learning_rate": 1.264350220405719e-08,
"loss": 0.9542,
"step": 528
},
{
"epoch": 1.9407407407407407,
"grad_norm": 0.5496450066566467,
"learning_rate": 1.0624769474152363e-08,
"loss": 0.9059,
"step": 529
},
{
"epoch": 1.9444444444444444,
"grad_norm": 0.7037910223007202,
"learning_rate": 8.781341178393244e-09,
"loss": 0.8928,
"step": 530
},
{
"epoch": 1.9481481481481482,
"grad_norm": 0.6391336917877197,
"learning_rate": 7.1132820866431915e-09,
"loss": 0.8936,
"step": 531
},
{
"epoch": 1.9518518518518517,
"grad_norm": 0.7979388236999512,
"learning_rate": 5.620650807073857e-09,
"loss": 0.8871,
"step": 532
},
{
"epoch": 1.9555555555555557,
"grad_norm": 0.6291653513908386,
"learning_rate": 4.303499784102383e-09,
"loss": 0.8815,
"step": 533
},
{
"epoch": 1.9592592592592593,
"grad_norm": 0.7071843147277832,
"learning_rate": 3.1618752965534295e-09,
"loss": 0.8984,
"step": 534
},
{
"epoch": 1.9629629629629628,
"grad_norm": 0.5879070162773132,
"learning_rate": 2.19581745602826e-09,
"loss": 0.849,
"step": 535
},
{
"epoch": 1.9666666666666668,
"grad_norm": 0.743624746799469,
"learning_rate": 1.4053602054991954e-09,
"loss": 0.879,
"step": 536
},
{
"epoch": 1.9703703703703703,
"grad_norm": 0.5870293974876404,
"learning_rate": 7.905313181150176e-10,
"loss": 0.9257,
"step": 537
},
{
"epoch": 1.974074074074074,
"grad_norm": 0.7187138199806213,
"learning_rate": 3.513523962256349e-10,
"loss": 0.9768,
"step": 538
},
{
"epoch": 1.9777777777777779,
"grad_norm": 0.6711537837982178,
"learning_rate": 8.783887062324692e-11,
"loss": 0.9182,
"step": 539
},
{
"epoch": 1.9814814814814814,
"grad_norm": 0.66741943359375,
"learning_rate": 0.0,
"loss": 0.8763,
"step": 540
}
],
"logging_steps": 1,
"max_steps": 540,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 135,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.9871043243514266e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}