{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.997215777262181,
"eval_steps": 500,
"global_step": 538,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014849187935034803,
"grad_norm": 895.9867553710938,
"learning_rate": 2e-05,
"loss": 41.8575,
"step": 4
},
{
"epoch": 0.029698375870069606,
"grad_norm": 164.2000732421875,
"learning_rate": 2e-05,
"loss": 19.2704,
"step": 8
},
{
"epoch": 0.044547563805104405,
"grad_norm": 130.14195251464844,
"learning_rate": 2e-05,
"loss": 16.7431,
"step": 12
},
{
"epoch": 0.05939675174013921,
"grad_norm": 119.65748596191406,
"learning_rate": 2e-05,
"loss": 17.2431,
"step": 16
},
{
"epoch": 0.07424593967517401,
"grad_norm": 123.41026306152344,
"learning_rate": 2e-05,
"loss": 17.5812,
"step": 20
},
{
"epoch": 0.08909512761020881,
"grad_norm": 143.79872131347656,
"learning_rate": 2e-05,
"loss": 16.1039,
"step": 24
},
{
"epoch": 0.10394431554524362,
"grad_norm": 191.55752563476562,
"learning_rate": 2e-05,
"loss": 15.5393,
"step": 28
},
{
"epoch": 0.11879350348027842,
"grad_norm": 125.146728515625,
"learning_rate": 2e-05,
"loss": 15.1988,
"step": 32
},
{
"epoch": 0.13364269141531324,
"grad_norm": 122.55828857421875,
"learning_rate": 2e-05,
"loss": 15.456,
"step": 36
},
{
"epoch": 0.14849187935034802,
"grad_norm": 126.60418701171875,
"learning_rate": 2e-05,
"loss": 16.9079,
"step": 40
},
{
"epoch": 0.16334106728538283,
"grad_norm": 116.0846176147461,
"learning_rate": 2e-05,
"loss": 15.9405,
"step": 44
},
{
"epoch": 0.17819025522041762,
"grad_norm": 135.65383911132812,
"learning_rate": 2e-05,
"loss": 13.6821,
"step": 48
},
{
"epoch": 0.19303944315545243,
"grad_norm": 115.77993774414062,
"learning_rate": 2e-05,
"loss": 15.6503,
"step": 52
},
{
"epoch": 0.20788863109048725,
"grad_norm": 131.34146118164062,
"learning_rate": 2e-05,
"loss": 15.7174,
"step": 56
},
{
"epoch": 0.22273781902552203,
"grad_norm": 150.83935546875,
"learning_rate": 2e-05,
"loss": 16.6436,
"step": 60
},
{
"epoch": 0.23758700696055685,
"grad_norm": 152.6024169921875,
"learning_rate": 2e-05,
"loss": 16.1857,
"step": 64
},
{
"epoch": 0.25243619489559166,
"grad_norm": 165.27406311035156,
"learning_rate": 2e-05,
"loss": 15.5328,
"step": 68
},
{
"epoch": 0.2672853828306265,
"grad_norm": 119.0411376953125,
"learning_rate": 2e-05,
"loss": 14.212,
"step": 72
},
{
"epoch": 0.28213457076566123,
"grad_norm": 130.3306884765625,
"learning_rate": 2e-05,
"loss": 16.7866,
"step": 76
},
{
"epoch": 0.29698375870069604,
"grad_norm": 115.24845123291016,
"learning_rate": 2e-05,
"loss": 15.0373,
"step": 80
},
{
"epoch": 0.31183294663573086,
"grad_norm": 174.6798858642578,
"learning_rate": 2e-05,
"loss": 15.3437,
"step": 84
},
{
"epoch": 0.32668213457076567,
"grad_norm": 145.3719482421875,
"learning_rate": 2e-05,
"loss": 14.4015,
"step": 88
},
{
"epoch": 0.3415313225058005,
"grad_norm": 117.09785461425781,
"learning_rate": 2e-05,
"loss": 13.7134,
"step": 92
},
{
"epoch": 0.35638051044083524,
"grad_norm": 120.23141479492188,
"learning_rate": 2e-05,
"loss": 14.641,
"step": 96
},
{
"epoch": 0.37122969837587005,
"grad_norm": 107.27012634277344,
"learning_rate": 2e-05,
"loss": 14.7094,
"step": 100
},
{
"epoch": 0.38607888631090487,
"grad_norm": 136.1507568359375,
"learning_rate": 2e-05,
"loss": 14.8711,
"step": 104
},
{
"epoch": 0.4009280742459397,
"grad_norm": 136.19911193847656,
"learning_rate": 2e-05,
"loss": 14.7636,
"step": 108
},
{
"epoch": 0.4157772621809745,
"grad_norm": 120.15601348876953,
"learning_rate": 2e-05,
"loss": 16.0424,
"step": 112
},
{
"epoch": 0.4306264501160093,
"grad_norm": 104.66596221923828,
"learning_rate": 2e-05,
"loss": 14.2951,
"step": 116
},
{
"epoch": 0.44547563805104406,
"grad_norm": 102.8609619140625,
"learning_rate": 2e-05,
"loss": 13.2711,
"step": 120
},
{
"epoch": 0.4603248259860789,
"grad_norm": 108.99791717529297,
"learning_rate": 2e-05,
"loss": 14.4603,
"step": 124
},
{
"epoch": 0.4751740139211137,
"grad_norm": 100.2767333984375,
"learning_rate": 2e-05,
"loss": 14.5153,
"step": 128
},
{
"epoch": 0.4900232018561485,
"grad_norm": 108.51724243164062,
"learning_rate": 2e-05,
"loss": 14.3767,
"step": 132
},
{
"epoch": 0.5048723897911833,
"grad_norm": 139.0511932373047,
"learning_rate": 2e-05,
"loss": 15.0579,
"step": 136
},
{
"epoch": 0.5197215777262181,
"grad_norm": 131.45651245117188,
"learning_rate": 2e-05,
"loss": 16.0837,
"step": 140
},
{
"epoch": 0.534570765661253,
"grad_norm": 128.41012573242188,
"learning_rate": 2e-05,
"loss": 13.679,
"step": 144
},
{
"epoch": 0.5494199535962877,
"grad_norm": 138.88658142089844,
"learning_rate": 2e-05,
"loss": 13.4384,
"step": 148
},
{
"epoch": 0.5642691415313225,
"grad_norm": 119.11845397949219,
"learning_rate": 2e-05,
"loss": 13.9317,
"step": 152
},
{
"epoch": 0.5791183294663573,
"grad_norm": 119.57584381103516,
"learning_rate": 2e-05,
"loss": 14.371,
"step": 156
},
{
"epoch": 0.5939675174013921,
"grad_norm": 96.74629211425781,
"learning_rate": 2e-05,
"loss": 15.2401,
"step": 160
},
{
"epoch": 0.608816705336427,
"grad_norm": 111.12255096435547,
"learning_rate": 2e-05,
"loss": 15.1936,
"step": 164
},
{
"epoch": 0.6236658932714617,
"grad_norm": 148.77015686035156,
"learning_rate": 2e-05,
"loss": 14.4655,
"step": 168
},
{
"epoch": 0.6385150812064965,
"grad_norm": 107.04643249511719,
"learning_rate": 2e-05,
"loss": 12.6344,
"step": 172
},
{
"epoch": 0.6533642691415313,
"grad_norm": 104.93022918701172,
"learning_rate": 2e-05,
"loss": 13.9102,
"step": 176
},
{
"epoch": 0.6682134570765661,
"grad_norm": 104.616943359375,
"learning_rate": 2e-05,
"loss": 14.9522,
"step": 180
},
{
"epoch": 0.683062645011601,
"grad_norm": 139.63406372070312,
"learning_rate": 2e-05,
"loss": 15.4642,
"step": 184
},
{
"epoch": 0.6979118329466357,
"grad_norm": 106.42848205566406,
"learning_rate": 2e-05,
"loss": 14.1578,
"step": 188
},
{
"epoch": 0.7127610208816705,
"grad_norm": 95.40778350830078,
"learning_rate": 2e-05,
"loss": 15.5809,
"step": 192
},
{
"epoch": 0.7276102088167054,
"grad_norm": 106.99407958984375,
"learning_rate": 2e-05,
"loss": 12.3565,
"step": 196
},
{
"epoch": 0.7424593967517401,
"grad_norm": 116.07793426513672,
"learning_rate": 2e-05,
"loss": 13.6122,
"step": 200
},
{
"epoch": 0.757308584686775,
"grad_norm": 117.84542846679688,
"learning_rate": 2e-05,
"loss": 14.2531,
"step": 204
},
{
"epoch": 0.7721577726218097,
"grad_norm": 90.03235626220703,
"learning_rate": 2e-05,
"loss": 14.2915,
"step": 208
},
{
"epoch": 0.7870069605568445,
"grad_norm": 99.91178894042969,
"learning_rate": 2e-05,
"loss": 13.7193,
"step": 212
},
{
"epoch": 0.8018561484918794,
"grad_norm": 127.37728881835938,
"learning_rate": 2e-05,
"loss": 14.4029,
"step": 216
},
{
"epoch": 0.8167053364269141,
"grad_norm": 106.17198181152344,
"learning_rate": 2e-05,
"loss": 14.152,
"step": 220
},
{
"epoch": 0.831554524361949,
"grad_norm": 109.1567611694336,
"learning_rate": 2e-05,
"loss": 14.6705,
"step": 224
},
{
"epoch": 0.8464037122969837,
"grad_norm": 101.11131286621094,
"learning_rate": 2e-05,
"loss": 13.956,
"step": 228
},
{
"epoch": 0.8612529002320186,
"grad_norm": 113.48827362060547,
"learning_rate": 2e-05,
"loss": 14.138,
"step": 232
},
{
"epoch": 0.8761020881670534,
"grad_norm": 112.26351165771484,
"learning_rate": 2e-05,
"loss": 12.2284,
"step": 236
},
{
"epoch": 0.8909512761020881,
"grad_norm": 100.76663970947266,
"learning_rate": 2e-05,
"loss": 13.7275,
"step": 240
},
{
"epoch": 0.905800464037123,
"grad_norm": 104.24567413330078,
"learning_rate": 2e-05,
"loss": 12.7694,
"step": 244
},
{
"epoch": 0.9206496519721578,
"grad_norm": 106.16858673095703,
"learning_rate": 2e-05,
"loss": 14.139,
"step": 248
},
{
"epoch": 0.9354988399071926,
"grad_norm": 112.65348815917969,
"learning_rate": 2e-05,
"loss": 13.8694,
"step": 252
},
{
"epoch": 0.9503480278422274,
"grad_norm": 91.72236633300781,
"learning_rate": 2e-05,
"loss": 15.5933,
"step": 256
},
{
"epoch": 0.9651972157772621,
"grad_norm": 90.93212127685547,
"learning_rate": 2e-05,
"loss": 14.2187,
"step": 260
},
{
"epoch": 0.980046403712297,
"grad_norm": 100.89374542236328,
"learning_rate": 2e-05,
"loss": 13.7716,
"step": 264
},
{
"epoch": 0.9948955916473318,
"grad_norm": 92.8128662109375,
"learning_rate": 2e-05,
"loss": 12.5682,
"step": 268
},
{
"epoch": 1.0097447795823666,
"grad_norm": 95.66116333007812,
"learning_rate": 2e-05,
"loss": 14.4997,
"step": 272
},
{
"epoch": 1.0245939675174014,
"grad_norm": 104.52428436279297,
"learning_rate": 2e-05,
"loss": 11.8475,
"step": 276
},
{
"epoch": 1.0394431554524362,
"grad_norm": 104.34024810791016,
"learning_rate": 2e-05,
"loss": 10.1835,
"step": 280
},
{
"epoch": 1.054292343387471,
"grad_norm": 98.30239868164062,
"learning_rate": 2e-05,
"loss": 10.2298,
"step": 284
},
{
"epoch": 1.069141531322506,
"grad_norm": 109.97785949707031,
"learning_rate": 2e-05,
"loss": 10.6023,
"step": 288
},
{
"epoch": 1.0839907192575406,
"grad_norm": 122.24370574951172,
"learning_rate": 2e-05,
"loss": 10.0427,
"step": 292
},
{
"epoch": 1.0988399071925754,
"grad_norm": 109.37757873535156,
"learning_rate": 2e-05,
"loss": 10.0441,
"step": 296
},
{
"epoch": 1.1136890951276102,
"grad_norm": 127.94110107421875,
"learning_rate": 2e-05,
"loss": 9.7277,
"step": 300
},
{
"epoch": 1.128538283062645,
"grad_norm": 124.07524108886719,
"learning_rate": 2e-05,
"loss": 9.7969,
"step": 304
},
{
"epoch": 1.14338747099768,
"grad_norm": 126.29171752929688,
"learning_rate": 2e-05,
"loss": 9.5134,
"step": 308
},
{
"epoch": 1.1582366589327147,
"grad_norm": 104.21505737304688,
"learning_rate": 2e-05,
"loss": 10.8362,
"step": 312
},
{
"epoch": 1.1730858468677494,
"grad_norm": 121.6202392578125,
"learning_rate": 2e-05,
"loss": 8.8389,
"step": 316
},
{
"epoch": 1.1879350348027842,
"grad_norm": 110.58162689208984,
"learning_rate": 2e-05,
"loss": 9.0145,
"step": 320
},
{
"epoch": 1.202784222737819,
"grad_norm": 127.4255599975586,
"learning_rate": 2e-05,
"loss": 9.6973,
"step": 324
},
{
"epoch": 1.217633410672854,
"grad_norm": 108.92906951904297,
"learning_rate": 2e-05,
"loss": 9.6894,
"step": 328
},
{
"epoch": 1.2324825986078887,
"grad_norm": 131.8388214111328,
"learning_rate": 2e-05,
"loss": 11.288,
"step": 332
},
{
"epoch": 1.2473317865429234,
"grad_norm": 106.78469848632812,
"learning_rate": 2e-05,
"loss": 9.656,
"step": 336
},
{
"epoch": 1.2621809744779582,
"grad_norm": 120.8875503540039,
"learning_rate": 2e-05,
"loss": 9.6884,
"step": 340
},
{
"epoch": 1.2770301624129932,
"grad_norm": 112.69973754882812,
"learning_rate": 2e-05,
"loss": 8.8555,
"step": 344
},
{
"epoch": 1.291879350348028,
"grad_norm": 122.43771362304688,
"learning_rate": 2e-05,
"loss": 9.6718,
"step": 348
},
{
"epoch": 1.3067285382830627,
"grad_norm": 116.25230407714844,
"learning_rate": 2e-05,
"loss": 8.7905,
"step": 352
},
{
"epoch": 1.3215777262180974,
"grad_norm": 114.96141815185547,
"learning_rate": 2e-05,
"loss": 9.7848,
"step": 356
},
{
"epoch": 1.3364269141531322,
"grad_norm": 119.10284423828125,
"learning_rate": 2e-05,
"loss": 7.9737,
"step": 360
},
{
"epoch": 1.3512761020881672,
"grad_norm": 109.69094848632812,
"learning_rate": 2e-05,
"loss": 8.7001,
"step": 364
},
{
"epoch": 1.366125290023202,
"grad_norm": 109.21603393554688,
"learning_rate": 2e-05,
"loss": 8.0757,
"step": 368
},
{
"epoch": 1.3809744779582367,
"grad_norm": 128.07073974609375,
"learning_rate": 2e-05,
"loss": 10.1842,
"step": 372
},
{
"epoch": 1.3958236658932714,
"grad_norm": 105.088623046875,
"learning_rate": 2e-05,
"loss": 8.1361,
"step": 376
},
{
"epoch": 1.4106728538283062,
"grad_norm": 117.58355712890625,
"learning_rate": 2e-05,
"loss": 10.6169,
"step": 380
},
{
"epoch": 1.4255220417633412,
"grad_norm": 102.73584747314453,
"learning_rate": 2e-05,
"loss": 8.8225,
"step": 384
},
{
"epoch": 1.440371229698376,
"grad_norm": 104.41094207763672,
"learning_rate": 2e-05,
"loss": 8.593,
"step": 388
},
{
"epoch": 1.4552204176334107,
"grad_norm": 104.82015228271484,
"learning_rate": 2e-05,
"loss": 8.4753,
"step": 392
},
{
"epoch": 1.4700696055684455,
"grad_norm": 113.64494323730469,
"learning_rate": 2e-05,
"loss": 7.9889,
"step": 396
},
{
"epoch": 1.4849187935034802,
"grad_norm": 109.5793685913086,
"learning_rate": 2e-05,
"loss": 8.2657,
"step": 400
},
{
"epoch": 1.4997679814385152,
"grad_norm": 107.78541564941406,
"learning_rate": 2e-05,
"loss": 8.5209,
"step": 404
},
{
"epoch": 1.5146171693735497,
"grad_norm": 125.47006225585938,
"learning_rate": 2e-05,
"loss": 9.4815,
"step": 408
},
{
"epoch": 1.5294663573085847,
"grad_norm": 108.86872863769531,
"learning_rate": 2e-05,
"loss": 7.989,
"step": 412
},
{
"epoch": 1.5443155452436195,
"grad_norm": 102.67842864990234,
"learning_rate": 2e-05,
"loss": 7.6957,
"step": 416
},
{
"epoch": 1.5591647331786542,
"grad_norm": 109.05705261230469,
"learning_rate": 2e-05,
"loss": 8.5562,
"step": 420
},
{
"epoch": 1.5740139211136892,
"grad_norm": 104.20409393310547,
"learning_rate": 2e-05,
"loss": 8.4751,
"step": 424
},
{
"epoch": 1.5888631090487237,
"grad_norm": 126.31594848632812,
"learning_rate": 2e-05,
"loss": 8.3718,
"step": 428
},
{
"epoch": 1.6037122969837587,
"grad_norm": 120.48487091064453,
"learning_rate": 2e-05,
"loss": 8.8551,
"step": 432
},
{
"epoch": 1.6185614849187935,
"grad_norm": 105.4981689453125,
"learning_rate": 2e-05,
"loss": 8.2207,
"step": 436
},
{
"epoch": 1.6334106728538282,
"grad_norm": 112.6336441040039,
"learning_rate": 2e-05,
"loss": 8.4217,
"step": 440
},
{
"epoch": 1.6482598607888632,
"grad_norm": 132.0428009033203,
"learning_rate": 2e-05,
"loss": 7.7686,
"step": 444
},
{
"epoch": 1.6631090487238978,
"grad_norm": 125.45011901855469,
"learning_rate": 2e-05,
"loss": 9.2927,
"step": 448
},
{
"epoch": 1.6779582366589327,
"grad_norm": 136.8842315673828,
"learning_rate": 2e-05,
"loss": 8.7879,
"step": 452
},
{
"epoch": 1.6928074245939675,
"grad_norm": 128.8678741455078,
"learning_rate": 2e-05,
"loss": 9.6716,
"step": 456
},
{
"epoch": 1.7076566125290022,
"grad_norm": 111.33040618896484,
"learning_rate": 2e-05,
"loss": 8.5814,
"step": 460
},
{
"epoch": 1.7225058004640372,
"grad_norm": 123.63487243652344,
"learning_rate": 2e-05,
"loss": 8.4478,
"step": 464
},
{
"epoch": 1.7373549883990718,
"grad_norm": 113.80644989013672,
"learning_rate": 2e-05,
"loss": 7.6935,
"step": 468
},
{
"epoch": 1.7522041763341067,
"grad_norm": 107.1911392211914,
"learning_rate": 2e-05,
"loss": 8.6608,
"step": 472
},
{
"epoch": 1.7670533642691415,
"grad_norm": 102.86659240722656,
"learning_rate": 2e-05,
"loss": 8.3795,
"step": 476
},
{
"epoch": 1.7819025522041763,
"grad_norm": 110.92539978027344,
"learning_rate": 2e-05,
"loss": 7.6919,
"step": 480
},
{
"epoch": 1.7967517401392112,
"grad_norm": 104.6399917602539,
"learning_rate": 2e-05,
"loss": 8.2716,
"step": 484
},
{
"epoch": 1.8116009280742458,
"grad_norm": 115.54898071289062,
"learning_rate": 2e-05,
"loss": 8.5838,
"step": 488
},
{
"epoch": 1.8264501160092808,
"grad_norm": 105.62113952636719,
"learning_rate": 2e-05,
"loss": 8.113,
"step": 492
},
{
"epoch": 1.8412993039443155,
"grad_norm": 100.64768981933594,
"learning_rate": 2e-05,
"loss": 7.9681,
"step": 496
},
{
"epoch": 1.8561484918793503,
"grad_norm": 113.74981689453125,
"learning_rate": 2e-05,
"loss": 8.3435,
"step": 500
},
{
"epoch": 1.8709976798143853,
"grad_norm": 111.69252014160156,
"learning_rate": 2e-05,
"loss": 7.9597,
"step": 504
},
{
"epoch": 1.88584686774942,
"grad_norm": 127.168212890625,
"learning_rate": 2e-05,
"loss": 7.4817,
"step": 508
},
{
"epoch": 1.9006960556844548,
"grad_norm": 112.72080993652344,
"learning_rate": 2e-05,
"loss": 8.3011,
"step": 512
},
{
"epoch": 1.9155452436194895,
"grad_norm": 96.97032928466797,
"learning_rate": 2e-05,
"loss": 7.5826,
"step": 516
},
{
"epoch": 1.9303944315545243,
"grad_norm": 90.76924896240234,
"learning_rate": 2e-05,
"loss": 7.6501,
"step": 520
},
{
"epoch": 1.9452436194895593,
"grad_norm": 110.57941436767578,
"learning_rate": 2e-05,
"loss": 8.3152,
"step": 524
},
{
"epoch": 1.960092807424594,
"grad_norm": 97.4187240600586,
"learning_rate": 2e-05,
"loss": 7.9809,
"step": 528
},
{
"epoch": 1.9749419953596288,
"grad_norm": 107.45658111572266,
"learning_rate": 2e-05,
"loss": 8.1966,
"step": 532
},
{
"epoch": 1.9897911832946635,
"grad_norm": 125.85009002685547,
"learning_rate": 2e-05,
"loss": 7.8091,
"step": 536
}
],
"logging_steps": 4,
"max_steps": 538,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 446690230272000.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}