{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.998438719750195,
"eval_steps": 500,
"global_step": 960,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0312256049960968,
"grad_norm": 2.3905091254417177,
"learning_rate": 5e-06,
"loss": 0.8071,
"step": 10
},
{
"epoch": 0.0624512099921936,
"grad_norm": 1.3958881588356253,
"learning_rate": 5e-06,
"loss": 0.7197,
"step": 20
},
{
"epoch": 0.0936768149882904,
"grad_norm": 1.5094847169199592,
"learning_rate": 5e-06,
"loss": 0.6985,
"step": 30
},
{
"epoch": 0.1249024199843872,
"grad_norm": 1.4080817727177422,
"learning_rate": 5e-06,
"loss": 0.6855,
"step": 40
},
{
"epoch": 0.156128024980484,
"grad_norm": 0.8437914162126238,
"learning_rate": 5e-06,
"loss": 0.6727,
"step": 50
},
{
"epoch": 0.1873536299765808,
"grad_norm": 0.8642540743285999,
"learning_rate": 5e-06,
"loss": 0.6576,
"step": 60
},
{
"epoch": 0.2185792349726776,
"grad_norm": 0.6633530231329323,
"learning_rate": 5e-06,
"loss": 0.6514,
"step": 70
},
{
"epoch": 0.2498048399687744,
"grad_norm": 0.8262205915204606,
"learning_rate": 5e-06,
"loss": 0.6457,
"step": 80
},
{
"epoch": 0.2810304449648712,
"grad_norm": 0.5613248571253571,
"learning_rate": 5e-06,
"loss": 0.6412,
"step": 90
},
{
"epoch": 0.312256049960968,
"grad_norm": 0.9150986102724331,
"learning_rate": 5e-06,
"loss": 0.6384,
"step": 100
},
{
"epoch": 0.3434816549570648,
"grad_norm": 0.7465781739155327,
"learning_rate": 5e-06,
"loss": 0.6335,
"step": 110
},
{
"epoch": 0.3747072599531616,
"grad_norm": 0.8250370829383081,
"learning_rate": 5e-06,
"loss": 0.6302,
"step": 120
},
{
"epoch": 0.4059328649492584,
"grad_norm": 0.6341139387710243,
"learning_rate": 5e-06,
"loss": 0.634,
"step": 130
},
{
"epoch": 0.4371584699453552,
"grad_norm": 0.4964005156113376,
"learning_rate": 5e-06,
"loss": 0.6211,
"step": 140
},
{
"epoch": 0.468384074941452,
"grad_norm": 0.9475290669111363,
"learning_rate": 5e-06,
"loss": 0.6271,
"step": 150
},
{
"epoch": 0.4996096799375488,
"grad_norm": 0.7811648794629471,
"learning_rate": 5e-06,
"loss": 0.6238,
"step": 160
},
{
"epoch": 0.5308352849336456,
"grad_norm": 0.516293302775752,
"learning_rate": 5e-06,
"loss": 0.621,
"step": 170
},
{
"epoch": 0.5620608899297423,
"grad_norm": 0.471912073011228,
"learning_rate": 5e-06,
"loss": 0.6215,
"step": 180
},
{
"epoch": 0.5932864949258392,
"grad_norm": 0.5719925024660438,
"learning_rate": 5e-06,
"loss": 0.6193,
"step": 190
},
{
"epoch": 0.624512099921936,
"grad_norm": 0.5059415320269443,
"learning_rate": 5e-06,
"loss": 0.6175,
"step": 200
},
{
"epoch": 0.6557377049180327,
"grad_norm": 0.5195408058121892,
"learning_rate": 5e-06,
"loss": 0.622,
"step": 210
},
{
"epoch": 0.6869633099141296,
"grad_norm": 0.6014889400609209,
"learning_rate": 5e-06,
"loss": 0.6158,
"step": 220
},
{
"epoch": 0.7181889149102264,
"grad_norm": 0.5096070261428851,
"learning_rate": 5e-06,
"loss": 0.6177,
"step": 230
},
{
"epoch": 0.7494145199063232,
"grad_norm": 0.6028623695390841,
"learning_rate": 5e-06,
"loss": 0.6148,
"step": 240
},
{
"epoch": 0.78064012490242,
"grad_norm": 0.5451038071079088,
"learning_rate": 5e-06,
"loss": 0.6164,
"step": 250
},
{
"epoch": 0.8118657298985168,
"grad_norm": 0.4708236433706893,
"learning_rate": 5e-06,
"loss": 0.6094,
"step": 260
},
{
"epoch": 0.8430913348946136,
"grad_norm": 0.46109612782168113,
"learning_rate": 5e-06,
"loss": 0.6103,
"step": 270
},
{
"epoch": 0.8743169398907104,
"grad_norm": 0.502648205452055,
"learning_rate": 5e-06,
"loss": 0.6095,
"step": 280
},
{
"epoch": 0.9055425448868072,
"grad_norm": 0.4489395079927774,
"learning_rate": 5e-06,
"loss": 0.6065,
"step": 290
},
{
"epoch": 0.936768149882904,
"grad_norm": 0.6477411963812875,
"learning_rate": 5e-06,
"loss": 0.6132,
"step": 300
},
{
"epoch": 0.9679937548790007,
"grad_norm": 0.5302907770786253,
"learning_rate": 5e-06,
"loss": 0.6111,
"step": 310
},
{
"epoch": 0.9992193598750976,
"grad_norm": 0.45517099030938496,
"learning_rate": 5e-06,
"loss": 0.5986,
"step": 320
},
{
"epoch": 0.9992193598750976,
"eval_loss": 0.6128131151199341,
"eval_runtime": 341.0462,
"eval_samples_per_second": 25.304,
"eval_steps_per_second": 0.396,
"step": 320
},
{
"epoch": 1.0308352849336455,
"grad_norm": 0.8438882865201324,
"learning_rate": 5e-06,
"loss": 0.6164,
"step": 330
},
{
"epoch": 1.0620608899297423,
"grad_norm": 0.4821646175445636,
"learning_rate": 5e-06,
"loss": 0.5662,
"step": 340
},
{
"epoch": 1.0932864949258392,
"grad_norm": 0.5058980149763423,
"learning_rate": 5e-06,
"loss": 0.5606,
"step": 350
},
{
"epoch": 1.124512099921936,
"grad_norm": 0.4781831185352073,
"learning_rate": 5e-06,
"loss": 0.5648,
"step": 360
},
{
"epoch": 1.1557377049180328,
"grad_norm": 0.5048559047058323,
"learning_rate": 5e-06,
"loss": 0.5582,
"step": 370
},
{
"epoch": 1.1869633099141297,
"grad_norm": 0.44414824193518654,
"learning_rate": 5e-06,
"loss": 0.5584,
"step": 380
},
{
"epoch": 1.2181889149102263,
"grad_norm": 0.4928423351798681,
"learning_rate": 5e-06,
"loss": 0.5515,
"step": 390
},
{
"epoch": 1.2494145199063231,
"grad_norm": 0.5064189451582637,
"learning_rate": 5e-06,
"loss": 0.5637,
"step": 400
},
{
"epoch": 1.28064012490242,
"grad_norm": 0.44193713470343654,
"learning_rate": 5e-06,
"loss": 0.5618,
"step": 410
},
{
"epoch": 1.3118657298985168,
"grad_norm": 0.4650381211562015,
"learning_rate": 5e-06,
"loss": 0.5554,
"step": 420
},
{
"epoch": 1.3430913348946136,
"grad_norm": 0.5544428065241478,
"learning_rate": 5e-06,
"loss": 0.5547,
"step": 430
},
{
"epoch": 1.3743169398907105,
"grad_norm": 0.48005595474790913,
"learning_rate": 5e-06,
"loss": 0.5523,
"step": 440
},
{
"epoch": 1.4055425448868073,
"grad_norm": 0.4974548951913249,
"learning_rate": 5e-06,
"loss": 0.5666,
"step": 450
},
{
"epoch": 1.436768149882904,
"grad_norm": 0.4923658625750441,
"learning_rate": 5e-06,
"loss": 0.5558,
"step": 460
},
{
"epoch": 1.4679937548790007,
"grad_norm": 0.5272663506589431,
"learning_rate": 5e-06,
"loss": 0.5584,
"step": 470
},
{
"epoch": 1.4992193598750976,
"grad_norm": 0.5304464959914178,
"learning_rate": 5e-06,
"loss": 0.5643,
"step": 480
},
{
"epoch": 1.5304449648711944,
"grad_norm": 0.5773543616559265,
"learning_rate": 5e-06,
"loss": 0.5598,
"step": 490
},
{
"epoch": 1.561670569867291,
"grad_norm": 0.4558348320273449,
"learning_rate": 5e-06,
"loss": 0.5591,
"step": 500
},
{
"epoch": 1.5928961748633879,
"grad_norm": 0.5072303901122793,
"learning_rate": 5e-06,
"loss": 0.5626,
"step": 510
},
{
"epoch": 1.6241217798594847,
"grad_norm": 0.5369887998410667,
"learning_rate": 5e-06,
"loss": 0.5556,
"step": 520
},
{
"epoch": 1.6553473848555815,
"grad_norm": 0.5556757682627291,
"learning_rate": 5e-06,
"loss": 0.5572,
"step": 530
},
{
"epoch": 1.6865729898516784,
"grad_norm": 0.5337242705677901,
"learning_rate": 5e-06,
"loss": 0.557,
"step": 540
},
{
"epoch": 1.7177985948477752,
"grad_norm": 0.46280527938706506,
"learning_rate": 5e-06,
"loss": 0.5617,
"step": 550
},
{
"epoch": 1.749024199843872,
"grad_norm": 0.45608832514525505,
"learning_rate": 5e-06,
"loss": 0.5581,
"step": 560
},
{
"epoch": 1.7802498048399689,
"grad_norm": 0.48374355780746187,
"learning_rate": 5e-06,
"loss": 0.5564,
"step": 570
},
{
"epoch": 1.8114754098360657,
"grad_norm": 0.5029705354009028,
"learning_rate": 5e-06,
"loss": 0.559,
"step": 580
},
{
"epoch": 1.8427010148321625,
"grad_norm": 0.46966476792976214,
"learning_rate": 5e-06,
"loss": 0.5616,
"step": 590
},
{
"epoch": 1.8739266198282591,
"grad_norm": 0.446283124549817,
"learning_rate": 5e-06,
"loss": 0.553,
"step": 600
},
{
"epoch": 1.905152224824356,
"grad_norm": 0.4745527474098281,
"learning_rate": 5e-06,
"loss": 0.5589,
"step": 610
},
{
"epoch": 1.9363778298204528,
"grad_norm": 0.501609279464785,
"learning_rate": 5e-06,
"loss": 0.5628,
"step": 620
},
{
"epoch": 1.9676034348165494,
"grad_norm": 0.49320626859834116,
"learning_rate": 5e-06,
"loss": 0.5522,
"step": 630
},
{
"epoch": 1.9988290398126463,
"grad_norm": 0.4324557011242181,
"learning_rate": 5e-06,
"loss": 0.5596,
"step": 640
},
{
"epoch": 1.9988290398126463,
"eval_loss": 0.6045193076133728,
"eval_runtime": 340.5129,
"eval_samples_per_second": 25.344,
"eval_steps_per_second": 0.396,
"step": 640
},
{
"epoch": 2.030444964871194,
"grad_norm": 0.6006527642113036,
"learning_rate": 5e-06,
"loss": 0.5662,
"step": 650
},
{
"epoch": 2.061670569867291,
"grad_norm": 0.5376147888211947,
"learning_rate": 5e-06,
"loss": 0.5018,
"step": 660
},
{
"epoch": 2.092896174863388,
"grad_norm": 0.5448017881956769,
"learning_rate": 5e-06,
"loss": 0.5057,
"step": 670
},
{
"epoch": 2.1241217798594847,
"grad_norm": 0.6095347029172922,
"learning_rate": 5e-06,
"loss": 0.5036,
"step": 680
},
{
"epoch": 2.1553473848555815,
"grad_norm": 0.5281790301882382,
"learning_rate": 5e-06,
"loss": 0.5066,
"step": 690
},
{
"epoch": 2.1865729898516784,
"grad_norm": 0.543025537124188,
"learning_rate": 5e-06,
"loss": 0.5086,
"step": 700
},
{
"epoch": 2.217798594847775,
"grad_norm": 0.6024294613229594,
"learning_rate": 5e-06,
"loss": 0.508,
"step": 710
},
{
"epoch": 2.249024199843872,
"grad_norm": 0.5261160691218546,
"learning_rate": 5e-06,
"loss": 0.5073,
"step": 720
},
{
"epoch": 2.280249804839969,
"grad_norm": 0.4878879224650377,
"learning_rate": 5e-06,
"loss": 0.5126,
"step": 730
},
{
"epoch": 2.3114754098360657,
"grad_norm": 0.5298908191049263,
"learning_rate": 5e-06,
"loss": 0.5098,
"step": 740
},
{
"epoch": 2.3427010148321625,
"grad_norm": 0.4963375261761113,
"learning_rate": 5e-06,
"loss": 0.5084,
"step": 750
},
{
"epoch": 2.3739266198282594,
"grad_norm": 0.476625155447844,
"learning_rate": 5e-06,
"loss": 0.5097,
"step": 760
},
{
"epoch": 2.4051522248243558,
"grad_norm": 0.5208071390082176,
"learning_rate": 5e-06,
"loss": 0.5028,
"step": 770
},
{
"epoch": 2.4363778298204526,
"grad_norm": 0.4800697229604007,
"learning_rate": 5e-06,
"loss": 0.5102,
"step": 780
},
{
"epoch": 2.4676034348165494,
"grad_norm": 0.5837948115948769,
"learning_rate": 5e-06,
"loss": 0.5068,
"step": 790
},
{
"epoch": 2.4988290398126463,
"grad_norm": 0.5280421758640109,
"learning_rate": 5e-06,
"loss": 0.5141,
"step": 800
},
{
"epoch": 2.530054644808743,
"grad_norm": 0.4594714496886714,
"learning_rate": 5e-06,
"loss": 0.5081,
"step": 810
},
{
"epoch": 2.56128024980484,
"grad_norm": 0.51076427145537,
"learning_rate": 5e-06,
"loss": 0.5123,
"step": 820
},
{
"epoch": 2.5925058548009368,
"grad_norm": 0.5309550406289082,
"learning_rate": 5e-06,
"loss": 0.5067,
"step": 830
},
{
"epoch": 2.6237314597970336,
"grad_norm": 0.6033885635557941,
"learning_rate": 5e-06,
"loss": 0.519,
"step": 840
},
{
"epoch": 2.6549570647931304,
"grad_norm": 0.5271243446375676,
"learning_rate": 5e-06,
"loss": 0.5048,
"step": 850
},
{
"epoch": 2.6861826697892273,
"grad_norm": 0.5270298610894952,
"learning_rate": 5e-06,
"loss": 0.5165,
"step": 860
},
{
"epoch": 2.717408274785324,
"grad_norm": 0.5222507557357616,
"learning_rate": 5e-06,
"loss": 0.5154,
"step": 870
},
{
"epoch": 2.748633879781421,
"grad_norm": 0.49639435328740067,
"learning_rate": 5e-06,
"loss": 0.5091,
"step": 880
},
{
"epoch": 2.7798594847775178,
"grad_norm": 0.4828475074772525,
"learning_rate": 5e-06,
"loss": 0.5112,
"step": 890
},
{
"epoch": 2.8110850897736146,
"grad_norm": 0.5145958529566682,
"learning_rate": 5e-06,
"loss": 0.5109,
"step": 900
},
{
"epoch": 2.8423106947697114,
"grad_norm": 0.528104324477883,
"learning_rate": 5e-06,
"loss": 0.5166,
"step": 910
},
{
"epoch": 2.873536299765808,
"grad_norm": 0.47920251362694366,
"learning_rate": 5e-06,
"loss": 0.519,
"step": 920
},
{
"epoch": 2.9047619047619047,
"grad_norm": 0.538906478147928,
"learning_rate": 5e-06,
"loss": 0.5181,
"step": 930
},
{
"epoch": 2.9359875097580015,
"grad_norm": 0.5243949264804789,
"learning_rate": 5e-06,
"loss": 0.5134,
"step": 940
},
{
"epoch": 2.9672131147540983,
"grad_norm": 0.47727618067883554,
"learning_rate": 5e-06,
"loss": 0.512,
"step": 950
},
{
"epoch": 2.998438719750195,
"grad_norm": 0.5110204465597075,
"learning_rate": 5e-06,
"loss": 0.5083,
"step": 960
},
{
"epoch": 2.998438719750195,
"eval_loss": 0.608026921749115,
"eval_runtime": 339.6647,
"eval_samples_per_second": 25.407,
"eval_steps_per_second": 0.397,
"step": 960
},
{
"epoch": 2.998438719750195,
"step": 960,
"total_flos": 1607826375966720.0,
"train_loss": 0.5702028140425682,
"train_runtime": 56737.9664,
"train_samples_per_second": 8.669,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 960,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1607826375966720.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}