oldiday: Training in progress, step 162, checkpoint (commit 1693ade, verified)
{
"best_metric": 0.8367001414299011,
"best_model_checkpoint": "miner_id_24/checkpoint-150",
"epoch": 1.0,
"eval_steps": 50,
"global_step": 162,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006172839506172839,
"eval_loss": 1.9562022686004639,
"eval_runtime": 19.9289,
"eval_samples_per_second": 13.699,
"eval_steps_per_second": 3.462,
"step": 1
},
{
"epoch": 0.018518518518518517,
"grad_norm": 0.5035348534584045,
"learning_rate": 3e-05,
"loss": 1.3592,
"step": 3
},
{
"epoch": 0.037037037037037035,
"grad_norm": 0.5112131834030151,
"learning_rate": 6e-05,
"loss": 1.2165,
"step": 6
},
{
"epoch": 0.05555555555555555,
"grad_norm": 0.600572407245636,
"learning_rate": 9e-05,
"loss": 1.2194,
"step": 9
},
{
"epoch": 0.07407407407407407,
"grad_norm": 0.5512685179710388,
"learning_rate": 9.995728791936504e-05,
"loss": 1.1536,
"step": 12
},
{
"epoch": 0.09259259259259259,
"grad_norm": 4.089280605316162,
"learning_rate": 9.973324900566213e-05,
"loss": 1.4903,
"step": 15
},
{
"epoch": 0.1111111111111111,
"grad_norm": 2.117110252380371,
"learning_rate": 9.931806517013612e-05,
"loss": 1.1609,
"step": 18
},
{
"epoch": 0.12962962962962962,
"grad_norm": 2.3771092891693115,
"learning_rate": 9.871333213161438e-05,
"loss": 1.062,
"step": 21
},
{
"epoch": 0.14814814814814814,
"grad_norm": 1.8321698904037476,
"learning_rate": 9.792137412291265e-05,
"loss": 1.112,
"step": 24
},
{
"epoch": 0.16666666666666666,
"grad_norm": 1.5089828968048096,
"learning_rate": 9.694523495787149e-05,
"loss": 0.9636,
"step": 27
},
{
"epoch": 0.18518518518518517,
"grad_norm": 1.330153465270996,
"learning_rate": 9.578866633275288e-05,
"loss": 0.8645,
"step": 30
},
{
"epoch": 0.2037037037037037,
"grad_norm": 1.6392351388931274,
"learning_rate": 9.445611340695926e-05,
"loss": 0.994,
"step": 33
},
{
"epoch": 0.2222222222222222,
"grad_norm": 2.0593364238739014,
"learning_rate": 9.295269771849427e-05,
"loss": 0.9373,
"step": 36
},
{
"epoch": 0.24074074074074073,
"grad_norm": 1.6057007312774658,
"learning_rate": 9.12841974998278e-05,
"loss": 0.8773,
"step": 39
},
{
"epoch": 0.25925925925925924,
"grad_norm": 0.8941075801849365,
"learning_rate": 8.945702546981969e-05,
"loss": 1.0678,
"step": 42
},
{
"epoch": 0.2777777777777778,
"grad_norm": 0.4377215504646301,
"learning_rate": 8.74782041870563e-05,
"loss": 1.0822,
"step": 45
},
{
"epoch": 0.2962962962962963,
"grad_norm": 0.44285309314727783,
"learning_rate": 8.535533905932738e-05,
"loss": 1.0699,
"step": 48
},
{
"epoch": 0.30864197530864196,
"eval_loss": 1.018227458000183,
"eval_runtime": 20.0515,
"eval_samples_per_second": 13.615,
"eval_steps_per_second": 3.441,
"step": 50
},
{
"epoch": 0.3148148148148148,
"grad_norm": 0.3906916379928589,
"learning_rate": 8.309658911297834e-05,
"loss": 1.0028,
"step": 51
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.8289111852645874,
"learning_rate": 8.07106356344834e-05,
"loss": 1.151,
"step": 54
},
{
"epoch": 0.35185185185185186,
"grad_norm": 1.4784194231033325,
"learning_rate": 7.820664880476256e-05,
"loss": 0.8767,
"step": 57
},
{
"epoch": 0.37037037037037035,
"grad_norm": 1.4373376369476318,
"learning_rate": 7.559425245448006e-05,
"loss": 0.9452,
"step": 60
},
{
"epoch": 0.3888888888888889,
"grad_norm": 1.172136664390564,
"learning_rate": 7.288348707578408e-05,
"loss": 0.8619,
"step": 63
},
{
"epoch": 0.4074074074074074,
"grad_norm": 1.534406304359436,
"learning_rate": 7.008477123264848e-05,
"loss": 0.7924,
"step": 66
},
{
"epoch": 0.42592592592592593,
"grad_norm": 1.261856198310852,
"learning_rate": 6.720886151813194e-05,
"loss": 0.8172,
"step": 69
},
{
"epoch": 0.4444444444444444,
"grad_norm": 1.2206156253814697,
"learning_rate": 6.426681121245527e-05,
"loss": 0.8421,
"step": 72
},
{
"epoch": 0.46296296296296297,
"grad_norm": 1.6020294427871704,
"learning_rate": 6.126992780079031e-05,
"loss": 0.9739,
"step": 75
},
{
"epoch": 0.48148148148148145,
"grad_norm": 1.7759839296340942,
"learning_rate": 5.8229729514036705e-05,
"loss": 0.8757,
"step": 78
},
{
"epoch": 0.5,
"grad_norm": 0.49982303380966187,
"learning_rate": 5.515790105961786e-05,
"loss": 0.8141,
"step": 81
},
{
"epoch": 0.5185185185185185,
"grad_norm": 0.46078914403915405,
"learning_rate": 5.2066248712440656e-05,
"loss": 1.0369,
"step": 84
},
{
"epoch": 0.5370370370370371,
"grad_norm": 0.5604984164237976,
"learning_rate": 4.8966654938622295e-05,
"loss": 1.0657,
"step": 87
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.4341430962085724,
"learning_rate": 4.5871032726383386e-05,
"loss": 1.0451,
"step": 90
},
{
"epoch": 0.5740740740740741,
"grad_norm": 0.4503065049648285,
"learning_rate": 4.2791279799632666e-05,
"loss": 1.0712,
"step": 93
},
{
"epoch": 0.5925925925925926,
"grad_norm": 1.6895031929016113,
"learning_rate": 3.973923289021829e-05,
"loss": 1.0168,
"step": 96
},
{
"epoch": 0.6111111111111112,
"grad_norm": 1.4327552318572998,
"learning_rate": 3.67266222445964e-05,
"loss": 0.8845,
"step": 99
},
{
"epoch": 0.6172839506172839,
"eval_loss": 0.884192168712616,
"eval_runtime": 20.2826,
"eval_samples_per_second": 13.46,
"eval_steps_per_second": 3.402,
"step": 100
},
{
"epoch": 0.6296296296296297,
"grad_norm": 1.1935515403747559,
"learning_rate": 3.3765026539765834e-05,
"loss": 0.8139,
"step": 102
},
{
"epoch": 0.6481481481481481,
"grad_norm": 1.2995893955230713,
"learning_rate": 3.086582838174551e-05,
"loss": 0.893,
"step": 105
},
{
"epoch": 0.6666666666666666,
"grad_norm": 1.3577182292938232,
"learning_rate": 2.804017055763149e-05,
"loss": 0.9254,
"step": 108
},
{
"epoch": 0.6851851851851852,
"grad_norm": 1.3016126155853271,
"learning_rate": 2.529891320937481e-05,
"loss": 0.8247,
"step": 111
},
{
"epoch": 0.7037037037037037,
"grad_norm": 1.1968082189559937,
"learning_rate": 2.2652592093878666e-05,
"loss": 0.8928,
"step": 114
},
{
"epoch": 0.7222222222222222,
"grad_norm": 1.577446699142456,
"learning_rate": 2.0111378089837956e-05,
"loss": 0.8642,
"step": 117
},
{
"epoch": 0.7407407407407407,
"grad_norm": 1.3656805753707886,
"learning_rate": 1.768503810695295e-05,
"loss": 0.678,
"step": 120
},
{
"epoch": 0.7592592592592593,
"grad_norm": 0.3362742066383362,
"learning_rate": 1.5382897547758514e-05,
"loss": 1.044,
"step": 123
},
{
"epoch": 0.7777777777777778,
"grad_norm": 0.31672272086143494,
"learning_rate": 1.3213804466343421e-05,
"loss": 0.9984,
"step": 126
},
{
"epoch": 0.7962962962962963,
"grad_norm": 0.3221389055252075,
"learning_rate": 1.118609556171213e-05,
"loss": 1.0372,
"step": 129
},
{
"epoch": 0.8148148148148148,
"grad_norm": 0.37123388051986694,
"learning_rate": 9.307564136490254e-06,
"loss": 1.0451,
"step": 132
},
{
"epoch": 0.8333333333333334,
"grad_norm": 1.3557378053665161,
"learning_rate": 7.585430144121319e-06,
"loss": 0.9801,
"step": 135
},
{
"epoch": 0.8518518518518519,
"grad_norm": 1.2644445896148682,
"learning_rate": 6.026312439675552e-06,
"loss": 0.8632,
"step": 138
},
{
"epoch": 0.8703703703703703,
"grad_norm": 1.4030702114105225,
"learning_rate": 4.636203340922008e-06,
"loss": 0.9115,
"step": 141
},
{
"epoch": 0.8888888888888888,
"grad_norm": 1.1068618297576904,
"learning_rate": 3.420445597436056e-06,
"loss": 0.7584,
"step": 144
},
{
"epoch": 0.9074074074074074,
"grad_norm": 1.4436562061309814,
"learning_rate": 2.3837118562592797e-06,
"loss": 0.9516,
"step": 147
},
{
"epoch": 0.9259259259259259,
"grad_norm": 1.3743484020233154,
"learning_rate": 1.5299867030334814e-06,
"loss": 0.8574,
"step": 150
},
{
"epoch": 0.9259259259259259,
"eval_loss": 0.8367001414299011,
"eval_runtime": 20.0366,
"eval_samples_per_second": 13.625,
"eval_steps_per_second": 3.444,
"step": 150
},
{
"epoch": 0.9444444444444444,
"grad_norm": 1.1355260610580444,
"learning_rate": 8.62551347632029e-07,
"loss": 0.6921,
"step": 153
},
{
"epoch": 0.9629629629629629,
"grad_norm": 1.2376205921173096,
"learning_rate": 3.839710131477492e-07,
"loss": 0.7755,
"step": 156
},
{
"epoch": 0.9814814814814815,
"grad_norm": 1.5938589572906494,
"learning_rate": 9.60850767065924e-08,
"loss": 0.7325,
"step": 159
},
{
"epoch": 1.0,
"grad_norm": 1.6113556623458862,
"learning_rate": 0.0,
"loss": 0.7744,
"step": 162
}
],
"logging_steps": 3,
"max_steps": 162,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.1994758268256256e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
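
The state above is the standard trainer_state.json that the Hugging Face Trainer writes alongside each checkpoint. As a minimal sketch of how such a file can be inspected (the local path used below is an assumption, not part of the checkpoint), the following Python loads the file, splits "log_history" into training-loss and evaluation entries, and reports the best eval loss:

import json

# Assumed local path to the state file shown above; adjust to wherever
# the checkpoint directory actually lives.
STATE_PATH = "miner_id_24/checkpoint-150/trainer_state.json"

with open(STATE_PATH) as f:
    state = json.load(f)

# "log_history" mixes training entries (keyed by "loss") and evaluation
# entries (keyed by "eval_loss"); separate the two.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"best eval loss: {state['best_metric']:.4f} "
      f"at {state['best_model_checkpoint']}")
for e in eval_logs:
    print(f"  step {e['step']:>3}: eval_loss={e['eval_loss']:.4f}")
print(f"final training loss (step {train_logs[-1]['step']}): "
      f"{train_logs[-1]['loss']:.4f}")

Run against this state, the sketch would show eval_loss falling from 1.956 at step 1 to 0.837 at step 150, which matches "best_metric" and "best_model_checkpoint" recorded above.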