{ "best_metric": 7.851955413818359, "best_model_checkpoint": "bert_tiny_lda_100_v1/checkpoint-30000", "epoch": 25.0, "eval_steps": 10000, "global_step": 35725, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.34989503149055284, "grad_norm": 2.1483612060546875, "learning_rate": 5e-06, "loss": 14.2732, "step": 500 }, { "epoch": 0.6997900629811057, "grad_norm": 0.9536421298980713, "learning_rate": 1e-05, "loss": 12.5971, "step": 1000 }, { "epoch": 1.0496850944716585, "grad_norm": 0.5777405500411987, "learning_rate": 1.5e-05, "loss": 11.6575, "step": 1500 }, { "epoch": 1.3995801259622114, "grad_norm": 0.6296975016593933, "learning_rate": 2e-05, "loss": 11.4706, "step": 2000 }, { "epoch": 1.749475157452764, "grad_norm": 0.8440209627151489, "learning_rate": 2.5e-05, "loss": 11.3595, "step": 2500 }, { "epoch": 2.099370188943317, "grad_norm": 0.8582096695899963, "learning_rate": 3e-05, "loss": 11.2625, "step": 3000 }, { "epoch": 2.44926522043387, "grad_norm": 0.7912972569465637, "learning_rate": 3.5e-05, "loss": 11.1716, "step": 3500 }, { "epoch": 2.7991602519244227, "grad_norm": 0.8343620300292969, "learning_rate": 4e-05, "loss": 11.0956, "step": 4000 }, { "epoch": 3.1490552834149756, "grad_norm": 0.9754597544670105, "learning_rate": 4.5e-05, "loss": 11.0411, "step": 4500 }, { "epoch": 3.498950314905528, "grad_norm": 0.8357794880867004, "learning_rate": 5e-05, "loss": 10.9979, "step": 5000 }, { "epoch": 3.8488453463960814, "grad_norm": 0.5983511209487915, "learning_rate": 5.500000000000001e-05, "loss": 10.955, "step": 5500 }, { "epoch": 4.198740377886634, "grad_norm": 0.6659268736839294, "learning_rate": 6e-05, "loss": 10.9206, "step": 6000 }, { "epoch": 4.548635409377187, "grad_norm": 0.595109224319458, "learning_rate": 6.500000000000001e-05, "loss": 10.8873, "step": 6500 }, { "epoch": 4.89853044086774, "grad_norm": 0.6366376280784607, "learning_rate": 7e-05, "loss": 10.8587, "step": 7000 }, { "epoch": 5.248425472358292, "grad_norm": 0.5867002606391907, "learning_rate": 7.500000000000001e-05, "loss": 10.8289, "step": 7500 }, { "epoch": 5.5983205038488455, "grad_norm": 0.58534175157547, "learning_rate": 8e-05, "loss": 10.8007, "step": 8000 }, { "epoch": 5.948215535339398, "grad_norm": 0.5407905578613281, "learning_rate": 8.5e-05, "loss": 10.7803, "step": 8500 }, { "epoch": 6.298110566829951, "grad_norm": 0.6819906830787659, "learning_rate": 9e-05, "loss": 10.7548, "step": 9000 }, { "epoch": 6.648005598320504, "grad_norm": 0.5426816940307617, "learning_rate": 9.5e-05, "loss": 10.7349, "step": 9500 }, { "epoch": 6.997900629811056, "grad_norm": 0.5664660334587097, "learning_rate": 0.0001, "loss": 10.7151, "step": 10000 }, { "epoch": 6.997900629811056, "eval_accuracy": 0.15461471909798777, "eval_loss": 10.65131664276123, "eval_runtime": 1.0023, "eval_samples_per_second": 477.92, "eval_steps_per_second": 2.993, "step": 10000 }, { "epoch": 7.3477956613016095, "grad_norm": 0.5446542501449585, "learning_rate": 9.805636540330418e-05, "loss": 10.6913, "step": 10500 }, { "epoch": 7.697690692792162, "grad_norm": 0.5372028350830078, "learning_rate": 9.611273080660836e-05, "loss": 10.6794, "step": 11000 }, { "epoch": 8.047585724282715, "grad_norm": 0.5421639680862427, "learning_rate": 9.416909620991254e-05, "loss": 10.659, "step": 11500 }, { "epoch": 8.397480755773268, "grad_norm": 0.6829137206077576, "learning_rate": 9.222546161321672e-05, "loss": 10.643, "step": 12000 }, { "epoch": 8.74737578726382, "grad_norm": 0.5330397486686707, 
"learning_rate": 9.02818270165209e-05, "loss": 10.6303, "step": 12500 }, { "epoch": 9.097270818754374, "grad_norm": 0.5683152675628662, "learning_rate": 8.833819241982508e-05, "loss": 10.618, "step": 13000 }, { "epoch": 9.447165850244927, "grad_norm": 0.5669359564781189, "learning_rate": 8.639455782312925e-05, "loss": 10.6011, "step": 13500 }, { "epoch": 9.79706088173548, "grad_norm": 0.5618341565132141, "learning_rate": 8.445092322643343e-05, "loss": 10.5924, "step": 14000 }, { "epoch": 10.146955913226032, "grad_norm": 0.5494099259376526, "learning_rate": 8.250728862973761e-05, "loss": 10.5817, "step": 14500 }, { "epoch": 10.496850944716584, "grad_norm": 0.5508003830909729, "learning_rate": 8.056365403304179e-05, "loss": 10.5671, "step": 15000 }, { "epoch": 10.846745976207139, "grad_norm": 0.5670793652534485, "learning_rate": 7.862001943634597e-05, "loss": 10.5603, "step": 15500 }, { "epoch": 11.196641007697691, "grad_norm": 0.6135077476501465, "learning_rate": 7.667638483965015e-05, "loss": 10.5414, "step": 16000 }, { "epoch": 11.546536039188243, "grad_norm": 0.6429851055145264, "learning_rate": 7.473275024295433e-05, "loss": 10.5216, "step": 16500 }, { "epoch": 11.896431070678796, "grad_norm": 0.6677358746528625, "learning_rate": 7.27891156462585e-05, "loss": 10.497, "step": 17000 }, { "epoch": 12.246326102169348, "grad_norm": 0.727293848991394, "learning_rate": 7.08454810495627e-05, "loss": 10.3961, "step": 17500 }, { "epoch": 12.596221133659903, "grad_norm": 0.9938657283782959, "learning_rate": 6.890184645286687e-05, "loss": 10.1991, "step": 18000 }, { "epoch": 12.946116165150455, "grad_norm": 0.9696453809738159, "learning_rate": 6.695821185617104e-05, "loss": 10.065, "step": 18500 }, { "epoch": 13.296011196641008, "grad_norm": 1.010794758796692, "learning_rate": 6.501457725947522e-05, "loss": 9.9419, "step": 19000 }, { "epoch": 13.64590622813156, "grad_norm": 1.2189589738845825, "learning_rate": 6.30709426627794e-05, "loss": 9.848, "step": 19500 }, { "epoch": 13.995801259622114, "grad_norm": 1.4994540214538574, "learning_rate": 6.112730806608357e-05, "loss": 9.7707, "step": 20000 }, { "epoch": 13.995801259622114, "eval_accuracy": 0.194632577040298, "eval_loss": 9.61900520324707, "eval_runtime": 0.9914, "eval_samples_per_second": 483.146, "eval_steps_per_second": 3.026, "step": 20000 }, { "epoch": 14.345696291112667, "grad_norm": 1.950490951538086, "learning_rate": 5.918367346938776e-05, "loss": 9.6452, "step": 20500 }, { "epoch": 14.695591322603219, "grad_norm": 1.7163243293762207, "learning_rate": 5.724003887269194e-05, "loss": 9.4926, "step": 21000 }, { "epoch": 15.045486354093772, "grad_norm": 2.8089776039123535, "learning_rate": 5.529640427599612e-05, "loss": 9.3468, "step": 21500 }, { "epoch": 15.395381385584324, "grad_norm": 2.505420207977295, "learning_rate": 5.3352769679300295e-05, "loss": 9.2153, "step": 22000 }, { "epoch": 15.745276417074878, "grad_norm": 1.929539680480957, "learning_rate": 5.1409135082604474e-05, "loss": 9.1104, "step": 22500 }, { "epoch": 16.09517144856543, "grad_norm": 1.953999638557434, "learning_rate": 4.946550048590865e-05, "loss": 9.0218, "step": 23000 }, { "epoch": 16.445066480055985, "grad_norm": 2.3553173542022705, "learning_rate": 4.752186588921283e-05, "loss": 8.9414, "step": 23500 }, { "epoch": 16.794961511546536, "grad_norm": 2.3242921829223633, "learning_rate": 4.557823129251701e-05, "loss": 8.8719, "step": 24000 }, { "epoch": 17.14485654303709, "grad_norm": 2.3202247619628906, "learning_rate": 4.363459669582119e-05, "loss": 8.8015, "step": 
24500 }, { "epoch": 17.49475157452764, "grad_norm": 2.1002941131591797, "learning_rate": 4.1690962099125366e-05, "loss": 8.7419, "step": 25000 }, { "epoch": 17.844646606018195, "grad_norm": 2.1744942665100098, "learning_rate": 3.9747327502429545e-05, "loss": 8.6877, "step": 25500 }, { "epoch": 18.19454163750875, "grad_norm": 2.131441116333008, "learning_rate": 3.780369290573372e-05, "loss": 8.609, "step": 26000 }, { "epoch": 18.5444366689993, "grad_norm": 2.32468318939209, "learning_rate": 3.58600583090379e-05, "loss": 8.4935, "step": 26500 }, { "epoch": 18.894331700489854, "grad_norm": 2.470233917236328, "learning_rate": 3.391642371234208e-05, "loss": 8.4095, "step": 27000 }, { "epoch": 19.244226731980405, "grad_norm": 2.6915345191955566, "learning_rate": 3.1972789115646265e-05, "loss": 8.3349, "step": 27500 }, { "epoch": 19.59412176347096, "grad_norm": 2.169857978820801, "learning_rate": 3.0029154518950437e-05, "loss": 8.2701, "step": 28000 }, { "epoch": 19.944016794961513, "grad_norm": 2.2447140216827393, "learning_rate": 2.8085519922254615e-05, "loss": 8.2201, "step": 28500 }, { "epoch": 20.293911826452064, "grad_norm": 1.986703872680664, "learning_rate": 2.6141885325558797e-05, "loss": 8.1657, "step": 29000 }, { "epoch": 20.643806857942618, "grad_norm": 1.892199158668518, "learning_rate": 2.4198250728862976e-05, "loss": 8.1212, "step": 29500 }, { "epoch": 20.99370188943317, "grad_norm": 1.9202613830566406, "learning_rate": 2.225461613216715e-05, "loss": 8.0878, "step": 30000 }, { "epoch": 20.99370188943317, "eval_accuracy": 0.35927231702157664, "eval_loss": 7.851955413818359, "eval_runtime": 1.0013, "eval_samples_per_second": 478.39, "eval_steps_per_second": 2.996, "step": 30000 }, { "epoch": 21.343596920923723, "grad_norm": 1.9794193506240845, "learning_rate": 2.0310981535471333e-05, "loss": 8.0507, "step": 30500 }, { "epoch": 21.693491952414277, "grad_norm": 1.9899263381958008, "learning_rate": 1.836734693877551e-05, "loss": 8.0214, "step": 31000 }, { "epoch": 22.043386983904828, "grad_norm": 1.9592680931091309, "learning_rate": 1.642371234207969e-05, "loss": 7.9971, "step": 31500 }, { "epoch": 22.393282015395382, "grad_norm": 1.8842312097549438, "learning_rate": 1.4480077745383868e-05, "loss": 7.97, "step": 32000 }, { "epoch": 22.743177046885933, "grad_norm": 1.9430269002914429, "learning_rate": 1.2536443148688048e-05, "loss": 7.9548, "step": 32500 }, { "epoch": 23.093072078376487, "grad_norm": 1.8178030252456665, "learning_rate": 1.0592808551992225e-05, "loss": 7.939, "step": 33000 }, { "epoch": 23.44296710986704, "grad_norm": 1.8209426403045654, "learning_rate": 8.649173955296405e-06, "loss": 7.9209, "step": 33500 }, { "epoch": 23.792862141357592, "grad_norm": 1.7719342708587646, "learning_rate": 6.705539358600584e-06, "loss": 7.9128, "step": 34000 }, { "epoch": 24.142757172848146, "grad_norm": 1.7601419687271118, "learning_rate": 4.7619047619047615e-06, "loss": 7.8951, "step": 34500 }, { "epoch": 24.492652204338697, "grad_norm": 1.85917067527771, "learning_rate": 2.818270165208941e-06, "loss": 7.8899, "step": 35000 }, { "epoch": 24.84254723582925, "grad_norm": 1.7753186225891113, "learning_rate": 8.746355685131196e-07, "loss": 7.8851, "step": 35500 }, { "epoch": 25.0, "step": 35725, "total_flos": 3.058354515764736e+17, "train_loss": 9.78783958952939, "train_runtime": 21050.1008, "train_samples_per_second": 271.541, "train_steps_per_second": 1.697 } ], "logging_steps": 500, "max_steps": 35725, "num_input_tokens_seen": 0, "num_train_epochs": 25, "save_steps": 10000, 
"stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.058354515764736e+17, "train_batch_size": 160, "trial_name": null, "trial_params": null }