{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 200, "global_step": 633, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04739336492890995, "grad_norm": 2.9766806187481474, "learning_rate": 1.5625e-06, "loss": 0.771, "step": 10 }, { "epoch": 0.0947867298578199, "grad_norm": 1.5418621232873402, "learning_rate": 3.125e-06, "loss": 0.648, "step": 20 }, { "epoch": 0.14218009478672985, "grad_norm": 1.1814688641528344, "learning_rate": 4.6875000000000004e-06, "loss": 0.6178, "step": 30 }, { "epoch": 0.1895734597156398, "grad_norm": 1.2364175375012094, "learning_rate": 6.25e-06, "loss": 0.5823, "step": 40 }, { "epoch": 0.23696682464454977, "grad_norm": 1.1860326420716258, "learning_rate": 7.8125e-06, "loss": 0.5708, "step": 50 }, { "epoch": 0.2843601895734597, "grad_norm": 1.2987360775063614, "learning_rate": 9.375000000000001e-06, "loss": 0.5932, "step": 60 }, { "epoch": 0.33175355450236965, "grad_norm": 1.0430206308500805, "learning_rate": 9.997256670306478e-06, "loss": 0.5842, "step": 70 }, { "epoch": 0.3791469194312796, "grad_norm": 1.1828482935350693, "learning_rate": 9.980502778148438e-06, "loss": 0.594, "step": 80 }, { "epoch": 0.4265402843601896, "grad_norm": 1.1308268179030505, "learning_rate": 9.948570063910216e-06, "loss": 0.5801, "step": 90 }, { "epoch": 0.47393364928909953, "grad_norm": 1.1117789496343384, "learning_rate": 9.901555847282123e-06, "loss": 0.5881, "step": 100 }, { "epoch": 0.5213270142180095, "grad_norm": 1.0838661804968213, "learning_rate": 9.839603411073388e-06, "loss": 0.5972, "step": 110 }, { "epoch": 0.5687203791469194, "grad_norm": 1.0764508726699293, "learning_rate": 9.762901564536523e-06, "loss": 0.5694, "step": 120 }, { "epoch": 0.6161137440758294, "grad_norm": 1.0699310149411376, "learning_rate": 9.671684067943056e-06, "loss": 0.5699, "step": 130 }, { "epoch": 0.6635071090047393, "grad_norm": 0.9389457300289639, "learning_rate": 9.566228920164405e-06, "loss": 0.5485, "step": 140 }, { "epoch": 0.7109004739336493, "grad_norm": 0.996512385216608, "learning_rate": 9.446857511429e-06, "loss": 0.5738, "step": 150 }, { "epoch": 0.7582938388625592, "grad_norm": 1.0595858551516084, "learning_rate": 9.313933643837825e-06, "loss": 0.5784, "step": 160 }, { "epoch": 0.8056872037914692, "grad_norm": 1.071792736625516, "learning_rate": 9.167862422623474e-06, "loss": 0.5623, "step": 170 }, { "epoch": 0.8530805687203792, "grad_norm": 1.1011190811844587, "learning_rate": 9.009089021531777e-06, "loss": 0.564, "step": 180 }, { "epoch": 0.9004739336492891, "grad_norm": 1.112546020287585, "learning_rate": 8.838097326088667e-06, "loss": 0.5651, "step": 190 }, { "epoch": 0.9478672985781991, "grad_norm": 1.0393198922082802, "learning_rate": 8.65540845888717e-06, "loss": 0.591, "step": 200 }, { "epoch": 0.9478672985781991, "eval_loss": 0.5885207056999207, "eval_runtime": 45.2622, "eval_samples_per_second": 16.57, "eval_steps_per_second": 4.154, "step": 200 }, { "epoch": 0.995260663507109, "grad_norm": 0.9432833478680037, "learning_rate": 8.46157919138889e-06, "loss": 0.5708, "step": 210 }, { "epoch": 1.042654028436019, "grad_norm": 1.2074054053167964, "learning_rate": 8.257200247080249e-06, "loss": 0.4553, "step": 220 }, { "epoch": 1.0900473933649288, "grad_norm": 1.0112015146309683, "learning_rate": 8.042894501154937e-06, "loss": 0.4385, "step": 230 }, { "epoch": 1.1374407582938388, "grad_norm": 1.1628537371522318, "learning_rate": 7.819315082209217e-06, "loss": 0.4338, "step": 240 }, { "epoch": 1.1848341232227488, "grad_norm": 1.170928047599835, "learning_rate": 7.587143381735498e-06, "loss": 0.4436, "step": 250 }, { "epoch": 1.2322274881516588, "grad_norm": 1.0940229070187266, "learning_rate": 7.347086977480552e-06, "loss": 0.4437, "step": 260 }, { "epoch": 1.2796208530805688, "grad_norm": 1.078497079284251, "learning_rate": 7.09987747699721e-06, "loss": 0.4248, "step": 270 }, { "epoch": 1.3270142180094786, "grad_norm": 1.1732721334517928, "learning_rate": 6.846268287961667e-06, "loss": 0.4406, "step": 280 }, { "epoch": 1.3744075829383886, "grad_norm": 1.0531298774229871, "learning_rate": 6.587032322051667e-06, "loss": 0.4136, "step": 290 }, { "epoch": 1.4218009478672986, "grad_norm": 1.096485062882827, "learning_rate": 6.32295963938335e-06, "loss": 0.4362, "step": 300 }, { "epoch": 1.4691943127962086, "grad_norm": 1.0392884098831818, "learning_rate": 6.05485504068568e-06, "loss": 0.4522, "step": 310 }, { "epoch": 1.5165876777251186, "grad_norm": 1.083422217671995, "learning_rate": 5.783535614550666e-06, "loss": 0.4388, "step": 320 }, { "epoch": 1.5639810426540284, "grad_norm": 1.0753748282048554, "learning_rate": 5.509828247234505e-06, "loss": 0.4235, "step": 330 }, { "epoch": 1.6113744075829384, "grad_norm": 1.0872592425407301, "learning_rate": 5.234567102598881e-06, "loss": 0.4403, "step": 340 }, { "epoch": 1.6587677725118484, "grad_norm": 1.1025275218625015, "learning_rate": 4.958591079872667e-06, "loss": 0.4418, "step": 350 }, { "epoch": 1.7061611374407581, "grad_norm": 1.1353816002182384, "learning_rate": 4.682741256981922e-06, "loss": 0.4337, "step": 360 }, { "epoch": 1.7535545023696684, "grad_norm": 1.0931888494924935, "learning_rate": 4.407858327239952e-06, "loss": 0.4336, "step": 370 }, { "epoch": 1.8009478672985781, "grad_norm": 1.0508589383610711, "learning_rate": 4.134780037209563e-06, "loss": 0.4098, "step": 380 }, { "epoch": 1.8483412322274881, "grad_norm": 1.1111362166640932, "learning_rate": 3.864338633545956e-06, "loss": 0.4406, "step": 390 }, { "epoch": 1.8957345971563981, "grad_norm": 1.1166936800784941, "learning_rate": 3.597358326601413e-06, "loss": 0.4333, "step": 400 }, { "epoch": 1.8957345971563981, "eval_loss": 0.591492772102356, "eval_runtime": 47.8117, "eval_samples_per_second": 15.687, "eval_steps_per_second": 3.932, "step": 400 }, { "epoch": 1.943127962085308, "grad_norm": 1.0991265238055437, "learning_rate": 3.334652778521813e-06, "loss": 0.4373, "step": 410 }, { "epoch": 1.9905213270142181, "grad_norm": 1.1198063285390958, "learning_rate": 3.077022623490371e-06, "loss": 0.4386, "step": 420 }, { "epoch": 2.037914691943128, "grad_norm": 1.931503214588018, "learning_rate": 2.825253027676026e-06, "loss": 0.323, "step": 430 }, { "epoch": 2.085308056872038, "grad_norm": 1.3832270008763752, "learning_rate": 2.580111296322904e-06, "loss": 0.2859, "step": 440 }, { "epoch": 2.132701421800948, "grad_norm": 1.3099804566732292, "learning_rate": 2.342344535273608e-06, "loss": 0.3013, "step": 450 }, { "epoch": 2.1800947867298577, "grad_norm": 1.2557111742602929, "learning_rate": 2.112677374053164e-06, "loss": 0.2807, "step": 460 }, { "epoch": 2.227488151658768, "grad_norm": 1.207378114432126, "learning_rate": 1.8918097574529193e-06, "loss": 0.2761, "step": 470 }, { "epoch": 2.2748815165876777, "grad_norm": 1.2390653750205476, "learning_rate": 1.68041481234479e-06, "loss": 0.2968, "step": 480 }, { "epoch": 2.322274881516588, "grad_norm": 1.333523368962914, "learning_rate": 1.4791367962271425e-06, "loss": 0.2831, "step": 490 }, { "epoch": 2.3696682464454977, "grad_norm": 1.3778251642296175, "learning_rate": 1.2885891337543539e-06, "loss": 0.2818, "step": 500 }, { "epoch": 2.4170616113744074, "grad_norm": 1.2887264302961001, "learning_rate": 1.1093525472340471e-06, "loss": 0.2818, "step": 510 }, { "epoch": 2.4644549763033177, "grad_norm": 1.2852430735542881, "learning_rate": 9.419732867896048e-07, "loss": 0.3011, "step": 520 }, { "epoch": 2.5118483412322274, "grad_norm": 1.2266552796699188, "learning_rate": 7.869614655817576e-07, "loss": 0.2898, "step": 530 }, { "epoch": 2.5592417061611377, "grad_norm": 1.2756903884606243, "learning_rate": 6.44789505162955e-07, "loss": 0.2755, "step": 540 }, { "epoch": 2.6066350710900474, "grad_norm": 1.2233314152937933, "learning_rate": 5.158906957025079e-07, "loss": 0.2862, "step": 550 }, { "epoch": 2.654028436018957, "grad_norm": 1.250585071594385, "learning_rate": 4.0065787547042543e-07, "loss": 0.2818, "step": 560 }, { "epoch": 2.7014218009478674, "grad_norm": 1.2631821069852571, "learning_rate": 2.994422336044345e-07, "loss": 0.2826, "step": 570 }, { "epoch": 2.748815165876777, "grad_norm": 1.2436223396448967, "learning_rate": 2.1255223980891027e-07, "loss": 0.2863, "step": 580 }, { "epoch": 2.7962085308056874, "grad_norm": 1.3410061615236049, "learning_rate": 1.402527042476276e-07, "loss": 0.2775, "step": 590 }, { "epoch": 2.843601895734597, "grad_norm": 1.2695922290249133, "learning_rate": 8.276397049545359e-08, "loss": 0.2899, "step": 600 }, { "epoch": 2.843601895734597, "eval_loss": 0.6522307991981506, "eval_runtime": 48.4889, "eval_samples_per_second": 15.467, "eval_steps_per_second": 3.877, "step": 600 }, { "epoch": 2.890995260663507, "grad_norm": 1.240766088679871, "learning_rate": 4.026124400856479e-08, "loss": 0.2883, "step": 610 }, { "epoch": 2.938388625592417, "grad_norm": 1.1818503565161846, "learning_rate": 1.2874058159796366e-08, "loss": 0.286, "step": 620 }, { "epoch": 2.985781990521327, "grad_norm": 1.2123342176587113, "learning_rate": 6.858794664449386e-10, "loss": 0.2764, "step": 630 }, { "epoch": 3.0, "step": 633, "total_flos": 52725334671360.0, "train_loss": 0.01484178630473301, "train_runtime": 365.2303, "train_samples_per_second": 55.444, "train_steps_per_second": 1.733 } ], "logging_steps": 10, "max_steps": 633, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 52725334671360.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }