{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 439, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04555808656036447, "grad_norm": 0.5296587944030762, "learning_rate": 9.948875483893885e-05, "loss": 0.3433, "step": 20 }, { "epoch": 0.09111617312072894, "grad_norm": 0.042843956500291824, "learning_rate": 9.796547422034374e-05, "loss": 0.0076, "step": 40 }, { "epoch": 0.11389521640091116, "eval_loss": 0.005896100774407387, "eval_runtime": 122.4009, "eval_samples_per_second": 7.957, "eval_steps_per_second": 0.4, "step": 50 }, { "epoch": 0.1366742596810934, "grad_norm": 0.04506906867027283, "learning_rate": 9.546130893802246e-05, "loss": 0.006, "step": 60 }, { "epoch": 0.18223234624145787, "grad_norm": 0.04249206930398941, "learning_rate": 9.20274686872984e-05, "loss": 0.006, "step": 80 }, { "epoch": 0.22779043280182232, "grad_norm": 0.034582458436489105, "learning_rate": 8.773417483665309e-05, "loss": 0.0054, "step": 100 }, { "epoch": 0.22779043280182232, "eval_loss": 0.004958716221153736, "eval_runtime": 122.3931, "eval_samples_per_second": 7.958, "eval_steps_per_second": 0.4, "step": 100 }, { "epoch": 0.2733485193621868, "grad_norm": 0.09121271967887878, "learning_rate": 8.266922441433284e-05, "loss": 0.0043, "step": 120 }, { "epoch": 0.31890660592255127, "grad_norm": 0.025009050965309143, "learning_rate": 7.693619467611464e-05, "loss": 0.0041, "step": 140 }, { "epoch": 0.3416856492027335, "eval_loss": 0.0036018535029143095, "eval_runtime": 122.4383, "eval_samples_per_second": 7.955, "eval_steps_per_second": 0.4, "step": 150 }, { "epoch": 0.36446469248291574, "grad_norm": 0.06171397864818573, "learning_rate": 7.065232497047384e-05, "loss": 0.0036, "step": 160 }, { "epoch": 0.41002277904328016, "grad_norm": 0.0585937425494194, "learning_rate": 6.394611921660036e-05, "loss": 0.003, "step": 180 }, { "epoch": 0.45558086560364464, "grad_norm": 0.03573513403534889, "learning_rate": 5.695471802412413e-05, "loss": 0.0026, "step": 200 }, { "epoch": 0.45558086560364464, "eval_loss": 0.0025479402393102646, "eval_runtime": 122.4406, "eval_samples_per_second": 7.955, "eval_steps_per_second": 0.4, "step": 200 }, { "epoch": 0.5011389521640092, "grad_norm": 0.08403529226779938, "learning_rate": 4.982109419419277e-05, "loss": 0.0023, "step": 220 }, { "epoch": 0.5466970387243736, "grad_norm": 0.08128103613853455, "learning_rate": 4.269112895336161e-05, "loss": 0.0025, "step": 240 }, { "epoch": 0.5694760820045558, "eval_loss": 0.002369565423578024, "eval_runtime": 122.4121, "eval_samples_per_second": 7.957, "eval_steps_per_second": 0.4, "step": 250 }, { "epoch": 0.592255125284738, "grad_norm": 0.04797542467713356, "learning_rate": 3.5710628710747e-05, "loss": 0.0025, "step": 260 }, { "epoch": 0.6378132118451025, "grad_norm": 0.041951198130846024, "learning_rate": 2.9022343345181846e-05, "loss": 0.0022, "step": 280 }, { "epoch": 0.683371298405467, "grad_norm": 0.03911906108260155, "learning_rate": 2.276304699782381e-05, "loss": 0.0022, "step": 300 }, { "epoch": 0.683371298405467, "eval_loss": 0.0020905383862555027, "eval_runtime": 122.4348, "eval_samples_per_second": 7.955, "eval_steps_per_second": 0.4, "step": 300 }, { "epoch": 0.7289293849658315, "grad_norm": 0.07351183891296387, "learning_rate": 1.7060741067442288e-05, "loss": 0.0021, "step": 320 }, { "epoch": 0.7744874715261959, "grad_norm": 0.06717297434806824, "learning_rate": 1.2032036606589175e-05, "loss": 0.0022, "step": 340 }, { "epoch": 0.7972665148063781, "eval_loss": 0.0018556159920990467, "eval_runtime": 122.4481, "eval_samples_per_second": 7.954, "eval_steps_per_second": 0.4, "step": 350 }, { "epoch": 0.8200455580865603, "grad_norm": 0.03857114166021347, "learning_rate": 7.779769648145201e-06, "loss": 0.0015, "step": 360 }, { "epoch": 0.8656036446469249, "grad_norm": 0.036848343908786774, "learning_rate": 4.390898228352131e-06, "loss": 0.0018, "step": 380 }, { "epoch": 0.9111617312072893, "grad_norm": 0.09445371478796005, "learning_rate": 1.9347241118030823e-06, "loss": 0.0021, "step": 400 }, { "epoch": 0.9111617312072893, "eval_loss": 0.0018473172094672918, "eval_runtime": 122.4479, "eval_samples_per_second": 7.954, "eval_steps_per_second": 0.4, "step": 400 }, { "epoch": 0.9567198177676538, "grad_norm": 0.04989920184016228, "learning_rate": 4.614755837704321e-07, "loss": 0.0018, "step": 420 }, { "epoch": 1.0, "step": 439, "total_flos": 9.38862713986089e+17, "train_loss": 0.018706909701194197, "train_runtime": 4591.0233, "train_samples_per_second": 1.909, "train_steps_per_second": 0.096 } ], "logging_steps": 20, "max_steps": 439, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.38862713986089e+17, "train_batch_size": 10, "trial_name": null, "trial_params": null }