{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 1370, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18248175182481752, "grad_norm": 23.73174476623535, "learning_rate": 0.00046, "loss": 4.0301, "step": 25 }, { "epoch": 0.36496350364963503, "grad_norm": 2.086200714111328, "learning_rate": 0.00096, "loss": 1.0569, "step": 50 }, { "epoch": 0.5474452554744526, "grad_norm": 0.4636005461215973, "learning_rate": 0.0009825757575757576, "loss": 0.4231, "step": 75 }, { "epoch": 0.7299270072992701, "grad_norm": 0.905696451663971, "learning_rate": 0.0009636363636363637, "loss": 0.1587, "step": 100 }, { "epoch": 0.9124087591240876, "grad_norm": 0.7420445084571838, "learning_rate": 0.0009446969696969697, "loss": 0.1302, "step": 125 }, { "epoch": 1.0, "eval_loss": 0.09637967497110367, "eval_runtime": 192.8711, "eval_samples_per_second": 4.241, "eval_steps_per_second": 0.71, "step": 137 }, { "epoch": 1.094890510948905, "grad_norm": 0.5315635204315186, "learning_rate": 0.0009257575757575758, "loss": 0.1168, "step": 150 }, { "epoch": 1.2773722627737225, "grad_norm": 2.168321132659912, "learning_rate": 0.0009068181818181819, "loss": 0.0957, "step": 175 }, { "epoch": 1.4598540145985401, "grad_norm": 1.4148447513580322, "learning_rate": 0.000887878787878788, "loss": 0.0819, "step": 200 }, { "epoch": 1.6423357664233578, "grad_norm": 1.08322274684906, "learning_rate": 0.0008689393939393939, "loss": 0.0967, "step": 225 }, { "epoch": 1.8248175182481752, "grad_norm": 0.34484872221946716, "learning_rate": 0.00085, "loss": 0.0897, "step": 250 }, { "epoch": 2.0, "eval_loss": 0.05452003329992294, "eval_runtime": 193.1249, "eval_samples_per_second": 4.236, "eval_steps_per_second": 0.709, "step": 274 }, { "epoch": 2.0072992700729926, "grad_norm": 0.4588640034198761, "learning_rate": 0.0008310606060606061, "loss": 0.078, "step": 275 }, { "epoch": 2.18978102189781, "grad_norm": 0.9430391788482666, "learning_rate": 0.0008121212121212122, "loss": 0.047, "step": 300 }, { "epoch": 2.372262773722628, "grad_norm": 1.2605994939804077, "learning_rate": 0.0007931818181818182, "loss": 0.0733, "step": 325 }, { "epoch": 2.554744525547445, "grad_norm": 0.4879356026649475, "learning_rate": 0.0007742424242424244, "loss": 0.0595, "step": 350 }, { "epoch": 2.7372262773722627, "grad_norm": 0.7468044757843018, "learning_rate": 0.0007553030303030303, "loss": 0.0581, "step": 375 }, { "epoch": 2.9197080291970803, "grad_norm": 1.2231853008270264, "learning_rate": 0.0007363636363636363, "loss": 0.0691, "step": 400 }, { "epoch": 3.0, "eval_loss": 0.039563629776239395, "eval_runtime": 191.6012, "eval_samples_per_second": 4.269, "eval_steps_per_second": 0.715, "step": 411 }, { "epoch": 3.102189781021898, "grad_norm": 0.3177865445613861, "learning_rate": 0.0007174242424242424, "loss": 0.0421, "step": 425 }, { "epoch": 3.2846715328467155, "grad_norm": 1.0183290243148804, "learning_rate": 0.0006984848484848485, "loss": 0.0485, "step": 450 }, { "epoch": 3.4671532846715327, "grad_norm": 0.361370712518692, "learning_rate": 0.0006795454545454546, "loss": 0.0369, "step": 475 }, { "epoch": 3.6496350364963503, "grad_norm": 1.0689235925674438, "learning_rate": 0.0006606060606060606, "loss": 0.0456, "step": 500 }, { "epoch": 3.832116788321168, "grad_norm": 0.29606688022613525, "learning_rate": 0.0006416666666666667, "loss": 0.0482, "step": 525 }, { "epoch": 4.0, "eval_loss": 0.027906004339456558, "eval_runtime": 190.9612, "eval_samples_per_second": 4.284, "eval_steps_per_second": 0.717, "step": 548 }, { "epoch": 4.014598540145985, "grad_norm": 0.1826704740524292, "learning_rate": 0.0006227272727272727, "loss": 0.0451, "step": 550 }, { "epoch": 4.197080291970803, "grad_norm": 0.1476697325706482, "learning_rate": 0.0006037878787878788, "loss": 0.0324, "step": 575 }, { "epoch": 4.37956204379562, "grad_norm": 0.37707987427711487, "learning_rate": 0.0005848484848484848, "loss": 0.0277, "step": 600 }, { "epoch": 4.562043795620438, "grad_norm": 0.18928465247154236, "learning_rate": 0.0005659090909090909, "loss": 0.0331, "step": 625 }, { "epoch": 4.744525547445256, "grad_norm": 0.10301584750413895, "learning_rate": 0.000546969696969697, "loss": 0.0189, "step": 650 }, { "epoch": 4.927007299270073, "grad_norm": 0.5264261960983276, "learning_rate": 0.0005280303030303031, "loss": 0.0374, "step": 675 }, { "epoch": 5.0, "eval_loss": 0.017436422407627106, "eval_runtime": 189.4577, "eval_samples_per_second": 4.318, "eval_steps_per_second": 0.723, "step": 685 }, { "epoch": 5.109489051094891, "grad_norm": 0.3798004984855652, "learning_rate": 0.000509090909090909, "loss": 0.0166, "step": 700 }, { "epoch": 5.291970802919708, "grad_norm": 0.22640006244182587, "learning_rate": 0.0004901515151515152, "loss": 0.0224, "step": 725 }, { "epoch": 5.474452554744525, "grad_norm": 0.2977535128593445, "learning_rate": 0.0004712121212121212, "loss": 0.0178, "step": 750 }, { "epoch": 5.656934306569343, "grad_norm": 0.046165354549884796, "learning_rate": 0.00045227272727272727, "loss": 0.0188, "step": 775 }, { "epoch": 5.839416058394161, "grad_norm": 0.20036080479621887, "learning_rate": 0.00043333333333333337, "loss": 0.0189, "step": 800 }, { "epoch": 6.0, "eval_loss": 0.009602065198123455, "eval_runtime": 190.5013, "eval_samples_per_second": 4.294, "eval_steps_per_second": 0.719, "step": 822 }, { "epoch": 6.021897810218978, "grad_norm": 0.009559686295688152, "learning_rate": 0.00041439393939393936, "loss": 0.0112, "step": 825 }, { "epoch": 6.204379562043796, "grad_norm": 0.17128558456897736, "learning_rate": 0.00039545454545454546, "loss": 0.0115, "step": 850 }, { "epoch": 6.386861313868613, "grad_norm": 0.12201745808124542, "learning_rate": 0.0003765151515151515, "loss": 0.0099, "step": 875 }, { "epoch": 6.569343065693431, "grad_norm": 0.221001997590065, "learning_rate": 0.0003575757575757576, "loss": 0.0062, "step": 900 }, { "epoch": 6.751824817518248, "grad_norm": 0.1375964730978012, "learning_rate": 0.00033863636363636366, "loss": 0.0104, "step": 925 }, { "epoch": 6.934306569343065, "grad_norm": 0.18805921077728271, "learning_rate": 0.0003196969696969697, "loss": 0.0108, "step": 950 }, { "epoch": 7.0, "eval_loss": 0.006272537633776665, "eval_runtime": 189.9453, "eval_samples_per_second": 4.307, "eval_steps_per_second": 0.721, "step": 959 }, { "epoch": 7.116788321167883, "grad_norm": 0.04299464076757431, "learning_rate": 0.0003007575757575758, "loss": 0.0065, "step": 975 }, { "epoch": 7.299270072992701, "grad_norm": 0.19402629137039185, "learning_rate": 0.0002818181818181818, "loss": 0.0089, "step": 1000 }, { "epoch": 7.481751824817518, "grad_norm": 0.1514054536819458, "learning_rate": 0.0002628787878787879, "loss": 0.007, "step": 1025 }, { "epoch": 7.664233576642336, "grad_norm": 0.011143018491566181, "learning_rate": 0.00024393939393939392, "loss": 0.0074, "step": 1050 }, { "epoch": 7.846715328467154, "grad_norm": 0.037931449711322784, "learning_rate": 0.00022500000000000002, "loss": 0.004, "step": 1075 }, { "epoch": 8.0, "eval_loss": 0.003884958801791072, "eval_runtime": 190.9247, "eval_samples_per_second": 4.284, "eval_steps_per_second": 0.718, "step": 1096 }, { "epoch": 8.02919708029197, "grad_norm": 0.09535694122314453, "learning_rate": 0.00020606060606060607, "loss": 0.0047, "step": 1100 }, { "epoch": 8.211678832116789, "grad_norm": 0.1803082972764969, "learning_rate": 0.00018712121212121214, "loss": 0.0047, "step": 1125 }, { "epoch": 8.394160583941606, "grad_norm": 0.07427278906106949, "learning_rate": 0.0001681818181818182, "loss": 0.0035, "step": 1150 }, { "epoch": 8.576642335766424, "grad_norm": 0.07938549667596817, "learning_rate": 0.00014924242424242424, "loss": 0.0038, "step": 1175 }, { "epoch": 8.75912408759124, "grad_norm": 0.047964416444301605, "learning_rate": 0.0001303030303030303, "loss": 0.0035, "step": 1200 }, { "epoch": 8.941605839416058, "grad_norm": 0.03935278207063675, "learning_rate": 0.00011136363636363636, "loss": 0.0039, "step": 1225 }, { "epoch": 9.0, "eval_loss": 0.002366352593526244, "eval_runtime": 191.3767, "eval_samples_per_second": 4.274, "eval_steps_per_second": 0.716, "step": 1233 }, { "epoch": 9.124087591240876, "grad_norm": 0.06493715196847916, "learning_rate": 9.242424242424242e-05, "loss": 0.0025, "step": 1250 }, { "epoch": 9.306569343065693, "grad_norm": 0.02973032370209694, "learning_rate": 7.348484848484849e-05, "loss": 0.0031, "step": 1275 }, { "epoch": 9.489051094890511, "grad_norm": 0.03156217932701111, "learning_rate": 5.4545454545454546e-05, "loss": 0.0013, "step": 1300 }, { "epoch": 9.671532846715328, "grad_norm": 0.01587042771279812, "learning_rate": 3.560606060606061e-05, "loss": 0.0036, "step": 1325 }, { "epoch": 9.854014598540147, "grad_norm": 0.02338283136487007, "learning_rate": 1.6666666666666667e-05, "loss": 0.0026, "step": 1350 }, { "epoch": 10.0, "eval_loss": 0.001961252186447382, "eval_runtime": 191.871, "eval_samples_per_second": 4.263, "eval_steps_per_second": 0.714, "step": 1370 }, { "epoch": 10.0, "step": 1370, "total_flos": 2.4023145037824e+18, "train_loss": 0.1333546004319278, "train_runtime": 4360.7512, "train_samples_per_second": 1.876, "train_steps_per_second": 0.314 } ], "logging_steps": 25, "max_steps": 1370, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.4023145037824e+18, "train_batch_size": 6, "trial_name": null, "trial_params": null }