{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.841269841269842, "eval_steps": 500, "global_step": 310, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.31746031746031744, "grad_norm": 2.053196907043457, "learning_rate": 0.00019948693233918952, "loss": 2.3733, "step": 10 }, { "epoch": 0.6349206349206349, "grad_norm": 4.835758209228516, "learning_rate": 0.00019795299412524945, "loss": 2.0557, "step": 20 }, { "epoch": 0.9523809523809523, "grad_norm": 0.8223971724510193, "learning_rate": 0.00019541392564000488, "loss": 1.9785, "step": 30 }, { "epoch": 1.2698412698412698, "grad_norm": 1.6451746225357056, "learning_rate": 0.00019189578116202307, "loss": 1.8827, "step": 40 }, { "epoch": 1.5873015873015874, "grad_norm": 3.187594413757324, "learning_rate": 0.00018743466161445823, "loss": 1.8161, "step": 50 }, { "epoch": 1.9047619047619047, "grad_norm": 1.1520949602127075, "learning_rate": 0.00018207634412072764, "loss": 1.7753, "step": 60 }, { "epoch": 2.2222222222222223, "grad_norm": 560.3873291015625, "learning_rate": 0.0001758758122692791, "loss": 1.7127, "step": 70 }, { "epoch": 2.5396825396825395, "grad_norm": 3.1525535583496094, "learning_rate": 0.00016889669190756868, "loss": 1.7557, "step": 80 }, { "epoch": 2.857142857142857, "grad_norm": 0.8066505193710327, "learning_rate": 0.0001612105982547663, "loss": 1.6924, "step": 90 }, { "epoch": 3.1746031746031744, "grad_norm": 6.147305011749268, "learning_rate": 0.00015289640103269625, "loss": 1.6697, "step": 100 }, { "epoch": 3.492063492063492, "grad_norm": 1.1416895389556885, "learning_rate": 0.00014403941515576344, "loss": 1.6402, "step": 110 }, { "epoch": 3.8095238095238093, "grad_norm": 0.7278308868408203, "learning_rate": 0.00013473052528448201, "loss": 1.633, "step": 120 }, { "epoch": 4.1269841269841265, "grad_norm": 0.981347382068634, "learning_rate": 0.00012506525322587207, "loss": 1.6499, "step": 130 }, { "epoch": 4.444444444444445, "grad_norm": 27.774219512939453, "learning_rate": 0.00011514277775045768, "loss": 1.6055, "step": 140 }, { "epoch": 4.761904761904762, "grad_norm": 2.1296420097351074, "learning_rate": 0.00010506491688387127, "loss": 1.5705, "step": 150 }, { "epoch": 5.079365079365079, "grad_norm": 0.7905821800231934, "learning_rate": 9.493508311612874e-05, "loss": 1.6286, "step": 160 }, { "epoch": 5.396825396825397, "grad_norm": 0.805335283279419, "learning_rate": 8.485722224954237e-05, "loss": 1.6359, "step": 170 }, { "epoch": 5.714285714285714, "grad_norm": 0.7546270489692688, "learning_rate": 7.493474677412794e-05, "loss": 1.5344, "step": 180 }, { "epoch": 6.031746031746032, "grad_norm": 0.7980037927627563, "learning_rate": 6.526947471551798e-05, "loss": 1.5468, "step": 190 }, { "epoch": 6.349206349206349, "grad_norm": 0.7658030986785889, "learning_rate": 5.596058484423656e-05, "loss": 1.5378, "step": 200 }, { "epoch": 6.666666666666667, "grad_norm": 1.7591676712036133, "learning_rate": 4.710359896730379e-05, "loss": 1.5446, "step": 210 }, { "epoch": 6.984126984126984, "grad_norm": 0.8993221521377563, "learning_rate": 3.878940174523371e-05, "loss": 1.5279, "step": 220 }, { "epoch": 7.301587301587301, "grad_norm": 0.7995801568031311, "learning_rate": 3.110330809243134e-05, "loss": 1.5253, "step": 230 }, { "epoch": 7.619047619047619, "grad_norm": 0.8297848105430603, "learning_rate": 2.4124187730720917e-05, "loss": 1.5507, "step": 240 }, { "epoch": 7.936507936507937, "grad_norm": 0.7720061540603638, "learning_rate": 
1.7923655879272393e-05, "loss": 1.5218, "step": 250 }, { "epoch": 8.253968253968253, "grad_norm": 0.818590521812439, "learning_rate": 1.2565338385541792e-05, "loss": 1.5276, "step": 260 }, { "epoch": 8.571428571428571, "grad_norm": 0.857406735420227, "learning_rate": 8.10421883797694e-06, "loss": 1.5014, "step": 270 }, { "epoch": 8.88888888888889, "grad_norm": 0.8421258926391602, "learning_rate": 4.586074359995119e-06, "loss": 1.5059, "step": 280 }, { "epoch": 9.206349206349206, "grad_norm": 0.7964168190956116, "learning_rate": 2.0470058747505516e-06, "loss": 1.5039, "step": 290 }, { "epoch": 9.523809523809524, "grad_norm": 0.8193419575691223, "learning_rate": 5.130676608104845e-07, "loss": 1.4986, "step": 300 }, { "epoch": 9.841269841269842, "grad_norm": 0.884679913520813, "learning_rate": 0.0, "loss": 1.5265, "step": 310 }, { "epoch": 9.841269841269842, "step": 310, "total_flos": 1.511820317687808e+16, "train_loss": 1.6589944193440098, "train_runtime": 971.9769, "train_samples_per_second": 1.296, "train_steps_per_second": 0.319 } ], "logging_steps": 10, "max_steps": 310, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 1.511820317687808e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }
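This file appears to be a trainer_state.json as written by the Hugging Face Trainer: it records loss, learning rate, and gradient norm every 10 steps across 310 steps (10 epochs), plus a final training summary entry. Below is a minimal sketch, not part of the original file, for inspecting the log; it assumes the JSON above is saved as trainer_state.json, and the filename and plotting choices are illustrative only.

# Load the Trainer state and plot loss and gradient norm against global step.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step logging entries; the final summary entry has
# "train_loss"/"train_runtime" instead of "loss"/"learning_rate".
logs = [e for e in state["log_history"] if "loss" in e and "learning_rate" in e]
steps = [e["step"] for e in logs]
losses = [e["loss"] for e in logs]
grad_norms = [e["grad_norm"] for e in logs]

fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
ax1.plot(steps, losses, marker="o")
ax1.set_ylabel("training loss")
ax2.plot(steps, grad_norms, marker="o")
ax2.set_yscale("log")  # the step-70 spike (grad_norm ~ 560) dwarfs the other values
ax2.set_ylabel("grad norm")
ax2.set_xlabel("global step")
plt.tight_layout()
plt.show()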