{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.841269841269842,
  "eval_steps": 500,
  "global_step": 310,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.31746031746031744,
      "grad_norm": 2.053196907043457,
      "learning_rate": 0.00019948693233918952,
      "loss": 2.3733,
      "step": 10
    },
    {
      "epoch": 0.6349206349206349,
      "grad_norm": 4.835758209228516,
      "learning_rate": 0.00019795299412524945,
      "loss": 2.0557,
      "step": 20
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 0.8223971724510193,
      "learning_rate": 0.00019541392564000488,
      "loss": 1.9785,
      "step": 30
    },
    {
      "epoch": 1.2698412698412698,
      "grad_norm": 1.6451746225357056,
      "learning_rate": 0.00019189578116202307,
      "loss": 1.8827,
      "step": 40
    },
    {
      "epoch": 1.5873015873015874,
      "grad_norm": 3.187594413757324,
      "learning_rate": 0.00018743466161445823,
      "loss": 1.8161,
      "step": 50
    },
    {
      "epoch": 1.9047619047619047,
      "grad_norm": 1.1520949602127075,
      "learning_rate": 0.00018207634412072764,
      "loss": 1.7753,
      "step": 60
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 560.3873291015625,
      "learning_rate": 0.0001758758122692791,
      "loss": 1.7127,
      "step": 70
    },
    {
      "epoch": 2.5396825396825395,
      "grad_norm": 3.1525535583496094,
      "learning_rate": 0.00016889669190756868,
      "loss": 1.7557,
      "step": 80
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.8066505193710327,
      "learning_rate": 0.0001612105982547663,
      "loss": 1.6924,
      "step": 90
    },
    {
      "epoch": 3.1746031746031744,
      "grad_norm": 6.147305011749268,
      "learning_rate": 0.00015289640103269625,
      "loss": 1.6697,
      "step": 100
    },
    {
      "epoch": 3.492063492063492,
      "grad_norm": 1.1416895389556885,
      "learning_rate": 0.00014403941515576344,
      "loss": 1.6402,
      "step": 110
    },
    {
      "epoch": 3.8095238095238093,
      "grad_norm": 0.7278308868408203,
      "learning_rate": 0.00013473052528448201,
      "loss": 1.633,
      "step": 120
    },
    {
      "epoch": 4.1269841269841265,
      "grad_norm": 0.981347382068634,
      "learning_rate": 0.00012506525322587207,
      "loss": 1.6499,
      "step": 130
    },
    {
      "epoch": 4.444444444444445,
      "grad_norm": 27.774219512939453,
      "learning_rate": 0.00011514277775045768,
      "loss": 1.6055,
      "step": 140
    },
    {
      "epoch": 4.761904761904762,
      "grad_norm": 2.1296420097351074,
      "learning_rate": 0.00010506491688387127,
      "loss": 1.5705,
      "step": 150
    },
    {
      "epoch": 5.079365079365079,
      "grad_norm": 0.7905821800231934,
      "learning_rate": 9.493508311612874e-05,
      "loss": 1.6286,
      "step": 160
    },
    {
      "epoch": 5.396825396825397,
      "grad_norm": 0.805335283279419,
      "learning_rate": 8.485722224954237e-05,
      "loss": 1.6359,
      "step": 170
    },
    {
      "epoch": 5.714285714285714,
      "grad_norm": 0.7546270489692688,
      "learning_rate": 7.493474677412794e-05,
      "loss": 1.5344,
      "step": 180
    },
    {
      "epoch": 6.031746031746032,
      "grad_norm": 0.7980037927627563,
      "learning_rate": 6.526947471551798e-05,
      "loss": 1.5468,
      "step": 190
    },
    {
      "epoch": 6.349206349206349,
      "grad_norm": 0.7658030986785889,
      "learning_rate": 5.596058484423656e-05,
      "loss": 1.5378,
      "step": 200
    },
    {
      "epoch": 6.666666666666667,
      "grad_norm": 1.7591676712036133,
      "learning_rate": 4.710359896730379e-05,
      "loss": 1.5446,
      "step": 210
    },
    {
      "epoch": 6.984126984126984,
      "grad_norm": 0.8993221521377563,
      "learning_rate": 3.878940174523371e-05,
      "loss": 1.5279,
      "step": 220
    },
    {
      "epoch": 7.301587301587301,
      "grad_norm": 0.7995801568031311,
      "learning_rate": 3.110330809243134e-05,
      "loss": 1.5253,
      "step": 230
    },
    {
      "epoch": 7.619047619047619,
      "grad_norm": 0.8297848105430603,
      "learning_rate": 2.4124187730720917e-05,
      "loss": 1.5507,
      "step": 240
    },
    {
      "epoch": 7.936507936507937,
      "grad_norm": 0.7720061540603638,
      "learning_rate": 1.7923655879272393e-05,
      "loss": 1.5218,
      "step": 250
    },
    {
      "epoch": 8.253968253968253,
      "grad_norm": 0.818590521812439,
      "learning_rate": 1.2565338385541792e-05,
      "loss": 1.5276,
      "step": 260
    },
    {
      "epoch": 8.571428571428571,
      "grad_norm": 0.857406735420227,
      "learning_rate": 8.10421883797694e-06,
      "loss": 1.5014,
      "step": 270
    },
    {
      "epoch": 8.88888888888889,
      "grad_norm": 0.8421258926391602,
      "learning_rate": 4.586074359995119e-06,
      "loss": 1.5059,
      "step": 280
    },
    {
      "epoch": 9.206349206349206,
      "grad_norm": 0.7964168190956116,
      "learning_rate": 2.0470058747505516e-06,
      "loss": 1.5039,
      "step": 290
    },
    {
      "epoch": 9.523809523809524,
      "grad_norm": 0.8193419575691223,
      "learning_rate": 5.130676608104845e-07,
      "loss": 1.4986,
      "step": 300
    },
    {
      "epoch": 9.841269841269842,
      "grad_norm": 0.884679913520813,
      "learning_rate": 0.0,
      "loss": 1.5265,
      "step": 310
    },
    {
      "epoch": 9.841269841269842,
      "step": 310,
      "total_flos": 1.511820317687808e+16,
      "train_loss": 1.6589944193440098,
      "train_runtime": 971.9769,
      "train_samples_per_second": 1.296,
      "train_steps_per_second": 0.319
    }
  ],
  "logging_steps": 10,
  "max_steps": 310,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "total_flos": 1.511820317687808e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}