{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.841269841269842,
  "eval_steps": 500,
  "global_step": 310,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.31746031746031744,
      "grad_norm": 2.053196907043457,
      "learning_rate": 0.00019948693233918952,
      "loss": 2.3733,
      "step": 10
    },
    {
      "epoch": 0.6349206349206349,
      "grad_norm": 4.835758209228516,
      "learning_rate": 0.00019795299412524945,
      "loss": 2.0557,
      "step": 20
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 0.8223971724510193,
      "learning_rate": 0.00019541392564000488,
      "loss": 1.9785,
      "step": 30
    },
    {
      "epoch": 1.2698412698412698,
      "grad_norm": 1.6451746225357056,
      "learning_rate": 0.00019189578116202307,
      "loss": 1.8827,
      "step": 40
    },
    {
      "epoch": 1.5873015873015874,
      "grad_norm": 3.187594413757324,
      "learning_rate": 0.00018743466161445823,
      "loss": 1.8161,
      "step": 50
    },
    {
      "epoch": 1.9047619047619047,
      "grad_norm": 1.1520949602127075,
      "learning_rate": 0.00018207634412072764,
      "loss": 1.7753,
      "step": 60
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 560.3873291015625,
      "learning_rate": 0.0001758758122692791,
      "loss": 1.7127,
      "step": 70
    },
    {
      "epoch": 2.5396825396825395,
      "grad_norm": 3.1525535583496094,
      "learning_rate": 0.00016889669190756868,
      "loss": 1.7557,
      "step": 80
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.8066505193710327,
      "learning_rate": 0.0001612105982547663,
      "loss": 1.6924,
      "step": 90
    },
    {
      "epoch": 3.1746031746031744,
      "grad_norm": 6.147305011749268,
      "learning_rate": 0.00015289640103269625,
      "loss": 1.6697,
      "step": 100
    },
    {
      "epoch": 3.492063492063492,
      "grad_norm": 1.1416895389556885,
      "learning_rate": 0.00014403941515576344,
      "loss": 1.6402,
      "step": 110
    },
    {
      "epoch": 3.8095238095238093,
      "grad_norm": 0.7278308868408203,
      "learning_rate": 0.00013473052528448201,
      "loss": 1.633,
      "step": 120
    },
    {
      "epoch": 4.1269841269841265,
      "grad_norm": 0.981347382068634,
      "learning_rate": 0.00012506525322587207,
      "loss": 1.6499,
      "step": 130
    },
    {
      "epoch": 4.444444444444445,
      "grad_norm": 27.774219512939453,
      "learning_rate": 0.00011514277775045768,
      "loss": 1.6055,
      "step": 140
    },
    {
      "epoch": 4.761904761904762,
      "grad_norm": 2.1296420097351074,
      "learning_rate": 0.00010506491688387127,
      "loss": 1.5705,
      "step": 150
    },
    {
      "epoch": 5.079365079365079,
      "grad_norm": 0.7905821800231934,
      "learning_rate": 9.493508311612874e-05,
      "loss": 1.6286,
      "step": 160
    },
    {
      "epoch": 5.396825396825397,
      "grad_norm": 0.805335283279419,
      "learning_rate": 8.485722224954237e-05,
      "loss": 1.6359,
      "step": 170
    },
    {
      "epoch": 5.714285714285714,
      "grad_norm": 0.7546270489692688,
      "learning_rate": 7.493474677412794e-05,
      "loss": 1.5344,
      "step": 180
    },
    {
      "epoch": 6.031746031746032,
      "grad_norm": 0.7980037927627563,
      "learning_rate": 6.526947471551798e-05,
      "loss": 1.5468,
      "step": 190
    },
    {
      "epoch": 6.349206349206349,
      "grad_norm": 0.7658030986785889,
      "learning_rate": 5.596058484423656e-05,
      "loss": 1.5378,
      "step": 200
    },
    {
      "epoch": 6.666666666666667,
      "grad_norm": 1.7591676712036133,
      "learning_rate": 4.710359896730379e-05,
      "loss": 1.5446,
      "step": 210
    },
    {
      "epoch": 6.984126984126984,
      "grad_norm": 0.8993221521377563,
      "learning_rate": 3.878940174523371e-05,
      "loss": 1.5279,
      "step": 220
    },
    {
      "epoch": 7.301587301587301,
      "grad_norm": 0.7995801568031311,
      "learning_rate": 3.110330809243134e-05,
      "loss": 1.5253,
      "step": 230
    },
    {
      "epoch": 7.619047619047619,
      "grad_norm": 0.8297848105430603,
      "learning_rate": 2.4124187730720917e-05,
      "loss": 1.5507,
      "step": 240
    },
    {
      "epoch": 7.936507936507937,
      "grad_norm": 0.7720061540603638,
      "learning_rate": 1.7923655879272393e-05,
      "loss": 1.5218,
      "step": 250
    },
    {
      "epoch": 8.253968253968253,
      "grad_norm": 0.818590521812439,
      "learning_rate": 1.2565338385541792e-05,
      "loss": 1.5276,
      "step": 260
    },
    {
      "epoch": 8.571428571428571,
      "grad_norm": 0.857406735420227,
      "learning_rate": 8.10421883797694e-06,
      "loss": 1.5014,
      "step": 270
    },
    {
      "epoch": 8.88888888888889,
      "grad_norm": 0.8421258926391602,
      "learning_rate": 4.586074359995119e-06,
      "loss": 1.5059,
      "step": 280
    },
    {
      "epoch": 9.206349206349206,
      "grad_norm": 0.7964168190956116,
      "learning_rate": 2.0470058747505516e-06,
      "loss": 1.5039,
      "step": 290
    },
    {
      "epoch": 9.523809523809524,
      "grad_norm": 0.8193419575691223,
      "learning_rate": 5.130676608104845e-07,
      "loss": 1.4986,
      "step": 300
    },
    {
      "epoch": 9.841269841269842,
      "grad_norm": 0.884679913520813,
      "learning_rate": 0.0,
      "loss": 1.5265,
      "step": 310
    },
    {
      "epoch": 9.841269841269842,
      "step": 310,
      "total_flos": 1.511820317687808e+16,
      "train_loss": 1.6589944193440098,
      "train_runtime": 971.9769,
      "train_samples_per_second": 1.296,
      "train_steps_per_second": 0.319
    }
  ],
  "logging_steps": 10,
  "max_steps": 310,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "total_flos": 1.511820317687808e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}