|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 5000, |
|
"global_step": 504, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01984126984126984, |
|
"grad_norm": 2912.028564453125, |
|
"learning_rate": 2e-08, |
|
"loss": 14.5926, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03968253968253968, |
|
"grad_norm": 6037.033203125, |
|
"learning_rate": 2.0000000000000002e-07, |
|
"loss": 14.6525, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05952380952380952, |
|
"grad_norm": 7095.4931640625, |
|
"learning_rate": 4.0000000000000003e-07, |
|
"loss": 13.8958, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07936507936507936, |
|
"grad_norm": 3047.11572265625, |
|
"learning_rate": 6.000000000000001e-07, |
|
"loss": 13.0648, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0992063492063492, |
|
"grad_norm": 20132.697265625, |
|
"learning_rate": 8.000000000000001e-07, |
|
"loss": 11.7415, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.11904761904761904, |
|
"grad_norm": 65280.9921875, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 10.186, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1388888888888889, |
|
"grad_norm": 18013.744140625, |
|
"learning_rate": 1.2000000000000002e-06, |
|
"loss": 8.9537, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.15873015873015872, |
|
"grad_norm": 12039.76171875, |
|
"learning_rate": 1.4000000000000001e-06, |
|
"loss": 8.8499, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.17857142857142858, |
|
"grad_norm": 4152.63037109375, |
|
"learning_rate": 1.6000000000000001e-06, |
|
"loss": 8.1076, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1984126984126984, |
|
"grad_norm": 112633.828125, |
|
"learning_rate": 1.8000000000000001e-06, |
|
"loss": 7.3218, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.21825396825396826, |
|
"grad_norm": 12219.0498046875, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 6.8102, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.23809523809523808, |
|
"grad_norm": 5337.48681640625, |
|
"learning_rate": 2.2e-06, |
|
"loss": 6.4298, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.25793650793650796, |
|
"grad_norm": 66286.9609375, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 6.1412, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.2777777777777778, |
|
"grad_norm": 22970.162109375, |
|
"learning_rate": 2.6e-06, |
|
"loss": 6.1932, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2976190476190476, |
|
"grad_norm": 1229.7572021484375, |
|
"learning_rate": 2.8000000000000003e-06, |
|
"loss": 5.807, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.31746031746031744, |
|
"grad_norm": 4040.58544921875, |
|
"learning_rate": 3e-06, |
|
"loss": 5.3959, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3373015873015873, |
|
"grad_norm": 79031.90625, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 5.1443, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.35714285714285715, |
|
"grad_norm": 2377.986328125, |
|
"learning_rate": 3.4000000000000005e-06, |
|
"loss": 5.1, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.376984126984127, |
|
"grad_norm": 6816.78173828125, |
|
"learning_rate": 3.6000000000000003e-06, |
|
"loss": 5.0767, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.3968253968253968, |
|
"grad_norm": 329.9869384765625, |
|
"learning_rate": 3.8000000000000005e-06, |
|
"loss": 4.7978, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"grad_norm": 551.7224731445312, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 4.3107, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.4365079365079365, |
|
"grad_norm": 306.0798645019531, |
|
"learning_rate": 4.2000000000000004e-06, |
|
"loss": 4.126, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.45634920634920634, |
|
"grad_norm": 259.38702392578125, |
|
"learning_rate": 4.4e-06, |
|
"loss": 3.3982, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.47619047619047616, |
|
"grad_norm": 152.5589141845703, |
|
"learning_rate": 4.600000000000001e-06, |
|
"loss": 2.9569, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.49603174603174605, |
|
"grad_norm": 188.98681640625, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 2.3802, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5158730158730159, |
|
"grad_norm": 214.31179809570312, |
|
"learning_rate": 5e-06, |
|
"loss": 2.0057, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.5357142857142857, |
|
"grad_norm": 184.34437561035156, |
|
"learning_rate": 5.2e-06, |
|
"loss": 1.6501, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.5555555555555556, |
|
"grad_norm": 119.70015716552734, |
|
"learning_rate": 5.400000000000001e-06, |
|
"loss": 1.4393, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5753968253968254, |
|
"grad_norm": 172.4916534423828, |
|
"learning_rate": 5.600000000000001e-06, |
|
"loss": 1.2536, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5952380952380952, |
|
"grad_norm": 152.96202087402344, |
|
"learning_rate": 5.8e-06, |
|
"loss": 1.1011, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6150793650793651, |
|
"grad_norm": 229.5851593017578, |
|
"learning_rate": 6e-06, |
|
"loss": 0.9963, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.6349206349206349, |
|
"grad_norm": 192.8756103515625, |
|
"learning_rate": 6.200000000000001e-06, |
|
"loss": 0.893, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.6547619047619048, |
|
"grad_norm": 285.22601318359375, |
|
"learning_rate": 6.4000000000000006e-06, |
|
"loss": 0.7744, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6746031746031746, |
|
"grad_norm": 121.977783203125, |
|
"learning_rate": 6.600000000000001e-06, |
|
"loss": 0.6989, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6944444444444444, |
|
"grad_norm": 135.0439453125, |
|
"learning_rate": 6.800000000000001e-06, |
|
"loss": 0.559, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 157.01791381835938, |
|
"learning_rate": 7e-06, |
|
"loss": 0.5721, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.7341269841269841, |
|
"grad_norm": 139.25894165039062, |
|
"learning_rate": 7.2000000000000005e-06, |
|
"loss": 0.5385, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.753968253968254, |
|
"grad_norm": 94.0282974243164, |
|
"learning_rate": 7.4e-06, |
|
"loss": 0.5169, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.7738095238095238, |
|
"grad_norm": 64.70858764648438, |
|
"learning_rate": 7.600000000000001e-06, |
|
"loss": 0.4738, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.7936507936507936, |
|
"grad_norm": 68.39224243164062, |
|
"learning_rate": 7.800000000000002e-06, |
|
"loss": 0.4533, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8134920634920635, |
|
"grad_norm": 55.44282150268555, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.4296, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 66.44039154052734, |
|
"learning_rate": 8.2e-06, |
|
"loss": 0.3933, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.8531746031746031, |
|
"grad_norm": 64.27642822265625, |
|
"learning_rate": 8.400000000000001e-06, |
|
"loss": 0.4348, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.873015873015873, |
|
"grad_norm": 1226.5478515625, |
|
"learning_rate": 8.6e-06, |
|
"loss": 0.6043, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.8928571428571429, |
|
"grad_norm": 94.26041412353516, |
|
"learning_rate": 8.8e-06, |
|
"loss": 0.3775, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.9126984126984127, |
|
"grad_norm": 63.64945983886719, |
|
"learning_rate": 9e-06, |
|
"loss": 0.3306, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.9325396825396826, |
|
"grad_norm": 41.9366340637207, |
|
"learning_rate": 9.200000000000002e-06, |
|
"loss": 0.3727, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.9523809523809523, |
|
"grad_norm": 36.641475677490234, |
|
"learning_rate": 9.4e-06, |
|
"loss": 0.3561, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.9722222222222222, |
|
"grad_norm": 30.62155532836914, |
|
"learning_rate": 9.600000000000001e-06, |
|
"loss": 0.3152, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.9920634920634921, |
|
"grad_norm": 36.66413497924805, |
|
"learning_rate": 9.800000000000001e-06, |
|
"loss": 0.3266, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 504, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 5000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.2449113063424e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|