|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.2, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004, |
|
"grad_norm": 0.14456839859485626, |
|
"learning_rate": 0.0002, |
|
"loss": 2.6994, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.008, |
|
"grad_norm": 0.12785384058952332, |
|
"learning_rate": 0.0002, |
|
"loss": 2.5263, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.012, |
|
"grad_norm": 0.13556812703609467, |
|
"learning_rate": 0.0002, |
|
"loss": 2.4, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 0.16702307760715485, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2516, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.3053046464920044, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1036, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.024, |
|
"grad_norm": 0.22266729176044464, |
|
"learning_rate": 0.0002, |
|
"loss": 2.3971, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.028, |
|
"grad_norm": 0.14757753908634186, |
|
"learning_rate": 0.0002, |
|
"loss": 2.3272, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 0.1445876806974411, |
|
"learning_rate": 0.0002, |
|
"loss": 2.3242, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.036, |
|
"grad_norm": 0.1639222800731659, |
|
"learning_rate": 0.0002, |
|
"loss": 2.0874, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.30435118079185486, |
|
"learning_rate": 0.0002, |
|
"loss": 1.9193, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.044, |
|
"grad_norm": 0.19899794459342957, |
|
"learning_rate": 0.0002, |
|
"loss": 2.3188, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 0.171605184674263, |
|
"learning_rate": 0.0002, |
|
"loss": 2.285, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.052, |
|
"grad_norm": 0.18728512525558472, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2458, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.056, |
|
"grad_norm": 0.21194487810134888, |
|
"learning_rate": 0.0002, |
|
"loss": 2.038, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.3907397985458374, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8128, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 0.2305271327495575, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2758, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.068, |
|
"grad_norm": 0.1841125786304474, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2169, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.072, |
|
"grad_norm": 0.18488088250160217, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1981, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.076, |
|
"grad_norm": 0.22125467658042908, |
|
"learning_rate": 0.0002, |
|
"loss": 2.0181, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.36263903975486755, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7492, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.084, |
|
"grad_norm": 0.23995819687843323, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2287, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.088, |
|
"grad_norm": 0.18997274339199066, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2265, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.092, |
|
"grad_norm": 0.19529950618743896, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2107, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 0.23907797038555145, |
|
"learning_rate": 0.0002, |
|
"loss": 1.9434, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.3128001093864441, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7463, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.104, |
|
"grad_norm": 0.22716236114501953, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2284, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.108, |
|
"grad_norm": 0.19475902616977692, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1706, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 0.19292621314525604, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1211, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.116, |
|
"grad_norm": 0.27429261803627014, |
|
"learning_rate": 0.0002, |
|
"loss": 1.916, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.35040974617004395, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6903, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.124, |
|
"grad_norm": 0.25687935948371887, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1812, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 0.20279887318611145, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2005, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.132, |
|
"grad_norm": 0.22153416275978088, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1195, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.136, |
|
"grad_norm": 0.2411063015460968, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8694, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.3698059320449829, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6522, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 0.263630211353302, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1644, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.148, |
|
"grad_norm": 0.22328948974609375, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1753, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.152, |
|
"grad_norm": 0.21132981777191162, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1191, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.156, |
|
"grad_norm": 0.2776873707771301, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8539, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.34315550327301025, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6447, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.164, |
|
"grad_norm": 0.2831268906593323, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1451, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.168, |
|
"grad_norm": 0.2178158015012741, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1368, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.172, |
|
"grad_norm": 0.22629423439502716, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1039, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 0.2540661096572876, |
|
"learning_rate": 0.0002, |
|
"loss": 1.856, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.3525744676589966, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6286, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.184, |
|
"grad_norm": 0.24233509600162506, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1296, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.188, |
|
"grad_norm": 0.20195922255516052, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1321, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 0.25681406259536743, |
|
"learning_rate": 0.0002, |
|
"loss": 2.0255, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.196, |
|
"grad_norm": 0.2791021764278412, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8141, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.3304845690727234, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6042, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.7342773566308352e+16, |
|
"train_batch_size": 12, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|