|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 17.0, |
|
"eval_steps": 500, |
|
"global_step": 5406, |
|
"is_hyper_param_search": true, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.9968553459119497, |
|
"grad_norm": 0.37894660234451294, |
|
"learning_rate": 8.9089921371132e-05, |
|
"loss": 0.2936, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.8738709677419355, |
|
"eval_loss": 0.06461235135793686, |
|
"eval_runtime": 2.6511, |
|
"eval_samples_per_second": 1169.335, |
|
"eval_steps_per_second": 24.518, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.9937106918238994, |
|
"grad_norm": 0.2703763544559479, |
|
"learning_rate": 8.354040180448849e-05, |
|
"loss": 0.0584, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.9264516129032258, |
|
"eval_loss": 0.03086796961724758, |
|
"eval_runtime": 2.6512, |
|
"eval_samples_per_second": 1169.284, |
|
"eval_steps_per_second": 24.517, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 2.990566037735849, |
|
"grad_norm": 0.20945589244365692, |
|
"learning_rate": 7.799088223784498e-05, |
|
"loss": 0.035, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.9361290322580645, |
|
"eval_loss": 0.023069918155670166, |
|
"eval_runtime": 2.6554, |
|
"eval_samples_per_second": 1167.42, |
|
"eval_steps_per_second": 24.478, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 3.9874213836477987, |
|
"grad_norm": 0.1196790486574173, |
|
"learning_rate": 7.244136267120147e-05, |
|
"loss": 0.0276, |
|
"step": 1268 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.9390322580645162, |
|
"eval_loss": 0.02050953172147274, |
|
"eval_runtime": 2.6346, |
|
"eval_samples_per_second": 1176.652, |
|
"eval_steps_per_second": 24.672, |
|
"step": 1272 |
|
}, |
|
{ |
|
"epoch": 4.984276729559748, |
|
"grad_norm": 0.11527035385370255, |
|
"learning_rate": 6.689184310455794e-05, |
|
"loss": 0.0239, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.9361290322580645, |
|
"eval_loss": 0.019045764580368996, |
|
"eval_runtime": 2.6584, |
|
"eval_samples_per_second": 1166.129, |
|
"eval_steps_per_second": 24.451, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 5.981132075471698, |
|
"grad_norm": 0.10317819565534592, |
|
"learning_rate": 6.134232353791443e-05, |
|
"loss": 0.0217, |
|
"step": 1902 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.9412903225806452, |
|
"eval_loss": 0.018115758895874023, |
|
"eval_runtime": 2.6563, |
|
"eval_samples_per_second": 1167.048, |
|
"eval_steps_per_second": 24.47, |
|
"step": 1908 |
|
}, |
|
{ |
|
"epoch": 6.977987421383648, |
|
"grad_norm": 0.11120817810297012, |
|
"learning_rate": 5.5792803971270916e-05, |
|
"loss": 0.0202, |
|
"step": 2219 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.9435483870967742, |
|
"eval_loss": 0.017409062013030052, |
|
"eval_runtime": 2.664, |
|
"eval_samples_per_second": 1163.673, |
|
"eval_steps_per_second": 24.4, |
|
"step": 2226 |
|
}, |
|
{ |
|
"epoch": 7.9748427672955975, |
|
"grad_norm": 0.08610443770885468, |
|
"learning_rate": 5.0243284404627405e-05, |
|
"loss": 0.0189, |
|
"step": 2536 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.9441935483870968, |
|
"eval_loss": 0.01702069491147995, |
|
"eval_runtime": 2.6577, |
|
"eval_samples_per_second": 1166.417, |
|
"eval_steps_per_second": 24.457, |
|
"step": 2544 |
|
}, |
|
{ |
|
"epoch": 8.971698113207546, |
|
"grad_norm": 0.09259846806526184, |
|
"learning_rate": 4.469376483798389e-05, |
|
"loss": 0.018, |
|
"step": 2853 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.9458064516129032, |
|
"eval_loss": 0.016187822446227074, |
|
"eval_runtime": 2.6545, |
|
"eval_samples_per_second": 1167.839, |
|
"eval_steps_per_second": 24.487, |
|
"step": 2862 |
|
}, |
|
{ |
|
"epoch": 9.968553459119496, |
|
"grad_norm": 0.07796873152256012, |
|
"learning_rate": 3.914424527134037e-05, |
|
"loss": 0.0171, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.9438709677419355, |
|
"eval_loss": 0.015915485098958015, |
|
"eval_runtime": 2.6415, |
|
"eval_samples_per_second": 1173.555, |
|
"eval_steps_per_second": 24.607, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 10.965408805031446, |
|
"grad_norm": 0.06984979659318924, |
|
"learning_rate": 3.359472570469686e-05, |
|
"loss": 0.0164, |
|
"step": 3487 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.9416129032258065, |
|
"eval_loss": 0.01606527529656887, |
|
"eval_runtime": 2.6279, |
|
"eval_samples_per_second": 1179.632, |
|
"eval_steps_per_second": 24.734, |
|
"step": 3498 |
|
}, |
|
{ |
|
"epoch": 11.962264150943396, |
|
"grad_norm": 0.06347791105508804, |
|
"learning_rate": 2.8045206138053343e-05, |
|
"loss": 0.0159, |
|
"step": 3804 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.9461290322580646, |
|
"eval_loss": 0.015618913806974888, |
|
"eval_runtime": 2.6558, |
|
"eval_samples_per_second": 1167.25, |
|
"eval_steps_per_second": 24.475, |
|
"step": 3816 |
|
}, |
|
{ |
|
"epoch": 12.959119496855346, |
|
"grad_norm": 0.08197327703237534, |
|
"learning_rate": 2.249568657140983e-05, |
|
"loss": 0.0153, |
|
"step": 4121 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.9438709677419355, |
|
"eval_loss": 0.015260354615747929, |
|
"eval_runtime": 2.6542, |
|
"eval_samples_per_second": 1167.962, |
|
"eval_steps_per_second": 24.49, |
|
"step": 4134 |
|
}, |
|
{ |
|
"epoch": 13.955974842767295, |
|
"grad_norm": 0.06349155306816101, |
|
"learning_rate": 1.6946167004766316e-05, |
|
"loss": 0.0149, |
|
"step": 4438 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.9467741935483871, |
|
"eval_loss": 0.014791840687394142, |
|
"eval_runtime": 2.639, |
|
"eval_samples_per_second": 1174.705, |
|
"eval_steps_per_second": 24.631, |
|
"step": 4452 |
|
}, |
|
{ |
|
"epoch": 14.952830188679245, |
|
"grad_norm": 0.0693543404340744, |
|
"learning_rate": 1.1396647438122802e-05, |
|
"loss": 0.0144, |
|
"step": 4755 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.9464516129032258, |
|
"eval_loss": 0.014759526588022709, |
|
"eval_runtime": 2.629, |
|
"eval_samples_per_second": 1179.148, |
|
"eval_steps_per_second": 24.724, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 15.949685534591195, |
|
"grad_norm": 0.06401927769184113, |
|
"learning_rate": 5.847127871479286e-06, |
|
"loss": 0.0141, |
|
"step": 5072 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.947741935483871, |
|
"eval_loss": 0.014390287920832634, |
|
"eval_runtime": 2.6548, |
|
"eval_samples_per_second": 1167.693, |
|
"eval_steps_per_second": 24.484, |
|
"step": 5088 |
|
}, |
|
{ |
|
"epoch": 16.946540880503143, |
|
"grad_norm": 0.06713937968015671, |
|
"learning_rate": 2.976083048357721e-07, |
|
"loss": 0.0139, |
|
"step": 5389 |
|
} |
|
], |
|
"logging_steps": 317, |
|
"max_steps": 5406, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 17, |
|
"save_steps": 1000000000.0, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1401394002013824.0, |
|
"train_batch_size": 48, |
|
"trial_name": null, |
|
"trial_params": { |
|
"alpha": 0.28743091275198973, |
|
"fp16": true, |
|
"learning_rate": 9.215353627385319e-05, |
|
"lr_scheduler": "cosine", |
|
"num_train_epochs": 17, |
|
"temperature": 11, |
|
"warmup_steps": 142, |
|
"weight_decay": 0.1789135641130362 |
|
} |
|
} |
|
|