{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 50,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02,
      "grad_norm": 3.3398852348327637,
      "learning_rate": 4.999950652140343e-05,
      "loss": 0.288,
      "num_input_tokens_seen": 1192,
      "step": 1
    },
    {
      "epoch": 0.04,
      "grad_norm": 3.534205436706543,
      "learning_rate": 4.9998026105095405e-05,
      "loss": 0.1627,
      "num_input_tokens_seen": 2208,
      "step": 2
    },
    {
      "epoch": 0.06,
      "grad_norm": 3.9795141220092773,
      "learning_rate": 4.999555880952023e-05,
      "loss": 0.2259,
      "num_input_tokens_seen": 3320,
      "step": 3
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.712265729904175,
      "learning_rate": 4.99921047320825e-05,
      "loss": 0.3248,
      "num_input_tokens_seen": 4536,
      "step": 4
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.0970170497894287,
      "learning_rate": 4.998766400914329e-05,
      "loss": 0.1867,
      "num_input_tokens_seen": 5960,
      "step": 5
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.16365909576416,
      "learning_rate": 4.998223681601473e-05,
      "loss": 0.1558,
      "num_input_tokens_seen": 7304,
      "step": 6
    },
    {
      "epoch": 0.14,
      "grad_norm": 1.9363147020339966,
      "learning_rate": 4.9975823366953124e-05,
      "loss": 0.1863,
      "num_input_tokens_seen": 8744,
      "step": 7
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.874936819076538,
      "learning_rate": 4.996842391515044e-05,
      "loss": 0.1679,
      "num_input_tokens_seen": 10120,
      "step": 8
    },
    {
      "epoch": 0.18,
      "grad_norm": 7.298994541168213,
      "learning_rate": 4.996003875272438e-05,
      "loss": 0.3627,
      "num_input_tokens_seen": 10816,
      "step": 9
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.7786474227905273,
      "learning_rate": 4.995066821070679e-05,
      "loss": 0.1466,
      "num_input_tokens_seen": 11920,
      "step": 10
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.9229803085327148,
      "learning_rate": 4.994031265903063e-05,
      "loss": 0.2029,
      "num_input_tokens_seen": 13216,
      "step": 11
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.230123519897461,
      "learning_rate": 4.992897250651535e-05,
      "loss": 0.1142,
      "num_input_tokens_seen": 14552,
      "step": 12
    },
    {
      "epoch": 0.26,
      "grad_norm": 2.0229177474975586,
      "learning_rate": 4.991664820085074e-05,
      "loss": 0.2142,
      "num_input_tokens_seen": 15600,
      "step": 13
    },
    {
      "epoch": 0.28,
      "grad_norm": 1.4955450296401978,
      "learning_rate": 4.990334022857932e-05,
      "loss": 0.149,
      "num_input_tokens_seen": 16744,
      "step": 14
    },
    {
      "epoch": 0.3,
      "grad_norm": 3.070120334625244,
      "learning_rate": 4.9889049115077005e-05,
      "loss": 0.2267,
      "num_input_tokens_seen": 17424,
      "step": 15
    },
    {
      "epoch": 0.32,
      "grad_norm": 2.16508150100708,
      "learning_rate": 4.987377542453251e-05,
      "loss": 0.1499,
      "num_input_tokens_seen": 18408,
      "step": 16
    },
    {
      "epoch": 0.34,
      "grad_norm": 3.1738064289093018,
      "learning_rate": 4.9857519759924974e-05,
      "loss": 0.4279,
      "num_input_tokens_seen": 19288,
      "step": 17
    },
    {
      "epoch": 0.36,
      "grad_norm": 2.7662558555603027,
      "learning_rate": 4.984028276300021e-05,
      "loss": 0.2957,
      "num_input_tokens_seen": 20336,
      "step": 18
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.4056564569473267,
      "learning_rate": 4.982206511424534e-05,
      "loss": 0.1899,
      "num_input_tokens_seen": 21528,
      "step": 19
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.4555004835128784,
      "learning_rate": 4.980286753286195e-05,
      "loss": 0.1429,
      "num_input_tokens_seen": 22680,
      "step": 20
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.6009600162506104,
      "learning_rate": 4.978269077673767e-05,
      "loss": 0.2084,
      "num_input_tokens_seen": 23800,
      "step": 21
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.1936904191970825,
      "learning_rate": 4.976153564241628e-05,
      "loss": 0.0474,
      "num_input_tokens_seen": 24784,
      "step": 22
    },
    {
      "epoch": 0.46,
      "grad_norm": 2.241938829421997,
      "learning_rate": 4.9739402965066276e-05,
      "loss": 0.1757,
      "num_input_tokens_seen": 25688,
      "step": 23
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.735016942024231,
      "learning_rate": 4.971629361844785e-05,
      "loss": 0.2083,
      "num_input_tokens_seen": 27016,
      "step": 24
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.648120403289795,
      "learning_rate": 4.9692208514878444e-05,
      "loss": 0.1653,
      "num_input_tokens_seen": 28200,
      "step": 25
    },
    {
      "epoch": 0.52,
      "grad_norm": 2.088919162750244,
      "learning_rate": 4.96671486051967e-05,
      "loss": 0.2277,
      "num_input_tokens_seen": 29240,
      "step": 26
    },
    {
      "epoch": 0.54,
      "grad_norm": 1.9720033407211304,
      "learning_rate": 4.9641114878724956e-05,
      "loss": 0.2262,
      "num_input_tokens_seen": 30264,
      "step": 27
    },
    {
      "epoch": 0.56,
      "grad_norm": 2.137261390686035,
      "learning_rate": 4.9614108363230135e-05,
      "loss": 0.2119,
      "num_input_tokens_seen": 31376,
      "step": 28
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.7072184085845947,
      "learning_rate": 4.958613012488324e-05,
      "loss": 0.186,
      "num_input_tokens_seen": 32288,
      "step": 29
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.6640106439590454,
      "learning_rate": 4.9557181268217227e-05,
      "loss": 0.1381,
      "num_input_tokens_seen": 33336,
      "step": 30
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.743667721748352,
      "learning_rate": 4.952726293608335e-05,
      "loss": 0.1521,
      "num_input_tokens_seen": 35264,
      "step": 31
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.3096036911010742,
      "learning_rate": 4.949637630960617e-05,
      "loss": 0.2232,
      "num_input_tokens_seen": 36496,
      "step": 32
    },
    {
      "epoch": 0.66,
      "grad_norm": 1.943765640258789,
      "learning_rate": 4.9464522608136805e-05,
      "loss": 0.0252,
      "num_input_tokens_seen": 37312,
      "step": 33
    },
    {
      "epoch": 0.68,
      "grad_norm": 1.8293421268463135,
      "learning_rate": 4.943170308920484e-05,
      "loss": 0.264,
      "num_input_tokens_seen": 38376,
      "step": 34
    },
    {
      "epoch": 0.7,
      "grad_norm": 2.6471211910247803,
      "learning_rate": 4.939791904846869e-05,
      "loss": 0.1645,
      "num_input_tokens_seen": 39168,
      "step": 35
    },
    {
      "epoch": 0.72,
      "grad_norm": 1.6533693075180054,
      "learning_rate": 4.9363171819664434e-05,
      "loss": 0.1802,
      "num_input_tokens_seen": 40440,
      "step": 36
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.3770309686660767,
      "learning_rate": 4.9327462774553166e-05,
      "loss": 0.2206,
      "num_input_tokens_seen": 41760,
      "step": 37
    },
    {
      "epoch": 0.76,
      "grad_norm": 1.6882014274597168,
      "learning_rate": 4.929079332286685e-05,
      "loss": 0.2886,
      "num_input_tokens_seen": 42944,
      "step": 38
    },
    {
      "epoch": 0.78,
      "grad_norm": 1.9799898862838745,
      "learning_rate": 4.925316491225265e-05,
      "loss": 0.1897,
      "num_input_tokens_seen": 44008,
      "step": 39
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.551802158355713,
      "learning_rate": 4.9214579028215776e-05,
      "loss": 0.1935,
      "num_input_tokens_seen": 45168,
      "step": 40
    },
    {
      "epoch": 0.82,
      "grad_norm": 1.0445095300674438,
      "learning_rate": 4.917503719406088e-05,
      "loss": 0.1071,
      "num_input_tokens_seen": 46432,
      "step": 41
    },
    {
      "epoch": 0.84,
      "grad_norm": 3.2616090774536133,
      "learning_rate": 4.913454097083185e-05,
      "loss": 0.5737,
      "num_input_tokens_seen": 47360,
      "step": 42
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.9618259072303772,
      "learning_rate": 4.909309195725025e-05,
      "loss": 0.114,
      "num_input_tokens_seen": 49008,
      "step": 43
    },
    {
      "epoch": 0.88,
      "grad_norm": 1.5821013450622559,
      "learning_rate": 4.905069178965215e-05,
      "loss": 0.2676,
      "num_input_tokens_seen": 50264,
      "step": 44
    },
    {
      "epoch": 0.9,
      "grad_norm": 1.1913903951644897,
      "learning_rate": 4.900734214192358e-05,
      "loss": 0.1061,
      "num_input_tokens_seen": 51368,
      "step": 45
    },
    {
      "epoch": 0.92,
      "grad_norm": 1.8395439386367798,
      "learning_rate": 4.89630447254344e-05,
      "loss": 0.2161,
      "num_input_tokens_seen": 52304,
      "step": 46
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.736319363117218,
      "learning_rate": 4.891780128897077e-05,
      "loss": 0.1163,
      "num_input_tokens_seen": 53712,
      "step": 47
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.5373475551605225,
      "learning_rate": 4.887161361866608e-05,
      "loss": 0.1609,
      "num_input_tokens_seen": 54704,
      "step": 48
    },
    {
      "epoch": 0.98,
      "grad_norm": 1.3580759763717651,
      "learning_rate": 4.882448353793048e-05,
      "loss": 0.1673,
      "num_input_tokens_seen": 55880,
      "step": 49
    },
    {
      "epoch": 1.0,
      "grad_norm": 2.0536258220672607,
      "learning_rate": 4.877641290737884e-05,
      "loss": 0.1955,
      "num_input_tokens_seen": 56784,
      "step": 50
    }
  ],
  "logging_steps": 1,
  "max_steps": 500,
  "num_input_tokens_seen": 56784,
  "num_train_epochs": 10,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2564102936199168.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}