|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 17.142857142857142, |
|
"eval_steps": 500, |
|
"global_step": 60, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 4.297126293182373, |
|
"learning_rate": 5e-06, |
|
"loss": 2.0584, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 5.442339897155762, |
|
"learning_rate": 1e-05, |
|
"loss": 2.1704, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.8571428571428571, |
|
"grad_norm": 3.8550186157226562, |
|
"learning_rate": 1.5e-05, |
|
"loss": 1.9811, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 1.1428571428571428, |
|
"grad_norm": 4.534988880157471, |
|
"learning_rate": 2e-05, |
|
"loss": 1.9627, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 4.389001846313477, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.8195, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 1.7142857142857144, |
|
"grad_norm": 2.9333465099334717, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7995, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.8814234733581543, |
|
"learning_rate": 3.5e-05, |
|
"loss": 1.6025, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 2.2857142857142856, |
|
"grad_norm": 3.1065313816070557, |
|
"learning_rate": 4e-05, |
|
"loss": 1.4849, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 2.571428571428571, |
|
"grad_norm": 3.8952481746673584, |
|
"learning_rate": 4.5e-05, |
|
"loss": 1.4171, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 2.2721009254455566, |
|
"learning_rate": 5e-05, |
|
"loss": 1.3011, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 3.142857142857143, |
|
"grad_norm": 1.9634448289871216, |
|
"learning_rate": 4.9e-05, |
|
"loss": 1.2701, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 3.4285714285714284, |
|
"grad_norm": 1.7538613080978394, |
|
"learning_rate": 4.8e-05, |
|
"loss": 1.1454, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 3.7142857142857144, |
|
"grad_norm": 1.5953209400177002, |
|
"learning_rate": 4.7e-05, |
|
"loss": 1.1024, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 1.5740618705749512, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 0.995, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 4.285714285714286, |
|
"grad_norm": 1.4993911981582642, |
|
"learning_rate": 4.5e-05, |
|
"loss": 1.042, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 4.571428571428571, |
|
"grad_norm": 2.0913941860198975, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 0.8267, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 4.857142857142857, |
|
"grad_norm": 1.1753151416778564, |
|
"learning_rate": 4.3e-05, |
|
"loss": 0.7648, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 5.142857142857143, |
|
"grad_norm": 1.2862088680267334, |
|
"learning_rate": 4.2e-05, |
|
"loss": 0.866, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 5.428571428571429, |
|
"grad_norm": 0.9621393084526062, |
|
"learning_rate": 4.1e-05, |
|
"loss": 0.7878, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 5.714285714285714, |
|
"grad_norm": 1.2728854417800903, |
|
"learning_rate": 4e-05, |
|
"loss": 0.6709, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.9635931253433228, |
|
"learning_rate": 3.9000000000000006e-05, |
|
"loss": 0.7501, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 6.285714285714286, |
|
"grad_norm": 0.8941543698310852, |
|
"learning_rate": 3.8e-05, |
|
"loss": 0.7428, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 6.571428571428571, |
|
"grad_norm": 1.032415747642517, |
|
"learning_rate": 3.7e-05, |
|
"loss": 0.5109, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 6.857142857142857, |
|
"grad_norm": 1.1366804838180542, |
|
"learning_rate": 3.6e-05, |
|
"loss": 0.6937, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 7.142857142857143, |
|
"grad_norm": 1.0126097202301025, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.6131, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 7.428571428571429, |
|
"grad_norm": 1.0203325748443604, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 0.6236, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 7.714285714285714, |
|
"grad_norm": 1.1223379373550415, |
|
"learning_rate": 3.3e-05, |
|
"loss": 0.4396, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 1.156301498413086, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 0.6135, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 8.285714285714286, |
|
"grad_norm": 1.3308717012405396, |
|
"learning_rate": 3.1e-05, |
|
"loss": 0.4818, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 8.571428571428571, |
|
"grad_norm": 1.0889034271240234, |
|
"learning_rate": 3e-05, |
|
"loss": 0.5571, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 8.857142857142858, |
|
"grad_norm": 1.2153044939041138, |
|
"learning_rate": 2.9e-05, |
|
"loss": 0.4891, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 9.142857142857142, |
|
"grad_norm": 1.2152674198150635, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 0.5333, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 9.428571428571429, |
|
"grad_norm": 1.1658018827438354, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 0.4143, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 9.714285714285714, |
|
"grad_norm": 1.2498981952667236, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 0.3726, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 1.313835620880127, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.3324, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 10.285714285714286, |
|
"grad_norm": 1.2041679620742798, |
|
"learning_rate": 2.4e-05, |
|
"loss": 0.3397, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 10.571428571428571, |
|
"grad_norm": 1.312419056892395, |
|
"learning_rate": 2.3000000000000003e-05, |
|
"loss": 0.4332, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 10.857142857142858, |
|
"grad_norm": 1.392309546470642, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 0.2769, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 11.142857142857142, |
|
"grad_norm": 1.255035638809204, |
|
"learning_rate": 2.1e-05, |
|
"loss": 0.4282, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 11.428571428571429, |
|
"grad_norm": 1.450054407119751, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2118, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 11.714285714285714, |
|
"grad_norm": 1.3045405149459839, |
|
"learning_rate": 1.9e-05, |
|
"loss": 0.2816, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 1.3475886583328247, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.2846, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 12.285714285714286, |
|
"grad_norm": 1.299771785736084, |
|
"learning_rate": 1.7000000000000003e-05, |
|
"loss": 0.335, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 12.571428571428571, |
|
"grad_norm": 1.272226333618164, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.2031, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 12.857142857142858, |
|
"grad_norm": 1.7329617738723755, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.2738, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 13.142857142857142, |
|
"grad_norm": 1.6047208309173584, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 0.2792, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 13.428571428571429, |
|
"grad_norm": 1.2631847858428955, |
|
"learning_rate": 1.3000000000000001e-05, |
|
"loss": 0.2651, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 13.714285714285714, |
|
"grad_norm": 1.3166191577911377, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.1932, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 1.5138972997665405, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 0.2165, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 14.285714285714286, |
|
"grad_norm": 1.1001005172729492, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1508, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 14.571428571428571, |
|
"grad_norm": 1.3192509412765503, |
|
"learning_rate": 9e-06, |
|
"loss": 0.2484, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 14.857142857142858, |
|
"grad_norm": 1.7928791046142578, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.1838, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 15.142857142857142, |
|
"grad_norm": 1.579509973526001, |
|
"learning_rate": 7.000000000000001e-06, |
|
"loss": 0.2775, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 15.428571428571429, |
|
"grad_norm": 1.0521697998046875, |
|
"learning_rate": 6e-06, |
|
"loss": 0.147, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 15.714285714285714, |
|
"grad_norm": 1.6448253393173218, |
|
"learning_rate": 5e-06, |
|
"loss": 0.2041, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 1.2469841241836548, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.1623, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 16.285714285714285, |
|
"grad_norm": 1.1743882894515991, |
|
"learning_rate": 3e-06, |
|
"loss": 0.1822, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 16.571428571428573, |
|
"grad_norm": 1.246579885482788, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.2199, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 16.857142857142858, |
|
"grad_norm": 1.1595028638839722, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.1361, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 17.142857142857142, |
|
"grad_norm": 1.380496859550476, |
|
"learning_rate": 0.0, |
|
"loss": 0.1335, |
|
"step": 60 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 60, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2902028263292928.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|