|
{ |
|
"best_metric": 0.6686851978302002, |
|
"best_model_checkpoint": "./checkpoints/llava-v1.6-vicuna-13b/checkpoint-285", |
|
"epoch": 9.192, |
|
"eval_steps": 1.0, |
|
"global_step": 285, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 0.5758810052676581, |
|
"learning_rate": 0.0, |
|
"loss": 1.5784, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"eval_loss": 1.614479660987854, |
|
"eval_runtime": 90.1495, |
|
"eval_samples_per_second": 2.219, |
|
"eval_steps_per_second": 0.277, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 0.5732524292532967, |
|
"learning_rate": 1.2618595071429148e-05, |
|
"loss": 1.496, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"eval_loss": 1.614479660987854, |
|
"eval_runtime": 85.9035, |
|
"eval_samples_per_second": 2.328, |
|
"eval_steps_per_second": 0.291, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 0.5752113482534317, |
|
"learning_rate": 2e-05, |
|
"loss": 1.5565, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"eval_loss": 1.5945543050765991, |
|
"eval_runtime": 85.9029, |
|
"eval_samples_per_second": 2.328, |
|
"eval_steps_per_second": 0.291, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 0.46566701161868557, |
|
"learning_rate": 2e-05, |
|
"loss": 1.4933, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"eval_loss": 1.5579420328140259, |
|
"eval_runtime": 85.9412, |
|
"eval_samples_per_second": 2.327, |
|
"eval_steps_per_second": 0.291, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.6298906192739324, |
|
"learning_rate": 2e-05, |
|
"loss": 1.5177, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_loss": 1.519984483718872, |
|
"eval_runtime": 85.9435, |
|
"eval_samples_per_second": 2.327, |
|
"eval_steps_per_second": 0.291, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 0.5629546129758171, |
|
"learning_rate": 2e-05, |
|
"loss": 1.4806, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"eval_loss": 1.4810457229614258, |
|
"eval_runtime": 86.8543, |
|
"eval_samples_per_second": 2.303, |
|
"eval_steps_per_second": 0.288, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 0.5629546129758171, |
|
"learning_rate": 2e-05, |
|
"loss": 1.4426, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"eval_loss": 1.4810457229614258, |
|
"eval_runtime": 85.5769, |
|
"eval_samples_per_second": 2.337, |
|
"eval_steps_per_second": 0.292, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 0.5629546129758171, |
|
"learning_rate": 2e-05, |
|
"loss": 1.487, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"eval_loss": 1.4810457229614258, |
|
"eval_runtime": 85.5166, |
|
"eval_samples_per_second": 2.339, |
|
"eval_steps_per_second": 0.292, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 0.5225734696456765, |
|
"learning_rate": 2e-05, |
|
"loss": 1.4824, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"eval_loss": 1.4472432136535645, |
|
"eval_runtime": 85.6273, |
|
"eval_samples_per_second": 2.336, |
|
"eval_steps_per_second": 0.292, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.5120766386106574, |
|
"learning_rate": 2e-05, |
|
"loss": 1.4055, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_loss": 1.431533694267273, |
|
"eval_runtime": 85.1358, |
|
"eval_samples_per_second": 2.349, |
|
"eval_steps_per_second": 0.294, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 1.5120766386106574, |
|
"learning_rate": 2e-05, |
|
"loss": 1.4374, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"eval_loss": 1.431533694267273, |
|
"eval_runtime": 85.8174, |
|
"eval_samples_per_second": 2.331, |
|
"eval_steps_per_second": 0.291, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 1.9218280445348435, |
|
"learning_rate": 2e-05, |
|
"loss": 1.4128, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"eval_loss": 1.4020923376083374, |
|
"eval_runtime": 85.7769, |
|
"eval_samples_per_second": 2.332, |
|
"eval_steps_per_second": 0.291, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 1.5494392795931824, |
|
"learning_rate": 2e-05, |
|
"loss": 1.4671, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"eval_loss": 1.3614002466201782, |
|
"eval_runtime": 85.7302, |
|
"eval_samples_per_second": 2.333, |
|
"eval_steps_per_second": 0.292, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 2.3567445757054766, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2809, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"eval_loss": 1.3194799423217773, |
|
"eval_runtime": 85.9134, |
|
"eval_samples_per_second": 2.328, |
|
"eval_steps_per_second": 0.291, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.5106182972290174, |
|
"learning_rate": 2e-05, |
|
"loss": 1.3338, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_loss": 1.2909258604049683, |
|
"eval_runtime": 86.1624, |
|
"eval_samples_per_second": 2.321, |
|
"eval_steps_per_second": 0.29, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 0.6480008459041514, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2888, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"eval_loss": 1.267655611038208, |
|
"eval_runtime": 85.1961, |
|
"eval_samples_per_second": 2.348, |
|
"eval_steps_per_second": 0.293, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 0.5742819355565492, |
|
"learning_rate": 2e-05, |
|
"loss": 1.265, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"eval_loss": 1.243842601776123, |
|
"eval_runtime": 85.3262, |
|
"eval_samples_per_second": 2.344, |
|
"eval_steps_per_second": 0.293, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 0.6388701851382904, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2662, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"eval_loss": 1.2195556163787842, |
|
"eval_runtime": 85.1951, |
|
"eval_samples_per_second": 2.348, |
|
"eval_steps_per_second": 0.293, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 0.4940836425011853, |
|
"learning_rate": 2e-05, |
|
"loss": 1.136, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"eval_loss": 1.1959036588668823, |
|
"eval_runtime": 85.3064, |
|
"eval_samples_per_second": 2.344, |
|
"eval_steps_per_second": 0.293, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.48425296045894156, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2361, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_loss": 1.1732313632965088, |
|
"eval_runtime": 85.2018, |
|
"eval_samples_per_second": 2.347, |
|
"eval_steps_per_second": 0.293, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 0.4174760070919301, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1559, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"eval_loss": 1.151344656944275, |
|
"eval_runtime": 85.1267, |
|
"eval_samples_per_second": 2.349, |
|
"eval_steps_per_second": 0.294, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 0.4514925259027495, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2058, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"eval_loss": 1.1299418210983276, |
|
"eval_runtime": 85.187, |
|
"eval_samples_per_second": 2.348, |
|
"eval_steps_per_second": 0.293, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 0.4104167292767524, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1874, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"eval_loss": 1.1091759204864502, |
|
"eval_runtime": 85.2104, |
|
"eval_samples_per_second": 2.347, |
|
"eval_steps_per_second": 0.293, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 0.3690081767478843, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1555, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"eval_loss": 1.0891705751419067, |
|
"eval_runtime": 85.0331, |
|
"eval_samples_per_second": 2.352, |
|
"eval_steps_per_second": 0.294, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.39605731232207236, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1113, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 1.0692001581192017, |
|
"eval_runtime": 85.2164, |
|
"eval_samples_per_second": 2.347, |
|
"eval_steps_per_second": 0.293, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 0.36927768645318826, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1124, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"eval_loss": 1.0496515035629272, |
|
"eval_runtime": 88.1096, |
|
"eval_samples_per_second": 2.27, |
|
"eval_steps_per_second": 0.284, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 0.38712274276178793, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0485, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"eval_loss": 1.0307434797286987, |
|
"eval_runtime": 88.6941, |
|
"eval_samples_per_second": 2.255, |
|
"eval_steps_per_second": 0.282, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 0.37683532534478703, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0494, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"eval_loss": 1.0122556686401367, |
|
"eval_runtime": 88.6898, |
|
"eval_samples_per_second": 2.255, |
|
"eval_steps_per_second": 0.282, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 0.31167540236884894, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0384, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"eval_loss": 0.994915783405304, |
|
"eval_runtime": 88.4751, |
|
"eval_samples_per_second": 2.261, |
|
"eval_steps_per_second": 0.283, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.3035168410857397, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0571, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_loss": 0.9787777662277222, |
|
"eval_runtime": 88.4016, |
|
"eval_samples_per_second": 2.262, |
|
"eval_steps_per_second": 0.283, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 0.3501105312815732, |
|
"learning_rate": 2e-05, |
|
"loss": 0.915, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"eval_loss": 0.9635753035545349, |
|
"eval_runtime": 93.659, |
|
"eval_samples_per_second": 2.135, |
|
"eval_steps_per_second": 0.267, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 1.024, |
|
"grad_norm": 0.31289892959527454, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0061, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.024, |
|
"eval_loss": 0.9496576189994812, |
|
"eval_runtime": 92.1616, |
|
"eval_samples_per_second": 2.17, |
|
"eval_steps_per_second": 0.271, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.056, |
|
"grad_norm": 0.29757404844376606, |
|
"learning_rate": 2e-05, |
|
"loss": 1.018, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.056, |
|
"eval_loss": 0.9369340538978577, |
|
"eval_runtime": 92.6023, |
|
"eval_samples_per_second": 2.16, |
|
"eval_steps_per_second": 0.27, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.088, |
|
"grad_norm": 0.2618148684145232, |
|
"learning_rate": 2e-05, |
|
"loss": 0.927, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.088, |
|
"eval_loss": 0.9251891374588013, |
|
"eval_runtime": 92.1541, |
|
"eval_samples_per_second": 2.17, |
|
"eval_steps_per_second": 0.271, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.28251385173765375, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9539, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"eval_loss": 0.913889467716217, |
|
"eval_runtime": 92.0402, |
|
"eval_samples_per_second": 2.173, |
|
"eval_steps_per_second": 0.272, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.260093009410511, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9356, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"eval_loss": 0.9035020470619202, |
|
"eval_runtime": 89.6964, |
|
"eval_samples_per_second": 2.23, |
|
"eval_steps_per_second": 0.279, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.192, |
|
"grad_norm": 0.27662724636836117, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9597, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.192, |
|
"eval_loss": 0.8957402110099792, |
|
"eval_runtime": 85.1681, |
|
"eval_samples_per_second": 2.348, |
|
"eval_steps_per_second": 0.294, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.224, |
|
"grad_norm": 0.5728746629980745, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9398, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.224, |
|
"eval_loss": 0.887246310710907, |
|
"eval_runtime": 85.6149, |
|
"eval_samples_per_second": 2.336, |
|
"eval_steps_per_second": 0.292, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.256, |
|
"grad_norm": 0.2684824759760228, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9616, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.256, |
|
"eval_loss": 0.8806753754615784, |
|
"eval_runtime": 86.3901, |
|
"eval_samples_per_second": 2.315, |
|
"eval_steps_per_second": 0.289, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.288, |
|
"grad_norm": 0.24685769110976413, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9854, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.288, |
|
"eval_loss": 0.8744142055511475, |
|
"eval_runtime": 85.3845, |
|
"eval_samples_per_second": 2.342, |
|
"eval_steps_per_second": 0.293, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.2357626526047496, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9284, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"eval_loss": 0.868619441986084, |
|
"eval_runtime": 86.2809, |
|
"eval_samples_per_second": 2.318, |
|
"eval_steps_per_second": 0.29, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.3519999999999999, |
|
"grad_norm": 0.22791772858432163, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0035, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.3519999999999999, |
|
"eval_loss": 0.8631160259246826, |
|
"eval_runtime": 85.2643, |
|
"eval_samples_per_second": 2.346, |
|
"eval_steps_per_second": 0.293, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.384, |
|
"grad_norm": 0.2301475582048382, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9441, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.384, |
|
"eval_loss": 0.8579904437065125, |
|
"eval_runtime": 85.464, |
|
"eval_samples_per_second": 2.34, |
|
"eval_steps_per_second": 0.293, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.416, |
|
"grad_norm": 0.2435877146655292, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9537, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.416, |
|
"eval_loss": 0.8532869219779968, |
|
"eval_runtime": 85.4531, |
|
"eval_samples_per_second": 2.34, |
|
"eval_steps_per_second": 0.293, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.448, |
|
"grad_norm": 0.22680224690529022, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8432, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.448, |
|
"eval_loss": 0.8488282561302185, |
|
"eval_runtime": 85.2256, |
|
"eval_samples_per_second": 2.347, |
|
"eval_steps_per_second": 0.293, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.24467493716810432, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9582, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"eval_loss": 0.8452281951904297, |
|
"eval_runtime": 86.4412, |
|
"eval_samples_per_second": 2.314, |
|
"eval_steps_per_second": 0.289, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.512, |
|
"grad_norm": 0.3102498103163491, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8935, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.512, |
|
"eval_loss": 0.8409687876701355, |
|
"eval_runtime": 85.1809, |
|
"eval_samples_per_second": 2.348, |
|
"eval_steps_per_second": 0.293, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.544, |
|
"grad_norm": 0.26376164638875965, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9153, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.544, |
|
"eval_loss": 0.836646556854248, |
|
"eval_runtime": 85.3347, |
|
"eval_samples_per_second": 2.344, |
|
"eval_steps_per_second": 0.293, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.576, |
|
"grad_norm": 0.26268816516328214, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8937, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.576, |
|
"eval_loss": 0.8322432637214661, |
|
"eval_runtime": 85.1246, |
|
"eval_samples_per_second": 2.349, |
|
"eval_steps_per_second": 0.294, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.608, |
|
"grad_norm": 0.20800644242816013, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8346, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.608, |
|
"eval_loss": 0.8282632231712341, |
|
"eval_runtime": 85.9203, |
|
"eval_samples_per_second": 2.328, |
|
"eval_steps_per_second": 0.291, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.6400000000000001, |
|
"grad_norm": 0.234023912047604, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9457, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.6400000000000001, |
|
"eval_loss": 0.8249067068099976, |
|
"eval_runtime": 85.5644, |
|
"eval_samples_per_second": 2.337, |
|
"eval_steps_per_second": 0.292, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.6720000000000002, |
|
"grad_norm": 0.22274778391959116, |
|
"learning_rate": 2e-05, |
|
"loss": 0.894, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.6720000000000002, |
|
"eval_loss": 0.8220057487487793, |
|
"eval_runtime": 85.2356, |
|
"eval_samples_per_second": 2.346, |
|
"eval_steps_per_second": 0.293, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.704, |
|
"grad_norm": 0.247116310753153, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9422, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.704, |
|
"eval_loss": 0.8193264603614807, |
|
"eval_runtime": 85.2302, |
|
"eval_samples_per_second": 2.347, |
|
"eval_steps_per_second": 0.293, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.736, |
|
"grad_norm": 0.2156755816522451, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9483, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.736, |
|
"eval_loss": 0.8170039653778076, |
|
"eval_runtime": 85.5313, |
|
"eval_samples_per_second": 2.338, |
|
"eval_steps_per_second": 0.292, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.768, |
|
"grad_norm": 0.20641699121207405, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9433, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.768, |
|
"eval_loss": 0.8153803944587708, |
|
"eval_runtime": 85.4465, |
|
"eval_samples_per_second": 2.341, |
|
"eval_steps_per_second": 0.293, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.22411091268182518, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8839, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"eval_loss": 0.8131626844406128, |
|
"eval_runtime": 87.3514, |
|
"eval_samples_per_second": 2.29, |
|
"eval_steps_per_second": 0.286, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.8319999999999999, |
|
"grad_norm": 0.22136515515298041, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9219, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.8319999999999999, |
|
"eval_loss": 0.8108111023902893, |
|
"eval_runtime": 86.5607, |
|
"eval_samples_per_second": 2.311, |
|
"eval_steps_per_second": 0.289, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.8639999999999999, |
|
"grad_norm": 0.22277176520749853, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8317, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.8639999999999999, |
|
"eval_loss": 0.8082687854766846, |
|
"eval_runtime": 86.4781, |
|
"eval_samples_per_second": 2.313, |
|
"eval_steps_per_second": 0.289, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.896, |
|
"grad_norm": 0.22242544994690336, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8317, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.896, |
|
"eval_loss": 0.8052798509597778, |
|
"eval_runtime": 86.1491, |
|
"eval_samples_per_second": 2.322, |
|
"eval_steps_per_second": 0.29, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.928, |
|
"grad_norm": 0.20539599715237697, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8777, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.928, |
|
"eval_loss": 0.8023205399513245, |
|
"eval_runtime": 86.2508, |
|
"eval_samples_per_second": 2.319, |
|
"eval_steps_per_second": 0.29, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.2259203508735786, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8987, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"eval_loss": 0.7997938394546509, |
|
"eval_runtime": 87.6556, |
|
"eval_samples_per_second": 2.282, |
|
"eval_steps_per_second": 0.285, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.992, |
|
"grad_norm": 0.2423173341059814, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7621, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.992, |
|
"eval_loss": 0.7969934344291687, |
|
"eval_runtime": 86.8775, |
|
"eval_samples_per_second": 2.302, |
|
"eval_steps_per_second": 0.288, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 2.024, |
|
"grad_norm": 0.24036001781096705, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8819, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 2.024, |
|
"eval_loss": 0.7944203019142151, |
|
"eval_runtime": 86.5654, |
|
"eval_samples_per_second": 2.31, |
|
"eval_steps_per_second": 0.289, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 2.056, |
|
"grad_norm": 0.20841482575321812, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7713, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.056, |
|
"eval_loss": 0.791679859161377, |
|
"eval_runtime": 86.6028, |
|
"eval_samples_per_second": 2.309, |
|
"eval_steps_per_second": 0.289, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.088, |
|
"grad_norm": 0.2184988692033955, |
|
"learning_rate": 2e-05, |
|
"loss": 0.829, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.088, |
|
"eval_loss": 0.7891654372215271, |
|
"eval_runtime": 86.7507, |
|
"eval_samples_per_second": 2.305, |
|
"eval_steps_per_second": 0.288, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 0.23020842769384967, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8473, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"eval_loss": 0.7867069840431213, |
|
"eval_runtime": 87.5061, |
|
"eval_samples_per_second": 2.286, |
|
"eval_steps_per_second": 0.286, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.152, |
|
"grad_norm": 0.25430631663993714, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8681, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.152, |
|
"eval_loss": 0.7836448550224304, |
|
"eval_runtime": 88.0078, |
|
"eval_samples_per_second": 2.273, |
|
"eval_steps_per_second": 0.284, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.184, |
|
"grad_norm": 0.23653466680757473, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8876, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.184, |
|
"eval_loss": 0.7806727886199951, |
|
"eval_runtime": 87.345, |
|
"eval_samples_per_second": 2.29, |
|
"eval_steps_per_second": 0.286, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.216, |
|
"grad_norm": 0.2565004166075463, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8596, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.216, |
|
"eval_loss": 0.7781125903129578, |
|
"eval_runtime": 87.2054, |
|
"eval_samples_per_second": 2.293, |
|
"eval_steps_per_second": 0.287, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.248, |
|
"grad_norm": 0.22097009361742267, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8956, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.248, |
|
"eval_loss": 0.7758963704109192, |
|
"eval_runtime": 87.6181, |
|
"eval_samples_per_second": 2.283, |
|
"eval_steps_per_second": 0.285, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.2800000000000002, |
|
"grad_norm": 0.23458324558709256, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8812, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.2800000000000002, |
|
"eval_loss": 0.773766279220581, |
|
"eval_runtime": 86.8377, |
|
"eval_samples_per_second": 2.303, |
|
"eval_steps_per_second": 0.288, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.312, |
|
"grad_norm": 0.2690788840468198, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8779, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.312, |
|
"eval_loss": 0.7716243267059326, |
|
"eval_runtime": 86.5931, |
|
"eval_samples_per_second": 2.31, |
|
"eval_steps_per_second": 0.289, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.344, |
|
"grad_norm": 0.22263909993294226, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7766, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.344, |
|
"eval_loss": 0.7695778012275696, |
|
"eval_runtime": 86.9844, |
|
"eval_samples_per_second": 2.299, |
|
"eval_steps_per_second": 0.287, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.376, |
|
"grad_norm": 0.26058003387602907, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8995, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.376, |
|
"eval_loss": 0.7680388689041138, |
|
"eval_runtime": 86.551, |
|
"eval_samples_per_second": 2.311, |
|
"eval_steps_per_second": 0.289, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.408, |
|
"grad_norm": 0.2262224984051455, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8323, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.408, |
|
"eval_loss": 0.766679584980011, |
|
"eval_runtime": 86.5962, |
|
"eval_samples_per_second": 2.31, |
|
"eval_steps_per_second": 0.289, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 0.25720804331740627, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8036, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"eval_loss": 0.7652787566184998, |
|
"eval_runtime": 86.2366, |
|
"eval_samples_per_second": 2.319, |
|
"eval_steps_per_second": 0.29, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.472, |
|
"grad_norm": 0.22971293606988397, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8806, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 2.472, |
|
"eval_loss": 0.7643636465072632, |
|
"eval_runtime": 86.3577, |
|
"eval_samples_per_second": 2.316, |
|
"eval_steps_per_second": 0.289, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 2.504, |
|
"grad_norm": 0.2522080484690418, |
|
"learning_rate": 2e-05, |
|
"loss": 0.817, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 2.504, |
|
"eval_loss": 0.7629995942115784, |
|
"eval_runtime": 86.2805, |
|
"eval_samples_per_second": 2.318, |
|
"eval_steps_per_second": 0.29, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 2.536, |
|
"grad_norm": 0.25383057566992234, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7931, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.536, |
|
"eval_loss": 0.7622135877609253, |
|
"eval_runtime": 86.5133, |
|
"eval_samples_per_second": 2.312, |
|
"eval_steps_per_second": 0.289, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.568, |
|
"grad_norm": 0.27933475264216745, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8135, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.568, |
|
"eval_loss": 0.7606070041656494, |
|
"eval_runtime": 86.377, |
|
"eval_samples_per_second": 2.315, |
|
"eval_steps_per_second": 0.289, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.24704516135802373, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7688, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"eval_loss": 0.7587440609931946, |
|
"eval_runtime": 88.9717, |
|
"eval_samples_per_second": 2.248, |
|
"eval_steps_per_second": 0.281, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 2.632, |
|
"grad_norm": 0.2595849376774823, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7207, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.632, |
|
"eval_loss": 0.7568916082382202, |
|
"eval_runtime": 88.9097, |
|
"eval_samples_per_second": 2.249, |
|
"eval_steps_per_second": 0.281, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.664, |
|
"grad_norm": 0.2586023772952801, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8642, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 2.664, |
|
"eval_loss": 0.7559364438056946, |
|
"eval_runtime": 89.045, |
|
"eval_samples_per_second": 2.246, |
|
"eval_steps_per_second": 0.281, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 2.6959999999999997, |
|
"grad_norm": 0.2273264534259725, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8864, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.6959999999999997, |
|
"eval_loss": 0.7552520632743835, |
|
"eval_runtime": 88.9448, |
|
"eval_samples_per_second": 2.249, |
|
"eval_steps_per_second": 0.281, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.7279999999999998, |
|
"grad_norm": 0.26638251168101784, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7977, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.7279999999999998, |
|
"eval_loss": 0.753672182559967, |
|
"eval_runtime": 89.2211, |
|
"eval_samples_per_second": 2.242, |
|
"eval_steps_per_second": 0.28, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 0.27672934644885144, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8003, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"eval_loss": 0.7510656714439392, |
|
"eval_runtime": 95.8714, |
|
"eval_samples_per_second": 2.086, |
|
"eval_steps_per_second": 0.261, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.792, |
|
"grad_norm": 0.28159046758182865, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8216, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 2.792, |
|
"eval_loss": 0.7484390735626221, |
|
"eval_runtime": 93.9836, |
|
"eval_samples_per_second": 2.128, |
|
"eval_steps_per_second": 0.266, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 2.824, |
|
"grad_norm": 0.25495896352825237, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8514, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 2.824, |
|
"eval_loss": 0.7466137409210205, |
|
"eval_runtime": 92.9783, |
|
"eval_samples_per_second": 2.151, |
|
"eval_steps_per_second": 0.269, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 2.856, |
|
"grad_norm": 0.24959081452423665, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8291, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 2.856, |
|
"eval_loss": 0.745083749294281, |
|
"eval_runtime": 92.9713, |
|
"eval_samples_per_second": 2.151, |
|
"eval_steps_per_second": 0.269, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 2.888, |
|
"grad_norm": 0.258467204198503, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8669, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.888, |
|
"eval_loss": 0.7432680726051331, |
|
"eval_runtime": 93.3306, |
|
"eval_samples_per_second": 2.143, |
|
"eval_steps_per_second": 0.268, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.928, |
|
"grad_norm": 0.24500563921569218, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8085, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 2.928, |
|
"eval_loss": 0.7415681481361389, |
|
"eval_runtime": 91.2727, |
|
"eval_samples_per_second": 2.191, |
|
"eval_steps_per_second": 0.274, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.27455934907237084, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8307, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"eval_loss": 0.7402371168136597, |
|
"eval_runtime": 85.3987, |
|
"eval_samples_per_second": 2.342, |
|
"eval_steps_per_second": 0.293, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.992, |
|
"grad_norm": 0.2775170496694688, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6946, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 2.992, |
|
"eval_loss": 0.7383440136909485, |
|
"eval_runtime": 85.4187, |
|
"eval_samples_per_second": 2.341, |
|
"eval_steps_per_second": 0.293, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 3.024, |
|
"grad_norm": 0.2792378343187822, |
|
"learning_rate": 2e-05, |
|
"loss": 0.788, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 3.024, |
|
"eval_loss": 0.7362905144691467, |
|
"eval_runtime": 85.2866, |
|
"eval_samples_per_second": 2.345, |
|
"eval_steps_per_second": 0.293, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 3.056, |
|
"grad_norm": 0.24527382879929208, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8077, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 3.056, |
|
"eval_loss": 0.7345578074455261, |
|
"eval_runtime": 85.2836, |
|
"eval_samples_per_second": 2.345, |
|
"eval_steps_per_second": 0.293, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 3.088, |
|
"grad_norm": 0.2639510901590645, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8159, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 3.088, |
|
"eval_loss": 0.7332432270050049, |
|
"eval_runtime": 86.4403, |
|
"eval_samples_per_second": 2.314, |
|
"eval_steps_per_second": 0.289, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 0.32275944645869054, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7283, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"eval_loss": 0.7311471700668335, |
|
"eval_runtime": 86.3257, |
|
"eval_samples_per_second": 2.317, |
|
"eval_steps_per_second": 0.29, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 3.152, |
|
"grad_norm": 0.22657765140514205, |
|
"learning_rate": 2e-05, |
|
"loss": 0.796, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 3.152, |
|
"eval_loss": 0.7294245958328247, |
|
"eval_runtime": 86.4307, |
|
"eval_samples_per_second": 2.314, |
|
"eval_steps_per_second": 0.289, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 3.184, |
|
"grad_norm": 0.2696947762711156, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8463, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 3.184, |
|
"eval_loss": 0.7282422780990601, |
|
"eval_runtime": 86.5308, |
|
"eval_samples_per_second": 2.311, |
|
"eval_steps_per_second": 0.289, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 3.216, |
|
"grad_norm": 0.2600510971816684, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8089, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.216, |
|
"eval_loss": 0.7277690768241882, |
|
"eval_runtime": 86.3354, |
|
"eval_samples_per_second": 2.317, |
|
"eval_steps_per_second": 0.29, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.248, |
|
"grad_norm": 0.2786398542362818, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7746, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 3.248, |
|
"eval_loss": 0.7275124192237854, |
|
"eval_runtime": 88.6929, |
|
"eval_samples_per_second": 2.255, |
|
"eval_steps_per_second": 0.282, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 3.2800000000000002, |
|
"grad_norm": 0.2737884177070957, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8182, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 3.2800000000000002, |
|
"eval_loss": 0.727260947227478, |
|
"eval_runtime": 85.8129, |
|
"eval_samples_per_second": 2.331, |
|
"eval_steps_per_second": 0.291, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 3.312, |
|
"grad_norm": 0.29485392261335913, |
|
"learning_rate": 2e-05, |
|
"loss": 0.771, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 3.312, |
|
"eval_loss": 0.726463794708252, |
|
"eval_runtime": 85.5624, |
|
"eval_samples_per_second": 2.337, |
|
"eval_steps_per_second": 0.292, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 3.344, |
|
"grad_norm": 0.2950854321605982, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7412, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 3.344, |
|
"eval_loss": 0.7254646420478821, |
|
"eval_runtime": 86.1462, |
|
"eval_samples_per_second": 2.322, |
|
"eval_steps_per_second": 0.29, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 3.376, |
|
"grad_norm": 0.2868496115468271, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7902, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 3.376, |
|
"eval_loss": 0.724499523639679, |
|
"eval_runtime": 85.8109, |
|
"eval_samples_per_second": 2.331, |
|
"eval_steps_per_second": 0.291, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 3.408, |
|
"grad_norm": 0.27526808102180006, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7962, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 3.408, |
|
"eval_loss": 0.723967432975769, |
|
"eval_runtime": 85.1137, |
|
"eval_samples_per_second": 2.35, |
|
"eval_steps_per_second": 0.294, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 0.28826054599507117, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7659, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"eval_loss": 0.7228976488113403, |
|
"eval_runtime": 85.338, |
|
"eval_samples_per_second": 2.344, |
|
"eval_steps_per_second": 0.293, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 3.472, |
|
"grad_norm": 0.2739253052624054, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8122, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 3.472, |
|
"eval_loss": 0.7213765978813171, |
|
"eval_runtime": 86.0819, |
|
"eval_samples_per_second": 2.323, |
|
"eval_steps_per_second": 0.29, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 3.504, |
|
"grad_norm": 0.3244236677701114, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7926, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 3.504, |
|
"eval_loss": 0.7201890349388123, |
|
"eval_runtime": 85.264, |
|
"eval_samples_per_second": 2.346, |
|
"eval_steps_per_second": 0.293, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 3.536, |
|
"grad_norm": 0.272846304884481, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7815, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 3.536, |
|
"eval_loss": 0.7191389203071594, |
|
"eval_runtime": 85.3814, |
|
"eval_samples_per_second": 2.342, |
|
"eval_steps_per_second": 0.293, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 3.568, |
|
"grad_norm": 0.32540225984762255, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7669, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 3.568, |
|
"eval_loss": 0.7177696824073792, |
|
"eval_runtime": 86.5851, |
|
"eval_samples_per_second": 2.31, |
|
"eval_steps_per_second": 0.289, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.3049195701830638, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7817, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"eval_loss": 0.7163457274436951, |
|
"eval_runtime": 86.3221, |
|
"eval_samples_per_second": 2.317, |
|
"eval_steps_per_second": 0.29, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 3.632, |
|
"grad_norm": 0.2908157712070727, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7803, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 3.632, |
|
"eval_loss": 0.7153773307800293, |
|
"eval_runtime": 86.5278, |
|
"eval_samples_per_second": 2.311, |
|
"eval_steps_per_second": 0.289, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 3.664, |
|
"grad_norm": 0.3068313248625758, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8223, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 3.664, |
|
"eval_loss": 0.7154207825660706, |
|
"eval_runtime": 87.3327, |
|
"eval_samples_per_second": 2.29, |
|
"eval_steps_per_second": 0.286, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 3.6959999999999997, |
|
"grad_norm": 0.3055979867515295, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7682, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 3.6959999999999997, |
|
"eval_loss": 0.7148604393005371, |
|
"eval_runtime": 86.2716, |
|
"eval_samples_per_second": 2.318, |
|
"eval_steps_per_second": 0.29, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 3.7279999999999998, |
|
"grad_norm": 0.30145967440162974, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7794, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 3.7279999999999998, |
|
"eval_loss": 0.7141232490539551, |
|
"eval_runtime": 90.4025, |
|
"eval_samples_per_second": 2.212, |
|
"eval_steps_per_second": 0.277, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.30263126216965686, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7924, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"eval_loss": 0.7125248908996582, |
|
"eval_runtime": 89.9787, |
|
"eval_samples_per_second": 2.223, |
|
"eval_steps_per_second": 0.278, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 3.792, |
|
"grad_norm": 0.37055787998484313, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7527, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 3.792, |
|
"eval_loss": 0.7103064060211182, |
|
"eval_runtime": 89.9435, |
|
"eval_samples_per_second": 2.224, |
|
"eval_steps_per_second": 0.278, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 3.824, |
|
"grad_norm": 0.32370435744629744, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7225, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 3.824, |
|
"eval_loss": 0.708495020866394, |
|
"eval_runtime": 89.7211, |
|
"eval_samples_per_second": 2.229, |
|
"eval_steps_per_second": 0.279, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 3.856, |
|
"grad_norm": 0.28450870148051394, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7594, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 3.856, |
|
"eval_loss": 0.7078144550323486, |
|
"eval_runtime": 89.755, |
|
"eval_samples_per_second": 2.228, |
|
"eval_steps_per_second": 0.279, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 3.888, |
|
"grad_norm": 0.3521496955227081, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8098, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 3.888, |
|
"eval_loss": 0.706774115562439, |
|
"eval_runtime": 93.4447, |
|
"eval_samples_per_second": 2.14, |
|
"eval_steps_per_second": 0.268, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 0.29964697600639706, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7945, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"eval_loss": 0.7057322859764099, |
|
"eval_runtime": 93.0089, |
|
"eval_samples_per_second": 2.15, |
|
"eval_steps_per_second": 0.269, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 3.952, |
|
"grad_norm": 0.2998200701516689, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7986, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 3.952, |
|
"eval_loss": 0.7051501274108887, |
|
"eval_runtime": 93.7613, |
|
"eval_samples_per_second": 2.133, |
|
"eval_steps_per_second": 0.267, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 3.984, |
|
"grad_norm": 0.34265154113873836, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7626, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 3.984, |
|
"eval_loss": 0.7055770754814148, |
|
"eval_runtime": 94.0074, |
|
"eval_samples_per_second": 2.127, |
|
"eval_steps_per_second": 0.266, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 4.016, |
|
"grad_norm": 0.3227557876231983, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8266, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 4.016, |
|
"eval_loss": 0.7067859172821045, |
|
"eval_runtime": 92.4085, |
|
"eval_samples_per_second": 2.164, |
|
"eval_steps_per_second": 0.271, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 4.064, |
|
"grad_norm": 0.31358966391371784, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7162, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 4.064, |
|
"eval_loss": 0.7073588371276855, |
|
"eval_runtime": 89.235, |
|
"eval_samples_per_second": 2.241, |
|
"eval_steps_per_second": 0.28, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 4.096, |
|
"grad_norm": 0.29594296413078097, |
|
"learning_rate": 2e-05, |
|
"loss": 0.737, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 4.096, |
|
"eval_loss": 0.7072306871414185, |
|
"eval_runtime": 85.5672, |
|
"eval_samples_per_second": 2.337, |
|
"eval_steps_per_second": 0.292, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 4.128, |
|
"grad_norm": 0.31562345712114676, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7735, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 4.128, |
|
"eval_loss": 0.7067290544509888, |
|
"eval_runtime": 85.4464, |
|
"eval_samples_per_second": 2.341, |
|
"eval_steps_per_second": 0.293, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 0.36960151197946806, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7275, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"eval_loss": 0.7046365141868591, |
|
"eval_runtime": 85.4173, |
|
"eval_samples_per_second": 2.341, |
|
"eval_steps_per_second": 0.293, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 4.192, |
|
"grad_norm": 0.28777555135336585, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7568, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 4.192, |
|
"eval_loss": 0.7030876278877258, |
|
"eval_runtime": 85.2072, |
|
"eval_samples_per_second": 2.347, |
|
"eval_steps_per_second": 0.293, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 4.224, |
|
"grad_norm": 0.3335688387393771, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7473, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 4.224, |
|
"eval_loss": 0.7016716003417969, |
|
"eval_runtime": 85.3898, |
|
"eval_samples_per_second": 2.342, |
|
"eval_steps_per_second": 0.293, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 4.256, |
|
"grad_norm": 0.36992044176671973, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7915, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 4.256, |
|
"eval_loss": 0.7006884813308716, |
|
"eval_runtime": 85.1939, |
|
"eval_samples_per_second": 2.348, |
|
"eval_steps_per_second": 0.293, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 4.288, |
|
"grad_norm": 0.3213431246183001, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7716, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 4.288, |
|
"eval_loss": 0.7004576325416565, |
|
"eval_runtime": 85.1892, |
|
"eval_samples_per_second": 2.348, |
|
"eval_steps_per_second": 0.293, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 4.32, |
|
"grad_norm": 0.31818378029100663, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7504, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 4.32, |
|
"eval_loss": 0.7006973028182983, |
|
"eval_runtime": 85.2711, |
|
"eval_samples_per_second": 2.345, |
|
"eval_steps_per_second": 0.293, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 4.352, |
|
"grad_norm": 0.34120408014701054, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8125, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 4.352, |
|
"eval_loss": 0.7006770372390747, |
|
"eval_runtime": 85.0797, |
|
"eval_samples_per_second": 2.351, |
|
"eval_steps_per_second": 0.294, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 4.384, |
|
"grad_norm": 0.3354650435400624, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7623, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 4.384, |
|
"eval_loss": 0.7007671594619751, |
|
"eval_runtime": 86.3137, |
|
"eval_samples_per_second": 2.317, |
|
"eval_steps_per_second": 0.29, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 4.416, |
|
"grad_norm": 0.34273454929170855, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7539, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 4.416, |
|
"eval_loss": 0.7007145881652832, |
|
"eval_runtime": 86.1203, |
|
"eval_samples_per_second": 2.322, |
|
"eval_steps_per_second": 0.29, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 4.448, |
|
"grad_norm": 0.34329366738767764, |
|
"learning_rate": 2e-05, |
|
"loss": 0.673, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 4.448, |
|
"eval_loss": 0.7001290321350098, |
|
"eval_runtime": 86.4938, |
|
"eval_samples_per_second": 2.312, |
|
"eval_steps_per_second": 0.289, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"grad_norm": 0.32986462476877876, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7874, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"eval_loss": 0.6998225450515747, |
|
"eval_runtime": 85.0554, |
|
"eval_samples_per_second": 2.351, |
|
"eval_steps_per_second": 0.294, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 4.5120000000000005, |
|
"grad_norm": 0.42029356364309967, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7391, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 4.5120000000000005, |
|
"eval_loss": 0.6981640458106995, |
|
"eval_runtime": 86.43, |
|
"eval_samples_per_second": 2.314, |
|
"eval_steps_per_second": 0.289, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 4.5440000000000005, |
|
"grad_norm": 0.3410153964676588, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7375, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 4.5440000000000005, |
|
"eval_loss": 0.6970750689506531, |
|
"eval_runtime": 85.1484, |
|
"eval_samples_per_second": 2.349, |
|
"eval_steps_per_second": 0.294, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 4.576, |
|
"grad_norm": 0.39568033105661293, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7175, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 4.576, |
|
"eval_loss": 0.6954947113990784, |
|
"eval_runtime": 85.256, |
|
"eval_samples_per_second": 2.346, |
|
"eval_steps_per_second": 0.293, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 4.608, |
|
"grad_norm": 0.35114222943495293, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6854, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 4.608, |
|
"eval_loss": 0.6938956379890442, |
|
"eval_runtime": 85.0562, |
|
"eval_samples_per_second": 2.351, |
|
"eval_steps_per_second": 0.294, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 0.36129302808062, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7821, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"eval_loss": 0.6925562024116516, |
|
"eval_runtime": 85.1341, |
|
"eval_samples_per_second": 2.349, |
|
"eval_steps_per_second": 0.294, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 4.672, |
|
"grad_norm": 0.39551012956858894, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7521, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 4.672, |
|
"eval_loss": 0.6914381384849548, |
|
"eval_runtime": 85.152, |
|
"eval_samples_per_second": 2.349, |
|
"eval_steps_per_second": 0.294, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 4.704, |
|
"grad_norm": 0.42650783733532543, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7883, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 4.704, |
|
"eval_loss": 0.6911692023277283, |
|
"eval_runtime": 86.8539, |
|
"eval_samples_per_second": 2.303, |
|
"eval_steps_per_second": 0.288, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 4.736, |
|
"grad_norm": 0.3786582921989863, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7987, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 4.736, |
|
"eval_loss": 0.6914923191070557, |
|
"eval_runtime": 87.0003, |
|
"eval_samples_per_second": 2.299, |
|
"eval_steps_per_second": 0.287, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 4.768, |
|
"grad_norm": 0.3528223035850843, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8181, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 4.768, |
|
"eval_loss": 0.6930768489837646, |
|
"eval_runtime": 86.8453, |
|
"eval_samples_per_second": 2.303, |
|
"eval_steps_per_second": 0.288, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 0.4052106693792139, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7317, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"eval_loss": 0.6946350336074829, |
|
"eval_runtime": 86.808, |
|
"eval_samples_per_second": 2.304, |
|
"eval_steps_per_second": 0.288, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 4.832, |
|
"grad_norm": 0.3739014269672761, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7851, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 4.832, |
|
"eval_loss": 0.6952430605888367, |
|
"eval_runtime": 86.8255, |
|
"eval_samples_per_second": 2.303, |
|
"eval_steps_per_second": 0.288, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 4.864, |
|
"grad_norm": 0.42120491782720065, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6829, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 4.864, |
|
"eval_loss": 0.6938563585281372, |
|
"eval_runtime": 89.8279, |
|
"eval_samples_per_second": 2.226, |
|
"eval_steps_per_second": 0.278, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 4.896, |
|
"grad_norm": 0.40313446713945206, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6972, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 4.896, |
|
"eval_loss": 0.6912936568260193, |
|
"eval_runtime": 89.3777, |
|
"eval_samples_per_second": 2.238, |
|
"eval_steps_per_second": 0.28, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 4.928, |
|
"grad_norm": 0.36052668588306425, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7294, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 4.928, |
|
"eval_loss": 0.6893093585968018, |
|
"eval_runtime": 89.369, |
|
"eval_samples_per_second": 2.238, |
|
"eval_steps_per_second": 0.28, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 0.35889751392140123, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7471, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"eval_loss": 0.6887902021408081, |
|
"eval_runtime": 89.3518, |
|
"eval_samples_per_second": 2.238, |
|
"eval_steps_per_second": 0.28, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 4.992, |
|
"grad_norm": 0.40694329818018776, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6145, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 4.992, |
|
"eval_loss": 0.6877387762069702, |
|
"eval_runtime": 89.5999, |
|
"eval_samples_per_second": 2.232, |
|
"eval_steps_per_second": 0.279, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 5.024, |
|
"grad_norm": 0.4071045470916848, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7106, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 5.024, |
|
"eval_loss": 0.6863316297531128, |
|
"eval_runtime": 91.1516, |
|
"eval_samples_per_second": 2.194, |
|
"eval_steps_per_second": 0.274, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 5.056, |
|
"grad_norm": 0.3825562066811806, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6845, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 5.056, |
|
"eval_loss": 0.6852035522460938, |
|
"eval_runtime": 90.7966, |
|
"eval_samples_per_second": 2.203, |
|
"eval_steps_per_second": 0.275, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 5.088, |
|
"grad_norm": 0.37147714986904934, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6739, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 5.088, |
|
"eval_loss": 0.6840065121650696, |
|
"eval_runtime": 91.6382, |
|
"eval_samples_per_second": 2.182, |
|
"eval_steps_per_second": 0.273, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 5.12, |
|
"grad_norm": 0.38259903152485825, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6777, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 5.12, |
|
"eval_loss": 0.6829774379730225, |
|
"eval_runtime": 90.8166, |
|
"eval_samples_per_second": 2.202, |
|
"eval_steps_per_second": 0.275, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 5.152, |
|
"grad_norm": 0.39777547401791735, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7145, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 5.152, |
|
"eval_loss": 0.682302713394165, |
|
"eval_runtime": 91.0923, |
|
"eval_samples_per_second": 2.196, |
|
"eval_steps_per_second": 0.274, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 5.192, |
|
"grad_norm": 0.3709714989318106, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7251, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 5.192, |
|
"eval_loss": 0.6822090148925781, |
|
"eval_runtime": 90.1282, |
|
"eval_samples_per_second": 2.219, |
|
"eval_steps_per_second": 0.277, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 5.224, |
|
"grad_norm": 0.4046346018620919, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7108, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 5.224, |
|
"eval_loss": 0.6821247935295105, |
|
"eval_runtime": 85.5268, |
|
"eval_samples_per_second": 2.338, |
|
"eval_steps_per_second": 0.292, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 5.256, |
|
"grad_norm": 0.42060496638232386, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7541, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 5.256, |
|
"eval_loss": 0.6818928718566895, |
|
"eval_runtime": 86.5491, |
|
"eval_samples_per_second": 2.311, |
|
"eval_steps_per_second": 0.289, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 5.288, |
|
"grad_norm": 0.3949151390399246, |
|
"learning_rate": 2e-05, |
|
"loss": 0.731, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 5.288, |
|
"eval_loss": 0.6819549798965454, |
|
"eval_runtime": 85.4036, |
|
"eval_samples_per_second": 2.342, |
|
"eval_steps_per_second": 0.293, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"grad_norm": 0.3610134094474086, |
|
"learning_rate": 2e-05, |
|
"loss": 0.711, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"eval_loss": 0.6825198531150818, |
|
"eval_runtime": 85.3022, |
|
"eval_samples_per_second": 2.345, |
|
"eval_steps_per_second": 0.293, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 5.352, |
|
"grad_norm": 0.38905537916660615, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7693, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 5.352, |
|
"eval_loss": 0.682900071144104, |
|
"eval_runtime": 87.5028, |
|
"eval_samples_per_second": 2.286, |
|
"eval_steps_per_second": 0.286, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 5.384, |
|
"grad_norm": 0.4020289142954435, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7257, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 5.384, |
|
"eval_loss": 0.6832457184791565, |
|
"eval_runtime": 86.3175, |
|
"eval_samples_per_second": 2.317, |
|
"eval_steps_per_second": 0.29, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 5.416, |
|
"grad_norm": 0.4136061042465234, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7082, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 5.416, |
|
"eval_loss": 0.6837514638900757, |
|
"eval_runtime": 87.4244, |
|
"eval_samples_per_second": 2.288, |
|
"eval_steps_per_second": 0.286, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 5.448, |
|
"grad_norm": 0.40006410263925274, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6352, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 5.448, |
|
"eval_loss": 0.6845301985740662, |
|
"eval_runtime": 86.3451, |
|
"eval_samples_per_second": 2.316, |
|
"eval_steps_per_second": 0.29, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 5.48, |
|
"grad_norm": 0.40988438997044196, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7485, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 5.48, |
|
"eval_loss": 0.6851826310157776, |
|
"eval_runtime": 86.4269, |
|
"eval_samples_per_second": 2.314, |
|
"eval_steps_per_second": 0.289, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 5.5120000000000005, |
|
"grad_norm": 0.47923323092924647, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6926, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 5.5120000000000005, |
|
"eval_loss": 0.6841108798980713, |
|
"eval_runtime": 86.3221, |
|
"eval_samples_per_second": 2.317, |
|
"eval_steps_per_second": 0.29, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 5.5440000000000005, |
|
"grad_norm": 0.4031545746474779, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6961, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 5.5440000000000005, |
|
"eval_loss": 0.6829754710197449, |
|
"eval_runtime": 85.2735, |
|
"eval_samples_per_second": 2.345, |
|
"eval_steps_per_second": 0.293, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 5.576, |
|
"grad_norm": 0.46444040317934493, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6757, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 5.576, |
|
"eval_loss": 0.6810196042060852, |
|
"eval_runtime": 85.23, |
|
"eval_samples_per_second": 2.347, |
|
"eval_steps_per_second": 0.293, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 5.608, |
|
"grad_norm": 0.40032547211306824, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6465, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 5.608, |
|
"eval_loss": 0.6795651316642761, |
|
"eval_runtime": 85.2935, |
|
"eval_samples_per_second": 2.345, |
|
"eval_steps_per_second": 0.293, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"grad_norm": 0.3975749684060634, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7434, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"eval_loss": 0.6787837147712708, |
|
"eval_runtime": 86.4234, |
|
"eval_samples_per_second": 2.314, |
|
"eval_steps_per_second": 0.289, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 5.672, |
|
"grad_norm": 0.4413863489846678, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7148, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 5.672, |
|
"eval_loss": 0.678077220916748, |
|
"eval_runtime": 86.46, |
|
"eval_samples_per_second": 2.313, |
|
"eval_steps_per_second": 0.289, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 5.704, |
|
"grad_norm": 0.4552334205325458, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7467, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 5.704, |
|
"eval_loss": 0.6782705783843994, |
|
"eval_runtime": 85.7334, |
|
"eval_samples_per_second": 2.333, |
|
"eval_steps_per_second": 0.292, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 5.736, |
|
"grad_norm": 0.4222034129737574, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7573, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 5.736, |
|
"eval_loss": 0.6788575053215027, |
|
"eval_runtime": 85.407, |
|
"eval_samples_per_second": 2.342, |
|
"eval_steps_per_second": 0.293, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 5.768, |
|
"grad_norm": 0.4212365440913614, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7853, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 5.768, |
|
"eval_loss": 0.680314302444458, |
|
"eval_runtime": 85.1528, |
|
"eval_samples_per_second": 2.349, |
|
"eval_steps_per_second": 0.294, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 5.8, |
|
"grad_norm": 0.47040418573969534, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6941, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 5.8, |
|
"eval_loss": 0.6814693212509155, |
|
"eval_runtime": 85.2668, |
|
"eval_samples_per_second": 2.346, |
|
"eval_steps_per_second": 0.293, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 5.832, |
|
"grad_norm": 0.43506164207204023, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7466, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 5.832, |
|
"eval_loss": 0.6818942427635193, |
|
"eval_runtime": 86.1082, |
|
"eval_samples_per_second": 2.323, |
|
"eval_steps_per_second": 0.29, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 5.864, |
|
"grad_norm": 0.4851524205448296, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6414, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 5.864, |
|
"eval_loss": 0.6807515621185303, |
|
"eval_runtime": 85.2905, |
|
"eval_samples_per_second": 2.345, |
|
"eval_steps_per_second": 0.293, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 5.896, |
|
"grad_norm": 0.46212982880574544, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6594, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 5.896, |
|
"eval_loss": 0.6793842911720276, |
|
"eval_runtime": 85.2531, |
|
"eval_samples_per_second": 2.346, |
|
"eval_steps_per_second": 0.293, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 5.928, |
|
"grad_norm": 0.43483234178092045, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6927, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 5.928, |
|
"eval_loss": 0.6785325407981873, |
|
"eval_runtime": 86.4294, |
|
"eval_samples_per_second": 2.314, |
|
"eval_steps_per_second": 0.289, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 5.96, |
|
"grad_norm": 0.45461536176049777, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7127, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 5.96, |
|
"eval_loss": 0.6785117983818054, |
|
"eval_runtime": 86.5612, |
|
"eval_samples_per_second": 2.311, |
|
"eval_steps_per_second": 0.289, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 5.992, |
|
"grad_norm": 0.5124892629103449, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5778, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 5.992, |
|
"eval_loss": 0.6772163510322571, |
|
"eval_runtime": 85.3177, |
|
"eval_samples_per_second": 2.344, |
|
"eval_steps_per_second": 0.293, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 6.024, |
|
"grad_norm": 0.4872469973004331, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7045, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 6.024, |
|
"eval_loss": 0.6760932207107544, |
|
"eval_runtime": 85.1785, |
|
"eval_samples_per_second": 2.348, |
|
"eval_steps_per_second": 0.294, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 6.056, |
|
"grad_norm": 0.43317759363804015, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6121, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 6.056, |
|
"eval_loss": 0.6763756275177002, |
|
"eval_runtime": 85.4466, |
|
"eval_samples_per_second": 2.341, |
|
"eval_steps_per_second": 0.293, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 6.088, |
|
"grad_norm": 0.47411518505747885, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7409, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 6.088, |
|
"eval_loss": 0.6757389903068542, |
|
"eval_runtime": 85.3382, |
|
"eval_samples_per_second": 2.344, |
|
"eval_steps_per_second": 0.293, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 6.12, |
|
"grad_norm": 0.4971851748274855, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7193, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 6.12, |
|
"eval_loss": 0.6749419569969177, |
|
"eval_runtime": 85.4198, |
|
"eval_samples_per_second": 2.341, |
|
"eval_steps_per_second": 0.293, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 6.152, |
|
"grad_norm": 0.46832302038313683, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7413, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 6.152, |
|
"eval_loss": 0.674567699432373, |
|
"eval_runtime": 85.6429, |
|
"eval_samples_per_second": 2.335, |
|
"eval_steps_per_second": 0.292, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 6.184, |
|
"grad_norm": 0.47651234347196103, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7113, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 6.184, |
|
"eval_loss": 0.6739189028739929, |
|
"eval_runtime": 85.3814, |
|
"eval_samples_per_second": 2.342, |
|
"eval_steps_per_second": 0.293, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 6.216, |
|
"grad_norm": 0.4808945978374079, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6603, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 6.216, |
|
"eval_loss": 0.6737512350082397, |
|
"eval_runtime": 85.1781, |
|
"eval_samples_per_second": 2.348, |
|
"eval_steps_per_second": 0.294, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 6.248, |
|
"grad_norm": 0.45747741700806654, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6905, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 6.248, |
|
"eval_loss": 0.6738162040710449, |
|
"eval_runtime": 86.5609, |
|
"eval_samples_per_second": 2.311, |
|
"eval_steps_per_second": 0.289, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 6.28, |
|
"grad_norm": 0.49033746737240025, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7373, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 6.28, |
|
"eval_loss": 0.6742382645606995, |
|
"eval_runtime": 86.4284, |
|
"eval_samples_per_second": 2.314, |
|
"eval_steps_per_second": 0.289, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 6.312, |
|
"grad_norm": 0.5438084044824532, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6819, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 6.312, |
|
"eval_loss": 0.674878716468811, |
|
"eval_runtime": 85.7504, |
|
"eval_samples_per_second": 2.332, |
|
"eval_steps_per_second": 0.292, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 6.344, |
|
"grad_norm": 0.4631516087052852, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6775, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 6.344, |
|
"eval_loss": 0.6761616468429565, |
|
"eval_runtime": 85.417, |
|
"eval_samples_per_second": 2.341, |
|
"eval_steps_per_second": 0.293, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 6.376, |
|
"grad_norm": 0.49177247669398155, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6605, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 6.376, |
|
"eval_loss": 0.6770765781402588, |
|
"eval_runtime": 85.4274, |
|
"eval_samples_per_second": 2.341, |
|
"eval_steps_per_second": 0.293, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 6.408, |
|
"grad_norm": 0.5177407926775024, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7136, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 6.408, |
|
"eval_loss": 0.6772163510322571, |
|
"eval_runtime": 85.2682, |
|
"eval_samples_per_second": 2.346, |
|
"eval_steps_per_second": 0.293, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 6.44, |
|
"grad_norm": 0.5385213429977403, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6809, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 6.44, |
|
"eval_loss": 0.6758923530578613, |
|
"eval_runtime": 85.4858, |
|
"eval_samples_per_second": 2.34, |
|
"eval_steps_per_second": 0.292, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 6.4719999999999995, |
|
"grad_norm": 0.4982626204598202, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7159, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 6.4719999999999995, |
|
"eval_loss": 0.675208568572998, |
|
"eval_runtime": 86.2523, |
|
"eval_samples_per_second": 2.319, |
|
"eval_steps_per_second": 0.29, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 6.504, |
|
"grad_norm": 0.4710756307884673, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6309, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 6.504, |
|
"eval_loss": 0.6743338108062744, |
|
"eval_runtime": 86.6636, |
|
"eval_samples_per_second": 2.308, |
|
"eval_steps_per_second": 0.288, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 6.536, |
|
"grad_norm": 0.5127505608717865, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6257, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 6.536, |
|
"eval_loss": 0.6735503673553467, |
|
"eval_runtime": 86.3216, |
|
"eval_samples_per_second": 2.317, |
|
"eval_steps_per_second": 0.29, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 6.568, |
|
"grad_norm": 0.48812419654399086, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6164, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 6.568, |
|
"eval_loss": 0.6740123629570007, |
|
"eval_runtime": 86.3491, |
|
"eval_samples_per_second": 2.316, |
|
"eval_steps_per_second": 0.29, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 6.6, |
|
"grad_norm": 0.5031408070515696, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6765, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 6.6, |
|
"eval_loss": 0.6746988892555237, |
|
"eval_runtime": 86.2475, |
|
"eval_samples_per_second": 2.319, |
|
"eval_steps_per_second": 0.29, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 6.632, |
|
"grad_norm": 0.5221751920115928, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6836, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 6.632, |
|
"eval_loss": 0.6757599711418152, |
|
"eval_runtime": 90.9953, |
|
"eval_samples_per_second": 2.198, |
|
"eval_steps_per_second": 0.275, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 6.664, |
|
"grad_norm": 0.49611473039131815, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6809, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 6.664, |
|
"eval_loss": 0.676249086856842, |
|
"eval_runtime": 88.6484, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.282, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 6.696, |
|
"grad_norm": 0.5646771313169766, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6905, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 6.696, |
|
"eval_loss": 0.675098717212677, |
|
"eval_runtime": 88.7358, |
|
"eval_samples_per_second": 2.254, |
|
"eval_steps_per_second": 0.282, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 6.728, |
|
"grad_norm": 0.5075133396143146, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7001, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 6.728, |
|
"eval_loss": 0.6734683513641357, |
|
"eval_runtime": 88.7108, |
|
"eval_samples_per_second": 2.255, |
|
"eval_steps_per_second": 0.282, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 6.76, |
|
"grad_norm": 0.5292004993716772, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6366, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 6.76, |
|
"eval_loss": 0.6727490425109863, |
|
"eval_runtime": 88.7397, |
|
"eval_samples_per_second": 2.254, |
|
"eval_steps_per_second": 0.282, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 6.792, |
|
"grad_norm": 0.5508154729937994, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6627, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 6.792, |
|
"eval_loss": 0.6719673275947571, |
|
"eval_runtime": 94.3959, |
|
"eval_samples_per_second": 2.119, |
|
"eval_steps_per_second": 0.265, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 6.824, |
|
"grad_norm": 0.5436944297369074, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6939, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 6.824, |
|
"eval_loss": 0.6717627048492432, |
|
"eval_runtime": 92.6409, |
|
"eval_samples_per_second": 2.159, |
|
"eval_steps_per_second": 0.27, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 6.856, |
|
"grad_norm": 0.563836681781508, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6715, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 6.856, |
|
"eval_loss": 0.6704577803611755, |
|
"eval_runtime": 92.7628, |
|
"eval_samples_per_second": 2.156, |
|
"eval_steps_per_second": 0.27, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 6.888, |
|
"grad_norm": 0.5903391746928088, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6706, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 6.888, |
|
"eval_loss": 0.6705368161201477, |
|
"eval_runtime": 93.0342, |
|
"eval_samples_per_second": 2.15, |
|
"eval_steps_per_second": 0.269, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 6.92, |
|
"grad_norm": 0.5044604071023134, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6308, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 6.92, |
|
"eval_loss": 0.6709109544754028, |
|
"eval_runtime": 92.9865, |
|
"eval_samples_per_second": 2.151, |
|
"eval_steps_per_second": 0.269, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 6.96, |
|
"grad_norm": 0.5029981251789745, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6565, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 6.96, |
|
"eval_loss": 0.6729848384857178, |
|
"eval_runtime": 91.2007, |
|
"eval_samples_per_second": 2.193, |
|
"eval_steps_per_second": 0.274, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 6.992, |
|
"grad_norm": 0.662623585564011, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5311, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 6.992, |
|
"eval_loss": 0.6713245511054993, |
|
"eval_runtime": 85.2866, |
|
"eval_samples_per_second": 2.345, |
|
"eval_steps_per_second": 0.293, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 7.024, |
|
"grad_norm": 0.6256446136768937, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6022, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 7.024, |
|
"eval_loss": 0.6695873737335205, |
|
"eval_runtime": 86.4021, |
|
"eval_samples_per_second": 2.315, |
|
"eval_steps_per_second": 0.289, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 7.056, |
|
"grad_norm": 0.4857229274218417, |
|
"learning_rate": 2e-05, |
|
"loss": 0.647, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 7.056, |
|
"eval_loss": 0.6719114780426025, |
|
"eval_runtime": 86.526, |
|
"eval_samples_per_second": 2.311, |
|
"eval_steps_per_second": 0.289, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 7.088, |
|
"grad_norm": 0.5361101044951209, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6477, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 7.088, |
|
"eval_loss": 0.6757076978683472, |
|
"eval_runtime": 85.4011, |
|
"eval_samples_per_second": 2.342, |
|
"eval_steps_per_second": 0.293, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 7.12, |
|
"grad_norm": 0.8396027685018896, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6112, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 7.12, |
|
"eval_loss": 0.6758625507354736, |
|
"eval_runtime": 85.2083, |
|
"eval_samples_per_second": 2.347, |
|
"eval_steps_per_second": 0.293, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 7.152, |
|
"grad_norm": 0.5858149591099446, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6826, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 7.152, |
|
"eval_loss": 0.6765357255935669, |
|
"eval_runtime": 85.5627, |
|
"eval_samples_per_second": 2.337, |
|
"eval_steps_per_second": 0.292, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 7.184, |
|
"grad_norm": 0.5694999654835196, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5851, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 7.184, |
|
"eval_loss": 0.6776654124259949, |
|
"eval_runtime": 85.417, |
|
"eval_samples_per_second": 2.341, |
|
"eval_steps_per_second": 0.293, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 7.216, |
|
"grad_norm": 0.6001772847123094, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6633, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 7.216, |
|
"eval_loss": 0.6783779859542847, |
|
"eval_runtime": 85.1623, |
|
"eval_samples_per_second": 2.348, |
|
"eval_steps_per_second": 0.294, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 7.248, |
|
"grad_norm": 0.6068993188167514, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6275, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 7.248, |
|
"eval_loss": 0.6755834817886353, |
|
"eval_runtime": 85.3711, |
|
"eval_samples_per_second": 2.343, |
|
"eval_steps_per_second": 0.293, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 7.28, |
|
"grad_norm": 0.6038060153225616, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6319, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 7.28, |
|
"eval_loss": 0.6720392107963562, |
|
"eval_runtime": 86.1134, |
|
"eval_samples_per_second": 2.323, |
|
"eval_steps_per_second": 0.29, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 7.312, |
|
"grad_norm": 0.5900082642978601, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6417, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 7.312, |
|
"eval_loss": 0.6699540615081787, |
|
"eval_runtime": 87.4261, |
|
"eval_samples_per_second": 2.288, |
|
"eval_steps_per_second": 0.286, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 7.344, |
|
"grad_norm": 0.6303979703934064, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5954, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 7.344, |
|
"eval_loss": 0.6697332859039307, |
|
"eval_runtime": 86.3137, |
|
"eval_samples_per_second": 2.317, |
|
"eval_steps_per_second": 0.29, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 7.376, |
|
"grad_norm": 0.5896270303292949, |
|
"learning_rate": 2e-05, |
|
"loss": 0.638, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 7.376, |
|
"eval_loss": 0.6699292063713074, |
|
"eval_runtime": 85.4946, |
|
"eval_samples_per_second": 2.339, |
|
"eval_steps_per_second": 0.292, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 7.408, |
|
"grad_norm": 0.5499007579825991, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6312, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 7.408, |
|
"eval_loss": 0.6695923805236816, |
|
"eval_runtime": 85.1955, |
|
"eval_samples_per_second": 2.348, |
|
"eval_steps_per_second": 0.293, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 7.44, |
|
"grad_norm": 0.5806994466204508, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6487, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 7.44, |
|
"eval_loss": 0.670379638671875, |
|
"eval_runtime": 85.2487, |
|
"eval_samples_per_second": 2.346, |
|
"eval_steps_per_second": 0.293, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 7.4719999999999995, |
|
"grad_norm": 0.6171318222234403, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6497, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 7.4719999999999995, |
|
"eval_loss": 0.671440601348877, |
|
"eval_runtime": 85.4044, |
|
"eval_samples_per_second": 2.342, |
|
"eval_steps_per_second": 0.293, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 7.504, |
|
"grad_norm": 0.6077497033087486, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6388, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 7.504, |
|
"eval_loss": 0.6715556383132935, |
|
"eval_runtime": 85.7739, |
|
"eval_samples_per_second": 2.332, |
|
"eval_steps_per_second": 0.291, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 7.536, |
|
"grad_norm": 0.6333159810332618, |
|
"learning_rate": 2e-05, |
|
"loss": 0.636, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 7.536, |
|
"eval_loss": 0.6708941459655762, |
|
"eval_runtime": 85.2691, |
|
"eval_samples_per_second": 2.346, |
|
"eval_steps_per_second": 0.293, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 7.568, |
|
"grad_norm": 0.6022274503734126, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6455, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 7.568, |
|
"eval_loss": 0.6690527200698853, |
|
"eval_runtime": 85.4144, |
|
"eval_samples_per_second": 2.342, |
|
"eval_steps_per_second": 0.293, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 7.608, |
|
"grad_norm": 0.5296025255848918, |
|
"learning_rate": 2e-05, |
|
"loss": 0.572, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 7.608, |
|
"eval_loss": 0.6683849096298218, |
|
"eval_runtime": 89.92, |
|
"eval_samples_per_second": 2.224, |
|
"eval_steps_per_second": 0.278, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"grad_norm": 0.5436886467794938, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6681, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"eval_loss": 0.67000412940979, |
|
"eval_runtime": 85.2548, |
|
"eval_samples_per_second": 2.346, |
|
"eval_steps_per_second": 0.293, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 7.672, |
|
"grad_norm": 0.5833714563537171, |
|
"learning_rate": 2e-05, |
|
"loss": 0.646, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 7.672, |
|
"eval_loss": 0.6720954179763794, |
|
"eval_runtime": 85.4363, |
|
"eval_samples_per_second": 2.341, |
|
"eval_steps_per_second": 0.293, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 7.704, |
|
"grad_norm": 0.6833890117857615, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6641, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 7.704, |
|
"eval_loss": 0.6737973093986511, |
|
"eval_runtime": 85.2541, |
|
"eval_samples_per_second": 2.346, |
|
"eval_steps_per_second": 0.293, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 7.736, |
|
"grad_norm": 0.5832421680011252, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6742, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 7.736, |
|
"eval_loss": 0.6757528185844421, |
|
"eval_runtime": 85.2234, |
|
"eval_samples_per_second": 2.347, |
|
"eval_steps_per_second": 0.293, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 7.768, |
|
"grad_norm": 0.5843876495624203, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7069, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 7.768, |
|
"eval_loss": 0.6778927445411682, |
|
"eval_runtime": 85.2049, |
|
"eval_samples_per_second": 2.347, |
|
"eval_steps_per_second": 0.293, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"grad_norm": 0.6527712899983633, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6182, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"eval_loss": 0.6785970330238342, |
|
"eval_runtime": 84.9435, |
|
"eval_samples_per_second": 2.355, |
|
"eval_steps_per_second": 0.294, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 7.832, |
|
"grad_norm": 0.6228341483848424, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6633, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 7.832, |
|
"eval_loss": 0.678627610206604, |
|
"eval_runtime": 85.2349, |
|
"eval_samples_per_second": 2.346, |
|
"eval_steps_per_second": 0.293, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 7.864, |
|
"grad_norm": 0.6762374705072328, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5581, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 7.864, |
|
"eval_loss": 0.6781509518623352, |
|
"eval_runtime": 85.0862, |
|
"eval_samples_per_second": 2.351, |
|
"eval_steps_per_second": 0.294, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 7.896, |
|
"grad_norm": 0.6530004154367571, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5896, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 7.896, |
|
"eval_loss": 0.6776159405708313, |
|
"eval_runtime": 85.4386, |
|
"eval_samples_per_second": 2.341, |
|
"eval_steps_per_second": 0.293, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 7.928, |
|
"grad_norm": 0.6496264347077455, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6262, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 7.928, |
|
"eval_loss": 0.6762803792953491, |
|
"eval_runtime": 86.0069, |
|
"eval_samples_per_second": 2.325, |
|
"eval_steps_per_second": 0.291, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"grad_norm": 0.6530394584817848, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6044, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"eval_loss": 0.6763593554496765, |
|
"eval_runtime": 86.5382, |
|
"eval_samples_per_second": 2.311, |
|
"eval_steps_per_second": 0.289, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 7.992, |
|
"grad_norm": 0.7165450399528321, |
|
"learning_rate": 2e-05, |
|
"loss": 0.4777, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 7.992, |
|
"eval_loss": 0.6767419576644897, |
|
"eval_runtime": 85.3445, |
|
"eval_samples_per_second": 2.343, |
|
"eval_steps_per_second": 0.293, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 8.024, |
|
"grad_norm": 0.6210079733679161, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6113, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 8.024, |
|
"eval_loss": 0.6772445440292358, |
|
"eval_runtime": 85.4211, |
|
"eval_samples_per_second": 2.341, |
|
"eval_steps_per_second": 0.293, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 8.056, |
|
"grad_norm": 0.6078116340925231, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6133, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 8.056, |
|
"eval_loss": 0.6801083087921143, |
|
"eval_runtime": 85.1688, |
|
"eval_samples_per_second": 2.348, |
|
"eval_steps_per_second": 0.294, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 8.088, |
|
"grad_norm": 0.6584954900058523, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6234, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 8.088, |
|
"eval_loss": 0.680172324180603, |
|
"eval_runtime": 85.2854, |
|
"eval_samples_per_second": 2.345, |
|
"eval_steps_per_second": 0.293, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 8.12, |
|
"grad_norm": 0.6410123072973634, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6768, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 8.12, |
|
"eval_loss": 0.6790580749511719, |
|
"eval_runtime": 85.343, |
|
"eval_samples_per_second": 2.343, |
|
"eval_steps_per_second": 0.293, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 8.152, |
|
"grad_norm": 0.7068905769473427, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6308, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 8.152, |
|
"eval_loss": 0.6779585480690002, |
|
"eval_runtime": 85.2579, |
|
"eval_samples_per_second": 2.346, |
|
"eval_steps_per_second": 0.293, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 8.184, |
|
"grad_norm": 0.6941639704688177, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6651, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 8.184, |
|
"eval_loss": 0.6783471703529358, |
|
"eval_runtime": 85.2368, |
|
"eval_samples_per_second": 2.346, |
|
"eval_steps_per_second": 0.293, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 8.216, |
|
"grad_norm": 0.7347551538754563, |
|
"learning_rate": 2e-05, |
|
"loss": 0.643, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 8.216, |
|
"eval_loss": 0.6772164702415466, |
|
"eval_runtime": 86.2786, |
|
"eval_samples_per_second": 2.318, |
|
"eval_steps_per_second": 0.29, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 8.248, |
|
"grad_norm": 0.7690902453226406, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6178, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 8.248, |
|
"eval_loss": 0.67960524559021, |
|
"eval_runtime": 86.3356, |
|
"eval_samples_per_second": 2.317, |
|
"eval_steps_per_second": 0.29, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"grad_norm": 0.6534589041693806, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6231, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"eval_loss": 0.683942437171936, |
|
"eval_runtime": 86.114, |
|
"eval_samples_per_second": 2.323, |
|
"eval_steps_per_second": 0.29, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 8.312, |
|
"grad_norm": 0.7620000857656035, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5937, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 8.312, |
|
"eval_loss": 0.6850832104682922, |
|
"eval_runtime": 86.3659, |
|
"eval_samples_per_second": 2.316, |
|
"eval_steps_per_second": 0.289, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 8.344, |
|
"grad_norm": 0.8769311371151648, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6288, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 8.344, |
|
"eval_loss": 0.6806495189666748, |
|
"eval_runtime": 86.2011, |
|
"eval_samples_per_second": 2.32, |
|
"eval_steps_per_second": 0.29, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 8.376, |
|
"grad_norm": 0.7549996230143433, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5614, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 8.376, |
|
"eval_loss": 0.6746546030044556, |
|
"eval_runtime": 86.0828, |
|
"eval_samples_per_second": 2.323, |
|
"eval_steps_per_second": 0.29, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 8.408, |
|
"grad_norm": 0.6678277921019138, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5818, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 8.408, |
|
"eval_loss": 0.6705954074859619, |
|
"eval_runtime": 86.9228, |
|
"eval_samples_per_second": 2.301, |
|
"eval_steps_per_second": 0.288, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 8.44, |
|
"grad_norm": 0.6629861523432089, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6231, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 8.44, |
|
"eval_loss": 0.6688622832298279, |
|
"eval_runtime": 86.644, |
|
"eval_samples_per_second": 2.308, |
|
"eval_steps_per_second": 0.289, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 8.472, |
|
"grad_norm": 0.7468331552698385, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6221, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 8.472, |
|
"eval_loss": 0.6675601005554199, |
|
"eval_runtime": 86.5385, |
|
"eval_samples_per_second": 2.311, |
|
"eval_steps_per_second": 0.289, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 8.504, |
|
"grad_norm": 0.6718735805762622, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5989, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 8.504, |
|
"eval_loss": 0.6682644486427307, |
|
"eval_runtime": 86.5137, |
|
"eval_samples_per_second": 2.312, |
|
"eval_steps_per_second": 0.289, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 8.536, |
|
"grad_norm": 0.7360990456326049, |
|
"learning_rate": 2e-05, |
|
"loss": 0.586, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 8.536, |
|
"eval_loss": 0.670520544052124, |
|
"eval_runtime": 86.3921, |
|
"eval_samples_per_second": 2.315, |
|
"eval_steps_per_second": 0.289, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 8.568, |
|
"grad_norm": 0.7372365755868506, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6154, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 8.568, |
|
"eval_loss": 0.6722339391708374, |
|
"eval_runtime": 86.4486, |
|
"eval_samples_per_second": 2.314, |
|
"eval_steps_per_second": 0.289, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"grad_norm": 0.7691674703908615, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5759, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"eval_loss": 0.6752627491950989, |
|
"eval_runtime": 86.3478, |
|
"eval_samples_per_second": 2.316, |
|
"eval_steps_per_second": 0.29, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 8.632, |
|
"grad_norm": 0.7037334988016319, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5808, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 8.632, |
|
"eval_loss": 0.6786094903945923, |
|
"eval_runtime": 86.3221, |
|
"eval_samples_per_second": 2.317, |
|
"eval_steps_per_second": 0.29, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 8.664, |
|
"grad_norm": 0.7364875762471698, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6381, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 8.664, |
|
"eval_loss": 0.6802875399589539, |
|
"eval_runtime": 87.3858, |
|
"eval_samples_per_second": 2.289, |
|
"eval_steps_per_second": 0.286, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 8.696, |
|
"grad_norm": 0.772443884505786, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5779, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 8.696, |
|
"eval_loss": 0.6788821220397949, |
|
"eval_runtime": 86.2832, |
|
"eval_samples_per_second": 2.318, |
|
"eval_steps_per_second": 0.29, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 8.728, |
|
"grad_norm": 0.8243245423024692, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5899, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 8.728, |
|
"eval_loss": 0.6770071983337402, |
|
"eval_runtime": 89.4282, |
|
"eval_samples_per_second": 2.236, |
|
"eval_steps_per_second": 0.28, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 8.76, |
|
"grad_norm": 0.7241934433417714, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6323, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 8.76, |
|
"eval_loss": 0.6750556826591492, |
|
"eval_runtime": 88.2376, |
|
"eval_samples_per_second": 2.267, |
|
"eval_steps_per_second": 0.283, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 8.792, |
|
"grad_norm": 0.7031539699096522, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5878, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 8.792, |
|
"eval_loss": 0.6727312207221985, |
|
"eval_runtime": 88.2462, |
|
"eval_samples_per_second": 2.266, |
|
"eval_steps_per_second": 0.283, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 8.824, |
|
"grad_norm": 0.7218931138049051, |
|
"learning_rate": 2e-05, |
|
"loss": 0.613, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 8.824, |
|
"eval_loss": 0.6727555990219116, |
|
"eval_runtime": 88.3872, |
|
"eval_samples_per_second": 2.263, |
|
"eval_steps_per_second": 0.283, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 8.856, |
|
"grad_norm": 0.7231490645694756, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6315, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 8.856, |
|
"eval_loss": 0.6705790758132935, |
|
"eval_runtime": 87.9218, |
|
"eval_samples_per_second": 2.275, |
|
"eval_steps_per_second": 0.284, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 8.888, |
|
"grad_norm": 0.7051718905755886, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6076, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 8.888, |
|
"eval_loss": 0.6689162254333496, |
|
"eval_runtime": 91.8597, |
|
"eval_samples_per_second": 2.177, |
|
"eval_steps_per_second": 0.272, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"grad_norm": 0.7328110944982523, |
|
"learning_rate": 2e-05, |
|
"loss": 0.624, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"eval_loss": 0.6683139204978943, |
|
"eval_runtime": 92.6652, |
|
"eval_samples_per_second": 2.158, |
|
"eval_steps_per_second": 0.27, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 8.952, |
|
"grad_norm": 0.7116677024113118, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6078, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 8.952, |
|
"eval_loss": 0.6700756549835205, |
|
"eval_runtime": 91.0374, |
|
"eval_samples_per_second": 2.197, |
|
"eval_steps_per_second": 0.275, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 8.984, |
|
"grad_norm": 0.7461165978892803, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6104, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 8.984, |
|
"eval_loss": 0.670260488986969, |
|
"eval_runtime": 90.6087, |
|
"eval_samples_per_second": 2.207, |
|
"eval_steps_per_second": 0.276, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 9.016, |
|
"grad_norm": 0.73533421631475, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5993, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 9.016, |
|
"eval_loss": 0.6718578934669495, |
|
"eval_runtime": 90.678, |
|
"eval_samples_per_second": 2.206, |
|
"eval_steps_per_second": 0.276, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 9.064, |
|
"grad_norm": 0.7141681250783954, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5763, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 9.064, |
|
"eval_loss": 0.6721769571304321, |
|
"eval_runtime": 89.905, |
|
"eval_samples_per_second": 2.225, |
|
"eval_steps_per_second": 0.278, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 9.096, |
|
"grad_norm": 0.7179191597134931, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6039, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 9.096, |
|
"eval_loss": 0.670803964138031, |
|
"eval_runtime": 85.2808, |
|
"eval_samples_per_second": 2.345, |
|
"eval_steps_per_second": 0.293, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 9.128, |
|
"grad_norm": 0.6677686603528123, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6018, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 9.128, |
|
"eval_loss": 0.67020583152771, |
|
"eval_runtime": 85.394, |
|
"eval_samples_per_second": 2.342, |
|
"eval_steps_per_second": 0.293, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 9.16, |
|
"grad_norm": 0.7730086322440611, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5776, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 9.16, |
|
"eval_loss": 0.6687878370285034, |
|
"eval_runtime": 85.3242, |
|
"eval_samples_per_second": 2.344, |
|
"eval_steps_per_second": 0.293, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 9.192, |
|
"grad_norm": 0.6910229929978119, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5947, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 9.192, |
|
"eval_loss": 0.6686851978302002, |
|
"eval_runtime": 85.4375, |
|
"eval_samples_per_second": 2.341, |
|
"eval_steps_per_second": 0.293, |
|
"step": 285 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 310, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 5, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 612422111199232.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|