diff --git "a/checkpoint-285/trainer_state.json" "b/checkpoint-285/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-285/trainer_state.json" @@ -0,0 +1,4308 @@ +{ + "best_metric": 0.6686851978302002, + "best_model_checkpoint": "./checkpoints/llava-v1.6-vicuna-13b/checkpoint-285", + "epoch": 9.192, + "eval_steps": 1.0, + "global_step": 285, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.032, + "grad_norm": 0.5758810052676581, + "learning_rate": 0.0, + "loss": 1.5784, + "step": 1 + }, + { + "epoch": 0.032, + "eval_loss": 1.614479660987854, + "eval_runtime": 90.1495, + "eval_samples_per_second": 2.219, + "eval_steps_per_second": 0.277, + "step": 1 + }, + { + "epoch": 0.064, + "grad_norm": 0.5732524292532967, + "learning_rate": 1.2618595071429148e-05, + "loss": 1.496, + "step": 2 + }, + { + "epoch": 0.064, + "eval_loss": 1.614479660987854, + "eval_runtime": 85.9035, + "eval_samples_per_second": 2.328, + "eval_steps_per_second": 0.291, + "step": 2 + }, + { + "epoch": 0.096, + "grad_norm": 0.5752113482534317, + "learning_rate": 2e-05, + "loss": 1.5565, + "step": 3 + }, + { + "epoch": 0.096, + "eval_loss": 1.5945543050765991, + "eval_runtime": 85.9029, + "eval_samples_per_second": 2.328, + "eval_steps_per_second": 0.291, + "step": 3 + }, + { + "epoch": 0.128, + "grad_norm": 0.46566701161868557, + "learning_rate": 2e-05, + "loss": 1.4933, + "step": 4 + }, + { + "epoch": 0.128, + "eval_loss": 1.5579420328140259, + "eval_runtime": 85.9412, + "eval_samples_per_second": 2.327, + "eval_steps_per_second": 0.291, + "step": 4 + }, + { + "epoch": 0.16, + "grad_norm": 0.6298906192739324, + "learning_rate": 2e-05, + "loss": 1.5177, + "step": 5 + }, + { + "epoch": 0.16, + "eval_loss": 1.519984483718872, + "eval_runtime": 85.9435, + "eval_samples_per_second": 2.327, + "eval_steps_per_second": 0.291, + "step": 5 + }, + { + "epoch": 0.192, + "grad_norm": 0.5629546129758171, + "learning_rate": 2e-05, + "loss": 1.4806, + "step": 6 + }, + { + "epoch": 0.192, + "eval_loss": 1.4810457229614258, + "eval_runtime": 86.8543, + "eval_samples_per_second": 2.303, + "eval_steps_per_second": 0.288, + "step": 6 + }, + { + "epoch": 0.224, + "grad_norm": 0.5629546129758171, + "learning_rate": 2e-05, + "loss": 1.4426, + "step": 7 + }, + { + "epoch": 0.224, + "eval_loss": 1.4810457229614258, + "eval_runtime": 85.5769, + "eval_samples_per_second": 2.337, + "eval_steps_per_second": 0.292, + "step": 7 + }, + { + "epoch": 0.256, + "grad_norm": 0.5629546129758171, + "learning_rate": 2e-05, + "loss": 1.487, + "step": 8 + }, + { + "epoch": 0.256, + "eval_loss": 1.4810457229614258, + "eval_runtime": 85.5166, + "eval_samples_per_second": 2.339, + "eval_steps_per_second": 0.292, + "step": 8 + }, + { + "epoch": 0.288, + "grad_norm": 0.5225734696456765, + "learning_rate": 2e-05, + "loss": 1.4824, + "step": 9 + }, + { + "epoch": 0.288, + "eval_loss": 1.4472432136535645, + "eval_runtime": 85.6273, + "eval_samples_per_second": 2.336, + "eval_steps_per_second": 0.292, + "step": 9 + }, + { + "epoch": 0.32, + "grad_norm": 1.5120766386106574, + "learning_rate": 2e-05, + "loss": 1.4055, + "step": 10 + }, + { + "epoch": 0.32, + "eval_loss": 1.431533694267273, + "eval_runtime": 85.1358, + "eval_samples_per_second": 2.349, + "eval_steps_per_second": 0.294, + "step": 10 + }, + { + "epoch": 0.352, + "grad_norm": 1.5120766386106574, + "learning_rate": 2e-05, + "loss": 1.4374, + "step": 11 + }, + { + "epoch": 0.352, + "eval_loss": 1.431533694267273, + "eval_runtime": 85.8174, + "eval_samples_per_second": 2.331, + "eval_steps_per_second": 0.291, + "step": 11 + }, + { + "epoch": 0.384, + "grad_norm": 1.9218280445348435, + "learning_rate": 2e-05, + "loss": 1.4128, + "step": 12 + }, + { + "epoch": 0.384, + "eval_loss": 1.4020923376083374, + "eval_runtime": 85.7769, + "eval_samples_per_second": 2.332, + "eval_steps_per_second": 0.291, + "step": 12 + }, + { + "epoch": 0.416, + "grad_norm": 1.5494392795931824, + "learning_rate": 2e-05, + "loss": 1.4671, + "step": 13 + }, + { + "epoch": 0.416, + "eval_loss": 1.3614002466201782, + "eval_runtime": 85.7302, + "eval_samples_per_second": 2.333, + "eval_steps_per_second": 0.292, + "step": 13 + }, + { + "epoch": 0.448, + "grad_norm": 2.3567445757054766, + "learning_rate": 2e-05, + "loss": 1.2809, + "step": 14 + }, + { + "epoch": 0.448, + "eval_loss": 1.3194799423217773, + "eval_runtime": 85.9134, + "eval_samples_per_second": 2.328, + "eval_steps_per_second": 0.291, + "step": 14 + }, + { + "epoch": 0.48, + "grad_norm": 1.5106182972290174, + "learning_rate": 2e-05, + "loss": 1.3338, + "step": 15 + }, + { + "epoch": 0.48, + "eval_loss": 1.2909258604049683, + "eval_runtime": 86.1624, + "eval_samples_per_second": 2.321, + "eval_steps_per_second": 0.29, + "step": 15 + }, + { + "epoch": 0.512, + "grad_norm": 0.6480008459041514, + "learning_rate": 2e-05, + "loss": 1.2888, + "step": 16 + }, + { + "epoch": 0.512, + "eval_loss": 1.267655611038208, + "eval_runtime": 85.1961, + "eval_samples_per_second": 2.348, + "eval_steps_per_second": 0.293, + "step": 16 + }, + { + "epoch": 0.544, + "grad_norm": 0.5742819355565492, + "learning_rate": 2e-05, + "loss": 1.265, + "step": 17 + }, + { + "epoch": 0.544, + "eval_loss": 1.243842601776123, + "eval_runtime": 85.3262, + "eval_samples_per_second": 2.344, + "eval_steps_per_second": 0.293, + "step": 17 + }, + { + "epoch": 0.576, + "grad_norm": 0.6388701851382904, + "learning_rate": 2e-05, + "loss": 1.2662, + "step": 18 + }, + { + "epoch": 0.576, + "eval_loss": 1.2195556163787842, + "eval_runtime": 85.1951, + "eval_samples_per_second": 2.348, + "eval_steps_per_second": 0.293, + "step": 18 + }, + { + "epoch": 0.608, + "grad_norm": 0.4940836425011853, + "learning_rate": 2e-05, + "loss": 1.136, + "step": 19 + }, + { + "epoch": 0.608, + "eval_loss": 1.1959036588668823, + "eval_runtime": 85.3064, + "eval_samples_per_second": 2.344, + "eval_steps_per_second": 0.293, + "step": 19 + }, + { + "epoch": 0.64, + "grad_norm": 0.48425296045894156, + "learning_rate": 2e-05, + "loss": 1.2361, + "step": 20 + }, + { + "epoch": 0.64, + "eval_loss": 1.1732313632965088, + "eval_runtime": 85.2018, + "eval_samples_per_second": 2.347, + "eval_steps_per_second": 0.293, + "step": 20 + }, + { + "epoch": 0.672, + "grad_norm": 0.4174760070919301, + "learning_rate": 2e-05, + "loss": 1.1559, + "step": 21 + }, + { + "epoch": 0.672, + "eval_loss": 1.151344656944275, + "eval_runtime": 85.1267, + "eval_samples_per_second": 2.349, + "eval_steps_per_second": 0.294, + "step": 21 + }, + { + "epoch": 0.704, + "grad_norm": 0.4514925259027495, + "learning_rate": 2e-05, + "loss": 1.2058, + "step": 22 + }, + { + "epoch": 0.704, + "eval_loss": 1.1299418210983276, + "eval_runtime": 85.187, + "eval_samples_per_second": 2.348, + "eval_steps_per_second": 0.293, + "step": 22 + }, + { + "epoch": 0.736, + "grad_norm": 0.4104167292767524, + "learning_rate": 2e-05, + "loss": 1.1874, + "step": 23 + }, + { + "epoch": 0.736, + "eval_loss": 1.1091759204864502, + "eval_runtime": 85.2104, + "eval_samples_per_second": 2.347, + "eval_steps_per_second": 0.293, + "step": 23 + }, + { + "epoch": 0.768, + "grad_norm": 0.3690081767478843, + "learning_rate": 2e-05, + "loss": 1.1555, + "step": 24 + }, + { + "epoch": 0.768, + "eval_loss": 1.0891705751419067, + "eval_runtime": 85.0331, + "eval_samples_per_second": 2.352, + "eval_steps_per_second": 0.294, + "step": 24 + }, + { + "epoch": 0.8, + "grad_norm": 0.39605731232207236, + "learning_rate": 2e-05, + "loss": 1.1113, + "step": 25 + }, + { + "epoch": 0.8, + "eval_loss": 1.0692001581192017, + "eval_runtime": 85.2164, + "eval_samples_per_second": 2.347, + "eval_steps_per_second": 0.293, + "step": 25 + }, + { + "epoch": 0.832, + "grad_norm": 0.36927768645318826, + "learning_rate": 2e-05, + "loss": 1.1124, + "step": 26 + }, + { + "epoch": 0.832, + "eval_loss": 1.0496515035629272, + "eval_runtime": 88.1096, + "eval_samples_per_second": 2.27, + "eval_steps_per_second": 0.284, + "step": 26 + }, + { + "epoch": 0.864, + "grad_norm": 0.38712274276178793, + "learning_rate": 2e-05, + "loss": 1.0485, + "step": 27 + }, + { + "epoch": 0.864, + "eval_loss": 1.0307434797286987, + "eval_runtime": 88.6941, + "eval_samples_per_second": 2.255, + "eval_steps_per_second": 0.282, + "step": 27 + }, + { + "epoch": 0.896, + "grad_norm": 0.37683532534478703, + "learning_rate": 2e-05, + "loss": 1.0494, + "step": 28 + }, + { + "epoch": 0.896, + "eval_loss": 1.0122556686401367, + "eval_runtime": 88.6898, + "eval_samples_per_second": 2.255, + "eval_steps_per_second": 0.282, + "step": 28 + }, + { + "epoch": 0.928, + "grad_norm": 0.31167540236884894, + "learning_rate": 2e-05, + "loss": 1.0384, + "step": 29 + }, + { + "epoch": 0.928, + "eval_loss": 0.994915783405304, + "eval_runtime": 88.4751, + "eval_samples_per_second": 2.261, + "eval_steps_per_second": 0.283, + "step": 29 + }, + { + "epoch": 0.96, + "grad_norm": 0.3035168410857397, + "learning_rate": 2e-05, + "loss": 1.0571, + "step": 30 + }, + { + "epoch": 0.96, + "eval_loss": 0.9787777662277222, + "eval_runtime": 88.4016, + "eval_samples_per_second": 2.262, + "eval_steps_per_second": 0.283, + "step": 30 + }, + { + "epoch": 0.992, + "grad_norm": 0.3501105312815732, + "learning_rate": 2e-05, + "loss": 0.915, + "step": 31 + }, + { + "epoch": 0.992, + "eval_loss": 0.9635753035545349, + "eval_runtime": 93.659, + "eval_samples_per_second": 2.135, + "eval_steps_per_second": 0.267, + "step": 31 + }, + { + "epoch": 1.024, + "grad_norm": 0.31289892959527454, + "learning_rate": 2e-05, + "loss": 1.0061, + "step": 32 + }, + { + "epoch": 1.024, + "eval_loss": 0.9496576189994812, + "eval_runtime": 92.1616, + "eval_samples_per_second": 2.17, + "eval_steps_per_second": 0.271, + "step": 32 + }, + { + "epoch": 1.056, + "grad_norm": 0.29757404844376606, + "learning_rate": 2e-05, + "loss": 1.018, + "step": 33 + }, + { + "epoch": 1.056, + "eval_loss": 0.9369340538978577, + "eval_runtime": 92.6023, + "eval_samples_per_second": 2.16, + "eval_steps_per_second": 0.27, + "step": 33 + }, + { + "epoch": 1.088, + "grad_norm": 0.2618148684145232, + "learning_rate": 2e-05, + "loss": 0.927, + "step": 34 + }, + { + "epoch": 1.088, + "eval_loss": 0.9251891374588013, + "eval_runtime": 92.1541, + "eval_samples_per_second": 2.17, + "eval_steps_per_second": 0.271, + "step": 34 + }, + { + "epoch": 1.12, + "grad_norm": 0.28251385173765375, + "learning_rate": 2e-05, + "loss": 0.9539, + "step": 35 + }, + { + "epoch": 1.12, + "eval_loss": 0.913889467716217, + "eval_runtime": 92.0402, + "eval_samples_per_second": 2.173, + "eval_steps_per_second": 0.272, + "step": 35 + }, + { + "epoch": 1.16, + "grad_norm": 0.260093009410511, + "learning_rate": 2e-05, + "loss": 0.9356, + "step": 36 + }, + { + "epoch": 1.16, + "eval_loss": 0.9035020470619202, + "eval_runtime": 89.6964, + "eval_samples_per_second": 2.23, + "eval_steps_per_second": 0.279, + "step": 36 + }, + { + "epoch": 1.192, + "grad_norm": 0.27662724636836117, + "learning_rate": 2e-05, + "loss": 0.9597, + "step": 37 + }, + { + "epoch": 1.192, + "eval_loss": 0.8957402110099792, + "eval_runtime": 85.1681, + "eval_samples_per_second": 2.348, + "eval_steps_per_second": 0.294, + "step": 37 + }, + { + "epoch": 1.224, + "grad_norm": 0.5728746629980745, + "learning_rate": 2e-05, + "loss": 0.9398, + "step": 38 + }, + { + "epoch": 1.224, + "eval_loss": 0.887246310710907, + "eval_runtime": 85.6149, + "eval_samples_per_second": 2.336, + "eval_steps_per_second": 0.292, + "step": 38 + }, + { + "epoch": 1.256, + "grad_norm": 0.2684824759760228, + "learning_rate": 2e-05, + "loss": 0.9616, + "step": 39 + }, + { + "epoch": 1.256, + "eval_loss": 0.8806753754615784, + "eval_runtime": 86.3901, + "eval_samples_per_second": 2.315, + "eval_steps_per_second": 0.289, + "step": 39 + }, + { + "epoch": 1.288, + "grad_norm": 0.24685769110976413, + "learning_rate": 2e-05, + "loss": 0.9854, + "step": 40 + }, + { + "epoch": 1.288, + "eval_loss": 0.8744142055511475, + "eval_runtime": 85.3845, + "eval_samples_per_second": 2.342, + "eval_steps_per_second": 0.293, + "step": 40 + }, + { + "epoch": 1.32, + "grad_norm": 0.2357626526047496, + "learning_rate": 2e-05, + "loss": 0.9284, + "step": 41 + }, + { + "epoch": 1.32, + "eval_loss": 0.868619441986084, + "eval_runtime": 86.2809, + "eval_samples_per_second": 2.318, + "eval_steps_per_second": 0.29, + "step": 41 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 0.22791772858432163, + "learning_rate": 2e-05, + "loss": 1.0035, + "step": 42 + }, + { + "epoch": 1.3519999999999999, + "eval_loss": 0.8631160259246826, + "eval_runtime": 85.2643, + "eval_samples_per_second": 2.346, + "eval_steps_per_second": 0.293, + "step": 42 + }, + { + "epoch": 1.384, + "grad_norm": 0.2301475582048382, + "learning_rate": 2e-05, + "loss": 0.9441, + "step": 43 + }, + { + "epoch": 1.384, + "eval_loss": 0.8579904437065125, + "eval_runtime": 85.464, + "eval_samples_per_second": 2.34, + "eval_steps_per_second": 0.293, + "step": 43 + }, + { + "epoch": 1.416, + "grad_norm": 0.2435877146655292, + "learning_rate": 2e-05, + "loss": 0.9537, + "step": 44 + }, + { + "epoch": 1.416, + "eval_loss": 0.8532869219779968, + "eval_runtime": 85.4531, + "eval_samples_per_second": 2.34, + "eval_steps_per_second": 0.293, + "step": 44 + }, + { + "epoch": 1.448, + "grad_norm": 0.22680224690529022, + "learning_rate": 2e-05, + "loss": 0.8432, + "step": 45 + }, + { + "epoch": 1.448, + "eval_loss": 0.8488282561302185, + "eval_runtime": 85.2256, + "eval_samples_per_second": 2.347, + "eval_steps_per_second": 0.293, + "step": 45 + }, + { + "epoch": 1.48, + "grad_norm": 0.24467493716810432, + "learning_rate": 2e-05, + "loss": 0.9582, + "step": 46 + }, + { + "epoch": 1.48, + "eval_loss": 0.8452281951904297, + "eval_runtime": 86.4412, + "eval_samples_per_second": 2.314, + "eval_steps_per_second": 0.289, + "step": 46 + }, + { + "epoch": 1.512, + "grad_norm": 0.3102498103163491, + "learning_rate": 2e-05, + "loss": 0.8935, + "step": 47 + }, + { + "epoch": 1.512, + "eval_loss": 0.8409687876701355, + "eval_runtime": 85.1809, + "eval_samples_per_second": 2.348, + "eval_steps_per_second": 0.293, + "step": 47 + }, + { + "epoch": 1.544, + "grad_norm": 0.26376164638875965, + "learning_rate": 2e-05, + "loss": 0.9153, + "step": 48 + }, + { + "epoch": 1.544, + "eval_loss": 0.836646556854248, + "eval_runtime": 85.3347, + "eval_samples_per_second": 2.344, + "eval_steps_per_second": 0.293, + "step": 48 + }, + { + "epoch": 1.576, + "grad_norm": 0.26268816516328214, + "learning_rate": 2e-05, + "loss": 0.8937, + "step": 49 + }, + { + "epoch": 1.576, + "eval_loss": 0.8322432637214661, + "eval_runtime": 85.1246, + "eval_samples_per_second": 2.349, + "eval_steps_per_second": 0.294, + "step": 49 + }, + { + "epoch": 1.608, + "grad_norm": 0.20800644242816013, + "learning_rate": 2e-05, + "loss": 0.8346, + "step": 50 + }, + { + "epoch": 1.608, + "eval_loss": 0.8282632231712341, + "eval_runtime": 85.9203, + "eval_samples_per_second": 2.328, + "eval_steps_per_second": 0.291, + "step": 50 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.234023912047604, + "learning_rate": 2e-05, + "loss": 0.9457, + "step": 51 + }, + { + "epoch": 1.6400000000000001, + "eval_loss": 0.8249067068099976, + "eval_runtime": 85.5644, + "eval_samples_per_second": 2.337, + "eval_steps_per_second": 0.292, + "step": 51 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 0.22274778391959116, + "learning_rate": 2e-05, + "loss": 0.894, + "step": 52 + }, + { + "epoch": 1.6720000000000002, + "eval_loss": 0.8220057487487793, + "eval_runtime": 85.2356, + "eval_samples_per_second": 2.346, + "eval_steps_per_second": 0.293, + "step": 52 + }, + { + "epoch": 1.704, + "grad_norm": 0.247116310753153, + "learning_rate": 2e-05, + "loss": 0.9422, + "step": 53 + }, + { + "epoch": 1.704, + "eval_loss": 0.8193264603614807, + "eval_runtime": 85.2302, + "eval_samples_per_second": 2.347, + "eval_steps_per_second": 0.293, + "step": 53 + }, + { + "epoch": 1.736, + "grad_norm": 0.2156755816522451, + "learning_rate": 2e-05, + "loss": 0.9483, + "step": 54 + }, + { + "epoch": 1.736, + "eval_loss": 0.8170039653778076, + "eval_runtime": 85.5313, + "eval_samples_per_second": 2.338, + "eval_steps_per_second": 0.292, + "step": 54 + }, + { + "epoch": 1.768, + "grad_norm": 0.20641699121207405, + "learning_rate": 2e-05, + "loss": 0.9433, + "step": 55 + }, + { + "epoch": 1.768, + "eval_loss": 0.8153803944587708, + "eval_runtime": 85.4465, + "eval_samples_per_second": 2.341, + "eval_steps_per_second": 0.293, + "step": 55 + }, + { + "epoch": 1.8, + "grad_norm": 0.22411091268182518, + "learning_rate": 2e-05, + "loss": 0.8839, + "step": 56 + }, + { + "epoch": 1.8, + "eval_loss": 0.8131626844406128, + "eval_runtime": 87.3514, + "eval_samples_per_second": 2.29, + "eval_steps_per_second": 0.286, + "step": 56 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 0.22136515515298041, + "learning_rate": 2e-05, + "loss": 0.9219, + "step": 57 + }, + { + "epoch": 1.8319999999999999, + "eval_loss": 0.8108111023902893, + "eval_runtime": 86.5607, + "eval_samples_per_second": 2.311, + "eval_steps_per_second": 0.289, + "step": 57 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 0.22277176520749853, + "learning_rate": 2e-05, + "loss": 0.8317, + "step": 58 + }, + { + "epoch": 1.8639999999999999, + "eval_loss": 0.8082687854766846, + "eval_runtime": 86.4781, + "eval_samples_per_second": 2.313, + "eval_steps_per_second": 0.289, + "step": 58 + }, + { + "epoch": 1.896, + "grad_norm": 0.22242544994690336, + "learning_rate": 2e-05, + "loss": 0.8317, + "step": 59 + }, + { + "epoch": 1.896, + "eval_loss": 0.8052798509597778, + "eval_runtime": 86.1491, + "eval_samples_per_second": 2.322, + "eval_steps_per_second": 0.29, + "step": 59 + }, + { + "epoch": 1.928, + "grad_norm": 0.20539599715237697, + "learning_rate": 2e-05, + "loss": 0.8777, + "step": 60 + }, + { + "epoch": 1.928, + "eval_loss": 0.8023205399513245, + "eval_runtime": 86.2508, + "eval_samples_per_second": 2.319, + "eval_steps_per_second": 0.29, + "step": 60 + }, + { + "epoch": 1.96, + "grad_norm": 0.2259203508735786, + "learning_rate": 2e-05, + "loss": 0.8987, + "step": 61 + }, + { + "epoch": 1.96, + "eval_loss": 0.7997938394546509, + "eval_runtime": 87.6556, + "eval_samples_per_second": 2.282, + "eval_steps_per_second": 0.285, + "step": 61 + }, + { + "epoch": 1.992, + "grad_norm": 0.2423173341059814, + "learning_rate": 2e-05, + "loss": 0.7621, + "step": 62 + }, + { + "epoch": 1.992, + "eval_loss": 0.7969934344291687, + "eval_runtime": 86.8775, + "eval_samples_per_second": 2.302, + "eval_steps_per_second": 0.288, + "step": 62 + }, + { + "epoch": 2.024, + "grad_norm": 0.24036001781096705, + "learning_rate": 2e-05, + "loss": 0.8819, + "step": 63 + }, + { + "epoch": 2.024, + "eval_loss": 0.7944203019142151, + "eval_runtime": 86.5654, + "eval_samples_per_second": 2.31, + "eval_steps_per_second": 0.289, + "step": 63 + }, + { + "epoch": 2.056, + "grad_norm": 0.20841482575321812, + "learning_rate": 2e-05, + "loss": 0.7713, + "step": 64 + }, + { + "epoch": 2.056, + "eval_loss": 0.791679859161377, + "eval_runtime": 86.6028, + "eval_samples_per_second": 2.309, + "eval_steps_per_second": 0.289, + "step": 64 + }, + { + "epoch": 2.088, + "grad_norm": 0.2184988692033955, + "learning_rate": 2e-05, + "loss": 0.829, + "step": 65 + }, + { + "epoch": 2.088, + "eval_loss": 0.7891654372215271, + "eval_runtime": 86.7507, + "eval_samples_per_second": 2.305, + "eval_steps_per_second": 0.288, + "step": 65 + }, + { + "epoch": 2.12, + "grad_norm": 0.23020842769384967, + "learning_rate": 2e-05, + "loss": 0.8473, + "step": 66 + }, + { + "epoch": 2.12, + "eval_loss": 0.7867069840431213, + "eval_runtime": 87.5061, + "eval_samples_per_second": 2.286, + "eval_steps_per_second": 0.286, + "step": 66 + }, + { + "epoch": 2.152, + "grad_norm": 0.25430631663993714, + "learning_rate": 2e-05, + "loss": 0.8681, + "step": 67 + }, + { + "epoch": 2.152, + "eval_loss": 0.7836448550224304, + "eval_runtime": 88.0078, + "eval_samples_per_second": 2.273, + "eval_steps_per_second": 0.284, + "step": 67 + }, + { + "epoch": 2.184, + "grad_norm": 0.23653466680757473, + "learning_rate": 2e-05, + "loss": 0.8876, + "step": 68 + }, + { + "epoch": 2.184, + "eval_loss": 0.7806727886199951, + "eval_runtime": 87.345, + "eval_samples_per_second": 2.29, + "eval_steps_per_second": 0.286, + "step": 68 + }, + { + "epoch": 2.216, + "grad_norm": 0.2565004166075463, + "learning_rate": 2e-05, + "loss": 0.8596, + "step": 69 + }, + { + "epoch": 2.216, + "eval_loss": 0.7781125903129578, + "eval_runtime": 87.2054, + "eval_samples_per_second": 2.293, + "eval_steps_per_second": 0.287, + "step": 69 + }, + { + "epoch": 2.248, + "grad_norm": 0.22097009361742267, + "learning_rate": 2e-05, + "loss": 0.8956, + "step": 70 + }, + { + "epoch": 2.248, + "eval_loss": 0.7758963704109192, + "eval_runtime": 87.6181, + "eval_samples_per_second": 2.283, + "eval_steps_per_second": 0.285, + "step": 70 + }, + { + "epoch": 2.2800000000000002, + "grad_norm": 0.23458324558709256, + "learning_rate": 2e-05, + "loss": 0.8812, + "step": 71 + }, + { + "epoch": 2.2800000000000002, + "eval_loss": 0.773766279220581, + "eval_runtime": 86.8377, + "eval_samples_per_second": 2.303, + "eval_steps_per_second": 0.288, + "step": 71 + }, + { + "epoch": 2.312, + "grad_norm": 0.2690788840468198, + "learning_rate": 2e-05, + "loss": 0.8779, + "step": 72 + }, + { + "epoch": 2.312, + "eval_loss": 0.7716243267059326, + "eval_runtime": 86.5931, + "eval_samples_per_second": 2.31, + "eval_steps_per_second": 0.289, + "step": 72 + }, + { + "epoch": 2.344, + "grad_norm": 0.22263909993294226, + "learning_rate": 2e-05, + "loss": 0.7766, + "step": 73 + }, + { + "epoch": 2.344, + "eval_loss": 0.7695778012275696, + "eval_runtime": 86.9844, + "eval_samples_per_second": 2.299, + "eval_steps_per_second": 0.287, + "step": 73 + }, + { + "epoch": 2.376, + "grad_norm": 0.26058003387602907, + "learning_rate": 2e-05, + "loss": 0.8995, + "step": 74 + }, + { + "epoch": 2.376, + "eval_loss": 0.7680388689041138, + "eval_runtime": 86.551, + "eval_samples_per_second": 2.311, + "eval_steps_per_second": 0.289, + "step": 74 + }, + { + "epoch": 2.408, + "grad_norm": 0.2262224984051455, + "learning_rate": 2e-05, + "loss": 0.8323, + "step": 75 + }, + { + "epoch": 2.408, + "eval_loss": 0.766679584980011, + "eval_runtime": 86.5962, + "eval_samples_per_second": 2.31, + "eval_steps_per_second": 0.289, + "step": 75 + }, + { + "epoch": 2.44, + "grad_norm": 0.25720804331740627, + "learning_rate": 2e-05, + "loss": 0.8036, + "step": 76 + }, + { + "epoch": 2.44, + "eval_loss": 0.7652787566184998, + "eval_runtime": 86.2366, + "eval_samples_per_second": 2.319, + "eval_steps_per_second": 0.29, + "step": 76 + }, + { + "epoch": 2.472, + "grad_norm": 0.22971293606988397, + "learning_rate": 2e-05, + "loss": 0.8806, + "step": 77 + }, + { + "epoch": 2.472, + "eval_loss": 0.7643636465072632, + "eval_runtime": 86.3577, + "eval_samples_per_second": 2.316, + "eval_steps_per_second": 0.289, + "step": 77 + }, + { + "epoch": 2.504, + "grad_norm": 0.2522080484690418, + "learning_rate": 2e-05, + "loss": 0.817, + "step": 78 + }, + { + "epoch": 2.504, + "eval_loss": 0.7629995942115784, + "eval_runtime": 86.2805, + "eval_samples_per_second": 2.318, + "eval_steps_per_second": 0.29, + "step": 78 + }, + { + "epoch": 2.536, + "grad_norm": 0.25383057566992234, + "learning_rate": 2e-05, + "loss": 0.7931, + "step": 79 + }, + { + "epoch": 2.536, + "eval_loss": 0.7622135877609253, + "eval_runtime": 86.5133, + "eval_samples_per_second": 2.312, + "eval_steps_per_second": 0.289, + "step": 79 + }, + { + "epoch": 2.568, + "grad_norm": 0.27933475264216745, + "learning_rate": 2e-05, + "loss": 0.8135, + "step": 80 + }, + { + "epoch": 2.568, + "eval_loss": 0.7606070041656494, + "eval_runtime": 86.377, + "eval_samples_per_second": 2.315, + "eval_steps_per_second": 0.289, + "step": 80 + }, + { + "epoch": 2.6, + "grad_norm": 0.24704516135802373, + "learning_rate": 2e-05, + "loss": 0.7688, + "step": 81 + }, + { + "epoch": 2.6, + "eval_loss": 0.7587440609931946, + "eval_runtime": 88.9717, + "eval_samples_per_second": 2.248, + "eval_steps_per_second": 0.281, + "step": 81 + }, + { + "epoch": 2.632, + "grad_norm": 0.2595849376774823, + "learning_rate": 2e-05, + "loss": 0.7207, + "step": 82 + }, + { + "epoch": 2.632, + "eval_loss": 0.7568916082382202, + "eval_runtime": 88.9097, + "eval_samples_per_second": 2.249, + "eval_steps_per_second": 0.281, + "step": 82 + }, + { + "epoch": 2.664, + "grad_norm": 0.2586023772952801, + "learning_rate": 2e-05, + "loss": 0.8642, + "step": 83 + }, + { + "epoch": 2.664, + "eval_loss": 0.7559364438056946, + "eval_runtime": 89.045, + "eval_samples_per_second": 2.246, + "eval_steps_per_second": 0.281, + "step": 83 + }, + { + "epoch": 2.6959999999999997, + "grad_norm": 0.2273264534259725, + "learning_rate": 2e-05, + "loss": 0.8864, + "step": 84 + }, + { + "epoch": 2.6959999999999997, + "eval_loss": 0.7552520632743835, + "eval_runtime": 88.9448, + "eval_samples_per_second": 2.249, + "eval_steps_per_second": 0.281, + "step": 84 + }, + { + "epoch": 2.7279999999999998, + "grad_norm": 0.26638251168101784, + "learning_rate": 2e-05, + "loss": 0.7977, + "step": 85 + }, + { + "epoch": 2.7279999999999998, + "eval_loss": 0.753672182559967, + "eval_runtime": 89.2211, + "eval_samples_per_second": 2.242, + "eval_steps_per_second": 0.28, + "step": 85 + }, + { + "epoch": 2.76, + "grad_norm": 0.27672934644885144, + "learning_rate": 2e-05, + "loss": 0.8003, + "step": 86 + }, + { + "epoch": 2.76, + "eval_loss": 0.7510656714439392, + "eval_runtime": 95.8714, + "eval_samples_per_second": 2.086, + "eval_steps_per_second": 0.261, + "step": 86 + }, + { + "epoch": 2.792, + "grad_norm": 0.28159046758182865, + "learning_rate": 2e-05, + "loss": 0.8216, + "step": 87 + }, + { + "epoch": 2.792, + "eval_loss": 0.7484390735626221, + "eval_runtime": 93.9836, + "eval_samples_per_second": 2.128, + "eval_steps_per_second": 0.266, + "step": 87 + }, + { + "epoch": 2.824, + "grad_norm": 0.25495896352825237, + "learning_rate": 2e-05, + "loss": 0.8514, + "step": 88 + }, + { + "epoch": 2.824, + "eval_loss": 0.7466137409210205, + "eval_runtime": 92.9783, + "eval_samples_per_second": 2.151, + "eval_steps_per_second": 0.269, + "step": 88 + }, + { + "epoch": 2.856, + "grad_norm": 0.24959081452423665, + "learning_rate": 2e-05, + "loss": 0.8291, + "step": 89 + }, + { + "epoch": 2.856, + "eval_loss": 0.745083749294281, + "eval_runtime": 92.9713, + "eval_samples_per_second": 2.151, + "eval_steps_per_second": 0.269, + "step": 89 + }, + { + "epoch": 2.888, + "grad_norm": 0.258467204198503, + "learning_rate": 2e-05, + "loss": 0.8669, + "step": 90 + }, + { + "epoch": 2.888, + "eval_loss": 0.7432680726051331, + "eval_runtime": 93.3306, + "eval_samples_per_second": 2.143, + "eval_steps_per_second": 0.268, + "step": 90 + }, + { + "epoch": 2.928, + "grad_norm": 0.24500563921569218, + "learning_rate": 2e-05, + "loss": 0.8085, + "step": 91 + }, + { + "epoch": 2.928, + "eval_loss": 0.7415681481361389, + "eval_runtime": 91.2727, + "eval_samples_per_second": 2.191, + "eval_steps_per_second": 0.274, + "step": 91 + }, + { + "epoch": 2.96, + "grad_norm": 0.27455934907237084, + "learning_rate": 2e-05, + "loss": 0.8307, + "step": 92 + }, + { + "epoch": 2.96, + "eval_loss": 0.7402371168136597, + "eval_runtime": 85.3987, + "eval_samples_per_second": 2.342, + "eval_steps_per_second": 0.293, + "step": 92 + }, + { + "epoch": 2.992, + "grad_norm": 0.2775170496694688, + "learning_rate": 2e-05, + "loss": 0.6946, + "step": 93 + }, + { + "epoch": 2.992, + "eval_loss": 0.7383440136909485, + "eval_runtime": 85.4187, + "eval_samples_per_second": 2.341, + "eval_steps_per_second": 0.293, + "step": 93 + }, + { + "epoch": 3.024, + "grad_norm": 0.2792378343187822, + "learning_rate": 2e-05, + "loss": 0.788, + "step": 94 + }, + { + "epoch": 3.024, + "eval_loss": 0.7362905144691467, + "eval_runtime": 85.2866, + "eval_samples_per_second": 2.345, + "eval_steps_per_second": 0.293, + "step": 94 + }, + { + "epoch": 3.056, + "grad_norm": 0.24527382879929208, + "learning_rate": 2e-05, + "loss": 0.8077, + "step": 95 + }, + { + "epoch": 3.056, + "eval_loss": 0.7345578074455261, + "eval_runtime": 85.2836, + "eval_samples_per_second": 2.345, + "eval_steps_per_second": 0.293, + "step": 95 + }, + { + "epoch": 3.088, + "grad_norm": 0.2639510901590645, + "learning_rate": 2e-05, + "loss": 0.8159, + "step": 96 + }, + { + "epoch": 3.088, + "eval_loss": 0.7332432270050049, + "eval_runtime": 86.4403, + "eval_samples_per_second": 2.314, + "eval_steps_per_second": 0.289, + "step": 96 + }, + { + "epoch": 3.12, + "grad_norm": 0.32275944645869054, + "learning_rate": 2e-05, + "loss": 0.7283, + "step": 97 + }, + { + "epoch": 3.12, + "eval_loss": 0.7311471700668335, + "eval_runtime": 86.3257, + "eval_samples_per_second": 2.317, + "eval_steps_per_second": 0.29, + "step": 97 + }, + { + "epoch": 3.152, + "grad_norm": 0.22657765140514205, + "learning_rate": 2e-05, + "loss": 0.796, + "step": 98 + }, + { + "epoch": 3.152, + "eval_loss": 0.7294245958328247, + "eval_runtime": 86.4307, + "eval_samples_per_second": 2.314, + "eval_steps_per_second": 0.289, + "step": 98 + }, + { + "epoch": 3.184, + "grad_norm": 0.2696947762711156, + "learning_rate": 2e-05, + "loss": 0.8463, + "step": 99 + }, + { + "epoch": 3.184, + "eval_loss": 0.7282422780990601, + "eval_runtime": 86.5308, + "eval_samples_per_second": 2.311, + "eval_steps_per_second": 0.289, + "step": 99 + }, + { + "epoch": 3.216, + "grad_norm": 0.2600510971816684, + "learning_rate": 2e-05, + "loss": 0.8089, + "step": 100 + }, + { + "epoch": 3.216, + "eval_loss": 0.7277690768241882, + "eval_runtime": 86.3354, + "eval_samples_per_second": 2.317, + "eval_steps_per_second": 0.29, + "step": 100 + }, + { + "epoch": 3.248, + "grad_norm": 0.2786398542362818, + "learning_rate": 2e-05, + "loss": 0.7746, + "step": 101 + }, + { + "epoch": 3.248, + "eval_loss": 0.7275124192237854, + "eval_runtime": 88.6929, + "eval_samples_per_second": 2.255, + "eval_steps_per_second": 0.282, + "step": 101 + }, + { + "epoch": 3.2800000000000002, + "grad_norm": 0.2737884177070957, + "learning_rate": 2e-05, + "loss": 0.8182, + "step": 102 + }, + { + "epoch": 3.2800000000000002, + "eval_loss": 0.727260947227478, + "eval_runtime": 85.8129, + "eval_samples_per_second": 2.331, + "eval_steps_per_second": 0.291, + "step": 102 + }, + { + "epoch": 3.312, + "grad_norm": 0.29485392261335913, + "learning_rate": 2e-05, + "loss": 0.771, + "step": 103 + }, + { + "epoch": 3.312, + "eval_loss": 0.726463794708252, + "eval_runtime": 85.5624, + "eval_samples_per_second": 2.337, + "eval_steps_per_second": 0.292, + "step": 103 + }, + { + "epoch": 3.344, + "grad_norm": 0.2950854321605982, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 104 + }, + { + "epoch": 3.344, + "eval_loss": 0.7254646420478821, + "eval_runtime": 86.1462, + "eval_samples_per_second": 2.322, + "eval_steps_per_second": 0.29, + "step": 104 + }, + { + "epoch": 3.376, + "grad_norm": 0.2868496115468271, + "learning_rate": 2e-05, + "loss": 0.7902, + "step": 105 + }, + { + "epoch": 3.376, + "eval_loss": 0.724499523639679, + "eval_runtime": 85.8109, + "eval_samples_per_second": 2.331, + "eval_steps_per_second": 0.291, + "step": 105 + }, + { + "epoch": 3.408, + "grad_norm": 0.27526808102180006, + "learning_rate": 2e-05, + "loss": 0.7962, + "step": 106 + }, + { + "epoch": 3.408, + "eval_loss": 0.723967432975769, + "eval_runtime": 85.1137, + "eval_samples_per_second": 2.35, + "eval_steps_per_second": 0.294, + "step": 106 + }, + { + "epoch": 3.44, + "grad_norm": 0.28826054599507117, + "learning_rate": 2e-05, + "loss": 0.7659, + "step": 107 + }, + { + "epoch": 3.44, + "eval_loss": 0.7228976488113403, + "eval_runtime": 85.338, + "eval_samples_per_second": 2.344, + "eval_steps_per_second": 0.293, + "step": 107 + }, + { + "epoch": 3.472, + "grad_norm": 0.2739253052624054, + "learning_rate": 2e-05, + "loss": 0.8122, + "step": 108 + }, + { + "epoch": 3.472, + "eval_loss": 0.7213765978813171, + "eval_runtime": 86.0819, + "eval_samples_per_second": 2.323, + "eval_steps_per_second": 0.29, + "step": 108 + }, + { + "epoch": 3.504, + "grad_norm": 0.3244236677701114, + "learning_rate": 2e-05, + "loss": 0.7926, + "step": 109 + }, + { + "epoch": 3.504, + "eval_loss": 0.7201890349388123, + "eval_runtime": 85.264, + "eval_samples_per_second": 2.346, + "eval_steps_per_second": 0.293, + "step": 109 + }, + { + "epoch": 3.536, + "grad_norm": 0.272846304884481, + "learning_rate": 2e-05, + "loss": 0.7815, + "step": 110 + }, + { + "epoch": 3.536, + "eval_loss": 0.7191389203071594, + "eval_runtime": 85.3814, + "eval_samples_per_second": 2.342, + "eval_steps_per_second": 0.293, + "step": 110 + }, + { + "epoch": 3.568, + "grad_norm": 0.32540225984762255, + "learning_rate": 2e-05, + "loss": 0.7669, + "step": 111 + }, + { + "epoch": 3.568, + "eval_loss": 0.7177696824073792, + "eval_runtime": 86.5851, + "eval_samples_per_second": 2.31, + "eval_steps_per_second": 0.289, + "step": 111 + }, + { + "epoch": 3.6, + "grad_norm": 0.3049195701830638, + "learning_rate": 2e-05, + "loss": 0.7817, + "step": 112 + }, + { + "epoch": 3.6, + "eval_loss": 0.7163457274436951, + "eval_runtime": 86.3221, + "eval_samples_per_second": 2.317, + "eval_steps_per_second": 0.29, + "step": 112 + }, + { + "epoch": 3.632, + "grad_norm": 0.2908157712070727, + "learning_rate": 2e-05, + "loss": 0.7803, + "step": 113 + }, + { + "epoch": 3.632, + "eval_loss": 0.7153773307800293, + "eval_runtime": 86.5278, + "eval_samples_per_second": 2.311, + "eval_steps_per_second": 0.289, + "step": 113 + }, + { + "epoch": 3.664, + "grad_norm": 0.3068313248625758, + "learning_rate": 2e-05, + "loss": 0.8223, + "step": 114 + }, + { + "epoch": 3.664, + "eval_loss": 0.7154207825660706, + "eval_runtime": 87.3327, + "eval_samples_per_second": 2.29, + "eval_steps_per_second": 0.286, + "step": 114 + }, + { + "epoch": 3.6959999999999997, + "grad_norm": 0.3055979867515295, + "learning_rate": 2e-05, + "loss": 0.7682, + "step": 115 + }, + { + "epoch": 3.6959999999999997, + "eval_loss": 0.7148604393005371, + "eval_runtime": 86.2716, + "eval_samples_per_second": 2.318, + "eval_steps_per_second": 0.29, + "step": 115 + }, + { + "epoch": 3.7279999999999998, + "grad_norm": 0.30145967440162974, + "learning_rate": 2e-05, + "loss": 0.7794, + "step": 116 + }, + { + "epoch": 3.7279999999999998, + "eval_loss": 0.7141232490539551, + "eval_runtime": 90.4025, + "eval_samples_per_second": 2.212, + "eval_steps_per_second": 0.277, + "step": 116 + }, + { + "epoch": 3.76, + "grad_norm": 0.30263126216965686, + "learning_rate": 2e-05, + "loss": 0.7924, + "step": 117 + }, + { + "epoch": 3.76, + "eval_loss": 0.7125248908996582, + "eval_runtime": 89.9787, + "eval_samples_per_second": 2.223, + "eval_steps_per_second": 0.278, + "step": 117 + }, + { + "epoch": 3.792, + "grad_norm": 0.37055787998484313, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 118 + }, + { + "epoch": 3.792, + "eval_loss": 0.7103064060211182, + "eval_runtime": 89.9435, + "eval_samples_per_second": 2.224, + "eval_steps_per_second": 0.278, + "step": 118 + }, + { + "epoch": 3.824, + "grad_norm": 0.32370435744629744, + "learning_rate": 2e-05, + "loss": 0.7225, + "step": 119 + }, + { + "epoch": 3.824, + "eval_loss": 0.708495020866394, + "eval_runtime": 89.7211, + "eval_samples_per_second": 2.229, + "eval_steps_per_second": 0.279, + "step": 119 + }, + { + "epoch": 3.856, + "grad_norm": 0.28450870148051394, + "learning_rate": 2e-05, + "loss": 0.7594, + "step": 120 + }, + { + "epoch": 3.856, + "eval_loss": 0.7078144550323486, + "eval_runtime": 89.755, + "eval_samples_per_second": 2.228, + "eval_steps_per_second": 0.279, + "step": 120 + }, + { + "epoch": 3.888, + "grad_norm": 0.3521496955227081, + "learning_rate": 2e-05, + "loss": 0.8098, + "step": 121 + }, + { + "epoch": 3.888, + "eval_loss": 0.706774115562439, + "eval_runtime": 93.4447, + "eval_samples_per_second": 2.14, + "eval_steps_per_second": 0.268, + "step": 121 + }, + { + "epoch": 3.92, + "grad_norm": 0.29964697600639706, + "learning_rate": 2e-05, + "loss": 0.7945, + "step": 122 + }, + { + "epoch": 3.92, + "eval_loss": 0.7057322859764099, + "eval_runtime": 93.0089, + "eval_samples_per_second": 2.15, + "eval_steps_per_second": 0.269, + "step": 122 + }, + { + "epoch": 3.952, + "grad_norm": 0.2998200701516689, + "learning_rate": 2e-05, + "loss": 0.7986, + "step": 123 + }, + { + "epoch": 3.952, + "eval_loss": 0.7051501274108887, + "eval_runtime": 93.7613, + "eval_samples_per_second": 2.133, + "eval_steps_per_second": 0.267, + "step": 123 + }, + { + "epoch": 3.984, + "grad_norm": 0.34265154113873836, + "learning_rate": 2e-05, + "loss": 0.7626, + "step": 124 + }, + { + "epoch": 3.984, + "eval_loss": 0.7055770754814148, + "eval_runtime": 94.0074, + "eval_samples_per_second": 2.127, + "eval_steps_per_second": 0.266, + "step": 124 + }, + { + "epoch": 4.016, + "grad_norm": 0.3227557876231983, + "learning_rate": 2e-05, + "loss": 0.8266, + "step": 125 + }, + { + "epoch": 4.016, + "eval_loss": 0.7067859172821045, + "eval_runtime": 92.4085, + "eval_samples_per_second": 2.164, + "eval_steps_per_second": 0.271, + "step": 125 + }, + { + "epoch": 4.064, + "grad_norm": 0.31358966391371784, + "learning_rate": 2e-05, + "loss": 0.7162, + "step": 126 + }, + { + "epoch": 4.064, + "eval_loss": 0.7073588371276855, + "eval_runtime": 89.235, + "eval_samples_per_second": 2.241, + "eval_steps_per_second": 0.28, + "step": 126 + }, + { + "epoch": 4.096, + "grad_norm": 0.29594296413078097, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 127 + }, + { + "epoch": 4.096, + "eval_loss": 0.7072306871414185, + "eval_runtime": 85.5672, + "eval_samples_per_second": 2.337, + "eval_steps_per_second": 0.292, + "step": 127 + }, + { + "epoch": 4.128, + "grad_norm": 0.31562345712114676, + "learning_rate": 2e-05, + "loss": 0.7735, + "step": 128 + }, + { + "epoch": 4.128, + "eval_loss": 0.7067290544509888, + "eval_runtime": 85.4464, + "eval_samples_per_second": 2.341, + "eval_steps_per_second": 0.293, + "step": 128 + }, + { + "epoch": 4.16, + "grad_norm": 0.36960151197946806, + "learning_rate": 2e-05, + "loss": 0.7275, + "step": 129 + }, + { + "epoch": 4.16, + "eval_loss": 0.7046365141868591, + "eval_runtime": 85.4173, + "eval_samples_per_second": 2.341, + "eval_steps_per_second": 0.293, + "step": 129 + }, + { + "epoch": 4.192, + "grad_norm": 0.28777555135336585, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 130 + }, + { + "epoch": 4.192, + "eval_loss": 0.7030876278877258, + "eval_runtime": 85.2072, + "eval_samples_per_second": 2.347, + "eval_steps_per_second": 0.293, + "step": 130 + }, + { + "epoch": 4.224, + "grad_norm": 0.3335688387393771, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 131 + }, + { + "epoch": 4.224, + "eval_loss": 0.7016716003417969, + "eval_runtime": 85.3898, + "eval_samples_per_second": 2.342, + "eval_steps_per_second": 0.293, + "step": 131 + }, + { + "epoch": 4.256, + "grad_norm": 0.36992044176671973, + "learning_rate": 2e-05, + "loss": 0.7915, + "step": 132 + }, + { + "epoch": 4.256, + "eval_loss": 0.7006884813308716, + "eval_runtime": 85.1939, + "eval_samples_per_second": 2.348, + "eval_steps_per_second": 0.293, + "step": 132 + }, + { + "epoch": 4.288, + "grad_norm": 0.3213431246183001, + "learning_rate": 2e-05, + "loss": 0.7716, + "step": 133 + }, + { + "epoch": 4.288, + "eval_loss": 0.7004576325416565, + "eval_runtime": 85.1892, + "eval_samples_per_second": 2.348, + "eval_steps_per_second": 0.293, + "step": 133 + }, + { + "epoch": 4.32, + "grad_norm": 0.31818378029100663, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 134 + }, + { + "epoch": 4.32, + "eval_loss": 0.7006973028182983, + "eval_runtime": 85.2711, + "eval_samples_per_second": 2.345, + "eval_steps_per_second": 0.293, + "step": 134 + }, + { + "epoch": 4.352, + "grad_norm": 0.34120408014701054, + "learning_rate": 2e-05, + "loss": 0.8125, + "step": 135 + }, + { + "epoch": 4.352, + "eval_loss": 0.7006770372390747, + "eval_runtime": 85.0797, + "eval_samples_per_second": 2.351, + "eval_steps_per_second": 0.294, + "step": 135 + }, + { + "epoch": 4.384, + "grad_norm": 0.3354650435400624, + "learning_rate": 2e-05, + "loss": 0.7623, + "step": 136 + }, + { + "epoch": 4.384, + "eval_loss": 0.7007671594619751, + "eval_runtime": 86.3137, + "eval_samples_per_second": 2.317, + "eval_steps_per_second": 0.29, + "step": 136 + }, + { + "epoch": 4.416, + "grad_norm": 0.34273454929170855, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 137 + }, + { + "epoch": 4.416, + "eval_loss": 0.7007145881652832, + "eval_runtime": 86.1203, + "eval_samples_per_second": 2.322, + "eval_steps_per_second": 0.29, + "step": 137 + }, + { + "epoch": 4.448, + "grad_norm": 0.34329366738767764, + "learning_rate": 2e-05, + "loss": 0.673, + "step": 138 + }, + { + "epoch": 4.448, + "eval_loss": 0.7001290321350098, + "eval_runtime": 86.4938, + "eval_samples_per_second": 2.312, + "eval_steps_per_second": 0.289, + "step": 138 + }, + { + "epoch": 4.48, + "grad_norm": 0.32986462476877876, + "learning_rate": 2e-05, + "loss": 0.7874, + "step": 139 + }, + { + "epoch": 4.48, + "eval_loss": 0.6998225450515747, + "eval_runtime": 85.0554, + "eval_samples_per_second": 2.351, + "eval_steps_per_second": 0.294, + "step": 139 + }, + { + "epoch": 4.5120000000000005, + "grad_norm": 0.42029356364309967, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 140 + }, + { + "epoch": 4.5120000000000005, + "eval_loss": 0.6981640458106995, + "eval_runtime": 86.43, + "eval_samples_per_second": 2.314, + "eval_steps_per_second": 0.289, + "step": 140 + }, + { + "epoch": 4.5440000000000005, + "grad_norm": 0.3410153964676588, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 141 + }, + { + "epoch": 4.5440000000000005, + "eval_loss": 0.6970750689506531, + "eval_runtime": 85.1484, + "eval_samples_per_second": 2.349, + "eval_steps_per_second": 0.294, + "step": 141 + }, + { + "epoch": 4.576, + "grad_norm": 0.39568033105661293, + "learning_rate": 2e-05, + "loss": 0.7175, + "step": 142 + }, + { + "epoch": 4.576, + "eval_loss": 0.6954947113990784, + "eval_runtime": 85.256, + "eval_samples_per_second": 2.346, + "eval_steps_per_second": 0.293, + "step": 142 + }, + { + "epoch": 4.608, + "grad_norm": 0.35114222943495293, + "learning_rate": 2e-05, + "loss": 0.6854, + "step": 143 + }, + { + "epoch": 4.608, + "eval_loss": 0.6938956379890442, + "eval_runtime": 85.0562, + "eval_samples_per_second": 2.351, + "eval_steps_per_second": 0.294, + "step": 143 + }, + { + "epoch": 4.64, + "grad_norm": 0.36129302808062, + "learning_rate": 2e-05, + "loss": 0.7821, + "step": 144 + }, + { + "epoch": 4.64, + "eval_loss": 0.6925562024116516, + "eval_runtime": 85.1341, + "eval_samples_per_second": 2.349, + "eval_steps_per_second": 0.294, + "step": 144 + }, + { + "epoch": 4.672, + "grad_norm": 0.39551012956858894, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 145 + }, + { + "epoch": 4.672, + "eval_loss": 0.6914381384849548, + "eval_runtime": 85.152, + "eval_samples_per_second": 2.349, + "eval_steps_per_second": 0.294, + "step": 145 + }, + { + "epoch": 4.704, + "grad_norm": 0.42650783733532543, + "learning_rate": 2e-05, + "loss": 0.7883, + "step": 146 + }, + { + "epoch": 4.704, + "eval_loss": 0.6911692023277283, + "eval_runtime": 86.8539, + "eval_samples_per_second": 2.303, + "eval_steps_per_second": 0.288, + "step": 146 + }, + { + "epoch": 4.736, + "grad_norm": 0.3786582921989863, + "learning_rate": 2e-05, + "loss": 0.7987, + "step": 147 + }, + { + "epoch": 4.736, + "eval_loss": 0.6914923191070557, + "eval_runtime": 87.0003, + "eval_samples_per_second": 2.299, + "eval_steps_per_second": 0.287, + "step": 147 + }, + { + "epoch": 4.768, + "grad_norm": 0.3528223035850843, + "learning_rate": 2e-05, + "loss": 0.8181, + "step": 148 + }, + { + "epoch": 4.768, + "eval_loss": 0.6930768489837646, + "eval_runtime": 86.8453, + "eval_samples_per_second": 2.303, + "eval_steps_per_second": 0.288, + "step": 148 + }, + { + "epoch": 4.8, + "grad_norm": 0.4052106693792139, + "learning_rate": 2e-05, + "loss": 0.7317, + "step": 149 + }, + { + "epoch": 4.8, + "eval_loss": 0.6946350336074829, + "eval_runtime": 86.808, + "eval_samples_per_second": 2.304, + "eval_steps_per_second": 0.288, + "step": 149 + }, + { + "epoch": 4.832, + "grad_norm": 0.3739014269672761, + "learning_rate": 2e-05, + "loss": 0.7851, + "step": 150 + }, + { + "epoch": 4.832, + "eval_loss": 0.6952430605888367, + "eval_runtime": 86.8255, + "eval_samples_per_second": 2.303, + "eval_steps_per_second": 0.288, + "step": 150 + }, + { + "epoch": 4.864, + "grad_norm": 0.42120491782720065, + "learning_rate": 2e-05, + "loss": 0.6829, + "step": 151 + }, + { + "epoch": 4.864, + "eval_loss": 0.6938563585281372, + "eval_runtime": 89.8279, + "eval_samples_per_second": 2.226, + "eval_steps_per_second": 0.278, + "step": 151 + }, + { + "epoch": 4.896, + "grad_norm": 0.40313446713945206, + "learning_rate": 2e-05, + "loss": 0.6972, + "step": 152 + }, + { + "epoch": 4.896, + "eval_loss": 0.6912936568260193, + "eval_runtime": 89.3777, + "eval_samples_per_second": 2.238, + "eval_steps_per_second": 0.28, + "step": 152 + }, + { + "epoch": 4.928, + "grad_norm": 0.36052668588306425, + "learning_rate": 2e-05, + "loss": 0.7294, + "step": 153 + }, + { + "epoch": 4.928, + "eval_loss": 0.6893093585968018, + "eval_runtime": 89.369, + "eval_samples_per_second": 2.238, + "eval_steps_per_second": 0.28, + "step": 153 + }, + { + "epoch": 4.96, + "grad_norm": 0.35889751392140123, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 154 + }, + { + "epoch": 4.96, + "eval_loss": 0.6887902021408081, + "eval_runtime": 89.3518, + "eval_samples_per_second": 2.238, + "eval_steps_per_second": 0.28, + "step": 154 + }, + { + "epoch": 4.992, + "grad_norm": 0.40694329818018776, + "learning_rate": 2e-05, + "loss": 0.6145, + "step": 155 + }, + { + "epoch": 4.992, + "eval_loss": 0.6877387762069702, + "eval_runtime": 89.5999, + "eval_samples_per_second": 2.232, + "eval_steps_per_second": 0.279, + "step": 155 + }, + { + "epoch": 5.024, + "grad_norm": 0.4071045470916848, + "learning_rate": 2e-05, + "loss": 0.7106, + "step": 156 + }, + { + "epoch": 5.024, + "eval_loss": 0.6863316297531128, + "eval_runtime": 91.1516, + "eval_samples_per_second": 2.194, + "eval_steps_per_second": 0.274, + "step": 156 + }, + { + "epoch": 5.056, + "grad_norm": 0.3825562066811806, + "learning_rate": 2e-05, + "loss": 0.6845, + "step": 157 + }, + { + "epoch": 5.056, + "eval_loss": 0.6852035522460938, + "eval_runtime": 90.7966, + "eval_samples_per_second": 2.203, + "eval_steps_per_second": 0.275, + "step": 157 + }, + { + "epoch": 5.088, + "grad_norm": 0.37147714986904934, + "learning_rate": 2e-05, + "loss": 0.6739, + "step": 158 + }, + { + "epoch": 5.088, + "eval_loss": 0.6840065121650696, + "eval_runtime": 91.6382, + "eval_samples_per_second": 2.182, + "eval_steps_per_second": 0.273, + "step": 158 + }, + { + "epoch": 5.12, + "grad_norm": 0.38259903152485825, + "learning_rate": 2e-05, + "loss": 0.6777, + "step": 159 + }, + { + "epoch": 5.12, + "eval_loss": 0.6829774379730225, + "eval_runtime": 90.8166, + "eval_samples_per_second": 2.202, + "eval_steps_per_second": 0.275, + "step": 159 + }, + { + "epoch": 5.152, + "grad_norm": 0.39777547401791735, + "learning_rate": 2e-05, + "loss": 0.7145, + "step": 160 + }, + { + "epoch": 5.152, + "eval_loss": 0.682302713394165, + "eval_runtime": 91.0923, + "eval_samples_per_second": 2.196, + "eval_steps_per_second": 0.274, + "step": 160 + }, + { + "epoch": 5.192, + "grad_norm": 0.3709714989318106, + "learning_rate": 2e-05, + "loss": 0.7251, + "step": 161 + }, + { + "epoch": 5.192, + "eval_loss": 0.6822090148925781, + "eval_runtime": 90.1282, + "eval_samples_per_second": 2.219, + "eval_steps_per_second": 0.277, + "step": 161 + }, + { + "epoch": 5.224, + "grad_norm": 0.4046346018620919, + "learning_rate": 2e-05, + "loss": 0.7108, + "step": 162 + }, + { + "epoch": 5.224, + "eval_loss": 0.6821247935295105, + "eval_runtime": 85.5268, + "eval_samples_per_second": 2.338, + "eval_steps_per_second": 0.292, + "step": 162 + }, + { + "epoch": 5.256, + "grad_norm": 0.42060496638232386, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 163 + }, + { + "epoch": 5.256, + "eval_loss": 0.6818928718566895, + "eval_runtime": 86.5491, + "eval_samples_per_second": 2.311, + "eval_steps_per_second": 0.289, + "step": 163 + }, + { + "epoch": 5.288, + "grad_norm": 0.3949151390399246, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 164 + }, + { + "epoch": 5.288, + "eval_loss": 0.6819549798965454, + "eval_runtime": 85.4036, + "eval_samples_per_second": 2.342, + "eval_steps_per_second": 0.293, + "step": 164 + }, + { + "epoch": 5.32, + "grad_norm": 0.3610134094474086, + "learning_rate": 2e-05, + "loss": 0.711, + "step": 165 + }, + { + "epoch": 5.32, + "eval_loss": 0.6825198531150818, + "eval_runtime": 85.3022, + "eval_samples_per_second": 2.345, + "eval_steps_per_second": 0.293, + "step": 165 + }, + { + "epoch": 5.352, + "grad_norm": 0.38905537916660615, + "learning_rate": 2e-05, + "loss": 0.7693, + "step": 166 + }, + { + "epoch": 5.352, + "eval_loss": 0.682900071144104, + "eval_runtime": 87.5028, + "eval_samples_per_second": 2.286, + "eval_steps_per_second": 0.286, + "step": 166 + }, + { + "epoch": 5.384, + "grad_norm": 0.4020289142954435, + "learning_rate": 2e-05, + "loss": 0.7257, + "step": 167 + }, + { + "epoch": 5.384, + "eval_loss": 0.6832457184791565, + "eval_runtime": 86.3175, + "eval_samples_per_second": 2.317, + "eval_steps_per_second": 0.29, + "step": 167 + }, + { + "epoch": 5.416, + "grad_norm": 0.4136061042465234, + "learning_rate": 2e-05, + "loss": 0.7082, + "step": 168 + }, + { + "epoch": 5.416, + "eval_loss": 0.6837514638900757, + "eval_runtime": 87.4244, + "eval_samples_per_second": 2.288, + "eval_steps_per_second": 0.286, + "step": 168 + }, + { + "epoch": 5.448, + "grad_norm": 0.40006410263925274, + "learning_rate": 2e-05, + "loss": 0.6352, + "step": 169 + }, + { + "epoch": 5.448, + "eval_loss": 0.6845301985740662, + "eval_runtime": 86.3451, + "eval_samples_per_second": 2.316, + "eval_steps_per_second": 0.29, + "step": 169 + }, + { + "epoch": 5.48, + "grad_norm": 0.40988438997044196, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 170 + }, + { + "epoch": 5.48, + "eval_loss": 0.6851826310157776, + "eval_runtime": 86.4269, + "eval_samples_per_second": 2.314, + "eval_steps_per_second": 0.289, + "step": 170 + }, + { + "epoch": 5.5120000000000005, + "grad_norm": 0.47923323092924647, + "learning_rate": 2e-05, + "loss": 0.6926, + "step": 171 + }, + { + "epoch": 5.5120000000000005, + "eval_loss": 0.6841108798980713, + "eval_runtime": 86.3221, + "eval_samples_per_second": 2.317, + "eval_steps_per_second": 0.29, + "step": 171 + }, + { + "epoch": 5.5440000000000005, + "grad_norm": 0.4031545746474779, + "learning_rate": 2e-05, + "loss": 0.6961, + "step": 172 + }, + { + "epoch": 5.5440000000000005, + "eval_loss": 0.6829754710197449, + "eval_runtime": 85.2735, + "eval_samples_per_second": 2.345, + "eval_steps_per_second": 0.293, + "step": 172 + }, + { + "epoch": 5.576, + "grad_norm": 0.46444040317934493, + "learning_rate": 2e-05, + "loss": 0.6757, + "step": 173 + }, + { + "epoch": 5.576, + "eval_loss": 0.6810196042060852, + "eval_runtime": 85.23, + "eval_samples_per_second": 2.347, + "eval_steps_per_second": 0.293, + "step": 173 + }, + { + "epoch": 5.608, + "grad_norm": 0.40032547211306824, + "learning_rate": 2e-05, + "loss": 0.6465, + "step": 174 + }, + { + "epoch": 5.608, + "eval_loss": 0.6795651316642761, + "eval_runtime": 85.2935, + "eval_samples_per_second": 2.345, + "eval_steps_per_second": 0.293, + "step": 174 + }, + { + "epoch": 5.64, + "grad_norm": 0.3975749684060634, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 175 + }, + { + "epoch": 5.64, + "eval_loss": 0.6787837147712708, + "eval_runtime": 86.4234, + "eval_samples_per_second": 2.314, + "eval_steps_per_second": 0.289, + "step": 175 + }, + { + "epoch": 5.672, + "grad_norm": 0.4413863489846678, + "learning_rate": 2e-05, + "loss": 0.7148, + "step": 176 + }, + { + "epoch": 5.672, + "eval_loss": 0.678077220916748, + "eval_runtime": 86.46, + "eval_samples_per_second": 2.313, + "eval_steps_per_second": 0.289, + "step": 176 + }, + { + "epoch": 5.704, + "grad_norm": 0.4552334205325458, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 177 + }, + { + "epoch": 5.704, + "eval_loss": 0.6782705783843994, + "eval_runtime": 85.7334, + "eval_samples_per_second": 2.333, + "eval_steps_per_second": 0.292, + "step": 177 + }, + { + "epoch": 5.736, + "grad_norm": 0.4222034129737574, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 178 + }, + { + "epoch": 5.736, + "eval_loss": 0.6788575053215027, + "eval_runtime": 85.407, + "eval_samples_per_second": 2.342, + "eval_steps_per_second": 0.293, + "step": 178 + }, + { + "epoch": 5.768, + "grad_norm": 0.4212365440913614, + "learning_rate": 2e-05, + "loss": 0.7853, + "step": 179 + }, + { + "epoch": 5.768, + "eval_loss": 0.680314302444458, + "eval_runtime": 85.1528, + "eval_samples_per_second": 2.349, + "eval_steps_per_second": 0.294, + "step": 179 + }, + { + "epoch": 5.8, + "grad_norm": 0.47040418573969534, + "learning_rate": 2e-05, + "loss": 0.6941, + "step": 180 + }, + { + "epoch": 5.8, + "eval_loss": 0.6814693212509155, + "eval_runtime": 85.2668, + "eval_samples_per_second": 2.346, + "eval_steps_per_second": 0.293, + "step": 180 + }, + { + "epoch": 5.832, + "grad_norm": 0.43506164207204023, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 181 + }, + { + "epoch": 5.832, + "eval_loss": 0.6818942427635193, + "eval_runtime": 86.1082, + "eval_samples_per_second": 2.323, + "eval_steps_per_second": 0.29, + "step": 181 + }, + { + "epoch": 5.864, + "grad_norm": 0.4851524205448296, + "learning_rate": 2e-05, + "loss": 0.6414, + "step": 182 + }, + { + "epoch": 5.864, + "eval_loss": 0.6807515621185303, + "eval_runtime": 85.2905, + "eval_samples_per_second": 2.345, + "eval_steps_per_second": 0.293, + "step": 182 + }, + { + "epoch": 5.896, + "grad_norm": 0.46212982880574544, + "learning_rate": 2e-05, + "loss": 0.6594, + "step": 183 + }, + { + "epoch": 5.896, + "eval_loss": 0.6793842911720276, + "eval_runtime": 85.2531, + "eval_samples_per_second": 2.346, + "eval_steps_per_second": 0.293, + "step": 183 + }, + { + "epoch": 5.928, + "grad_norm": 0.43483234178092045, + "learning_rate": 2e-05, + "loss": 0.6927, + "step": 184 + }, + { + "epoch": 5.928, + "eval_loss": 0.6785325407981873, + "eval_runtime": 86.4294, + "eval_samples_per_second": 2.314, + "eval_steps_per_second": 0.289, + "step": 184 + }, + { + "epoch": 5.96, + "grad_norm": 0.45461536176049777, + "learning_rate": 2e-05, + "loss": 0.7127, + "step": 185 + }, + { + "epoch": 5.96, + "eval_loss": 0.6785117983818054, + "eval_runtime": 86.5612, + "eval_samples_per_second": 2.311, + "eval_steps_per_second": 0.289, + "step": 185 + }, + { + "epoch": 5.992, + "grad_norm": 0.5124892629103449, + "learning_rate": 2e-05, + "loss": 0.5778, + "step": 186 + }, + { + "epoch": 5.992, + "eval_loss": 0.6772163510322571, + "eval_runtime": 85.3177, + "eval_samples_per_second": 2.344, + "eval_steps_per_second": 0.293, + "step": 186 + }, + { + "epoch": 6.024, + "grad_norm": 0.4872469973004331, + "learning_rate": 2e-05, + "loss": 0.7045, + "step": 187 + }, + { + "epoch": 6.024, + "eval_loss": 0.6760932207107544, + "eval_runtime": 85.1785, + "eval_samples_per_second": 2.348, + "eval_steps_per_second": 0.294, + "step": 187 + }, + { + "epoch": 6.056, + "grad_norm": 0.43317759363804015, + "learning_rate": 2e-05, + "loss": 0.6121, + "step": 188 + }, + { + "epoch": 6.056, + "eval_loss": 0.6763756275177002, + "eval_runtime": 85.4466, + "eval_samples_per_second": 2.341, + "eval_steps_per_second": 0.293, + "step": 188 + }, + { + "epoch": 6.088, + "grad_norm": 0.47411518505747885, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 189 + }, + { + "epoch": 6.088, + "eval_loss": 0.6757389903068542, + "eval_runtime": 85.3382, + "eval_samples_per_second": 2.344, + "eval_steps_per_second": 0.293, + "step": 189 + }, + { + "epoch": 6.12, + "grad_norm": 0.4971851748274855, + "learning_rate": 2e-05, + "loss": 0.7193, + "step": 190 + }, + { + "epoch": 6.12, + "eval_loss": 0.6749419569969177, + "eval_runtime": 85.4198, + "eval_samples_per_second": 2.341, + "eval_steps_per_second": 0.293, + "step": 190 + }, + { + "epoch": 6.152, + "grad_norm": 0.46832302038313683, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 191 + }, + { + "epoch": 6.152, + "eval_loss": 0.674567699432373, + "eval_runtime": 85.6429, + "eval_samples_per_second": 2.335, + "eval_steps_per_second": 0.292, + "step": 191 + }, + { + "epoch": 6.184, + "grad_norm": 0.47651234347196103, + "learning_rate": 2e-05, + "loss": 0.7113, + "step": 192 + }, + { + "epoch": 6.184, + "eval_loss": 0.6739189028739929, + "eval_runtime": 85.3814, + "eval_samples_per_second": 2.342, + "eval_steps_per_second": 0.293, + "step": 192 + }, + { + "epoch": 6.216, + "grad_norm": 0.4808945978374079, + "learning_rate": 2e-05, + "loss": 0.6603, + "step": 193 + }, + { + "epoch": 6.216, + "eval_loss": 0.6737512350082397, + "eval_runtime": 85.1781, + "eval_samples_per_second": 2.348, + "eval_steps_per_second": 0.294, + "step": 193 + }, + { + "epoch": 6.248, + "grad_norm": 0.45747741700806654, + "learning_rate": 2e-05, + "loss": 0.6905, + "step": 194 + }, + { + "epoch": 6.248, + "eval_loss": 0.6738162040710449, + "eval_runtime": 86.5609, + "eval_samples_per_second": 2.311, + "eval_steps_per_second": 0.289, + "step": 194 + }, + { + "epoch": 6.28, + "grad_norm": 0.49033746737240025, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 195 + }, + { + "epoch": 6.28, + "eval_loss": 0.6742382645606995, + "eval_runtime": 86.4284, + "eval_samples_per_second": 2.314, + "eval_steps_per_second": 0.289, + "step": 195 + }, + { + "epoch": 6.312, + "grad_norm": 0.5438084044824532, + "learning_rate": 2e-05, + "loss": 0.6819, + "step": 196 + }, + { + "epoch": 6.312, + "eval_loss": 0.674878716468811, + "eval_runtime": 85.7504, + "eval_samples_per_second": 2.332, + "eval_steps_per_second": 0.292, + "step": 196 + }, + { + "epoch": 6.344, + "grad_norm": 0.4631516087052852, + "learning_rate": 2e-05, + "loss": 0.6775, + "step": 197 + }, + { + "epoch": 6.344, + "eval_loss": 0.6761616468429565, + "eval_runtime": 85.417, + "eval_samples_per_second": 2.341, + "eval_steps_per_second": 0.293, + "step": 197 + }, + { + "epoch": 6.376, + "grad_norm": 0.49177247669398155, + "learning_rate": 2e-05, + "loss": 0.6605, + "step": 198 + }, + { + "epoch": 6.376, + "eval_loss": 0.6770765781402588, + "eval_runtime": 85.4274, + "eval_samples_per_second": 2.341, + "eval_steps_per_second": 0.293, + "step": 198 + }, + { + "epoch": 6.408, + "grad_norm": 0.5177407926775024, + "learning_rate": 2e-05, + "loss": 0.7136, + "step": 199 + }, + { + "epoch": 6.408, + "eval_loss": 0.6772163510322571, + "eval_runtime": 85.2682, + "eval_samples_per_second": 2.346, + "eval_steps_per_second": 0.293, + "step": 199 + }, + { + "epoch": 6.44, + "grad_norm": 0.5385213429977403, + "learning_rate": 2e-05, + "loss": 0.6809, + "step": 200 + }, + { + "epoch": 6.44, + "eval_loss": 0.6758923530578613, + "eval_runtime": 85.4858, + "eval_samples_per_second": 2.34, + "eval_steps_per_second": 0.292, + "step": 200 + }, + { + "epoch": 6.4719999999999995, + "grad_norm": 0.4982626204598202, + "learning_rate": 2e-05, + "loss": 0.7159, + "step": 201 + }, + { + "epoch": 6.4719999999999995, + "eval_loss": 0.675208568572998, + "eval_runtime": 86.2523, + "eval_samples_per_second": 2.319, + "eval_steps_per_second": 0.29, + "step": 201 + }, + { + "epoch": 6.504, + "grad_norm": 0.4710756307884673, + "learning_rate": 2e-05, + "loss": 0.6309, + "step": 202 + }, + { + "epoch": 6.504, + "eval_loss": 0.6743338108062744, + "eval_runtime": 86.6636, + "eval_samples_per_second": 2.308, + "eval_steps_per_second": 0.288, + "step": 202 + }, + { + "epoch": 6.536, + "grad_norm": 0.5127505608717865, + "learning_rate": 2e-05, + "loss": 0.6257, + "step": 203 + }, + { + "epoch": 6.536, + "eval_loss": 0.6735503673553467, + "eval_runtime": 86.3216, + "eval_samples_per_second": 2.317, + "eval_steps_per_second": 0.29, + "step": 203 + }, + { + "epoch": 6.568, + "grad_norm": 0.48812419654399086, + "learning_rate": 2e-05, + "loss": 0.6164, + "step": 204 + }, + { + "epoch": 6.568, + "eval_loss": 0.6740123629570007, + "eval_runtime": 86.3491, + "eval_samples_per_second": 2.316, + "eval_steps_per_second": 0.29, + "step": 204 + }, + { + "epoch": 6.6, + "grad_norm": 0.5031408070515696, + "learning_rate": 2e-05, + "loss": 0.6765, + "step": 205 + }, + { + "epoch": 6.6, + "eval_loss": 0.6746988892555237, + "eval_runtime": 86.2475, + "eval_samples_per_second": 2.319, + "eval_steps_per_second": 0.29, + "step": 205 + }, + { + "epoch": 6.632, + "grad_norm": 0.5221751920115928, + "learning_rate": 2e-05, + "loss": 0.6836, + "step": 206 + }, + { + "epoch": 6.632, + "eval_loss": 0.6757599711418152, + "eval_runtime": 90.9953, + "eval_samples_per_second": 2.198, + "eval_steps_per_second": 0.275, + "step": 206 + }, + { + "epoch": 6.664, + "grad_norm": 0.49611473039131815, + "learning_rate": 2e-05, + "loss": 0.6809, + "step": 207 + }, + { + "epoch": 6.664, + "eval_loss": 0.676249086856842, + "eval_runtime": 88.6484, + "eval_samples_per_second": 2.256, + "eval_steps_per_second": 0.282, + "step": 207 + }, + { + "epoch": 6.696, + "grad_norm": 0.5646771313169766, + "learning_rate": 2e-05, + "loss": 0.6905, + "step": 208 + }, + { + "epoch": 6.696, + "eval_loss": 0.675098717212677, + "eval_runtime": 88.7358, + "eval_samples_per_second": 2.254, + "eval_steps_per_second": 0.282, + "step": 208 + }, + { + "epoch": 6.728, + "grad_norm": 0.5075133396143146, + "learning_rate": 2e-05, + "loss": 0.7001, + "step": 209 + }, + { + "epoch": 6.728, + "eval_loss": 0.6734683513641357, + "eval_runtime": 88.7108, + "eval_samples_per_second": 2.255, + "eval_steps_per_second": 0.282, + "step": 209 + }, + { + "epoch": 6.76, + "grad_norm": 0.5292004993716772, + "learning_rate": 2e-05, + "loss": 0.6366, + "step": 210 + }, + { + "epoch": 6.76, + "eval_loss": 0.6727490425109863, + "eval_runtime": 88.7397, + "eval_samples_per_second": 2.254, + "eval_steps_per_second": 0.282, + "step": 210 + }, + { + "epoch": 6.792, + "grad_norm": 0.5508154729937994, + "learning_rate": 2e-05, + "loss": 0.6627, + "step": 211 + }, + { + "epoch": 6.792, + "eval_loss": 0.6719673275947571, + "eval_runtime": 94.3959, + "eval_samples_per_second": 2.119, + "eval_steps_per_second": 0.265, + "step": 211 + }, + { + "epoch": 6.824, + "grad_norm": 0.5436944297369074, + "learning_rate": 2e-05, + "loss": 0.6939, + "step": 212 + }, + { + "epoch": 6.824, + "eval_loss": 0.6717627048492432, + "eval_runtime": 92.6409, + "eval_samples_per_second": 2.159, + "eval_steps_per_second": 0.27, + "step": 212 + }, + { + "epoch": 6.856, + "grad_norm": 0.563836681781508, + "learning_rate": 2e-05, + "loss": 0.6715, + "step": 213 + }, + { + "epoch": 6.856, + "eval_loss": 0.6704577803611755, + "eval_runtime": 92.7628, + "eval_samples_per_second": 2.156, + "eval_steps_per_second": 0.27, + "step": 213 + }, + { + "epoch": 6.888, + "grad_norm": 0.5903391746928088, + "learning_rate": 2e-05, + "loss": 0.6706, + "step": 214 + }, + { + "epoch": 6.888, + "eval_loss": 0.6705368161201477, + "eval_runtime": 93.0342, + "eval_samples_per_second": 2.15, + "eval_steps_per_second": 0.269, + "step": 214 + }, + { + "epoch": 6.92, + "grad_norm": 0.5044604071023134, + "learning_rate": 2e-05, + "loss": 0.6308, + "step": 215 + }, + { + "epoch": 6.92, + "eval_loss": 0.6709109544754028, + "eval_runtime": 92.9865, + "eval_samples_per_second": 2.151, + "eval_steps_per_second": 0.269, + "step": 215 + }, + { + "epoch": 6.96, + "grad_norm": 0.5029981251789745, + "learning_rate": 2e-05, + "loss": 0.6565, + "step": 216 + }, + { + "epoch": 6.96, + "eval_loss": 0.6729848384857178, + "eval_runtime": 91.2007, + "eval_samples_per_second": 2.193, + "eval_steps_per_second": 0.274, + "step": 216 + }, + { + "epoch": 6.992, + "grad_norm": 0.662623585564011, + "learning_rate": 2e-05, + "loss": 0.5311, + "step": 217 + }, + { + "epoch": 6.992, + "eval_loss": 0.6713245511054993, + "eval_runtime": 85.2866, + "eval_samples_per_second": 2.345, + "eval_steps_per_second": 0.293, + "step": 217 + }, + { + "epoch": 7.024, + "grad_norm": 0.6256446136768937, + "learning_rate": 2e-05, + "loss": 0.6022, + "step": 218 + }, + { + "epoch": 7.024, + "eval_loss": 0.6695873737335205, + "eval_runtime": 86.4021, + "eval_samples_per_second": 2.315, + "eval_steps_per_second": 0.289, + "step": 218 + }, + { + "epoch": 7.056, + "grad_norm": 0.4857229274218417, + "learning_rate": 2e-05, + "loss": 0.647, + "step": 219 + }, + { + "epoch": 7.056, + "eval_loss": 0.6719114780426025, + "eval_runtime": 86.526, + "eval_samples_per_second": 2.311, + "eval_steps_per_second": 0.289, + "step": 219 + }, + { + "epoch": 7.088, + "grad_norm": 0.5361101044951209, + "learning_rate": 2e-05, + "loss": 0.6477, + "step": 220 + }, + { + "epoch": 7.088, + "eval_loss": 0.6757076978683472, + "eval_runtime": 85.4011, + "eval_samples_per_second": 2.342, + "eval_steps_per_second": 0.293, + "step": 220 + }, + { + "epoch": 7.12, + "grad_norm": 0.8396027685018896, + "learning_rate": 2e-05, + "loss": 0.6112, + "step": 221 + }, + { + "epoch": 7.12, + "eval_loss": 0.6758625507354736, + "eval_runtime": 85.2083, + "eval_samples_per_second": 2.347, + "eval_steps_per_second": 0.293, + "step": 221 + }, + { + "epoch": 7.152, + "grad_norm": 0.5858149591099446, + "learning_rate": 2e-05, + "loss": 0.6826, + "step": 222 + }, + { + "epoch": 7.152, + "eval_loss": 0.6765357255935669, + "eval_runtime": 85.5627, + "eval_samples_per_second": 2.337, + "eval_steps_per_second": 0.292, + "step": 222 + }, + { + "epoch": 7.184, + "grad_norm": 0.5694999654835196, + "learning_rate": 2e-05, + "loss": 0.5851, + "step": 223 + }, + { + "epoch": 7.184, + "eval_loss": 0.6776654124259949, + "eval_runtime": 85.417, + "eval_samples_per_second": 2.341, + "eval_steps_per_second": 0.293, + "step": 223 + }, + { + "epoch": 7.216, + "grad_norm": 0.6001772847123094, + "learning_rate": 2e-05, + "loss": 0.6633, + "step": 224 + }, + { + "epoch": 7.216, + "eval_loss": 0.6783779859542847, + "eval_runtime": 85.1623, + "eval_samples_per_second": 2.348, + "eval_steps_per_second": 0.294, + "step": 224 + }, + { + "epoch": 7.248, + "grad_norm": 0.6068993188167514, + "learning_rate": 2e-05, + "loss": 0.6275, + "step": 225 + }, + { + "epoch": 7.248, + "eval_loss": 0.6755834817886353, + "eval_runtime": 85.3711, + "eval_samples_per_second": 2.343, + "eval_steps_per_second": 0.293, + "step": 225 + }, + { + "epoch": 7.28, + "grad_norm": 0.6038060153225616, + "learning_rate": 2e-05, + "loss": 0.6319, + "step": 226 + }, + { + "epoch": 7.28, + "eval_loss": 0.6720392107963562, + "eval_runtime": 86.1134, + "eval_samples_per_second": 2.323, + "eval_steps_per_second": 0.29, + "step": 226 + }, + { + "epoch": 7.312, + "grad_norm": 0.5900082642978601, + "learning_rate": 2e-05, + "loss": 0.6417, + "step": 227 + }, + { + "epoch": 7.312, + "eval_loss": 0.6699540615081787, + "eval_runtime": 87.4261, + "eval_samples_per_second": 2.288, + "eval_steps_per_second": 0.286, + "step": 227 + }, + { + "epoch": 7.344, + "grad_norm": 0.6303979703934064, + "learning_rate": 2e-05, + "loss": 0.5954, + "step": 228 + }, + { + "epoch": 7.344, + "eval_loss": 0.6697332859039307, + "eval_runtime": 86.3137, + "eval_samples_per_second": 2.317, + "eval_steps_per_second": 0.29, + "step": 228 + }, + { + "epoch": 7.376, + "grad_norm": 0.5896270303292949, + "learning_rate": 2e-05, + "loss": 0.638, + "step": 229 + }, + { + "epoch": 7.376, + "eval_loss": 0.6699292063713074, + "eval_runtime": 85.4946, + "eval_samples_per_second": 2.339, + "eval_steps_per_second": 0.292, + "step": 229 + }, + { + "epoch": 7.408, + "grad_norm": 0.5499007579825991, + "learning_rate": 2e-05, + "loss": 0.6312, + "step": 230 + }, + { + "epoch": 7.408, + "eval_loss": 0.6695923805236816, + "eval_runtime": 85.1955, + "eval_samples_per_second": 2.348, + "eval_steps_per_second": 0.293, + "step": 230 + }, + { + "epoch": 7.44, + "grad_norm": 0.5806994466204508, + "learning_rate": 2e-05, + "loss": 0.6487, + "step": 231 + }, + { + "epoch": 7.44, + "eval_loss": 0.670379638671875, + "eval_runtime": 85.2487, + "eval_samples_per_second": 2.346, + "eval_steps_per_second": 0.293, + "step": 231 + }, + { + "epoch": 7.4719999999999995, + "grad_norm": 0.6171318222234403, + "learning_rate": 2e-05, + "loss": 0.6497, + "step": 232 + }, + { + "epoch": 7.4719999999999995, + "eval_loss": 0.671440601348877, + "eval_runtime": 85.4044, + "eval_samples_per_second": 2.342, + "eval_steps_per_second": 0.293, + "step": 232 + }, + { + "epoch": 7.504, + "grad_norm": 0.6077497033087486, + "learning_rate": 2e-05, + "loss": 0.6388, + "step": 233 + }, + { + "epoch": 7.504, + "eval_loss": 0.6715556383132935, + "eval_runtime": 85.7739, + "eval_samples_per_second": 2.332, + "eval_steps_per_second": 0.291, + "step": 233 + }, + { + "epoch": 7.536, + "grad_norm": 0.6333159810332618, + "learning_rate": 2e-05, + "loss": 0.636, + "step": 234 + }, + { + "epoch": 7.536, + "eval_loss": 0.6708941459655762, + "eval_runtime": 85.2691, + "eval_samples_per_second": 2.346, + "eval_steps_per_second": 0.293, + "step": 234 + }, + { + "epoch": 7.568, + "grad_norm": 0.6022274503734126, + "learning_rate": 2e-05, + "loss": 0.6455, + "step": 235 + }, + { + "epoch": 7.568, + "eval_loss": 0.6690527200698853, + "eval_runtime": 85.4144, + "eval_samples_per_second": 2.342, + "eval_steps_per_second": 0.293, + "step": 235 + }, + { + "epoch": 7.608, + "grad_norm": 0.5296025255848918, + "learning_rate": 2e-05, + "loss": 0.572, + "step": 236 + }, + { + "epoch": 7.608, + "eval_loss": 0.6683849096298218, + "eval_runtime": 89.92, + "eval_samples_per_second": 2.224, + "eval_steps_per_second": 0.278, + "step": 236 + }, + { + "epoch": 7.64, + "grad_norm": 0.5436886467794938, + "learning_rate": 2e-05, + "loss": 0.6681, + "step": 237 + }, + { + "epoch": 7.64, + "eval_loss": 0.67000412940979, + "eval_runtime": 85.2548, + "eval_samples_per_second": 2.346, + "eval_steps_per_second": 0.293, + "step": 237 + }, + { + "epoch": 7.672, + "grad_norm": 0.5833714563537171, + "learning_rate": 2e-05, + "loss": 0.646, + "step": 238 + }, + { + "epoch": 7.672, + "eval_loss": 0.6720954179763794, + "eval_runtime": 85.4363, + "eval_samples_per_second": 2.341, + "eval_steps_per_second": 0.293, + "step": 238 + }, + { + "epoch": 7.704, + "grad_norm": 0.6833890117857615, + "learning_rate": 2e-05, + "loss": 0.6641, + "step": 239 + }, + { + "epoch": 7.704, + "eval_loss": 0.6737973093986511, + "eval_runtime": 85.2541, + "eval_samples_per_second": 2.346, + "eval_steps_per_second": 0.293, + "step": 239 + }, + { + "epoch": 7.736, + "grad_norm": 0.5832421680011252, + "learning_rate": 2e-05, + "loss": 0.6742, + "step": 240 + }, + { + "epoch": 7.736, + "eval_loss": 0.6757528185844421, + "eval_runtime": 85.2234, + "eval_samples_per_second": 2.347, + "eval_steps_per_second": 0.293, + "step": 240 + }, + { + "epoch": 7.768, + "grad_norm": 0.5843876495624203, + "learning_rate": 2e-05, + "loss": 0.7069, + "step": 241 + }, + { + "epoch": 7.768, + "eval_loss": 0.6778927445411682, + "eval_runtime": 85.2049, + "eval_samples_per_second": 2.347, + "eval_steps_per_second": 0.293, + "step": 241 + }, + { + "epoch": 7.8, + "grad_norm": 0.6527712899983633, + "learning_rate": 2e-05, + "loss": 0.6182, + "step": 242 + }, + { + "epoch": 7.8, + "eval_loss": 0.6785970330238342, + "eval_runtime": 84.9435, + "eval_samples_per_second": 2.355, + "eval_steps_per_second": 0.294, + "step": 242 + }, + { + "epoch": 7.832, + "grad_norm": 0.6228341483848424, + "learning_rate": 2e-05, + "loss": 0.6633, + "step": 243 + }, + { + "epoch": 7.832, + "eval_loss": 0.678627610206604, + "eval_runtime": 85.2349, + "eval_samples_per_second": 2.346, + "eval_steps_per_second": 0.293, + "step": 243 + }, + { + "epoch": 7.864, + "grad_norm": 0.6762374705072328, + "learning_rate": 2e-05, + "loss": 0.5581, + "step": 244 + }, + { + "epoch": 7.864, + "eval_loss": 0.6781509518623352, + "eval_runtime": 85.0862, + "eval_samples_per_second": 2.351, + "eval_steps_per_second": 0.294, + "step": 244 + }, + { + "epoch": 7.896, + "grad_norm": 0.6530004154367571, + "learning_rate": 2e-05, + "loss": 0.5896, + "step": 245 + }, + { + "epoch": 7.896, + "eval_loss": 0.6776159405708313, + "eval_runtime": 85.4386, + "eval_samples_per_second": 2.341, + "eval_steps_per_second": 0.293, + "step": 245 + }, + { + "epoch": 7.928, + "grad_norm": 0.6496264347077455, + "learning_rate": 2e-05, + "loss": 0.6262, + "step": 246 + }, + { + "epoch": 7.928, + "eval_loss": 0.6762803792953491, + "eval_runtime": 86.0069, + "eval_samples_per_second": 2.325, + "eval_steps_per_second": 0.291, + "step": 246 + }, + { + "epoch": 7.96, + "grad_norm": 0.6530394584817848, + "learning_rate": 2e-05, + "loss": 0.6044, + "step": 247 + }, + { + "epoch": 7.96, + "eval_loss": 0.6763593554496765, + "eval_runtime": 86.5382, + "eval_samples_per_second": 2.311, + "eval_steps_per_second": 0.289, + "step": 247 + }, + { + "epoch": 7.992, + "grad_norm": 0.7165450399528321, + "learning_rate": 2e-05, + "loss": 0.4777, + "step": 248 + }, + { + "epoch": 7.992, + "eval_loss": 0.6767419576644897, + "eval_runtime": 85.3445, + "eval_samples_per_second": 2.343, + "eval_steps_per_second": 0.293, + "step": 248 + }, + { + "epoch": 8.024, + "grad_norm": 0.6210079733679161, + "learning_rate": 2e-05, + "loss": 0.6113, + "step": 249 + }, + { + "epoch": 8.024, + "eval_loss": 0.6772445440292358, + "eval_runtime": 85.4211, + "eval_samples_per_second": 2.341, + "eval_steps_per_second": 0.293, + "step": 249 + }, + { + "epoch": 8.056, + "grad_norm": 0.6078116340925231, + "learning_rate": 2e-05, + "loss": 0.6133, + "step": 250 + }, + { + "epoch": 8.056, + "eval_loss": 0.6801083087921143, + "eval_runtime": 85.1688, + "eval_samples_per_second": 2.348, + "eval_steps_per_second": 0.294, + "step": 250 + }, + { + "epoch": 8.088, + "grad_norm": 0.6584954900058523, + "learning_rate": 2e-05, + "loss": 0.6234, + "step": 251 + }, + { + "epoch": 8.088, + "eval_loss": 0.680172324180603, + "eval_runtime": 85.2854, + "eval_samples_per_second": 2.345, + "eval_steps_per_second": 0.293, + "step": 251 + }, + { + "epoch": 8.12, + "grad_norm": 0.6410123072973634, + "learning_rate": 2e-05, + "loss": 0.6768, + "step": 252 + }, + { + "epoch": 8.12, + "eval_loss": 0.6790580749511719, + "eval_runtime": 85.343, + "eval_samples_per_second": 2.343, + "eval_steps_per_second": 0.293, + "step": 252 + }, + { + "epoch": 8.152, + "grad_norm": 0.7068905769473427, + "learning_rate": 2e-05, + "loss": 0.6308, + "step": 253 + }, + { + "epoch": 8.152, + "eval_loss": 0.6779585480690002, + "eval_runtime": 85.2579, + "eval_samples_per_second": 2.346, + "eval_steps_per_second": 0.293, + "step": 253 + }, + { + "epoch": 8.184, + "grad_norm": 0.6941639704688177, + "learning_rate": 2e-05, + "loss": 0.6651, + "step": 254 + }, + { + "epoch": 8.184, + "eval_loss": 0.6783471703529358, + "eval_runtime": 85.2368, + "eval_samples_per_second": 2.346, + "eval_steps_per_second": 0.293, + "step": 254 + }, + { + "epoch": 8.216, + "grad_norm": 0.7347551538754563, + "learning_rate": 2e-05, + "loss": 0.643, + "step": 255 + }, + { + "epoch": 8.216, + "eval_loss": 0.6772164702415466, + "eval_runtime": 86.2786, + "eval_samples_per_second": 2.318, + "eval_steps_per_second": 0.29, + "step": 255 + }, + { + "epoch": 8.248, + "grad_norm": 0.7690902453226406, + "learning_rate": 2e-05, + "loss": 0.6178, + "step": 256 + }, + { + "epoch": 8.248, + "eval_loss": 0.67960524559021, + "eval_runtime": 86.3356, + "eval_samples_per_second": 2.317, + "eval_steps_per_second": 0.29, + "step": 256 + }, + { + "epoch": 8.28, + "grad_norm": 0.6534589041693806, + "learning_rate": 2e-05, + "loss": 0.6231, + "step": 257 + }, + { + "epoch": 8.28, + "eval_loss": 0.683942437171936, + "eval_runtime": 86.114, + "eval_samples_per_second": 2.323, + "eval_steps_per_second": 0.29, + "step": 257 + }, + { + "epoch": 8.312, + "grad_norm": 0.7620000857656035, + "learning_rate": 2e-05, + "loss": 0.5937, + "step": 258 + }, + { + "epoch": 8.312, + "eval_loss": 0.6850832104682922, + "eval_runtime": 86.3659, + "eval_samples_per_second": 2.316, + "eval_steps_per_second": 0.289, + "step": 258 + }, + { + "epoch": 8.344, + "grad_norm": 0.8769311371151648, + "learning_rate": 2e-05, + "loss": 0.6288, + "step": 259 + }, + { + "epoch": 8.344, + "eval_loss": 0.6806495189666748, + "eval_runtime": 86.2011, + "eval_samples_per_second": 2.32, + "eval_steps_per_second": 0.29, + "step": 259 + }, + { + "epoch": 8.376, + "grad_norm": 0.7549996230143433, + "learning_rate": 2e-05, + "loss": 0.5614, + "step": 260 + }, + { + "epoch": 8.376, + "eval_loss": 0.6746546030044556, + "eval_runtime": 86.0828, + "eval_samples_per_second": 2.323, + "eval_steps_per_second": 0.29, + "step": 260 + }, + { + "epoch": 8.408, + "grad_norm": 0.6678277921019138, + "learning_rate": 2e-05, + "loss": 0.5818, + "step": 261 + }, + { + "epoch": 8.408, + "eval_loss": 0.6705954074859619, + "eval_runtime": 86.9228, + "eval_samples_per_second": 2.301, + "eval_steps_per_second": 0.288, + "step": 261 + }, + { + "epoch": 8.44, + "grad_norm": 0.6629861523432089, + "learning_rate": 2e-05, + "loss": 0.6231, + "step": 262 + }, + { + "epoch": 8.44, + "eval_loss": 0.6688622832298279, + "eval_runtime": 86.644, + "eval_samples_per_second": 2.308, + "eval_steps_per_second": 0.289, + "step": 262 + }, + { + "epoch": 8.472, + "grad_norm": 0.7468331552698385, + "learning_rate": 2e-05, + "loss": 0.6221, + "step": 263 + }, + { + "epoch": 8.472, + "eval_loss": 0.6675601005554199, + "eval_runtime": 86.5385, + "eval_samples_per_second": 2.311, + "eval_steps_per_second": 0.289, + "step": 263 + }, + { + "epoch": 8.504, + "grad_norm": 0.6718735805762622, + "learning_rate": 2e-05, + "loss": 0.5989, + "step": 264 + }, + { + "epoch": 8.504, + "eval_loss": 0.6682644486427307, + "eval_runtime": 86.5137, + "eval_samples_per_second": 2.312, + "eval_steps_per_second": 0.289, + "step": 264 + }, + { + "epoch": 8.536, + "grad_norm": 0.7360990456326049, + "learning_rate": 2e-05, + "loss": 0.586, + "step": 265 + }, + { + "epoch": 8.536, + "eval_loss": 0.670520544052124, + "eval_runtime": 86.3921, + "eval_samples_per_second": 2.315, + "eval_steps_per_second": 0.289, + "step": 265 + }, + { + "epoch": 8.568, + "grad_norm": 0.7372365755868506, + "learning_rate": 2e-05, + "loss": 0.6154, + "step": 266 + }, + { + "epoch": 8.568, + "eval_loss": 0.6722339391708374, + "eval_runtime": 86.4486, + "eval_samples_per_second": 2.314, + "eval_steps_per_second": 0.289, + "step": 266 + }, + { + "epoch": 8.6, + "grad_norm": 0.7691674703908615, + "learning_rate": 2e-05, + "loss": 0.5759, + "step": 267 + }, + { + "epoch": 8.6, + "eval_loss": 0.6752627491950989, + "eval_runtime": 86.3478, + "eval_samples_per_second": 2.316, + "eval_steps_per_second": 0.29, + "step": 267 + }, + { + "epoch": 8.632, + "grad_norm": 0.7037334988016319, + "learning_rate": 2e-05, + "loss": 0.5808, + "step": 268 + }, + { + "epoch": 8.632, + "eval_loss": 0.6786094903945923, + "eval_runtime": 86.3221, + "eval_samples_per_second": 2.317, + "eval_steps_per_second": 0.29, + "step": 268 + }, + { + "epoch": 8.664, + "grad_norm": 0.7364875762471698, + "learning_rate": 2e-05, + "loss": 0.6381, + "step": 269 + }, + { + "epoch": 8.664, + "eval_loss": 0.6802875399589539, + "eval_runtime": 87.3858, + "eval_samples_per_second": 2.289, + "eval_steps_per_second": 0.286, + "step": 269 + }, + { + "epoch": 8.696, + "grad_norm": 0.772443884505786, + "learning_rate": 2e-05, + "loss": 0.5779, + "step": 270 + }, + { + "epoch": 8.696, + "eval_loss": 0.6788821220397949, + "eval_runtime": 86.2832, + "eval_samples_per_second": 2.318, + "eval_steps_per_second": 0.29, + "step": 270 + }, + { + "epoch": 8.728, + "grad_norm": 0.8243245423024692, + "learning_rate": 2e-05, + "loss": 0.5899, + "step": 271 + }, + { + "epoch": 8.728, + "eval_loss": 0.6770071983337402, + "eval_runtime": 89.4282, + "eval_samples_per_second": 2.236, + "eval_steps_per_second": 0.28, + "step": 271 + }, + { + "epoch": 8.76, + "grad_norm": 0.7241934433417714, + "learning_rate": 2e-05, + "loss": 0.6323, + "step": 272 + }, + { + "epoch": 8.76, + "eval_loss": 0.6750556826591492, + "eval_runtime": 88.2376, + "eval_samples_per_second": 2.267, + "eval_steps_per_second": 0.283, + "step": 272 + }, + { + "epoch": 8.792, + "grad_norm": 0.7031539699096522, + "learning_rate": 2e-05, + "loss": 0.5878, + "step": 273 + }, + { + "epoch": 8.792, + "eval_loss": 0.6727312207221985, + "eval_runtime": 88.2462, + "eval_samples_per_second": 2.266, + "eval_steps_per_second": 0.283, + "step": 273 + }, + { + "epoch": 8.824, + "grad_norm": 0.7218931138049051, + "learning_rate": 2e-05, + "loss": 0.613, + "step": 274 + }, + { + "epoch": 8.824, + "eval_loss": 0.6727555990219116, + "eval_runtime": 88.3872, + "eval_samples_per_second": 2.263, + "eval_steps_per_second": 0.283, + "step": 274 + }, + { + "epoch": 8.856, + "grad_norm": 0.7231490645694756, + "learning_rate": 2e-05, + "loss": 0.6315, + "step": 275 + }, + { + "epoch": 8.856, + "eval_loss": 0.6705790758132935, + "eval_runtime": 87.9218, + "eval_samples_per_second": 2.275, + "eval_steps_per_second": 0.284, + "step": 275 + }, + { + "epoch": 8.888, + "grad_norm": 0.7051718905755886, + "learning_rate": 2e-05, + "loss": 0.6076, + "step": 276 + }, + { + "epoch": 8.888, + "eval_loss": 0.6689162254333496, + "eval_runtime": 91.8597, + "eval_samples_per_second": 2.177, + "eval_steps_per_second": 0.272, + "step": 276 + }, + { + "epoch": 8.92, + "grad_norm": 0.7328110944982523, + "learning_rate": 2e-05, + "loss": 0.624, + "step": 277 + }, + { + "epoch": 8.92, + "eval_loss": 0.6683139204978943, + "eval_runtime": 92.6652, + "eval_samples_per_second": 2.158, + "eval_steps_per_second": 0.27, + "step": 277 + }, + { + "epoch": 8.952, + "grad_norm": 0.7116677024113118, + "learning_rate": 2e-05, + "loss": 0.6078, + "step": 278 + }, + { + "epoch": 8.952, + "eval_loss": 0.6700756549835205, + "eval_runtime": 91.0374, + "eval_samples_per_second": 2.197, + "eval_steps_per_second": 0.275, + "step": 278 + }, + { + "epoch": 8.984, + "grad_norm": 0.7461165978892803, + "learning_rate": 2e-05, + "loss": 0.6104, + "step": 279 + }, + { + "epoch": 8.984, + "eval_loss": 0.670260488986969, + "eval_runtime": 90.6087, + "eval_samples_per_second": 2.207, + "eval_steps_per_second": 0.276, + "step": 279 + }, + { + "epoch": 9.016, + "grad_norm": 0.73533421631475, + "learning_rate": 2e-05, + "loss": 0.5993, + "step": 280 + }, + { + "epoch": 9.016, + "eval_loss": 0.6718578934669495, + "eval_runtime": 90.678, + "eval_samples_per_second": 2.206, + "eval_steps_per_second": 0.276, + "step": 280 + }, + { + "epoch": 9.064, + "grad_norm": 0.7141681250783954, + "learning_rate": 2e-05, + "loss": 0.5763, + "step": 281 + }, + { + "epoch": 9.064, + "eval_loss": 0.6721769571304321, + "eval_runtime": 89.905, + "eval_samples_per_second": 2.225, + "eval_steps_per_second": 0.278, + "step": 281 + }, + { + "epoch": 9.096, + "grad_norm": 0.7179191597134931, + "learning_rate": 2e-05, + "loss": 0.6039, + "step": 282 + }, + { + "epoch": 9.096, + "eval_loss": 0.670803964138031, + "eval_runtime": 85.2808, + "eval_samples_per_second": 2.345, + "eval_steps_per_second": 0.293, + "step": 282 + }, + { + "epoch": 9.128, + "grad_norm": 0.6677686603528123, + "learning_rate": 2e-05, + "loss": 0.6018, + "step": 283 + }, + { + "epoch": 9.128, + "eval_loss": 0.67020583152771, + "eval_runtime": 85.394, + "eval_samples_per_second": 2.342, + "eval_steps_per_second": 0.293, + "step": 283 + }, + { + "epoch": 9.16, + "grad_norm": 0.7730086322440611, + "learning_rate": 2e-05, + "loss": 0.5776, + "step": 284 + }, + { + "epoch": 9.16, + "eval_loss": 0.6687878370285034, + "eval_runtime": 85.3242, + "eval_samples_per_second": 2.344, + "eval_steps_per_second": 0.293, + "step": 284 + }, + { + "epoch": 9.192, + "grad_norm": 0.6910229929978119, + "learning_rate": 2e-05, + "loss": 0.5947, + "step": 285 + }, + { + "epoch": 9.192, + "eval_loss": 0.6686851978302002, + "eval_runtime": 85.4375, + "eval_samples_per_second": 2.341, + "eval_steps_per_second": 0.293, + "step": 285 + } + ], + "logging_steps": 1.0, + "max_steps": 310, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 5, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 612422111199232.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}