{ "best_metric": 0.6686851978302002, "best_model_checkpoint": "./checkpoints/llava-v1.6-vicuna-13b/checkpoint-285", "epoch": 9.992, "eval_steps": 1.0, "global_step": 310, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.032, "grad_norm": 0.5758810052676581, "learning_rate": 0.0, "loss": 1.5784, "step": 1 }, { "epoch": 0.032, "eval_loss": 1.614479660987854, "eval_runtime": 90.1495, "eval_samples_per_second": 2.219, "eval_steps_per_second": 0.277, "step": 1 }, { "epoch": 0.064, "grad_norm": 0.5732524292532967, "learning_rate": 1.2618595071429148e-05, "loss": 1.496, "step": 2 }, { "epoch": 0.064, "eval_loss": 1.614479660987854, "eval_runtime": 85.9035, "eval_samples_per_second": 2.328, "eval_steps_per_second": 0.291, "step": 2 }, { "epoch": 0.096, "grad_norm": 0.5752113482534317, "learning_rate": 2e-05, "loss": 1.5565, "step": 3 }, { "epoch": 0.096, "eval_loss": 1.5945543050765991, "eval_runtime": 85.9029, "eval_samples_per_second": 2.328, "eval_steps_per_second": 0.291, "step": 3 }, { "epoch": 0.128, "grad_norm": 0.46566701161868557, "learning_rate": 2e-05, "loss": 1.4933, "step": 4 }, { "epoch": 0.128, "eval_loss": 1.5579420328140259, "eval_runtime": 85.9412, "eval_samples_per_second": 2.327, "eval_steps_per_second": 0.291, "step": 4 }, { "epoch": 0.16, "grad_norm": 0.6298906192739324, "learning_rate": 2e-05, "loss": 1.5177, "step": 5 }, { "epoch": 0.16, "eval_loss": 1.519984483718872, "eval_runtime": 85.9435, "eval_samples_per_second": 2.327, "eval_steps_per_second": 0.291, "step": 5 }, { "epoch": 0.192, "grad_norm": 0.5629546129758171, "learning_rate": 2e-05, "loss": 1.4806, "step": 6 }, { "epoch": 0.192, "eval_loss": 1.4810457229614258, "eval_runtime": 86.8543, "eval_samples_per_second": 2.303, "eval_steps_per_second": 0.288, "step": 6 }, { "epoch": 0.224, "grad_norm": 0.5629546129758171, "learning_rate": 2e-05, "loss": 1.4426, "step": 7 }, { "epoch": 0.224, "eval_loss": 1.4810457229614258, "eval_runtime": 85.5769, "eval_samples_per_second": 2.337, "eval_steps_per_second": 0.292, "step": 7 }, { "epoch": 0.256, "grad_norm": 0.5629546129758171, "learning_rate": 2e-05, "loss": 1.487, "step": 8 }, { "epoch": 0.256, "eval_loss": 1.4810457229614258, "eval_runtime": 85.5166, "eval_samples_per_second": 2.339, "eval_steps_per_second": 0.292, "step": 8 }, { "epoch": 0.288, "grad_norm": 0.5225734696456765, "learning_rate": 2e-05, "loss": 1.4824, "step": 9 }, { "epoch": 0.288, "eval_loss": 1.4472432136535645, "eval_runtime": 85.6273, "eval_samples_per_second": 2.336, "eval_steps_per_second": 0.292, "step": 9 }, { "epoch": 0.32, "grad_norm": 1.5120766386106574, "learning_rate": 2e-05, "loss": 1.4055, "step": 10 }, { "epoch": 0.32, "eval_loss": 1.431533694267273, "eval_runtime": 85.1358, "eval_samples_per_second": 2.349, "eval_steps_per_second": 0.294, "step": 10 }, { "epoch": 0.352, "grad_norm": 1.5120766386106574, "learning_rate": 2e-05, "loss": 1.4374, "step": 11 }, { "epoch": 0.352, "eval_loss": 1.431533694267273, "eval_runtime": 85.8174, "eval_samples_per_second": 2.331, "eval_steps_per_second": 0.291, "step": 11 }, { "epoch": 0.384, "grad_norm": 1.9218280445348435, "learning_rate": 2e-05, "loss": 1.4128, "step": 12 }, { "epoch": 0.384, "eval_loss": 1.4020923376083374, "eval_runtime": 85.7769, "eval_samples_per_second": 2.332, "eval_steps_per_second": 0.291, "step": 12 }, { "epoch": 0.416, "grad_norm": 1.5494392795931824, "learning_rate": 2e-05, "loss": 1.4671, "step": 13 }, { "epoch": 0.416, "eval_loss": 1.3614002466201782, "eval_runtime": 85.7302, "eval_samples_per_second": 2.333, "eval_steps_per_second": 0.292, "step": 13 }, { "epoch": 0.448, "grad_norm": 2.3567445757054766, "learning_rate": 2e-05, "loss": 1.2809, "step": 14 }, { "epoch": 0.448, "eval_loss": 1.3194799423217773, "eval_runtime": 85.9134, "eval_samples_per_second": 2.328, "eval_steps_per_second": 0.291, "step": 14 }, { "epoch": 0.48, "grad_norm": 1.5106182972290174, "learning_rate": 2e-05, "loss": 1.3338, "step": 15 }, { "epoch": 0.48, "eval_loss": 1.2909258604049683, "eval_runtime": 86.1624, "eval_samples_per_second": 2.321, "eval_steps_per_second": 0.29, "step": 15 }, { "epoch": 0.512, "grad_norm": 0.6480008459041514, "learning_rate": 2e-05, "loss": 1.2888, "step": 16 }, { "epoch": 0.512, "eval_loss": 1.267655611038208, "eval_runtime": 85.1961, "eval_samples_per_second": 2.348, "eval_steps_per_second": 0.293, "step": 16 }, { "epoch": 0.544, "grad_norm": 0.5742819355565492, "learning_rate": 2e-05, "loss": 1.265, "step": 17 }, { "epoch": 0.544, "eval_loss": 1.243842601776123, "eval_runtime": 85.3262, "eval_samples_per_second": 2.344, "eval_steps_per_second": 0.293, "step": 17 }, { "epoch": 0.576, "grad_norm": 0.6388701851382904, "learning_rate": 2e-05, "loss": 1.2662, "step": 18 }, { "epoch": 0.576, "eval_loss": 1.2195556163787842, "eval_runtime": 85.1951, "eval_samples_per_second": 2.348, "eval_steps_per_second": 0.293, "step": 18 }, { "epoch": 0.608, "grad_norm": 0.4940836425011853, "learning_rate": 2e-05, "loss": 1.136, "step": 19 }, { "epoch": 0.608, "eval_loss": 1.1959036588668823, "eval_runtime": 85.3064, "eval_samples_per_second": 2.344, "eval_steps_per_second": 0.293, "step": 19 }, { "epoch": 0.64, "grad_norm": 0.48425296045894156, "learning_rate": 2e-05, "loss": 1.2361, "step": 20 }, { "epoch": 0.64, "eval_loss": 1.1732313632965088, "eval_runtime": 85.2018, "eval_samples_per_second": 2.347, "eval_steps_per_second": 0.293, "step": 20 }, { "epoch": 0.672, "grad_norm": 0.4174760070919301, "learning_rate": 2e-05, "loss": 1.1559, "step": 21 }, { "epoch": 0.672, "eval_loss": 1.151344656944275, "eval_runtime": 85.1267, "eval_samples_per_second": 2.349, "eval_steps_per_second": 0.294, "step": 21 }, { "epoch": 0.704, "grad_norm": 0.4514925259027495, "learning_rate": 2e-05, "loss": 1.2058, "step": 22 }, { "epoch": 0.704, "eval_loss": 1.1299418210983276, "eval_runtime": 85.187, "eval_samples_per_second": 2.348, "eval_steps_per_second": 0.293, "step": 22 }, { "epoch": 0.736, "grad_norm": 0.4104167292767524, "learning_rate": 2e-05, "loss": 1.1874, "step": 23 }, { "epoch": 0.736, "eval_loss": 1.1091759204864502, "eval_runtime": 85.2104, "eval_samples_per_second": 2.347, "eval_steps_per_second": 0.293, "step": 23 }, { "epoch": 0.768, "grad_norm": 0.3690081767478843, "learning_rate": 2e-05, "loss": 1.1555, "step": 24 }, { "epoch": 0.768, "eval_loss": 1.0891705751419067, "eval_runtime": 85.0331, "eval_samples_per_second": 2.352, "eval_steps_per_second": 0.294, "step": 24 }, { "epoch": 0.8, "grad_norm": 0.39605731232207236, "learning_rate": 2e-05, "loss": 1.1113, "step": 25 }, { "epoch": 0.8, "eval_loss": 1.0692001581192017, "eval_runtime": 85.2164, "eval_samples_per_second": 2.347, "eval_steps_per_second": 0.293, "step": 25 }, { "epoch": 0.832, "grad_norm": 0.36927768645318826, "learning_rate": 2e-05, "loss": 1.1124, "step": 26 }, { "epoch": 0.832, "eval_loss": 1.0496515035629272, "eval_runtime": 88.1096, "eval_samples_per_second": 2.27, "eval_steps_per_second": 0.284, "step": 26 }, { "epoch": 0.864, "grad_norm": 0.38712274276178793, "learning_rate": 2e-05, "loss": 1.0485, "step": 27 }, { "epoch": 0.864, "eval_loss": 1.0307434797286987, "eval_runtime": 88.6941, "eval_samples_per_second": 2.255, "eval_steps_per_second": 0.282, "step": 27 }, { "epoch": 0.896, "grad_norm": 0.37683532534478703, "learning_rate": 2e-05, "loss": 1.0494, "step": 28 }, { "epoch": 0.896, "eval_loss": 1.0122556686401367, "eval_runtime": 88.6898, "eval_samples_per_second": 2.255, "eval_steps_per_second": 0.282, "step": 28 }, { "epoch": 0.928, "grad_norm": 0.31167540236884894, "learning_rate": 2e-05, "loss": 1.0384, "step": 29 }, { "epoch": 0.928, "eval_loss": 0.994915783405304, "eval_runtime": 88.4751, "eval_samples_per_second": 2.261, "eval_steps_per_second": 0.283, "step": 29 }, { "epoch": 0.96, "grad_norm": 0.3035168410857397, "learning_rate": 2e-05, "loss": 1.0571, "step": 30 }, { "epoch": 0.96, "eval_loss": 0.9787777662277222, "eval_runtime": 88.4016, "eval_samples_per_second": 2.262, "eval_steps_per_second": 0.283, "step": 30 }, { "epoch": 0.992, "grad_norm": 0.3501105312815732, "learning_rate": 2e-05, "loss": 0.915, "step": 31 }, { "epoch": 0.992, "eval_loss": 0.9635753035545349, "eval_runtime": 93.659, "eval_samples_per_second": 2.135, "eval_steps_per_second": 0.267, "step": 31 }, { "epoch": 1.024, "grad_norm": 0.31289892959527454, "learning_rate": 2e-05, "loss": 1.0061, "step": 32 }, { "epoch": 1.024, "eval_loss": 0.9496576189994812, "eval_runtime": 92.1616, "eval_samples_per_second": 2.17, "eval_steps_per_second": 0.271, "step": 32 }, { "epoch": 1.056, "grad_norm": 0.29757404844376606, "learning_rate": 2e-05, "loss": 1.018, "step": 33 }, { "epoch": 1.056, "eval_loss": 0.9369340538978577, "eval_runtime": 92.6023, "eval_samples_per_second": 2.16, "eval_steps_per_second": 0.27, "step": 33 }, { "epoch": 1.088, "grad_norm": 0.2618148684145232, "learning_rate": 2e-05, "loss": 0.927, "step": 34 }, { "epoch": 1.088, "eval_loss": 0.9251891374588013, "eval_runtime": 92.1541, "eval_samples_per_second": 2.17, "eval_steps_per_second": 0.271, "step": 34 }, { "epoch": 1.12, "grad_norm": 0.28251385173765375, "learning_rate": 2e-05, "loss": 0.9539, "step": 35 }, { "epoch": 1.12, "eval_loss": 0.913889467716217, "eval_runtime": 92.0402, "eval_samples_per_second": 2.173, "eval_steps_per_second": 0.272, "step": 35 }, { "epoch": 1.16, "grad_norm": 0.260093009410511, "learning_rate": 2e-05, "loss": 0.9356, "step": 36 }, { "epoch": 1.16, "eval_loss": 0.9035020470619202, "eval_runtime": 89.6964, "eval_samples_per_second": 2.23, "eval_steps_per_second": 0.279, "step": 36 }, { "epoch": 1.192, "grad_norm": 0.27662724636836117, "learning_rate": 2e-05, "loss": 0.9597, "step": 37 }, { "epoch": 1.192, "eval_loss": 0.8957402110099792, "eval_runtime": 85.1681, "eval_samples_per_second": 2.348, "eval_steps_per_second": 0.294, "step": 37 }, { "epoch": 1.224, "grad_norm": 0.5728746629980745, "learning_rate": 2e-05, "loss": 0.9398, "step": 38 }, { "epoch": 1.224, "eval_loss": 0.887246310710907, "eval_runtime": 85.6149, "eval_samples_per_second": 2.336, "eval_steps_per_second": 0.292, "step": 38 }, { "epoch": 1.256, "grad_norm": 0.2684824759760228, "learning_rate": 2e-05, "loss": 0.9616, "step": 39 }, { "epoch": 1.256, "eval_loss": 0.8806753754615784, "eval_runtime": 86.3901, "eval_samples_per_second": 2.315, "eval_steps_per_second": 0.289, "step": 39 }, { "epoch": 1.288, "grad_norm": 0.24685769110976413, "learning_rate": 2e-05, "loss": 0.9854, "step": 40 }, { "epoch": 1.288, "eval_loss": 0.8744142055511475, "eval_runtime": 85.3845, "eval_samples_per_second": 2.342, "eval_steps_per_second": 0.293, "step": 40 }, { "epoch": 1.32, "grad_norm": 0.2357626526047496, "learning_rate": 2e-05, "loss": 0.9284, "step": 41 }, { "epoch": 1.32, "eval_loss": 0.868619441986084, "eval_runtime": 86.2809, "eval_samples_per_second": 2.318, "eval_steps_per_second": 0.29, "step": 41 }, { "epoch": 1.3519999999999999, "grad_norm": 0.22791772858432163, "learning_rate": 2e-05, "loss": 1.0035, "step": 42 }, { "epoch": 1.3519999999999999, "eval_loss": 0.8631160259246826, "eval_runtime": 85.2643, "eval_samples_per_second": 2.346, "eval_steps_per_second": 0.293, "step": 42 }, { "epoch": 1.384, "grad_norm": 0.2301475582048382, "learning_rate": 2e-05, "loss": 0.9441, "step": 43 }, { "epoch": 1.384, "eval_loss": 0.8579904437065125, "eval_runtime": 85.464, "eval_samples_per_second": 2.34, "eval_steps_per_second": 0.293, "step": 43 }, { "epoch": 1.416, "grad_norm": 0.2435877146655292, "learning_rate": 2e-05, "loss": 0.9537, "step": 44 }, { "epoch": 1.416, "eval_loss": 0.8532869219779968, "eval_runtime": 85.4531, "eval_samples_per_second": 2.34, "eval_steps_per_second": 0.293, "step": 44 }, { "epoch": 1.448, "grad_norm": 0.22680224690529022, "learning_rate": 2e-05, "loss": 0.8432, "step": 45 }, { "epoch": 1.448, "eval_loss": 0.8488282561302185, "eval_runtime": 85.2256, "eval_samples_per_second": 2.347, "eval_steps_per_second": 0.293, "step": 45 }, { "epoch": 1.48, "grad_norm": 0.24467493716810432, "learning_rate": 2e-05, "loss": 0.9582, "step": 46 }, { "epoch": 1.48, "eval_loss": 0.8452281951904297, "eval_runtime": 86.4412, "eval_samples_per_second": 2.314, "eval_steps_per_second": 0.289, "step": 46 }, { "epoch": 1.512, "grad_norm": 0.3102498103163491, "learning_rate": 2e-05, "loss": 0.8935, "step": 47 }, { "epoch": 1.512, "eval_loss": 0.8409687876701355, "eval_runtime": 85.1809, "eval_samples_per_second": 2.348, "eval_steps_per_second": 0.293, "step": 47 }, { "epoch": 1.544, "grad_norm": 0.26376164638875965, "learning_rate": 2e-05, "loss": 0.9153, "step": 48 }, { "epoch": 1.544, "eval_loss": 0.836646556854248, "eval_runtime": 85.3347, "eval_samples_per_second": 2.344, "eval_steps_per_second": 0.293, "step": 48 }, { "epoch": 1.576, "grad_norm": 0.26268816516328214, "learning_rate": 2e-05, "loss": 0.8937, "step": 49 }, { "epoch": 1.576, "eval_loss": 0.8322432637214661, "eval_runtime": 85.1246, "eval_samples_per_second": 2.349, "eval_steps_per_second": 0.294, "step": 49 }, { "epoch": 1.608, "grad_norm": 0.20800644242816013, "learning_rate": 2e-05, "loss": 0.8346, "step": 50 }, { "epoch": 1.608, "eval_loss": 0.8282632231712341, "eval_runtime": 85.9203, "eval_samples_per_second": 2.328, "eval_steps_per_second": 0.291, "step": 50 }, { "epoch": 1.6400000000000001, "grad_norm": 0.234023912047604, "learning_rate": 2e-05, "loss": 0.9457, "step": 51 }, { "epoch": 1.6400000000000001, "eval_loss": 0.8249067068099976, "eval_runtime": 85.5644, "eval_samples_per_second": 2.337, "eval_steps_per_second": 0.292, "step": 51 }, { "epoch": 1.6720000000000002, "grad_norm": 0.22274778391959116, "learning_rate": 2e-05, "loss": 0.894, "step": 52 }, { "epoch": 1.6720000000000002, "eval_loss": 0.8220057487487793, "eval_runtime": 85.2356, "eval_samples_per_second": 2.346, "eval_steps_per_second": 0.293, "step": 52 }, { "epoch": 1.704, "grad_norm": 0.247116310753153, "learning_rate": 2e-05, "loss": 0.9422, "step": 53 }, { "epoch": 1.704, "eval_loss": 0.8193264603614807, "eval_runtime": 85.2302, "eval_samples_per_second": 2.347, "eval_steps_per_second": 0.293, "step": 53 }, { "epoch": 1.736, "grad_norm": 0.2156755816522451, "learning_rate": 2e-05, "loss": 0.9483, "step": 54 }, { "epoch": 1.736, "eval_loss": 0.8170039653778076, "eval_runtime": 85.5313, "eval_samples_per_second": 2.338, "eval_steps_per_second": 0.292, "step": 54 }, { "epoch": 1.768, "grad_norm": 0.20641699121207405, "learning_rate": 2e-05, "loss": 0.9433, "step": 55 }, { "epoch": 1.768, "eval_loss": 0.8153803944587708, "eval_runtime": 85.4465, "eval_samples_per_second": 2.341, "eval_steps_per_second": 0.293, "step": 55 }, { "epoch": 1.8, "grad_norm": 0.22411091268182518, "learning_rate": 2e-05, "loss": 0.8839, "step": 56 }, { "epoch": 1.8, "eval_loss": 0.8131626844406128, "eval_runtime": 87.3514, "eval_samples_per_second": 2.29, "eval_steps_per_second": 0.286, "step": 56 }, { "epoch": 1.8319999999999999, "grad_norm": 0.22136515515298041, "learning_rate": 2e-05, "loss": 0.9219, "step": 57 }, { "epoch": 1.8319999999999999, "eval_loss": 0.8108111023902893, "eval_runtime": 86.5607, "eval_samples_per_second": 2.311, "eval_steps_per_second": 0.289, "step": 57 }, { "epoch": 1.8639999999999999, "grad_norm": 0.22277176520749853, "learning_rate": 2e-05, "loss": 0.8317, "step": 58 }, { "epoch": 1.8639999999999999, "eval_loss": 0.8082687854766846, "eval_runtime": 86.4781, "eval_samples_per_second": 2.313, "eval_steps_per_second": 0.289, "step": 58 }, { "epoch": 1.896, "grad_norm": 0.22242544994690336, "learning_rate": 2e-05, "loss": 0.8317, "step": 59 }, { "epoch": 1.896, "eval_loss": 0.8052798509597778, "eval_runtime": 86.1491, "eval_samples_per_second": 2.322, "eval_steps_per_second": 0.29, "step": 59 }, { "epoch": 1.928, "grad_norm": 0.20539599715237697, "learning_rate": 2e-05, "loss": 0.8777, "step": 60 }, { "epoch": 1.928, "eval_loss": 0.8023205399513245, "eval_runtime": 86.2508, "eval_samples_per_second": 2.319, "eval_steps_per_second": 0.29, "step": 60 }, { "epoch": 1.96, "grad_norm": 0.2259203508735786, "learning_rate": 2e-05, "loss": 0.8987, "step": 61 }, { "epoch": 1.96, "eval_loss": 0.7997938394546509, "eval_runtime": 87.6556, "eval_samples_per_second": 2.282, "eval_steps_per_second": 0.285, "step": 61 }, { "epoch": 1.992, "grad_norm": 0.2423173341059814, "learning_rate": 2e-05, "loss": 0.7621, "step": 62 }, { "epoch": 1.992, "eval_loss": 0.7969934344291687, "eval_runtime": 86.8775, "eval_samples_per_second": 2.302, "eval_steps_per_second": 0.288, "step": 62 }, { "epoch": 2.024, "grad_norm": 0.24036001781096705, "learning_rate": 2e-05, "loss": 0.8819, "step": 63 }, { "epoch": 2.024, "eval_loss": 0.7944203019142151, "eval_runtime": 86.5654, "eval_samples_per_second": 2.31, "eval_steps_per_second": 0.289, "step": 63 }, { "epoch": 2.056, "grad_norm": 0.20841482575321812, "learning_rate": 2e-05, "loss": 0.7713, "step": 64 }, { "epoch": 2.056, "eval_loss": 0.791679859161377, "eval_runtime": 86.6028, "eval_samples_per_second": 2.309, "eval_steps_per_second": 0.289, "step": 64 }, { "epoch": 2.088, "grad_norm": 0.2184988692033955, "learning_rate": 2e-05, "loss": 0.829, "step": 65 }, { "epoch": 2.088, "eval_loss": 0.7891654372215271, "eval_runtime": 86.7507, "eval_samples_per_second": 2.305, "eval_steps_per_second": 0.288, "step": 65 }, { "epoch": 2.12, "grad_norm": 0.23020842769384967, "learning_rate": 2e-05, "loss": 0.8473, "step": 66 }, { "epoch": 2.12, "eval_loss": 0.7867069840431213, "eval_runtime": 87.5061, "eval_samples_per_second": 2.286, "eval_steps_per_second": 0.286, "step": 66 }, { "epoch": 2.152, "grad_norm": 0.25430631663993714, "learning_rate": 2e-05, "loss": 0.8681, "step": 67 }, { "epoch": 2.152, "eval_loss": 0.7836448550224304, "eval_runtime": 88.0078, "eval_samples_per_second": 2.273, "eval_steps_per_second": 0.284, "step": 67 }, { "epoch": 2.184, "grad_norm": 0.23653466680757473, "learning_rate": 2e-05, "loss": 0.8876, "step": 68 }, { "epoch": 2.184, "eval_loss": 0.7806727886199951, "eval_runtime": 87.345, "eval_samples_per_second": 2.29, "eval_steps_per_second": 0.286, "step": 68 }, { "epoch": 2.216, "grad_norm": 0.2565004166075463, "learning_rate": 2e-05, "loss": 0.8596, "step": 69 }, { "epoch": 2.216, "eval_loss": 0.7781125903129578, "eval_runtime": 87.2054, "eval_samples_per_second": 2.293, "eval_steps_per_second": 0.287, "step": 69 }, { "epoch": 2.248, "grad_norm": 0.22097009361742267, "learning_rate": 2e-05, "loss": 0.8956, "step": 70 }, { "epoch": 2.248, "eval_loss": 0.7758963704109192, "eval_runtime": 87.6181, "eval_samples_per_second": 2.283, "eval_steps_per_second": 0.285, "step": 70 }, { "epoch": 2.2800000000000002, "grad_norm": 0.23458324558709256, "learning_rate": 2e-05, "loss": 0.8812, "step": 71 }, { "epoch": 2.2800000000000002, "eval_loss": 0.773766279220581, "eval_runtime": 86.8377, "eval_samples_per_second": 2.303, "eval_steps_per_second": 0.288, "step": 71 }, { "epoch": 2.312, "grad_norm": 0.2690788840468198, "learning_rate": 2e-05, "loss": 0.8779, "step": 72 }, { "epoch": 2.312, "eval_loss": 0.7716243267059326, "eval_runtime": 86.5931, "eval_samples_per_second": 2.31, "eval_steps_per_second": 0.289, "step": 72 }, { "epoch": 2.344, "grad_norm": 0.22263909993294226, "learning_rate": 2e-05, "loss": 0.7766, "step": 73 }, { "epoch": 2.344, "eval_loss": 0.7695778012275696, "eval_runtime": 86.9844, "eval_samples_per_second": 2.299, "eval_steps_per_second": 0.287, "step": 73 }, { "epoch": 2.376, "grad_norm": 0.26058003387602907, "learning_rate": 2e-05, "loss": 0.8995, "step": 74 }, { "epoch": 2.376, "eval_loss": 0.7680388689041138, "eval_runtime": 86.551, "eval_samples_per_second": 2.311, "eval_steps_per_second": 0.289, "step": 74 }, { "epoch": 2.408, "grad_norm": 0.2262224984051455, "learning_rate": 2e-05, "loss": 0.8323, "step": 75 }, { "epoch": 2.408, "eval_loss": 0.766679584980011, "eval_runtime": 86.5962, "eval_samples_per_second": 2.31, "eval_steps_per_second": 0.289, "step": 75 }, { "epoch": 2.44, "grad_norm": 0.25720804331740627, "learning_rate": 2e-05, "loss": 0.8036, "step": 76 }, { "epoch": 2.44, "eval_loss": 0.7652787566184998, "eval_runtime": 86.2366, "eval_samples_per_second": 2.319, "eval_steps_per_second": 0.29, "step": 76 }, { "epoch": 2.472, "grad_norm": 0.22971293606988397, "learning_rate": 2e-05, "loss": 0.8806, "step": 77 }, { "epoch": 2.472, "eval_loss": 0.7643636465072632, "eval_runtime": 86.3577, "eval_samples_per_second": 2.316, "eval_steps_per_second": 0.289, "step": 77 }, { "epoch": 2.504, "grad_norm": 0.2522080484690418, "learning_rate": 2e-05, "loss": 0.817, "step": 78 }, { "epoch": 2.504, "eval_loss": 0.7629995942115784, "eval_runtime": 86.2805, "eval_samples_per_second": 2.318, "eval_steps_per_second": 0.29, "step": 78 }, { "epoch": 2.536, "grad_norm": 0.25383057566992234, "learning_rate": 2e-05, "loss": 0.7931, "step": 79 }, { "epoch": 2.536, "eval_loss": 0.7622135877609253, "eval_runtime": 86.5133, "eval_samples_per_second": 2.312, "eval_steps_per_second": 0.289, "step": 79 }, { "epoch": 2.568, "grad_norm": 0.27933475264216745, "learning_rate": 2e-05, "loss": 0.8135, "step": 80 }, { "epoch": 2.568, "eval_loss": 0.7606070041656494, "eval_runtime": 86.377, "eval_samples_per_second": 2.315, "eval_steps_per_second": 0.289, "step": 80 }, { "epoch": 2.6, "grad_norm": 0.24704516135802373, "learning_rate": 2e-05, "loss": 0.7688, "step": 81 }, { "epoch": 2.6, "eval_loss": 0.7587440609931946, "eval_runtime": 88.9717, "eval_samples_per_second": 2.248, "eval_steps_per_second": 0.281, "step": 81 }, { "epoch": 2.632, "grad_norm": 0.2595849376774823, "learning_rate": 2e-05, "loss": 0.7207, "step": 82 }, { "epoch": 2.632, "eval_loss": 0.7568916082382202, "eval_runtime": 88.9097, "eval_samples_per_second": 2.249, "eval_steps_per_second": 0.281, "step": 82 }, { "epoch": 2.664, "grad_norm": 0.2586023772952801, "learning_rate": 2e-05, "loss": 0.8642, "step": 83 }, { "epoch": 2.664, "eval_loss": 0.7559364438056946, "eval_runtime": 89.045, "eval_samples_per_second": 2.246, "eval_steps_per_second": 0.281, "step": 83 }, { "epoch": 2.6959999999999997, "grad_norm": 0.2273264534259725, "learning_rate": 2e-05, "loss": 0.8864, "step": 84 }, { "epoch": 2.6959999999999997, "eval_loss": 0.7552520632743835, "eval_runtime": 88.9448, "eval_samples_per_second": 2.249, "eval_steps_per_second": 0.281, "step": 84 }, { "epoch": 2.7279999999999998, "grad_norm": 0.26638251168101784, "learning_rate": 2e-05, "loss": 0.7977, "step": 85 }, { "epoch": 2.7279999999999998, "eval_loss": 0.753672182559967, "eval_runtime": 89.2211, "eval_samples_per_second": 2.242, "eval_steps_per_second": 0.28, "step": 85 }, { "epoch": 2.76, "grad_norm": 0.27672934644885144, "learning_rate": 2e-05, "loss": 0.8003, "step": 86 }, { "epoch": 2.76, "eval_loss": 0.7510656714439392, "eval_runtime": 95.8714, "eval_samples_per_second": 2.086, "eval_steps_per_second": 0.261, "step": 86 }, { "epoch": 2.792, "grad_norm": 0.28159046758182865, "learning_rate": 2e-05, "loss": 0.8216, "step": 87 }, { "epoch": 2.792, "eval_loss": 0.7484390735626221, "eval_runtime": 93.9836, "eval_samples_per_second": 2.128, "eval_steps_per_second": 0.266, "step": 87 }, { "epoch": 2.824, "grad_norm": 0.25495896352825237, "learning_rate": 2e-05, "loss": 0.8514, "step": 88 }, { "epoch": 2.824, "eval_loss": 0.7466137409210205, "eval_runtime": 92.9783, "eval_samples_per_second": 2.151, "eval_steps_per_second": 0.269, "step": 88 }, { "epoch": 2.856, "grad_norm": 0.24959081452423665, "learning_rate": 2e-05, "loss": 0.8291, "step": 89 }, { "epoch": 2.856, "eval_loss": 0.745083749294281, "eval_runtime": 92.9713, "eval_samples_per_second": 2.151, "eval_steps_per_second": 0.269, "step": 89 }, { "epoch": 2.888, "grad_norm": 0.258467204198503, "learning_rate": 2e-05, "loss": 0.8669, "step": 90 }, { "epoch": 2.888, "eval_loss": 0.7432680726051331, "eval_runtime": 93.3306, "eval_samples_per_second": 2.143, "eval_steps_per_second": 0.268, "step": 90 }, { "epoch": 2.928, "grad_norm": 0.24500563921569218, "learning_rate": 2e-05, "loss": 0.8085, "step": 91 }, { "epoch": 2.928, "eval_loss": 0.7415681481361389, "eval_runtime": 91.2727, "eval_samples_per_second": 2.191, "eval_steps_per_second": 0.274, "step": 91 }, { "epoch": 2.96, "grad_norm": 0.27455934907237084, "learning_rate": 2e-05, "loss": 0.8307, "step": 92 }, { "epoch": 2.96, "eval_loss": 0.7402371168136597, "eval_runtime": 85.3987, "eval_samples_per_second": 2.342, "eval_steps_per_second": 0.293, "step": 92 }, { "epoch": 2.992, "grad_norm": 0.2775170496694688, "learning_rate": 2e-05, "loss": 0.6946, "step": 93 }, { "epoch": 2.992, "eval_loss": 0.7383440136909485, "eval_runtime": 85.4187, "eval_samples_per_second": 2.341, "eval_steps_per_second": 0.293, "step": 93 }, { "epoch": 3.024, "grad_norm": 0.2792378343187822, "learning_rate": 2e-05, "loss": 0.788, "step": 94 }, { "epoch": 3.024, "eval_loss": 0.7362905144691467, "eval_runtime": 85.2866, "eval_samples_per_second": 2.345, "eval_steps_per_second": 0.293, "step": 94 }, { "epoch": 3.056, "grad_norm": 0.24527382879929208, "learning_rate": 2e-05, "loss": 0.8077, "step": 95 }, { "epoch": 3.056, "eval_loss": 0.7345578074455261, "eval_runtime": 85.2836, "eval_samples_per_second": 2.345, "eval_steps_per_second": 0.293, "step": 95 }, { "epoch": 3.088, "grad_norm": 0.2639510901590645, "learning_rate": 2e-05, "loss": 0.8159, "step": 96 }, { "epoch": 3.088, "eval_loss": 0.7332432270050049, "eval_runtime": 86.4403, "eval_samples_per_second": 2.314, "eval_steps_per_second": 0.289, "step": 96 }, { "epoch": 3.12, "grad_norm": 0.32275944645869054, "learning_rate": 2e-05, "loss": 0.7283, "step": 97 }, { "epoch": 3.12, "eval_loss": 0.7311471700668335, "eval_runtime": 86.3257, "eval_samples_per_second": 2.317, "eval_steps_per_second": 0.29, "step": 97 }, { "epoch": 3.152, "grad_norm": 0.22657765140514205, "learning_rate": 2e-05, "loss": 0.796, "step": 98 }, { "epoch": 3.152, "eval_loss": 0.7294245958328247, "eval_runtime": 86.4307, "eval_samples_per_second": 2.314, "eval_steps_per_second": 0.289, "step": 98 }, { "epoch": 3.184, "grad_norm": 0.2696947762711156, "learning_rate": 2e-05, "loss": 0.8463, "step": 99 }, { "epoch": 3.184, "eval_loss": 0.7282422780990601, "eval_runtime": 86.5308, "eval_samples_per_second": 2.311, "eval_steps_per_second": 0.289, "step": 99 }, { "epoch": 3.216, "grad_norm": 0.2600510971816684, "learning_rate": 2e-05, "loss": 0.8089, "step": 100 }, { "epoch": 3.216, "eval_loss": 0.7277690768241882, "eval_runtime": 86.3354, "eval_samples_per_second": 2.317, "eval_steps_per_second": 0.29, "step": 100 }, { "epoch": 3.248, "grad_norm": 0.2786398542362818, "learning_rate": 2e-05, "loss": 0.7746, "step": 101 }, { "epoch": 3.248, "eval_loss": 0.7275124192237854, "eval_runtime": 88.6929, "eval_samples_per_second": 2.255, "eval_steps_per_second": 0.282, "step": 101 }, { "epoch": 3.2800000000000002, "grad_norm": 0.2737884177070957, "learning_rate": 2e-05, "loss": 0.8182, "step": 102 }, { "epoch": 3.2800000000000002, "eval_loss": 0.727260947227478, "eval_runtime": 85.8129, "eval_samples_per_second": 2.331, "eval_steps_per_second": 0.291, "step": 102 }, { "epoch": 3.312, "grad_norm": 0.29485392261335913, "learning_rate": 2e-05, "loss": 0.771, "step": 103 }, { "epoch": 3.312, "eval_loss": 0.726463794708252, "eval_runtime": 85.5624, "eval_samples_per_second": 2.337, "eval_steps_per_second": 0.292, "step": 103 }, { "epoch": 3.344, "grad_norm": 0.2950854321605982, "learning_rate": 2e-05, "loss": 0.7412, "step": 104 }, { "epoch": 3.344, "eval_loss": 0.7254646420478821, "eval_runtime": 86.1462, "eval_samples_per_second": 2.322, "eval_steps_per_second": 0.29, "step": 104 }, { "epoch": 3.376, "grad_norm": 0.2868496115468271, "learning_rate": 2e-05, "loss": 0.7902, "step": 105 }, { "epoch": 3.376, "eval_loss": 0.724499523639679, "eval_runtime": 85.8109, "eval_samples_per_second": 2.331, "eval_steps_per_second": 0.291, "step": 105 }, { "epoch": 3.408, "grad_norm": 0.27526808102180006, "learning_rate": 2e-05, "loss": 0.7962, "step": 106 }, { "epoch": 3.408, "eval_loss": 0.723967432975769, "eval_runtime": 85.1137, "eval_samples_per_second": 2.35, "eval_steps_per_second": 0.294, "step": 106 }, { "epoch": 3.44, "grad_norm": 0.28826054599507117, "learning_rate": 2e-05, "loss": 0.7659, "step": 107 }, { "epoch": 3.44, "eval_loss": 0.7228976488113403, "eval_runtime": 85.338, "eval_samples_per_second": 2.344, "eval_steps_per_second": 0.293, "step": 107 }, { "epoch": 3.472, "grad_norm": 0.2739253052624054, "learning_rate": 2e-05, "loss": 0.8122, "step": 108 }, { "epoch": 3.472, "eval_loss": 0.7213765978813171, "eval_runtime": 86.0819, "eval_samples_per_second": 2.323, "eval_steps_per_second": 0.29, "step": 108 }, { "epoch": 3.504, "grad_norm": 0.3244236677701114, "learning_rate": 2e-05, "loss": 0.7926, "step": 109 }, { "epoch": 3.504, "eval_loss": 0.7201890349388123, "eval_runtime": 85.264, "eval_samples_per_second": 2.346, "eval_steps_per_second": 0.293, "step": 109 }, { "epoch": 3.536, "grad_norm": 0.272846304884481, "learning_rate": 2e-05, "loss": 0.7815, "step": 110 }, { "epoch": 3.536, "eval_loss": 0.7191389203071594, "eval_runtime": 85.3814, "eval_samples_per_second": 2.342, "eval_steps_per_second": 0.293, "step": 110 }, { "epoch": 3.568, "grad_norm": 0.32540225984762255, "learning_rate": 2e-05, "loss": 0.7669, "step": 111 }, { "epoch": 3.568, "eval_loss": 0.7177696824073792, "eval_runtime": 86.5851, "eval_samples_per_second": 2.31, "eval_steps_per_second": 0.289, "step": 111 }, { "epoch": 3.6, "grad_norm": 0.3049195701830638, "learning_rate": 2e-05, "loss": 0.7817, "step": 112 }, { "epoch": 3.6, "eval_loss": 0.7163457274436951, "eval_runtime": 86.3221, "eval_samples_per_second": 2.317, "eval_steps_per_second": 0.29, "step": 112 }, { "epoch": 3.632, "grad_norm": 0.2908157712070727, "learning_rate": 2e-05, "loss": 0.7803, "step": 113 }, { "epoch": 3.632, "eval_loss": 0.7153773307800293, "eval_runtime": 86.5278, "eval_samples_per_second": 2.311, "eval_steps_per_second": 0.289, "step": 113 }, { "epoch": 3.664, "grad_norm": 0.3068313248625758, "learning_rate": 2e-05, "loss": 0.8223, "step": 114 }, { "epoch": 3.664, "eval_loss": 0.7154207825660706, "eval_runtime": 87.3327, "eval_samples_per_second": 2.29, "eval_steps_per_second": 0.286, "step": 114 }, { "epoch": 3.6959999999999997, "grad_norm": 0.3055979867515295, "learning_rate": 2e-05, "loss": 0.7682, "step": 115 }, { "epoch": 3.6959999999999997, "eval_loss": 0.7148604393005371, "eval_runtime": 86.2716, "eval_samples_per_second": 2.318, "eval_steps_per_second": 0.29, "step": 115 }, { "epoch": 3.7279999999999998, "grad_norm": 0.30145967440162974, "learning_rate": 2e-05, "loss": 0.7794, "step": 116 }, { "epoch": 3.7279999999999998, "eval_loss": 0.7141232490539551, "eval_runtime": 90.4025, "eval_samples_per_second": 2.212, "eval_steps_per_second": 0.277, "step": 116 }, { "epoch": 3.76, "grad_norm": 0.30263126216965686, "learning_rate": 2e-05, "loss": 0.7924, "step": 117 }, { "epoch": 3.76, "eval_loss": 0.7125248908996582, "eval_runtime": 89.9787, "eval_samples_per_second": 2.223, "eval_steps_per_second": 0.278, "step": 117 }, { "epoch": 3.792, "grad_norm": 0.37055787998484313, "learning_rate": 2e-05, "loss": 0.7527, "step": 118 }, { "epoch": 3.792, "eval_loss": 0.7103064060211182, "eval_runtime": 89.9435, "eval_samples_per_second": 2.224, "eval_steps_per_second": 0.278, "step": 118 }, { "epoch": 3.824, "grad_norm": 0.32370435744629744, "learning_rate": 2e-05, "loss": 0.7225, "step": 119 }, { "epoch": 3.824, "eval_loss": 0.708495020866394, "eval_runtime": 89.7211, "eval_samples_per_second": 2.229, "eval_steps_per_second": 0.279, "step": 119 }, { "epoch": 3.856, "grad_norm": 0.28450870148051394, "learning_rate": 2e-05, "loss": 0.7594, "step": 120 }, { "epoch": 3.856, "eval_loss": 0.7078144550323486, "eval_runtime": 89.755, "eval_samples_per_second": 2.228, "eval_steps_per_second": 0.279, "step": 120 }, { "epoch": 3.888, "grad_norm": 0.3521496955227081, "learning_rate": 2e-05, "loss": 0.8098, "step": 121 }, { "epoch": 3.888, "eval_loss": 0.706774115562439, "eval_runtime": 93.4447, "eval_samples_per_second": 2.14, "eval_steps_per_second": 0.268, "step": 121 }, { "epoch": 3.92, "grad_norm": 0.29964697600639706, "learning_rate": 2e-05, "loss": 0.7945, "step": 122 }, { "epoch": 3.92, "eval_loss": 0.7057322859764099, "eval_runtime": 93.0089, "eval_samples_per_second": 2.15, "eval_steps_per_second": 0.269, "step": 122 }, { "epoch": 3.952, "grad_norm": 0.2998200701516689, "learning_rate": 2e-05, "loss": 0.7986, "step": 123 }, { "epoch": 3.952, "eval_loss": 0.7051501274108887, "eval_runtime": 93.7613, "eval_samples_per_second": 2.133, "eval_steps_per_second": 0.267, "step": 123 }, { "epoch": 3.984, "grad_norm": 0.34265154113873836, "learning_rate": 2e-05, "loss": 0.7626, "step": 124 }, { "epoch": 3.984, "eval_loss": 0.7055770754814148, "eval_runtime": 94.0074, "eval_samples_per_second": 2.127, "eval_steps_per_second": 0.266, "step": 124 }, { "epoch": 4.016, "grad_norm": 0.3227557876231983, "learning_rate": 2e-05, "loss": 0.8266, "step": 125 }, { "epoch": 4.016, "eval_loss": 0.7067859172821045, "eval_runtime": 92.4085, "eval_samples_per_second": 2.164, "eval_steps_per_second": 0.271, "step": 125 }, { "epoch": 4.064, "grad_norm": 0.31358966391371784, "learning_rate": 2e-05, "loss": 0.7162, "step": 126 }, { "epoch": 4.064, "eval_loss": 0.7073588371276855, "eval_runtime": 89.235, "eval_samples_per_second": 2.241, "eval_steps_per_second": 0.28, "step": 126 }, { "epoch": 4.096, "grad_norm": 0.29594296413078097, "learning_rate": 2e-05, "loss": 0.737, "step": 127 }, { "epoch": 4.096, "eval_loss": 0.7072306871414185, "eval_runtime": 85.5672, "eval_samples_per_second": 2.337, "eval_steps_per_second": 0.292, "step": 127 }, { "epoch": 4.128, "grad_norm": 0.31562345712114676, "learning_rate": 2e-05, "loss": 0.7735, "step": 128 }, { "epoch": 4.128, "eval_loss": 0.7067290544509888, "eval_runtime": 85.4464, "eval_samples_per_second": 2.341, "eval_steps_per_second": 0.293, "step": 128 }, { "epoch": 4.16, "grad_norm": 0.36960151197946806, "learning_rate": 2e-05, "loss": 0.7275, "step": 129 }, { "epoch": 4.16, "eval_loss": 0.7046365141868591, "eval_runtime": 85.4173, "eval_samples_per_second": 2.341, "eval_steps_per_second": 0.293, "step": 129 }, { "epoch": 4.192, "grad_norm": 0.28777555135336585, "learning_rate": 2e-05, "loss": 0.7568, "step": 130 }, { "epoch": 4.192, "eval_loss": 0.7030876278877258, "eval_runtime": 85.2072, "eval_samples_per_second": 2.347, "eval_steps_per_second": 0.293, "step": 130 }, { "epoch": 4.224, "grad_norm": 0.3335688387393771, "learning_rate": 2e-05, "loss": 0.7473, "step": 131 }, { "epoch": 4.224, "eval_loss": 0.7016716003417969, "eval_runtime": 85.3898, "eval_samples_per_second": 2.342, "eval_steps_per_second": 0.293, "step": 131 }, { "epoch": 4.256, "grad_norm": 0.36992044176671973, "learning_rate": 2e-05, "loss": 0.7915, "step": 132 }, { "epoch": 4.256, "eval_loss": 0.7006884813308716, "eval_runtime": 85.1939, "eval_samples_per_second": 2.348, "eval_steps_per_second": 0.293, "step": 132 }, { "epoch": 4.288, "grad_norm": 0.3213431246183001, "learning_rate": 2e-05, "loss": 0.7716, "step": 133 }, { "epoch": 4.288, "eval_loss": 0.7004576325416565, "eval_runtime": 85.1892, "eval_samples_per_second": 2.348, "eval_steps_per_second": 0.293, "step": 133 }, { "epoch": 4.32, "grad_norm": 0.31818378029100663, "learning_rate": 2e-05, "loss": 0.7504, "step": 134 }, { "epoch": 4.32, "eval_loss": 0.7006973028182983, "eval_runtime": 85.2711, "eval_samples_per_second": 2.345, "eval_steps_per_second": 0.293, "step": 134 }, { "epoch": 4.352, "grad_norm": 0.34120408014701054, "learning_rate": 2e-05, "loss": 0.8125, "step": 135 }, { "epoch": 4.352, "eval_loss": 0.7006770372390747, "eval_runtime": 85.0797, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.294, "step": 135 }, { "epoch": 4.384, "grad_norm": 0.3354650435400624, "learning_rate": 2e-05, "loss": 0.7623, "step": 136 }, { "epoch": 4.384, "eval_loss": 0.7007671594619751, "eval_runtime": 86.3137, "eval_samples_per_second": 2.317, "eval_steps_per_second": 0.29, "step": 136 }, { "epoch": 4.416, "grad_norm": 0.34273454929170855, "learning_rate": 2e-05, "loss": 0.7539, "step": 137 }, { "epoch": 4.416, "eval_loss": 0.7007145881652832, "eval_runtime": 86.1203, "eval_samples_per_second": 2.322, "eval_steps_per_second": 0.29, "step": 137 }, { "epoch": 4.448, "grad_norm": 0.34329366738767764, "learning_rate": 2e-05, "loss": 0.673, "step": 138 }, { "epoch": 4.448, "eval_loss": 0.7001290321350098, "eval_runtime": 86.4938, "eval_samples_per_second": 2.312, "eval_steps_per_second": 0.289, "step": 138 }, { "epoch": 4.48, "grad_norm": 0.32986462476877876, "learning_rate": 2e-05, "loss": 0.7874, "step": 139 }, { "epoch": 4.48, "eval_loss": 0.6998225450515747, "eval_runtime": 85.0554, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.294, "step": 139 }, { "epoch": 4.5120000000000005, "grad_norm": 0.42029356364309967, "learning_rate": 2e-05, "loss": 0.7391, "step": 140 }, { "epoch": 4.5120000000000005, "eval_loss": 0.6981640458106995, "eval_runtime": 86.43, "eval_samples_per_second": 2.314, "eval_steps_per_second": 0.289, "step": 140 }, { "epoch": 4.5440000000000005, "grad_norm": 0.3410153964676588, "learning_rate": 2e-05, "loss": 0.7375, "step": 141 }, { "epoch": 4.5440000000000005, "eval_loss": 0.6970750689506531, "eval_runtime": 85.1484, "eval_samples_per_second": 2.349, "eval_steps_per_second": 0.294, "step": 141 }, { "epoch": 4.576, "grad_norm": 0.39568033105661293, "learning_rate": 2e-05, "loss": 0.7175, "step": 142 }, { "epoch": 4.576, "eval_loss": 0.6954947113990784, "eval_runtime": 85.256, "eval_samples_per_second": 2.346, "eval_steps_per_second": 0.293, "step": 142 }, { "epoch": 4.608, "grad_norm": 0.35114222943495293, "learning_rate": 2e-05, "loss": 0.6854, "step": 143 }, { "epoch": 4.608, "eval_loss": 0.6938956379890442, "eval_runtime": 85.0562, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.294, "step": 143 }, { "epoch": 4.64, "grad_norm": 0.36129302808062, "learning_rate": 2e-05, "loss": 0.7821, "step": 144 }, { "epoch": 4.64, "eval_loss": 0.6925562024116516, "eval_runtime": 85.1341, "eval_samples_per_second": 2.349, "eval_steps_per_second": 0.294, "step": 144 }, { "epoch": 4.672, "grad_norm": 0.39551012956858894, "learning_rate": 2e-05, "loss": 0.7521, "step": 145 }, { "epoch": 4.672, "eval_loss": 0.6914381384849548, "eval_runtime": 85.152, "eval_samples_per_second": 2.349, "eval_steps_per_second": 0.294, "step": 145 }, { "epoch": 4.704, "grad_norm": 0.42650783733532543, "learning_rate": 2e-05, "loss": 0.7883, "step": 146 }, { "epoch": 4.704, "eval_loss": 0.6911692023277283, "eval_runtime": 86.8539, "eval_samples_per_second": 2.303, "eval_steps_per_second": 0.288, "step": 146 }, { "epoch": 4.736, "grad_norm": 0.3786582921989863, "learning_rate": 2e-05, "loss": 0.7987, "step": 147 }, { "epoch": 4.736, "eval_loss": 0.6914923191070557, "eval_runtime": 87.0003, "eval_samples_per_second": 2.299, "eval_steps_per_second": 0.287, "step": 147 }, { "epoch": 4.768, "grad_norm": 0.3528223035850843, "learning_rate": 2e-05, "loss": 0.8181, "step": 148 }, { "epoch": 4.768, "eval_loss": 0.6930768489837646, "eval_runtime": 86.8453, "eval_samples_per_second": 2.303, "eval_steps_per_second": 0.288, "step": 148 }, { "epoch": 4.8, "grad_norm": 0.4052106693792139, "learning_rate": 2e-05, "loss": 0.7317, "step": 149 }, { "epoch": 4.8, "eval_loss": 0.6946350336074829, "eval_runtime": 86.808, "eval_samples_per_second": 2.304, "eval_steps_per_second": 0.288, "step": 149 }, { "epoch": 4.832, "grad_norm": 0.3739014269672761, "learning_rate": 2e-05, "loss": 0.7851, "step": 150 }, { "epoch": 4.832, "eval_loss": 0.6952430605888367, "eval_runtime": 86.8255, "eval_samples_per_second": 2.303, "eval_steps_per_second": 0.288, "step": 150 }, { "epoch": 4.864, "grad_norm": 0.42120491782720065, "learning_rate": 2e-05, "loss": 0.6829, "step": 151 }, { "epoch": 4.864, "eval_loss": 0.6938563585281372, "eval_runtime": 89.8279, "eval_samples_per_second": 2.226, "eval_steps_per_second": 0.278, "step": 151 }, { "epoch": 4.896, "grad_norm": 0.40313446713945206, "learning_rate": 2e-05, "loss": 0.6972, "step": 152 }, { "epoch": 4.896, "eval_loss": 0.6912936568260193, "eval_runtime": 89.3777, "eval_samples_per_second": 2.238, "eval_steps_per_second": 0.28, "step": 152 }, { "epoch": 4.928, "grad_norm": 0.36052668588306425, "learning_rate": 2e-05, "loss": 0.7294, "step": 153 }, { "epoch": 4.928, "eval_loss": 0.6893093585968018, "eval_runtime": 89.369, "eval_samples_per_second": 2.238, "eval_steps_per_second": 0.28, "step": 153 }, { "epoch": 4.96, "grad_norm": 0.35889751392140123, "learning_rate": 2e-05, "loss": 0.7471, "step": 154 }, { "epoch": 4.96, "eval_loss": 0.6887902021408081, "eval_runtime": 89.3518, "eval_samples_per_second": 2.238, "eval_steps_per_second": 0.28, "step": 154 }, { "epoch": 4.992, "grad_norm": 0.40694329818018776, "learning_rate": 2e-05, "loss": 0.6145, "step": 155 }, { "epoch": 4.992, "eval_loss": 0.6877387762069702, "eval_runtime": 89.5999, "eval_samples_per_second": 2.232, "eval_steps_per_second": 0.279, "step": 155 }, { "epoch": 5.024, "grad_norm": 0.4071045470916848, "learning_rate": 2e-05, "loss": 0.7106, "step": 156 }, { "epoch": 5.024, "eval_loss": 0.6863316297531128, "eval_runtime": 91.1516, "eval_samples_per_second": 2.194, "eval_steps_per_second": 0.274, "step": 156 }, { "epoch": 5.056, "grad_norm": 0.3825562066811806, "learning_rate": 2e-05, "loss": 0.6845, "step": 157 }, { "epoch": 5.056, "eval_loss": 0.6852035522460938, "eval_runtime": 90.7966, "eval_samples_per_second": 2.203, "eval_steps_per_second": 0.275, "step": 157 }, { "epoch": 5.088, "grad_norm": 0.37147714986904934, "learning_rate": 2e-05, "loss": 0.6739, "step": 158 }, { "epoch": 5.088, "eval_loss": 0.6840065121650696, "eval_runtime": 91.6382, "eval_samples_per_second": 2.182, "eval_steps_per_second": 0.273, "step": 158 }, { "epoch": 5.12, "grad_norm": 0.38259903152485825, "learning_rate": 2e-05, "loss": 0.6777, "step": 159 }, { "epoch": 5.12, "eval_loss": 0.6829774379730225, "eval_runtime": 90.8166, "eval_samples_per_second": 2.202, "eval_steps_per_second": 0.275, "step": 159 }, { "epoch": 5.152, "grad_norm": 0.39777547401791735, "learning_rate": 2e-05, "loss": 0.7145, "step": 160 }, { "epoch": 5.152, "eval_loss": 0.682302713394165, "eval_runtime": 91.0923, "eval_samples_per_second": 2.196, "eval_steps_per_second": 0.274, "step": 160 }, { "epoch": 5.192, "grad_norm": 0.3709714989318106, "learning_rate": 2e-05, "loss": 0.7251, "step": 161 }, { "epoch": 5.192, "eval_loss": 0.6822090148925781, "eval_runtime": 90.1282, "eval_samples_per_second": 2.219, "eval_steps_per_second": 0.277, "step": 161 }, { "epoch": 5.224, "grad_norm": 0.4046346018620919, "learning_rate": 2e-05, "loss": 0.7108, "step": 162 }, { "epoch": 5.224, "eval_loss": 0.6821247935295105, "eval_runtime": 85.5268, "eval_samples_per_second": 2.338, "eval_steps_per_second": 0.292, "step": 162 }, { "epoch": 5.256, "grad_norm": 0.42060496638232386, "learning_rate": 2e-05, "loss": 0.7541, "step": 163 }, { "epoch": 5.256, "eval_loss": 0.6818928718566895, "eval_runtime": 86.5491, "eval_samples_per_second": 2.311, "eval_steps_per_second": 0.289, "step": 163 }, { "epoch": 5.288, "grad_norm": 0.3949151390399246, "learning_rate": 2e-05, "loss": 0.731, "step": 164 }, { "epoch": 5.288, "eval_loss": 0.6819549798965454, "eval_runtime": 85.4036, "eval_samples_per_second": 2.342, "eval_steps_per_second": 0.293, "step": 164 }, { "epoch": 5.32, "grad_norm": 0.3610134094474086, "learning_rate": 2e-05, "loss": 0.711, "step": 165 }, { "epoch": 5.32, "eval_loss": 0.6825198531150818, "eval_runtime": 85.3022, "eval_samples_per_second": 2.345, "eval_steps_per_second": 0.293, "step": 165 }, { "epoch": 5.352, "grad_norm": 0.38905537916660615, "learning_rate": 2e-05, "loss": 0.7693, "step": 166 }, { "epoch": 5.352, "eval_loss": 0.682900071144104, "eval_runtime": 87.5028, "eval_samples_per_second": 2.286, "eval_steps_per_second": 0.286, "step": 166 }, { "epoch": 5.384, "grad_norm": 0.4020289142954435, "learning_rate": 2e-05, "loss": 0.7257, "step": 167 }, { "epoch": 5.384, "eval_loss": 0.6832457184791565, "eval_runtime": 86.3175, "eval_samples_per_second": 2.317, "eval_steps_per_second": 0.29, "step": 167 }, { "epoch": 5.416, "grad_norm": 0.4136061042465234, "learning_rate": 2e-05, "loss": 0.7082, "step": 168 }, { "epoch": 5.416, "eval_loss": 0.6837514638900757, "eval_runtime": 87.4244, "eval_samples_per_second": 2.288, "eval_steps_per_second": 0.286, "step": 168 }, { "epoch": 5.448, "grad_norm": 0.40006410263925274, "learning_rate": 2e-05, "loss": 0.6352, "step": 169 }, { "epoch": 5.448, "eval_loss": 0.6845301985740662, "eval_runtime": 86.3451, "eval_samples_per_second": 2.316, "eval_steps_per_second": 0.29, "step": 169 }, { "epoch": 5.48, "grad_norm": 0.40988438997044196, "learning_rate": 2e-05, "loss": 0.7485, "step": 170 }, { "epoch": 5.48, "eval_loss": 0.6851826310157776, "eval_runtime": 86.4269, "eval_samples_per_second": 2.314, "eval_steps_per_second": 0.289, "step": 170 }, { "epoch": 5.5120000000000005, "grad_norm": 0.47923323092924647, "learning_rate": 2e-05, "loss": 0.6926, "step": 171 }, { "epoch": 5.5120000000000005, "eval_loss": 0.6841108798980713, "eval_runtime": 86.3221, "eval_samples_per_second": 2.317, "eval_steps_per_second": 0.29, "step": 171 }, { "epoch": 5.5440000000000005, "grad_norm": 0.4031545746474779, "learning_rate": 2e-05, "loss": 0.6961, "step": 172 }, { "epoch": 5.5440000000000005, "eval_loss": 0.6829754710197449, "eval_runtime": 85.2735, "eval_samples_per_second": 2.345, "eval_steps_per_second": 0.293, "step": 172 }, { "epoch": 5.576, "grad_norm": 0.46444040317934493, "learning_rate": 2e-05, "loss": 0.6757, "step": 173 }, { "epoch": 5.576, "eval_loss": 0.6810196042060852, "eval_runtime": 85.23, "eval_samples_per_second": 2.347, "eval_steps_per_second": 0.293, "step": 173 }, { "epoch": 5.608, "grad_norm": 0.40032547211306824, "learning_rate": 2e-05, "loss": 0.6465, "step": 174 }, { "epoch": 5.608, "eval_loss": 0.6795651316642761, "eval_runtime": 85.2935, "eval_samples_per_second": 2.345, "eval_steps_per_second": 0.293, "step": 174 }, { "epoch": 5.64, "grad_norm": 0.3975749684060634, "learning_rate": 2e-05, "loss": 0.7434, "step": 175 }, { "epoch": 5.64, "eval_loss": 0.6787837147712708, "eval_runtime": 86.4234, "eval_samples_per_second": 2.314, "eval_steps_per_second": 0.289, "step": 175 }, { "epoch": 5.672, "grad_norm": 0.4413863489846678, "learning_rate": 2e-05, "loss": 0.7148, "step": 176 }, { "epoch": 5.672, "eval_loss": 0.678077220916748, "eval_runtime": 86.46, "eval_samples_per_second": 2.313, "eval_steps_per_second": 0.289, "step": 176 }, { "epoch": 5.704, "grad_norm": 0.4552334205325458, "learning_rate": 2e-05, "loss": 0.7467, "step": 177 }, { "epoch": 5.704, "eval_loss": 0.6782705783843994, "eval_runtime": 85.7334, "eval_samples_per_second": 2.333, "eval_steps_per_second": 0.292, "step": 177 }, { "epoch": 5.736, "grad_norm": 0.4222034129737574, "learning_rate": 2e-05, "loss": 0.7573, "step": 178 }, { "epoch": 5.736, "eval_loss": 0.6788575053215027, "eval_runtime": 85.407, "eval_samples_per_second": 2.342, "eval_steps_per_second": 0.293, "step": 178 }, { "epoch": 5.768, "grad_norm": 0.4212365440913614, "learning_rate": 2e-05, "loss": 0.7853, "step": 179 }, { "epoch": 5.768, "eval_loss": 0.680314302444458, "eval_runtime": 85.1528, "eval_samples_per_second": 2.349, "eval_steps_per_second": 0.294, "step": 179 }, { "epoch": 5.8, "grad_norm": 0.47040418573969534, "learning_rate": 2e-05, "loss": 0.6941, "step": 180 }, { "epoch": 5.8, "eval_loss": 0.6814693212509155, "eval_runtime": 85.2668, "eval_samples_per_second": 2.346, "eval_steps_per_second": 0.293, "step": 180 }, { "epoch": 5.832, "grad_norm": 0.43506164207204023, "learning_rate": 2e-05, "loss": 0.7466, "step": 181 }, { "epoch": 5.832, "eval_loss": 0.6818942427635193, "eval_runtime": 86.1082, "eval_samples_per_second": 2.323, "eval_steps_per_second": 0.29, "step": 181 }, { "epoch": 5.864, "grad_norm": 0.4851524205448296, "learning_rate": 2e-05, "loss": 0.6414, "step": 182 }, { "epoch": 5.864, "eval_loss": 0.6807515621185303, "eval_runtime": 85.2905, "eval_samples_per_second": 2.345, "eval_steps_per_second": 0.293, "step": 182 }, { "epoch": 5.896, "grad_norm": 0.46212982880574544, "learning_rate": 2e-05, "loss": 0.6594, "step": 183 }, { "epoch": 5.896, "eval_loss": 0.6793842911720276, "eval_runtime": 85.2531, "eval_samples_per_second": 2.346, "eval_steps_per_second": 0.293, "step": 183 }, { "epoch": 5.928, "grad_norm": 0.43483234178092045, "learning_rate": 2e-05, "loss": 0.6927, "step": 184 }, { "epoch": 5.928, "eval_loss": 0.6785325407981873, "eval_runtime": 86.4294, "eval_samples_per_second": 2.314, "eval_steps_per_second": 0.289, "step": 184 }, { "epoch": 5.96, "grad_norm": 0.45461536176049777, "learning_rate": 2e-05, "loss": 0.7127, "step": 185 }, { "epoch": 5.96, "eval_loss": 0.6785117983818054, "eval_runtime": 86.5612, "eval_samples_per_second": 2.311, "eval_steps_per_second": 0.289, "step": 185 }, { "epoch": 5.992, "grad_norm": 0.5124892629103449, "learning_rate": 2e-05, "loss": 0.5778, "step": 186 }, { "epoch": 5.992, "eval_loss": 0.6772163510322571, "eval_runtime": 85.3177, "eval_samples_per_second": 2.344, "eval_steps_per_second": 0.293, "step": 186 }, { "epoch": 6.024, "grad_norm": 0.4872469973004331, "learning_rate": 2e-05, "loss": 0.7045, "step": 187 }, { "epoch": 6.024, "eval_loss": 0.6760932207107544, "eval_runtime": 85.1785, "eval_samples_per_second": 2.348, "eval_steps_per_second": 0.294, "step": 187 }, { "epoch": 6.056, "grad_norm": 0.43317759363804015, "learning_rate": 2e-05, "loss": 0.6121, "step": 188 }, { "epoch": 6.056, "eval_loss": 0.6763756275177002, "eval_runtime": 85.4466, "eval_samples_per_second": 2.341, "eval_steps_per_second": 0.293, "step": 188 }, { "epoch": 6.088, "grad_norm": 0.47411518505747885, "learning_rate": 2e-05, "loss": 0.7409, "step": 189 }, { "epoch": 6.088, "eval_loss": 0.6757389903068542, "eval_runtime": 85.3382, "eval_samples_per_second": 2.344, "eval_steps_per_second": 0.293, "step": 189 }, { "epoch": 6.12, "grad_norm": 0.4971851748274855, "learning_rate": 2e-05, "loss": 0.7193, "step": 190 }, { "epoch": 6.12, "eval_loss": 0.6749419569969177, "eval_runtime": 85.4198, "eval_samples_per_second": 2.341, "eval_steps_per_second": 0.293, "step": 190 }, { "epoch": 6.152, "grad_norm": 0.46832302038313683, "learning_rate": 2e-05, "loss": 0.7413, "step": 191 }, { "epoch": 6.152, "eval_loss": 0.674567699432373, "eval_runtime": 85.6429, "eval_samples_per_second": 2.335, "eval_steps_per_second": 0.292, "step": 191 }, { "epoch": 6.184, "grad_norm": 0.47651234347196103, "learning_rate": 2e-05, "loss": 0.7113, "step": 192 }, { "epoch": 6.184, "eval_loss": 0.6739189028739929, "eval_runtime": 85.3814, "eval_samples_per_second": 2.342, "eval_steps_per_second": 0.293, "step": 192 }, { "epoch": 6.216, "grad_norm": 0.4808945978374079, "learning_rate": 2e-05, "loss": 0.6603, "step": 193 }, { "epoch": 6.216, "eval_loss": 0.6737512350082397, "eval_runtime": 85.1781, "eval_samples_per_second": 2.348, "eval_steps_per_second": 0.294, "step": 193 }, { "epoch": 6.248, "grad_norm": 0.45747741700806654, "learning_rate": 2e-05, "loss": 0.6905, "step": 194 }, { "epoch": 6.248, "eval_loss": 0.6738162040710449, "eval_runtime": 86.5609, "eval_samples_per_second": 2.311, "eval_steps_per_second": 0.289, "step": 194 }, { "epoch": 6.28, "grad_norm": 0.49033746737240025, "learning_rate": 2e-05, "loss": 0.7373, "step": 195 }, { "epoch": 6.28, "eval_loss": 0.6742382645606995, "eval_runtime": 86.4284, "eval_samples_per_second": 2.314, "eval_steps_per_second": 0.289, "step": 195 }, { "epoch": 6.312, "grad_norm": 0.5438084044824532, "learning_rate": 2e-05, "loss": 0.6819, "step": 196 }, { "epoch": 6.312, "eval_loss": 0.674878716468811, "eval_runtime": 85.7504, "eval_samples_per_second": 2.332, "eval_steps_per_second": 0.292, "step": 196 }, { "epoch": 6.344, "grad_norm": 0.4631516087052852, "learning_rate": 2e-05, "loss": 0.6775, "step": 197 }, { "epoch": 6.344, "eval_loss": 0.6761616468429565, "eval_runtime": 85.417, "eval_samples_per_second": 2.341, "eval_steps_per_second": 0.293, "step": 197 }, { "epoch": 6.376, "grad_norm": 0.49177247669398155, "learning_rate": 2e-05, "loss": 0.6605, "step": 198 }, { "epoch": 6.376, "eval_loss": 0.6770765781402588, "eval_runtime": 85.4274, "eval_samples_per_second": 2.341, "eval_steps_per_second": 0.293, "step": 198 }, { "epoch": 6.408, "grad_norm": 0.5177407926775024, "learning_rate": 2e-05, "loss": 0.7136, "step": 199 }, { "epoch": 6.408, "eval_loss": 0.6772163510322571, "eval_runtime": 85.2682, "eval_samples_per_second": 2.346, "eval_steps_per_second": 0.293, "step": 199 }, { "epoch": 6.44, "grad_norm": 0.5385213429977403, "learning_rate": 2e-05, "loss": 0.6809, "step": 200 }, { "epoch": 6.44, "eval_loss": 0.6758923530578613, "eval_runtime": 85.4858, "eval_samples_per_second": 2.34, "eval_steps_per_second": 0.292, "step": 200 }, { "epoch": 6.4719999999999995, "grad_norm": 0.4982626204598202, "learning_rate": 2e-05, "loss": 0.7159, "step": 201 }, { "epoch": 6.4719999999999995, "eval_loss": 0.675208568572998, "eval_runtime": 86.2523, "eval_samples_per_second": 2.319, "eval_steps_per_second": 0.29, "step": 201 }, { "epoch": 6.504, "grad_norm": 0.4710756307884673, "learning_rate": 2e-05, "loss": 0.6309, "step": 202 }, { "epoch": 6.504, "eval_loss": 0.6743338108062744, "eval_runtime": 86.6636, "eval_samples_per_second": 2.308, "eval_steps_per_second": 0.288, "step": 202 }, { "epoch": 6.536, "grad_norm": 0.5127505608717865, "learning_rate": 2e-05, "loss": 0.6257, "step": 203 }, { "epoch": 6.536, "eval_loss": 0.6735503673553467, "eval_runtime": 86.3216, "eval_samples_per_second": 2.317, "eval_steps_per_second": 0.29, "step": 203 }, { "epoch": 6.568, "grad_norm": 0.48812419654399086, "learning_rate": 2e-05, "loss": 0.6164, "step": 204 }, { "epoch": 6.568, "eval_loss": 0.6740123629570007, "eval_runtime": 86.3491, "eval_samples_per_second": 2.316, "eval_steps_per_second": 0.29, "step": 204 }, { "epoch": 6.6, "grad_norm": 0.5031408070515696, "learning_rate": 2e-05, "loss": 0.6765, "step": 205 }, { "epoch": 6.6, "eval_loss": 0.6746988892555237, "eval_runtime": 86.2475, "eval_samples_per_second": 2.319, "eval_steps_per_second": 0.29, "step": 205 }, { "epoch": 6.632, "grad_norm": 0.5221751920115928, "learning_rate": 2e-05, "loss": 0.6836, "step": 206 }, { "epoch": 6.632, "eval_loss": 0.6757599711418152, "eval_runtime": 90.9953, "eval_samples_per_second": 2.198, "eval_steps_per_second": 0.275, "step": 206 }, { "epoch": 6.664, "grad_norm": 0.49611473039131815, "learning_rate": 2e-05, "loss": 0.6809, "step": 207 }, { "epoch": 6.664, "eval_loss": 0.676249086856842, "eval_runtime": 88.6484, "eval_samples_per_second": 2.256, "eval_steps_per_second": 0.282, "step": 207 }, { "epoch": 6.696, "grad_norm": 0.5646771313169766, "learning_rate": 2e-05, "loss": 0.6905, "step": 208 }, { "epoch": 6.696, "eval_loss": 0.675098717212677, "eval_runtime": 88.7358, "eval_samples_per_second": 2.254, "eval_steps_per_second": 0.282, "step": 208 }, { "epoch": 6.728, "grad_norm": 0.5075133396143146, "learning_rate": 2e-05, "loss": 0.7001, "step": 209 }, { "epoch": 6.728, "eval_loss": 0.6734683513641357, "eval_runtime": 88.7108, "eval_samples_per_second": 2.255, "eval_steps_per_second": 0.282, "step": 209 }, { "epoch": 6.76, "grad_norm": 0.5292004993716772, "learning_rate": 2e-05, "loss": 0.6366, "step": 210 }, { "epoch": 6.76, "eval_loss": 0.6727490425109863, "eval_runtime": 88.7397, "eval_samples_per_second": 2.254, "eval_steps_per_second": 0.282, "step": 210 }, { "epoch": 6.792, "grad_norm": 0.5508154729937994, "learning_rate": 2e-05, "loss": 0.6627, "step": 211 }, { "epoch": 6.792, "eval_loss": 0.6719673275947571, "eval_runtime": 94.3959, "eval_samples_per_second": 2.119, "eval_steps_per_second": 0.265, "step": 211 }, { "epoch": 6.824, "grad_norm": 0.5436944297369074, "learning_rate": 2e-05, "loss": 0.6939, "step": 212 }, { "epoch": 6.824, "eval_loss": 0.6717627048492432, "eval_runtime": 92.6409, "eval_samples_per_second": 2.159, "eval_steps_per_second": 0.27, "step": 212 }, { "epoch": 6.856, "grad_norm": 0.563836681781508, "learning_rate": 2e-05, "loss": 0.6715, "step": 213 }, { "epoch": 6.856, "eval_loss": 0.6704577803611755, "eval_runtime": 92.7628, "eval_samples_per_second": 2.156, "eval_steps_per_second": 0.27, "step": 213 }, { "epoch": 6.888, "grad_norm": 0.5903391746928088, "learning_rate": 2e-05, "loss": 0.6706, "step": 214 }, { "epoch": 6.888, "eval_loss": 0.6705368161201477, "eval_runtime": 93.0342, "eval_samples_per_second": 2.15, "eval_steps_per_second": 0.269, "step": 214 }, { "epoch": 6.92, "grad_norm": 0.5044604071023134, "learning_rate": 2e-05, "loss": 0.6308, "step": 215 }, { "epoch": 6.92, "eval_loss": 0.6709109544754028, "eval_runtime": 92.9865, "eval_samples_per_second": 2.151, "eval_steps_per_second": 0.269, "step": 215 }, { "epoch": 6.96, "grad_norm": 0.5029981251789745, "learning_rate": 2e-05, "loss": 0.6565, "step": 216 }, { "epoch": 6.96, "eval_loss": 0.6729848384857178, "eval_runtime": 91.2007, "eval_samples_per_second": 2.193, "eval_steps_per_second": 0.274, "step": 216 }, { "epoch": 6.992, "grad_norm": 0.662623585564011, "learning_rate": 2e-05, "loss": 0.5311, "step": 217 }, { "epoch": 6.992, "eval_loss": 0.6713245511054993, "eval_runtime": 85.2866, "eval_samples_per_second": 2.345, "eval_steps_per_second": 0.293, "step": 217 }, { "epoch": 7.024, "grad_norm": 0.6256446136768937, "learning_rate": 2e-05, "loss": 0.6022, "step": 218 }, { "epoch": 7.024, "eval_loss": 0.6695873737335205, "eval_runtime": 86.4021, "eval_samples_per_second": 2.315, "eval_steps_per_second": 0.289, "step": 218 }, { "epoch": 7.056, "grad_norm": 0.4857229274218417, "learning_rate": 2e-05, "loss": 0.647, "step": 219 }, { "epoch": 7.056, "eval_loss": 0.6719114780426025, "eval_runtime": 86.526, "eval_samples_per_second": 2.311, "eval_steps_per_second": 0.289, "step": 219 }, { "epoch": 7.088, "grad_norm": 0.5361101044951209, "learning_rate": 2e-05, "loss": 0.6477, "step": 220 }, { "epoch": 7.088, "eval_loss": 0.6757076978683472, "eval_runtime": 85.4011, "eval_samples_per_second": 2.342, "eval_steps_per_second": 0.293, "step": 220 }, { "epoch": 7.12, "grad_norm": 0.8396027685018896, "learning_rate": 2e-05, "loss": 0.6112, "step": 221 }, { "epoch": 7.12, "eval_loss": 0.6758625507354736, "eval_runtime": 85.2083, "eval_samples_per_second": 2.347, "eval_steps_per_second": 0.293, "step": 221 }, { "epoch": 7.152, "grad_norm": 0.5858149591099446, "learning_rate": 2e-05, "loss": 0.6826, "step": 222 }, { "epoch": 7.152, "eval_loss": 0.6765357255935669, "eval_runtime": 85.5627, "eval_samples_per_second": 2.337, "eval_steps_per_second": 0.292, "step": 222 }, { "epoch": 7.184, "grad_norm": 0.5694999654835196, "learning_rate": 2e-05, "loss": 0.5851, "step": 223 }, { "epoch": 7.184, "eval_loss": 0.6776654124259949, "eval_runtime": 85.417, "eval_samples_per_second": 2.341, "eval_steps_per_second": 0.293, "step": 223 }, { "epoch": 7.216, "grad_norm": 0.6001772847123094, "learning_rate": 2e-05, "loss": 0.6633, "step": 224 }, { "epoch": 7.216, "eval_loss": 0.6783779859542847, "eval_runtime": 85.1623, "eval_samples_per_second": 2.348, "eval_steps_per_second": 0.294, "step": 224 }, { "epoch": 7.248, "grad_norm": 0.6068993188167514, "learning_rate": 2e-05, "loss": 0.6275, "step": 225 }, { "epoch": 7.248, "eval_loss": 0.6755834817886353, "eval_runtime": 85.3711, "eval_samples_per_second": 2.343, "eval_steps_per_second": 0.293, "step": 225 }, { "epoch": 7.28, "grad_norm": 0.6038060153225616, "learning_rate": 2e-05, "loss": 0.6319, "step": 226 }, { "epoch": 7.28, "eval_loss": 0.6720392107963562, "eval_runtime": 86.1134, "eval_samples_per_second": 2.323, "eval_steps_per_second": 0.29, "step": 226 }, { "epoch": 7.312, "grad_norm": 0.5900082642978601, "learning_rate": 2e-05, "loss": 0.6417, "step": 227 }, { "epoch": 7.312, "eval_loss": 0.6699540615081787, "eval_runtime": 87.4261, "eval_samples_per_second": 2.288, "eval_steps_per_second": 0.286, "step": 227 }, { "epoch": 7.344, "grad_norm": 0.6303979703934064, "learning_rate": 2e-05, "loss": 0.5954, "step": 228 }, { "epoch": 7.344, "eval_loss": 0.6697332859039307, "eval_runtime": 86.3137, "eval_samples_per_second": 2.317, "eval_steps_per_second": 0.29, "step": 228 }, { "epoch": 7.376, "grad_norm": 0.5896270303292949, "learning_rate": 2e-05, "loss": 0.638, "step": 229 }, { "epoch": 7.376, "eval_loss": 0.6699292063713074, "eval_runtime": 85.4946, "eval_samples_per_second": 2.339, "eval_steps_per_second": 0.292, "step": 229 }, { "epoch": 7.408, "grad_norm": 0.5499007579825991, "learning_rate": 2e-05, "loss": 0.6312, "step": 230 }, { "epoch": 7.408, "eval_loss": 0.6695923805236816, "eval_runtime": 85.1955, "eval_samples_per_second": 2.348, "eval_steps_per_second": 0.293, "step": 230 }, { "epoch": 7.44, "grad_norm": 0.5806994466204508, "learning_rate": 2e-05, "loss": 0.6487, "step": 231 }, { "epoch": 7.44, "eval_loss": 0.670379638671875, "eval_runtime": 85.2487, "eval_samples_per_second": 2.346, "eval_steps_per_second": 0.293, "step": 231 }, { "epoch": 7.4719999999999995, "grad_norm": 0.6171318222234403, "learning_rate": 2e-05, "loss": 0.6497, "step": 232 }, { "epoch": 7.4719999999999995, "eval_loss": 0.671440601348877, "eval_runtime": 85.4044, "eval_samples_per_second": 2.342, "eval_steps_per_second": 0.293, "step": 232 }, { "epoch": 7.504, "grad_norm": 0.6077497033087486, "learning_rate": 2e-05, "loss": 0.6388, "step": 233 }, { "epoch": 7.504, "eval_loss": 0.6715556383132935, "eval_runtime": 85.7739, "eval_samples_per_second": 2.332, "eval_steps_per_second": 0.291, "step": 233 }, { "epoch": 7.536, "grad_norm": 0.6333159810332618, "learning_rate": 2e-05, "loss": 0.636, "step": 234 }, { "epoch": 7.536, "eval_loss": 0.6708941459655762, "eval_runtime": 85.2691, "eval_samples_per_second": 2.346, "eval_steps_per_second": 0.293, "step": 234 }, { "epoch": 7.568, "grad_norm": 0.6022274503734126, "learning_rate": 2e-05, "loss": 0.6455, "step": 235 }, { "epoch": 7.568, "eval_loss": 0.6690527200698853, "eval_runtime": 85.4144, "eval_samples_per_second": 2.342, "eval_steps_per_second": 0.293, "step": 235 }, { "epoch": 7.608, "grad_norm": 0.5296025255848918, "learning_rate": 2e-05, "loss": 0.572, "step": 236 }, { "epoch": 7.608, "eval_loss": 0.6683849096298218, "eval_runtime": 89.92, "eval_samples_per_second": 2.224, "eval_steps_per_second": 0.278, "step": 236 }, { "epoch": 7.64, "grad_norm": 0.5436886467794938, "learning_rate": 2e-05, "loss": 0.6681, "step": 237 }, { "epoch": 7.64, "eval_loss": 0.67000412940979, "eval_runtime": 85.2548, "eval_samples_per_second": 2.346, "eval_steps_per_second": 0.293, "step": 237 }, { "epoch": 7.672, "grad_norm": 0.5833714563537171, "learning_rate": 2e-05, "loss": 0.646, "step": 238 }, { "epoch": 7.672, "eval_loss": 0.6720954179763794, "eval_runtime": 85.4363, "eval_samples_per_second": 2.341, "eval_steps_per_second": 0.293, "step": 238 }, { "epoch": 7.704, "grad_norm": 0.6833890117857615, "learning_rate": 2e-05, "loss": 0.6641, "step": 239 }, { "epoch": 7.704, "eval_loss": 0.6737973093986511, "eval_runtime": 85.2541, "eval_samples_per_second": 2.346, "eval_steps_per_second": 0.293, "step": 239 }, { "epoch": 7.736, "grad_norm": 0.5832421680011252, "learning_rate": 2e-05, "loss": 0.6742, "step": 240 }, { "epoch": 7.736, "eval_loss": 0.6757528185844421, "eval_runtime": 85.2234, "eval_samples_per_second": 2.347, "eval_steps_per_second": 0.293, "step": 240 }, { "epoch": 7.768, "grad_norm": 0.5843876495624203, "learning_rate": 2e-05, "loss": 0.7069, "step": 241 }, { "epoch": 7.768, "eval_loss": 0.6778927445411682, "eval_runtime": 85.2049, "eval_samples_per_second": 2.347, "eval_steps_per_second": 0.293, "step": 241 }, { "epoch": 7.8, "grad_norm": 0.6527712899983633, "learning_rate": 2e-05, "loss": 0.6182, "step": 242 }, { "epoch": 7.8, "eval_loss": 0.6785970330238342, "eval_runtime": 84.9435, "eval_samples_per_second": 2.355, "eval_steps_per_second": 0.294, "step": 242 }, { "epoch": 7.832, "grad_norm": 0.6228341483848424, "learning_rate": 2e-05, "loss": 0.6633, "step": 243 }, { "epoch": 7.832, "eval_loss": 0.678627610206604, "eval_runtime": 85.2349, "eval_samples_per_second": 2.346, "eval_steps_per_second": 0.293, "step": 243 }, { "epoch": 7.864, "grad_norm": 0.6762374705072328, "learning_rate": 2e-05, "loss": 0.5581, "step": 244 }, { "epoch": 7.864, "eval_loss": 0.6781509518623352, "eval_runtime": 85.0862, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.294, "step": 244 }, { "epoch": 7.896, "grad_norm": 0.6530004154367571, "learning_rate": 2e-05, "loss": 0.5896, "step": 245 }, { "epoch": 7.896, "eval_loss": 0.6776159405708313, "eval_runtime": 85.4386, "eval_samples_per_second": 2.341, "eval_steps_per_second": 0.293, "step": 245 }, { "epoch": 7.928, "grad_norm": 0.6496264347077455, "learning_rate": 2e-05, "loss": 0.6262, "step": 246 }, { "epoch": 7.928, "eval_loss": 0.6762803792953491, "eval_runtime": 86.0069, "eval_samples_per_second": 2.325, "eval_steps_per_second": 0.291, "step": 246 }, { "epoch": 7.96, "grad_norm": 0.6530394584817848, "learning_rate": 2e-05, "loss": 0.6044, "step": 247 }, { "epoch": 7.96, "eval_loss": 0.6763593554496765, "eval_runtime": 86.5382, "eval_samples_per_second": 2.311, "eval_steps_per_second": 0.289, "step": 247 }, { "epoch": 7.992, "grad_norm": 0.7165450399528321, "learning_rate": 2e-05, "loss": 0.4777, "step": 248 }, { "epoch": 7.992, "eval_loss": 0.6767419576644897, "eval_runtime": 85.3445, "eval_samples_per_second": 2.343, "eval_steps_per_second": 0.293, "step": 248 }, { "epoch": 8.024, "grad_norm": 0.6210079733679161, "learning_rate": 2e-05, "loss": 0.6113, "step": 249 }, { "epoch": 8.024, "eval_loss": 0.6772445440292358, "eval_runtime": 85.4211, "eval_samples_per_second": 2.341, "eval_steps_per_second": 0.293, "step": 249 }, { "epoch": 8.056, "grad_norm": 0.6078116340925231, "learning_rate": 2e-05, "loss": 0.6133, "step": 250 }, { "epoch": 8.056, "eval_loss": 0.6801083087921143, "eval_runtime": 85.1688, "eval_samples_per_second": 2.348, "eval_steps_per_second": 0.294, "step": 250 }, { "epoch": 8.088, "grad_norm": 0.6584954900058523, "learning_rate": 2e-05, "loss": 0.6234, "step": 251 }, { "epoch": 8.088, "eval_loss": 0.680172324180603, "eval_runtime": 85.2854, "eval_samples_per_second": 2.345, "eval_steps_per_second": 0.293, "step": 251 }, { "epoch": 8.12, "grad_norm": 0.6410123072973634, "learning_rate": 2e-05, "loss": 0.6768, "step": 252 }, { "epoch": 8.12, "eval_loss": 0.6790580749511719, "eval_runtime": 85.343, "eval_samples_per_second": 2.343, "eval_steps_per_second": 0.293, "step": 252 }, { "epoch": 8.152, "grad_norm": 0.7068905769473427, "learning_rate": 2e-05, "loss": 0.6308, "step": 253 }, { "epoch": 8.152, "eval_loss": 0.6779585480690002, "eval_runtime": 85.2579, "eval_samples_per_second": 2.346, "eval_steps_per_second": 0.293, "step": 253 }, { "epoch": 8.184, "grad_norm": 0.6941639704688177, "learning_rate": 2e-05, "loss": 0.6651, "step": 254 }, { "epoch": 8.184, "eval_loss": 0.6783471703529358, "eval_runtime": 85.2368, "eval_samples_per_second": 2.346, "eval_steps_per_second": 0.293, "step": 254 }, { "epoch": 8.216, "grad_norm": 0.7347551538754563, "learning_rate": 2e-05, "loss": 0.643, "step": 255 }, { "epoch": 8.216, "eval_loss": 0.6772164702415466, "eval_runtime": 86.2786, "eval_samples_per_second": 2.318, "eval_steps_per_second": 0.29, "step": 255 }, { "epoch": 8.248, "grad_norm": 0.7690902453226406, "learning_rate": 2e-05, "loss": 0.6178, "step": 256 }, { "epoch": 8.248, "eval_loss": 0.67960524559021, "eval_runtime": 86.3356, "eval_samples_per_second": 2.317, "eval_steps_per_second": 0.29, "step": 256 }, { "epoch": 8.28, "grad_norm": 0.6534589041693806, "learning_rate": 2e-05, "loss": 0.6231, "step": 257 }, { "epoch": 8.28, "eval_loss": 0.683942437171936, "eval_runtime": 86.114, "eval_samples_per_second": 2.323, "eval_steps_per_second": 0.29, "step": 257 }, { "epoch": 8.312, "grad_norm": 0.7620000857656035, "learning_rate": 2e-05, "loss": 0.5937, "step": 258 }, { "epoch": 8.312, "eval_loss": 0.6850832104682922, "eval_runtime": 86.3659, "eval_samples_per_second": 2.316, "eval_steps_per_second": 0.289, "step": 258 }, { "epoch": 8.344, "grad_norm": 0.8769311371151648, "learning_rate": 2e-05, "loss": 0.6288, "step": 259 }, { "epoch": 8.344, "eval_loss": 0.6806495189666748, "eval_runtime": 86.2011, "eval_samples_per_second": 2.32, "eval_steps_per_second": 0.29, "step": 259 }, { "epoch": 8.376, "grad_norm": 0.7549996230143433, "learning_rate": 2e-05, "loss": 0.5614, "step": 260 }, { "epoch": 8.376, "eval_loss": 0.6746546030044556, "eval_runtime": 86.0828, "eval_samples_per_second": 2.323, "eval_steps_per_second": 0.29, "step": 260 }, { "epoch": 8.408, "grad_norm": 0.6678277921019138, "learning_rate": 2e-05, "loss": 0.5818, "step": 261 }, { "epoch": 8.408, "eval_loss": 0.6705954074859619, "eval_runtime": 86.9228, "eval_samples_per_second": 2.301, "eval_steps_per_second": 0.288, "step": 261 }, { "epoch": 8.44, "grad_norm": 0.6629861523432089, "learning_rate": 2e-05, "loss": 0.6231, "step": 262 }, { "epoch": 8.44, "eval_loss": 0.6688622832298279, "eval_runtime": 86.644, "eval_samples_per_second": 2.308, "eval_steps_per_second": 0.289, "step": 262 }, { "epoch": 8.472, "grad_norm": 0.7468331552698385, "learning_rate": 2e-05, "loss": 0.6221, "step": 263 }, { "epoch": 8.472, "eval_loss": 0.6675601005554199, "eval_runtime": 86.5385, "eval_samples_per_second": 2.311, "eval_steps_per_second": 0.289, "step": 263 }, { "epoch": 8.504, "grad_norm": 0.6718735805762622, "learning_rate": 2e-05, "loss": 0.5989, "step": 264 }, { "epoch": 8.504, "eval_loss": 0.6682644486427307, "eval_runtime": 86.5137, "eval_samples_per_second": 2.312, "eval_steps_per_second": 0.289, "step": 264 }, { "epoch": 8.536, "grad_norm": 0.7360990456326049, "learning_rate": 2e-05, "loss": 0.586, "step": 265 }, { "epoch": 8.536, "eval_loss": 0.670520544052124, "eval_runtime": 86.3921, "eval_samples_per_second": 2.315, "eval_steps_per_second": 0.289, "step": 265 }, { "epoch": 8.568, "grad_norm": 0.7372365755868506, "learning_rate": 2e-05, "loss": 0.6154, "step": 266 }, { "epoch": 8.568, "eval_loss": 0.6722339391708374, "eval_runtime": 86.4486, "eval_samples_per_second": 2.314, "eval_steps_per_second": 0.289, "step": 266 }, { "epoch": 8.6, "grad_norm": 0.7691674703908615, "learning_rate": 2e-05, "loss": 0.5759, "step": 267 }, { "epoch": 8.6, "eval_loss": 0.6752627491950989, "eval_runtime": 86.3478, "eval_samples_per_second": 2.316, "eval_steps_per_second": 0.29, "step": 267 }, { "epoch": 8.632, "grad_norm": 0.7037334988016319, "learning_rate": 2e-05, "loss": 0.5808, "step": 268 }, { "epoch": 8.632, "eval_loss": 0.6786094903945923, "eval_runtime": 86.3221, "eval_samples_per_second": 2.317, "eval_steps_per_second": 0.29, "step": 268 }, { "epoch": 8.664, "grad_norm": 0.7364875762471698, "learning_rate": 2e-05, "loss": 0.6381, "step": 269 }, { "epoch": 8.664, "eval_loss": 0.6802875399589539, "eval_runtime": 87.3858, "eval_samples_per_second": 2.289, "eval_steps_per_second": 0.286, "step": 269 }, { "epoch": 8.696, "grad_norm": 0.772443884505786, "learning_rate": 2e-05, "loss": 0.5779, "step": 270 }, { "epoch": 8.696, "eval_loss": 0.6788821220397949, "eval_runtime": 86.2832, "eval_samples_per_second": 2.318, "eval_steps_per_second": 0.29, "step": 270 }, { "epoch": 8.728, "grad_norm": 0.8243245423024692, "learning_rate": 2e-05, "loss": 0.5899, "step": 271 }, { "epoch": 8.728, "eval_loss": 0.6770071983337402, "eval_runtime": 89.4282, "eval_samples_per_second": 2.236, "eval_steps_per_second": 0.28, "step": 271 }, { "epoch": 8.76, "grad_norm": 0.7241934433417714, "learning_rate": 2e-05, "loss": 0.6323, "step": 272 }, { "epoch": 8.76, "eval_loss": 0.6750556826591492, "eval_runtime": 88.2376, "eval_samples_per_second": 2.267, "eval_steps_per_second": 0.283, "step": 272 }, { "epoch": 8.792, "grad_norm": 0.7031539699096522, "learning_rate": 2e-05, "loss": 0.5878, "step": 273 }, { "epoch": 8.792, "eval_loss": 0.6727312207221985, "eval_runtime": 88.2462, "eval_samples_per_second": 2.266, "eval_steps_per_second": 0.283, "step": 273 }, { "epoch": 8.824, "grad_norm": 0.7218931138049051, "learning_rate": 2e-05, "loss": 0.613, "step": 274 }, { "epoch": 8.824, "eval_loss": 0.6727555990219116, "eval_runtime": 88.3872, "eval_samples_per_second": 2.263, "eval_steps_per_second": 0.283, "step": 274 }, { "epoch": 8.856, "grad_norm": 0.7231490645694756, "learning_rate": 2e-05, "loss": 0.6315, "step": 275 }, { "epoch": 8.856, "eval_loss": 0.6705790758132935, "eval_runtime": 87.9218, "eval_samples_per_second": 2.275, "eval_steps_per_second": 0.284, "step": 275 }, { "epoch": 8.888, "grad_norm": 0.7051718905755886, "learning_rate": 2e-05, "loss": 0.6076, "step": 276 }, { "epoch": 8.888, "eval_loss": 0.6689162254333496, "eval_runtime": 91.8597, "eval_samples_per_second": 2.177, "eval_steps_per_second": 0.272, "step": 276 }, { "epoch": 8.92, "grad_norm": 0.7328110944982523, "learning_rate": 2e-05, "loss": 0.624, "step": 277 }, { "epoch": 8.92, "eval_loss": 0.6683139204978943, "eval_runtime": 92.6652, "eval_samples_per_second": 2.158, "eval_steps_per_second": 0.27, "step": 277 }, { "epoch": 8.952, "grad_norm": 0.7116677024113118, "learning_rate": 2e-05, "loss": 0.6078, "step": 278 }, { "epoch": 8.952, "eval_loss": 0.6700756549835205, "eval_runtime": 91.0374, "eval_samples_per_second": 2.197, "eval_steps_per_second": 0.275, "step": 278 }, { "epoch": 8.984, "grad_norm": 0.7461165978892803, "learning_rate": 2e-05, "loss": 0.6104, "step": 279 }, { "epoch": 8.984, "eval_loss": 0.670260488986969, "eval_runtime": 90.6087, "eval_samples_per_second": 2.207, "eval_steps_per_second": 0.276, "step": 279 }, { "epoch": 9.016, "grad_norm": 0.73533421631475, "learning_rate": 2e-05, "loss": 0.5993, "step": 280 }, { "epoch": 9.016, "eval_loss": 0.6718578934669495, "eval_runtime": 90.678, "eval_samples_per_second": 2.206, "eval_steps_per_second": 0.276, "step": 280 }, { "epoch": 9.064, "grad_norm": 0.7141681250783954, "learning_rate": 2e-05, "loss": 0.5763, "step": 281 }, { "epoch": 9.064, "eval_loss": 0.6721769571304321, "eval_runtime": 89.905, "eval_samples_per_second": 2.225, "eval_steps_per_second": 0.278, "step": 281 }, { "epoch": 9.096, "grad_norm": 0.7179191597134931, "learning_rate": 2e-05, "loss": 0.6039, "step": 282 }, { "epoch": 9.096, "eval_loss": 0.670803964138031, "eval_runtime": 85.2808, "eval_samples_per_second": 2.345, "eval_steps_per_second": 0.293, "step": 282 }, { "epoch": 9.128, "grad_norm": 0.6677686603528123, "learning_rate": 2e-05, "loss": 0.6018, "step": 283 }, { "epoch": 9.128, "eval_loss": 0.67020583152771, "eval_runtime": 85.394, "eval_samples_per_second": 2.342, "eval_steps_per_second": 0.293, "step": 283 }, { "epoch": 9.16, "grad_norm": 0.7730086322440611, "learning_rate": 2e-05, "loss": 0.5776, "step": 284 }, { "epoch": 9.16, "eval_loss": 0.6687878370285034, "eval_runtime": 85.3242, "eval_samples_per_second": 2.344, "eval_steps_per_second": 0.293, "step": 284 }, { "epoch": 9.192, "grad_norm": 0.6910229929978119, "learning_rate": 2e-05, "loss": 0.5947, "step": 285 }, { "epoch": 9.192, "eval_loss": 0.6686851978302002, "eval_runtime": 85.4375, "eval_samples_per_second": 2.341, "eval_steps_per_second": 0.293, "step": 285 }, { "epoch": 9.224, "grad_norm": 0.7524377227435594, "learning_rate": 2e-05, "loss": 0.5867, "step": 286 }, { "epoch": 9.224, "eval_loss": 0.6695001125335693, "eval_runtime": 87.2154, "eval_samples_per_second": 2.293, "eval_steps_per_second": 0.287, "step": 286 }, { "epoch": 9.256, "grad_norm": 0.8215711837083275, "learning_rate": 2e-05, "loss": 0.638, "step": 287 }, { "epoch": 9.256, "eval_loss": 0.6707583069801331, "eval_runtime": 86.3799, "eval_samples_per_second": 2.315, "eval_steps_per_second": 0.289, "step": 287 }, { "epoch": 9.288, "grad_norm": 0.7573213066866563, "learning_rate": 2e-05, "loss": 0.589, "step": 288 }, { "epoch": 9.288, "eval_loss": 0.6735061407089233, "eval_runtime": 86.3754, "eval_samples_per_second": 2.315, "eval_steps_per_second": 0.289, "step": 288 }, { "epoch": 9.32, "grad_norm": 0.7175469904142907, "learning_rate": 2e-05, "loss": 0.5934, "step": 289 }, { "epoch": 9.32, "eval_loss": 0.6759178042411804, "eval_runtime": 86.2696, "eval_samples_per_second": 2.318, "eval_steps_per_second": 0.29, "step": 289 }, { "epoch": 9.352, "grad_norm": 0.796603811157327, "learning_rate": 2e-05, "loss": 0.6388, "step": 290 }, { "epoch": 9.352, "eval_loss": 0.6765501499176025, "eval_runtime": 86.3106, "eval_samples_per_second": 2.317, "eval_steps_per_second": 0.29, "step": 290 }, { "epoch": 9.384, "grad_norm": 0.7812129965602005, "learning_rate": 2e-05, "loss": 0.598, "step": 291 }, { "epoch": 9.384, "eval_loss": 0.6769981384277344, "eval_runtime": 86.5115, "eval_samples_per_second": 2.312, "eval_steps_per_second": 0.289, "step": 291 }, { "epoch": 9.416, "grad_norm": 0.7948370661003635, "learning_rate": 2e-05, "loss": 0.5814, "step": 292 }, { "epoch": 9.416, "eval_loss": 0.6782019138336182, "eval_runtime": 86.4112, "eval_samples_per_second": 2.315, "eval_steps_per_second": 0.289, "step": 292 }, { "epoch": 9.448, "grad_norm": 0.7439813718822291, "learning_rate": 2e-05, "loss": 0.5284, "step": 293 }, { "epoch": 9.448, "eval_loss": 0.6803891658782959, "eval_runtime": 85.4807, "eval_samples_per_second": 2.34, "eval_steps_per_second": 0.292, "step": 293 }, { "epoch": 9.48, "grad_norm": 0.7978905583053586, "learning_rate": 2e-05, "loss": 0.6048, "step": 294 }, { "epoch": 9.48, "eval_loss": 0.6811457872390747, "eval_runtime": 85.4336, "eval_samples_per_second": 2.341, "eval_steps_per_second": 0.293, "step": 294 }, { "epoch": 9.512, "grad_norm": 0.8920765475413412, "learning_rate": 2e-05, "loss": 0.5546, "step": 295 }, { "epoch": 9.512, "eval_loss": 0.6799167394638062, "eval_runtime": 85.6101, "eval_samples_per_second": 2.336, "eval_steps_per_second": 0.292, "step": 295 }, { "epoch": 9.544, "grad_norm": 0.7849055407117252, "learning_rate": 2e-05, "loss": 0.5682, "step": 296 }, { "epoch": 9.544, "eval_loss": 0.6794228553771973, "eval_runtime": 86.558, "eval_samples_per_second": 2.311, "eval_steps_per_second": 0.289, "step": 296 }, { "epoch": 9.576, "grad_norm": 0.9110369044092993, "learning_rate": 2e-05, "loss": 0.5545, "step": 297 }, { "epoch": 9.576, "eval_loss": 0.6763817071914673, "eval_runtime": 85.1029, "eval_samples_per_second": 2.35, "eval_steps_per_second": 0.294, "step": 297 }, { "epoch": 9.608, "grad_norm": 0.7326860282119771, "learning_rate": 2e-05, "loss": 0.4959, "step": 298 }, { "epoch": 9.608, "eval_loss": 0.676374077796936, "eval_runtime": 85.2996, "eval_samples_per_second": 2.345, "eval_steps_per_second": 0.293, "step": 298 }, { "epoch": 9.64, "grad_norm": 0.7800110720802296, "learning_rate": 2e-05, "loss": 0.6006, "step": 299 }, { "epoch": 9.64, "eval_loss": 0.6781018972396851, "eval_runtime": 85.2375, "eval_samples_per_second": 2.346, "eval_steps_per_second": 0.293, "step": 299 }, { "epoch": 9.672, "grad_norm": 0.7426497776512717, "learning_rate": 2e-05, "loss": 0.5772, "step": 300 }, { "epoch": 9.672, "eval_loss": 0.6806349158287048, "eval_runtime": 85.3156, "eval_samples_per_second": 2.344, "eval_steps_per_second": 0.293, "step": 300 }, { "epoch": 9.704, "grad_norm": 0.845889275835268, "learning_rate": 2e-05, "loss": 0.5764, "step": 301 }, { "epoch": 9.704, "eval_loss": 0.6840881109237671, "eval_runtime": 86.2074, "eval_samples_per_second": 2.32, "eval_steps_per_second": 0.29, "step": 301 }, { "epoch": 9.736, "grad_norm": 0.8001524154192788, "learning_rate": 2e-05, "loss": 0.5959, "step": 302 }, { "epoch": 9.736, "eval_loss": 0.6890332698822021, "eval_runtime": 86.4179, "eval_samples_per_second": 2.314, "eval_steps_per_second": 0.289, "step": 302 }, { "epoch": 9.768, "grad_norm": 0.8334657722772659, "learning_rate": 2e-05, "loss": 0.6298, "step": 303 }, { "epoch": 9.768, "eval_loss": 0.695709228515625, "eval_runtime": 86.5152, "eval_samples_per_second": 2.312, "eval_steps_per_second": 0.289, "step": 303 }, { "epoch": 9.8, "grad_norm": 0.9548484042314381, "learning_rate": 2e-05, "loss": 0.5411, "step": 304 }, { "epoch": 9.8, "eval_loss": 0.6976072788238525, "eval_runtime": 87.1045, "eval_samples_per_second": 2.296, "eval_steps_per_second": 0.287, "step": 304 }, { "epoch": 9.832, "grad_norm": 0.8157550824960103, "learning_rate": 2e-05, "loss": 0.5835, "step": 305 }, { "epoch": 9.832, "eval_loss": 0.6981484889984131, "eval_runtime": 87.3499, "eval_samples_per_second": 2.29, "eval_steps_per_second": 0.286, "step": 305 }, { "epoch": 9.864, "grad_norm": 0.8683793066073664, "learning_rate": 2e-05, "loss": 0.485, "step": 306 }, { "epoch": 9.864, "eval_loss": 0.6984279155731201, "eval_runtime": 88.0444, "eval_samples_per_second": 2.272, "eval_steps_per_second": 0.284, "step": 306 }, { "epoch": 9.896, "grad_norm": 0.9219487370888539, "learning_rate": 2e-05, "loss": 0.5156, "step": 307 }, { "epoch": 9.896, "eval_loss": 0.6950345039367676, "eval_runtime": 87.0093, "eval_samples_per_second": 2.299, "eval_steps_per_second": 0.287, "step": 307 }, { "epoch": 9.928, "grad_norm": 0.878236710580061, "learning_rate": 2e-05, "loss": 0.5507, "step": 308 }, { "epoch": 9.928, "eval_loss": 0.6907711625099182, "eval_runtime": 87.456, "eval_samples_per_second": 2.287, "eval_steps_per_second": 0.286, "step": 308 }, { "epoch": 9.96, "grad_norm": 0.8621508416806061, "learning_rate": 2e-05, "loss": 0.5283, "step": 309 }, { "epoch": 9.96, "eval_loss": 0.6901798844337463, "eval_runtime": 87.2634, "eval_samples_per_second": 2.292, "eval_steps_per_second": 0.286, "step": 309 }, { "epoch": 9.992, "grad_norm": 0.8991749266444514, "learning_rate": 2e-05, "loss": 0.4063, "step": 310 }, { "epoch": 9.992, "eval_loss": 0.6929752826690674, "eval_runtime": 87.3357, "eval_samples_per_second": 2.29, "eval_steps_per_second": 0.286, "step": 310 }, { "epoch": 9.992, "step": 310, "total_flos": 666604514574336.0, "train_loss": 0.05508536119614878, "train_runtime": 3860.8628, "train_samples_per_second": 2.59, "train_steps_per_second": 0.08 } ], "logging_steps": 1.0, "max_steps": 310, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 666604514574336.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }