{ "best_metric": 0.9047689437866211, "best_model_checkpoint": "data/Llama-31-8B_task-1_180-samples_config-4_full/checkpoint-391", "epoch": 52.94117647058823, "eval_steps": 500, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11764705882352941, "grad_norm": 1.835170865058899, "learning_rate": 8.333333333333334e-08, "loss": 2.4537, "step": 1 }, { "epoch": 0.23529411764705882, "grad_norm": 1.8771226406097412, "learning_rate": 1.6666666666666668e-07, "loss": 2.3789, "step": 2 }, { "epoch": 0.47058823529411764, "grad_norm": 2.0154590606689453, "learning_rate": 3.3333333333333335e-07, "loss": 2.4445, "step": 4 }, { "epoch": 0.7058823529411765, "grad_norm": 2.0653038024902344, "learning_rate": 5.000000000000001e-07, "loss": 2.4809, "step": 6 }, { "epoch": 0.9411764705882353, "grad_norm": 1.925260305404663, "learning_rate": 6.666666666666667e-07, "loss": 2.4745, "step": 8 }, { "epoch": 0.9411764705882353, "eval_loss": 2.43351149559021, "eval_runtime": 14.4922, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 8 }, { "epoch": 1.1764705882352942, "grad_norm": 1.789984107017517, "learning_rate": 8.333333333333333e-07, "loss": 2.4155, "step": 10 }, { "epoch": 1.4117647058823528, "grad_norm": 1.7350174188613892, "learning_rate": 1.0000000000000002e-06, "loss": 2.4448, "step": 12 }, { "epoch": 1.6470588235294117, "grad_norm": 1.7722196578979492, "learning_rate": 1.1666666666666668e-06, "loss": 2.4888, "step": 14 }, { "epoch": 1.8823529411764706, "grad_norm": 1.6953096389770508, "learning_rate": 1.3333333333333334e-06, "loss": 2.4286, "step": 16 }, { "epoch": 2.0, "eval_loss": 2.411425828933716, "eval_runtime": 14.4913, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 17 }, { "epoch": 2.1176470588235294, "grad_norm": 1.5511802434921265, "learning_rate": 1.5e-06, "loss": 2.4245, "step": 18 }, { "epoch": 2.3529411764705883, "grad_norm": 1.547842025756836, "learning_rate": 1.6666666666666667e-06, "loss": 2.4254, "step": 20 }, { "epoch": 2.588235294117647, "grad_norm": 1.5457754135131836, "learning_rate": 1.8333333333333333e-06, "loss": 2.4147, "step": 22 }, { "epoch": 2.8235294117647056, "grad_norm": 1.5168875455856323, "learning_rate": 2.0000000000000003e-06, "loss": 2.419, "step": 24 }, { "epoch": 2.9411764705882355, "eval_loss": 2.381402015686035, "eval_runtime": 14.4968, "eval_samples_per_second": 2.483, "eval_steps_per_second": 2.483, "step": 25 }, { "epoch": 3.0588235294117645, "grad_norm": 1.573783278465271, "learning_rate": 2.166666666666667e-06, "loss": 2.375, "step": 26 }, { "epoch": 3.2941176470588234, "grad_norm": 1.4476714134216309, "learning_rate": 2.3333333333333336e-06, "loss": 2.3949, "step": 28 }, { "epoch": 3.5294117647058822, "grad_norm": 1.569410800933838, "learning_rate": 2.5e-06, "loss": 2.3609, "step": 30 }, { "epoch": 3.764705882352941, "grad_norm": 1.604489803314209, "learning_rate": 2.666666666666667e-06, "loss": 2.3778, "step": 32 }, { "epoch": 4.0, "grad_norm": 1.7640857696533203, "learning_rate": 2.8333333333333335e-06, "loss": 2.3475, "step": 34 }, { "epoch": 4.0, "eval_loss": 2.326162099838257, "eval_runtime": 14.494, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 34 }, { "epoch": 4.235294117647059, "grad_norm": 1.7499988079071045, "learning_rate": 3e-06, "loss": 2.3113, "step": 36 }, { "epoch": 4.470588235294118, "grad_norm": 1.8631538152694702, "learning_rate": 3.1666666666666667e-06, "loss": 2.3479, "step": 38 }, { "epoch": 4.705882352941177, "grad_norm": 1.75290846824646, "learning_rate": 3.3333333333333333e-06, "loss": 2.2702, "step": 40 }, { "epoch": 4.9411764705882355, "grad_norm": 1.6694003343582153, "learning_rate": 3.5e-06, "loss": 2.3147, "step": 42 }, { "epoch": 4.9411764705882355, "eval_loss": 2.254105567932129, "eval_runtime": 14.4974, "eval_samples_per_second": 2.483, "eval_steps_per_second": 2.483, "step": 42 }, { "epoch": 5.176470588235294, "grad_norm": 1.4519027471542358, "learning_rate": 3.6666666666666666e-06, "loss": 2.2606, "step": 44 }, { "epoch": 5.411764705882353, "grad_norm": 1.507379174232483, "learning_rate": 3.833333333333334e-06, "loss": 2.1865, "step": 46 }, { "epoch": 5.647058823529412, "grad_norm": 1.6610608100891113, "learning_rate": 4.000000000000001e-06, "loss": 2.2445, "step": 48 }, { "epoch": 5.882352941176471, "grad_norm": 1.8665122985839844, "learning_rate": 4.166666666666667e-06, "loss": 2.2214, "step": 50 }, { "epoch": 6.0, "eval_loss": 2.1715917587280273, "eval_runtime": 14.4909, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 51 }, { "epoch": 6.117647058823529, "grad_norm": 2.0021536350250244, "learning_rate": 4.333333333333334e-06, "loss": 2.1689, "step": 52 }, { "epoch": 6.352941176470588, "grad_norm": 1.3459073305130005, "learning_rate": 4.5e-06, "loss": 2.1659, "step": 54 }, { "epoch": 6.588235294117647, "grad_norm": 1.1446928977966309, "learning_rate": 4.666666666666667e-06, "loss": 2.1603, "step": 56 }, { "epoch": 6.823529411764706, "grad_norm": 1.0907998085021973, "learning_rate": 4.833333333333333e-06, "loss": 2.1097, "step": 58 }, { "epoch": 6.9411764705882355, "eval_loss": 2.074471950531006, "eval_runtime": 14.4917, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 59 }, { "epoch": 7.0588235294117645, "grad_norm": 1.0119956731796265, "learning_rate": 5e-06, "loss": 2.1078, "step": 60 }, { "epoch": 7.294117647058823, "grad_norm": 1.069096326828003, "learning_rate": 5.1666666666666675e-06, "loss": 2.0802, "step": 62 }, { "epoch": 7.529411764705882, "grad_norm": 1.0303503274917603, "learning_rate": 5.333333333333334e-06, "loss": 2.0106, "step": 64 }, { "epoch": 7.764705882352941, "grad_norm": 1.0196495056152344, "learning_rate": 5.500000000000001e-06, "loss": 2.0143, "step": 66 }, { "epoch": 8.0, "grad_norm": 0.940999448299408, "learning_rate": 5.666666666666667e-06, "loss": 1.9617, "step": 68 }, { "epoch": 8.0, "eval_loss": 1.9479138851165771, "eval_runtime": 14.5012, "eval_samples_per_second": 2.483, "eval_steps_per_second": 2.483, "step": 68 }, { "epoch": 8.235294117647058, "grad_norm": 0.9313414692878723, "learning_rate": 5.833333333333334e-06, "loss": 1.9513, "step": 70 }, { "epoch": 8.470588235294118, "grad_norm": 0.8379154801368713, "learning_rate": 6e-06, "loss": 1.9401, "step": 72 }, { "epoch": 8.705882352941176, "grad_norm": 0.8604364395141602, "learning_rate": 6.166666666666667e-06, "loss": 1.8323, "step": 74 }, { "epoch": 8.941176470588236, "grad_norm": 0.895465612411499, "learning_rate": 6.333333333333333e-06, "loss": 1.908, "step": 76 }, { "epoch": 8.941176470588236, "eval_loss": 1.8374704122543335, "eval_runtime": 14.5083, "eval_samples_per_second": 2.481, "eval_steps_per_second": 2.481, "step": 76 }, { "epoch": 9.176470588235293, "grad_norm": 0.7938605546951294, "learning_rate": 6.5000000000000004e-06, "loss": 1.811, "step": 78 }, { "epoch": 9.411764705882353, "grad_norm": 0.8125358819961548, "learning_rate": 6.666666666666667e-06, "loss": 1.7889, "step": 80 }, { "epoch": 9.647058823529411, "grad_norm": 0.8654666543006897, "learning_rate": 6.833333333333334e-06, "loss": 1.756, "step": 82 }, { "epoch": 9.882352941176471, "grad_norm": 0.9033392071723938, "learning_rate": 7e-06, "loss": 1.7669, "step": 84 }, { "epoch": 10.0, "eval_loss": 1.6952563524246216, "eval_runtime": 14.4915, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 85 }, { "epoch": 10.117647058823529, "grad_norm": 0.8959240317344666, "learning_rate": 7.166666666666667e-06, "loss": 1.6923, "step": 86 }, { "epoch": 10.352941176470589, "grad_norm": 0.9546213150024414, "learning_rate": 7.333333333333333e-06, "loss": 1.6253, "step": 88 }, { "epoch": 10.588235294117647, "grad_norm": 1.0652508735656738, "learning_rate": 7.500000000000001e-06, "loss": 1.6249, "step": 90 }, { "epoch": 10.823529411764707, "grad_norm": 1.0107028484344482, "learning_rate": 7.666666666666667e-06, "loss": 1.6325, "step": 92 }, { "epoch": 10.941176470588236, "eval_loss": 1.5460655689239502, "eval_runtime": 14.5077, "eval_samples_per_second": 2.481, "eval_steps_per_second": 2.481, "step": 93 }, { "epoch": 11.058823529411764, "grad_norm": 0.8419122099876404, "learning_rate": 7.833333333333333e-06, "loss": 1.5955, "step": 94 }, { "epoch": 11.294117647058824, "grad_norm": 0.8873438835144043, "learning_rate": 8.000000000000001e-06, "loss": 1.5361, "step": 96 }, { "epoch": 11.529411764705882, "grad_norm": 0.8289064168930054, "learning_rate": 8.166666666666668e-06, "loss": 1.4957, "step": 98 }, { "epoch": 11.764705882352942, "grad_norm": 0.890578031539917, "learning_rate": 8.333333333333334e-06, "loss": 1.4318, "step": 100 }, { "epoch": 12.0, "grad_norm": 0.8572049140930176, "learning_rate": 8.5e-06, "loss": 1.3201, "step": 102 }, { "epoch": 12.0, "eval_loss": 1.3738850355148315, "eval_runtime": 14.5098, "eval_samples_per_second": 2.481, "eval_steps_per_second": 2.481, "step": 102 }, { "epoch": 12.235294117647058, "grad_norm": 0.9629900455474854, "learning_rate": 8.666666666666668e-06, "loss": 1.3875, "step": 104 }, { "epoch": 12.470588235294118, "grad_norm": 0.845135509967804, "learning_rate": 8.833333333333334e-06, "loss": 1.2982, "step": 106 }, { "epoch": 12.705882352941176, "grad_norm": 0.8272332549095154, "learning_rate": 9e-06, "loss": 1.2623, "step": 108 }, { "epoch": 12.941176470588236, "grad_norm": 0.777883768081665, "learning_rate": 9.166666666666666e-06, "loss": 1.2477, "step": 110 }, { "epoch": 12.941176470588236, "eval_loss": 1.233130931854248, "eval_runtime": 14.5015, "eval_samples_per_second": 2.482, "eval_steps_per_second": 2.482, "step": 110 }, { "epoch": 13.176470588235293, "grad_norm": 0.6998254656791687, "learning_rate": 9.333333333333334e-06, "loss": 1.209, "step": 112 }, { "epoch": 13.411764705882353, "grad_norm": 0.6446437835693359, "learning_rate": 9.5e-06, "loss": 1.165, "step": 114 }, { "epoch": 13.647058823529411, "grad_norm": 0.6399714946746826, "learning_rate": 9.666666666666667e-06, "loss": 1.1563, "step": 116 }, { "epoch": 13.882352941176471, "grad_norm": 0.6025314331054688, "learning_rate": 9.833333333333333e-06, "loss": 1.163, "step": 118 }, { "epoch": 14.0, "eval_loss": 1.1329914331436157, "eval_runtime": 14.4965, "eval_samples_per_second": 2.483, "eval_steps_per_second": 2.483, "step": 119 }, { "epoch": 14.117647058823529, "grad_norm": 0.5692251920700073, "learning_rate": 1e-05, "loss": 1.1526, "step": 120 }, { "epoch": 14.352941176470589, "grad_norm": 0.5997884273529053, "learning_rate": 9.999915384288723e-06, "loss": 1.1087, "step": 122 }, { "epoch": 14.588235294117647, "grad_norm": 0.575445294380188, "learning_rate": 9.999661540018812e-06, "loss": 1.1228, "step": 124 }, { "epoch": 14.823529411764707, "grad_norm": 0.5134881734848022, "learning_rate": 9.999238475781957e-06, "loss": 1.0579, "step": 126 }, { "epoch": 14.941176470588236, "eval_loss": 1.086132526397705, "eval_runtime": 14.5111, "eval_samples_per_second": 2.481, "eval_steps_per_second": 2.481, "step": 127 }, { "epoch": 15.058823529411764, "grad_norm": 0.49025943875312805, "learning_rate": 9.99864620589731e-06, "loss": 1.0658, "step": 128 }, { "epoch": 15.294117647058824, "grad_norm": 0.41751477122306824, "learning_rate": 9.997884750411004e-06, "loss": 1.0543, "step": 130 }, { "epoch": 15.529411764705882, "grad_norm": 0.4086260199546814, "learning_rate": 9.99695413509548e-06, "loss": 1.0696, "step": 132 }, { "epoch": 15.764705882352942, "grad_norm": 0.38625603914260864, "learning_rate": 9.995854391448607e-06, "loss": 1.0581, "step": 134 }, { "epoch": 16.0, "grad_norm": 0.3801828622817993, "learning_rate": 9.994585556692624e-06, "loss": 1.0655, "step": 136 }, { "epoch": 16.0, "eval_loss": 1.0610604286193848, "eval_runtime": 14.4857, "eval_samples_per_second": 2.485, "eval_steps_per_second": 2.485, "step": 136 }, { "epoch": 16.235294117647058, "grad_norm": 0.3603919744491577, "learning_rate": 9.993147673772869e-06, "loss": 1.0141, "step": 138 }, { "epoch": 16.470588235294116, "grad_norm": 0.3525612950325012, "learning_rate": 9.991540791356342e-06, "loss": 1.0913, "step": 140 }, { "epoch": 16.705882352941178, "grad_norm": 0.34835246205329895, "learning_rate": 9.989764963830038e-06, "loss": 1.0581, "step": 142 }, { "epoch": 16.941176470588236, "grad_norm": 0.3293631076812744, "learning_rate": 9.987820251299121e-06, "loss": 0.9976, "step": 144 }, { "epoch": 16.941176470588236, "eval_loss": 1.0454792976379395, "eval_runtime": 14.4924, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 144 }, { "epoch": 17.176470588235293, "grad_norm": 0.3257598578929901, "learning_rate": 9.985706719584888e-06, "loss": 1.0318, "step": 146 }, { "epoch": 17.41176470588235, "grad_norm": 0.351471483707428, "learning_rate": 9.98342444022253e-06, "loss": 0.9863, "step": 148 }, { "epoch": 17.647058823529413, "grad_norm": 0.35920023918151855, "learning_rate": 9.980973490458728e-06, "loss": 1.0559, "step": 150 }, { "epoch": 17.88235294117647, "grad_norm": 0.3804563283920288, "learning_rate": 9.978353953249023e-06, "loss": 1.0285, "step": 152 }, { "epoch": 18.0, "eval_loss": 1.0318020582199097, "eval_runtime": 14.4883, "eval_samples_per_second": 2.485, "eval_steps_per_second": 2.485, "step": 153 }, { "epoch": 18.11764705882353, "grad_norm": 0.3365037739276886, "learning_rate": 9.975565917255017e-06, "loss": 0.9899, "step": 154 }, { "epoch": 18.352941176470587, "grad_norm": 0.33106836676597595, "learning_rate": 9.972609476841368e-06, "loss": 1.0316, "step": 156 }, { "epoch": 18.58823529411765, "grad_norm": 0.34729650616645813, "learning_rate": 9.9694847320726e-06, "loss": 1.0052, "step": 158 }, { "epoch": 18.823529411764707, "grad_norm": 0.3627675473690033, "learning_rate": 9.966191788709716e-06, "loss": 0.998, "step": 160 }, { "epoch": 18.941176470588236, "eval_loss": 1.0204837322235107, "eval_runtime": 14.4913, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 161 }, { "epoch": 19.058823529411764, "grad_norm": 0.37889716029167175, "learning_rate": 9.962730758206612e-06, "loss": 0.9902, "step": 162 }, { "epoch": 19.294117647058822, "grad_norm": 0.36187246441841125, "learning_rate": 9.959101757706308e-06, "loss": 1.0316, "step": 164 }, { "epoch": 19.529411764705884, "grad_norm": 0.4003053903579712, "learning_rate": 9.955304910036993e-06, "loss": 0.9643, "step": 166 }, { "epoch": 19.764705882352942, "grad_norm": 0.351055771112442, "learning_rate": 9.951340343707852e-06, "loss": 0.9944, "step": 168 }, { "epoch": 20.0, "grad_norm": 0.3519394099712372, "learning_rate": 9.947208192904722e-06, "loss": 1.0038, "step": 170 }, { "epoch": 20.0, "eval_loss": 1.01017427444458, "eval_runtime": 14.4899, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 170 }, { "epoch": 20.235294117647058, "grad_norm": 0.3706846237182617, "learning_rate": 9.942908597485558e-06, "loss": 0.9619, "step": 172 }, { "epoch": 20.470588235294116, "grad_norm": 0.39562636613845825, "learning_rate": 9.938441702975689e-06, "loss": 1.0219, "step": 174 }, { "epoch": 20.705882352941178, "grad_norm": 0.48325368762016296, "learning_rate": 9.933807660562898e-06, "loss": 0.9589, "step": 176 }, { "epoch": 20.941176470588236, "grad_norm": 0.4090341627597809, "learning_rate": 9.929006627092298e-06, "loss": 0.9907, "step": 178 }, { "epoch": 20.941176470588236, "eval_loss": 1.0019959211349487, "eval_runtime": 14.4938, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 178 }, { "epoch": 21.176470588235293, "grad_norm": 0.3393973708152771, "learning_rate": 9.924038765061042e-06, "loss": 0.9841, "step": 180 }, { "epoch": 21.41176470588235, "grad_norm": 0.47041693329811096, "learning_rate": 9.918904242612794e-06, "loss": 1.0023, "step": 182 }, { "epoch": 21.647058823529413, "grad_norm": 0.4426097273826599, "learning_rate": 9.913603233532067e-06, "loss": 0.9654, "step": 184 }, { "epoch": 21.88235294117647, "grad_norm": 0.4066382646560669, "learning_rate": 9.908135917238321e-06, "loss": 0.9673, "step": 186 }, { "epoch": 22.0, "eval_loss": 0.9929290413856506, "eval_runtime": 14.5036, "eval_samples_per_second": 2.482, "eval_steps_per_second": 2.482, "step": 187 }, { "epoch": 22.11764705882353, "grad_norm": 0.4103560447692871, "learning_rate": 9.902502478779897e-06, "loss": 0.9763, "step": 188 }, { "epoch": 22.352941176470587, "grad_norm": 0.3923400938510895, "learning_rate": 9.896703108827758e-06, "loss": 0.9583, "step": 190 }, { "epoch": 22.58823529411765, "grad_norm": 0.43239229917526245, "learning_rate": 9.890738003669029e-06, "loss": 0.9429, "step": 192 }, { "epoch": 22.823529411764707, "grad_norm": 0.45320191979408264, "learning_rate": 9.884607365200355e-06, "loss": 0.95, "step": 194 }, { "epoch": 22.941176470588236, "eval_loss": 0.9870390295982361, "eval_runtime": 14.4888, "eval_samples_per_second": 2.485, "eval_steps_per_second": 2.485, "step": 195 }, { "epoch": 23.058823529411764, "grad_norm": 0.48725754022598267, "learning_rate": 9.878311400921072e-06, "loss": 0.9369, "step": 196 }, { "epoch": 23.294117647058822, "grad_norm": 0.4515678286552429, "learning_rate": 9.871850323926178e-06, "loss": 0.9978, "step": 198 }, { "epoch": 23.529411764705884, "grad_norm": 0.44924628734588623, "learning_rate": 9.86522435289912e-06, "loss": 0.9126, "step": 200 }, { "epoch": 23.764705882352942, "grad_norm": 0.4258614182472229, "learning_rate": 9.858433712104403e-06, "loss": 0.9716, "step": 202 }, { "epoch": 24.0, "grad_norm": 0.46707141399383545, "learning_rate": 9.851478631379982e-06, "loss": 0.9467, "step": 204 }, { "epoch": 24.0, "eval_loss": 0.980078399181366, "eval_runtime": 14.4952, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 204 }, { "epoch": 24.235294117647058, "grad_norm": 0.5438560843467712, "learning_rate": 9.844359346129504e-06, "loss": 0.9756, "step": 206 }, { "epoch": 24.470588235294116, "grad_norm": 0.47229835391044617, "learning_rate": 9.83707609731432e-06, "loss": 0.9285, "step": 208 }, { "epoch": 24.705882352941178, "grad_norm": 0.4578736126422882, "learning_rate": 9.829629131445342e-06, "loss": 0.9288, "step": 210 }, { "epoch": 24.941176470588236, "grad_norm": 0.45513561367988586, "learning_rate": 9.822018700574696e-06, "loss": 0.9423, "step": 212 }, { "epoch": 24.941176470588236, "eval_loss": 0.9737484455108643, "eval_runtime": 14.4981, "eval_samples_per_second": 2.483, "eval_steps_per_second": 2.483, "step": 212 }, { "epoch": 25.176470588235293, "grad_norm": 0.4210945963859558, "learning_rate": 9.81424506228719e-06, "loss": 0.9093, "step": 214 }, { "epoch": 25.41176470588235, "grad_norm": 0.5261120796203613, "learning_rate": 9.806308479691595e-06, "loss": 0.9351, "step": 216 }, { "epoch": 25.647058823529413, "grad_norm": 0.5155240893363953, "learning_rate": 9.798209221411748e-06, "loss": 0.9548, "step": 218 }, { "epoch": 25.88235294117647, "grad_norm": 0.5193476676940918, "learning_rate": 9.789947561577445e-06, "loss": 0.937, "step": 220 }, { "epoch": 26.0, "eval_loss": 0.9675381183624268, "eval_runtime": 14.5154, "eval_samples_per_second": 2.48, "eval_steps_per_second": 2.48, "step": 221 }, { "epoch": 26.11764705882353, "grad_norm": 0.4255479574203491, "learning_rate": 9.781523779815178e-06, "loss": 0.9061, "step": 222 }, { "epoch": 26.352941176470587, "grad_norm": 0.5112140774726868, "learning_rate": 9.77293816123866e-06, "loss": 0.9409, "step": 224 }, { "epoch": 26.58823529411765, "grad_norm": 0.5464062094688416, "learning_rate": 9.764190996439181e-06, "loss": 0.9274, "step": 226 }, { "epoch": 26.823529411764707, "grad_norm": 0.473880797624588, "learning_rate": 9.755282581475769e-06, "loss": 0.9035, "step": 228 }, { "epoch": 26.941176470588236, "eval_loss": 0.9625819325447083, "eval_runtime": 14.5037, "eval_samples_per_second": 2.482, "eval_steps_per_second": 2.482, "step": 229 }, { "epoch": 27.058823529411764, "grad_norm": 0.5459398627281189, "learning_rate": 9.74621321786517e-06, "loss": 0.9274, "step": 230 }, { "epoch": 27.294117647058822, "grad_norm": 0.5077070593833923, "learning_rate": 9.736983212571646e-06, "loss": 0.911, "step": 232 }, { "epoch": 27.529411764705884, "grad_norm": 0.510081946849823, "learning_rate": 9.727592877996585e-06, "loss": 0.8977, "step": 234 }, { "epoch": 27.764705882352942, "grad_norm": 0.5523853898048401, "learning_rate": 9.718042531967918e-06, "loss": 0.9165, "step": 236 }, { "epoch": 28.0, "grad_norm": 0.587631344795227, "learning_rate": 9.708332497729378e-06, "loss": 0.9074, "step": 238 }, { "epoch": 28.0, "eval_loss": 0.9581733345985413, "eval_runtime": 14.489, "eval_samples_per_second": 2.485, "eval_steps_per_second": 2.485, "step": 238 }, { "epoch": 28.235294117647058, "grad_norm": 0.5389757752418518, "learning_rate": 9.698463103929542e-06, "loss": 0.8634, "step": 240 }, { "epoch": 28.470588235294116, "grad_norm": 0.5718576908111572, "learning_rate": 9.688434684610725e-06, "loss": 0.941, "step": 242 }, { "epoch": 28.705882352941178, "grad_norm": 0.5238634943962097, "learning_rate": 9.678247579197658e-06, "loss": 0.9235, "step": 244 }, { "epoch": 28.941176470588236, "grad_norm": 0.5517392754554749, "learning_rate": 9.667902132486009e-06, "loss": 0.8944, "step": 246 }, { "epoch": 28.941176470588236, "eval_loss": 0.9534149169921875, "eval_runtime": 14.4915, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 246 }, { "epoch": 29.176470588235293, "grad_norm": 0.6139256954193115, "learning_rate": 9.657398694630713e-06, "loss": 0.8854, "step": 248 }, { "epoch": 29.41176470588235, "grad_norm": 0.5577236413955688, "learning_rate": 9.646737621134112e-06, "loss": 0.9253, "step": 250 }, { "epoch": 29.647058823529413, "grad_norm": 0.638213038444519, "learning_rate": 9.635919272833938e-06, "loss": 0.9007, "step": 252 }, { "epoch": 29.88235294117647, "grad_norm": 0.5903518795967102, "learning_rate": 9.62494401589108e-06, "loss": 0.8785, "step": 254 }, { "epoch": 30.0, "eval_loss": 0.9493101835250854, "eval_runtime": 14.4864, "eval_samples_per_second": 2.485, "eval_steps_per_second": 2.485, "step": 255 }, { "epoch": 30.11764705882353, "grad_norm": 0.6490582823753357, "learning_rate": 9.613812221777212e-06, "loss": 0.8782, "step": 256 }, { "epoch": 30.352941176470587, "grad_norm": 0.5926429033279419, "learning_rate": 9.602524267262202e-06, "loss": 0.8326, "step": 258 }, { "epoch": 30.58823529411765, "grad_norm": 0.6460428237915039, "learning_rate": 9.591080534401371e-06, "loss": 0.9207, "step": 260 }, { "epoch": 30.823529411764707, "grad_norm": 0.59434574842453, "learning_rate": 9.579481410522556e-06, "loss": 0.8797, "step": 262 }, { "epoch": 30.941176470588236, "eval_loss": 0.9451322555541992, "eval_runtime": 14.513, "eval_samples_per_second": 2.481, "eval_steps_per_second": 2.481, "step": 263 }, { "epoch": 31.058823529411764, "grad_norm": 0.5878000855445862, "learning_rate": 9.567727288213005e-06, "loss": 0.864, "step": 264 }, { "epoch": 31.294117647058822, "grad_norm": 0.5548307299613953, "learning_rate": 9.555818565306086e-06, "loss": 0.8766, "step": 266 }, { "epoch": 31.529411764705884, "grad_norm": 0.6302839517593384, "learning_rate": 9.543755644867823e-06, "loss": 0.8702, "step": 268 }, { "epoch": 31.764705882352942, "grad_norm": 0.6953015923500061, "learning_rate": 9.531538935183252e-06, "loss": 0.8888, "step": 270 }, { "epoch": 32.0, "grad_norm": 0.8207558989524841, "learning_rate": 9.519168849742603e-06, "loss": 0.8764, "step": 272 }, { "epoch": 32.0, "eval_loss": 0.9421913623809814, "eval_runtime": 14.4916, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 272 }, { "epoch": 32.23529411764706, "grad_norm": 0.699301540851593, "learning_rate": 9.506645807227311e-06, "loss": 0.8338, "step": 274 }, { "epoch": 32.470588235294116, "grad_norm": 0.7847335934638977, "learning_rate": 9.493970231495836e-06, "loss": 0.9099, "step": 276 }, { "epoch": 32.705882352941174, "grad_norm": 0.7237752079963684, "learning_rate": 9.481142551569318e-06, "loss": 0.8442, "step": 278 }, { "epoch": 32.94117647058823, "grad_norm": 0.7358724474906921, "learning_rate": 9.468163201617063e-06, "loss": 0.8903, "step": 280 }, { "epoch": 32.94117647058823, "eval_loss": 0.9388971328735352, "eval_runtime": 14.4936, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 280 }, { "epoch": 33.1764705882353, "grad_norm": 0.7364740371704102, "learning_rate": 9.45503262094184e-06, "loss": 0.8319, "step": 282 }, { "epoch": 33.411764705882355, "grad_norm": 0.6712108254432678, "learning_rate": 9.441751253965022e-06, "loss": 0.8394, "step": 284 }, { "epoch": 33.64705882352941, "grad_norm": 0.9382649660110474, "learning_rate": 9.428319550211531e-06, "loss": 0.8431, "step": 286 }, { "epoch": 33.88235294117647, "grad_norm": 0.6663911938667297, "learning_rate": 9.414737964294636e-06, "loss": 0.8835, "step": 288 }, { "epoch": 34.0, "eval_loss": 0.9377151131629944, "eval_runtime": 14.4983, "eval_samples_per_second": 2.483, "eval_steps_per_second": 2.483, "step": 289 }, { "epoch": 34.11764705882353, "grad_norm": 0.9108101725578308, "learning_rate": 9.401006955900555e-06, "loss": 0.8886, "step": 290 }, { "epoch": 34.35294117647059, "grad_norm": 0.6387782096862793, "learning_rate": 9.38712698977291e-06, "loss": 0.8418, "step": 292 }, { "epoch": 34.588235294117645, "grad_norm": 0.7615544199943542, "learning_rate": 9.37309853569698e-06, "loss": 0.833, "step": 294 }, { "epoch": 34.8235294117647, "grad_norm": 0.7540785074234009, "learning_rate": 9.358922068483813e-06, "loss": 0.8452, "step": 296 }, { "epoch": 34.94117647058823, "eval_loss": 0.9331609010696411, "eval_runtime": 14.4883, "eval_samples_per_second": 2.485, "eval_steps_per_second": 2.485, "step": 297 }, { "epoch": 35.05882352941177, "grad_norm": 0.9316972494125366, "learning_rate": 9.344598067954151e-06, "loss": 0.8427, "step": 298 }, { "epoch": 35.294117647058826, "grad_norm": 0.8066386580467224, "learning_rate": 9.330127018922195e-06, "loss": 0.8404, "step": 300 }, { "epoch": 35.529411764705884, "grad_norm": 0.7952571511268616, "learning_rate": 9.315509411179182e-06, "loss": 0.7953, "step": 302 }, { "epoch": 35.76470588235294, "grad_norm": 0.7538934946060181, "learning_rate": 9.30074573947683e-06, "loss": 0.8435, "step": 304 }, { "epoch": 36.0, "grad_norm": 0.7602026462554932, "learning_rate": 9.285836503510562e-06, "loss": 0.8777, "step": 306 }, { "epoch": 36.0, "eval_loss": 0.9272398352622986, "eval_runtime": 14.4824, "eval_samples_per_second": 2.486, "eval_steps_per_second": 2.486, "step": 306 }, { "epoch": 36.23529411764706, "grad_norm": 0.7411664128303528, "learning_rate": 9.27078220790263e-06, "loss": 0.8382, "step": 308 }, { "epoch": 36.470588235294116, "grad_norm": 0.7392826676368713, "learning_rate": 9.255583362184998e-06, "loss": 0.8206, "step": 310 }, { "epoch": 36.705882352941174, "grad_norm": 0.8133856058120728, "learning_rate": 9.24024048078213e-06, "loss": 0.8384, "step": 312 }, { "epoch": 36.94117647058823, "grad_norm": 0.8298240900039673, "learning_rate": 9.224754082993553e-06, "loss": 0.8101, "step": 314 }, { "epoch": 36.94117647058823, "eval_loss": 0.9257263541221619, "eval_runtime": 14.5049, "eval_samples_per_second": 2.482, "eval_steps_per_second": 2.482, "step": 314 }, { "epoch": 37.1764705882353, "grad_norm": 0.756746768951416, "learning_rate": 9.209124692976287e-06, "loss": 0.8068, "step": 316 }, { "epoch": 37.411764705882355, "grad_norm": 0.7719102501869202, "learning_rate": 9.193352839727122e-06, "loss": 0.8067, "step": 318 }, { "epoch": 37.64705882352941, "grad_norm": 0.7538597583770752, "learning_rate": 9.177439057064684e-06, "loss": 0.821, "step": 320 }, { "epoch": 37.88235294117647, "grad_norm": 0.8648045063018799, "learning_rate": 9.16138388361139e-06, "loss": 0.8526, "step": 322 }, { "epoch": 38.0, "eval_loss": 0.9229225516319275, "eval_runtime": 14.498, "eval_samples_per_second": 2.483, "eval_steps_per_second": 2.483, "step": 323 }, { "epoch": 38.11764705882353, "grad_norm": 0.7806123495101929, "learning_rate": 9.145187862775208e-06, "loss": 0.8019, "step": 324 }, { "epoch": 38.35294117647059, "grad_norm": 0.8803067803382874, "learning_rate": 9.128851542731271e-06, "loss": 0.7752, "step": 326 }, { "epoch": 38.588235294117645, "grad_norm": 0.9083964824676514, "learning_rate": 9.112375476403313e-06, "loss": 0.8176, "step": 328 }, { "epoch": 38.8235294117647, "grad_norm": 0.8712770938873291, "learning_rate": 9.09576022144496e-06, "loss": 0.8228, "step": 330 }, { "epoch": 38.94117647058823, "eval_loss": 0.9196635484695435, "eval_runtime": 14.5068, "eval_samples_per_second": 2.482, "eval_steps_per_second": 2.482, "step": 331 }, { "epoch": 39.05882352941177, "grad_norm": 0.8734506964683533, "learning_rate": 9.079006340220862e-06, "loss": 0.8059, "step": 332 }, { "epoch": 39.294117647058826, "grad_norm": 0.8067296147346497, "learning_rate": 9.062114399787648e-06, "loss": 0.7936, "step": 334 }, { "epoch": 39.529411764705884, "grad_norm": 0.825115978717804, "learning_rate": 9.045084971874738e-06, "loss": 0.837, "step": 336 }, { "epoch": 39.76470588235294, "grad_norm": 0.9862772226333618, "learning_rate": 9.027918632864998e-06, "loss": 0.756, "step": 338 }, { "epoch": 40.0, "grad_norm": 0.9583683609962463, "learning_rate": 9.01061596377522e-06, "loss": 0.8066, "step": 340 }, { "epoch": 40.0, "eval_loss": 0.9175994396209717, "eval_runtime": 14.4912, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 340 }, { "epoch": 40.23529411764706, "grad_norm": 0.9128953814506531, "learning_rate": 8.993177550236464e-06, "loss": 0.8042, "step": 342 }, { "epoch": 40.470588235294116, "grad_norm": 0.9705334901809692, "learning_rate": 8.97560398247424e-06, "loss": 0.8072, "step": 344 }, { "epoch": 40.705882352941174, "grad_norm": 0.9346606731414795, "learning_rate": 8.957895855288517e-06, "loss": 0.7423, "step": 346 }, { "epoch": 40.94117647058823, "grad_norm": 1.2344577312469482, "learning_rate": 8.94005376803361e-06, "loss": 0.7701, "step": 348 }, { "epoch": 40.94117647058823, "eval_loss": 0.9198606014251709, "eval_runtime": 14.4977, "eval_samples_per_second": 2.483, "eval_steps_per_second": 2.483, "step": 348 }, { "epoch": 41.1764705882353, "grad_norm": 0.9365864992141724, "learning_rate": 8.92207832459788e-06, "loss": 0.7861, "step": 350 }, { "epoch": 41.411764705882355, "grad_norm": 0.8833426833152771, "learning_rate": 8.903970133383297e-06, "loss": 0.7686, "step": 352 }, { "epoch": 41.64705882352941, "grad_norm": 0.9196489453315735, "learning_rate": 8.885729807284855e-06, "loss": 0.773, "step": 354 }, { "epoch": 41.88235294117647, "grad_norm": 1.0096192359924316, "learning_rate": 8.867357963669821e-06, "loss": 0.8132, "step": 356 }, { "epoch": 42.0, "eval_loss": 0.916193962097168, "eval_runtime": 14.4918, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 357 }, { "epoch": 42.11764705882353, "grad_norm": 0.9623170495033264, "learning_rate": 8.84885522435684e-06, "loss": 0.7446, "step": 358 }, { "epoch": 42.35294117647059, "grad_norm": 0.9639574885368347, "learning_rate": 8.83022221559489e-06, "loss": 0.8012, "step": 360 }, { "epoch": 42.588235294117645, "grad_norm": 1.0695879459381104, "learning_rate": 8.811459568042092e-06, "loss": 0.7134, "step": 362 }, { "epoch": 42.8235294117647, "grad_norm": 1.0046355724334717, "learning_rate": 8.792567916744346e-06, "loss": 0.7804, "step": 364 }, { "epoch": 42.94117647058823, "eval_loss": 0.9103953838348389, "eval_runtime": 14.5212, "eval_samples_per_second": 2.479, "eval_steps_per_second": 2.479, "step": 365 }, { "epoch": 43.05882352941177, "grad_norm": 0.9536579251289368, "learning_rate": 8.773547901113862e-06, "loss": 0.7675, "step": 366 }, { "epoch": 43.294117647058826, "grad_norm": 0.9319316148757935, "learning_rate": 8.754400164907496e-06, "loss": 0.7311, "step": 368 }, { "epoch": 43.529411764705884, "grad_norm": 1.153613805770874, "learning_rate": 8.735125356204982e-06, "loss": 0.7777, "step": 370 }, { "epoch": 43.76470588235294, "grad_norm": 1.0393705368041992, "learning_rate": 8.715724127386971e-06, "loss": 0.7322, "step": 372 }, { "epoch": 44.0, "grad_norm": 0.9913296103477478, "learning_rate": 8.69619713511298e-06, "loss": 0.7508, "step": 374 }, { "epoch": 44.0, "eval_loss": 0.908278226852417, "eval_runtime": 14.4883, "eval_samples_per_second": 2.485, "eval_steps_per_second": 2.485, "step": 374 }, { "epoch": 44.23529411764706, "grad_norm": 0.9832679033279419, "learning_rate": 8.676545040299145e-06, "loss": 0.7355, "step": 376 }, { "epoch": 44.470588235294116, "grad_norm": 1.2460023164749146, "learning_rate": 8.656768508095853e-06, "loss": 0.7238, "step": 378 }, { "epoch": 44.705882352941174, "grad_norm": 0.964077889919281, "learning_rate": 8.636868207865244e-06, "loss": 0.7564, "step": 380 }, { "epoch": 44.94117647058823, "grad_norm": 0.9776307344436646, "learning_rate": 8.61684481315854e-06, "loss": 0.7192, "step": 382 }, { "epoch": 44.94117647058823, "eval_loss": 0.9051859378814697, "eval_runtime": 14.5008, "eval_samples_per_second": 2.483, "eval_steps_per_second": 2.483, "step": 382 }, { "epoch": 45.1764705882353, "grad_norm": 0.9675242304801941, "learning_rate": 8.596699001693257e-06, "loss": 0.754, "step": 384 }, { "epoch": 45.411764705882355, "grad_norm": 1.1183017492294312, "learning_rate": 8.576431455330258e-06, "loss": 0.6805, "step": 386 }, { "epoch": 45.64705882352941, "grad_norm": 0.9894379377365112, "learning_rate": 8.556042860050686e-06, "loss": 0.7691, "step": 388 }, { "epoch": 45.88235294117647, "grad_norm": 0.8682206869125366, "learning_rate": 8.535533905932739e-06, "loss": 0.7633, "step": 390 }, { "epoch": 46.0, "eval_loss": 0.9047689437866211, "eval_runtime": 14.5123, "eval_samples_per_second": 2.481, "eval_steps_per_second": 2.481, "step": 391 }, { "epoch": 46.11764705882353, "grad_norm": 0.9697660803794861, "learning_rate": 8.51490528712831e-06, "loss": 0.7131, "step": 392 }, { "epoch": 46.35294117647059, "grad_norm": 0.9910651445388794, "learning_rate": 8.4941577018395e-06, "loss": 0.7183, "step": 394 }, { "epoch": 46.588235294117645, "grad_norm": 1.670358419418335, "learning_rate": 8.473291852294986e-06, "loss": 0.6982, "step": 396 }, { "epoch": 46.8235294117647, "grad_norm": 1.1389248371124268, "learning_rate": 8.452308444726249e-06, "loss": 0.7534, "step": 398 }, { "epoch": 46.94117647058823, "eval_loss": 0.9052470326423645, "eval_runtime": 14.5068, "eval_samples_per_second": 2.482, "eval_steps_per_second": 2.482, "step": 399 }, { "epoch": 47.05882352941177, "grad_norm": 0.963392972946167, "learning_rate": 8.43120818934367e-06, "loss": 0.7282, "step": 400 }, { "epoch": 47.294117647058826, "grad_norm": 1.0950130224227905, "learning_rate": 8.409991800312493e-06, "loss": 0.7273, "step": 402 }, { "epoch": 47.529411764705884, "grad_norm": 1.0003873109817505, "learning_rate": 8.388659995728662e-06, "loss": 0.7113, "step": 404 }, { "epoch": 47.76470588235294, "grad_norm": 1.1168714761734009, "learning_rate": 8.367213497594501e-06, "loss": 0.7019, "step": 406 }, { "epoch": 48.0, "grad_norm": 1.1041070222854614, "learning_rate": 8.345653031794292e-06, "loss": 0.666, "step": 408 }, { "epoch": 48.0, "eval_loss": 0.915124237537384, "eval_runtime": 14.4863, "eval_samples_per_second": 2.485, "eval_steps_per_second": 2.485, "step": 408 }, { "epoch": 48.23529411764706, "grad_norm": 0.8982572555541992, "learning_rate": 8.323979328069689e-06, "loss": 0.6762, "step": 410 }, { "epoch": 48.470588235294116, "grad_norm": 1.0614545345306396, "learning_rate": 8.302193119995038e-06, "loss": 0.6779, "step": 412 }, { "epoch": 48.705882352941174, "grad_norm": 0.9689596891403198, "learning_rate": 8.280295144952537e-06, "loss": 0.7313, "step": 414 }, { "epoch": 48.94117647058823, "grad_norm": 0.9231712818145752, "learning_rate": 8.258286144107277e-06, "loss": 0.7298, "step": 416 }, { "epoch": 48.94117647058823, "eval_loss": 0.9143383502960205, "eval_runtime": 14.4951, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 416 }, { "epoch": 49.1764705882353, "grad_norm": 1.089706301689148, "learning_rate": 8.236166862382163e-06, "loss": 0.6328, "step": 418 }, { "epoch": 49.411764705882355, "grad_norm": 1.1155279874801636, "learning_rate": 8.213938048432697e-06, "loss": 0.6914, "step": 420 }, { "epoch": 49.64705882352941, "grad_norm": 0.9705954194068909, "learning_rate": 8.191600454621642e-06, "loss": 0.7003, "step": 422 }, { "epoch": 49.88235294117647, "grad_norm": 1.0692474842071533, "learning_rate": 8.16915483699355e-06, "loss": 0.6815, "step": 424 }, { "epoch": 50.0, "eval_loss": 0.9157248735427856, "eval_runtime": 14.4868, "eval_samples_per_second": 2.485, "eval_steps_per_second": 2.485, "step": 425 }, { "epoch": 50.11764705882353, "grad_norm": 1.0241312980651855, "learning_rate": 8.146601955249187e-06, "loss": 0.6999, "step": 426 }, { "epoch": 50.35294117647059, "grad_norm": 1.097902774810791, "learning_rate": 8.123942572719801e-06, "loss": 0.6549, "step": 428 }, { "epoch": 50.588235294117645, "grad_norm": 1.1850600242614746, "learning_rate": 8.101177456341301e-06, "loss": 0.6675, "step": 430 }, { "epoch": 50.8235294117647, "grad_norm": 1.080406904220581, "learning_rate": 8.078307376628292e-06, "loss": 0.6845, "step": 432 }, { "epoch": 50.94117647058823, "eval_loss": 0.9170003533363342, "eval_runtime": 14.5105, "eval_samples_per_second": 2.481, "eval_steps_per_second": 2.481, "step": 433 }, { "epoch": 51.05882352941177, "grad_norm": 1.128921389579773, "learning_rate": 8.055333107648e-06, "loss": 0.7022, "step": 434 }, { "epoch": 51.294117647058826, "grad_norm": 1.2979844808578491, "learning_rate": 8.032255426994069e-06, "loss": 0.6501, "step": 436 }, { "epoch": 51.529411764705884, "grad_norm": 1.0280219316482544, "learning_rate": 8.009075115760243e-06, "loss": 0.6824, "step": 438 }, { "epoch": 51.76470588235294, "grad_norm": 1.2425734996795654, "learning_rate": 7.985792958513932e-06, "loss": 0.6787, "step": 440 }, { "epoch": 52.0, "grad_norm": 1.114044427871704, "learning_rate": 7.962409743269654e-06, "loss": 0.6524, "step": 442 }, { "epoch": 52.0, "eval_loss": 0.9215981960296631, "eval_runtime": 14.5021, "eval_samples_per_second": 2.482, "eval_steps_per_second": 2.482, "step": 442 }, { "epoch": 52.23529411764706, "grad_norm": 1.1072558164596558, "learning_rate": 7.938926261462366e-06, "loss": 0.6602, "step": 444 }, { "epoch": 52.470588235294116, "grad_norm": 1.1433347463607788, "learning_rate": 7.915343307920674e-06, "loss": 0.6649, "step": 446 }, { "epoch": 52.705882352941174, "grad_norm": 1.1459075212478638, "learning_rate": 7.891661680839932e-06, "loss": 0.6407, "step": 448 }, { "epoch": 52.94117647058823, "grad_norm": 1.146519422531128, "learning_rate": 7.86788218175523e-06, "loss": 0.6397, "step": 450 }, { "epoch": 52.94117647058823, "eval_loss": 0.9228353500366211, "eval_runtime": 14.5275, "eval_samples_per_second": 2.478, "eval_steps_per_second": 2.478, "step": 450 }, { "epoch": 52.94117647058823, "step": 450, "total_flos": 1.1473460706410496e+17, "train_loss": 1.1512780372301739, "train_runtime": 8506.7232, "train_samples_per_second": 2.398, "train_steps_per_second": 0.141 } ], "logging_steps": 2, "max_steps": 1200, "num_input_tokens_seen": 0, "num_train_epochs": 150, "save_steps": 25, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 7, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1473460706410496e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }