{ "best_metric": 0.9539479613304138, "best_model_checkpoint": "outputs/checkpoint-231", "epoch": 10.99055330634278, "eval_steps": 500, "global_step": 509, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.4318488529014845, "grad_norm": 0.7265238165855408, "learning_rate": 4e-05, "loss": 2.2901, "step": 20 }, { "epoch": 0.863697705802969, "grad_norm": 0.4902186989784241, "learning_rate": 8e-05, "loss": 2.0553, "step": 40 }, { "epoch": 0.9932523616734144, "eval_loss": 1.7082113027572632, "eval_runtime": 11.6752, "eval_samples_per_second": 31.862, "eval_steps_per_second": 4.026, "step": 46 }, { "epoch": 1.2955465587044535, "grad_norm": 0.5778974294662476, "learning_rate": 0.00012, "loss": 1.9034, "step": 60 }, { "epoch": 1.7273954116059378, "grad_norm": 0.818723201751709, "learning_rate": 0.00016, "loss": 1.6895, "step": 80 }, { "epoch": 1.9865047233468287, "eval_loss": 1.3229522705078125, "eval_runtime": 11.7014, "eval_samples_per_second": 31.791, "eval_steps_per_second": 4.017, "step": 92 }, { "epoch": 2.1592442645074224, "grad_norm": 1.2037190198898315, "learning_rate": 0.0002, "loss": 1.5763, "step": 100 }, { "epoch": 2.591093117408907, "grad_norm": 1.2296253442764282, "learning_rate": 0.00019978499773373596, "loss": 1.3997, "step": 120 }, { "epoch": 2.979757085020243, "eval_loss": 1.0936287641525269, "eval_runtime": 11.7036, "eval_samples_per_second": 31.785, "eval_steps_per_second": 4.016, "step": 138 }, { "epoch": 3.0229419703103915, "grad_norm": 1.3699569702148438, "learning_rate": 0.0001991409154544338, "loss": 1.3491, "step": 140 }, { "epoch": 3.454790823211876, "grad_norm": 1.451266884803772, "learning_rate": 0.00019807052274508773, "loss": 1.1626, "step": 160 }, { "epoch": 3.8866396761133606, "grad_norm": 1.3704801797866821, "learning_rate": 0.0001965784223428638, "loss": 1.1558, "step": 180 }, { "epoch": 3.9946018893387314, "eval_loss": 0.9916501641273499, "eval_runtime": 11.7248, "eval_samples_per_second": 31.728, "eval_steps_per_second": 4.009, "step": 185 }, { "epoch": 4.318488529014845, "grad_norm": 1.6158828735351562, "learning_rate": 0.0001946710303471214, "loss": 1.0048, "step": 200 }, { "epoch": 4.75033738191633, "grad_norm": 1.7279417514801025, "learning_rate": 0.00019235654862989537, "loss": 1.0293, "step": 220 }, { "epoch": 4.987854251012146, "eval_loss": 0.9539479613304138, "eval_runtime": 11.7041, "eval_samples_per_second": 31.784, "eval_steps_per_second": 4.016, "step": 231 }, { "epoch": 5.182186234817814, "grad_norm": 1.5809475183486938, "learning_rate": 0.00018964492956747425, "loss": 0.966, "step": 240 }, { "epoch": 5.614035087719298, "grad_norm": 1.6080421209335327, "learning_rate": 0.00018654783324473137, "loss": 0.8655, "step": 260 }, { "epoch": 5.98110661268556, "eval_loss": 0.979802131652832, "eval_runtime": 11.7244, "eval_samples_per_second": 31.729, "eval_steps_per_second": 4.009, "step": 277 }, { "epoch": 6.045883940620783, "grad_norm": 1.3081737756729126, "learning_rate": 0.00018307857731623132, "loss": 0.8181, "step": 280 }, { "epoch": 6.477732793522267, "grad_norm": 1.4086545705795288, "learning_rate": 0.0001792520797397116, "loss": 0.689, "step": 300 }, { "epoch": 6.909581646423752, "grad_norm": 1.4386905431747437, "learning_rate": 0.00017508479462818833, "loss": 0.768, "step": 320 }, { "epoch": 6.995951417004049, "eval_loss": 1.0414738655090332, "eval_runtime": 11.7025, "eval_samples_per_second": 31.788, "eval_steps_per_second": 4.016, "step": 324 }, { "epoch": 7.341430499325236, "grad_norm": 1.4608936309814453, "learning_rate": 0.00017059464149652448, "loss": 0.6351, "step": 340 }, { "epoch": 7.77327935222672, "grad_norm": 1.4568973779678345, "learning_rate": 0.0001658009282067036, "loss": 0.6611, "step": 360 }, { "epoch": 7.989203778677463, "eval_loss": 1.0716439485549927, "eval_runtime": 11.702, "eval_samples_per_second": 31.789, "eval_steps_per_second": 4.016, "step": 370 }, { "epoch": 8.205128205128204, "grad_norm": 1.7085548639297485, "learning_rate": 0.00016072426794314836, "loss": 0.5859, "step": 380 }, { "epoch": 8.63697705802969, "grad_norm": 1.7092846632003784, "learning_rate": 0.00015538649057509306, "loss": 0.5377, "step": 400 }, { "epoch": 8.982456140350877, "eval_loss": 1.1437946557998657, "eval_runtime": 11.7191, "eval_samples_per_second": 31.743, "eval_steps_per_second": 4.011, "step": 416 }, { "epoch": 9.068825910931174, "grad_norm": 1.2362638711929321, "learning_rate": 0.0001498105487871566, "loss": 0.5412, "step": 420 }, { "epoch": 9.50067476383266, "grad_norm": 1.3204894065856934, "learning_rate": 0.00014402041938175826, "loss": 0.454, "step": 440 }, { "epoch": 9.932523616734143, "grad_norm": 1.385745882987976, "learning_rate": 0.00013804100017778047, "loss": 0.4787, "step": 460 }, { "epoch": 9.997300944669366, "eval_loss": 1.2027604579925537, "eval_runtime": 11.7626, "eval_samples_per_second": 31.626, "eval_steps_per_second": 3.996, "step": 463 }, { "epoch": 10.364372469635628, "grad_norm": 1.4514780044555664, "learning_rate": 0.00013189800294881925, "loss": 0.3897, "step": 480 }, { "epoch": 10.796221322537113, "grad_norm": 1.575695276260376, "learning_rate": 0.00012561784286139213, "loss": 0.3861, "step": 500 }, { "epoch": 10.99055330634278, "eval_loss": 1.2678759098052979, "eval_runtime": 11.7185, "eval_samples_per_second": 31.745, "eval_steps_per_second": 4.011, "step": 509 } ], "logging_steps": 20, "max_steps": 1058, "num_input_tokens_seen": 0, "num_train_epochs": 23, "save_steps": 500, "total_flos": 4.524072585417523e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }