{ "best_metric": 1.0718672275543213, "best_model_checkpoint": "outputs/checkpoint-602", "epoch": 16.885290148448043, "eval_steps": 500, "global_step": 782, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.4318488529014845, "grad_norm": 0.9060143232345581, "learning_rate": 6e-06, "loss": 2.3403, "step": 20 }, { "epoch": 0.863697705802969, "grad_norm": 0.6521434187889099, "learning_rate": 1.2e-05, "loss": 2.2954, "step": 40 }, { "epoch": 0.9932523616734144, "eval_loss": 2.0418646335601807, "eval_runtime": 11.606, "eval_samples_per_second": 32.052, "eval_steps_per_second": 4.05, "step": 46 }, { "epoch": 1.2955465587044535, "grad_norm": 0.5583148002624512, "learning_rate": 1.8e-05, "loss": 2.1939, "step": 60 }, { "epoch": 1.7273954116059378, "grad_norm": 0.5125463008880615, "learning_rate": 2.4e-05, "loss": 2.011, "step": 80 }, { "epoch": 1.9865047233468287, "eval_loss": 1.755987524986267, "eval_runtime": 11.6593, "eval_samples_per_second": 31.906, "eval_steps_per_second": 4.031, "step": 92 }, { "epoch": 2.1592442645074224, "grad_norm": 0.6073498725891113, "learning_rate": 3e-05, "loss": 1.9537, "step": 100 }, { "epoch": 2.591093117408907, "grad_norm": 0.5852423906326294, "learning_rate": 2.9936387121454473e-05, "loss": 1.848, "step": 120 }, { "epoch": 2.979757085020243, "eval_loss": 1.5818973779678345, "eval_runtime": 11.6353, "eval_samples_per_second": 31.972, "eval_steps_per_second": 4.039, "step": 138 }, { "epoch": 3.0229419703103915, "grad_norm": 0.9267003536224365, "learning_rate": 2.9746088032260134e-05, "loss": 1.8373, "step": 140 }, { "epoch": 3.454790823211876, "grad_norm": 0.929217517375946, "learning_rate": 2.9430716795463416e-05, "loss": 1.7299, "step": 160 }, { "epoch": 3.8866396761133606, "grad_norm": 0.9659837484359741, "learning_rate": 2.8992948300688734e-05, "loss": 1.6946, "step": 180 }, { "epoch": 3.9946018893387314, "eval_loss": 1.4404759407043457, "eval_runtime": 11.6425, "eval_samples_per_second": 31.952, "eval_steps_per_second": 4.037, "step": 185 }, { "epoch": 4.318488529014845, "grad_norm": 1.2371968030929565, "learning_rate": 2.843649557648131e-05, "loss": 1.6049, "step": 200 }, { "epoch": 4.75033738191633, "grad_norm": 1.4866975545883179, "learning_rate": 2.776607829744932e-05, "loss": 1.5971, "step": 220 }, { "epoch": 4.987854251012146, "eval_loss": 1.325922966003418, "eval_runtime": 11.6354, "eval_samples_per_second": 31.971, "eval_steps_per_second": 4.039, "step": 231 }, { "epoch": 5.182186234817814, "grad_norm": 1.290022611618042, "learning_rate": 2.6987382753318885e-05, "loss": 1.5523, "step": 240 }, { "epoch": 5.614035087719298, "grad_norm": 1.576865315437317, "learning_rate": 2.6107013619433027e-05, "loss": 1.4756, "step": 260 }, { "epoch": 5.98110661268556, "eval_loss": 1.2389355897903442, "eval_runtime": 11.6564, "eval_samples_per_second": 31.914, "eval_steps_per_second": 4.032, "step": 277 }, { "epoch": 6.045883940620783, "grad_norm": 1.606406569480896, "learning_rate": 2.513243793776364e-05, "loss": 1.4332, "step": 280 }, { "epoch": 6.477732793522267, "grad_norm": 1.5990298986434937, "learning_rate": 2.407192178357357e-05, "loss": 1.3462, "step": 300 }, { "epoch": 6.909581646423752, "grad_norm": 1.7616294622421265, "learning_rate": 2.2934460154904436e-05, "loss": 1.3916, "step": 320 }, { "epoch": 6.995951417004049, "eval_loss": 1.1750843524932861, "eval_runtime": 11.6363, "eval_samples_per_second": 31.969, "eval_steps_per_second": 4.039, "step": 324 }, { "epoch": 7.341430499325236, "grad_norm": 2.2005562782287598, "learning_rate": 2.1729700679547537e-05, "loss": 1.3007, "step": 340 }, { "epoch": 7.77327935222672, "grad_norm": 2.0084428787231445, "learning_rate": 2.0467861786593858e-05, "loss": 1.3156, "step": 360 }, { "epoch": 7.989203778677463, "eval_loss": 1.1366231441497803, "eval_runtime": 11.652, "eval_samples_per_second": 31.926, "eval_steps_per_second": 4.034, "step": 370 }, { "epoch": 8.205128205128204, "grad_norm": 2.2589313983917236, "learning_rate": 1.915964603660893e-05, "loss": 1.265, "step": 380 }, { "epoch": 8.63697705802969, "grad_norm": 2.530799627304077, "learning_rate": 1.7816149345541454e-05, "loss": 1.2353, "step": 400 }, { "epoch": 8.982456140350877, "eval_loss": 1.09785795211792, "eval_runtime": 11.6325, "eval_samples_per_second": 31.979, "eval_steps_per_second": 4.04, "step": 416 }, { "epoch": 9.068825910931174, "grad_norm": 1.9402692317962646, "learning_rate": 1.6448766872302767e-05, "loss": 1.2324, "step": 420 }, { "epoch": 9.50067476383266, "grad_norm": 2.265495538711548, "learning_rate": 1.5069096368252277e-05, "loss": 1.1746, "step": 440 }, { "epoch": 9.932523616734143, "grad_norm": 2.078183174133301, "learning_rate": 1.368883980835091e-05, "loss": 1.2004, "step": 460 }, { "epoch": 9.997300944669366, "eval_loss": 1.0847485065460205, "eval_runtime": 11.6602, "eval_samples_per_second": 31.903, "eval_steps_per_second": 4.031, "step": 463 }, { "epoch": 10.364372469635628, "grad_norm": 2.5572941303253174, "learning_rate": 1.2319704138319558e-05, "loss": 1.1473, "step": 480 }, { "epoch": 10.796221322537113, "grad_norm": 2.599616050720215, "learning_rate": 1.0973301979636888e-05, "loss": 1.1167, "step": 500 }, { "epoch": 10.99055330634278, "eval_loss": 1.075912356376648, "eval_runtime": 11.6432, "eval_samples_per_second": 31.95, "eval_steps_per_second": 4.037, "step": 509 }, { "epoch": 11.228070175438596, "grad_norm": 2.4529025554656982, "learning_rate": 9.66105313456874e-06, "loss": 1.169, "step": 520 }, { "epoch": 11.65991902834008, "grad_norm": 2.9774246215820312, "learning_rate": 8.394087726635483e-06, "loss": 1.1259, "step": 540 }, { "epoch": 11.983805668016194, "eval_loss": 1.0757372379302979, "eval_runtime": 11.6211, "eval_samples_per_second": 32.011, "eval_steps_per_second": 4.044, "step": 555 }, { "epoch": 12.091767881241566, "grad_norm": 2.654956340789795, "learning_rate": 7.183151798052628e-06, "loss": 1.098, "step": 560 }, { "epoch": 12.523616734143049, "grad_norm": 2.110180139541626, "learning_rate": 6.038516164840341e-06, "loss": 1.0905, "step": 580 }, { "epoch": 12.955465587044534, "grad_norm": 3.4599268436431885, "learning_rate": 4.969889302667217e-06, "loss": 1.0781, "step": 600 }, { "epoch": 12.998650472334683, "eval_loss": 1.0718672275543213, "eval_runtime": 11.6347, "eval_samples_per_second": 31.973, "eval_steps_per_second": 4.04, "step": 602 }, { "epoch": 13.387314439946019, "grad_norm": 2.49025821685791, "learning_rate": 3.986335002305783e-06, "loss": 1.0559, "step": 620 }, { "epoch": 13.819163292847504, "grad_norm": 2.6091041564941406, "learning_rate": 3.0961954931229385e-06, "loss": 1.1183, "step": 640 }, { "epoch": 13.991902834008098, "eval_loss": 1.0730143785476685, "eval_runtime": 11.6275, "eval_samples_per_second": 31.993, "eval_steps_per_second": 4.042, "step": 648 }, { "epoch": 14.251012145748987, "grad_norm": 2.651367425918579, "learning_rate": 2.307020686650187e-06, "loss": 1.0891, "step": 660 }, { "epoch": 14.682860998650472, "grad_norm": 2.513788938522339, "learning_rate": 1.6255041403695748e-06, "loss": 1.058, "step": 680 }, { "epoch": 14.98515519568151, "eval_loss": 1.074668288230896, "eval_runtime": 11.6329, "eval_samples_per_second": 31.978, "eval_steps_per_second": 4.04, "step": 694 }, { "epoch": 15.114709851551957, "grad_norm": 2.880951166152954, "learning_rate": 1.0574262848524725e-06, "loss": 1.0858, "step": 700 }, { "epoch": 15.54655870445344, "grad_norm": 2.6492371559143066, "learning_rate": 6.076053957825411e-07, "loss": 1.0445, "step": 720 }, { "epoch": 15.978407557354926, "grad_norm": 2.578446626663208, "learning_rate": 2.798567267042601e-07, "loss": 1.1116, "step": 740 }, { "epoch": 16.0, "eval_loss": 1.0725425481796265, "eval_runtime": 11.6476, "eval_samples_per_second": 31.938, "eval_steps_per_second": 4.035, "step": 741 }, { "epoch": 16.41025641025641, "grad_norm": 2.4758195877075195, "learning_rate": 7.696014912157267e-08, "loss": 1.0335, "step": 760 }, { "epoch": 16.842105263157894, "grad_norm": 2.5701375007629395, "learning_rate": 6.365744140879315e-10, "loss": 1.0869, "step": 780 }, { "epoch": 16.885290148448043, "eval_loss": 1.0733098983764648, "eval_runtime": 11.6223, "eval_samples_per_second": 32.008, "eval_steps_per_second": 4.044, "step": 782 } ], "logging_steps": 20, "max_steps": 782, "num_input_tokens_seen": 0, "num_train_epochs": 17, "save_steps": 500, "total_flos": 6.955404634570752e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }