{
  "best_metric": 1.0532242059707642,
  "best_model_checkpoint": "outputs/checkpoint-648",
  "epoch": 19.865047233468285,
  "eval_steps": 500,
  "global_step": 920,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.4318488529014845, "grad_norm": 0.8586685061454773, "learning_rate": 6e-06, "loss": 2.3406, "step": 20 },
    { "epoch": 0.863697705802969, "grad_norm": 0.6262326240539551, "learning_rate": 1.2e-05, "loss": 2.2977, "step": 40 },
    { "epoch": 0.9932523616734144, "eval_loss": 2.046743154525757, "eval_runtime": 11.8178, "eval_samples_per_second": 31.478, "eval_steps_per_second": 3.977, "step": 46 },
    { "epoch": 1.2955465587044535, "grad_norm": 0.5441118478775024, "learning_rate": 1.8e-05, "loss": 2.1983, "step": 60 },
    { "epoch": 1.7273954116059378, "grad_norm": 0.5027536153793335, "learning_rate": 2.4e-05, "loss": 2.0126, "step": 80 },
    { "epoch": 1.9865047233468287, "eval_loss": 1.756650686264038, "eval_runtime": 11.8327, "eval_samples_per_second": 31.438, "eval_steps_per_second": 3.972, "step": 92 },
    { "epoch": 2.1592442645074224, "grad_norm": 0.5985815525054932, "learning_rate": 3e-05, "loss": 1.9539, "step": 100 },
    { "epoch": 2.591093117408907, "grad_norm": 0.5876953601837158, "learning_rate": 2.9955987017756105e-05, "loss": 1.8487, "step": 120 },
    { "epoch": 2.979757085020243, "eval_loss": 1.5829453468322754, "eval_runtime": 11.8342, "eval_samples_per_second": 31.434, "eval_steps_per_second": 3.972, "step": 138 },
    { "epoch": 3.0229419703103915, "grad_norm": 0.9397473931312561, "learning_rate": 2.982420635670523e-05, "loss": 1.838, "step": 140 },
    { "epoch": 3.454790823211876, "grad_norm": 0.9532083868980408, "learning_rate": 2.9605431358166687e-05, "loss": 1.7307, "step": 160 },
    { "epoch": 3.8866396761133606, "grad_norm": 0.9994444847106934, "learning_rate": 2.9300945880823957e-05, "loss": 1.6956, "step": 180 },
    { "epoch": 3.9946018893387314, "eval_loss": 1.4417527914047241, "eval_runtime": 11.8388, "eval_samples_per_second": 31.422, "eval_steps_per_second": 3.97, "step": 185 },
    { "epoch": 4.318488529014845, "grad_norm": 1.2850545644760132, "learning_rate": 2.8912536766531424e-05, "loss": 1.6058, "step": 200 },
    { "epoch": 4.75033738191633, "grad_norm": 1.5529141426086426, "learning_rate": 2.8442483354415835e-05, "loss": 1.5981, "step": 220 },
    { "epoch": 4.987854251012146, "eval_loss": 1.3265655040740967, "eval_runtime": 11.8379, "eval_samples_per_second": 31.424, "eval_steps_per_second": 3.97, "step": 231 },
    { "epoch": 5.182186234817814, "grad_norm": 1.335929036140442, "learning_rate": 2.789354410480802e-05, "loss": 1.5531, "step": 240 },
    { "epoch": 5.614035087719298, "grad_norm": 1.6453096866607666, "learning_rate": 2.7268940411500768e-05, "loss": 1.4762, "step": 260 },
    { "epoch": 5.98110661268556, "eval_loss": 1.2389419078826904, "eval_runtime": 11.8303, "eval_samples_per_second": 31.445, "eval_steps_per_second": 3.973, "step": 277 },
    { "epoch": 6.045883940620783, "grad_norm": 1.6546735763549805, "learning_rate": 2.6572337697329145e-05, "loss": 1.4335, "step": 280 },
    { "epoch": 6.477732793522267, "grad_norm": 1.6672505140304565, "learning_rate": 2.5807823904011803e-05, "loss": 1.3456, "step": 300 },
    { "epoch": 6.909581646423752, "grad_norm": 1.872035264968872, "learning_rate": 2.497988550248348e-05, "loss": 1.3907, "step": 320 },
    { "epoch": 6.995951417004049, "eval_loss": 1.1727797985076904, "eval_runtime": 11.8334, "eval_samples_per_second": 31.436, "eval_steps_per_second": 3.972, "step": 324 },
    { "epoch": 7.341430499325236, "grad_norm": 2.2928590774536133, "learning_rate": 2.4093381164499572e-05, "loss": 1.2972, "step": 340 },
    { "epoch": 7.77327935222672, "grad_norm": 2.046597957611084, "learning_rate": 2.315351325001832e-05, "loss": 1.3121, "step": 360 },
    { "epoch": 7.989203778677463, "eval_loss": 1.1317445039749146, "eval_runtime": 11.8354, "eval_samples_per_second": 31.431, "eval_steps_per_second": 3.971, "step": 370 },
    { "epoch": 8.205128205128204, "grad_norm": 2.3657939434051514, "learning_rate": 2.2165797277683945e-05, "loss": 1.2587, "step": 380 },
    { "epoch": 8.63697705802969, "grad_norm": 2.6628217697143555, "learning_rate": 2.11360295575701e-05, "loss": 1.2259, "step": 400 },
    { "epoch": 8.982456140350877, "eval_loss": 1.089838981628418, "eval_runtime": 11.8331, "eval_samples_per_second": 31.437, "eval_steps_per_second": 3.972, "step": 416 },
    { "epoch": 9.068825910931174, "grad_norm": 2.126490354537964, "learning_rate": 2.007025317612754e-05, "loss": 1.2231, "step": 420 },
    { "epoch": 9.50067476383266, "grad_norm": 2.33013653755188, "learning_rate": 1.897472253294993e-05, "loss": 1.1601, "step": 440 },
    { "epoch": 9.932523616734143, "grad_norm": 2.1320955753326416, "learning_rate": 1.7855866637470027e-05, "loss": 1.1858, "step": 460 },
    { "epoch": 9.997300944669366, "eval_loss": 1.0753756761550903, "eval_runtime": 11.8356, "eval_samples_per_second": 31.431, "eval_steps_per_second": 3.971, "step": 463 },
    { "epoch": 10.364372469635628, "grad_norm": 2.7270896434783936, "learning_rate": 1.6720251380976008e-05, "loss": 1.1264, "step": 480 },
    { "epoch": 10.796221322537113, "grad_norm": 2.817499876022339, "learning_rate": 1.557454100535053e-05, "loss": 1.0953, "step": 500 },
    { "epoch": 10.99055330634278, "eval_loss": 1.0628381967544556, "eval_runtime": 11.8346, "eval_samples_per_second": 31.433, "eval_steps_per_second": 3.971, "step": 509 },
    { "epoch": 11.228070175438596, "grad_norm": 2.4983890056610107, "learning_rate": 1.442545899464947e-05, "loss": 1.1448, "step": 520 },
    { "epoch": 11.65991902834008, "grad_norm": 3.187267303466797, "learning_rate": 1.3279748619023994e-05, "loss": 1.0969, "step": 540 },
    { "epoch": 11.983805668016194, "eval_loss": 1.0652039051055908, "eval_runtime": 11.8314, "eval_samples_per_second": 31.442, "eval_steps_per_second": 3.972, "step": 555 },
    { "epoch": 12.091767881241566, "grad_norm": 2.7236690521240234, "learning_rate": 1.2144133362529972e-05, "loss": 1.0659, "step": 560 },
    { "epoch": 12.523616734143049, "grad_norm": 2.291569232940674, "learning_rate": 1.1025277467050077e-05, "loss": 1.0509, "step": 580 },
    { "epoch": 12.955465587044534, "grad_norm": 3.7598021030426025, "learning_rate": 9.929746823872461e-06, "loss": 1.0398, "step": 600 },
    { "epoch": 12.998650472334683, "eval_loss": 1.0660185813903809, "eval_runtime": 11.8359, "eval_samples_per_second": 31.43, "eval_steps_per_second": 3.971, "step": 602 },
    { "epoch": 13.387314439946019, "grad_norm": 2.636166572570801, "learning_rate": 8.863970442429903e-06, "loss": 1.0089, "step": 620 },
    { "epoch": 13.819163292847504, "grad_norm": 2.8337323665618896, "learning_rate": 7.834202722316054e-06, "loss": 1.0692, "step": 640 },
    { "epoch": 13.991902834008098, "eval_loss": 1.0532242059707642, "eval_runtime": 11.8398, "eval_samples_per_second": 31.419, "eval_steps_per_second": 3.97, "step": 648 },
    { "epoch": 14.251012145748987, "grad_norm": 3.053009033203125, "learning_rate": 6.846486749981685e-06, "loss": 1.0366, "step": 660 },
    { "epoch": 14.682860998650472, "grad_norm": 2.598278522491455, "learning_rate": 5.906618835500434e-06, "loss": 0.9994, "step": 680 },
    { "epoch": 14.98515519568151, "eval_loss": 1.0662660598754883, "eval_runtime": 11.8811, "eval_samples_per_second": 31.31, "eval_steps_per_second": 3.956, "step": 694 },
    { "epoch": 15.114709851551957, "grad_norm": 3.115391492843628, "learning_rate": 5.0201144975165215e-06, "loss": 1.0248, "step": 700 },
    { "epoch": 15.54655870445344, "grad_norm": 2.9552927017211914, "learning_rate": 4.192176095988196e-06, "loss": 0.9757, "step": 720 },
    { "epoch": 15.978407557354926, "grad_norm": 2.836552143096924, "learning_rate": 3.4276623026708552e-06, "loss": 1.0462, "step": 740 },
    { "epoch": 16.0, "eval_loss": 1.0600887537002563, "eval_runtime": 11.8762, "eval_samples_per_second": 31.323, "eval_steps_per_second": 3.957, "step": 741 },
    { "epoch": 16.41025641025641, "grad_norm": 2.7339868545532227, "learning_rate": 2.7310595884992356e-06, "loss": 0.956, "step": 760 },
    { "epoch": 16.842105263157894, "grad_norm": 2.860797882080078, "learning_rate": 2.1064558951919854e-06, "loss": 1.0121, "step": 780 },
    { "epoch": 16.993252361673413, "eval_loss": 1.067551851272583, "eval_runtime": 11.9433, "eval_samples_per_second": 31.147, "eval_steps_per_second": 3.935, "step": 787 },
    { "epoch": 17.27395411605938, "grad_norm": 3.2689273357391357, "learning_rate": 1.5575166455841678e-06, "loss": 1.0109, "step": 800 },
    { "epoch": 17.705802968960864, "grad_norm": 2.9255502223968506, "learning_rate": 1.0874632334685808e-06, "loss": 0.926, "step": 820 },
    { "epoch": 17.98650472334683, "eval_loss": 1.0671662092208862, "eval_runtime": 11.9354, "eval_samples_per_second": 31.168, "eval_steps_per_second": 3.938, "step": 833 },
    { "epoch": 18.13765182186235, "grad_norm": 2.799520492553711, "learning_rate": 6.990541191760419e-07, "loss": 1.015, "step": 840 },
    { "epoch": 18.569500674763834, "grad_norm": 3.2285776138305664, "learning_rate": 3.945686418333155e-07, "loss": 0.9512, "step": 860 },
    { "epoch": 18.979757085020243, "eval_loss": 1.0708719491958618, "eval_runtime": 11.9397, "eval_samples_per_second": 31.157, "eval_steps_per_second": 3.936, "step": 879 },
    { "epoch": 19.00134952766532, "grad_norm": 2.990569591522217, "learning_rate": 1.7579364329477376e-07, "loss": 0.9571, "step": 880 },
    { "epoch": 19.4331983805668, "grad_norm": 3.1629343032836914, "learning_rate": 4.401298224389338e-08, "loss": 0.9524, "step": 900 },
    { "epoch": 19.865047233468285, "grad_norm": 3.00166392326355, "learning_rate": 0.0, "loss": 0.976, "step": 920 },
    { "epoch": 19.865047233468285, "eval_loss": 1.0710477828979492, "eval_runtime": 11.9108, "eval_samples_per_second": 31.232, "eval_steps_per_second": 3.946, "step": 920 }
  ],
  "logging_steps": 20,
  "max_steps": 920,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 500,
  "total_flos": 8.174619616582042e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
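The object above matches the schema of a trainer_state.json written by the Hugging Face Transformers Trainer. A minimal Python sketch for reading such a file back and listing the per-epoch evaluation loss follows; the file path is an assumption (point it at wherever this state file is saved), and plotting is omitted to keep the example self-contained.

import json

# Assumed location of the state shown above; adjust to your checkpoint directory.
STATE_PATH = "outputs/checkpoint-920/trainer_state.json"

with open(STATE_PATH) as f:
    state = json.load(f)

# log_history mixes training records (with "loss") and eval records (with "eval_loss").
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print(f"best eval_loss {state['best_metric']:.4f} at {state['best_model_checkpoint']}")
for e in eval_log:
    print(f"epoch {e['epoch']:6.2f}  step {e['step']:4d}  eval_loss {e['eval_loss']:.4f}")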