{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9977011494252874, "eval_steps": 500, "global_step": 978, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03065134099616858, "grad_norm": 1.728469379207447, "learning_rate": 5e-06, "loss": 0.8116, "step": 10 }, { "epoch": 0.06130268199233716, "grad_norm": 10.819369936863422, "learning_rate": 5e-06, "loss": 0.7399, "step": 20 }, { "epoch": 0.09195402298850575, "grad_norm": 4.190454906116627, "learning_rate": 5e-06, "loss": 0.7334, "step": 30 }, { "epoch": 0.12260536398467432, "grad_norm": 1.3438381662790366, "learning_rate": 5e-06, "loss": 0.7064, "step": 40 }, { "epoch": 0.1532567049808429, "grad_norm": 0.8718769327333502, "learning_rate": 5e-06, "loss": 0.6887, "step": 50 }, { "epoch": 0.1839080459770115, "grad_norm": 1.005358100868816, "learning_rate": 5e-06, "loss": 0.6891, "step": 60 }, { "epoch": 0.21455938697318008, "grad_norm": 1.4650120091926921, "learning_rate": 5e-06, "loss": 0.6725, "step": 70 }, { "epoch": 0.24521072796934865, "grad_norm": 0.7835954045094561, "learning_rate": 5e-06, "loss": 0.6616, "step": 80 }, { "epoch": 0.27586206896551724, "grad_norm": 0.8866257551751919, "learning_rate": 5e-06, "loss": 0.6505, "step": 90 }, { "epoch": 0.3065134099616858, "grad_norm": 0.5918777617135533, "learning_rate": 5e-06, "loss": 0.6513, "step": 100 }, { "epoch": 0.3371647509578544, "grad_norm": 0.5197043260328372, "learning_rate": 5e-06, "loss": 0.6424, "step": 110 }, { "epoch": 0.367816091954023, "grad_norm": 1.3052113076963672, "learning_rate": 5e-06, "loss": 0.6453, "step": 120 }, { "epoch": 0.39846743295019155, "grad_norm": 1.1472080859635072, "learning_rate": 5e-06, "loss": 0.6473, "step": 130 }, { "epoch": 0.42911877394636017, "grad_norm": 0.6535118613806697, "learning_rate": 5e-06, "loss": 0.6464, "step": 140 }, { "epoch": 0.45977011494252873, "grad_norm": 0.5755555420983144, "learning_rate": 5e-06, "loss": 0.6502, "step": 150 }, { "epoch": 0.4904214559386973, "grad_norm": 0.5960503646265235, "learning_rate": 5e-06, "loss": 0.6399, "step": 160 }, { "epoch": 0.5210727969348659, "grad_norm": 0.6534430237511478, "learning_rate": 5e-06, "loss": 0.6404, "step": 170 }, { "epoch": 0.5517241379310345, "grad_norm": 0.479772716926196, "learning_rate": 5e-06, "loss": 0.6379, "step": 180 }, { "epoch": 0.5823754789272031, "grad_norm": 0.6700701447636983, "learning_rate": 5e-06, "loss": 0.6268, "step": 190 }, { "epoch": 0.6130268199233716, "grad_norm": 0.46981415838615953, "learning_rate": 5e-06, "loss": 0.6332, "step": 200 }, { "epoch": 0.6436781609195402, "grad_norm": 0.6740217268091045, "learning_rate": 5e-06, "loss": 0.6315, "step": 210 }, { "epoch": 0.6743295019157088, "grad_norm": 0.45781083259612837, "learning_rate": 5e-06, "loss": 0.6251, "step": 220 }, { "epoch": 0.7049808429118773, "grad_norm": 0.46419264516556447, "learning_rate": 5e-06, "loss": 0.6331, "step": 230 }, { "epoch": 0.735632183908046, "grad_norm": 0.46609321446559443, "learning_rate": 5e-06, "loss": 0.636, "step": 240 }, { "epoch": 0.7662835249042146, "grad_norm": 0.5350231771223092, "learning_rate": 5e-06, "loss": 0.6177, "step": 250 }, { "epoch": 0.7969348659003831, "grad_norm": 0.4743633834962204, "learning_rate": 5e-06, "loss": 0.6261, "step": 260 }, { "epoch": 0.8275862068965517, "grad_norm": 0.6677241895273329, "learning_rate": 5e-06, "loss": 0.6334, "step": 270 }, { "epoch": 0.8582375478927203, "grad_norm": 0.5408535187898951, "learning_rate": 5e-06, "loss": 0.628, "step": 280 }, { "epoch": 0.8888888888888888, "grad_norm": 0.6364909996205239, "learning_rate": 5e-06, "loss": 0.6304, "step": 290 }, { "epoch": 0.9195402298850575, "grad_norm": 0.5219626763836217, "learning_rate": 5e-06, "loss": 0.624, "step": 300 }, { "epoch": 0.9501915708812261, "grad_norm": 0.6504021944371435, "learning_rate": 5e-06, "loss": 0.6229, "step": 310 }, { "epoch": 0.9808429118773946, "grad_norm": 0.5113118633186393, "learning_rate": 5e-06, "loss": 0.6389, "step": 320 }, { "epoch": 0.9992337164750957, "eval_loss": 0.6248807907104492, "eval_runtime": 173.9145, "eval_samples_per_second": 50.525, "eval_steps_per_second": 0.397, "step": 326 }, { "epoch": 1.0114942528735633, "grad_norm": 0.6441733804420072, "learning_rate": 5e-06, "loss": 0.6049, "step": 330 }, { "epoch": 1.0421455938697317, "grad_norm": 0.507096490465927, "learning_rate": 5e-06, "loss": 0.5799, "step": 340 }, { "epoch": 1.0727969348659003, "grad_norm": 0.46519093469137246, "learning_rate": 5e-06, "loss": 0.5673, "step": 350 }, { "epoch": 1.103448275862069, "grad_norm": 0.5442582638655543, "learning_rate": 5e-06, "loss": 0.5712, "step": 360 }, { "epoch": 1.1340996168582376, "grad_norm": 0.5538652390132913, "learning_rate": 5e-06, "loss": 0.5716, "step": 370 }, { "epoch": 1.1647509578544062, "grad_norm": 0.48392472216552723, "learning_rate": 5e-06, "loss": 0.571, "step": 380 }, { "epoch": 1.1954022988505748, "grad_norm": 0.45417280073723987, "learning_rate": 5e-06, "loss": 0.5747, "step": 390 }, { "epoch": 1.2260536398467432, "grad_norm": 0.5583573730240281, "learning_rate": 5e-06, "loss": 0.5671, "step": 400 }, { "epoch": 1.2567049808429118, "grad_norm": 0.5207404064605624, "learning_rate": 5e-06, "loss": 0.573, "step": 410 }, { "epoch": 1.2873563218390804, "grad_norm": 0.7437830024244683, "learning_rate": 5e-06, "loss": 0.5782, "step": 420 }, { "epoch": 1.318007662835249, "grad_norm": 0.47334072928355125, "learning_rate": 5e-06, "loss": 0.5719, "step": 430 }, { "epoch": 1.3486590038314177, "grad_norm": 0.4856499631859895, "learning_rate": 5e-06, "loss": 0.5734, "step": 440 }, { "epoch": 1.3793103448275863, "grad_norm": 0.527600892251984, "learning_rate": 5e-06, "loss": 0.5721, "step": 450 }, { "epoch": 1.4099616858237547, "grad_norm": 0.7847888047362421, "learning_rate": 5e-06, "loss": 0.578, "step": 460 }, { "epoch": 1.4406130268199233, "grad_norm": 0.5862516700997007, "learning_rate": 5e-06, "loss": 0.5819, "step": 470 }, { "epoch": 1.471264367816092, "grad_norm": 0.5210251428696359, "learning_rate": 5e-06, "loss": 0.5742, "step": 480 }, { "epoch": 1.5019157088122606, "grad_norm": 0.5051694283145385, "learning_rate": 5e-06, "loss": 0.5757, "step": 490 }, { "epoch": 1.5325670498084292, "grad_norm": 0.4944291165015092, "learning_rate": 5e-06, "loss": 0.5676, "step": 500 }, { "epoch": 1.5632183908045976, "grad_norm": 0.5480611839354791, "learning_rate": 5e-06, "loss": 0.5711, "step": 510 }, { "epoch": 1.5938697318007664, "grad_norm": 0.6057348261647956, "learning_rate": 5e-06, "loss": 0.5761, "step": 520 }, { "epoch": 1.6245210727969348, "grad_norm": 0.4842987104881793, "learning_rate": 5e-06, "loss": 0.571, "step": 530 }, { "epoch": 1.6551724137931034, "grad_norm": 0.47443514901158346, "learning_rate": 5e-06, "loss": 0.5738, "step": 540 }, { "epoch": 1.685823754789272, "grad_norm": 0.47018891679148556, "learning_rate": 5e-06, "loss": 0.571, "step": 550 }, { "epoch": 1.7164750957854407, "grad_norm": 0.49622691394409835, "learning_rate": 5e-06, "loss": 0.5741, "step": 560 }, { "epoch": 1.7471264367816093, "grad_norm": 0.5557199806859595, "learning_rate": 5e-06, "loss": 0.5825, "step": 570 }, { "epoch": 1.7777777777777777, "grad_norm": 0.5102099459496233, "learning_rate": 5e-06, "loss": 0.575, "step": 580 }, { "epoch": 1.8084291187739465, "grad_norm": 0.5072094199992658, "learning_rate": 5e-06, "loss": 0.5719, "step": 590 }, { "epoch": 1.839080459770115, "grad_norm": 0.4947973206788556, "learning_rate": 5e-06, "loss": 0.5756, "step": 600 }, { "epoch": 1.8697318007662835, "grad_norm": 0.5908448445205707, "learning_rate": 5e-06, "loss": 0.5731, "step": 610 }, { "epoch": 1.9003831417624522, "grad_norm": 0.5096415926362652, "learning_rate": 5e-06, "loss": 0.5714, "step": 620 }, { "epoch": 1.9310344827586206, "grad_norm": 0.4450734203993437, "learning_rate": 5e-06, "loss": 0.5746, "step": 630 }, { "epoch": 1.9616858237547894, "grad_norm": 0.4789070006031585, "learning_rate": 5e-06, "loss": 0.5734, "step": 640 }, { "epoch": 1.9923371647509578, "grad_norm": 0.45516367538378033, "learning_rate": 5e-06, "loss": 0.5656, "step": 650 }, { "epoch": 1.9984674329501915, "eval_loss": 0.6165235638618469, "eval_runtime": 174.5393, "eval_samples_per_second": 50.344, "eval_steps_per_second": 0.395, "step": 652 }, { "epoch": 2.0229885057471266, "grad_norm": 0.7229863984265442, "learning_rate": 5e-06, "loss": 0.5381, "step": 660 }, { "epoch": 2.053639846743295, "grad_norm": 0.6598298631907643, "learning_rate": 5e-06, "loss": 0.5185, "step": 670 }, { "epoch": 2.0842911877394634, "grad_norm": 0.68582902786082, "learning_rate": 5e-06, "loss": 0.5182, "step": 680 }, { "epoch": 2.1149425287356323, "grad_norm": 0.5582955965512938, "learning_rate": 5e-06, "loss": 0.5165, "step": 690 }, { "epoch": 2.1455938697318007, "grad_norm": 0.5574612110577156, "learning_rate": 5e-06, "loss": 0.5215, "step": 700 }, { "epoch": 2.1762452107279695, "grad_norm": 0.5869240427384007, "learning_rate": 5e-06, "loss": 0.5142, "step": 710 }, { "epoch": 2.206896551724138, "grad_norm": 0.5917922238270269, "learning_rate": 5e-06, "loss": 0.5169, "step": 720 }, { "epoch": 2.2375478927203067, "grad_norm": 0.5584099299795768, "learning_rate": 5e-06, "loss": 0.5226, "step": 730 }, { "epoch": 2.268199233716475, "grad_norm": 0.5168618242941307, "learning_rate": 5e-06, "loss": 0.526, "step": 740 }, { "epoch": 2.2988505747126435, "grad_norm": 0.548452551643676, "learning_rate": 5e-06, "loss": 0.5274, "step": 750 }, { "epoch": 2.3295019157088124, "grad_norm": 0.5102986091253042, "learning_rate": 5e-06, "loss": 0.5201, "step": 760 }, { "epoch": 2.3601532567049808, "grad_norm": 0.5270431299308651, "learning_rate": 5e-06, "loss": 0.5261, "step": 770 }, { "epoch": 2.3908045977011496, "grad_norm": 0.49928113754253045, "learning_rate": 5e-06, "loss": 0.5188, "step": 780 }, { "epoch": 2.421455938697318, "grad_norm": 0.5526978578532159, "learning_rate": 5e-06, "loss": 0.5169, "step": 790 }, { "epoch": 2.4521072796934864, "grad_norm": 0.5512472751145416, "learning_rate": 5e-06, "loss": 0.528, "step": 800 }, { "epoch": 2.4827586206896552, "grad_norm": 0.5213408423447775, "learning_rate": 5e-06, "loss": 0.5184, "step": 810 }, { "epoch": 2.5134099616858236, "grad_norm": 0.5057448072220034, "learning_rate": 5e-06, "loss": 0.5172, "step": 820 }, { "epoch": 2.5440613026819925, "grad_norm": 0.5108411960779173, "learning_rate": 5e-06, "loss": 0.5206, "step": 830 }, { "epoch": 2.574712643678161, "grad_norm": 0.6573602606621571, "learning_rate": 5e-06, "loss": 0.5244, "step": 840 }, { "epoch": 2.6053639846743293, "grad_norm": 0.5606803819365237, "learning_rate": 5e-06, "loss": 0.5287, "step": 850 }, { "epoch": 2.636015325670498, "grad_norm": 0.5198211565475227, "learning_rate": 5e-06, "loss": 0.5246, "step": 860 }, { "epoch": 2.6666666666666665, "grad_norm": 0.6142828850111772, "learning_rate": 5e-06, "loss": 0.5253, "step": 870 }, { "epoch": 2.6973180076628354, "grad_norm": 0.4911449572882352, "learning_rate": 5e-06, "loss": 0.5266, "step": 880 }, { "epoch": 2.7279693486590038, "grad_norm": 0.5541387277544253, "learning_rate": 5e-06, "loss": 0.5207, "step": 890 }, { "epoch": 2.7586206896551726, "grad_norm": 0.529597237583542, "learning_rate": 5e-06, "loss": 0.5271, "step": 900 }, { "epoch": 2.789272030651341, "grad_norm": 0.527413846058275, "learning_rate": 5e-06, "loss": 0.5292, "step": 910 }, { "epoch": 2.8199233716475094, "grad_norm": 0.5127935237005165, "learning_rate": 5e-06, "loss": 0.5236, "step": 920 }, { "epoch": 2.8505747126436782, "grad_norm": 0.563060155094441, "learning_rate": 5e-06, "loss": 0.5256, "step": 930 }, { "epoch": 2.8812260536398466, "grad_norm": 0.5678860479436445, "learning_rate": 5e-06, "loss": 0.5293, "step": 940 }, { "epoch": 2.9118773946360155, "grad_norm": 0.5163021730870726, "learning_rate": 5e-06, "loss": 0.5299, "step": 950 }, { "epoch": 2.942528735632184, "grad_norm": 0.5461456019011041, "learning_rate": 5e-06, "loss": 0.521, "step": 960 }, { "epoch": 2.9731800766283527, "grad_norm": 0.530000023353534, "learning_rate": 5e-06, "loss": 0.5212, "step": 970 }, { "epoch": 2.9977011494252874, "eval_loss": 0.6206015944480896, "eval_runtime": 175.2408, "eval_samples_per_second": 50.142, "eval_steps_per_second": 0.394, "step": 978 }, { "epoch": 2.9977011494252874, "step": 978, "total_flos": 1637767666728960.0, "train_loss": 0.5840159120735215, "train_runtime": 29297.5545, "train_samples_per_second": 17.094, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 978, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1637767666728960.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }