{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9989389920424403, "eval_steps": 100, "global_step": 942, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010610079575596816, "grad_norm": 0.07623420818069793, "learning_rate": 2.631578947368421e-06, "loss": 1.0589, "step": 5 }, { "epoch": 0.021220159151193633, "grad_norm": 0.07205664213007502, "learning_rate": 5.263157894736842e-06, "loss": 1.0432, "step": 10 }, { "epoch": 0.03183023872679045, "grad_norm": 0.07626950292061684, "learning_rate": 7.894736842105263e-06, "loss": 1.0265, "step": 15 }, { "epoch": 0.042440318302387266, "grad_norm": 0.08705716193396125, "learning_rate": 1.0526315789473684e-05, "loss": 1.0666, "step": 20 }, { "epoch": 0.05305039787798409, "grad_norm": 0.08703221369797066, "learning_rate": 1.3157894736842106e-05, "loss": 1.0659, "step": 25 }, { "epoch": 0.0636604774535809, "grad_norm": 0.07909046201171775, "learning_rate": 1.5789473684210526e-05, "loss": 1.0403, "step": 30 }, { "epoch": 0.07427055702917772, "grad_norm": 0.06926697000806412, "learning_rate": 1.8421052631578947e-05, "loss": 1.0297, "step": 35 }, { "epoch": 0.08488063660477453, "grad_norm": 0.06368642008991743, "learning_rate": 2.105263157894737e-05, "loss": 1.0295, "step": 40 }, { "epoch": 0.09549071618037135, "grad_norm": 0.07386077960990603, "learning_rate": 2.368421052631579e-05, "loss": 1.0332, "step": 45 }, { "epoch": 0.10610079575596817, "grad_norm": 0.056736689937553715, "learning_rate": 2.6315789473684212e-05, "loss": 1.0529, "step": 50 }, { "epoch": 0.11671087533156499, "grad_norm": 0.054506530378128165, "learning_rate": 2.8947368421052634e-05, "loss": 1.0286, "step": 55 }, { "epoch": 0.1273209549071618, "grad_norm": 0.05335225285700553, "learning_rate": 3.157894736842105e-05, "loss": 1.0347, "step": 60 }, { "epoch": 0.13793103448275862, "grad_norm": 0.04664862379258267, "learning_rate": 3.421052631578947e-05, "loss": 1.0234, "step": 65 }, { "epoch": 0.14854111405835543, "grad_norm": 0.04835549496136054, "learning_rate": 3.6842105263157895e-05, "loss": 1.0313, "step": 70 }, { "epoch": 0.15915119363395225, "grad_norm": 0.04800181980380734, "learning_rate": 3.9473684210526316e-05, "loss": 1.0114, "step": 75 }, { "epoch": 0.16976127320954906, "grad_norm": 0.04677268624263905, "learning_rate": 4.210526315789474e-05, "loss": 1.0158, "step": 80 }, { "epoch": 0.18037135278514588, "grad_norm": 0.04581493024930908, "learning_rate": 4.473684210526316e-05, "loss": 1.0187, "step": 85 }, { "epoch": 0.1909814323607427, "grad_norm": 0.05373519219769595, "learning_rate": 4.736842105263158e-05, "loss": 1.0084, "step": 90 }, { "epoch": 0.20159151193633953, "grad_norm": 0.048548759937219306, "learning_rate": 5e-05, "loss": 1.0017, "step": 95 }, { "epoch": 0.21220159151193635, "grad_norm": 0.04992085587516446, "learning_rate": 4.999570096976961e-05, "loss": 1.0282, "step": 100 }, { "epoch": 0.21220159151193635, "eval_loss": 1.0028369426727295, "eval_runtime": 501.3877, "eval_samples_per_second": 26.734, "eval_steps_per_second": 1.671, "step": 100 }, { "epoch": 0.22281167108753316, "grad_norm": 0.0508347009907372, "learning_rate": 4.998280535761132e-05, "loss": 0.9892, "step": 105 }, { "epoch": 0.23342175066312998, "grad_norm": 0.05151620741113757, "learning_rate": 4.996131759861523e-05, "loss": 1.0344, "step": 110 }, { "epoch": 0.2440318302387268, "grad_norm": 0.051059943915805414, "learning_rate": 4.99312450829034e-05, "loss": 0.9892, "step": 115 }, { "epoch": 0.2546419098143236, "grad_norm": 0.05925864823530942, "learning_rate": 4.989259815308815e-05, "loss": 1.0061, "step": 120 }, { "epoch": 0.26525198938992045, "grad_norm": 0.05985023245860431, "learning_rate": 4.984539010071506e-05, "loss": 1.0031, "step": 125 }, { "epoch": 0.27586206896551724, "grad_norm": 0.05755101663405019, "learning_rate": 4.978963716169166e-05, "loss": 0.9758, "step": 130 }, { "epoch": 0.2864721485411141, "grad_norm": 0.06563201267943408, "learning_rate": 4.972535851070358e-05, "loss": 1.0091, "step": 135 }, { "epoch": 0.29708222811671087, "grad_norm": 0.05863974474653586, "learning_rate": 4.965257625461992e-05, "loss": 0.9945, "step": 140 }, { "epoch": 0.3076923076923077, "grad_norm": 0.05932391323295519, "learning_rate": 4.957131542489021e-05, "loss": 0.9934, "step": 145 }, { "epoch": 0.3183023872679045, "grad_norm": 0.06076133148832644, "learning_rate": 4.948160396893553e-05, "loss": 1.0224, "step": 150 }, { "epoch": 0.32891246684350134, "grad_norm": 0.059180711274686784, "learning_rate": 4.9383472740536785e-05, "loss": 0.9976, "step": 155 }, { "epoch": 0.3395225464190981, "grad_norm": 0.06405548051430922, "learning_rate": 4.927695548922335e-05, "loss": 1.0121, "step": 160 }, { "epoch": 0.35013262599469497, "grad_norm": 0.07064952148771018, "learning_rate": 4.916208884866593e-05, "loss": 1.0045, "step": 165 }, { "epoch": 0.36074270557029176, "grad_norm": 0.06166758944403884, "learning_rate": 4.9038912324077315e-05, "loss": 1.0037, "step": 170 }, { "epoch": 0.3713527851458886, "grad_norm": 0.06639359858554056, "learning_rate": 4.8907468278625747e-05, "loss": 1.0125, "step": 175 }, { "epoch": 0.3819628647214854, "grad_norm": 0.06726002562984101, "learning_rate": 4.876780191886523e-05, "loss": 1.0041, "step": 180 }, { "epoch": 0.3925729442970822, "grad_norm": 0.06791796540898594, "learning_rate": 4.861996127918798e-05, "loss": 0.9925, "step": 185 }, { "epoch": 0.40318302387267907, "grad_norm": 0.06940095499199088, "learning_rate": 4.846399720530434e-05, "loss": 0.9977, "step": 190 }, { "epoch": 0.41379310344827586, "grad_norm": 0.06478068174424859, "learning_rate": 4.8299963336755784e-05, "loss": 0.99, "step": 195 }, { "epoch": 0.4244031830238727, "grad_norm": 0.06917834842275485, "learning_rate": 4.81279160884671e-05, "loss": 1.0122, "step": 200 }, { "epoch": 0.4244031830238727, "eval_loss": 0.9916980266571045, "eval_runtime": 501.6523, "eval_samples_per_second": 26.72, "eval_steps_per_second": 1.67, "step": 200 }, { "epoch": 0.4350132625994695, "grad_norm": 0.06793364380058203, "learning_rate": 4.794791463134399e-05, "loss": 0.9895, "step": 205 }, { "epoch": 0.44562334217506633, "grad_norm": 0.07479898864312906, "learning_rate": 4.7760020871922914e-05, "loss": 1.0012, "step": 210 }, { "epoch": 0.4562334217506631, "grad_norm": 0.07448217111087417, "learning_rate": 4.7564299431080016e-05, "loss": 0.9883, "step": 215 }, { "epoch": 0.46684350132625996, "grad_norm": 0.07102920253452748, "learning_rate": 4.736081762180658e-05, "loss": 1.0064, "step": 220 }, { "epoch": 0.47745358090185674, "grad_norm": 0.0740137877754686, "learning_rate": 4.714964542605855e-05, "loss": 1.0064, "step": 225 }, { "epoch": 0.4880636604774536, "grad_norm": 0.06837487045141567, "learning_rate": 4.69308554706882e-05, "loss": 0.9942, "step": 230 }, { "epoch": 0.4986737400530504, "grad_norm": 0.07204206463584076, "learning_rate": 4.67045230024661e-05, "loss": 0.9857, "step": 235 }, { "epoch": 0.5092838196286472, "grad_norm": 0.07396728806729067, "learning_rate": 4.64707258622021e-05, "loss": 0.9769, "step": 240 }, { "epoch": 0.519893899204244, "grad_norm": 0.07411133999786101, "learning_rate": 4.622954445797409e-05, "loss": 0.9993, "step": 245 }, { "epoch": 0.5305039787798409, "grad_norm": 0.07672751862649076, "learning_rate": 4.5981061737473904e-05, "loss": 0.9958, "step": 250 }, { "epoch": 0.5411140583554377, "grad_norm": 0.07549422167935345, "learning_rate": 4.572536315947971e-05, "loss": 0.9801, "step": 255 }, { "epoch": 0.5517241379310345, "grad_norm": 0.07551777425138942, "learning_rate": 4.546253666446484e-05, "loss": 0.9937, "step": 260 }, { "epoch": 0.5623342175066313, "grad_norm": 0.07657102092212426, "learning_rate": 4.519267264435309e-05, "loss": 0.9951, "step": 265 }, { "epoch": 0.5729442970822282, "grad_norm": 0.07861407498734943, "learning_rate": 4.49158639114309e-05, "loss": 1.0039, "step": 270 }, { "epoch": 0.583554376657825, "grad_norm": 0.07564071240328456, "learning_rate": 4.463220566642715e-05, "loss": 0.9739, "step": 275 }, { "epoch": 0.5941644562334217, "grad_norm": 0.08386678281152504, "learning_rate": 4.434179546577146e-05, "loss": 0.9864, "step": 280 }, { "epoch": 0.6047745358090185, "grad_norm": 0.0802924700957904, "learning_rate": 4.4044733188042384e-05, "loss": 0.9894, "step": 285 }, { "epoch": 0.6153846153846154, "grad_norm": 0.08070465999022375, "learning_rate": 4.374112099961689e-05, "loss": 0.9876, "step": 290 }, { "epoch": 0.6259946949602122, "grad_norm": 0.08270967742116453, "learning_rate": 4.34310633195331e-05, "loss": 0.9979, "step": 295 }, { "epoch": 0.636604774535809, "grad_norm": 0.08009831243849455, "learning_rate": 4.3114666783578195e-05, "loss": 0.9884, "step": 300 }, { "epoch": 0.636604774535809, "eval_loss": 0.9869238138198853, "eval_runtime": 501.56, "eval_samples_per_second": 26.725, "eval_steps_per_second": 1.671, "step": 300 }, { "epoch": 0.6472148541114059, "grad_norm": 0.08352239373025634, "learning_rate": 4.2792040207614005e-05, "loss": 1.0002, "step": 305 }, { "epoch": 0.6578249336870027, "grad_norm": 0.07918803174823354, "learning_rate": 4.2463294550152786e-05, "loss": 0.9769, "step": 310 }, { "epoch": 0.6684350132625995, "grad_norm": 0.07978777758565579, "learning_rate": 4.212854287419611e-05, "loss": 0.9958, "step": 315 }, { "epoch": 0.6790450928381963, "grad_norm": 0.07751762504569489, "learning_rate": 4.1787900308349924e-05, "loss": 0.9857, "step": 320 }, { "epoch": 0.6896551724137931, "grad_norm": 0.0799008768730751, "learning_rate": 4.1441484007229314e-05, "loss": 0.9694, "step": 325 }, { "epoch": 0.7002652519893899, "grad_norm": 0.08036989463755223, "learning_rate": 4.108941311116634e-05, "loss": 0.9931, "step": 330 }, { "epoch": 0.7108753315649867, "grad_norm": 0.07942444342254458, "learning_rate": 4.073180870523503e-05, "loss": 0.9584, "step": 335 }, { "epoch": 0.7214854111405835, "grad_norm": 0.08600410405491565, "learning_rate": 4.0368793777607524e-05, "loss": 0.9904, "step": 340 }, { "epoch": 0.7320954907161804, "grad_norm": 0.07890904511326151, "learning_rate": 4.000049317725565e-05, "loss": 0.9867, "step": 345 }, { "epoch": 0.7427055702917772, "grad_norm": 0.08405665680455511, "learning_rate": 3.9627033571012586e-05, "loss": 0.9905, "step": 350 }, { "epoch": 0.753315649867374, "grad_norm": 0.07844813100517245, "learning_rate": 3.924854340000931e-05, "loss": 0.9901, "step": 355 }, { "epoch": 0.7639257294429708, "grad_norm": 0.08284034103558724, "learning_rate": 3.886515283550079e-05, "loss": 0.9806, "step": 360 }, { "epoch": 0.7745358090185677, "grad_norm": 0.08189138625693122, "learning_rate": 3.8476993734097155e-05, "loss": 0.9849, "step": 365 }, { "epoch": 0.7851458885941645, "grad_norm": 0.08045617857202153, "learning_rate": 3.8084199592415305e-05, "loss": 0.9618, "step": 370 }, { "epoch": 0.7957559681697612, "grad_norm": 0.08175433626726658, "learning_rate": 3.768690550116639e-05, "loss": 0.9827, "step": 375 }, { "epoch": 0.8063660477453581, "grad_norm": 0.08203297851648732, "learning_rate": 3.728524809869511e-05, "loss": 0.9896, "step": 380 }, { "epoch": 0.8169761273209549, "grad_norm": 0.08613988767270629, "learning_rate": 3.6879365523986706e-05, "loss": 1.0072, "step": 385 }, { "epoch": 0.8275862068965517, "grad_norm": 0.08663299741096588, "learning_rate": 3.646939736915786e-05, "loss": 0.9836, "step": 390 }, { "epoch": 0.8381962864721485, "grad_norm": 0.08220940782461005, "learning_rate": 3.605548463144786e-05, "loss": 0.9666, "step": 395 }, { "epoch": 0.8488063660477454, "grad_norm": 0.08065282059812733, "learning_rate": 3.563776966472649e-05, "loss": 0.9771, "step": 400 }, { "epoch": 0.8488063660477454, "eval_loss": 0.9840684533119202, "eval_runtime": 501.5164, "eval_samples_per_second": 26.727, "eval_steps_per_second": 1.671, "step": 400 }, { "epoch": 0.8594164456233422, "grad_norm": 0.08650228379554649, "learning_rate": 3.52163961305353e-05, "loss": 0.9998, "step": 405 }, { "epoch": 0.870026525198939, "grad_norm": 0.08387979383384966, "learning_rate": 3.479150894867926e-05, "loss": 0.9685, "step": 410 }, { "epoch": 0.8806366047745358, "grad_norm": 0.08721899001858337, "learning_rate": 3.436325424738549e-05, "loss": 0.9895, "step": 415 }, { "epoch": 0.8912466843501327, "grad_norm": 0.08653678608089069, "learning_rate": 3.3931779313046574e-05, "loss": 0.9861, "step": 420 }, { "epoch": 0.9018567639257294, "grad_norm": 0.08927106779743564, "learning_rate": 3.349723253956542e-05, "loss": 0.9879, "step": 425 }, { "epoch": 0.9124668435013262, "grad_norm": 0.09299794688285011, "learning_rate": 3.3059763377319294e-05, "loss": 0.9686, "step": 430 }, { "epoch": 0.9230769230769231, "grad_norm": 0.08250762921336344, "learning_rate": 3.261952228176044e-05, "loss": 0.9697, "step": 435 }, { "epoch": 0.9336870026525199, "grad_norm": 0.08464622856430148, "learning_rate": 3.217666066167117e-05, "loss": 0.991, "step": 440 }, { "epoch": 0.9442970822281167, "grad_norm": 0.08831657667074837, "learning_rate": 3.1731330827090865e-05, "loss": 0.9796, "step": 445 }, { "epoch": 0.9549071618037135, "grad_norm": 0.0875900719250125, "learning_rate": 3.128368593693325e-05, "loss": 0.9914, "step": 450 }, { "epoch": 0.9655172413793104, "grad_norm": 0.09632644759370569, "learning_rate": 3.083387994631154e-05, "loss": 0.9934, "step": 455 }, { "epoch": 0.9761273209549072, "grad_norm": 0.0890756143568433, "learning_rate": 3.0382067553589867e-05, "loss": 0.9927, "step": 460 }, { "epoch": 0.986737400530504, "grad_norm": 0.08262187491301902, "learning_rate": 2.992840414717899e-05, "loss": 0.9738, "step": 465 }, { "epoch": 0.9973474801061007, "grad_norm": 0.08825771969765614, "learning_rate": 2.9473045752094818e-05, "loss": 0.9789, "step": 470 }, { "epoch": 1.0079575596816976, "grad_norm": 0.08895467811064861, "learning_rate": 2.9016148976297832e-05, "loss": 0.9656, "step": 475 }, { "epoch": 1.0185676392572944, "grad_norm": 0.08772274514325573, "learning_rate": 2.8557870956832132e-05, "loss": 0.9702, "step": 480 }, { "epoch": 1.0291777188328912, "grad_norm": 0.08961456548197023, "learning_rate": 2.809836930578249e-05, "loss": 0.9916, "step": 485 }, { "epoch": 1.039787798408488, "grad_norm": 0.09423600327500739, "learning_rate": 2.7637802056068018e-05, "loss": 0.9879, "step": 490 }, { "epoch": 1.0503978779840848, "grad_norm": 0.08849197341901133, "learning_rate": 2.7176327607091075e-05, "loss": 0.9665, "step": 495 }, { "epoch": 1.0610079575596818, "grad_norm": 0.09073880855625993, "learning_rate": 2.671410467026021e-05, "loss": 0.9974, "step": 500 }, { "epoch": 1.0610079575596818, "eval_loss": 0.9823258519172668, "eval_runtime": 501.7907, "eval_samples_per_second": 26.712, "eval_steps_per_second": 1.67, "step": 500 }, { "epoch": 1.0716180371352786, "grad_norm": 0.09183816285643769, "learning_rate": 2.625129221440569e-05, "loss": 0.981, "step": 505 }, { "epoch": 1.0822281167108754, "grad_norm": 0.0946815619794241, "learning_rate": 2.578804941110664e-05, "loss": 1.0164, "step": 510 }, { "epoch": 1.0928381962864722, "grad_norm": 0.10079876133979877, "learning_rate": 2.5324535579948274e-05, "loss": 0.9685, "step": 515 }, { "epoch": 1.103448275862069, "grad_norm": 0.09357417526833312, "learning_rate": 2.4860910133728388e-05, "loss": 0.9723, "step": 520 }, { "epoch": 1.1140583554376657, "grad_norm": 0.09538254274812408, "learning_rate": 2.4397332523631684e-05, "loss": 0.9954, "step": 525 }, { "epoch": 1.1246684350132625, "grad_norm": 0.09322492263056234, "learning_rate": 2.393396218439097e-05, "loss": 0.9805, "step": 530 }, { "epoch": 1.1352785145888595, "grad_norm": 0.09381667388866403, "learning_rate": 2.3470958479453938e-05, "loss": 0.9785, "step": 535 }, { "epoch": 1.1458885941644563, "grad_norm": 0.0984535713170549, "learning_rate": 2.3008480646174534e-05, "loss": 0.9934, "step": 540 }, { "epoch": 1.156498673740053, "grad_norm": 0.0976123186479438, "learning_rate": 2.2546687741047645e-05, "loss": 0.9989, "step": 545 }, { "epoch": 1.16710875331565, "grad_norm": 0.09777674870174517, "learning_rate": 2.2085738585006024e-05, "loss": 0.9759, "step": 550 }, { "epoch": 1.1777188328912467, "grad_norm": 0.10035436324164396, "learning_rate": 2.1625791708798188e-05, "loss": 0.9783, "step": 555 }, { "epoch": 1.1883289124668435, "grad_norm": 0.09847514609619938, "learning_rate": 2.1167005298466156e-05, "loss": 0.9713, "step": 560 }, { "epoch": 1.1989389920424403, "grad_norm": 0.09672521232190742, "learning_rate": 2.0709537140941705e-05, "loss": 0.9654, "step": 565 }, { "epoch": 1.209549071618037, "grad_norm": 0.09847144343966878, "learning_rate": 2.0253544569779933e-05, "loss": 0.9661, "step": 570 }, { "epoch": 1.2201591511936338, "grad_norm": 0.0989651039802197, "learning_rate": 1.9799184411048695e-05, "loss": 0.9759, "step": 575 }, { "epoch": 1.2307692307692308, "grad_norm": 0.09888803396418354, "learning_rate": 1.9346612929392636e-05, "loss": 0.9804, "step": 580 }, { "epoch": 1.2413793103448276, "grad_norm": 0.09734456390328389, "learning_rate": 1.889598577429022e-05, "loss": 0.9807, "step": 585 }, { "epoch": 1.2519893899204244, "grad_norm": 0.10124183430533916, "learning_rate": 1.8447457926522454e-05, "loss": 0.9945, "step": 590 }, { "epoch": 1.2625994694960212, "grad_norm": 0.0991604825386959, "learning_rate": 1.800118364487146e-05, "loss": 0.9769, "step": 595 }, { "epoch": 1.273209549071618, "grad_norm": 0.09660029282065204, "learning_rate": 1.7557316413067488e-05, "loss": 0.9934, "step": 600 }, { "epoch": 1.273209549071618, "eval_loss": 0.9812601804733276, "eval_runtime": 501.6223, "eval_samples_per_second": 26.721, "eval_steps_per_second": 1.671, "step": 600 }, { "epoch": 1.2838196286472148, "grad_norm": 0.09872097381002237, "learning_rate": 1.7116008887002344e-05, "loss": 0.9622, "step": 605 }, { "epoch": 1.2944297082228116, "grad_norm": 0.09522222343603413, "learning_rate": 1.667741284222768e-05, "loss": 0.9831, "step": 610 }, { "epoch": 1.3050397877984086, "grad_norm": 0.1014944316104097, "learning_rate": 1.6241679121755914e-05, "loss": 0.9827, "step": 615 }, { "epoch": 1.3156498673740054, "grad_norm": 0.10149346811583228, "learning_rate": 1.5808957584181998e-05, "loss": 0.9754, "step": 620 }, { "epoch": 1.3262599469496021, "grad_norm": 0.0958760217857124, "learning_rate": 1.537939705214364e-05, "loss": 0.9897, "step": 625 }, { "epoch": 1.336870026525199, "grad_norm": 0.10206592940194624, "learning_rate": 1.4953145261137868e-05, "loss": 0.9797, "step": 630 }, { "epoch": 1.3474801061007957, "grad_norm": 0.10224543719732158, "learning_rate": 1.4530348808711508e-05, "loss": 0.9844, "step": 635 }, { "epoch": 1.3580901856763925, "grad_norm": 0.0998882895360292, "learning_rate": 1.4111153104042993e-05, "loss": 0.9955, "step": 640 }, { "epoch": 1.3687002652519893, "grad_norm": 0.09440011868080461, "learning_rate": 1.3695702317932862e-05, "loss": 0.998, "step": 645 }, { "epoch": 1.3793103448275863, "grad_norm": 0.10070400345672928, "learning_rate": 1.3284139333220207e-05, "loss": 0.9832, "step": 650 }, { "epoch": 1.389920424403183, "grad_norm": 0.10284785595425723, "learning_rate": 1.2876605695642086e-05, "loss": 0.9818, "step": 655 }, { "epoch": 1.4005305039787799, "grad_norm": 0.09907296614307597, "learning_rate": 1.247324156515271e-05, "loss": 0.976, "step": 660 }, { "epoch": 1.4111405835543767, "grad_norm": 0.09857949146993086, "learning_rate": 1.2074185667719353e-05, "loss": 0.9489, "step": 665 }, { "epoch": 1.4217506631299734, "grad_norm": 0.1027503153542044, "learning_rate": 1.1679575247611341e-05, "loss": 0.9765, "step": 670 }, { "epoch": 1.4323607427055702, "grad_norm": 0.09647369113256919, "learning_rate": 1.1289546020198719e-05, "loss": 0.9934, "step": 675 }, { "epoch": 1.442970822281167, "grad_norm": 0.09839399148271344, "learning_rate": 1.0904232125276609e-05, "loss": 0.9728, "step": 680 }, { "epoch": 1.453580901856764, "grad_norm": 0.10156504921281477, "learning_rate": 1.052376608093162e-05, "loss": 0.9811, "step": 685 }, { "epoch": 1.4641909814323608, "grad_norm": 0.10257219505892362, "learning_rate": 1.0148278737965845e-05, "loss": 0.9653, "step": 690 }, { "epoch": 1.4748010610079576, "grad_norm": 0.09960716843498467, "learning_rate": 9.777899234894387e-06, "loss": 0.9735, "step": 695 }, { "epoch": 1.4854111405835544, "grad_norm": 0.09941623700354961, "learning_rate": 9.412754953531663e-06, "loss": 0.9738, "step": 700 }, { "epoch": 1.4854111405835544, "eval_loss": 0.9805117249488831, "eval_runtime": 501.4908, "eval_samples_per_second": 26.728, "eval_steps_per_second": 1.671, "step": 700 }, { "epoch": 1.4960212201591512, "grad_norm": 0.09699569939298165, "learning_rate": 9.052971475182004e-06, "loss": 0.976, "step": 705 }, { "epoch": 1.506631299734748, "grad_norm": 0.1022949810528641, "learning_rate": 8.698672537449385e-06, "loss": 0.9678, "step": 710 }, { "epoch": 1.5172413793103448, "grad_norm": 0.09925245872600137, "learning_rate": 8.349979991681333e-06, "loss": 0.9707, "step": 715 }, { "epoch": 1.5278514588859418, "grad_norm": 0.0978077000211089, "learning_rate": 8.00701376106148e-06, "loss": 1.0067, "step": 720 }, { "epoch": 1.5384615384615383, "grad_norm": 0.09702110693617705, "learning_rate": 7.669891799365283e-06, "loss": 0.9753, "step": 725 }, { "epoch": 1.5490716180371353, "grad_norm": 0.10106616057187415, "learning_rate": 7.338730050393114e-06, "loss": 0.9703, "step": 730 }, { "epoch": 1.5596816976127321, "grad_norm": 0.10139301720670488, "learning_rate": 7.01364240809459e-06, "loss": 0.9804, "step": 735 }, { "epoch": 1.570291777188329, "grad_norm": 0.10543185047759056, "learning_rate": 6.694740677397845e-06, "loss": 0.9809, "step": 740 }, { "epoch": 1.5809018567639257, "grad_norm": 0.09810200524296686, "learning_rate": 6.382134535757339e-06, "loss": 0.9675, "step": 745 }, { "epoch": 1.5915119363395225, "grad_norm": 0.10063678881940345, "learning_rate": 6.075931495433315e-06, "loss": 0.9792, "step": 750 }, { "epoch": 1.6021220159151195, "grad_norm": 0.10751838250373763, "learning_rate": 5.776236866515947e-06, "loss": 0.9759, "step": 755 }, { "epoch": 1.612732095490716, "grad_norm": 0.10251266957258413, "learning_rate": 5.483153720706799e-06, "loss": 0.9756, "step": 760 }, { "epoch": 1.623342175066313, "grad_norm": 0.10576679802248473, "learning_rate": 5.19678285587018e-06, "loss": 0.9719, "step": 765 }, { "epoch": 1.6339522546419099, "grad_norm": 0.0979386252610659, "learning_rate": 4.917222761366477e-06, "loss": 1.0034, "step": 770 }, { "epoch": 1.6445623342175066, "grad_norm": 0.09861558820444138, "learning_rate": 4.644569584179509e-06, "loss": 0.9789, "step": 775 }, { "epoch": 1.6551724137931034, "grad_norm": 0.10337492990116616, "learning_rate": 4.3789170958493585e-06, "loss": 0.9751, "step": 780 }, { "epoch": 1.6657824933687002, "grad_norm": 0.0973808745211656, "learning_rate": 4.1203566602222745e-06, "loss": 0.9915, "step": 785 }, { "epoch": 1.6763925729442972, "grad_norm": 0.09761203241646615, "learning_rate": 3.868977202028581e-06, "loss": 0.973, "step": 790 }, { "epoch": 1.6870026525198938, "grad_norm": 0.10171457056462775, "learning_rate": 3.6248651762994995e-06, "loss": 0.9931, "step": 795 }, { "epoch": 1.6976127320954908, "grad_norm": 0.10461287792457526, "learning_rate": 3.38810453863328e-06, "loss": 0.9744, "step": 800 }, { "epoch": 1.6976127320954908, "eval_loss": 0.9801440834999084, "eval_runtime": 501.708, "eval_samples_per_second": 26.717, "eval_steps_per_second": 1.67, "step": 800 }, { "epoch": 1.7082228116710876, "grad_norm": 0.10531636550074536, "learning_rate": 3.1587767163210157e-06, "loss": 0.9817, "step": 805 }, { "epoch": 1.7188328912466844, "grad_norm": 0.10694585581807346, "learning_rate": 2.9369605803419715e-06, "loss": 0.9809, "step": 810 }, { "epoch": 1.7294429708222812, "grad_norm": 0.10505045842111997, "learning_rate": 2.7227324182380775e-06, "loss": 0.9654, "step": 815 }, { "epoch": 1.740053050397878, "grad_norm": 0.10697090247086784, "learning_rate": 2.5161659078769466e-06, "loss": 0.9634, "step": 820 }, { "epoch": 1.750663129973475, "grad_norm": 0.10337571820075339, "learning_rate": 2.317332092112384e-06, "loss": 0.9866, "step": 825 }, { "epoch": 1.7612732095490715, "grad_norm": 0.09992375037287819, "learning_rate": 2.1262993543511717e-06, "loss": 0.9833, "step": 830 }, { "epoch": 1.7718832891246685, "grad_norm": 0.1002631099347442, "learning_rate": 1.9431333950344855e-06, "loss": 0.9785, "step": 835 }, { "epoch": 1.782493368700265, "grad_norm": 0.10479627463164663, "learning_rate": 1.767897209042027e-06, "loss": 0.9752, "step": 840 }, { "epoch": 1.793103448275862, "grad_norm": 0.10197107953524158, "learning_rate": 1.6006510640266787e-06, "loss": 0.9846, "step": 845 }, { "epoch": 1.8037135278514589, "grad_norm": 0.10167121112531119, "learning_rate": 1.4414524796871027e-06, "loss": 0.9624, "step": 850 }, { "epoch": 1.8143236074270557, "grad_norm": 0.09922610672387365, "learning_rate": 1.2903562079854492e-06, "loss": 0.9817, "step": 855 }, { "epoch": 1.8249336870026527, "grad_norm": 0.1048631234042592, "learning_rate": 1.1474142143168832e-06, "loss": 1.0072, "step": 860 }, { "epoch": 1.8355437665782492, "grad_norm": 0.09750361912686112, "learning_rate": 1.0126756596375686e-06, "loss": 0.9737, "step": 865 }, { "epoch": 1.8461538461538463, "grad_norm": 0.10178011267304485, "learning_rate": 8.86186883557083e-07, "loss": 0.9675, "step": 870 }, { "epoch": 1.8567639257294428, "grad_norm": 0.10344721675019256, "learning_rate": 7.679913884012069e-07, "loss": 0.9718, "step": 875 }, { "epoch": 1.8673740053050398, "grad_norm": 0.09806593314586452, "learning_rate": 6.58129824250478e-07, "loss": 0.9787, "step": 880 }, { "epoch": 1.8779840848806366, "grad_norm": 0.10272666481587975, "learning_rate": 5.566399749597328e-07, "loss": 0.9511, "step": 885 }, { "epoch": 1.8885941644562334, "grad_norm": 0.10081265185788726, "learning_rate": 4.635567451633821e-07, "loss": 0.9849, "step": 890 }, { "epoch": 1.8992042440318302, "grad_norm": 0.10227171696148227, "learning_rate": 3.789121482709407e-07, "loss": 0.9765, "step": 895 }, { "epoch": 1.909814323607427, "grad_norm": 0.10128987382975041, "learning_rate": 3.027352954568713e-07, "loss": 0.9887, "step": 900 }, { "epoch": 1.909814323607427, "eval_loss": 0.9800187945365906, "eval_runtime": 501.7687, "eval_samples_per_second": 26.714, "eval_steps_per_second": 1.67, "step": 900 }, { "epoch": 1.920424403183024, "grad_norm": 0.10888174926639403, "learning_rate": 2.350523856486292e-07, "loss": 0.9813, "step": 905 }, { "epoch": 1.9310344827586206, "grad_norm": 0.09593621401437472, "learning_rate": 1.7588669651623368e-07, "loss": 0.9907, "step": 910 }, { "epoch": 1.9416445623342176, "grad_norm": 0.09834431957222971, "learning_rate": 1.2525857646658312e-07, "loss": 0.9686, "step": 915 }, { "epoch": 1.9522546419098143, "grad_norm": 0.09987466210480013, "learning_rate": 8.318543764516961e-08, "loss": 0.9882, "step": 920 }, { "epoch": 1.9628647214854111, "grad_norm": 0.10365551816637686, "learning_rate": 4.968174994764152e-08, "loss": 0.9643, "step": 925 }, { "epoch": 1.973474801061008, "grad_norm": 0.09717537741476147, "learning_rate": 2.4759036043300875e-08, "loss": 0.9801, "step": 930 }, { "epoch": 1.9840848806366047, "grad_norm": 0.09721331281181161, "learning_rate": 8.42586741219009e-09, "loss": 0.9604, "step": 935 }, { "epoch": 1.9946949602122017, "grad_norm": 0.10113910306283141, "learning_rate": 6.878613971583736e-10, "loss": 0.9835, "step": 940 }, { "epoch": 1.9989389920424403, "step": 942, "total_flos": 1.4259794694102843e+19, "train_loss": 0.9894429036513003, "train_runtime": 36869.1128, "train_samples_per_second": 6.544, "train_steps_per_second": 0.026 } ], "logging_steps": 5, "max_steps": 942, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4259794694102843e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }