|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9806451612903224, |
|
"eval_steps": 500, |
|
"global_step": 231, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.012903225806451613, |
|
"grad_norm": 0.882150089808769, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 1.3191, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.025806451612903226, |
|
"grad_norm": 0.8369153094823952, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 1.249, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.03870967741935484, |
|
"grad_norm": 0.8525103918091212, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.2775, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.05161290322580645, |
|
"grad_norm": 0.8113130093304075, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 1.2577, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.06451612903225806, |
|
"grad_norm": 0.7691226782403744, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 1.2275, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.07741935483870968, |
|
"grad_norm": 0.5954210054804412, |
|
"learning_rate": 5e-05, |
|
"loss": 1.1159, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.09032258064516129, |
|
"grad_norm": 0.48189256930049384, |
|
"learning_rate": 5.833333333333334e-05, |
|
"loss": 1.0593, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.1032258064516129, |
|
"grad_norm": 0.5241879927945232, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 1.0031, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.11612903225806452, |
|
"grad_norm": 0.5751865259411146, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.9263, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.12903225806451613, |
|
"grad_norm": 0.5686526755807603, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 0.8146, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.14193548387096774, |
|
"grad_norm": 0.5156906474251192, |
|
"learning_rate": 9.166666666666667e-05, |
|
"loss": 0.7583, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.15483870967741936, |
|
"grad_norm": 0.4901634328534619, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6686, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.16774193548387098, |
|
"grad_norm": 0.376084270046461, |
|
"learning_rate": 0.00010833333333333333, |
|
"loss": 0.6005, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.18064516129032257, |
|
"grad_norm": 0.2761318809240614, |
|
"learning_rate": 0.00011666666666666668, |
|
"loss": 0.5741, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.1935483870967742, |
|
"grad_norm": 0.25038763704461725, |
|
"learning_rate": 0.000125, |
|
"loss": 0.5465, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.2064516129032258, |
|
"grad_norm": 0.2214903977106201, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 0.5138, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.21935483870967742, |
|
"grad_norm": 0.28905541505099525, |
|
"learning_rate": 0.00014166666666666668, |
|
"loss": 0.5247, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.23225806451612904, |
|
"grad_norm": 0.20699066633757193, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.4978, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.24516129032258063, |
|
"grad_norm": 0.219457528851344, |
|
"learning_rate": 0.00015833333333333332, |
|
"loss": 0.4924, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.25806451612903225, |
|
"grad_norm": 0.16596853789220767, |
|
"learning_rate": 0.0001666666666666667, |
|
"loss": 0.4759, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.2709677419354839, |
|
"grad_norm": 0.13228412371333673, |
|
"learning_rate": 0.000175, |
|
"loss": 0.4613, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.2838709677419355, |
|
"grad_norm": 0.1421107856190867, |
|
"learning_rate": 0.00018333333333333334, |
|
"loss": 0.4852, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.2967741935483871, |
|
"grad_norm": 0.12552928984887968, |
|
"learning_rate": 0.00019166666666666667, |
|
"loss": 0.4786, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.3096774193548387, |
|
"grad_norm": 0.11489463060846784, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4532, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.3225806451612903, |
|
"grad_norm": 0.11476879539402507, |
|
"learning_rate": 0.00019998848349441062, |
|
"loss": 0.4454, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.33548387096774196, |
|
"grad_norm": 0.1256602270101812, |
|
"learning_rate": 0.00019995393663024054, |
|
"loss": 0.4513, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.34838709677419355, |
|
"grad_norm": 0.11833482485698336, |
|
"learning_rate": 0.00019989636736467278, |
|
"loss": 0.44, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.36129032258064514, |
|
"grad_norm": 0.11124019681377781, |
|
"learning_rate": 0.00019981578895764273, |
|
"loss": 0.4439, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.3741935483870968, |
|
"grad_norm": 0.10954971384477814, |
|
"learning_rate": 0.00019971221996878394, |
|
"loss": 0.4274, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.3870967741935484, |
|
"grad_norm": 0.11422715129880294, |
|
"learning_rate": 0.00019958568425315314, |
|
"loss": 0.4254, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.11262310014016527, |
|
"learning_rate": 0.00019943621095573586, |
|
"loss": 0.4204, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.4129032258064516, |
|
"grad_norm": 0.11143099554463408, |
|
"learning_rate": 0.00019926383450473344, |
|
"loss": 0.4105, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.4258064516129032, |
|
"grad_norm": 0.1088260973247734, |
|
"learning_rate": 0.00019906859460363307, |
|
"loss": 0.4136, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.43870967741935485, |
|
"grad_norm": 0.10400753996611788, |
|
"learning_rate": 0.00019885053622206304, |
|
"loss": 0.4213, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.45161290322580644, |
|
"grad_norm": 0.09587900896302251, |
|
"learning_rate": 0.0001986097095854347, |
|
"loss": 0.4085, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.4645161290322581, |
|
"grad_norm": 0.10119603747308556, |
|
"learning_rate": 0.0001983461701633742, |
|
"loss": 0.4181, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.4774193548387097, |
|
"grad_norm": 0.10062413136253176, |
|
"learning_rate": 0.00019805997865694614, |
|
"loss": 0.4098, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.49032258064516127, |
|
"grad_norm": 0.09162394941720846, |
|
"learning_rate": 0.0001977512009846721, |
|
"loss": 0.4085, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.5032258064516129, |
|
"grad_norm": 0.09269316443279575, |
|
"learning_rate": 0.00019741990826734794, |
|
"loss": 0.3994, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.5161290322580645, |
|
"grad_norm": 0.08782581803238095, |
|
"learning_rate": 0.00019706617681166218, |
|
"loss": 0.3983, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.5290322580645161, |
|
"grad_norm": 0.08665646987756218, |
|
"learning_rate": 0.00019669008809262062, |
|
"loss": 0.3938, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.5419354838709678, |
|
"grad_norm": 0.09289388957990503, |
|
"learning_rate": 0.00019629172873477995, |
|
"loss": 0.396, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.5548387096774193, |
|
"grad_norm": 0.09203344649472522, |
|
"learning_rate": 0.00019587119049229557, |
|
"loss": 0.4052, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.567741935483871, |
|
"grad_norm": 0.08209774194723368, |
|
"learning_rate": 0.0001954285702277879, |
|
"loss": 0.3959, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.5806451612903226, |
|
"grad_norm": 0.08595872863630391, |
|
"learning_rate": 0.00019496396989003193, |
|
"loss": 0.397, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.5935483870967742, |
|
"grad_norm": 0.09041908237644536, |
|
"learning_rate": 0.00019447749649047542, |
|
"loss": 0.3992, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.6064516129032258, |
|
"grad_norm": 0.08321976348844515, |
|
"learning_rate": 0.00019396926207859084, |
|
"loss": 0.4095, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.6193548387096774, |
|
"grad_norm": 0.07887604040253807, |
|
"learning_rate": 0.00019343938371606712, |
|
"loss": 0.3866, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.632258064516129, |
|
"grad_norm": 0.08329265943906447, |
|
"learning_rate": 0.00019288798344984672, |
|
"loss": 0.3985, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.6451612903225806, |
|
"grad_norm": 0.08661703211305888, |
|
"learning_rate": 0.00019231518828401458, |
|
"loss": 0.3925, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.6580645161290323, |
|
"grad_norm": 0.08382217550700771, |
|
"learning_rate": 0.00019172113015054532, |
|
"loss": 0.3862, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.6709677419354839, |
|
"grad_norm": 0.08245124856491458, |
|
"learning_rate": 0.00019110594587891519, |
|
"loss": 0.3847, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.6838709677419355, |
|
"grad_norm": 0.08319716279149986, |
|
"learning_rate": 0.00019046977716458626, |
|
"loss": 0.3775, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.6967741935483871, |
|
"grad_norm": 0.08074648144423298, |
|
"learning_rate": 0.0001898127705363696, |
|
"loss": 0.3786, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.7096774193548387, |
|
"grad_norm": 0.08472762376284584, |
|
"learning_rate": 0.0001891350773226754, |
|
"loss": 0.3923, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.7225806451612903, |
|
"grad_norm": 0.08398076059437376, |
|
"learning_rate": 0.00018843685361665723, |
|
"loss": 0.3709, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.7354838709677419, |
|
"grad_norm": 0.08465216102770419, |
|
"learning_rate": 0.00018771826024025946, |
|
"loss": 0.3818, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.7483870967741936, |
|
"grad_norm": 0.09145572810056589, |
|
"learning_rate": 0.00018697946270717467, |
|
"loss": 0.39, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.7612903225806451, |
|
"grad_norm": 0.08415188367023674, |
|
"learning_rate": 0.00018622063118472134, |
|
"loss": 0.3733, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.7741935483870968, |
|
"grad_norm": 0.08576290382509591, |
|
"learning_rate": 0.00018544194045464886, |
|
"loss": 0.3878, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.7870967741935484, |
|
"grad_norm": 0.0844142047859298, |
|
"learning_rate": 0.00018464356987288013, |
|
"loss": 0.3637, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.08918487261557899, |
|
"learning_rate": 0.00018382570332820043, |
|
"loss": 0.3775, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.8129032258064516, |
|
"grad_norm": 0.0795181880669878, |
|
"learning_rate": 0.00018298852919990252, |
|
"loss": 0.3853, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.8258064516129032, |
|
"grad_norm": 0.08173055996583302, |
|
"learning_rate": 0.0001821322403143969, |
|
"loss": 0.38, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.8387096774193549, |
|
"grad_norm": 0.08525070031165603, |
|
"learning_rate": 0.0001812570339007983, |
|
"loss": 0.3778, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.8516129032258064, |
|
"grad_norm": 0.08531235204546653, |
|
"learning_rate": 0.00018036311154549784, |
|
"loss": 0.3727, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.864516129032258, |
|
"grad_norm": 0.08169851479895494, |
|
"learning_rate": 0.00017945067914573146, |
|
"loss": 0.365, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.8774193548387097, |
|
"grad_norm": 0.08463789046916101, |
|
"learning_rate": 0.0001785199468621559, |
|
"loss": 0.3752, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.8903225806451613, |
|
"grad_norm": 0.09441843624235378, |
|
"learning_rate": 0.000177571129070442, |
|
"loss": 0.3665, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.9032258064516129, |
|
"grad_norm": 0.08530939476149231, |
|
"learning_rate": 0.0001766044443118978, |
|
"loss": 0.3926, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.9161290322580645, |
|
"grad_norm": 0.0836606457284625, |
|
"learning_rate": 0.00017562011524313185, |
|
"loss": 0.3844, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.9290322580645162, |
|
"grad_norm": 0.09868625782773943, |
|
"learning_rate": 0.00017461836858476856, |
|
"loss": 0.3835, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.9419354838709677, |
|
"grad_norm": 0.082132336261239, |
|
"learning_rate": 0.00017359943506922774, |
|
"loss": 0.3792, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.9548387096774194, |
|
"grad_norm": 0.08948965393301354, |
|
"learning_rate": 0.0001725635493875799, |
|
"loss": 0.3813, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.967741935483871, |
|
"grad_norm": 0.08539410389371488, |
|
"learning_rate": 0.00017151095013548994, |
|
"loss": 0.3774, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.9806451612903225, |
|
"grad_norm": 0.08690404790165682, |
|
"learning_rate": 0.00017044187975826124, |
|
"loss": 0.3762, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.9935483870967742, |
|
"grad_norm": 0.09039522496805455, |
|
"learning_rate": 0.0001693565844949933, |
|
"loss": 0.3733, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.9935483870967742, |
|
"eval_loss": 0.3743511736392975, |
|
"eval_runtime": 42.1339, |
|
"eval_samples_per_second": 24.66, |
|
"eval_steps_per_second": 0.783, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 1.0064516129032257, |
|
"grad_norm": 0.09165665911792642, |
|
"learning_rate": 0.00016825531432186543, |
|
"loss": 0.3532, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.0193548387096774, |
|
"grad_norm": 0.0801922544260219, |
|
"learning_rate": 0.0001671383228945597, |
|
"loss": 0.347, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 1.032258064516129, |
|
"grad_norm": 0.08352186065175837, |
|
"learning_rate": 0.00016600586748983641, |
|
"loss": 0.3566, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.0451612903225806, |
|
"grad_norm": 0.08793176795367076, |
|
"learning_rate": 0.0001648582089462756, |
|
"loss": 0.3473, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 1.0580645161290323, |
|
"grad_norm": 0.08913951531063671, |
|
"learning_rate": 0.00016369561160419784, |
|
"loss": 0.342, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 1.070967741935484, |
|
"grad_norm": 0.08309712335786672, |
|
"learning_rate": 0.0001625183432447789, |
|
"loss": 0.345, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 1.0838709677419356, |
|
"grad_norm": 0.08725330804483407, |
|
"learning_rate": 0.00016132667502837165, |
|
"loss": 0.3523, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.096774193548387, |
|
"grad_norm": 0.08680862762413778, |
|
"learning_rate": 0.00016012088143204953, |
|
"loss": 0.3554, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.1096774193548387, |
|
"grad_norm": 0.0863782848559528, |
|
"learning_rate": 0.00015890124018638638, |
|
"loss": 0.364, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.1225806451612903, |
|
"grad_norm": 0.08388848992116194, |
|
"learning_rate": 0.00015766803221148673, |
|
"loss": 0.3568, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.135483870967742, |
|
"grad_norm": 0.08226994751114965, |
|
"learning_rate": 0.00015642154155228122, |
|
"loss": 0.3489, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.1483870967741936, |
|
"grad_norm": 0.08575965994905438, |
|
"learning_rate": 0.00015516205531310273, |
|
"loss": 0.3466, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 1.1612903225806452, |
|
"grad_norm": 0.0895747440427046, |
|
"learning_rate": 0.00015388986359155758, |
|
"loss": 0.3488, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.1741935483870969, |
|
"grad_norm": 0.08403222320010312, |
|
"learning_rate": 0.00015260525941170712, |
|
"loss": 0.356, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 1.1870967741935483, |
|
"grad_norm": 0.08627434364043794, |
|
"learning_rate": 0.0001513085386565758, |
|
"loss": 0.3519, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.08925414655300028, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.3523, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.2129032258064516, |
|
"grad_norm": 0.09120079741968923, |
|
"learning_rate": 0.00014867994483783485, |
|
"loss": 0.3555, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.2258064516129032, |
|
"grad_norm": 0.08519037826685563, |
|
"learning_rate": 0.0001473486772185334, |
|
"loss": 0.3551, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.238709677419355, |
|
"grad_norm": 0.08814591743170447, |
|
"learning_rate": 0.00014600650377311522, |
|
"loss": 0.3535, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.2516129032258063, |
|
"grad_norm": 0.08812877093082108, |
|
"learning_rate": 0.00014465373364454001, |
|
"loss": 0.3498, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.2645161290322582, |
|
"grad_norm": 0.08596197743921638, |
|
"learning_rate": 0.00014329067841650274, |
|
"loss": 0.3484, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.2774193548387096, |
|
"grad_norm": 0.09025513346881896, |
|
"learning_rate": 0.00014191765204166643, |
|
"loss": 0.3465, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.2903225806451613, |
|
"grad_norm": 0.08665409616008209, |
|
"learning_rate": 0.00014053497076934948, |
|
"loss": 0.35, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.303225806451613, |
|
"grad_norm": 0.09012608398761074, |
|
"learning_rate": 0.00013914295307268396, |
|
"loss": 0.3516, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.3161290322580645, |
|
"grad_norm": 0.09456407877563842, |
|
"learning_rate": 0.00013774191957526143, |
|
"loss": 0.3639, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.3290322580645162, |
|
"grad_norm": 0.0888376260234129, |
|
"learning_rate": 0.00013633219297728416, |
|
"loss": 0.3396, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 1.3419354838709676, |
|
"grad_norm": 0.08652600639054038, |
|
"learning_rate": 0.00013491409798123687, |
|
"loss": 0.3445, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.3548387096774195, |
|
"grad_norm": 0.09269194410505097, |
|
"learning_rate": 0.00013348796121709862, |
|
"loss": 0.3555, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.367741935483871, |
|
"grad_norm": 0.09421096011594207, |
|
"learning_rate": 0.00013205411116710972, |
|
"loss": 0.3508, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.3806451612903226, |
|
"grad_norm": 0.09286783444235318, |
|
"learning_rate": 0.00013061287809011242, |
|
"loss": 0.3571, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.3935483870967742, |
|
"grad_norm": 0.08172852976047028, |
|
"learning_rate": 0.0001291645939454825, |
|
"loss": 0.3488, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.4064516129032258, |
|
"grad_norm": 0.09033973727962885, |
|
"learning_rate": 0.0001277095923166689, |
|
"loss": 0.3498, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.4193548387096775, |
|
"grad_norm": 0.09628933362833343, |
|
"learning_rate": 0.00012624820833435937, |
|
"loss": 0.3472, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.432258064516129, |
|
"grad_norm": 0.08471497514674803, |
|
"learning_rate": 0.00012478077859929, |
|
"loss": 0.3353, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.4451612903225808, |
|
"grad_norm": 0.08976133324522119, |
|
"learning_rate": 0.00012330764110471566, |
|
"loss": 0.3468, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.4580645161290322, |
|
"grad_norm": 0.09634877556737409, |
|
"learning_rate": 0.00012182913515856015, |
|
"loss": 0.3541, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.4709677419354839, |
|
"grad_norm": 0.09348923296138459, |
|
"learning_rate": 0.0001203456013052634, |
|
"loss": 0.3521, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.4838709677419355, |
|
"grad_norm": 0.09437711091684706, |
|
"learning_rate": 0.00011885738124734358, |
|
"loss": 0.3566, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.4967741935483871, |
|
"grad_norm": 0.08916702937111011, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 0.3458, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.5096774193548388, |
|
"grad_norm": 0.09100601467580355, |
|
"learning_rate": 0.00011586825464562514, |
|
"loss": 0.3593, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.5225806451612902, |
|
"grad_norm": 0.08990470683690902, |
|
"learning_rate": 0.00011436803658769082, |
|
"loss": 0.3434, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.535483870967742, |
|
"grad_norm": 0.0932653393737011, |
|
"learning_rate": 0.00011286450913828312, |
|
"loss": 0.342, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.5483870967741935, |
|
"grad_norm": 0.08960531773257623, |
|
"learning_rate": 0.00011135801860504749, |
|
"loss": 0.3628, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.5612903225806452, |
|
"grad_norm": 0.09275069273094473, |
|
"learning_rate": 0.00010984891197811687, |
|
"loss": 0.3513, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.5741935483870968, |
|
"grad_norm": 0.09527469311088294, |
|
"learning_rate": 0.00010833753685018935, |
|
"loss": 0.3556, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.5870967741935482, |
|
"grad_norm": 0.09323849659154124, |
|
"learning_rate": 0.0001068242413364671, |
|
"loss": 0.3448, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.08474554028292876, |
|
"learning_rate": 0.00010530937399447496, |
|
"loss": 0.3499, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.6129032258064515, |
|
"grad_norm": 0.09382059811382143, |
|
"learning_rate": 0.00010379328374377715, |
|
"loss": 0.3384, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.6258064516129034, |
|
"grad_norm": 0.09276702527842776, |
|
"learning_rate": 0.00010227631978561056, |
|
"loss": 0.3444, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.6387096774193548, |
|
"grad_norm": 0.08750152088472078, |
|
"learning_rate": 0.00010075883152245334, |
|
"loss": 0.3569, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.6516129032258065, |
|
"grad_norm": 0.08714445180642569, |
|
"learning_rate": 9.92411684775467e-05, |
|
"loss": 0.342, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.664516129032258, |
|
"grad_norm": 0.08469902272466831, |
|
"learning_rate": 9.772368021438943e-05, |
|
"loss": 0.3342, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.6774193548387095, |
|
"grad_norm": 0.08724585745005611, |
|
"learning_rate": 9.620671625622288e-05, |
|
"loss": 0.3335, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.6903225806451614, |
|
"grad_norm": 0.09087336723016343, |
|
"learning_rate": 9.469062600552509e-05, |
|
"loss": 0.3447, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.7032258064516128, |
|
"grad_norm": 0.08863278083042062, |
|
"learning_rate": 9.317575866353292e-05, |
|
"loss": 0.3487, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.7161290322580647, |
|
"grad_norm": 0.08343459715762, |
|
"learning_rate": 9.166246314981066e-05, |
|
"loss": 0.3454, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.729032258064516, |
|
"grad_norm": 0.08837483796029806, |
|
"learning_rate": 9.015108802188313e-05, |
|
"loss": 0.3484, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.7419354838709677, |
|
"grad_norm": 0.08762249376974672, |
|
"learning_rate": 8.86419813949525e-05, |
|
"loss": 0.3447, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.7548387096774194, |
|
"grad_norm": 0.08446853010895118, |
|
"learning_rate": 8.713549086171691e-05, |
|
"loss": 0.3466, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.7677419354838708, |
|
"grad_norm": 0.08897676787603495, |
|
"learning_rate": 8.563196341230919e-05, |
|
"loss": 0.3434, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.7806451612903227, |
|
"grad_norm": 0.09210810174866911, |
|
"learning_rate": 8.413174535437487e-05, |
|
"loss": 0.355, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.793548387096774, |
|
"grad_norm": 0.0877098792555575, |
|
"learning_rate": 8.263518223330697e-05, |
|
"loss": 0.3392, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.8064516129032258, |
|
"grad_norm": 0.09059259587839792, |
|
"learning_rate": 8.114261875265643e-05, |
|
"loss": 0.3465, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.8193548387096774, |
|
"grad_norm": 0.09043152099082513, |
|
"learning_rate": 7.965439869473664e-05, |
|
"loss": 0.3409, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.832258064516129, |
|
"grad_norm": 0.08863483273837267, |
|
"learning_rate": 7.817086484143986e-05, |
|
"loss": 0.3497, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.8451612903225807, |
|
"grad_norm": 0.08351509862847174, |
|
"learning_rate": 7.669235889528436e-05, |
|
"loss": 0.3484, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.8580645161290321, |
|
"grad_norm": 0.08881689002413959, |
|
"learning_rate": 7.521922140071002e-05, |
|
"loss": 0.3428, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.870967741935484, |
|
"grad_norm": 0.08962413300366581, |
|
"learning_rate": 7.375179166564063e-05, |
|
"loss": 0.3353, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.8838709677419354, |
|
"grad_norm": 0.08991947191225944, |
|
"learning_rate": 7.229040768333115e-05, |
|
"loss": 0.3366, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.896774193548387, |
|
"grad_norm": 0.0890545628104281, |
|
"learning_rate": 7.08354060545175e-05, |
|
"loss": 0.3381, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.9096774193548387, |
|
"grad_norm": 0.09306016588414409, |
|
"learning_rate": 6.93871219098876e-05, |
|
"loss": 0.3356, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.9225806451612903, |
|
"grad_norm": 0.08816048934545212, |
|
"learning_rate": 6.79458888328903e-05, |
|
"loss": 0.3412, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.935483870967742, |
|
"grad_norm": 0.09006593042575502, |
|
"learning_rate": 6.651203878290139e-05, |
|
"loss": 0.3471, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.9483870967741934, |
|
"grad_norm": 0.08499237638300171, |
|
"learning_rate": 6.508590201876317e-05, |
|
"loss": 0.335, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.9612903225806453, |
|
"grad_norm": 0.09566747308379261, |
|
"learning_rate": 6.366780702271589e-05, |
|
"loss": 0.3395, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.9741935483870967, |
|
"grad_norm": 0.0915253754596643, |
|
"learning_rate": 6.225808042473858e-05, |
|
"loss": 0.3488, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.9870967741935484, |
|
"grad_norm": 0.08657357278603872, |
|
"learning_rate": 6.085704692731609e-05, |
|
"loss": 0.3344, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.08950726731743963, |
|
"learning_rate": 5.9465029230650534e-05, |
|
"loss": 0.33, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.35439133644104004, |
|
"eval_runtime": 36.1469, |
|
"eval_samples_per_second": 28.744, |
|
"eval_steps_per_second": 0.913, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 2.0129032258064514, |
|
"grad_norm": 0.08961232668946545, |
|
"learning_rate": 5.8082347958333625e-05, |
|
"loss": 0.3273, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 2.0258064516129033, |
|
"grad_norm": 0.09402916213349197, |
|
"learning_rate": 5.670932158349731e-05, |
|
"loss": 0.3218, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 2.0387096774193547, |
|
"grad_norm": 0.08520247695821515, |
|
"learning_rate": 5.5346266355459995e-05, |
|
"loss": 0.3089, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 2.0516129032258066, |
|
"grad_norm": 0.08637288183919145, |
|
"learning_rate": 5.399349622688479e-05, |
|
"loss": 0.3266, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 2.064516129032258, |
|
"grad_norm": 0.08823864345930746, |
|
"learning_rate": 5.26513227814666e-05, |
|
"loss": 0.329, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.07741935483871, |
|
"grad_norm": 0.09384371931382793, |
|
"learning_rate": 5.1320055162165115e-05, |
|
"loss": 0.3275, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 2.0903225806451613, |
|
"grad_norm": 0.09516405744887674, |
|
"learning_rate": 5.000000000000002e-05, |
|
"loss": 0.332, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 2.1032258064516127, |
|
"grad_norm": 0.08966279182804247, |
|
"learning_rate": 4.869146134342426e-05, |
|
"loss": 0.3247, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 2.1161290322580646, |
|
"grad_norm": 0.08700940402163973, |
|
"learning_rate": 4.739474058829289e-05, |
|
"loss": 0.3221, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 2.129032258064516, |
|
"grad_norm": 0.08984677102800173, |
|
"learning_rate": 4.611013640844245e-05, |
|
"loss": 0.3272, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 2.141935483870968, |
|
"grad_norm": 0.08964202186304891, |
|
"learning_rate": 4.483794468689728e-05, |
|
"loss": 0.3188, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 2.1548387096774193, |
|
"grad_norm": 0.09997697429798251, |
|
"learning_rate": 4.357845844771881e-05, |
|
"loss": 0.3383, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 2.167741935483871, |
|
"grad_norm": 0.09510073376177604, |
|
"learning_rate": 4.2331967788513295e-05, |
|
"loss": 0.3252, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 2.1806451612903226, |
|
"grad_norm": 0.09107612709336496, |
|
"learning_rate": 4.109875981361363e-05, |
|
"loss": 0.3217, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 2.193548387096774, |
|
"grad_norm": 0.08804927379783276, |
|
"learning_rate": 3.987911856795047e-05, |
|
"loss": 0.3173, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.206451612903226, |
|
"grad_norm": 0.0916081059987062, |
|
"learning_rate": 3.8673324971628357e-05, |
|
"loss": 0.3285, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 2.2193548387096773, |
|
"grad_norm": 0.09226628432750343, |
|
"learning_rate": 3.7481656755221125e-05, |
|
"loss": 0.3154, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 2.232258064516129, |
|
"grad_norm": 0.09145015878266409, |
|
"learning_rate": 3.630438839580217e-05, |
|
"loss": 0.3087, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 2.2451612903225806, |
|
"grad_norm": 0.08786201399591659, |
|
"learning_rate": 3.5141791053724405e-05, |
|
"loss": 0.3151, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 2.258064516129032, |
|
"grad_norm": 0.09259402512083086, |
|
"learning_rate": 3.399413251016359e-05, |
|
"loss": 0.3369, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 2.270967741935484, |
|
"grad_norm": 0.09311260751337232, |
|
"learning_rate": 3.2861677105440336e-05, |
|
"loss": 0.3051, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 2.2838709677419353, |
|
"grad_norm": 0.09217712904693832, |
|
"learning_rate": 3.174468567813461e-05, |
|
"loss": 0.3199, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 2.296774193548387, |
|
"grad_norm": 0.09141877592974519, |
|
"learning_rate": 3.0643415505006735e-05, |
|
"loss": 0.3229, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 2.3096774193548386, |
|
"grad_norm": 0.09528833689903496, |
|
"learning_rate": 2.9558120241738784e-05, |
|
"loss": 0.3286, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 2.3225806451612905, |
|
"grad_norm": 0.09070636787107308, |
|
"learning_rate": 2.8489049864510054e-05, |
|
"loss": 0.3348, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.335483870967742, |
|
"grad_norm": 0.09307512327341362, |
|
"learning_rate": 2.7436450612420095e-05, |
|
"loss": 0.3256, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 2.3483870967741938, |
|
"grad_norm": 0.09127823479306682, |
|
"learning_rate": 2.640056493077231e-05, |
|
"loss": 0.3181, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 2.361290322580645, |
|
"grad_norm": 0.09246009256113925, |
|
"learning_rate": 2.5381631415231454e-05, |
|
"loss": 0.3391, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 2.3741935483870966, |
|
"grad_norm": 0.09095352379758655, |
|
"learning_rate": 2.4379884756868167e-05, |
|
"loss": 0.3172, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 2.3870967741935485, |
|
"grad_norm": 0.0926880163626768, |
|
"learning_rate": 2.339555568810221e-05, |
|
"loss": 0.3177, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.09094474131194094, |
|
"learning_rate": 2.242887092955801e-05, |
|
"loss": 0.3199, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 2.412903225806452, |
|
"grad_norm": 0.09106546035353981, |
|
"learning_rate": 2.1480053137844115e-05, |
|
"loss": 0.3222, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 2.425806451612903, |
|
"grad_norm": 0.08873018715134598, |
|
"learning_rate": 2.054932085426856e-05, |
|
"loss": 0.3118, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 2.4387096774193546, |
|
"grad_norm": 0.0932765377498955, |
|
"learning_rate": 1.9636888454502178e-05, |
|
"loss": 0.3358, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 2.4516129032258065, |
|
"grad_norm": 0.09181586534157822, |
|
"learning_rate": 1.8742966099201697e-05, |
|
"loss": 0.3157, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.464516129032258, |
|
"grad_norm": 0.0929486436457203, |
|
"learning_rate": 1.7867759685603114e-05, |
|
"loss": 0.3154, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 2.47741935483871, |
|
"grad_norm": 0.09188630220285351, |
|
"learning_rate": 1.7011470800097496e-05, |
|
"loss": 0.3181, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 2.490322580645161, |
|
"grad_norm": 0.09574286894431329, |
|
"learning_rate": 1.6174296671799572e-05, |
|
"loss": 0.3222, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 2.5032258064516126, |
|
"grad_norm": 0.09145354457132104, |
|
"learning_rate": 1.5356430127119913e-05, |
|
"loss": 0.3222, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 2.5161290322580645, |
|
"grad_norm": 0.09039580690260736, |
|
"learning_rate": 1.4558059545351143e-05, |
|
"loss": 0.324, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 2.5290322580645164, |
|
"grad_norm": 0.08979381831653434, |
|
"learning_rate": 1.3779368815278647e-05, |
|
"loss": 0.3107, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 2.541935483870968, |
|
"grad_norm": 0.09526292697431937, |
|
"learning_rate": 1.302053729282533e-05, |
|
"loss": 0.3219, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 2.554838709677419, |
|
"grad_norm": 0.09310358146453943, |
|
"learning_rate": 1.2281739759740574e-05, |
|
"loss": 0.3214, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 2.567741935483871, |
|
"grad_norm": 0.09212645063531479, |
|
"learning_rate": 1.1563146383342772e-05, |
|
"loss": 0.3154, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 2.5806451612903225, |
|
"grad_norm": 0.09533681862557382, |
|
"learning_rate": 1.0864922677324618e-05, |
|
"loss": 0.319, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.5935483870967744, |
|
"grad_norm": 0.09551418366783314, |
|
"learning_rate": 1.01872294636304e-05, |
|
"loss": 0.3333, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 2.606451612903226, |
|
"grad_norm": 0.08930212325894361, |
|
"learning_rate": 9.530222835413738e-06, |
|
"loss": 0.3048, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 2.6193548387096772, |
|
"grad_norm": 0.09220378121771236, |
|
"learning_rate": 8.894054121084838e-06, |
|
"loss": 0.3146, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 2.632258064516129, |
|
"grad_norm": 0.09150774720724307, |
|
"learning_rate": 8.278869849454718e-06, |
|
"loss": 0.3311, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 2.6451612903225805, |
|
"grad_norm": 0.09261513270619316, |
|
"learning_rate": 7.684811715985429e-06, |
|
"loss": 0.3172, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 2.6580645161290324, |
|
"grad_norm": 0.0941004102909483, |
|
"learning_rate": 7.1120165501533e-06, |
|
"loss": 0.3347, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 2.670967741935484, |
|
"grad_norm": 0.08707518610128166, |
|
"learning_rate": 6.560616283932897e-06, |
|
"loss": 0.3116, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 2.6838709677419352, |
|
"grad_norm": 0.08648707636296159, |
|
"learning_rate": 6.030737921409169e-06, |
|
"loss": 0.3144, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 2.696774193548387, |
|
"grad_norm": 0.09169150101119816, |
|
"learning_rate": 5.52250350952459e-06, |
|
"loss": 0.3255, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 2.709677419354839, |
|
"grad_norm": 0.09060072523264334, |
|
"learning_rate": 5.036030109968082e-06, |
|
"loss": 0.3183, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.7225806451612904, |
|
"grad_norm": 0.09077216490604942, |
|
"learning_rate": 4.5714297722121106e-06, |
|
"loss": 0.321, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 2.735483870967742, |
|
"grad_norm": 0.09088968433443333, |
|
"learning_rate": 4.128809507704445e-06, |
|
"loss": 0.3172, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 2.7483870967741937, |
|
"grad_norm": 0.09191902683388614, |
|
"learning_rate": 3.7082712652200867e-06, |
|
"loss": 0.3261, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 2.761290322580645, |
|
"grad_norm": 0.08843215800144302, |
|
"learning_rate": 3.3099119073793928e-06, |
|
"loss": 0.3158, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 2.774193548387097, |
|
"grad_norm": 0.09079938334868655, |
|
"learning_rate": 2.9338231883378366e-06, |
|
"loss": 0.3178, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 2.7870967741935484, |
|
"grad_norm": 0.09122789808454786, |
|
"learning_rate": 2.580091732652101e-06, |
|
"loss": 0.3282, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.09380292374109117, |
|
"learning_rate": 2.248799015327907e-06, |
|
"loss": 0.3359, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 2.8129032258064517, |
|
"grad_norm": 0.09035917420929797, |
|
"learning_rate": 1.9400213430538773e-06, |
|
"loss": 0.3169, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 2.825806451612903, |
|
"grad_norm": 0.09195121657817087, |
|
"learning_rate": 1.6538298366257976e-06, |
|
"loss": 0.3314, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 2.838709677419355, |
|
"grad_norm": 0.09166102367139951, |
|
"learning_rate": 1.3902904145653096e-06, |
|
"loss": 0.3258, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.8516129032258064, |
|
"grad_norm": 0.0921992572010057, |
|
"learning_rate": 1.1494637779369766e-06, |
|
"loss": 0.3298, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 2.864516129032258, |
|
"grad_norm": 0.09068261067988724, |
|
"learning_rate": 9.314053963669245e-07, |
|
"loss": 0.3214, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 2.8774193548387097, |
|
"grad_norm": 0.09417924199778298, |
|
"learning_rate": 7.361654952665609e-07, |
|
"loss": 0.3134, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 2.8903225806451616, |
|
"grad_norm": 0.0901765977296441, |
|
"learning_rate": 5.637890442641402e-07, |
|
"loss": 0.3221, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 2.903225806451613, |
|
"grad_norm": 0.09094506589085496, |
|
"learning_rate": 4.143157468468717e-07, |
|
"loss": 0.3128, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.9161290322580644, |
|
"grad_norm": 0.08772549933058231, |
|
"learning_rate": 2.877800312160783e-07, |
|
"loss": 0.3248, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 2.9290322580645163, |
|
"grad_norm": 0.09191883931659987, |
|
"learning_rate": 1.8421104235727405e-07, |
|
"loss": 0.3114, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 2.9419354838709677, |
|
"grad_norm": 0.08876137430429, |
|
"learning_rate": 1.0363263532724432e-07, |
|
"loss": 0.3127, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 2.9548387096774196, |
|
"grad_norm": 0.09157045134043748, |
|
"learning_rate": 4.606336975948589e-08, |
|
"loss": 0.3275, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 2.967741935483871, |
|
"grad_norm": 0.08940213355520302, |
|
"learning_rate": 1.1516505589381776e-08, |
|
"loss": 0.3246, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.9806451612903224, |
|
"grad_norm": 0.0895898052255747, |
|
"learning_rate": 0.0, |
|
"loss": 0.3079, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 2.9806451612903224, |
|
"eval_loss": 0.3507891595363617, |
|
"eval_runtime": 36.0777, |
|
"eval_samples_per_second": 28.799, |
|
"eval_steps_per_second": 0.915, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 2.9806451612903224, |
|
"step": 231, |
|
"total_flos": 9.324729662937498e+16, |
|
"train_loss": 0.3951803825118325, |
|
"train_runtime": 2997.4381, |
|
"train_samples_per_second": 9.871, |
|
"train_steps_per_second": 0.077 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 231, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.324729662937498e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|