{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9806451612903224,
"eval_steps": 500,
"global_step": 231,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012903225806451613,
"grad_norm": 0.882150089808769,
"learning_rate": 8.333333333333334e-06,
"loss": 1.3191,
"step": 1
},
{
"epoch": 0.025806451612903226,
"grad_norm": 0.8369153094823952,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.249,
"step": 2
},
{
"epoch": 0.03870967741935484,
"grad_norm": 0.8525103918091212,
"learning_rate": 2.5e-05,
"loss": 1.2775,
"step": 3
},
{
"epoch": 0.05161290322580645,
"grad_norm": 0.8113130093304075,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.2577,
"step": 4
},
{
"epoch": 0.06451612903225806,
"grad_norm": 0.7691226782403744,
"learning_rate": 4.166666666666667e-05,
"loss": 1.2275,
"step": 5
},
{
"epoch": 0.07741935483870968,
"grad_norm": 0.5954210054804412,
"learning_rate": 5e-05,
"loss": 1.1159,
"step": 6
},
{
"epoch": 0.09032258064516129,
"grad_norm": 0.48189256930049384,
"learning_rate": 5.833333333333334e-05,
"loss": 1.0593,
"step": 7
},
{
"epoch": 0.1032258064516129,
"grad_norm": 0.5241879927945232,
"learning_rate": 6.666666666666667e-05,
"loss": 1.0031,
"step": 8
},
{
"epoch": 0.11612903225806452,
"grad_norm": 0.5751865259411146,
"learning_rate": 7.500000000000001e-05,
"loss": 0.9263,
"step": 9
},
{
"epoch": 0.12903225806451613,
"grad_norm": 0.5686526755807603,
"learning_rate": 8.333333333333334e-05,
"loss": 0.8146,
"step": 10
},
{
"epoch": 0.14193548387096774,
"grad_norm": 0.5156906474251192,
"learning_rate": 9.166666666666667e-05,
"loss": 0.7583,
"step": 11
},
{
"epoch": 0.15483870967741936,
"grad_norm": 0.4901634328534619,
"learning_rate": 0.0001,
"loss": 0.6686,
"step": 12
},
{
"epoch": 0.16774193548387098,
"grad_norm": 0.376084270046461,
"learning_rate": 0.00010833333333333333,
"loss": 0.6005,
"step": 13
},
{
"epoch": 0.18064516129032257,
"grad_norm": 0.2761318809240614,
"learning_rate": 0.00011666666666666668,
"loss": 0.5741,
"step": 14
},
{
"epoch": 0.1935483870967742,
"grad_norm": 0.25038763704461725,
"learning_rate": 0.000125,
"loss": 0.5465,
"step": 15
},
{
"epoch": 0.2064516129032258,
"grad_norm": 0.2214903977106201,
"learning_rate": 0.00013333333333333334,
"loss": 0.5138,
"step": 16
},
{
"epoch": 0.21935483870967742,
"grad_norm": 0.28905541505099525,
"learning_rate": 0.00014166666666666668,
"loss": 0.5247,
"step": 17
},
{
"epoch": 0.23225806451612904,
"grad_norm": 0.20699066633757193,
"learning_rate": 0.00015000000000000001,
"loss": 0.4978,
"step": 18
},
{
"epoch": 0.24516129032258063,
"grad_norm": 0.219457528851344,
"learning_rate": 0.00015833333333333332,
"loss": 0.4924,
"step": 19
},
{
"epoch": 0.25806451612903225,
"grad_norm": 0.16596853789220767,
"learning_rate": 0.0001666666666666667,
"loss": 0.4759,
"step": 20
},
{
"epoch": 0.2709677419354839,
"grad_norm": 0.13228412371333673,
"learning_rate": 0.000175,
"loss": 0.4613,
"step": 21
},
{
"epoch": 0.2838709677419355,
"grad_norm": 0.1421107856190867,
"learning_rate": 0.00018333333333333334,
"loss": 0.4852,
"step": 22
},
{
"epoch": 0.2967741935483871,
"grad_norm": 0.12552928984887968,
"learning_rate": 0.00019166666666666667,
"loss": 0.4786,
"step": 23
},
{
"epoch": 0.3096774193548387,
"grad_norm": 0.11489463060846784,
"learning_rate": 0.0002,
"loss": 0.4532,
"step": 24
},
{
"epoch": 0.3225806451612903,
"grad_norm": 0.11476879539402507,
"learning_rate": 0.00019998848349441062,
"loss": 0.4454,
"step": 25
},
{
"epoch": 0.33548387096774196,
"grad_norm": 0.1256602270101812,
"learning_rate": 0.00019995393663024054,
"loss": 0.4513,
"step": 26
},
{
"epoch": 0.34838709677419355,
"grad_norm": 0.11833482485698336,
"learning_rate": 0.00019989636736467278,
"loss": 0.44,
"step": 27
},
{
"epoch": 0.36129032258064514,
"grad_norm": 0.11124019681377781,
"learning_rate": 0.00019981578895764273,
"loss": 0.4439,
"step": 28
},
{
"epoch": 0.3741935483870968,
"grad_norm": 0.10954971384477814,
"learning_rate": 0.00019971221996878394,
"loss": 0.4274,
"step": 29
},
{
"epoch": 0.3870967741935484,
"grad_norm": 0.11422715129880294,
"learning_rate": 0.00019958568425315314,
"loss": 0.4254,
"step": 30
},
{
"epoch": 0.4,
"grad_norm": 0.11262310014016527,
"learning_rate": 0.00019943621095573586,
"loss": 0.4204,
"step": 31
},
{
"epoch": 0.4129032258064516,
"grad_norm": 0.11143099554463408,
"learning_rate": 0.00019926383450473344,
"loss": 0.4105,
"step": 32
},
{
"epoch": 0.4258064516129032,
"grad_norm": 0.1088260973247734,
"learning_rate": 0.00019906859460363307,
"loss": 0.4136,
"step": 33
},
{
"epoch": 0.43870967741935485,
"grad_norm": 0.10400753996611788,
"learning_rate": 0.00019885053622206304,
"loss": 0.4213,
"step": 34
},
{
"epoch": 0.45161290322580644,
"grad_norm": 0.09587900896302251,
"learning_rate": 0.0001986097095854347,
"loss": 0.4085,
"step": 35
},
{
"epoch": 0.4645161290322581,
"grad_norm": 0.10119603747308556,
"learning_rate": 0.0001983461701633742,
"loss": 0.4181,
"step": 36
},
{
"epoch": 0.4774193548387097,
"grad_norm": 0.10062413136253176,
"learning_rate": 0.00019805997865694614,
"loss": 0.4098,
"step": 37
},
{
"epoch": 0.49032258064516127,
"grad_norm": 0.09162394941720846,
"learning_rate": 0.0001977512009846721,
"loss": 0.4085,
"step": 38
},
{
"epoch": 0.5032258064516129,
"grad_norm": 0.09269316443279575,
"learning_rate": 0.00019741990826734794,
"loss": 0.3994,
"step": 39
},
{
"epoch": 0.5161290322580645,
"grad_norm": 0.08782581803238095,
"learning_rate": 0.00019706617681166218,
"loss": 0.3983,
"step": 40
},
{
"epoch": 0.5290322580645161,
"grad_norm": 0.08665646987756218,
"learning_rate": 0.00019669008809262062,
"loss": 0.3938,
"step": 41
},
{
"epoch": 0.5419354838709678,
"grad_norm": 0.09289388957990503,
"learning_rate": 0.00019629172873477995,
"loss": 0.396,
"step": 42
},
{
"epoch": 0.5548387096774193,
"grad_norm": 0.09203344649472522,
"learning_rate": 0.00019587119049229557,
"loss": 0.4052,
"step": 43
},
{
"epoch": 0.567741935483871,
"grad_norm": 0.08209774194723368,
"learning_rate": 0.0001954285702277879,
"loss": 0.3959,
"step": 44
},
{
"epoch": 0.5806451612903226,
"grad_norm": 0.08595872863630391,
"learning_rate": 0.00019496396989003193,
"loss": 0.397,
"step": 45
},
{
"epoch": 0.5935483870967742,
"grad_norm": 0.09041908237644536,
"learning_rate": 0.00019447749649047542,
"loss": 0.3992,
"step": 46
},
{
"epoch": 0.6064516129032258,
"grad_norm": 0.08321976348844515,
"learning_rate": 0.00019396926207859084,
"loss": 0.4095,
"step": 47
},
{
"epoch": 0.6193548387096774,
"grad_norm": 0.07887604040253807,
"learning_rate": 0.00019343938371606712,
"loss": 0.3866,
"step": 48
},
{
"epoch": 0.632258064516129,
"grad_norm": 0.08329265943906447,
"learning_rate": 0.00019288798344984672,
"loss": 0.3985,
"step": 49
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.08661703211305888,
"learning_rate": 0.00019231518828401458,
"loss": 0.3925,
"step": 50
},
{
"epoch": 0.6580645161290323,
"grad_norm": 0.08382217550700771,
"learning_rate": 0.00019172113015054532,
"loss": 0.3862,
"step": 51
},
{
"epoch": 0.6709677419354839,
"grad_norm": 0.08245124856491458,
"learning_rate": 0.00019110594587891519,
"loss": 0.3847,
"step": 52
},
{
"epoch": 0.6838709677419355,
"grad_norm": 0.08319716279149986,
"learning_rate": 0.00019046977716458626,
"loss": 0.3775,
"step": 53
},
{
"epoch": 0.6967741935483871,
"grad_norm": 0.08074648144423298,
"learning_rate": 0.0001898127705363696,
"loss": 0.3786,
"step": 54
},
{
"epoch": 0.7096774193548387,
"grad_norm": 0.08472762376284584,
"learning_rate": 0.0001891350773226754,
"loss": 0.3923,
"step": 55
},
{
"epoch": 0.7225806451612903,
"grad_norm": 0.08398076059437376,
"learning_rate": 0.00018843685361665723,
"loss": 0.3709,
"step": 56
},
{
"epoch": 0.7354838709677419,
"grad_norm": 0.08465216102770419,
"learning_rate": 0.00018771826024025946,
"loss": 0.3818,
"step": 57
},
{
"epoch": 0.7483870967741936,
"grad_norm": 0.09145572810056589,
"learning_rate": 0.00018697946270717467,
"loss": 0.39,
"step": 58
},
{
"epoch": 0.7612903225806451,
"grad_norm": 0.08415188367023674,
"learning_rate": 0.00018622063118472134,
"loss": 0.3733,
"step": 59
},
{
"epoch": 0.7741935483870968,
"grad_norm": 0.08576290382509591,
"learning_rate": 0.00018544194045464886,
"loss": 0.3878,
"step": 60
},
{
"epoch": 0.7870967741935484,
"grad_norm": 0.0844142047859298,
"learning_rate": 0.00018464356987288013,
"loss": 0.3637,
"step": 61
},
{
"epoch": 0.8,
"grad_norm": 0.08918487261557899,
"learning_rate": 0.00018382570332820043,
"loss": 0.3775,
"step": 62
},
{
"epoch": 0.8129032258064516,
"grad_norm": 0.0795181880669878,
"learning_rate": 0.00018298852919990252,
"loss": 0.3853,
"step": 63
},
{
"epoch": 0.8258064516129032,
"grad_norm": 0.08173055996583302,
"learning_rate": 0.0001821322403143969,
"loss": 0.38,
"step": 64
},
{
"epoch": 0.8387096774193549,
"grad_norm": 0.08525070031165603,
"learning_rate": 0.0001812570339007983,
"loss": 0.3778,
"step": 65
},
{
"epoch": 0.8516129032258064,
"grad_norm": 0.08531235204546653,
"learning_rate": 0.00018036311154549784,
"loss": 0.3727,
"step": 66
},
{
"epoch": 0.864516129032258,
"grad_norm": 0.08169851479895494,
"learning_rate": 0.00017945067914573146,
"loss": 0.365,
"step": 67
},
{
"epoch": 0.8774193548387097,
"grad_norm": 0.08463789046916101,
"learning_rate": 0.0001785199468621559,
"loss": 0.3752,
"step": 68
},
{
"epoch": 0.8903225806451613,
"grad_norm": 0.09441843624235378,
"learning_rate": 0.000177571129070442,
"loss": 0.3665,
"step": 69
},
{
"epoch": 0.9032258064516129,
"grad_norm": 0.08530939476149231,
"learning_rate": 0.0001766044443118978,
"loss": 0.3926,
"step": 70
},
{
"epoch": 0.9161290322580645,
"grad_norm": 0.0836606457284625,
"learning_rate": 0.00017562011524313185,
"loss": 0.3844,
"step": 71
},
{
"epoch": 0.9290322580645162,
"grad_norm": 0.09868625782773943,
"learning_rate": 0.00017461836858476856,
"loss": 0.3835,
"step": 72
},
{
"epoch": 0.9419354838709677,
"grad_norm": 0.082132336261239,
"learning_rate": 0.00017359943506922774,
"loss": 0.3792,
"step": 73
},
{
"epoch": 0.9548387096774194,
"grad_norm": 0.08948965393301354,
"learning_rate": 0.0001725635493875799,
"loss": 0.3813,
"step": 74
},
{
"epoch": 0.967741935483871,
"grad_norm": 0.08539410389371488,
"learning_rate": 0.00017151095013548994,
"loss": 0.3774,
"step": 75
},
{
"epoch": 0.9806451612903225,
"grad_norm": 0.08690404790165682,
"learning_rate": 0.00017044187975826124,
"loss": 0.3762,
"step": 76
},
{
"epoch": 0.9935483870967742,
"grad_norm": 0.09039522496805455,
"learning_rate": 0.0001693565844949933,
"loss": 0.3733,
"step": 77
},
{
"epoch": 0.9935483870967742,
"eval_loss": 0.3743511736392975,
"eval_runtime": 42.1339,
"eval_samples_per_second": 24.66,
"eval_steps_per_second": 0.783,
"step": 77
},
{
"epoch": 1.0064516129032257,
"grad_norm": 0.09165665911792642,
"learning_rate": 0.00016825531432186543,
"loss": 0.3532,
"step": 78
},
{
"epoch": 1.0193548387096774,
"grad_norm": 0.0801922544260219,
"learning_rate": 0.0001671383228945597,
"loss": 0.347,
"step": 79
},
{
"epoch": 1.032258064516129,
"grad_norm": 0.08352186065175837,
"learning_rate": 0.00016600586748983641,
"loss": 0.3566,
"step": 80
},
{
"epoch": 1.0451612903225806,
"grad_norm": 0.08793176795367076,
"learning_rate": 0.0001648582089462756,
"loss": 0.3473,
"step": 81
},
{
"epoch": 1.0580645161290323,
"grad_norm": 0.08913951531063671,
"learning_rate": 0.00016369561160419784,
"loss": 0.342,
"step": 82
},
{
"epoch": 1.070967741935484,
"grad_norm": 0.08309712335786672,
"learning_rate": 0.0001625183432447789,
"loss": 0.345,
"step": 83
},
{
"epoch": 1.0838709677419356,
"grad_norm": 0.08725330804483407,
"learning_rate": 0.00016132667502837165,
"loss": 0.3523,
"step": 84
},
{
"epoch": 1.096774193548387,
"grad_norm": 0.08680862762413778,
"learning_rate": 0.00016012088143204953,
"loss": 0.3554,
"step": 85
},
{
"epoch": 1.1096774193548387,
"grad_norm": 0.0863782848559528,
"learning_rate": 0.00015890124018638638,
"loss": 0.364,
"step": 86
},
{
"epoch": 1.1225806451612903,
"grad_norm": 0.08388848992116194,
"learning_rate": 0.00015766803221148673,
"loss": 0.3568,
"step": 87
},
{
"epoch": 1.135483870967742,
"grad_norm": 0.08226994751114965,
"learning_rate": 0.00015642154155228122,
"loss": 0.3489,
"step": 88
},
{
"epoch": 1.1483870967741936,
"grad_norm": 0.08575965994905438,
"learning_rate": 0.00015516205531310273,
"loss": 0.3466,
"step": 89
},
{
"epoch": 1.1612903225806452,
"grad_norm": 0.0895747440427046,
"learning_rate": 0.00015388986359155758,
"loss": 0.3488,
"step": 90
},
{
"epoch": 1.1741935483870969,
"grad_norm": 0.08403222320010312,
"learning_rate": 0.00015260525941170712,
"loss": 0.356,
"step": 91
},
{
"epoch": 1.1870967741935483,
"grad_norm": 0.08627434364043794,
"learning_rate": 0.0001513085386565758,
"loss": 0.3519,
"step": 92
},
{
"epoch": 1.2,
"grad_norm": 0.08925414655300028,
"learning_rate": 0.00015000000000000001,
"loss": 0.3523,
"step": 93
},
{
"epoch": 1.2129032258064516,
"grad_norm": 0.09120079741968923,
"learning_rate": 0.00014867994483783485,
"loss": 0.3555,
"step": 94
},
{
"epoch": 1.2258064516129032,
"grad_norm": 0.08519037826685563,
"learning_rate": 0.0001473486772185334,
"loss": 0.3551,
"step": 95
},
{
"epoch": 1.238709677419355,
"grad_norm": 0.08814591743170447,
"learning_rate": 0.00014600650377311522,
"loss": 0.3535,
"step": 96
},
{
"epoch": 1.2516129032258063,
"grad_norm": 0.08812877093082108,
"learning_rate": 0.00014465373364454001,
"loss": 0.3498,
"step": 97
},
{
"epoch": 1.2645161290322582,
"grad_norm": 0.08596197743921638,
"learning_rate": 0.00014329067841650274,
"loss": 0.3484,
"step": 98
},
{
"epoch": 1.2774193548387096,
"grad_norm": 0.09025513346881896,
"learning_rate": 0.00014191765204166643,
"loss": 0.3465,
"step": 99
},
{
"epoch": 1.2903225806451613,
"grad_norm": 0.08665409616008209,
"learning_rate": 0.00014053497076934948,
"loss": 0.35,
"step": 100
},
{
"epoch": 1.303225806451613,
"grad_norm": 0.09012608398761074,
"learning_rate": 0.00013914295307268396,
"loss": 0.3516,
"step": 101
},
{
"epoch": 1.3161290322580645,
"grad_norm": 0.09456407877563842,
"learning_rate": 0.00013774191957526143,
"loss": 0.3639,
"step": 102
},
{
"epoch": 1.3290322580645162,
"grad_norm": 0.0888376260234129,
"learning_rate": 0.00013633219297728416,
"loss": 0.3396,
"step": 103
},
{
"epoch": 1.3419354838709676,
"grad_norm": 0.08652600639054038,
"learning_rate": 0.00013491409798123687,
"loss": 0.3445,
"step": 104
},
{
"epoch": 1.3548387096774195,
"grad_norm": 0.09269194410505097,
"learning_rate": 0.00013348796121709862,
"loss": 0.3555,
"step": 105
},
{
"epoch": 1.367741935483871,
"grad_norm": 0.09421096011594207,
"learning_rate": 0.00013205411116710972,
"loss": 0.3508,
"step": 106
},
{
"epoch": 1.3806451612903226,
"grad_norm": 0.09286783444235318,
"learning_rate": 0.00013061287809011242,
"loss": 0.3571,
"step": 107
},
{
"epoch": 1.3935483870967742,
"grad_norm": 0.08172852976047028,
"learning_rate": 0.0001291645939454825,
"loss": 0.3488,
"step": 108
},
{
"epoch": 1.4064516129032258,
"grad_norm": 0.09033973727962885,
"learning_rate": 0.0001277095923166689,
"loss": 0.3498,
"step": 109
},
{
"epoch": 1.4193548387096775,
"grad_norm": 0.09628933362833343,
"learning_rate": 0.00012624820833435937,
"loss": 0.3472,
"step": 110
},
{
"epoch": 1.432258064516129,
"grad_norm": 0.08471497514674803,
"learning_rate": 0.00012478077859929,
"loss": 0.3353,
"step": 111
},
{
"epoch": 1.4451612903225808,
"grad_norm": 0.08976133324522119,
"learning_rate": 0.00012330764110471566,
"loss": 0.3468,
"step": 112
},
{
"epoch": 1.4580645161290322,
"grad_norm": 0.09634877556737409,
"learning_rate": 0.00012182913515856015,
"loss": 0.3541,
"step": 113
},
{
"epoch": 1.4709677419354839,
"grad_norm": 0.09348923296138459,
"learning_rate": 0.0001203456013052634,
"loss": 0.3521,
"step": 114
},
{
"epoch": 1.4838709677419355,
"grad_norm": 0.09437711091684706,
"learning_rate": 0.00011885738124734358,
"loss": 0.3566,
"step": 115
},
{
"epoch": 1.4967741935483871,
"grad_norm": 0.08916702937111011,
"learning_rate": 0.00011736481776669306,
"loss": 0.3458,
"step": 116
},
{
"epoch": 1.5096774193548388,
"grad_norm": 0.09100601467580355,
"learning_rate": 0.00011586825464562514,
"loss": 0.3593,
"step": 117
},
{
"epoch": 1.5225806451612902,
"grad_norm": 0.08990470683690902,
"learning_rate": 0.00011436803658769082,
"loss": 0.3434,
"step": 118
},
{
"epoch": 1.535483870967742,
"grad_norm": 0.0932653393737011,
"learning_rate": 0.00011286450913828312,
"loss": 0.342,
"step": 119
},
{
"epoch": 1.5483870967741935,
"grad_norm": 0.08960531773257623,
"learning_rate": 0.00011135801860504749,
"loss": 0.3628,
"step": 120
},
{
"epoch": 1.5612903225806452,
"grad_norm": 0.09275069273094473,
"learning_rate": 0.00010984891197811687,
"loss": 0.3513,
"step": 121
},
{
"epoch": 1.5741935483870968,
"grad_norm": 0.09527469311088294,
"learning_rate": 0.00010833753685018935,
"loss": 0.3556,
"step": 122
},
{
"epoch": 1.5870967741935482,
"grad_norm": 0.09323849659154124,
"learning_rate": 0.0001068242413364671,
"loss": 0.3448,
"step": 123
},
{
"epoch": 1.6,
"grad_norm": 0.08474554028292876,
"learning_rate": 0.00010530937399447496,
"loss": 0.3499,
"step": 124
},
{
"epoch": 1.6129032258064515,
"grad_norm": 0.09382059811382143,
"learning_rate": 0.00010379328374377715,
"loss": 0.3384,
"step": 125
},
{
"epoch": 1.6258064516129034,
"grad_norm": 0.09276702527842776,
"learning_rate": 0.00010227631978561056,
"loss": 0.3444,
"step": 126
},
{
"epoch": 1.6387096774193548,
"grad_norm": 0.08750152088472078,
"learning_rate": 0.00010075883152245334,
"loss": 0.3569,
"step": 127
},
{
"epoch": 1.6516129032258065,
"grad_norm": 0.08714445180642569,
"learning_rate": 9.92411684775467e-05,
"loss": 0.342,
"step": 128
},
{
"epoch": 1.664516129032258,
"grad_norm": 0.08469902272466831,
"learning_rate": 9.772368021438943e-05,
"loss": 0.3342,
"step": 129
},
{
"epoch": 1.6774193548387095,
"grad_norm": 0.08724585745005611,
"learning_rate": 9.620671625622288e-05,
"loss": 0.3335,
"step": 130
},
{
"epoch": 1.6903225806451614,
"grad_norm": 0.09087336723016343,
"learning_rate": 9.469062600552509e-05,
"loss": 0.3447,
"step": 131
},
{
"epoch": 1.7032258064516128,
"grad_norm": 0.08863278083042062,
"learning_rate": 9.317575866353292e-05,
"loss": 0.3487,
"step": 132
},
{
"epoch": 1.7161290322580647,
"grad_norm": 0.08343459715762,
"learning_rate": 9.166246314981066e-05,
"loss": 0.3454,
"step": 133
},
{
"epoch": 1.729032258064516,
"grad_norm": 0.08837483796029806,
"learning_rate": 9.015108802188313e-05,
"loss": 0.3484,
"step": 134
},
{
"epoch": 1.7419354838709677,
"grad_norm": 0.08762249376974672,
"learning_rate": 8.86419813949525e-05,
"loss": 0.3447,
"step": 135
},
{
"epoch": 1.7548387096774194,
"grad_norm": 0.08446853010895118,
"learning_rate": 8.713549086171691e-05,
"loss": 0.3466,
"step": 136
},
{
"epoch": 1.7677419354838708,
"grad_norm": 0.08897676787603495,
"learning_rate": 8.563196341230919e-05,
"loss": 0.3434,
"step": 137
},
{
"epoch": 1.7806451612903227,
"grad_norm": 0.09210810174866911,
"learning_rate": 8.413174535437487e-05,
"loss": 0.355,
"step": 138
},
{
"epoch": 1.793548387096774,
"grad_norm": 0.0877098792555575,
"learning_rate": 8.263518223330697e-05,
"loss": 0.3392,
"step": 139
},
{
"epoch": 1.8064516129032258,
"grad_norm": 0.09059259587839792,
"learning_rate": 8.114261875265643e-05,
"loss": 0.3465,
"step": 140
},
{
"epoch": 1.8193548387096774,
"grad_norm": 0.09043152099082513,
"learning_rate": 7.965439869473664e-05,
"loss": 0.3409,
"step": 141
},
{
"epoch": 1.832258064516129,
"grad_norm": 0.08863483273837267,
"learning_rate": 7.817086484143986e-05,
"loss": 0.3497,
"step": 142
},
{
"epoch": 1.8451612903225807,
"grad_norm": 0.08351509862847174,
"learning_rate": 7.669235889528436e-05,
"loss": 0.3484,
"step": 143
},
{
"epoch": 1.8580645161290321,
"grad_norm": 0.08881689002413959,
"learning_rate": 7.521922140071002e-05,
"loss": 0.3428,
"step": 144
},
{
"epoch": 1.870967741935484,
"grad_norm": 0.08962413300366581,
"learning_rate": 7.375179166564063e-05,
"loss": 0.3353,
"step": 145
},
{
"epoch": 1.8838709677419354,
"grad_norm": 0.08991947191225944,
"learning_rate": 7.229040768333115e-05,
"loss": 0.3366,
"step": 146
},
{
"epoch": 1.896774193548387,
"grad_norm": 0.0890545628104281,
"learning_rate": 7.08354060545175e-05,
"loss": 0.3381,
"step": 147
},
{
"epoch": 1.9096774193548387,
"grad_norm": 0.09306016588414409,
"learning_rate": 6.93871219098876e-05,
"loss": 0.3356,
"step": 148
},
{
"epoch": 1.9225806451612903,
"grad_norm": 0.08816048934545212,
"learning_rate": 6.79458888328903e-05,
"loss": 0.3412,
"step": 149
},
{
"epoch": 1.935483870967742,
"grad_norm": 0.09006593042575502,
"learning_rate": 6.651203878290139e-05,
"loss": 0.3471,
"step": 150
},
{
"epoch": 1.9483870967741934,
"grad_norm": 0.08499237638300171,
"learning_rate": 6.508590201876317e-05,
"loss": 0.335,
"step": 151
},
{
"epoch": 1.9612903225806453,
"grad_norm": 0.09566747308379261,
"learning_rate": 6.366780702271589e-05,
"loss": 0.3395,
"step": 152
},
{
"epoch": 1.9741935483870967,
"grad_norm": 0.0915253754596643,
"learning_rate": 6.225808042473858e-05,
"loss": 0.3488,
"step": 153
},
{
"epoch": 1.9870967741935484,
"grad_norm": 0.08657357278603872,
"learning_rate": 6.085704692731609e-05,
"loss": 0.3344,
"step": 154
},
{
"epoch": 2.0,
"grad_norm": 0.08950726731743963,
"learning_rate": 5.9465029230650534e-05,
"loss": 0.33,
"step": 155
},
{
"epoch": 2.0,
"eval_loss": 0.35439133644104004,
"eval_runtime": 36.1469,
"eval_samples_per_second": 28.744,
"eval_steps_per_second": 0.913,
"step": 155
},
{
"epoch": 2.0129032258064514,
"grad_norm": 0.08961232668946545,
"learning_rate": 5.8082347958333625e-05,
"loss": 0.3273,
"step": 156
},
{
"epoch": 2.0258064516129033,
"grad_norm": 0.09402916213349197,
"learning_rate": 5.670932158349731e-05,
"loss": 0.3218,
"step": 157
},
{
"epoch": 2.0387096774193547,
"grad_norm": 0.08520247695821515,
"learning_rate": 5.5346266355459995e-05,
"loss": 0.3089,
"step": 158
},
{
"epoch": 2.0516129032258066,
"grad_norm": 0.08637288183919145,
"learning_rate": 5.399349622688479e-05,
"loss": 0.3266,
"step": 159
},
{
"epoch": 2.064516129032258,
"grad_norm": 0.08823864345930746,
"learning_rate": 5.26513227814666e-05,
"loss": 0.329,
"step": 160
},
{
"epoch": 2.07741935483871,
"grad_norm": 0.09384371931382793,
"learning_rate": 5.1320055162165115e-05,
"loss": 0.3275,
"step": 161
},
{
"epoch": 2.0903225806451613,
"grad_norm": 0.09516405744887674,
"learning_rate": 5.000000000000002e-05,
"loss": 0.332,
"step": 162
},
{
"epoch": 2.1032258064516127,
"grad_norm": 0.08966279182804247,
"learning_rate": 4.869146134342426e-05,
"loss": 0.3247,
"step": 163
},
{
"epoch": 2.1161290322580646,
"grad_norm": 0.08700940402163973,
"learning_rate": 4.739474058829289e-05,
"loss": 0.3221,
"step": 164
},
{
"epoch": 2.129032258064516,
"grad_norm": 0.08984677102800173,
"learning_rate": 4.611013640844245e-05,
"loss": 0.3272,
"step": 165
},
{
"epoch": 2.141935483870968,
"grad_norm": 0.08964202186304891,
"learning_rate": 4.483794468689728e-05,
"loss": 0.3188,
"step": 166
},
{
"epoch": 2.1548387096774193,
"grad_norm": 0.09997697429798251,
"learning_rate": 4.357845844771881e-05,
"loss": 0.3383,
"step": 167
},
{
"epoch": 2.167741935483871,
"grad_norm": 0.09510073376177604,
"learning_rate": 4.2331967788513295e-05,
"loss": 0.3252,
"step": 168
},
{
"epoch": 2.1806451612903226,
"grad_norm": 0.09107612709336496,
"learning_rate": 4.109875981361363e-05,
"loss": 0.3217,
"step": 169
},
{
"epoch": 2.193548387096774,
"grad_norm": 0.08804927379783276,
"learning_rate": 3.987911856795047e-05,
"loss": 0.3173,
"step": 170
},
{
"epoch": 2.206451612903226,
"grad_norm": 0.0916081059987062,
"learning_rate": 3.8673324971628357e-05,
"loss": 0.3285,
"step": 171
},
{
"epoch": 2.2193548387096773,
"grad_norm": 0.09226628432750343,
"learning_rate": 3.7481656755221125e-05,
"loss": 0.3154,
"step": 172
},
{
"epoch": 2.232258064516129,
"grad_norm": 0.09145015878266409,
"learning_rate": 3.630438839580217e-05,
"loss": 0.3087,
"step": 173
},
{
"epoch": 2.2451612903225806,
"grad_norm": 0.08786201399591659,
"learning_rate": 3.5141791053724405e-05,
"loss": 0.3151,
"step": 174
},
{
"epoch": 2.258064516129032,
"grad_norm": 0.09259402512083086,
"learning_rate": 3.399413251016359e-05,
"loss": 0.3369,
"step": 175
},
{
"epoch": 2.270967741935484,
"grad_norm": 0.09311260751337232,
"learning_rate": 3.2861677105440336e-05,
"loss": 0.3051,
"step": 176
},
{
"epoch": 2.2838709677419353,
"grad_norm": 0.09217712904693832,
"learning_rate": 3.174468567813461e-05,
"loss": 0.3199,
"step": 177
},
{
"epoch": 2.296774193548387,
"grad_norm": 0.09141877592974519,
"learning_rate": 3.0643415505006735e-05,
"loss": 0.3229,
"step": 178
},
{
"epoch": 2.3096774193548386,
"grad_norm": 0.09528833689903496,
"learning_rate": 2.9558120241738784e-05,
"loss": 0.3286,
"step": 179
},
{
"epoch": 2.3225806451612905,
"grad_norm": 0.09070636787107308,
"learning_rate": 2.8489049864510054e-05,
"loss": 0.3348,
"step": 180
},
{
"epoch": 2.335483870967742,
"grad_norm": 0.09307512327341362,
"learning_rate": 2.7436450612420095e-05,
"loss": 0.3256,
"step": 181
},
{
"epoch": 2.3483870967741938,
"grad_norm": 0.09127823479306682,
"learning_rate": 2.640056493077231e-05,
"loss": 0.3181,
"step": 182
},
{
"epoch": 2.361290322580645,
"grad_norm": 0.09246009256113925,
"learning_rate": 2.5381631415231454e-05,
"loss": 0.3391,
"step": 183
},
{
"epoch": 2.3741935483870966,
"grad_norm": 0.09095352379758655,
"learning_rate": 2.4379884756868167e-05,
"loss": 0.3172,
"step": 184
},
{
"epoch": 2.3870967741935485,
"grad_norm": 0.0926880163626768,
"learning_rate": 2.339555568810221e-05,
"loss": 0.3177,
"step": 185
},
{
"epoch": 2.4,
"grad_norm": 0.09094474131194094,
"learning_rate": 2.242887092955801e-05,
"loss": 0.3199,
"step": 186
},
{
"epoch": 2.412903225806452,
"grad_norm": 0.09106546035353981,
"learning_rate": 2.1480053137844115e-05,
"loss": 0.3222,
"step": 187
},
{
"epoch": 2.425806451612903,
"grad_norm": 0.08873018715134598,
"learning_rate": 2.054932085426856e-05,
"loss": 0.3118,
"step": 188
},
{
"epoch": 2.4387096774193546,
"grad_norm": 0.0932765377498955,
"learning_rate": 1.9636888454502178e-05,
"loss": 0.3358,
"step": 189
},
{
"epoch": 2.4516129032258065,
"grad_norm": 0.09181586534157822,
"learning_rate": 1.8742966099201697e-05,
"loss": 0.3157,
"step": 190
},
{
"epoch": 2.464516129032258,
"grad_norm": 0.0929486436457203,
"learning_rate": 1.7867759685603114e-05,
"loss": 0.3154,
"step": 191
},
{
"epoch": 2.47741935483871,
"grad_norm": 0.09188630220285351,
"learning_rate": 1.7011470800097496e-05,
"loss": 0.3181,
"step": 192
},
{
"epoch": 2.490322580645161,
"grad_norm": 0.09574286894431329,
"learning_rate": 1.6174296671799572e-05,
"loss": 0.3222,
"step": 193
},
{
"epoch": 2.5032258064516126,
"grad_norm": 0.09145354457132104,
"learning_rate": 1.5356430127119913e-05,
"loss": 0.3222,
"step": 194
},
{
"epoch": 2.5161290322580645,
"grad_norm": 0.09039580690260736,
"learning_rate": 1.4558059545351143e-05,
"loss": 0.324,
"step": 195
},
{
"epoch": 2.5290322580645164,
"grad_norm": 0.08979381831653434,
"learning_rate": 1.3779368815278647e-05,
"loss": 0.3107,
"step": 196
},
{
"epoch": 2.541935483870968,
"grad_norm": 0.09526292697431937,
"learning_rate": 1.302053729282533e-05,
"loss": 0.3219,
"step": 197
},
{
"epoch": 2.554838709677419,
"grad_norm": 0.09310358146453943,
"learning_rate": 1.2281739759740574e-05,
"loss": 0.3214,
"step": 198
},
{
"epoch": 2.567741935483871,
"grad_norm": 0.09212645063531479,
"learning_rate": 1.1563146383342772e-05,
"loss": 0.3154,
"step": 199
},
{
"epoch": 2.5806451612903225,
"grad_norm": 0.09533681862557382,
"learning_rate": 1.0864922677324618e-05,
"loss": 0.319,
"step": 200
},
{
"epoch": 2.5935483870967744,
"grad_norm": 0.09551418366783314,
"learning_rate": 1.01872294636304e-05,
"loss": 0.3333,
"step": 201
},
{
"epoch": 2.606451612903226,
"grad_norm": 0.08930212325894361,
"learning_rate": 9.530222835413738e-06,
"loss": 0.3048,
"step": 202
},
{
"epoch": 2.6193548387096772,
"grad_norm": 0.09220378121771236,
"learning_rate": 8.894054121084838e-06,
"loss": 0.3146,
"step": 203
},
{
"epoch": 2.632258064516129,
"grad_norm": 0.09150774720724307,
"learning_rate": 8.278869849454718e-06,
"loss": 0.3311,
"step": 204
},
{
"epoch": 2.6451612903225805,
"grad_norm": 0.09261513270619316,
"learning_rate": 7.684811715985429e-06,
"loss": 0.3172,
"step": 205
},
{
"epoch": 2.6580645161290324,
"grad_norm": 0.0941004102909483,
"learning_rate": 7.1120165501533e-06,
"loss": 0.3347,
"step": 206
},
{
"epoch": 2.670967741935484,
"grad_norm": 0.08707518610128166,
"learning_rate": 6.560616283932897e-06,
"loss": 0.3116,
"step": 207
},
{
"epoch": 2.6838709677419352,
"grad_norm": 0.08648707636296159,
"learning_rate": 6.030737921409169e-06,
"loss": 0.3144,
"step": 208
},
{
"epoch": 2.696774193548387,
"grad_norm": 0.09169150101119816,
"learning_rate": 5.52250350952459e-06,
"loss": 0.3255,
"step": 209
},
{
"epoch": 2.709677419354839,
"grad_norm": 0.09060072523264334,
"learning_rate": 5.036030109968082e-06,
"loss": 0.3183,
"step": 210
},
{
"epoch": 2.7225806451612904,
"grad_norm": 0.09077216490604942,
"learning_rate": 4.5714297722121106e-06,
"loss": 0.321,
"step": 211
},
{
"epoch": 2.735483870967742,
"grad_norm": 0.09088968433443333,
"learning_rate": 4.128809507704445e-06,
"loss": 0.3172,
"step": 212
},
{
"epoch": 2.7483870967741937,
"grad_norm": 0.09191902683388614,
"learning_rate": 3.7082712652200867e-06,
"loss": 0.3261,
"step": 213
},
{
"epoch": 2.761290322580645,
"grad_norm": 0.08843215800144302,
"learning_rate": 3.3099119073793928e-06,
"loss": 0.3158,
"step": 214
},
{
"epoch": 2.774193548387097,
"grad_norm": 0.09079938334868655,
"learning_rate": 2.9338231883378366e-06,
"loss": 0.3178,
"step": 215
},
{
"epoch": 2.7870967741935484,
"grad_norm": 0.09122789808454786,
"learning_rate": 2.580091732652101e-06,
"loss": 0.3282,
"step": 216
},
{
"epoch": 2.8,
"grad_norm": 0.09380292374109117,
"learning_rate": 2.248799015327907e-06,
"loss": 0.3359,
"step": 217
},
{
"epoch": 2.8129032258064517,
"grad_norm": 0.09035917420929797,
"learning_rate": 1.9400213430538773e-06,
"loss": 0.3169,
"step": 218
},
{
"epoch": 2.825806451612903,
"grad_norm": 0.09195121657817087,
"learning_rate": 1.6538298366257976e-06,
"loss": 0.3314,
"step": 219
},
{
"epoch": 2.838709677419355,
"grad_norm": 0.09166102367139951,
"learning_rate": 1.3902904145653096e-06,
"loss": 0.3258,
"step": 220
},
{
"epoch": 2.8516129032258064,
"grad_norm": 0.0921992572010057,
"learning_rate": 1.1494637779369766e-06,
"loss": 0.3298,
"step": 221
},
{
"epoch": 2.864516129032258,
"grad_norm": 0.09068261067988724,
"learning_rate": 9.314053963669245e-07,
"loss": 0.3214,
"step": 222
},
{
"epoch": 2.8774193548387097,
"grad_norm": 0.09417924199778298,
"learning_rate": 7.361654952665609e-07,
"loss": 0.3134,
"step": 223
},
{
"epoch": 2.8903225806451616,
"grad_norm": 0.0901765977296441,
"learning_rate": 5.637890442641402e-07,
"loss": 0.3221,
"step": 224
},
{
"epoch": 2.903225806451613,
"grad_norm": 0.09094506589085496,
"learning_rate": 4.143157468468717e-07,
"loss": 0.3128,
"step": 225
},
{
"epoch": 2.9161290322580644,
"grad_norm": 0.08772549933058231,
"learning_rate": 2.877800312160783e-07,
"loss": 0.3248,
"step": 226
},
{
"epoch": 2.9290322580645163,
"grad_norm": 0.09191883931659987,
"learning_rate": 1.8421104235727405e-07,
"loss": 0.3114,
"step": 227
},
{
"epoch": 2.9419354838709677,
"grad_norm": 0.08876137430429,
"learning_rate": 1.0363263532724432e-07,
"loss": 0.3127,
"step": 228
},
{
"epoch": 2.9548387096774196,
"grad_norm": 0.09157045134043748,
"learning_rate": 4.606336975948589e-08,
"loss": 0.3275,
"step": 229
},
{
"epoch": 2.967741935483871,
"grad_norm": 0.08940213355520302,
"learning_rate": 1.1516505589381776e-08,
"loss": 0.3246,
"step": 230
},
{
"epoch": 2.9806451612903224,
"grad_norm": 0.0895898052255747,
"learning_rate": 0.0,
"loss": 0.3079,
"step": 231
},
{
"epoch": 2.9806451612903224,
"eval_loss": 0.3507891595363617,
"eval_runtime": 36.0777,
"eval_samples_per_second": 28.799,
"eval_steps_per_second": 0.915,
"step": 231
},
{
"epoch": 2.9806451612903224,
"step": 231,
"total_flos": 9.324729662937498e+16,
"train_loss": 0.3951803825118325,
"train_runtime": 2997.4381,
"train_samples_per_second": 9.871,
"train_steps_per_second": 0.077
}
],
"logging_steps": 1,
"max_steps": 231,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.324729662937498e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}