{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9806451612903224,
"eval_steps": 500,
"global_step": 231,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012903225806451613,
"grad_norm": 0.882150089808769,
"learning_rate": 8.333333333333334e-06,
"loss": 1.3191,
"step": 1
},
{
"epoch": 0.025806451612903226,
"grad_norm": 0.8369153094823952,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.249,
"step": 2
},
{
"epoch": 0.03870967741935484,
"grad_norm": 0.8525103918091212,
"learning_rate": 2.5e-05,
"loss": 1.2775,
"step": 3
},
{
"epoch": 0.05161290322580645,
"grad_norm": 0.8113130093304075,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.2577,
"step": 4
},
{
"epoch": 0.06451612903225806,
"grad_norm": 0.7691226782403744,
"learning_rate": 4.166666666666667e-05,
"loss": 1.2275,
"step": 5
},
{
"epoch": 0.07741935483870968,
"grad_norm": 0.5954210054804412,
"learning_rate": 5e-05,
"loss": 1.1159,
"step": 6
},
{
"epoch": 0.09032258064516129,
"grad_norm": 0.48189256930049384,
"learning_rate": 5.833333333333334e-05,
"loss": 1.0593,
"step": 7
},
{
"epoch": 0.1032258064516129,
"grad_norm": 0.5241879927945232,
"learning_rate": 6.666666666666667e-05,
"loss": 1.0031,
"step": 8
},
{
"epoch": 0.11612903225806452,
"grad_norm": 0.5751865259411146,
"learning_rate": 7.500000000000001e-05,
"loss": 0.9263,
"step": 9
},
{
"epoch": 0.12903225806451613,
"grad_norm": 0.5686526755807603,
"learning_rate": 8.333333333333334e-05,
"loss": 0.8146,
"step": 10
},
{
"epoch": 0.14193548387096774,
"grad_norm": 0.5156906474251192,
"learning_rate": 9.166666666666667e-05,
"loss": 0.7583,
"step": 11
},
{
"epoch": 0.15483870967741936,
"grad_norm": 0.4901634328534619,
"learning_rate": 0.0001,
"loss": 0.6686,
"step": 12
},
{
"epoch": 0.16774193548387098,
"grad_norm": 0.376084270046461,
"learning_rate": 0.00010833333333333333,
"loss": 0.6005,
"step": 13
},
{
"epoch": 0.18064516129032257,
"grad_norm": 0.2761318809240614,
"learning_rate": 0.00011666666666666668,
"loss": 0.5741,
"step": 14
},
{
"epoch": 0.1935483870967742,
"grad_norm": 0.25038763704461725,
"learning_rate": 0.000125,
"loss": 0.5465,
"step": 15
},
{
"epoch": 0.2064516129032258,
"grad_norm": 0.2214903977106201,
"learning_rate": 0.00013333333333333334,
"loss": 0.5138,
"step": 16
},
{
"epoch": 0.21935483870967742,
"grad_norm": 0.28905541505099525,
"learning_rate": 0.00014166666666666668,
"loss": 0.5247,
"step": 17
},
{
"epoch": 0.23225806451612904,
"grad_norm": 0.20699066633757193,
"learning_rate": 0.00015000000000000001,
"loss": 0.4978,
"step": 18
},
{
"epoch": 0.24516129032258063,
"grad_norm": 0.219457528851344,
"learning_rate": 0.00015833333333333332,
"loss": 0.4924,
"step": 19
},
{
"epoch": 0.25806451612903225,
"grad_norm": 0.16596853789220767,
"learning_rate": 0.0001666666666666667,
"loss": 0.4759,
"step": 20
},
{
"epoch": 0.2709677419354839,
"grad_norm": 0.13228412371333673,
"learning_rate": 0.000175,
"loss": 0.4613,
"step": 21
},
{
"epoch": 0.2838709677419355,
"grad_norm": 0.1421107856190867,
"learning_rate": 0.00018333333333333334,
"loss": 0.4852,
"step": 22
},
{
"epoch": 0.2967741935483871,
"grad_norm": 0.12552928984887968,
"learning_rate": 0.00019166666666666667,
"loss": 0.4786,
"step": 23
},
{
"epoch": 0.3096774193548387,
"grad_norm": 0.11489463060846784,
"learning_rate": 0.0002,
"loss": 0.4532,
"step": 24
},
{
"epoch": 0.3225806451612903,
"grad_norm": 0.11476879539402507,
"learning_rate": 0.00019998848349441062,
"loss": 0.4454,
"step": 25
},
{
"epoch": 0.33548387096774196,
"grad_norm": 0.1256602270101812,
"learning_rate": 0.00019995393663024054,
"loss": 0.4513,
"step": 26
},
{
"epoch": 0.34838709677419355,
"grad_norm": 0.11833482485698336,
"learning_rate": 0.00019989636736467278,
"loss": 0.44,
"step": 27
},
{
"epoch": 0.36129032258064514,
"grad_norm": 0.11124019681377781,
"learning_rate": 0.00019981578895764273,
"loss": 0.4439,
"step": 28
},
{
"epoch": 0.3741935483870968,
"grad_norm": 0.10954971384477814,
"learning_rate": 0.00019971221996878394,
"loss": 0.4274,
"step": 29
},
{
"epoch": 0.3870967741935484,
"grad_norm": 0.11422715129880294,
"learning_rate": 0.00019958568425315314,
"loss": 0.4254,
"step": 30
},
{
"epoch": 0.4,
"grad_norm": 0.11262310014016527,
"learning_rate": 0.00019943621095573586,
"loss": 0.4204,
"step": 31
},
{
"epoch": 0.4129032258064516,
"grad_norm": 0.11143099554463408,
"learning_rate": 0.00019926383450473344,
"loss": 0.4105,
"step": 32
},
{
"epoch": 0.4258064516129032,
"grad_norm": 0.1088260973247734,
"learning_rate": 0.00019906859460363307,
"loss": 0.4136,
"step": 33
},
{
"epoch": 0.43870967741935485,
"grad_norm": 0.10400753996611788,
"learning_rate": 0.00019885053622206304,
"loss": 0.4213,
"step": 34
},
{
"epoch": 0.45161290322580644,
"grad_norm": 0.09587900896302251,
"learning_rate": 0.0001986097095854347,
"loss": 0.4085,
"step": 35
},
{
"epoch": 0.4645161290322581,
"grad_norm": 0.10119603747308556,
"learning_rate": 0.0001983461701633742,
"loss": 0.4181,
"step": 36
},
{
"epoch": 0.4774193548387097,
"grad_norm": 0.10062413136253176,
"learning_rate": 0.00019805997865694614,
"loss": 0.4098,
"step": 37
},
{
"epoch": 0.49032258064516127,
"grad_norm": 0.09162394941720846,
"learning_rate": 0.0001977512009846721,
"loss": 0.4085,
"step": 38
},
{
"epoch": 0.5032258064516129,
"grad_norm": 0.09269316443279575,
"learning_rate": 0.00019741990826734794,
"loss": 0.3994,
"step": 39
},
{
"epoch": 0.5161290322580645,
"grad_norm": 0.08782581803238095,
"learning_rate": 0.00019706617681166218,
"loss": 0.3983,
"step": 40
},
{
"epoch": 0.5290322580645161,
"grad_norm": 0.08665646987756218,
"learning_rate": 0.00019669008809262062,
"loss": 0.3938,
"step": 41
},
{
"epoch": 0.5419354838709678,
"grad_norm": 0.09289388957990503,
"learning_rate": 0.00019629172873477995,
"loss": 0.396,
"step": 42
},
{
"epoch": 0.5548387096774193,
"grad_norm": 0.09203344649472522,
"learning_rate": 0.00019587119049229557,
"loss": 0.4052,
"step": 43
},
{
"epoch": 0.567741935483871,
"grad_norm": 0.08209774194723368,
"learning_rate": 0.0001954285702277879,
"loss": 0.3959,
"step": 44
},
{
"epoch": 0.5806451612903226,
"grad_norm": 0.08595872863630391,
"learning_rate": 0.00019496396989003193,
"loss": 0.397,
"step": 45
},
{
"epoch": 0.5935483870967742,
"grad_norm": 0.09041908237644536,
"learning_rate": 0.00019447749649047542,
"loss": 0.3992,
"step": 46
},
{
"epoch": 0.6064516129032258,
"grad_norm": 0.08321976348844515,
"learning_rate": 0.00019396926207859084,
"loss": 0.4095,
"step": 47
},
{
"epoch": 0.6193548387096774,
"grad_norm": 0.07887604040253807,
"learning_rate": 0.00019343938371606712,
"loss": 0.3866,
"step": 48
},
{
"epoch": 0.632258064516129,
"grad_norm": 0.08329265943906447,
"learning_rate": 0.00019288798344984672,
"loss": 0.3985,
"step": 49
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.08661703211305888,
"learning_rate": 0.00019231518828401458,
"loss": 0.3925,
"step": 50
},
{
"epoch": 0.6580645161290323,
"grad_norm": 0.08382217550700771,
"learning_rate": 0.00019172113015054532,
"loss": 0.3862,
"step": 51
},
{
"epoch": 0.6709677419354839,
"grad_norm": 0.08245124856491458,
"learning_rate": 0.00019110594587891519,
"loss": 0.3847,
"step": 52
},
{
"epoch": 0.6838709677419355,
"grad_norm": 0.08319716279149986,
"learning_rate": 0.00019046977716458626,
"loss": 0.3775,
"step": 53
},
{
"epoch": 0.6967741935483871,
"grad_norm": 0.08074648144423298,
"learning_rate": 0.0001898127705363696,
"loss": 0.3786,
"step": 54
},
{
"epoch": 0.7096774193548387,
"grad_norm": 0.08472762376284584,
"learning_rate": 0.0001891350773226754,
"loss": 0.3923,
"step": 55
},
{
"epoch": 0.7225806451612903,
"grad_norm": 0.08398076059437376,
"learning_rate": 0.00018843685361665723,
"loss": 0.3709,
"step": 56
},
{
"epoch": 0.7354838709677419,
"grad_norm": 0.08465216102770419,
"learning_rate": 0.00018771826024025946,
"loss": 0.3818,
"step": 57
},
{
"epoch": 0.7483870967741936,
"grad_norm": 0.09145572810056589,
"learning_rate": 0.00018697946270717467,
"loss": 0.39,
"step": 58
},
{
"epoch": 0.7612903225806451,
"grad_norm": 0.08415188367023674,
"learning_rate": 0.00018622063118472134,
"loss": 0.3733,
"step": 59
},
{
"epoch": 0.7741935483870968,
"grad_norm": 0.08576290382509591,
"learning_rate": 0.00018544194045464886,
"loss": 0.3878,
"step": 60
},
{
"epoch": 0.7870967741935484,
"grad_norm": 0.0844142047859298,
"learning_rate": 0.00018464356987288013,
"loss": 0.3637,
"step": 61
},
{
"epoch": 0.8,
"grad_norm": 0.08918487261557899,
"learning_rate": 0.00018382570332820043,
"loss": 0.3775,
"step": 62
},
{
"epoch": 0.8129032258064516,
"grad_norm": 0.0795181880669878,
"learning_rate": 0.00018298852919990252,
"loss": 0.3853,
"step": 63
},
{
"epoch": 0.8258064516129032,
"grad_norm": 0.08173055996583302,
"learning_rate": 0.0001821322403143969,
"loss": 0.38,
"step": 64
},
{
"epoch": 0.8387096774193549,
"grad_norm": 0.08525070031165603,
"learning_rate": 0.0001812570339007983,
"loss": 0.3778,
"step": 65
},
{
"epoch": 0.8516129032258064,
"grad_norm": 0.08531235204546653,
"learning_rate": 0.00018036311154549784,
"loss": 0.3727,
"step": 66
},
{
"epoch": 0.864516129032258,
"grad_norm": 0.08169851479895494,
"learning_rate": 0.00017945067914573146,
"loss": 0.365,
"step": 67
},
{
"epoch": 0.8774193548387097,
"grad_norm": 0.08463789046916101,
"learning_rate": 0.0001785199468621559,
"loss": 0.3752,
"step": 68
},
{
"epoch": 0.8903225806451613,
"grad_norm": 0.09441843624235378,
"learning_rate": 0.000177571129070442,
"loss": 0.3665,
"step": 69
},
{
"epoch": 0.9032258064516129,
"grad_norm": 0.08530939476149231,
"learning_rate": 0.0001766044443118978,
"loss": 0.3926,
"step": 70
},
{
"epoch": 0.9161290322580645,
"grad_norm": 0.0836606457284625,
"learning_rate": 0.00017562011524313185,
"loss": 0.3844,
"step": 71
},
{
"epoch": 0.9290322580645162,
"grad_norm": 0.09868625782773943,
"learning_rate": 0.00017461836858476856,
"loss": 0.3835,
"step": 72
},
{
"epoch": 0.9419354838709677,
"grad_norm": 0.082132336261239,
"learning_rate": 0.00017359943506922774,
"loss": 0.3792,
"step": 73
},
{
"epoch": 0.9548387096774194,
"grad_norm": 0.08948965393301354,
"learning_rate": 0.0001725635493875799,
"loss": 0.3813,
"step": 74
},
{
"epoch": 0.967741935483871,
"grad_norm": 0.08539410389371488,
"learning_rate": 0.00017151095013548994,
"loss": 0.3774,
"step": 75
},
{
"epoch": 0.9806451612903225,
"grad_norm": 0.08690404790165682,
"learning_rate": 0.00017044187975826124,
"loss": 0.3762,
"step": 76
},
{
"epoch": 0.9935483870967742,
"grad_norm": 0.09039522496805455,
"learning_rate": 0.0001693565844949933,
"loss": 0.3733,
"step": 77
},
{
"epoch": 0.9935483870967742,
"eval_loss": 0.3743511736392975,
"eval_runtime": 42.1339,
"eval_samples_per_second": 24.66,
"eval_steps_per_second": 0.783,
"step": 77
},
{
"epoch": 1.0064516129032257,
"grad_norm": 0.09165665911792642,
"learning_rate": 0.00016825531432186543,
"loss": 0.3532,
"step": 78
},
{
"epoch": 1.0193548387096774,
"grad_norm": 0.0801922544260219,
"learning_rate": 0.0001671383228945597,
"loss": 0.347,
"step": 79
},
{
"epoch": 1.032258064516129,
"grad_norm": 0.08352186065175837,
"learning_rate": 0.00016600586748983641,
"loss": 0.3566,
"step": 80
},
{
"epoch": 1.0451612903225806,
"grad_norm": 0.08793176795367076,
"learning_rate": 0.0001648582089462756,
"loss": 0.3473,
"step": 81
},
{
"epoch": 1.0580645161290323,
"grad_norm": 0.08913951531063671,
"learning_rate": 0.00016369561160419784,
"loss": 0.342,
"step": 82
},
{
"epoch": 1.070967741935484,
"grad_norm": 0.08309712335786672,
"learning_rate": 0.0001625183432447789,
"loss": 0.345,
"step": 83
},
{
"epoch": 1.0838709677419356,
"grad_norm": 0.08725330804483407,
"learning_rate": 0.00016132667502837165,
"loss": 0.3523,
"step": 84
},
{
"epoch": 1.096774193548387,
"grad_norm": 0.08680862762413778,
"learning_rate": 0.00016012088143204953,
"loss": 0.3554,
"step": 85
},
{
"epoch": 1.1096774193548387,
"grad_norm": 0.0863782848559528,
"learning_rate": 0.00015890124018638638,
"loss": 0.364,
"step": 86
},
{
"epoch": 1.1225806451612903,
"grad_norm": 0.08388848992116194,
"learning_rate": 0.00015766803221148673,
"loss": 0.3568,
"step": 87
},
{
"epoch": 1.135483870967742,
"grad_norm": 0.08226994751114965,
"learning_rate": 0.00015642154155228122,
"loss": 0.3489,
"step": 88
},
{
"epoch": 1.1483870967741936,
"grad_norm": 0.08575965994905438,
"learning_rate": 0.00015516205531310273,
"loss": 0.3466,
"step": 89
},
{
"epoch": 1.1612903225806452,
"grad_norm": 0.0895747440427046,
"learning_rate": 0.00015388986359155758,
"loss": 0.3488,
"step": 90
},
{
"epoch": 1.1741935483870969,
"grad_norm": 0.08403222320010312,
"learning_rate": 0.00015260525941170712,
"loss": 0.356,
"step": 91
},
{
"epoch": 1.1870967741935483,
"grad_norm": 0.08627434364043794,
"learning_rate": 0.0001513085386565758,
"loss": 0.3519,
"step": 92
},
{
"epoch": 1.2,
"grad_norm": 0.08925414655300028,
"learning_rate": 0.00015000000000000001,
"loss": 0.3523,
"step": 93
},
{
"epoch": 1.2129032258064516,
"grad_norm": 0.09120079741968923,
"learning_rate": 0.00014867994483783485,
"loss": 0.3555,
"step": 94
},
{
"epoch": 1.2258064516129032,
"grad_norm": 0.08519037826685563,
"learning_rate": 0.0001473486772185334,
"loss": 0.3551,
"step": 95
},
{
"epoch": 1.238709677419355,
"grad_norm": 0.08814591743170447,
"learning_rate": 0.00014600650377311522,
"loss": 0.3535,
"step": 96
},
{
"epoch": 1.2516129032258063,
"grad_norm": 0.08812877093082108,
"learning_rate": 0.00014465373364454001,
"loss": 0.3498,
"step": 97
},
{
"epoch": 1.2645161290322582,
"grad_norm": 0.08596197743921638,
"learning_rate": 0.00014329067841650274,
"loss": 0.3484,
"step": 98
},
{
"epoch": 1.2774193548387096,
"grad_norm": 0.09025513346881896,
"learning_rate": 0.00014191765204166643,
"loss": 0.3465,
"step": 99
},
{
"epoch": 1.2903225806451613,
"grad_norm": 0.08665409616008209,
"learning_rate": 0.00014053497076934948,
"loss": 0.35,
"step": 100
},
{
"epoch": 1.303225806451613,
"grad_norm": 0.09012608398761074,
"learning_rate": 0.00013914295307268396,
"loss": 0.3516,
"step": 101
},
{
"epoch": 1.3161290322580645,
"grad_norm": 0.09456407877563842,
"learning_rate": 0.00013774191957526143,
"loss": 0.3639,
"step": 102
},
{
"epoch": 1.3290322580645162,
"grad_norm": 0.0888376260234129,
"learning_rate": 0.00013633219297728416,
"loss": 0.3396,
"step": 103
},
{
"epoch": 1.3419354838709676,
"grad_norm": 0.08652600639054038,
"learning_rate": 0.00013491409798123687,
"loss": 0.3445,
"step": 104
},
{
"epoch": 1.3548387096774195,
"grad_norm": 0.09269194410505097,
"learning_rate": 0.00013348796121709862,
"loss": 0.3555,
"step": 105
},
{
"epoch": 1.367741935483871,
"grad_norm": 0.09421096011594207,
"learning_rate": 0.00013205411116710972,
"loss": 0.3508,
"step": 106
},
{
"epoch": 1.3806451612903226,
"grad_norm": 0.09286783444235318,
"learning_rate": 0.00013061287809011242,
"loss": 0.3571,
"step": 107
},
{
"epoch": 1.3935483870967742,
"grad_norm": 0.08172852976047028,
"learning_rate": 0.0001291645939454825,
"loss": 0.3488,
"step": 108
},
{
"epoch": 1.4064516129032258,
"grad_norm": 0.09033973727962885,
"learning_rate": 0.0001277095923166689,
"loss": 0.3498,
"step": 109
},
{
"epoch": 1.4193548387096775,
"grad_norm": 0.09628933362833343,
"learning_rate": 0.00012624820833435937,
"loss": 0.3472,
"step": 110
},
{
"epoch": 1.432258064516129,
"grad_norm": 0.08471497514674803,
"learning_rate": 0.00012478077859929,
"loss": 0.3353,
"step": 111
},
{
"epoch": 1.4451612903225808,
"grad_norm": 0.08976133324522119,
"learning_rate": 0.00012330764110471566,
"loss": 0.3468,
"step": 112
},
{
"epoch": 1.4580645161290322,
"grad_norm": 0.09634877556737409,
"learning_rate": 0.00012182913515856015,
"loss": 0.3541,
"step": 113
},
{
"epoch": 1.4709677419354839,
"grad_norm": 0.09348923296138459,
"learning_rate": 0.0001203456013052634,
"loss": 0.3521,
"step": 114
},
{
"epoch": 1.4838709677419355,
"grad_norm": 0.09437711091684706,
"learning_rate": 0.00011885738124734358,
"loss": 0.3566,
"step": 115
},
{
"epoch": 1.4967741935483871,
"grad_norm": 0.08916702937111011,
"learning_rate": 0.00011736481776669306,
"loss": 0.3458,
"step": 116
},
{
"epoch": 1.5096774193548388,
"grad_norm": 0.09100601467580355,
"learning_rate": 0.00011586825464562514,
"loss": 0.3593,
"step": 117
},
{
"epoch": 1.5225806451612902,
"grad_norm": 0.08990470683690902,
"learning_rate": 0.00011436803658769082,
"loss": 0.3434,
"step": 118
},
{
"epoch": 1.535483870967742,
"grad_norm": 0.0932653393737011,
"learning_rate": 0.00011286450913828312,
"loss": 0.342,
"step": 119
},
{
"epoch": 1.5483870967741935,
"grad_norm": 0.08960531773257623,
"learning_rate": 0.00011135801860504749,
"loss": 0.3628,
"step": 120
},
{
"epoch": 1.5612903225806452,
"grad_norm": 0.09275069273094473,
"learning_rate": 0.00010984891197811687,
"loss": 0.3513,
"step": 121
},
{
"epoch": 1.5741935483870968,
"grad_norm": 0.09527469311088294,
"learning_rate": 0.00010833753685018935,
"loss": 0.3556,
"step": 122
},
{
"epoch": 1.5870967741935482,
"grad_norm": 0.09323849659154124,
"learning_rate": 0.0001068242413364671,
"loss": 0.3448,
"step": 123
},
{
"epoch": 1.6,
"grad_norm": 0.08474554028292876,
"learning_rate": 0.00010530937399447496,
"loss": 0.3499,
"step": 124
},
{
"epoch": 1.6129032258064515,
"grad_norm": 0.09382059811382143,
"learning_rate": 0.00010379328374377715,
"loss": 0.3384,
"step": 125
},
{
"epoch": 1.6258064516129034,
"grad_norm": 0.09276702527842776,
"learning_rate": 0.00010227631978561056,
"loss": 0.3444,
"step": 126
},
{
"epoch": 1.6387096774193548,
"grad_norm": 0.08750152088472078,
"learning_rate": 0.00010075883152245334,
"loss": 0.3569,
"step": 127
},
{
"epoch": 1.6516129032258065,
"grad_norm": 0.08714445180642569,
"learning_rate": 9.92411684775467e-05,
"loss": 0.342,
"step": 128
},
{
"epoch": 1.664516129032258,
"grad_norm": 0.08469902272466831,
"learning_rate": 9.772368021438943e-05,
"loss": 0.3342,
"step": 129
},
{
"epoch": 1.6774193548387095,
"grad_norm": 0.08724585745005611,
"learning_rate": 9.620671625622288e-05,
"loss": 0.3335,
"step": 130
},
{
"epoch": 1.6903225806451614,
"grad_norm": 0.09087336723016343,
"learning_rate": 9.469062600552509e-05,
"loss": 0.3447,
"step": 131
},
{
"epoch": 1.7032258064516128,
"grad_norm": 0.08863278083042062,
"learning_rate": 9.317575866353292e-05,
"loss": 0.3487,
"step": 132
},
{
"epoch": 1.7161290322580647,
"grad_norm": 0.08343459715762,
"learning_rate": 9.166246314981066e-05,
"loss": 0.3454,
"step": 133
},
{
"epoch": 1.729032258064516,
"grad_norm": 0.08837483796029806,
"learning_rate": 9.015108802188313e-05,
"loss": 0.3484,
"step": 134
},
{
"epoch": 1.7419354838709677,
"grad_norm": 0.08762249376974672,
"learning_rate": 8.86419813949525e-05,
"loss": 0.3447,
"step": 135
},
{
"epoch": 1.7548387096774194,
"grad_norm": 0.08446853010895118,
"learning_rate": 8.713549086171691e-05,
"loss": 0.3466,
"step": 136
},
{
"epoch": 1.7677419354838708,
"grad_norm": 0.08897676787603495,
"learning_rate": 8.563196341230919e-05,
"loss": 0.3434,
"step": 137
},
{
"epoch": 1.7806451612903227,
"grad_norm": 0.09210810174866911,
"learning_rate": 8.413174535437487e-05,
"loss": 0.355,
"step": 138
},
{
"epoch": 1.793548387096774,
"grad_norm": 0.0877098792555575,
"learning_rate": 8.263518223330697e-05,
"loss": 0.3392,
"step": 139
},
{
"epoch": 1.8064516129032258,
"grad_norm": 0.09059259587839792,
"learning_rate": 8.114261875265643e-05,
"loss": 0.3465,
"step": 140
},
{
"epoch": 1.8193548387096774,
"grad_norm": 0.09043152099082513,
"learning_rate": 7.965439869473664e-05,
"loss": 0.3409,
"step": 141
},
{
"epoch": 1.832258064516129,
"grad_norm": 0.08863483273837267,
"learning_rate": 7.817086484143986e-05,
"loss": 0.3497,
"step": 142
},
{
"epoch": 1.8451612903225807,
"grad_norm": 0.08351509862847174,
"learning_rate": 7.669235889528436e-05,
"loss": 0.3484,
"step": 143
},
{
"epoch": 1.8580645161290321,
"grad_norm": 0.08881689002413959,
"learning_rate": 7.521922140071002e-05,
"loss": 0.3428,
"step": 144
},
{
"epoch": 1.870967741935484,
"grad_norm": 0.08962413300366581,
"learning_rate": 7.375179166564063e-05,
"loss": 0.3353,
"step": 145
},
{
"epoch": 1.8838709677419354,
"grad_norm": 0.08991947191225944,
"learning_rate": 7.229040768333115e-05,
"loss": 0.3366,
"step": 146
},
{
"epoch": 1.896774193548387,
"grad_norm": 0.0890545628104281,
"learning_rate": 7.08354060545175e-05,
"loss": 0.3381,
"step": 147
},
{
"epoch": 1.9096774193548387,
"grad_norm": 0.09306016588414409,
"learning_rate": 6.93871219098876e-05,
"loss": 0.3356,
"step": 148
},
{
"epoch": 1.9225806451612903,
"grad_norm": 0.08816048934545212,
"learning_rate": 6.79458888328903e-05,
"loss": 0.3412,
"step": 149
},
{
"epoch": 1.935483870967742,
"grad_norm": 0.09006593042575502,
"learning_rate": 6.651203878290139e-05,
"loss": 0.3471,
"step": 150
},
{
"epoch": 1.9483870967741934,
"grad_norm": 0.08499237638300171,
"learning_rate": 6.508590201876317e-05,
"loss": 0.335,
"step": 151
},
{
"epoch": 1.9612903225806453,
"grad_norm": 0.09566747308379261,
"learning_rate": 6.366780702271589e-05,
"loss": 0.3395,
"step": 152
},
{
"epoch": 1.9741935483870967,
"grad_norm": 0.0915253754596643,
"learning_rate": 6.225808042473858e-05,
"loss": 0.3488,
"step": 153
},
{
"epoch": 1.9870967741935484,
"grad_norm": 0.08657357278603872,
"learning_rate": 6.085704692731609e-05,
"loss": 0.3344,
"step": 154
},
{
"epoch": 2.0,
"grad_norm": 0.08950726731743963,
"learning_rate": 5.9465029230650534e-05,
"loss": 0.33,
"step": 155
},
{
"epoch": 2.0,
"eval_loss": 0.35439133644104004,
"eval_runtime": 36.1469,
"eval_samples_per_second": 28.744,
"eval_steps_per_second": 0.913,
"step": 155
},
{
"epoch": 2.0129032258064514,
"grad_norm": 0.08961232668946545,
"learning_rate": 5.8082347958333625e-05,
"loss": 0.3273,
"step": 156
},
{
"epoch": 2.0258064516129033,
"grad_norm": 0.09402916213349197,
"learning_rate": 5.670932158349731e-05,
"loss": 0.3218,
"step": 157
},
{
"epoch": 2.0387096774193547,
"grad_norm": 0.08520247695821515,
"learning_rate": 5.5346266355459995e-05,
"loss": 0.3089,
"step": 158
},
{
"epoch": 2.0516129032258066,
"grad_norm": 0.08637288183919145,
"learning_rate": 5.399349622688479e-05,
"loss": 0.3266,
"step": 159
},
{
"epoch": 2.064516129032258,
"grad_norm": 0.08823864345930746,
"learning_rate": 5.26513227814666e-05,
"loss": 0.329,
"step": 160
},
{
"epoch": 2.07741935483871,
"grad_norm": 0.09384371931382793,
"learning_rate": 5.1320055162165115e-05,
"loss": 0.3275,
"step": 161
},
{
"epoch": 2.0903225806451613,
"grad_norm": 0.09516405744887674,
"learning_rate": 5.000000000000002e-05,
"loss": 0.332,
"step": 162
},
{
"epoch": 2.1032258064516127,
"grad_norm": 0.08966279182804247,
"learning_rate": 4.869146134342426e-05,
"loss": 0.3247,
"step": 163
},
{
"epoch": 2.1161290322580646,
"grad_norm": 0.08700940402163973,
"learning_rate": 4.739474058829289e-05,
"loss": 0.3221,
"step": 164
},
{
"epoch": 2.129032258064516,
"grad_norm": 0.08984677102800173,
"learning_rate": 4.611013640844245e-05,
"loss": 0.3272,
"step": 165
},
{
"epoch": 2.141935483870968,
"grad_norm": 0.08964202186304891,
"learning_rate": 4.483794468689728e-05,
"loss": 0.3188,
"step": 166
},
{
"epoch": 2.1548387096774193,
"grad_norm": 0.09997697429798251,
"learning_rate": 4.357845844771881e-05,
"loss": 0.3383,
"step": 167
},
{
"epoch": 2.167741935483871,
"grad_norm": 0.09510073376177604,
"learning_rate": 4.2331967788513295e-05,
"loss": 0.3252,
"step": 168
},
{
"epoch": 2.1806451612903226,
"grad_norm": 0.09107612709336496,
"learning_rate": 4.109875981361363e-05,
"loss": 0.3217,
"step": 169
},
{
"epoch": 2.193548387096774,
"grad_norm": 0.08804927379783276,
"learning_rate": 3.987911856795047e-05,
"loss": 0.3173,
"step": 170
},
{
"epoch": 2.206451612903226,
"grad_norm": 0.0916081059987062,
"learning_rate": 3.8673324971628357e-05,
"loss": 0.3285,
"step": 171
},
{
"epoch": 2.2193548387096773,
"grad_norm": 0.09226628432750343,
"learning_rate": 3.7481656755221125e-05,
"loss": 0.3154,
"step": 172
},
{
"epoch": 2.232258064516129,
"grad_norm": 0.09145015878266409,
"learning_rate": 3.630438839580217e-05,
"loss": 0.3087,
"step": 173
},
{
"epoch": 2.2451612903225806,
"grad_norm": 0.08786201399591659,
"learning_rate": 3.5141791053724405e-05,
"loss": 0.3151,
"step": 174
},
{
"epoch": 2.258064516129032,
"grad_norm": 0.09259402512083086,
"learning_rate": 3.399413251016359e-05,
"loss": 0.3369,
"step": 175
},
{
"epoch": 2.270967741935484,
"grad_norm": 0.09311260751337232,
"learning_rate": 3.2861677105440336e-05,
"loss": 0.3051,
"step": 176
},
{
"epoch": 2.2838709677419353,
"grad_norm": 0.09217712904693832,
"learning_rate": 3.174468567813461e-05,
"loss": 0.3199,
"step": 177
},
{
"epoch": 2.296774193548387,
"grad_norm": 0.09141877592974519,
"learning_rate": 3.0643415505006735e-05,
"loss": 0.3229,
"step": 178
},
{
"epoch": 2.3096774193548386,
"grad_norm": 0.09528833689903496,
"learning_rate": 2.9558120241738784e-05,
"loss": 0.3286,
"step": 179
},
{
"epoch": 2.3225806451612905,
"grad_norm": 0.09070636787107308,
"learning_rate": 2.8489049864510054e-05,
"loss": 0.3348,
"step": 180
},
{
"epoch": 2.335483870967742,
"grad_norm": 0.09307512327341362,
"learning_rate": 2.7436450612420095e-05,
"loss": 0.3256,
"step": 181
},
{
"epoch": 2.3483870967741938,
"grad_norm": 0.09127823479306682,
"learning_rate": 2.640056493077231e-05,
"loss": 0.3181,
"step": 182
},
{
"epoch": 2.361290322580645,
"grad_norm": 0.09246009256113925,
"learning_rate": 2.5381631415231454e-05,
"loss": 0.3391,
"step": 183
},
{
"epoch": 2.3741935483870966,
"grad_norm": 0.09095352379758655,
"learning_rate": 2.4379884756868167e-05,
"loss": 0.3172,
"step": 184
},
{
"epoch": 2.3870967741935485,
"grad_norm": 0.0926880163626768,
"learning_rate": 2.339555568810221e-05,
"loss": 0.3177,
"step": 185
},
{
"epoch": 2.4,
"grad_norm": 0.09094474131194094,
"learning_rate": 2.242887092955801e-05,
"loss": 0.3199,
"step": 186
},
{
"epoch": 2.412903225806452,
"grad_norm": 0.09106546035353981,
"learning_rate": 2.1480053137844115e-05,
"loss": 0.3222,
"step": 187
},
{
"epoch": 2.425806451612903,
"grad_norm": 0.08873018715134598,
"learning_rate": 2.054932085426856e-05,
"loss": 0.3118,
"step": 188
},
{
"epoch": 2.4387096774193546,
"grad_norm": 0.0932765377498955,
"learning_rate": 1.9636888454502178e-05,
"loss": 0.3358,
"step": 189
},
{
"epoch": 2.4516129032258065,
"grad_norm": 0.09181586534157822,
"learning_rate": 1.8742966099201697e-05,
"loss": 0.3157,
"step": 190
},
{
"epoch": 2.464516129032258,
"grad_norm": 0.0929486436457203,
"learning_rate": 1.7867759685603114e-05,
"loss": 0.3154,
"step": 191
},
{
"epoch": 2.47741935483871,
"grad_norm": 0.09188630220285351,
"learning_rate": 1.7011470800097496e-05,
"loss": 0.3181,
"step": 192
},
{
"epoch": 2.490322580645161,
"grad_norm": 0.09574286894431329,
"learning_rate": 1.6174296671799572e-05,
"loss": 0.3222,
"step": 193
},
{
"epoch": 2.5032258064516126,
"grad_norm": 0.09145354457132104,
"learning_rate": 1.5356430127119913e-05,
"loss": 0.3222,
"step": 194
},
{
"epoch": 2.5161290322580645,
"grad_norm": 0.09039580690260736,
"learning_rate": 1.4558059545351143e-05,
"loss": 0.324,
"step": 195
},
{
"epoch": 2.5290322580645164,
"grad_norm": 0.08979381831653434,
"learning_rate": 1.3779368815278647e-05,
"loss": 0.3107,
"step": 196
},
{
"epoch": 2.541935483870968,
"grad_norm": 0.09526292697431937,
"learning_rate": 1.302053729282533e-05,
"loss": 0.3219,
"step": 197
},
{
"epoch": 2.554838709677419,
"grad_norm": 0.09310358146453943,
"learning_rate": 1.2281739759740574e-05,
"loss": 0.3214,
"step": 198
},
{
"epoch": 2.567741935483871,
"grad_norm": 0.09212645063531479,
"learning_rate": 1.1563146383342772e-05,
"loss": 0.3154,
"step": 199
},
{
"epoch": 2.5806451612903225,
"grad_norm": 0.09533681862557382,
"learning_rate": 1.0864922677324618e-05,
"loss": 0.319,
"step": 200
},
{
"epoch": 2.5935483870967744,
"grad_norm": 0.09551418366783314,
"learning_rate": 1.01872294636304e-05,
"loss": 0.3333,
"step": 201
},
{
"epoch": 2.606451612903226,
"grad_norm": 0.08930212325894361,
"learning_rate": 9.530222835413738e-06,
"loss": 0.3048,
"step": 202
},
{
"epoch": 2.6193548387096772,
"grad_norm": 0.09220378121771236,
"learning_rate": 8.894054121084838e-06,
"loss": 0.3146,
"step": 203
},
{
"epoch": 2.632258064516129,
"grad_norm": 0.09150774720724307,
"learning_rate": 8.278869849454718e-06,
"loss": 0.3311,
"step": 204
},
{
"epoch": 2.6451612903225805,
"grad_norm": 0.09261513270619316,
"learning_rate": 7.684811715985429e-06,
"loss": 0.3172,
"step": 205
},
{
"epoch": 2.6580645161290324,
"grad_norm": 0.0941004102909483,
"learning_rate": 7.1120165501533e-06,
"loss": 0.3347,
"step": 206
},
{
"epoch": 2.670967741935484,
"grad_norm": 0.08707518610128166,
"learning_rate": 6.560616283932897e-06,
"loss": 0.3116,
"step": 207
},
{
"epoch": 2.6838709677419352,
"grad_norm": 0.08648707636296159,
"learning_rate": 6.030737921409169e-06,
"loss": 0.3144,
"step": 208
},
{
"epoch": 2.696774193548387,
"grad_norm": 0.09169150101119816,
"learning_rate": 5.52250350952459e-06,
"loss": 0.3255,
"step": 209
},
{
"epoch": 2.709677419354839,
"grad_norm": 0.09060072523264334,
"learning_rate": 5.036030109968082e-06,
"loss": 0.3183,
"step": 210
},
{
"epoch": 2.7225806451612904,
"grad_norm": 0.09077216490604942,
"learning_rate": 4.5714297722121106e-06,
"loss": 0.321,
"step": 211
},
{
"epoch": 2.735483870967742,
"grad_norm": 0.09088968433443333,
"learning_rate": 4.128809507704445e-06,
"loss": 0.3172,
"step": 212
},
{
"epoch": 2.7483870967741937,
"grad_norm": 0.09191902683388614,
"learning_rate": 3.7082712652200867e-06,
"loss": 0.3261,
"step": 213
},
{
"epoch": 2.761290322580645,
"grad_norm": 0.08843215800144302,
"learning_rate": 3.3099119073793928e-06,
"loss": 0.3158,
"step": 214
},
{
"epoch": 2.774193548387097,
"grad_norm": 0.09079938334868655,
"learning_rate": 2.9338231883378366e-06,
"loss": 0.3178,
"step": 215
},
{
"epoch": 2.7870967741935484,
"grad_norm": 0.09122789808454786,
"learning_rate": 2.580091732652101e-06,
"loss": 0.3282,
"step": 216
},
{
"epoch": 2.8,
"grad_norm": 0.09380292374109117,
"learning_rate": 2.248799015327907e-06,
"loss": 0.3359,
"step": 217
},
{
"epoch": 2.8129032258064517,
"grad_norm": 0.09035917420929797,
"learning_rate": 1.9400213430538773e-06,
"loss": 0.3169,
"step": 218
},
{
"epoch": 2.825806451612903,
"grad_norm": 0.09195121657817087,
"learning_rate": 1.6538298366257976e-06,
"loss": 0.3314,
"step": 219
},
{
"epoch": 2.838709677419355,
"grad_norm": 0.09166102367139951,
"learning_rate": 1.3902904145653096e-06,
"loss": 0.3258,
"step": 220
},
{
"epoch": 2.8516129032258064,
"grad_norm": 0.0921992572010057,
"learning_rate": 1.1494637779369766e-06,
"loss": 0.3298,
"step": 221
},
{
"epoch": 2.864516129032258,
"grad_norm": 0.09068261067988724,
"learning_rate": 9.314053963669245e-07,
"loss": 0.3214,
"step": 222
},
{
"epoch": 2.8774193548387097,
"grad_norm": 0.09417924199778298,
"learning_rate": 7.361654952665609e-07,
"loss": 0.3134,
"step": 223
},
{
"epoch": 2.8903225806451616,
"grad_norm": 0.0901765977296441,
"learning_rate": 5.637890442641402e-07,
"loss": 0.3221,
"step": 224
},
{
"epoch": 2.903225806451613,
"grad_norm": 0.09094506589085496,
"learning_rate": 4.143157468468717e-07,
"loss": 0.3128,
"step": 225
},
{
"epoch": 2.9161290322580644,
"grad_norm": 0.08772549933058231,
"learning_rate": 2.877800312160783e-07,
"loss": 0.3248,
"step": 226
},
{
"epoch": 2.9290322580645163,
"grad_norm": 0.09191883931659987,
"learning_rate": 1.8421104235727405e-07,
"loss": 0.3114,
"step": 227
},
{
"epoch": 2.9419354838709677,
"grad_norm": 0.08876137430429,
"learning_rate": 1.0363263532724432e-07,
"loss": 0.3127,
"step": 228
},
{
"epoch": 2.9548387096774196,
"grad_norm": 0.09157045134043748,
"learning_rate": 4.606336975948589e-08,
"loss": 0.3275,
"step": 229
},
{
"epoch": 2.967741935483871,
"grad_norm": 0.08940213355520302,
"learning_rate": 1.1516505589381776e-08,
"loss": 0.3246,
"step": 230
},
{
"epoch": 2.9806451612903224,
"grad_norm": 0.0895898052255747,
"learning_rate": 0.0,
"loss": 0.3079,
"step": 231
},
{
"epoch": 2.9806451612903224,
"eval_loss": 0.3507891595363617,
"eval_runtime": 36.0777,
"eval_samples_per_second": 28.799,
"eval_steps_per_second": 0.915,
"step": 231
},
{
"epoch": 2.9806451612903224,
"step": 231,
"total_flos": 9.324729662937498e+16,
"train_loss": 0.3951803825118325,
"train_runtime": 2997.4381,
"train_samples_per_second": 9.871,
"train_steps_per_second": 0.077
}
],
"logging_steps": 1,
"max_steps": 231,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.324729662937498e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}