{
"best_metric": 0.020535213872790337,
"best_model_checkpoint": "/home/paperspace/Data/models/brasingh_publicis_f5f/llm3br256/checkpoint-410",
"epoch": 4.96969696969697,
"eval_steps": 5,
"global_step": 410,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012121212121212121,
"grad_norm": 0.17719489336013794,
"learning_rate": 2.4390243902439027e-06,
"loss": 0.103,
"step": 1
},
{
"epoch": 0.024242424242424242,
"grad_norm": 0.1567779779434204,
"learning_rate": 4.8780487804878055e-06,
"loss": 0.11,
"step": 2
},
{
"epoch": 0.03636363636363636,
"grad_norm": 0.1601039469242096,
"learning_rate": 7.317073170731707e-06,
"loss": 0.1104,
"step": 3
},
{
"epoch": 0.048484848484848485,
"grad_norm": 0.16313816606998444,
"learning_rate": 9.756097560975611e-06,
"loss": 0.1076,
"step": 4
},
{
"epoch": 0.06060606060606061,
"grad_norm": 0.15266162157058716,
"learning_rate": 1.2195121951219513e-05,
"loss": 0.1038,
"step": 5
},
{
"epoch": 0.06060606060606061,
"eval_loss": 0.09789121896028519,
"eval_runtime": 8.116,
"eval_samples_per_second": 6.161,
"eval_steps_per_second": 1.602,
"step": 5
},
{
"epoch": 0.07272727272727272,
"grad_norm": 0.1329907476902008,
"learning_rate": 1.4634146341463415e-05,
"loss": 0.0995,
"step": 6
},
{
"epoch": 0.08484848484848485,
"grad_norm": 0.09588994085788727,
"learning_rate": 1.707317073170732e-05,
"loss": 0.0833,
"step": 7
},
{
"epoch": 0.09696969696969697,
"grad_norm": 0.07421080023050308,
"learning_rate": 1.9512195121951222e-05,
"loss": 0.0756,
"step": 8
},
{
"epoch": 0.10909090909090909,
"grad_norm": 0.0636032298207283,
"learning_rate": 2.1951219512195124e-05,
"loss": 0.0681,
"step": 9
},
{
"epoch": 0.12121212121212122,
"grad_norm": 0.07186830043792725,
"learning_rate": 2.4390243902439026e-05,
"loss": 0.0759,
"step": 10
},
{
"epoch": 0.12121212121212122,
"eval_loss": 0.07647334039211273,
"eval_runtime": 6.2195,
"eval_samples_per_second": 8.039,
"eval_steps_per_second": 2.09,
"step": 10
},
{
"epoch": 0.13333333333333333,
"grad_norm": 0.07592587172985077,
"learning_rate": 2.682926829268293e-05,
"loss": 0.0757,
"step": 11
},
{
"epoch": 0.14545454545454545,
"grad_norm": 0.057555243372917175,
"learning_rate": 2.926829268292683e-05,
"loss": 0.0733,
"step": 12
},
{
"epoch": 0.15757575757575756,
"grad_norm": 0.04685232415795326,
"learning_rate": 3.170731707317073e-05,
"loss": 0.0751,
"step": 13
},
{
"epoch": 0.1696969696969697,
"grad_norm": 0.04220229387283325,
"learning_rate": 3.414634146341464e-05,
"loss": 0.0784,
"step": 14
},
{
"epoch": 0.18181818181818182,
"grad_norm": 0.050287775695323944,
"learning_rate": 3.6585365853658535e-05,
"loss": 0.069,
"step": 15
},
{
"epoch": 0.18181818181818182,
"eval_loss": 0.06831522285938263,
"eval_runtime": 6.1929,
"eval_samples_per_second": 8.074,
"eval_steps_per_second": 2.099,
"step": 15
},
{
"epoch": 0.19393939393939394,
"grad_norm": 0.04325024411082268,
"learning_rate": 3.9024390243902444e-05,
"loss": 0.0691,
"step": 16
},
{
"epoch": 0.20606060606060606,
"grad_norm": 0.037937626242637634,
"learning_rate": 4.146341463414634e-05,
"loss": 0.0813,
"step": 17
},
{
"epoch": 0.21818181818181817,
"grad_norm": 0.03867847099900246,
"learning_rate": 4.390243902439025e-05,
"loss": 0.065,
"step": 18
},
{
"epoch": 0.23030303030303031,
"grad_norm": 0.03792285919189453,
"learning_rate": 4.634146341463415e-05,
"loss": 0.0617,
"step": 19
},
{
"epoch": 0.24242424242424243,
"grad_norm": 0.03528020903468132,
"learning_rate": 4.878048780487805e-05,
"loss": 0.0729,
"step": 20
},
{
"epoch": 0.24242424242424243,
"eval_loss": 0.062009546905756,
"eval_runtime": 6.1891,
"eval_samples_per_second": 8.079,
"eval_steps_per_second": 2.1,
"step": 20
},
{
"epoch": 0.2545454545454545,
"grad_norm": 0.031467072665691376,
"learning_rate": 5.121951219512195e-05,
"loss": 0.0602,
"step": 21
},
{
"epoch": 0.26666666666666666,
"grad_norm": 0.03714953735470772,
"learning_rate": 5.365853658536586e-05,
"loss": 0.0772,
"step": 22
},
{
"epoch": 0.2787878787878788,
"grad_norm": 0.03779144585132599,
"learning_rate": 5.6097560975609764e-05,
"loss": 0.0584,
"step": 23
},
{
"epoch": 0.2909090909090909,
"grad_norm": 0.030055589973926544,
"learning_rate": 5.853658536585366e-05,
"loss": 0.0568,
"step": 24
},
{
"epoch": 0.30303030303030304,
"grad_norm": 0.029797468334436417,
"learning_rate": 6.097560975609756e-05,
"loss": 0.0545,
"step": 25
},
{
"epoch": 0.30303030303030304,
"eval_loss": 0.057142239063978195,
"eval_runtime": 6.2052,
"eval_samples_per_second": 8.058,
"eval_steps_per_second": 2.095,
"step": 25
},
{
"epoch": 0.3151515151515151,
"grad_norm": 0.029303744435310364,
"learning_rate": 6.341463414634146e-05,
"loss": 0.0591,
"step": 26
},
{
"epoch": 0.32727272727272727,
"grad_norm": 0.03735222667455673,
"learning_rate": 6.585365853658538e-05,
"loss": 0.0836,
"step": 27
},
{
"epoch": 0.3393939393939394,
"grad_norm": 0.02950606681406498,
"learning_rate": 6.829268292682928e-05,
"loss": 0.0574,
"step": 28
},
{
"epoch": 0.3515151515151515,
"grad_norm": 0.02479255013167858,
"learning_rate": 7.073170731707317e-05,
"loss": 0.0506,
"step": 29
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.030447915196418762,
"learning_rate": 7.317073170731707e-05,
"loss": 0.0589,
"step": 30
},
{
"epoch": 0.36363636363636365,
"eval_loss": 0.05275052413344383,
"eval_runtime": 6.1946,
"eval_samples_per_second": 8.072,
"eval_steps_per_second": 2.099,
"step": 30
},
{
"epoch": 0.37575757575757573,
"grad_norm": 0.029138660058379173,
"learning_rate": 7.560975609756099e-05,
"loss": 0.0536,
"step": 31
},
{
"epoch": 0.3878787878787879,
"grad_norm": 0.029026566073298454,
"learning_rate": 7.804878048780489e-05,
"loss": 0.0454,
"step": 32
},
{
"epoch": 0.4,
"grad_norm": 0.03538930043578148,
"learning_rate": 8.048780487804879e-05,
"loss": 0.0727,
"step": 33
},
{
"epoch": 0.4121212121212121,
"grad_norm": 0.028354594483971596,
"learning_rate": 8.292682926829268e-05,
"loss": 0.0557,
"step": 34
},
{
"epoch": 0.42424242424242425,
"grad_norm": 0.02743169106543064,
"learning_rate": 8.53658536585366e-05,
"loss": 0.0461,
"step": 35
},
{
"epoch": 0.42424242424242425,
"eval_loss": 0.05005570873618126,
"eval_runtime": 6.192,
"eval_samples_per_second": 8.075,
"eval_steps_per_second": 2.099,
"step": 35
},
{
"epoch": 0.43636363636363634,
"grad_norm": 0.03530753031373024,
"learning_rate": 8.78048780487805e-05,
"loss": 0.0627,
"step": 36
},
{
"epoch": 0.4484848484848485,
"grad_norm": 0.02797996811568737,
"learning_rate": 9.02439024390244e-05,
"loss": 0.0527,
"step": 37
},
{
"epoch": 0.46060606060606063,
"grad_norm": 0.022809529677033424,
"learning_rate": 9.26829268292683e-05,
"loss": 0.0509,
"step": 38
},
{
"epoch": 0.4727272727272727,
"grad_norm": 0.02468150481581688,
"learning_rate": 9.51219512195122e-05,
"loss": 0.0488,
"step": 39
},
{
"epoch": 0.48484848484848486,
"grad_norm": 0.030917035415768623,
"learning_rate": 9.75609756097561e-05,
"loss": 0.0522,
"step": 40
},
{
"epoch": 0.48484848484848486,
"eval_loss": 0.049276672303676605,
"eval_runtime": 6.1874,
"eval_samples_per_second": 8.081,
"eval_steps_per_second": 2.101,
"step": 40
},
{
"epoch": 0.49696969696969695,
"grad_norm": 0.026523206382989883,
"learning_rate": 0.0001,
"loss": 0.0463,
"step": 41
},
{
"epoch": 0.509090909090909,
"grad_norm": 0.028745442628860474,
"learning_rate": 9.999818789066165e-05,
"loss": 0.0433,
"step": 42
},
{
"epoch": 0.5212121212121212,
"grad_norm": 0.026402153074741364,
"learning_rate": 9.999275169399614e-05,
"loss": 0.0393,
"step": 43
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.02671145275235176,
"learning_rate": 9.998369180404283e-05,
"loss": 0.044,
"step": 44
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.034986190497875214,
"learning_rate": 9.997100887750215e-05,
"loss": 0.052,
"step": 45
},
{
"epoch": 0.5454545454545454,
"eval_loss": 0.04825693741440773,
"eval_runtime": 6.1973,
"eval_samples_per_second": 8.068,
"eval_steps_per_second": 2.098,
"step": 45
},
{
"epoch": 0.5575757575757576,
"grad_norm": 0.029590139165520668,
"learning_rate": 9.995470383368808e-05,
"loss": 0.0436,
"step": 46
},
{
"epoch": 0.5696969696969697,
"grad_norm": 0.03095312975347042,
"learning_rate": 9.99347778544615e-05,
"loss": 0.0431,
"step": 47
},
{
"epoch": 0.5818181818181818,
"grad_norm": 0.030565602704882622,
"learning_rate": 9.991123238414455e-05,
"loss": 0.0526,
"step": 48
},
{
"epoch": 0.593939393939394,
"grad_norm": 0.027898119762539864,
"learning_rate": 9.98840691294159e-05,
"loss": 0.0447,
"step": 49
},
{
"epoch": 0.6060606060606061,
"grad_norm": 0.03219461813569069,
"learning_rate": 9.985329005918702e-05,
"loss": 0.0459,
"step": 50
},
{
"epoch": 0.6060606060606061,
"eval_loss": 0.045825447887182236,
"eval_runtime": 6.1916,
"eval_samples_per_second": 8.075,
"eval_steps_per_second": 2.1,
"step": 50
},
{
"epoch": 0.6181818181818182,
"grad_norm": 0.02641221322119236,
"learning_rate": 9.981889740445958e-05,
"loss": 0.0417,
"step": 51
},
{
"epoch": 0.6303030303030303,
"grad_norm": 0.028501464053988457,
"learning_rate": 9.978089365816357e-05,
"loss": 0.0446,
"step": 52
},
{
"epoch": 0.6424242424242425,
"grad_norm": 0.0260939784348011,
"learning_rate": 9.973928157497674e-05,
"loss": 0.0451,
"step": 53
},
{
"epoch": 0.6545454545454545,
"grad_norm": 0.029564740136265755,
"learning_rate": 9.969406417112489e-05,
"loss": 0.0416,
"step": 54
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.027353042736649513,
"learning_rate": 9.964524472416319e-05,
"loss": 0.0363,
"step": 55
},
{
"epoch": 0.6666666666666666,
"eval_loss": 0.0433911457657814,
"eval_runtime": 6.2376,
"eval_samples_per_second": 8.016,
"eval_steps_per_second": 2.084,
"step": 55
},
{
"epoch": 0.6787878787878788,
"grad_norm": 0.03154386952519417,
"learning_rate": 9.95928267727387e-05,
"loss": 0.0457,
"step": 56
},
{
"epoch": 0.6909090909090909,
"grad_norm": 0.0249126385897398,
"learning_rate": 9.953681411633376e-05,
"loss": 0.0367,
"step": 57
},
{
"epoch": 0.703030303030303,
"grad_norm": 0.02522316575050354,
"learning_rate": 9.947721081499068e-05,
"loss": 0.0428,
"step": 58
},
{
"epoch": 0.7151515151515152,
"grad_norm": 0.028446340933442116,
"learning_rate": 9.941402118901744e-05,
"loss": 0.0456,
"step": 59
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.0324234701693058,
"learning_rate": 9.934724981867446e-05,
"loss": 0.0553,
"step": 60
},
{
"epoch": 0.7272727272727273,
"eval_loss": 0.04182567819952965,
"eval_runtime": 6.2053,
"eval_samples_per_second": 8.058,
"eval_steps_per_second": 2.095,
"step": 60
},
{
"epoch": 0.7393939393939394,
"grad_norm": 0.027509605512022972,
"learning_rate": 9.927690154384273e-05,
"loss": 0.0443,
"step": 61
},
{
"epoch": 0.7515151515151515,
"grad_norm": 0.025798741728067398,
"learning_rate": 9.920298146367286e-05,
"loss": 0.0423,
"step": 62
},
{
"epoch": 0.7636363636363637,
"grad_norm": 0.029940692707896233,
"learning_rate": 9.912549493621554e-05,
"loss": 0.0469,
"step": 63
},
{
"epoch": 0.7757575757575758,
"grad_norm": 0.032555170357227325,
"learning_rate": 9.904444757803321e-05,
"loss": 0.0428,
"step": 64
},
{
"epoch": 0.7878787878787878,
"grad_norm": 0.03051156736910343,
"learning_rate": 9.895984526379281e-05,
"loss": 0.0444,
"step": 65
},
{
"epoch": 0.7878787878787878,
"eval_loss": 0.0403163880109787,
"eval_runtime": 6.1935,
"eval_samples_per_second": 8.073,
"eval_steps_per_second": 2.099,
"step": 65
},
{
"epoch": 0.8,
"grad_norm": 0.02734997309744358,
"learning_rate": 9.887169412584011e-05,
"loss": 0.0389,
"step": 66
},
{
"epoch": 0.8121212121212121,
"grad_norm": 0.026902060955762863,
"learning_rate": 9.878000055375512e-05,
"loss": 0.0397,
"step": 67
},
{
"epoch": 0.8242424242424242,
"grad_norm": 0.03240904584527016,
"learning_rate": 9.868477119388896e-05,
"loss": 0.0387,
"step": 68
},
{
"epoch": 0.8363636363636363,
"grad_norm": 0.02606021985411644,
"learning_rate": 9.858601294888213e-05,
"loss": 0.0344,
"step": 69
},
{
"epoch": 0.8484848484848485,
"grad_norm": 0.029814746230840683,
"learning_rate": 9.848373297716414e-05,
"loss": 0.0469,
"step": 70
},
{
"epoch": 0.8484848484848485,
"eval_loss": 0.03973233327269554,
"eval_runtime": 6.2308,
"eval_samples_per_second": 8.025,
"eval_steps_per_second": 2.086,
"step": 70
},
{
"epoch": 0.8606060606060606,
"grad_norm": 0.025392569601535797,
"learning_rate": 9.837793869243468e-05,
"loss": 0.0388,
"step": 71
},
{
"epoch": 0.8727272727272727,
"grad_norm": 0.03046100027859211,
"learning_rate": 9.82686377631262e-05,
"loss": 0.0415,
"step": 72
},
{
"epoch": 0.8848484848484849,
"grad_norm": 0.02428356185555458,
"learning_rate": 9.815583811184808e-05,
"loss": 0.037,
"step": 73
},
{
"epoch": 0.896969696969697,
"grad_norm": 0.029197214171290398,
"learning_rate": 9.803954791481239e-05,
"loss": 0.0408,
"step": 74
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.027502721175551414,
"learning_rate": 9.791977560124119e-05,
"loss": 0.0417,
"step": 75
},
{
"epoch": 0.9090909090909091,
"eval_loss": 0.038558006286621094,
"eval_runtime": 6.2,
"eval_samples_per_second": 8.065,
"eval_steps_per_second": 2.097,
"step": 75
},
{
"epoch": 0.9212121212121213,
"grad_norm": 0.030016757547855377,
"learning_rate": 9.779652985275562e-05,
"loss": 0.0427,
"step": 76
},
{
"epoch": 0.9333333333333333,
"grad_norm": 0.029366502538323402,
"learning_rate": 9.766981960274653e-05,
"loss": 0.0312,
"step": 77
},
{
"epoch": 0.9454545454545454,
"grad_norm": 0.02805924229323864,
"learning_rate": 9.753965403572703e-05,
"loss": 0.0424,
"step": 78
},
{
"epoch": 0.9575757575757575,
"grad_norm": 0.027496378868818283,
"learning_rate": 9.740604258666668e-05,
"loss": 0.0368,
"step": 79
},
{
"epoch": 0.9696969696969697,
"grad_norm": 0.02711924910545349,
"learning_rate": 9.726899494030768e-05,
"loss": 0.0388,
"step": 80
},
{
"epoch": 0.9696969696969697,
"eval_loss": 0.037164073437452316,
"eval_runtime": 6.2151,
"eval_samples_per_second": 8.045,
"eval_steps_per_second": 2.092,
"step": 80
},
{
"epoch": 0.9818181818181818,
"grad_norm": 0.02877042628824711,
"learning_rate": 9.71285210304628e-05,
"loss": 0.0367,
"step": 81
},
{
"epoch": 0.9939393939393939,
"grad_norm": 0.029804140329360962,
"learning_rate": 9.698463103929542e-05,
"loss": 0.0399,
"step": 82
},
{
"epoch": 1.006060606060606,
"grad_norm": 0.04405470937490463,
"learning_rate": 9.683733539658139e-05,
"loss": 0.0545,
"step": 83
},
{
"epoch": 1.018181818181818,
"grad_norm": 0.0315798744559288,
"learning_rate": 9.66866447789531e-05,
"loss": 0.048,
"step": 84
},
{
"epoch": 1.0303030303030303,
"grad_norm": 0.02551027573645115,
"learning_rate": 9.653257010912559e-05,
"loss": 0.0309,
"step": 85
},
{
"epoch": 1.0303030303030303,
"eval_loss": 0.03581343591213226,
"eval_runtime": 6.2558,
"eval_samples_per_second": 7.993,
"eval_steps_per_second": 2.078,
"step": 85
},
{
"epoch": 1.0424242424242425,
"grad_norm": 0.03550685569643974,
"learning_rate": 9.637512255510475e-05,
"loss": 0.0659,
"step": 86
},
{
"epoch": 1.0545454545454545,
"grad_norm": 0.03085348755121231,
"learning_rate": 9.621431352937789e-05,
"loss": 0.0502,
"step": 87
},
{
"epoch": 1.0666666666666667,
"grad_norm": 0.02470513805747032,
"learning_rate": 9.605015468808651e-05,
"loss": 0.0318,
"step": 88
},
{
"epoch": 1.0787878787878789,
"grad_norm": 0.02803831174969673,
"learning_rate": 9.58826579301814e-05,
"loss": 0.0446,
"step": 89
},
{
"epoch": 1.0909090909090908,
"grad_norm": 0.03941066190600395,
"learning_rate": 9.571183539656011e-05,
"loss": 0.0487,
"step": 90
},
{
"epoch": 1.0909090909090908,
"eval_loss": 0.0354202575981617,
"eval_runtime": 6.1921,
"eval_samples_per_second": 8.075,
"eval_steps_per_second": 2.099,
"step": 90
},
{
"epoch": 1.103030303030303,
"grad_norm": 0.029008885845541954,
"learning_rate": 9.553769946918697e-05,
"loss": 0.0403,
"step": 91
},
{
"epoch": 1.1151515151515152,
"grad_norm": 0.025633882731199265,
"learning_rate": 9.536026277019561e-05,
"loss": 0.032,
"step": 92
},
{
"epoch": 1.1272727272727272,
"grad_norm": 0.02955947443842888,
"learning_rate": 9.517953816097396e-05,
"loss": 0.0366,
"step": 93
},
{
"epoch": 1.1393939393939394,
"grad_norm": 0.029836708679795265,
"learning_rate": 9.499553874123212e-05,
"loss": 0.0383,
"step": 94
},
{
"epoch": 1.1515151515151516,
"grad_norm": 0.030258659273386,
"learning_rate": 9.480827784805278e-05,
"loss": 0.0348,
"step": 95
},
{
"epoch": 1.1515151515151516,
"eval_loss": 0.034031517803668976,
"eval_runtime": 6.1911,
"eval_samples_per_second": 8.076,
"eval_steps_per_second": 2.1,
"step": 95
},
{
"epoch": 1.1636363636363636,
"grad_norm": 0.02571636624634266,
"learning_rate": 9.461776905492446e-05,
"loss": 0.0322,
"step": 96
},
{
"epoch": 1.1757575757575758,
"grad_norm": 0.025425300002098083,
"learning_rate": 9.442402617075765e-05,
"loss": 0.0302,
"step": 97
},
{
"epoch": 1.187878787878788,
"grad_norm": 0.02790471538901329,
"learning_rate": 9.422706323888397e-05,
"loss": 0.0305,
"step": 98
},
{
"epoch": 1.2,
"grad_norm": 0.031999390572309494,
"learning_rate": 9.402689453603815e-05,
"loss": 0.0384,
"step": 99
},
{
"epoch": 1.2121212121212122,
"grad_norm": 0.02810075506567955,
"learning_rate": 9.382353457132317e-05,
"loss": 0.0308,
"step": 100
},
{
"epoch": 1.2121212121212122,
"eval_loss": 0.03338392823934555,
"eval_runtime": 6.1917,
"eval_samples_per_second": 8.075,
"eval_steps_per_second": 2.1,
"step": 100
},
{
"epoch": 1.2242424242424241,
"grad_norm": 0.0302734412252903,
"learning_rate": 9.361699808515876e-05,
"loss": 0.0341,
"step": 101
},
{
"epoch": 1.2363636363636363,
"grad_norm": 0.033730726689100266,
"learning_rate": 9.340730004821266e-05,
"loss": 0.0346,
"step": 102
},
{
"epoch": 1.2484848484848485,
"grad_norm": 0.03323773667216301,
"learning_rate": 9.31944556603157e-05,
"loss": 0.0408,
"step": 103
},
{
"epoch": 1.2606060606060607,
"grad_norm": 0.027124911546707153,
"learning_rate": 9.297848034936006e-05,
"loss": 0.0332,
"step": 104
},
{
"epoch": 1.2727272727272727,
"grad_norm": 0.026853900402784348,
"learning_rate": 9.275938977018081e-05,
"loss": 0.0318,
"step": 105
},
{
"epoch": 1.2727272727272727,
"eval_loss": 0.03301350772380829,
"eval_runtime": 6.1968,
"eval_samples_per_second": 8.069,
"eval_steps_per_second": 2.098,
"step": 105
},
{
"epoch": 1.284848484848485,
"grad_norm": 0.027320127934217453,
"learning_rate": 9.253719980342135e-05,
"loss": 0.0339,
"step": 106
},
{
"epoch": 1.2969696969696969,
"grad_norm": 0.0313449464738369,
"learning_rate": 9.231192655438221e-05,
"loss": 0.0336,
"step": 107
},
{
"epoch": 1.309090909090909,
"grad_norm": 0.029063764959573746,
"learning_rate": 9.208358635185373e-05,
"loss": 0.0324,
"step": 108
},
{
"epoch": 1.3212121212121213,
"grad_norm": 0.03135693818330765,
"learning_rate": 9.185219574693242e-05,
"loss": 0.0332,
"step": 109
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.0317191518843174,
"learning_rate": 9.161777151182136e-05,
"loss": 0.028,
"step": 110
},
{
"epoch": 1.3333333333333333,
"eval_loss": 0.03218723088502884,
"eval_runtime": 6.2118,
"eval_samples_per_second": 8.049,
"eval_steps_per_second": 2.093,
"step": 110
},
{
"epoch": 1.3454545454545455,
"grad_norm": 0.031457044184207916,
"learning_rate": 9.138033063861436e-05,
"loss": 0.0346,
"step": 111
},
{
"epoch": 1.3575757575757577,
"grad_norm": 0.031810589134693146,
"learning_rate": 9.113989033806434e-05,
"loss": 0.0283,
"step": 112
},
{
"epoch": 1.3696969696969696,
"grad_norm": 0.030629124492406845,
"learning_rate": 9.089646803833589e-05,
"loss": 0.0246,
"step": 113
},
{
"epoch": 1.3818181818181818,
"grad_norm": 0.030411459505558014,
"learning_rate": 9.065008138374189e-05,
"loss": 0.0317,
"step": 114
},
{
"epoch": 1.393939393939394,
"grad_norm": 0.029815878719091415,
"learning_rate": 9.040074823346465e-05,
"loss": 0.0311,
"step": 115
},
{
"epoch": 1.393939393939394,
"eval_loss": 0.032092493027448654,
"eval_runtime": 6.1885,
"eval_samples_per_second": 8.08,
"eval_steps_per_second": 2.101,
"step": 115
},
{
"epoch": 1.406060606060606,
"grad_norm": 0.030812319368124008,
"learning_rate": 9.014848666026138e-05,
"loss": 0.0389,
"step": 116
},
{
"epoch": 1.4181818181818182,
"grad_norm": 0.02588343806564808,
"learning_rate": 8.989331494915417e-05,
"loss": 0.0287,
"step": 117
},
{
"epoch": 1.4303030303030302,
"grad_norm": 0.02780727669596672,
"learning_rate": 8.963525159610465e-05,
"loss": 0.0265,
"step": 118
},
{
"epoch": 1.4424242424242424,
"grad_norm": 0.026163380593061447,
"learning_rate": 8.937431530667328e-05,
"loss": 0.0262,
"step": 119
},
{
"epoch": 1.4545454545454546,
"grad_norm": 0.0316736213862896,
"learning_rate": 8.911052499466357e-05,
"loss": 0.0382,
"step": 120
},
{
"epoch": 1.4545454545454546,
"eval_loss": 0.031465690582990646,
"eval_runtime": 6.1922,
"eval_samples_per_second": 8.075,
"eval_steps_per_second": 2.099,
"step": 120
},
{
"epoch": 1.4666666666666668,
"grad_norm": 0.03706022724509239,
"learning_rate": 8.884389978075098e-05,
"loss": 0.0336,
"step": 121
},
{
"epoch": 1.4787878787878788,
"grad_norm": 0.027684392407536507,
"learning_rate": 8.857445899109715e-05,
"loss": 0.0267,
"step": 122
},
{
"epoch": 1.490909090909091,
"grad_norm": 0.02498454973101616,
"learning_rate": 8.83022221559489e-05,
"loss": 0.0258,
"step": 123
},
{
"epoch": 1.503030303030303,
"grad_norm": 0.03206618130207062,
"learning_rate": 8.80272090082227e-05,
"loss": 0.0344,
"step": 124
},
{
"epoch": 1.5151515151515151,
"grad_norm": 0.03329097852110863,
"learning_rate": 8.774943948207426e-05,
"loss": 0.0316,
"step": 125
},
{
"epoch": 1.5151515151515151,
"eval_loss": 0.030392121523618698,
"eval_runtime": 6.2076,
"eval_samples_per_second": 8.055,
"eval_steps_per_second": 2.094,
"step": 125
},
{
"epoch": 1.5272727272727273,
"grad_norm": 0.029471345245838165,
"learning_rate": 8.746893371145366e-05,
"loss": 0.0279,
"step": 126
},
{
"epoch": 1.5393939393939395,
"grad_norm": 0.030292104929685593,
"learning_rate": 8.718571202864598e-05,
"loss": 0.0292,
"step": 127
},
{
"epoch": 1.5515151515151515,
"grad_norm": 0.028025031089782715,
"learning_rate": 8.689979496279746e-05,
"loss": 0.0296,
"step": 128
},
{
"epoch": 1.5636363636363635,
"grad_norm": 0.027177123352885246,
"learning_rate": 8.661120323842751e-05,
"loss": 0.0286,
"step": 129
},
{
"epoch": 1.5757575757575757,
"grad_norm": 0.03291260078549385,
"learning_rate": 8.631995777392645e-05,
"loss": 0.0278,
"step": 130
},
{
"epoch": 1.5757575757575757,
"eval_loss": 0.029901880770921707,
"eval_runtime": 6.1931,
"eval_samples_per_second": 8.073,
"eval_steps_per_second": 2.099,
"step": 130
},
{
"epoch": 1.587878787878788,
"grad_norm": 0.027456866577267647,
"learning_rate": 8.602607968003935e-05,
"loss": 0.0277,
"step": 131
},
{
"epoch": 1.6,
"grad_norm": 0.02367628738284111,
"learning_rate": 8.572959025833573e-05,
"loss": 0.023,
"step": 132
},
{
"epoch": 1.612121212121212,
"grad_norm": 0.030250705778598785,
"learning_rate": 8.543051099966558e-05,
"loss": 0.0253,
"step": 133
},
{
"epoch": 1.6242424242424243,
"grad_norm": 0.02687668427824974,
"learning_rate": 8.512886358260162e-05,
"loss": 0.0249,
"step": 134
},
{
"epoch": 1.6363636363636362,
"grad_norm": 0.032938696444034576,
"learning_rate": 8.482466987186785e-05,
"loss": 0.0285,
"step": 135
},
{
"epoch": 1.6363636363636362,
"eval_loss": 0.029229959473013878,
"eval_runtime": 6.1897,
"eval_samples_per_second": 8.078,
"eval_steps_per_second": 2.1,
"step": 135
},
{
"epoch": 1.6484848484848484,
"grad_norm": 0.02894946001470089,
"learning_rate": 8.451795191675488e-05,
"loss": 0.0268,
"step": 136
},
{
"epoch": 1.6606060606060606,
"grad_norm": 0.03599061071872711,
"learning_rate": 8.420873194952152e-05,
"loss": 0.0351,
"step": 137
},
{
"epoch": 1.6727272727272728,
"grad_norm": 0.031365521252155304,
"learning_rate": 8.389703238378339e-05,
"loss": 0.0309,
"step": 138
},
{
"epoch": 1.6848484848484848,
"grad_norm": 0.024763284251093864,
"learning_rate": 8.358287581288822e-05,
"loss": 0.0244,
"step": 139
},
{
"epoch": 1.696969696969697,
"grad_norm": 0.026635024696588516,
"learning_rate": 8.326628500827826e-05,
"loss": 0.0257,
"step": 140
},
{
"epoch": 1.696969696969697,
"eval_loss": 0.02854442596435547,
"eval_runtime": 6.2016,
"eval_samples_per_second": 8.062,
"eval_steps_per_second": 2.096,
"step": 140
},
{
"epoch": 1.709090909090909,
"grad_norm": 0.03084694594144821,
"learning_rate": 8.294728291783966e-05,
"loss": 0.0301,
"step": 141
},
{
"epoch": 1.7212121212121212,
"grad_norm": 0.024888882413506508,
"learning_rate": 8.262589266423908e-05,
"loss": 0.0254,
"step": 142
},
{
"epoch": 1.7333333333333334,
"grad_norm": 0.02867315709590912,
"learning_rate": 8.230213754324773e-05,
"loss": 0.0338,
"step": 143
},
{
"epoch": 1.7454545454545456,
"grad_norm": 0.030578091740608215,
"learning_rate": 8.197604102205271e-05,
"loss": 0.0265,
"step": 144
},
{
"epoch": 1.7575757575757576,
"grad_norm": 0.025194313377141953,
"learning_rate": 8.16476267375561e-05,
"loss": 0.0244,
"step": 145
},
{
"epoch": 1.7575757575757576,
"eval_loss": 0.028112677857279778,
"eval_runtime": 6.1903,
"eval_samples_per_second": 8.077,
"eval_steps_per_second": 2.1,
"step": 145
},
{
"epoch": 1.7696969696969695,
"grad_norm": 0.03511481732130051,
"learning_rate": 8.131691849466153e-05,
"loss": 0.0351,
"step": 146
},
{
"epoch": 1.7818181818181817,
"grad_norm": 0.034265320748090744,
"learning_rate": 8.098394026454885e-05,
"loss": 0.0318,
"step": 147
},
{
"epoch": 1.793939393939394,
"grad_norm": 0.02798490971326828,
"learning_rate": 8.064871618293646e-05,
"loss": 0.0258,
"step": 148
},
{
"epoch": 1.8060606060606061,
"grad_norm": 0.031277846544981,
"learning_rate": 8.03112705483319e-05,
"loss": 0.0322,
"step": 149
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.025352105498313904,
"learning_rate": 7.997162782027061e-05,
"loss": 0.0256,
"step": 150
},
{
"epoch": 1.8181818181818183,
"eval_loss": 0.027805332094430923,
"eval_runtime": 6.1948,
"eval_samples_per_second": 8.071,
"eval_steps_per_second": 2.099,
"step": 150
},
{
"epoch": 1.8303030303030303,
"grad_norm": 0.03179726377129555,
"learning_rate": 7.962981261754294e-05,
"loss": 0.0265,
"step": 151
},
{
"epoch": 1.8424242424242423,
"grad_norm": 0.02985468879342079,
"learning_rate": 7.928584971640974e-05,
"loss": 0.0302,
"step": 152
},
{
"epoch": 1.8545454545454545,
"grad_norm": 0.031871821731328964,
"learning_rate": 7.893976404880643e-05,
"loss": 0.0331,
"step": 153
},
{
"epoch": 1.8666666666666667,
"grad_norm": 0.028416186571121216,
"learning_rate": 7.859158070053577e-05,
"loss": 0.0245,
"step": 154
},
{
"epoch": 1.878787878787879,
"grad_norm": 0.03054559975862503,
"learning_rate": 7.824132490944967e-05,
"loss": 0.0338,
"step": 155
},
{
"epoch": 1.878787878787879,
"eval_loss": 0.027029650285840034,
"eval_runtime": 6.235,
"eval_samples_per_second": 8.019,
"eval_steps_per_second": 2.085,
"step": 155
},
{
"epoch": 1.8909090909090909,
"grad_norm": 0.028330031782388687,
"learning_rate": 7.788902206361973e-05,
"loss": 0.0241,
"step": 156
},
{
"epoch": 1.903030303030303,
"grad_norm": 0.031616389751434326,
"learning_rate": 7.7534697699497e-05,
"loss": 0.0301,
"step": 157
},
{
"epoch": 1.915151515151515,
"grad_norm": 0.027048081159591675,
"learning_rate": 7.717837750006106e-05,
"loss": 0.0274,
"step": 158
},
{
"epoch": 1.9272727272727272,
"grad_norm": 0.028316281735897064,
"learning_rate": 7.682008729295833e-05,
"loss": 0.026,
"step": 159
},
{
"epoch": 1.9393939393939394,
"grad_norm": 0.02987455017864704,
"learning_rate": 7.645985304863003e-05,
"loss": 0.0309,
"step": 160
},
{
"epoch": 1.9393939393939394,
"eval_loss": 0.02624826692044735,
"eval_runtime": 6.1867,
"eval_samples_per_second": 8.082,
"eval_steps_per_second": 2.101,
"step": 160
},
{
"epoch": 1.9515151515151516,
"grad_norm": 0.02562532387673855,
"learning_rate": 7.609770087842969e-05,
"loss": 0.0275,
"step": 161
},
{
"epoch": 1.9636363636363636,
"grad_norm": 0.026776108890771866,
"learning_rate": 7.573365703273046e-05,
"loss": 0.0263,
"step": 162
},
{
"epoch": 1.9757575757575756,
"grad_norm": 0.031301844865083694,
"learning_rate": 7.536774789902246e-05,
"loss": 0.0293,
"step": 163
},
{
"epoch": 1.9878787878787878,
"grad_norm": 0.02761393040418625,
"learning_rate": 7.500000000000001e-05,
"loss": 0.0292,
"step": 164
},
{
"epoch": 2.0,
"grad_norm": 0.04325617477297783,
"learning_rate": 7.463043999163919e-05,
"loss": 0.0378,
"step": 165
},
{
"epoch": 2.0,
"eval_loss": 0.02608395926654339,
"eval_runtime": 6.2208,
"eval_samples_per_second": 8.038,
"eval_steps_per_second": 2.09,
"step": 165
},
{
"epoch": 2.012121212121212,
"grad_norm": 0.026866400614380836,
"learning_rate": 7.425909466126568e-05,
"loss": 0.024,
"step": 166
},
{
"epoch": 2.0242424242424244,
"grad_norm": 0.026734622195363045,
"learning_rate": 7.388599092561315e-05,
"loss": 0.0238,
"step": 167
},
{
"epoch": 2.036363636363636,
"grad_norm": 0.02514388971030712,
"learning_rate": 7.351115582887211e-05,
"loss": 0.0218,
"step": 168
},
{
"epoch": 2.0484848484848484,
"grad_norm": 0.02405986562371254,
"learning_rate": 7.313461654072973e-05,
"loss": 0.0199,
"step": 169
},
{
"epoch": 2.0606060606060606,
"grad_norm": 0.030505580827593803,
"learning_rate": 7.275640035440045e-05,
"loss": 0.0275,
"step": 170
},
{
"epoch": 2.0606060606060606,
"eval_loss": 0.026318900287151337,
"eval_runtime": 6.2393,
"eval_samples_per_second": 8.014,
"eval_steps_per_second": 2.084,
"step": 170
},
{
"epoch": 2.0727272727272728,
"grad_norm": 0.03722088038921356,
"learning_rate": 7.237653468464756e-05,
"loss": 0.0256,
"step": 171
},
{
"epoch": 2.084848484848485,
"grad_norm": 0.03724412992596626,
"learning_rate": 7.199504706579617e-05,
"loss": 0.0226,
"step": 172
},
{
"epoch": 2.096969696969697,
"grad_norm": 0.030355574563145638,
"learning_rate": 7.161196514973734e-05,
"loss": 0.0188,
"step": 173
},
{
"epoch": 2.109090909090909,
"grad_norm": 0.03693992272019386,
"learning_rate": 7.12273167039238e-05,
"loss": 0.0232,
"step": 174
},
{
"epoch": 2.121212121212121,
"grad_norm": 0.03164402395486832,
"learning_rate": 7.084112960935716e-05,
"loss": 0.0225,
"step": 175
},
{
"epoch": 2.121212121212121,
"eval_loss": 0.025883661583065987,
"eval_runtime": 6.2414,
"eval_samples_per_second": 8.011,
"eval_steps_per_second": 2.083,
"step": 175
},
{
"epoch": 2.1333333333333333,
"grad_norm": 0.031605158001184464,
"learning_rate": 7.045343185856701e-05,
"loss": 0.0248,
"step": 176
},
{
"epoch": 2.1454545454545455,
"grad_norm": 0.0310862734913826,
"learning_rate": 7.006425155358195e-05,
"loss": 0.0244,
"step": 177
},
{
"epoch": 2.1575757575757577,
"grad_norm": 0.031485848128795624,
"learning_rate": 6.967361690389258e-05,
"loss": 0.0242,
"step": 178
},
{
"epoch": 2.16969696969697,
"grad_norm": 0.03367177024483681,
"learning_rate": 6.92815562244068e-05,
"loss": 0.0246,
"step": 179
},
{
"epoch": 2.1818181818181817,
"grad_norm": 0.028202077373862267,
"learning_rate": 6.88880979333973e-05,
"loss": 0.0232,
"step": 180
},
{
"epoch": 2.1818181818181817,
"eval_loss": 0.025616060942411423,
"eval_runtime": 6.1872,
"eval_samples_per_second": 8.081,
"eval_steps_per_second": 2.101,
"step": 180
},
{
"epoch": 2.193939393939394,
"grad_norm": 0.03502137213945389,
"learning_rate": 6.849327055044183e-05,
"loss": 0.0251,
"step": 181
},
{
"epoch": 2.206060606060606,
"grad_norm": 0.029362250119447708,
"learning_rate": 6.809710269435589e-05,
"loss": 0.022,
"step": 182
},
{
"epoch": 2.2181818181818183,
"grad_norm": 0.033701106905937195,
"learning_rate": 6.769962308111839e-05,
"loss": 0.0234,
"step": 183
},
{
"epoch": 2.2303030303030305,
"grad_norm": 0.03379302844405174,
"learning_rate": 6.730086052179004e-05,
"loss": 0.0221,
"step": 184
},
{
"epoch": 2.242424242424242,
"grad_norm": 0.027100039646029472,
"learning_rate": 6.690084392042513e-05,
"loss": 0.0193,
"step": 185
},
{
"epoch": 2.242424242424242,
"eval_loss": 0.025547849014401436,
"eval_runtime": 6.2367,
"eval_samples_per_second": 8.017,
"eval_steps_per_second": 2.084,
"step": 185
},
{
"epoch": 2.2545454545454544,
"grad_norm": 0.03181413188576698,
"learning_rate": 6.649960227197647e-05,
"loss": 0.0217,
"step": 186
},
{
"epoch": 2.2666666666666666,
"grad_norm": 0.03648809716105461,
"learning_rate": 6.609716466019356e-05,
"loss": 0.0239,
"step": 187
},
{
"epoch": 2.278787878787879,
"grad_norm": 0.0302013847976923,
"learning_rate": 6.569356025551454e-05,
"loss": 0.0232,
"step": 188
},
{
"epoch": 2.290909090909091,
"grad_norm": 0.028094977140426636,
"learning_rate": 6.528881831295188e-05,
"loss": 0.02,
"step": 189
},
{
"epoch": 2.303030303030303,
"grad_norm": 0.03214862942695618,
"learning_rate": 6.488296816997173e-05,
"loss": 0.0251,
"step": 190
},
{
"epoch": 2.303030303030303,
"eval_loss": 0.02527759224176407,
"eval_runtime": 6.19,
"eval_samples_per_second": 8.078,
"eval_steps_per_second": 2.1,
"step": 190
},
{
"epoch": 2.315151515151515,
"grad_norm": 0.033984988927841187,
"learning_rate": 6.447603924436744e-05,
"loss": 0.0243,
"step": 191
},
{
"epoch": 2.327272727272727,
"grad_norm": 0.027719179168343544,
"learning_rate": 6.406806103212725e-05,
"loss": 0.0204,
"step": 192
},
{
"epoch": 2.3393939393939394,
"grad_norm": 0.029257657006382942,
"learning_rate": 6.36590631052963e-05,
"loss": 0.0232,
"step": 193
},
{
"epoch": 2.3515151515151516,
"grad_norm": 0.050508007407188416,
"learning_rate": 6.32490751098331e-05,
"loss": 0.0324,
"step": 194
},
{
"epoch": 2.3636363636363638,
"grad_norm": 0.029407154768705368,
"learning_rate": 6.283812676346063e-05,
"loss": 0.0228,
"step": 195
},
{
"epoch": 2.3636363636363638,
"eval_loss": 0.024870626628398895,
"eval_runtime": 6.191,
"eval_samples_per_second": 8.076,
"eval_steps_per_second": 2.1,
"step": 195
},
{
"epoch": 2.375757575757576,
"grad_norm": 0.0258539617061615,
"learning_rate": 6.242624785351236e-05,
"loss": 0.0231,
"step": 196
},
{
"epoch": 2.3878787878787877,
"grad_norm": 0.02586168795824051,
"learning_rate": 6.201346823477303e-05,
"loss": 0.0193,
"step": 197
},
{
"epoch": 2.4,
"grad_norm": 0.029741084203124046,
"learning_rate": 6.159981782731474e-05,
"loss": 0.0227,
"step": 198
},
{
"epoch": 2.412121212121212,
"grad_norm": 0.029881663620471954,
"learning_rate": 6.118532661432812e-05,
"loss": 0.0224,
"step": 199
},
{
"epoch": 2.4242424242424243,
"grad_norm": 0.027224918827414513,
"learning_rate": 6.0770024639949074e-05,
"loss": 0.0195,
"step": 200
},
{
"epoch": 2.4242424242424243,
"eval_loss": 0.024939175695180893,
"eval_runtime": 6.1942,
"eval_samples_per_second": 8.072,
"eval_steps_per_second": 2.099,
"step": 200
},
{
"epoch": 2.4363636363636365,
"grad_norm": 0.028513159602880478,
"learning_rate": 6.0353942007081046e-05,
"loss": 0.0198,
"step": 201
},
{
"epoch": 2.4484848484848483,
"grad_norm": 0.028778916224837303,
"learning_rate": 5.993710887521302e-05,
"loss": 0.0184,
"step": 202
},
{
"epoch": 2.4606060606060605,
"grad_norm": 0.03407447412610054,
"learning_rate": 5.951955545823342e-05,
"loss": 0.0207,
"step": 203
},
{
"epoch": 2.4727272727272727,
"grad_norm": 0.033413201570510864,
"learning_rate": 5.9101312022240106e-05,
"loss": 0.0217,
"step": 204
},
{
"epoch": 2.484848484848485,
"grad_norm": 0.031220227479934692,
"learning_rate": 5.868240888334653e-05,
"loss": 0.0219,
"step": 205
},
{
"epoch": 2.484848484848485,
"eval_loss": 0.024136777967214584,
"eval_runtime": 6.2,
"eval_samples_per_second": 8.065,
"eval_steps_per_second": 2.097,
"step": 205
},
{
"epoch": 2.496969696969697,
"grad_norm": 0.0299720149487257,
"learning_rate": 5.826287640548425e-05,
"loss": 0.0231,
"step": 206
},
{
"epoch": 2.509090909090909,
"grad_norm": 0.030199084430933,
"learning_rate": 5.784274499820214e-05,
"loss": 0.0243,
"step": 207
},
{
"epoch": 2.5212121212121215,
"grad_norm": 0.03225167095661163,
"learning_rate": 5.742204511446203e-05,
"loss": 0.0241,
"step": 208
},
{
"epoch": 2.533333333333333,
"grad_norm": 0.02794428914785385,
"learning_rate": 5.700080724843147e-05,
"loss": 0.0217,
"step": 209
},
{
"epoch": 2.5454545454545454,
"grad_norm": 0.026055919006466866,
"learning_rate": 5.657906193327325e-05,
"loss": 0.0184,
"step": 210
},
{
"epoch": 2.5454545454545454,
"eval_loss": 0.023839673027396202,
"eval_runtime": 6.1855,
"eval_samples_per_second": 8.083,
"eval_steps_per_second": 2.102,
"step": 210
},
{
"epoch": 2.5575757575757576,
"grad_norm": 0.03009297326207161,
"learning_rate": 5.6156839738932343e-05,
"loss": 0.0233,
"step": 211
},
{
"epoch": 2.56969696969697,
"grad_norm": 0.038690801709890366,
"learning_rate": 5.573417126992003e-05,
"loss": 0.0419,
"step": 212
},
{
"epoch": 2.581818181818182,
"grad_norm": 0.03184739127755165,
"learning_rate": 5.531108716309547e-05,
"loss": 0.0208,
"step": 213
},
{
"epoch": 2.5939393939393938,
"grad_norm": 0.04226066172122955,
"learning_rate": 5.4887618085445094e-05,
"loss": 0.0356,
"step": 214
},
{
"epoch": 2.606060606060606,
"grad_norm": 0.02787015587091446,
"learning_rate": 5.446379473185972e-05,
"loss": 0.0199,
"step": 215
},
{
"epoch": 2.606060606060606,
"eval_loss": 0.023647097870707512,
"eval_runtime": 6.1981,
"eval_samples_per_second": 8.067,
"eval_steps_per_second": 2.097,
"step": 215
},
{
"epoch": 2.618181818181818,
"grad_norm": 0.028222182765603065,
"learning_rate": 5.4039647822909624e-05,
"loss": 0.0185,
"step": 216
},
{
"epoch": 2.6303030303030304,
"grad_norm": 0.03137464076280594,
"learning_rate": 5.361520810261779e-05,
"loss": 0.0212,
"step": 217
},
{
"epoch": 2.6424242424242426,
"grad_norm": 0.028826339170336723,
"learning_rate": 5.319050633623142e-05,
"loss": 0.0208,
"step": 218
},
{
"epoch": 2.6545454545454543,
"grad_norm": 0.048953138291835785,
"learning_rate": 5.2765573307992036e-05,
"loss": 0.0343,
"step": 219
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.03138812631368637,
"learning_rate": 5.234043981890394e-05,
"loss": 0.023,
"step": 220
},
{
"epoch": 2.6666666666666665,
"eval_loss": 0.02315612882375717,
"eval_runtime": 6.1852,
"eval_samples_per_second": 8.084,
"eval_steps_per_second": 2.102,
"step": 220
},
{
"epoch": 2.6787878787878787,
"grad_norm": 0.03604348748922348,
"learning_rate": 5.191513668450178e-05,
"loss": 0.0208,
"step": 221
},
{
"epoch": 2.690909090909091,
"grad_norm": 0.028721556067466736,
"learning_rate": 5.14896947326168e-05,
"loss": 0.0178,
"step": 222
},
{
"epoch": 2.703030303030303,
"grad_norm": 0.02585718221962452,
"learning_rate": 5.1064144801142374e-05,
"loss": 0.019,
"step": 223
},
{
"epoch": 2.7151515151515153,
"grad_norm": 0.02729875221848488,
"learning_rate": 5.0638517735798696e-05,
"loss": 0.0184,
"step": 224
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.029812021180987358,
"learning_rate": 5.021284438789694e-05,
"loss": 0.0227,
"step": 225
},
{
"epoch": 2.7272727272727275,
"eval_loss": 0.023449590429663658,
"eval_runtime": 6.1841,
"eval_samples_per_second": 8.085,
"eval_steps_per_second": 2.102,
"step": 225
},
{
"epoch": 2.7393939393939393,
"grad_norm": 0.02997618354856968,
"learning_rate": 4.9787155612103074e-05,
"loss": 0.0205,
"step": 226
},
{
"epoch": 2.7515151515151515,
"grad_norm": 0.028398435562849045,
"learning_rate": 4.936148226420132e-05,
"loss": 0.0171,
"step": 227
},
{
"epoch": 2.7636363636363637,
"grad_norm": 0.030046509578824043,
"learning_rate": 4.893585519885764e-05,
"loss": 0.0197,
"step": 228
},
{
"epoch": 2.775757575757576,
"grad_norm": 0.029226917773485184,
"learning_rate": 4.851030526738321e-05,
"loss": 0.0204,
"step": 229
},
{
"epoch": 2.787878787878788,
"grad_norm": 0.03432171046733856,
"learning_rate": 4.8084863315498234e-05,
"loss": 0.0206,
"step": 230
},
{
"epoch": 2.787878787878788,
"eval_loss": 0.022967081516981125,
"eval_runtime": 6.1905,
"eval_samples_per_second": 8.077,
"eval_steps_per_second": 2.1,
"step": 230
},
{
"epoch": 2.8,
"grad_norm": 0.029804987832903862,
"learning_rate": 4.765956018109607e-05,
"loss": 0.0197,
"step": 231
},
{
"epoch": 2.812121212121212,
"grad_norm": 0.029483767226338387,
"learning_rate": 4.723442669200798e-05,
"loss": 0.0213,
"step": 232
},
{
"epoch": 2.824242424242424,
"grad_norm": 0.03142073005437851,
"learning_rate": 4.680949366376858e-05,
"loss": 0.0237,
"step": 233
},
{
"epoch": 2.8363636363636364,
"grad_norm": 0.029062366113066673,
"learning_rate": 4.638479189738224e-05,
"loss": 0.0235,
"step": 234
},
{
"epoch": 2.8484848484848486,
"grad_norm": 0.03055807389318943,
"learning_rate": 4.5960352177090395e-05,
"loss": 0.0217,
"step": 235
},
{
"epoch": 2.8484848484848486,
"eval_loss": 0.022518714889883995,
"eval_runtime": 6.1918,
"eval_samples_per_second": 8.075,
"eval_steps_per_second": 2.1,
"step": 235
},
{
"epoch": 2.8606060606060604,
"grad_norm": 0.027205798774957657,
"learning_rate": 4.5536205268140294e-05,
"loss": 0.0189,
"step": 236
},
{
"epoch": 2.8727272727272726,
"grad_norm": 0.025477448478341103,
"learning_rate": 4.511238191455491e-05,
"loss": 0.0166,
"step": 237
},
{
"epoch": 2.8848484848484848,
"grad_norm": 0.025487707927823067,
"learning_rate": 4.468891283690454e-05,
"loss": 0.0183,
"step": 238
},
{
"epoch": 2.896969696969697,
"grad_norm": 0.0332886204123497,
"learning_rate": 4.4265828730079987e-05,
"loss": 0.0221,
"step": 239
},
{
"epoch": 2.909090909090909,
"grad_norm": 0.029150154441595078,
"learning_rate": 4.3843160261067655e-05,
"loss": 0.0186,
"step": 240
},
{
"epoch": 2.909090909090909,
"eval_loss": 0.022352781146764755,
"eval_runtime": 6.1961,
"eval_samples_per_second": 8.07,
"eval_steps_per_second": 2.098,
"step": 240
},
{
"epoch": 2.9212121212121214,
"grad_norm": 0.029591498896479607,
"learning_rate": 4.342093806672678e-05,
"loss": 0.0181,
"step": 241
},
{
"epoch": 2.9333333333333336,
"grad_norm": 0.03216252475976944,
"learning_rate": 4.2999192751568564e-05,
"loss": 0.0203,
"step": 242
},
{
"epoch": 2.9454545454545453,
"grad_norm": 0.02891668863594532,
"learning_rate": 4.2577954885537986e-05,
"loss": 0.0181,
"step": 243
},
{
"epoch": 2.9575757575757575,
"grad_norm": 0.028023086488246918,
"learning_rate": 4.215725500179787e-05,
"loss": 0.0191,
"step": 244
},
{
"epoch": 2.9696969696969697,
"grad_norm": 0.03082926571369171,
"learning_rate": 4.1737123594515756e-05,
"loss": 0.0201,
"step": 245
},
{
"epoch": 2.9696969696969697,
"eval_loss": 0.02198323793709278,
"eval_runtime": 6.1948,
"eval_samples_per_second": 8.071,
"eval_steps_per_second": 2.099,
"step": 245
},
{
"epoch": 2.981818181818182,
"grad_norm": 0.0321161188185215,
"learning_rate": 4.131759111665349e-05,
"loss": 0.0191,
"step": 246
},
{
"epoch": 2.993939393939394,
"grad_norm": 0.026935642585158348,
"learning_rate": 4.089868797775989e-05,
"loss": 0.0185,
"step": 247
},
{
"epoch": 3.006060606060606,
"grad_norm": 0.05411810800433159,
"learning_rate": 4.0480444541766576e-05,
"loss": 0.0301,
"step": 248
},
{
"epoch": 3.018181818181818,
"grad_norm": 0.027184097096323967,
"learning_rate": 4.0062891124787e-05,
"loss": 0.0188,
"step": 249
},
{
"epoch": 3.0303030303030303,
"grad_norm": 0.022542983293533325,
"learning_rate": 3.964605799291897e-05,
"loss": 0.0147,
"step": 250
},
{
"epoch": 3.0303030303030303,
"eval_loss": 0.021969465538859367,
"eval_runtime": 6.1917,
"eval_samples_per_second": 8.075,
"eval_steps_per_second": 2.1,
"step": 250
},
{
"epoch": 3.0424242424242425,
"grad_norm": 0.03472661226987839,
"learning_rate": 3.922997536005094e-05,
"loss": 0.0178,
"step": 251
},
{
"epoch": 3.0545454545454547,
"grad_norm": 0.028128741309046745,
"learning_rate": 3.8814673385671894e-05,
"loss": 0.0157,
"step": 252
},
{
"epoch": 3.066666666666667,
"grad_norm": 0.03135592117905617,
"learning_rate": 3.840018217268527e-05,
"loss": 0.0161,
"step": 253
},
{
"epoch": 3.0787878787878786,
"grad_norm": 0.03661385551095009,
"learning_rate": 3.7986531765226964e-05,
"loss": 0.0161,
"step": 254
},
{
"epoch": 3.090909090909091,
"grad_norm": 0.03205974027514458,
"learning_rate": 3.757375214648764e-05,
"loss": 0.0142,
"step": 255
},
{
"epoch": 3.090909090909091,
"eval_loss": 0.022621195763349533,
"eval_runtime": 6.2467,
"eval_samples_per_second": 8.004,
"eval_steps_per_second": 2.081,
"step": 255
},
{
"epoch": 3.103030303030303,
"grad_norm": 0.037527382373809814,
"learning_rate": 3.716187323653939e-05,
"loss": 0.0167,
"step": 256
},
{
"epoch": 3.1151515151515152,
"grad_norm": 0.03540443629026413,
"learning_rate": 3.675092489016693e-05,
"loss": 0.0168,
"step": 257
},
{
"epoch": 3.1272727272727274,
"grad_norm": 0.034389954060316086,
"learning_rate": 3.634093689470371e-05,
"loss": 0.017,
"step": 258
},
{
"epoch": 3.1393939393939396,
"grad_norm": 0.033294420689344406,
"learning_rate": 3.5931938967872766e-05,
"loss": 0.016,
"step": 259
},
{
"epoch": 3.1515151515151514,
"grad_norm": 0.028759747743606567,
"learning_rate": 3.5523960755632574e-05,
"loss": 0.0149,
"step": 260
},
{
"epoch": 3.1515151515151514,
"eval_loss": 0.021824924275279045,
"eval_runtime": 6.1966,
"eval_samples_per_second": 8.069,
"eval_steps_per_second": 2.098,
"step": 260
},
{
"epoch": 3.1636363636363636,
"grad_norm": 0.02833370864391327,
"learning_rate": 3.5117031830028274e-05,
"loss": 0.0127,
"step": 261
},
{
"epoch": 3.175757575757576,
"grad_norm": 0.0286524910479784,
"learning_rate": 3.471118168704811e-05,
"loss": 0.015,
"step": 262
},
{
"epoch": 3.187878787878788,
"grad_norm": 0.02769540622830391,
"learning_rate": 3.4306439744485454e-05,
"loss": 0.0154,
"step": 263
},
{
"epoch": 3.2,
"grad_norm": 0.03221355006098747,
"learning_rate": 3.390283533980646e-05,
"loss": 0.0167,
"step": 264
},
{
"epoch": 3.212121212121212,
"grad_norm": 0.026392612606287003,
"learning_rate": 3.350039772802354e-05,
"loss": 0.0151,
"step": 265
},
{
"epoch": 3.212121212121212,
"eval_loss": 0.02153392694890499,
"eval_runtime": 6.1848,
"eval_samples_per_second": 8.084,
"eval_steps_per_second": 2.102,
"step": 265
},
{
"epoch": 3.224242424242424,
"grad_norm": 0.03104759193956852,
"learning_rate": 3.309915607957487e-05,
"loss": 0.0171,
"step": 266
},
{
"epoch": 3.2363636363636363,
"grad_norm": 0.028836429119110107,
"learning_rate": 3.269913947820998e-05,
"loss": 0.0158,
"step": 267
},
{
"epoch": 3.2484848484848485,
"grad_norm": 0.032903432846069336,
"learning_rate": 3.2300376918881624e-05,
"loss": 0.0156,
"step": 268
},
{
"epoch": 3.2606060606060607,
"grad_norm": 0.030551951378583908,
"learning_rate": 3.1902897305644095e-05,
"loss": 0.0134,
"step": 269
},
{
"epoch": 3.2727272727272725,
"grad_norm": 0.030059080570936203,
"learning_rate": 3.1506729449558184e-05,
"loss": 0.0174,
"step": 270
},
{
"epoch": 3.2727272727272725,
"eval_loss": 0.021679332479834557,
"eval_runtime": 6.1964,
"eval_samples_per_second": 8.069,
"eval_steps_per_second": 2.098,
"step": 270
},
{
"epoch": 3.2848484848484847,
"grad_norm": 0.030379703268408775,
"learning_rate": 3.1111902066602724e-05,
"loss": 0.018,
"step": 271
},
{
"epoch": 3.296969696969697,
"grad_norm": 0.02761555276811123,
"learning_rate": 3.071844377559323e-05,
"loss": 0.016,
"step": 272
},
{
"epoch": 3.309090909090909,
"grad_norm": 0.026775086298584938,
"learning_rate": 3.0326383096107426e-05,
"loss": 0.014,
"step": 273
},
{
"epoch": 3.3212121212121213,
"grad_norm": 0.03328753635287285,
"learning_rate": 2.9935748446418066e-05,
"loss": 0.0169,
"step": 274
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.03564237430691719,
"learning_rate": 2.9546568141433006e-05,
"loss": 0.0172,
"step": 275
},
{
"epoch": 3.3333333333333335,
"eval_loss": 0.021324800327420235,
"eval_runtime": 6.1942,
"eval_samples_per_second": 8.072,
"eval_steps_per_second": 2.099,
"step": 275
},
{
"epoch": 3.3454545454545457,
"grad_norm": 0.028079047799110413,
"learning_rate": 2.915887039064287e-05,
"loss": 0.0141,
"step": 276
},
{
"epoch": 3.3575757575757574,
"grad_norm": 0.02460673451423645,
"learning_rate": 2.8772683296076196e-05,
"loss": 0.0126,
"step": 277
},
{
"epoch": 3.3696969696969696,
"grad_norm": 0.04345537722110748,
"learning_rate": 2.8388034850262646e-05,
"loss": 0.0376,
"step": 278
},
{
"epoch": 3.381818181818182,
"grad_norm": 0.0306687094271183,
"learning_rate": 2.8004952934203838e-05,
"loss": 0.017,
"step": 279
},
{
"epoch": 3.393939393939394,
"grad_norm": 0.033993735909461975,
"learning_rate": 2.762346531535246e-05,
"loss": 0.017,
"step": 280
},
{
"epoch": 3.393939393939394,
"eval_loss": 0.02108747698366642,
"eval_runtime": 6.2236,
"eval_samples_per_second": 8.034,
"eval_steps_per_second": 2.089,
"step": 280
},
{
"epoch": 3.4060606060606062,
"grad_norm": 0.02444186620414257,
"learning_rate": 2.7243599645599576e-05,
"loss": 0.014,
"step": 281
},
{
"epoch": 3.418181818181818,
"grad_norm": 0.028384167701005936,
"learning_rate": 2.6865383459270265e-05,
"loss": 0.0158,
"step": 282
},
{
"epoch": 3.43030303030303,
"grad_norm": 0.03162846714258194,
"learning_rate": 2.6488844171127903e-05,
"loss": 0.017,
"step": 283
},
{
"epoch": 3.4424242424242424,
"grad_norm": 0.027297567576169968,
"learning_rate": 2.6114009074386846e-05,
"loss": 0.0124,
"step": 284
},
{
"epoch": 3.4545454545454546,
"grad_norm": 0.03545952960848808,
"learning_rate": 2.574090533873431e-05,
"loss": 0.0223,
"step": 285
},
{
"epoch": 3.4545454545454546,
"eval_loss": 0.021236957982182503,
"eval_runtime": 6.1899,
"eval_samples_per_second": 8.078,
"eval_steps_per_second": 2.1,
"step": 285
},
{
"epoch": 3.466666666666667,
"grad_norm": 0.024587715044617653,
"learning_rate": 2.5369560008360828e-05,
"loss": 0.0132,
"step": 286
},
{
"epoch": 3.4787878787878785,
"grad_norm": 0.025963526219129562,
"learning_rate": 2.500000000000001e-05,
"loss": 0.0132,
"step": 287
},
{
"epoch": 3.4909090909090907,
"grad_norm": 0.03487967699766159,
"learning_rate": 2.4632252100977566e-05,
"loss": 0.0136,
"step": 288
},
{
"epoch": 3.503030303030303,
"grad_norm": 0.030390363186597824,
"learning_rate": 2.4266342967269552e-05,
"loss": 0.0157,
"step": 289
},
{
"epoch": 3.515151515151515,
"grad_norm": 0.030459538102149963,
"learning_rate": 2.3902299121570333e-05,
"loss": 0.0144,
"step": 290
},
{
"epoch": 3.515151515151515,
"eval_loss": 0.02107882872223854,
"eval_runtime": 6.2006,
"eval_samples_per_second": 8.064,
"eval_steps_per_second": 2.097,
"step": 290
},
{
"epoch": 3.5272727272727273,
"grad_norm": 0.02951274998486042,
"learning_rate": 2.354014695136997e-05,
"loss": 0.0131,
"step": 291
},
{
"epoch": 3.5393939393939395,
"grad_norm": 0.029193086549639702,
"learning_rate": 2.317991270704167e-05,
"loss": 0.0151,
"step": 292
},
{
"epoch": 3.5515151515151517,
"grad_norm": 0.02726319245994091,
"learning_rate": 2.282162249993895e-05,
"loss": 0.0125,
"step": 293
},
{
"epoch": 3.5636363636363635,
"grad_norm": 0.03212954103946686,
"learning_rate": 2.246530230050301e-05,
"loss": 0.0162,
"step": 294
},
{
"epoch": 3.5757575757575757,
"grad_norm": 0.024175025522708893,
"learning_rate": 2.211097793638029e-05,
"loss": 0.0125,
"step": 295
},
{
"epoch": 3.5757575757575757,
"eval_loss": 0.020796656608581543,
"eval_runtime": 6.1933,
"eval_samples_per_second": 8.073,
"eval_steps_per_second": 2.099,
"step": 295
},
{
"epoch": 3.587878787878788,
"grad_norm": 0.02679980918765068,
"learning_rate": 2.175867509055033e-05,
"loss": 0.0111,
"step": 296
},
{
"epoch": 3.6,
"grad_norm": 0.031121132895350456,
"learning_rate": 2.1408419299464245e-05,
"loss": 0.0165,
"step": 297
},
{
"epoch": 3.6121212121212123,
"grad_norm": 0.030764909461140633,
"learning_rate": 2.106023595119358e-05,
"loss": 0.0139,
"step": 298
},
{
"epoch": 3.624242424242424,
"grad_norm": 0.0302122812718153,
"learning_rate": 2.071415028359026e-05,
"loss": 0.0148,
"step": 299
},
{
"epoch": 3.6363636363636362,
"grad_norm": 0.029834948480129242,
"learning_rate": 2.0370187382457068e-05,
"loss": 0.0163,
"step": 300
},
{
"epoch": 3.6363636363636362,
"eval_loss": 0.02069205790758133,
"eval_runtime": 6.2054,
"eval_samples_per_second": 8.057,
"eval_steps_per_second": 2.095,
"step": 300
},
{
"epoch": 3.6484848484848484,
"grad_norm": 0.03009135089814663,
"learning_rate": 2.0028372179729403e-05,
"loss": 0.0143,
"step": 301
},
{
"epoch": 3.6606060606060606,
"grad_norm": 0.029639270156621933,
"learning_rate": 1.9688729451668114e-05,
"loss": 0.0168,
"step": 302
},
{
"epoch": 3.672727272727273,
"grad_norm": 0.026824606582522392,
"learning_rate": 1.935128381706355e-05,
"loss": 0.0129,
"step": 303
},
{
"epoch": 3.6848484848484846,
"grad_norm": 0.03427920117974281,
"learning_rate": 1.901605973545116e-05,
"loss": 0.0194,
"step": 304
},
{
"epoch": 3.6969696969696972,
"grad_norm": 0.031160475686192513,
"learning_rate": 1.868308150533847e-05,
"loss": 0.015,
"step": 305
},
{
"epoch": 3.6969696969696972,
"eval_loss": 0.020683376118540764,
"eval_runtime": 6.1876,
"eval_samples_per_second": 8.081,
"eval_steps_per_second": 2.101,
"step": 305
},
{
"epoch": 3.709090909090909,
"grad_norm": 0.033249229192733765,
"learning_rate": 1.8352373262443916e-05,
"loss": 0.0147,
"step": 306
},
{
"epoch": 3.721212121212121,
"grad_norm": 0.029073260724544525,
"learning_rate": 1.8023958977947304e-05,
"loss": 0.0157,
"step": 307
},
{
"epoch": 3.7333333333333334,
"grad_norm": 0.030520522966980934,
"learning_rate": 1.7697862456752273e-05,
"loss": 0.0152,
"step": 308
},
{
"epoch": 3.7454545454545456,
"grad_norm": 0.029693983495235443,
"learning_rate": 1.7374107335760936e-05,
"loss": 0.0172,
"step": 309
},
{
"epoch": 3.757575757575758,
"grad_norm": 0.03103681467473507,
"learning_rate": 1.7052717082160346e-05,
"loss": 0.0154,
"step": 310
},
{
"epoch": 3.757575757575758,
"eval_loss": 0.02056981809437275,
"eval_runtime": 6.1896,
"eval_samples_per_second": 8.078,
"eval_steps_per_second": 2.1,
"step": 310
},
{
"epoch": 3.7696969696969695,
"grad_norm": 0.030047744512557983,
"learning_rate": 1.673371499172174e-05,
"loss": 0.015,
"step": 311
},
{
"epoch": 3.7818181818181817,
"grad_norm": 0.03367823734879494,
"learning_rate": 1.6417124187111775e-05,
"loss": 0.017,
"step": 312
},
{
"epoch": 3.793939393939394,
"grad_norm": 0.027037424966692924,
"learning_rate": 1.610296761621662e-05,
"loss": 0.0145,
"step": 313
},
{
"epoch": 3.806060606060606,
"grad_norm": 0.030140092596411705,
"learning_rate": 1.5791268050478486e-05,
"loss": 0.0228,
"step": 314
},
{
"epoch": 3.8181818181818183,
"grad_norm": 0.031016338616609573,
"learning_rate": 1.5482048083245114e-05,
"loss": 0.0186,
"step": 315
},
{
"epoch": 3.8181818181818183,
"eval_loss": 0.02028246596455574,
"eval_runtime": 6.1869,
"eval_samples_per_second": 8.082,
"eval_steps_per_second": 2.101,
"step": 315
},
{
"epoch": 3.83030303030303,
"grad_norm": 0.02848219871520996,
"learning_rate": 1.517533012813217e-05,
"loss": 0.0159,
"step": 316
},
{
"epoch": 3.8424242424242423,
"grad_norm": 0.024206412956118584,
"learning_rate": 1.4871136417398406e-05,
"loss": 0.0123,
"step": 317
},
{
"epoch": 3.8545454545454545,
"grad_norm": 0.03030635416507721,
"learning_rate": 1.4569489000334436e-05,
"loss": 0.0137,
"step": 318
},
{
"epoch": 3.8666666666666667,
"grad_norm": 0.03156241029500961,
"learning_rate": 1.427040974166427e-05,
"loss": 0.0159,
"step": 319
},
{
"epoch": 3.878787878787879,
"grad_norm": 0.028942270204424858,
"learning_rate": 1.3973920319960655e-05,
"loss": 0.0135,
"step": 320
},
{
"epoch": 3.878787878787879,
"eval_loss": 0.020162392407655716,
"eval_runtime": 6.1914,
"eval_samples_per_second": 8.076,
"eval_steps_per_second": 2.1,
"step": 320
},
{
"epoch": 3.8909090909090907,
"grad_norm": 0.03344618156552315,
"learning_rate": 1.3680042226073552e-05,
"loss": 0.0148,
"step": 321
},
{
"epoch": 3.9030303030303033,
"grad_norm": 0.02961633913218975,
"learning_rate": 1.3388796761572492e-05,
"loss": 0.0141,
"step": 322
},
{
"epoch": 3.915151515151515,
"grad_norm": 0.030708983540534973,
"learning_rate": 1.310020503720254e-05,
"loss": 0.0132,
"step": 323
},
{
"epoch": 3.9272727272727272,
"grad_norm": 0.030072160065174103,
"learning_rate": 1.2814287971354022e-05,
"loss": 0.0161,
"step": 324
},
{
"epoch": 3.9393939393939394,
"grad_norm": 0.03028644621372223,
"learning_rate": 1.253106628854635e-05,
"loss": 0.0159,
"step": 325
},
{
"epoch": 3.9393939393939394,
"eval_loss": 0.020128030329942703,
"eval_runtime": 6.1885,
"eval_samples_per_second": 8.08,
"eval_steps_per_second": 2.101,
"step": 325
},
{
"epoch": 3.9515151515151516,
"grad_norm": 0.02972756326198578,
"learning_rate": 1.2250560517925746e-05,
"loss": 0.0142,
"step": 326
},
{
"epoch": 3.963636363636364,
"grad_norm": 0.02873014286160469,
"learning_rate": 1.1972790991777311e-05,
"loss": 0.0155,
"step": 327
},
{
"epoch": 3.9757575757575756,
"grad_norm": 0.028870223090052605,
"learning_rate": 1.1697777844051105e-05,
"loss": 0.0142,
"step": 328
},
{
"epoch": 3.987878787878788,
"grad_norm": 0.02774449624121189,
"learning_rate": 1.1425541008902851e-05,
"loss": 0.0147,
"step": 329
},
{
"epoch": 4.0,
"grad_norm": 0.04664117470383644,
"learning_rate": 1.1156100219249022e-05,
"loss": 0.0211,
"step": 330
},
{
"epoch": 4.0,
"eval_loss": 0.01995450258255005,
"eval_runtime": 6.191,
"eval_samples_per_second": 8.076,
"eval_steps_per_second": 2.1,
"step": 330
},
{
"epoch": 4.012121212121212,
"grad_norm": 0.024339957162737846,
"learning_rate": 1.0889475005336446e-05,
"loss": 0.0133,
"step": 331
},
{
"epoch": 4.024242424242424,
"grad_norm": 0.023921139538288116,
"learning_rate": 1.0625684693326727e-05,
"loss": 0.013,
"step": 332
},
{
"epoch": 4.036363636363636,
"grad_norm": 0.02364080585539341,
"learning_rate": 1.036474840389537e-05,
"loss": 0.0132,
"step": 333
},
{
"epoch": 4.048484848484849,
"grad_norm": 0.02356121875345707,
"learning_rate": 1.0106685050845838e-05,
"loss": 0.0121,
"step": 334
},
{
"epoch": 4.0606060606060606,
"grad_norm": 0.025879928842186928,
"learning_rate": 9.851513339738628e-06,
"loss": 0.0134,
"step": 335
},
{
"epoch": 4.0606060606060606,
"eval_loss": 0.0202109944075346,
"eval_runtime": 6.1925,
"eval_samples_per_second": 8.074,
"eval_steps_per_second": 2.099,
"step": 335
},
{
"epoch": 4.072727272727272,
"grad_norm": 0.02288251556456089,
"learning_rate": 9.599251766535345e-06,
"loss": 0.0121,
"step": 336
},
{
"epoch": 4.084848484848485,
"grad_norm": 0.02703404612839222,
"learning_rate": 9.349918616258114e-06,
"loss": 0.0126,
"step": 337
},
{
"epoch": 4.096969696969697,
"grad_norm": 0.025494417175650597,
"learning_rate": 9.103531961664118e-06,
"loss": 0.0122,
"step": 338
},
{
"epoch": 4.109090909090909,
"grad_norm": 0.02807869389653206,
"learning_rate": 8.860109661935674e-06,
"loss": 0.0155,
"step": 339
},
{
"epoch": 4.121212121212121,
"grad_norm": 0.027642810717225075,
"learning_rate": 8.619669361385663e-06,
"loss": 0.0113,
"step": 340
},
{
"epoch": 4.121212121212121,
"eval_loss": 0.020561667159199715,
"eval_runtime": 6.1965,
"eval_samples_per_second": 8.069,
"eval_steps_per_second": 2.098,
"step": 340
},
{
"epoch": 4.133333333333334,
"grad_norm": 0.030346019193530083,
"learning_rate": 8.38222848817864e-06,
"loss": 0.0127,
"step": 341
},
{
"epoch": 4.1454545454545455,
"grad_norm": 0.024746423587203026,
"learning_rate": 8.14780425306758e-06,
"loss": 0.0121,
"step": 342
},
{
"epoch": 4.157575757575757,
"grad_norm": 0.026435259729623795,
"learning_rate": 7.91641364814628e-06,
"loss": 0.0109,
"step": 343
},
{
"epoch": 4.16969696969697,
"grad_norm": 0.02962976135313511,
"learning_rate": 7.688073445617799e-06,
"loss": 0.0108,
"step": 344
},
{
"epoch": 4.181818181818182,
"grad_norm": 0.02813326194882393,
"learning_rate": 7.462800196578662e-06,
"loss": 0.0117,
"step": 345
},
{
"epoch": 4.181818181818182,
"eval_loss": 0.0208114180713892,
"eval_runtime": 6.1943,
"eval_samples_per_second": 8.072,
"eval_steps_per_second": 2.099,
"step": 345
},
{
"epoch": 4.193939393939394,
"grad_norm": 0.03237050771713257,
"learning_rate": 7.240610229819195e-06,
"loss": 0.013,
"step": 346
},
{
"epoch": 4.206060606060606,
"grad_norm": 0.026286713778972626,
"learning_rate": 7.0215196506399515e-06,
"loss": 0.0108,
"step": 347
},
{
"epoch": 4.218181818181818,
"grad_norm": 0.026608100160956383,
"learning_rate": 6.8055443396842945e-06,
"loss": 0.0103,
"step": 348
},
{
"epoch": 4.2303030303030305,
"grad_norm": 0.03118029236793518,
"learning_rate": 6.592699951787362e-06,
"loss": 0.0138,
"step": 349
},
{
"epoch": 4.242424242424242,
"grad_norm": 0.030633771792054176,
"learning_rate": 6.3830019148412525e-06,
"loss": 0.0108,
"step": 350
},
{
"epoch": 4.242424242424242,
"eval_loss": 0.020906535908579826,
"eval_runtime": 6.1873,
"eval_samples_per_second": 8.081,
"eval_steps_per_second": 2.101,
"step": 350
},
{
"epoch": 4.254545454545455,
"grad_norm": 0.03377068042755127,
"learning_rate": 6.17646542867682e-06,
"loss": 0.0144,
"step": 351
},
{
"epoch": 4.266666666666667,
"grad_norm": 0.027513034641742706,
"learning_rate": 5.973105463961865e-06,
"loss": 0.0107,
"step": 352
},
{
"epoch": 4.278787878787879,
"grad_norm": 0.03077622503042221,
"learning_rate": 5.772936761116027e-06,
"loss": 0.0139,
"step": 353
},
{
"epoch": 4.290909090909091,
"grad_norm": 0.026948513463139534,
"learning_rate": 5.575973829242364e-06,
"loss": 0.0126,
"step": 354
},
{
"epoch": 4.303030303030303,
"grad_norm": 0.02911302261054516,
"learning_rate": 5.382230945075556e-06,
"loss": 0.012,
"step": 355
},
{
"epoch": 4.303030303030303,
"eval_loss": 0.020746439695358276,
"eval_runtime": 6.1873,
"eval_samples_per_second": 8.081,
"eval_steps_per_second": 2.101,
"step": 355
},
{
"epoch": 4.315151515151515,
"grad_norm": 0.02364683710038662,
"learning_rate": 5.191722151947226e-06,
"loss": 0.0099,
"step": 356
},
{
"epoch": 4.327272727272727,
"grad_norm": 0.031056983396410942,
"learning_rate": 5.004461258767873e-06,
"loss": 0.0143,
"step": 357
},
{
"epoch": 4.33939393939394,
"grad_norm": 0.02985430881381035,
"learning_rate": 4.820461839026047e-06,
"loss": 0.0126,
"step": 358
},
{
"epoch": 4.351515151515152,
"grad_norm": 0.02651560679078102,
"learning_rate": 4.639737229804403e-06,
"loss": 0.0115,
"step": 359
},
{
"epoch": 4.363636363636363,
"grad_norm": 0.026479771360754967,
"learning_rate": 4.462300530813024e-06,
"loss": 0.0111,
"step": 360
},
{
"epoch": 4.363636363636363,
"eval_loss": 0.020565090700984,
"eval_runtime": 6.2012,
"eval_samples_per_second": 8.063,
"eval_steps_per_second": 2.096,
"step": 360
},
{
"epoch": 4.375757575757576,
"grad_norm": 0.02071288600564003,
"learning_rate": 4.2881646034398925e-06,
"loss": 0.0114,
"step": 361
},
{
"epoch": 4.387878787878788,
"grad_norm": 0.02533382549881935,
"learning_rate": 4.117342069818603e-06,
"loss": 0.0113,
"step": 362
},
{
"epoch": 4.4,
"grad_norm": 0.021412553265690804,
"learning_rate": 3.949845311913492e-06,
"loss": 0.0115,
"step": 363
},
{
"epoch": 4.412121212121212,
"grad_norm": 0.024555562064051628,
"learning_rate": 3.7856864706221185e-06,
"loss": 0.0098,
"step": 364
},
{
"epoch": 4.424242424242424,
"grad_norm": 0.027632344514131546,
"learning_rate": 3.6248774448952695e-06,
"loss": 0.0118,
"step": 365
},
{
"epoch": 4.424242424242424,
"eval_loss": 0.02050224132835865,
"eval_runtime": 6.1859,
"eval_samples_per_second": 8.083,
"eval_steps_per_second": 2.102,
"step": 365
},
{
"epoch": 4.4363636363636365,
"grad_norm": 0.028998758643865585,
"learning_rate": 3.467429890874424e-06,
"loss": 0.0111,
"step": 366
},
{
"epoch": 4.448484848484848,
"grad_norm": 0.027949200943112373,
"learning_rate": 3.3133552210468875e-06,
"loss": 0.0122,
"step": 367
},
{
"epoch": 4.460606060606061,
"grad_norm": 0.029877539724111557,
"learning_rate": 3.162664603418608e-06,
"loss": 0.0136,
"step": 368
},
{
"epoch": 4.472727272727273,
"grad_norm": 0.02742207795381546,
"learning_rate": 3.0153689607045845e-06,
"loss": 0.0116,
"step": 369
},
{
"epoch": 4.484848484848484,
"grad_norm": 0.02200300246477127,
"learning_rate": 2.871478969537206e-06,
"loss": 0.0099,
"step": 370
},
{
"epoch": 4.484848484848484,
"eval_loss": 0.02058413252234459,
"eval_runtime": 6.1924,
"eval_samples_per_second": 8.074,
"eval_steps_per_second": 2.099,
"step": 370
},
{
"epoch": 4.496969696969697,
"grad_norm": 0.029120702296495438,
"learning_rate": 2.731005059692332e-06,
"loss": 0.0212,
"step": 371
},
{
"epoch": 4.509090909090909,
"grad_norm": 0.030737141147255898,
"learning_rate": 2.5939574133333312e-06,
"loss": 0.0275,
"step": 372
},
{
"epoch": 4.5212121212121215,
"grad_norm": 0.02619299292564392,
"learning_rate": 2.4603459642729863e-06,
"loss": 0.0104,
"step": 373
},
{
"epoch": 4.533333333333333,
"grad_norm": 0.02171757072210312,
"learning_rate": 2.330180397253473e-06,
"loss": 0.0099,
"step": 374
},
{
"epoch": 4.545454545454545,
"grad_norm": 0.022865932434797287,
"learning_rate": 2.203470147244385e-06,
"loss": 0.0118,
"step": 375
},
{
"epoch": 4.545454545454545,
"eval_loss": 0.020584262907505035,
"eval_runtime": 6.193,
"eval_samples_per_second": 8.074,
"eval_steps_per_second": 2.099,
"step": 375
},
{
"epoch": 4.557575757575758,
"grad_norm": 0.02180948108434677,
"learning_rate": 2.0802243987588066e-06,
"loss": 0.0104,
"step": 376
},
{
"epoch": 4.569696969696969,
"grad_norm": 0.027300819754600525,
"learning_rate": 1.9604520851876198e-06,
"loss": 0.0121,
"step": 377
},
{
"epoch": 4.581818181818182,
"grad_norm": 0.027127033099532127,
"learning_rate": 1.8441618881519184e-06,
"loss": 0.0113,
"step": 378
},
{
"epoch": 4.593939393939394,
"grad_norm": 0.026878971606492996,
"learning_rate": 1.7313622368738014e-06,
"loss": 0.0105,
"step": 379
},
{
"epoch": 4.606060606060606,
"grad_norm": 0.02648119069635868,
"learning_rate": 1.6220613075653202e-06,
"loss": 0.0119,
"step": 380
},
{
"epoch": 4.606060606060606,
"eval_loss": 0.02056843228638172,
"eval_runtime": 6.1957,
"eval_samples_per_second": 8.07,
"eval_steps_per_second": 2.098,
"step": 380
},
{
"epoch": 4.618181818181818,
"grad_norm": 0.026216818019747734,
"learning_rate": 1.51626702283586e-06,
"loss": 0.0098,
"step": 381
},
{
"epoch": 4.63030303030303,
"grad_norm": 0.02351340465247631,
"learning_rate": 1.4139870511178766e-06,
"loss": 0.0116,
"step": 382
},
{
"epoch": 4.642424242424243,
"grad_norm": 0.030724933370947838,
"learning_rate": 1.3152288061110518e-06,
"loss": 0.0101,
"step": 383
},
{
"epoch": 4.654545454545454,
"grad_norm": 0.02688099816441536,
"learning_rate": 1.2199994462448904e-06,
"loss": 0.0113,
"step": 384
},
{
"epoch": 4.666666666666667,
"grad_norm": 0.02722257934510708,
"learning_rate": 1.128305874159896e-06,
"loss": 0.0114,
"step": 385
},
{
"epoch": 4.666666666666667,
"eval_loss": 0.02059413306415081,
"eval_runtime": 6.2059,
"eval_samples_per_second": 8.057,
"eval_steps_per_second": 2.095,
"step": 385
},
{
"epoch": 4.678787878787879,
"grad_norm": 0.02383565530180931,
"learning_rate": 1.040154736207194e-06,
"loss": 0.0106,
"step": 386
},
{
"epoch": 4.690909090909091,
"grad_norm": 0.028037432581186295,
"learning_rate": 9.555524219667989e-07,
"loss": 0.0122,
"step": 387
},
{
"epoch": 4.703030303030303,
"grad_norm": 0.021838784217834473,
"learning_rate": 8.745050637844532e-07,
"loss": 0.0097,
"step": 388
},
{
"epoch": 4.715151515151515,
"grad_norm": 0.030912073329091072,
"learning_rate": 7.970185363271431e-07,
"loss": 0.0121,
"step": 389
},
{
"epoch": 4.7272727272727275,
"grad_norm": 0.02470664493739605,
"learning_rate": 7.230984561572729e-07,
"loss": 0.0109,
"step": 390
},
{
"epoch": 4.7272727272727275,
"eval_loss": 0.020591916516423225,
"eval_runtime": 6.1901,
"eval_samples_per_second": 8.077,
"eval_steps_per_second": 2.1,
"step": 390
},
{
"epoch": 4.739393939393939,
"grad_norm": 0.030810924246907234,
"learning_rate": 6.527501813255344e-07,
"loss": 0.0165,
"step": 391
},
{
"epoch": 4.751515151515152,
"grad_norm": 0.025043383240699768,
"learning_rate": 5.859788109825793e-07,
"loss": 0.0096,
"step": 392
},
{
"epoch": 4.763636363636364,
"grad_norm": 0.028706299141049385,
"learning_rate": 5.227891850093314e-07,
"loss": 0.0129,
"step": 393
},
{
"epoch": 4.775757575757575,
"grad_norm": 0.0277020912617445,
"learning_rate": 4.6318588366625616e-07,
"loss": 0.0122,
"step": 394
},
{
"epoch": 4.787878787878788,
"grad_norm": 0.02909735217690468,
"learning_rate": 4.071732272613149e-07,
"loss": 0.0124,
"step": 395
},
{
"epoch": 4.787878787878788,
"eval_loss": 0.020531287416815758,
"eval_runtime": 6.1957,
"eval_samples_per_second": 8.07,
"eval_steps_per_second": 2.098,
"step": 395
},
{
"epoch": 4.8,
"grad_norm": 0.024658478796482086,
"learning_rate": 3.5475527583681e-07,
"loss": 0.0112,
"step": 396
},
{
"epoch": 4.8121212121212125,
"grad_norm": 0.023081207647919655,
"learning_rate": 3.059358288751202e-07,
"loss": 0.0102,
"step": 397
},
{
"epoch": 4.824242424242424,
"grad_norm": 0.03320831060409546,
"learning_rate": 2.6071842502326527e-07,
"loss": 0.0119,
"step": 398
},
{
"epoch": 4.836363636363636,
"grad_norm": 0.025276964530348778,
"learning_rate": 2.1910634183644474e-07,
"loss": 0.0117,
"step": 399
},
{
"epoch": 4.848484848484849,
"grad_norm": 0.02636777050793171,
"learning_rate": 1.811025955404333e-07,
"loss": 0.0111,
"step": 400
},
{
"epoch": 4.848484848484849,
"eval_loss": 0.020571600645780563,
"eval_runtime": 6.1951,
"eval_samples_per_second": 8.071,
"eval_steps_per_second": 2.098,
"step": 400
},
{
"epoch": 4.86060606060606,
"grad_norm": 0.0283295139670372,
"learning_rate": 1.4670994081297795e-07,
"loss": 0.0141,
"step": 401
},
{
"epoch": 4.872727272727273,
"grad_norm": 0.025880116969347,
"learning_rate": 1.1593087058410779e-07,
"loss": 0.0113,
"step": 402
},
{
"epoch": 4.884848484848485,
"grad_norm": 0.02684679627418518,
"learning_rate": 8.876761585545068e-08,
"loss": 0.0104,
"step": 403
},
{
"epoch": 4.8969696969696965,
"grad_norm": 0.027327047660946846,
"learning_rate": 6.522214553850159e-08,
"loss": 0.0122,
"step": 404
},
{
"epoch": 4.909090909090909,
"grad_norm": 0.033394601196050644,
"learning_rate": 4.529616631193112e-08,
"loss": 0.012,
"step": 405
},
{
"epoch": 4.909090909090909,
"eval_loss": 0.020557112991809845,
"eval_runtime": 6.1866,
"eval_samples_per_second": 8.082,
"eval_steps_per_second": 2.101,
"step": 405
},
{
"epoch": 4.921212121212121,
"grad_norm": 0.02917388454079628,
"learning_rate": 2.899112249786229e-08,
"loss": 0.0115,
"step": 406
},
{
"epoch": 4.933333333333334,
"grad_norm": 0.032552849501371384,
"learning_rate": 1.6308195957182027e-08,
"loss": 0.0125,
"step": 407
},
{
"epoch": 4.945454545454545,
"grad_norm": 0.026771927252411842,
"learning_rate": 7.248306003865279e-09,
"loss": 0.0123,
"step": 408
},
{
"epoch": 4.957575757575757,
"grad_norm": 0.024449503049254417,
"learning_rate": 1.8121093383671738e-09,
"loss": 0.0117,
"step": 409
},
{
"epoch": 4.96969696969697,
"grad_norm": 0.02603001333773136,
"learning_rate": 0.0,
"loss": 0.0104,
"step": 410
},
{
"epoch": 4.96969696969697,
"eval_loss": 0.020535213872790337,
"eval_runtime": 6.2062,
"eval_samples_per_second": 8.056,
"eval_steps_per_second": 2.095,
"step": 410
}
],
"logging_steps": 1,
"max_steps": 410,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.27193584892674e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}