{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.2631939684715556,
"eval_steps": 500,
"global_step": 150,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02193283070596299,
"grad_norm": 0.4870755076408386,
"learning_rate": 8.695652173913044e-07,
"loss": 2.9525,
"step": 1
},
{
"epoch": 0.04386566141192598,
"grad_norm": 0.35328975319862366,
"learning_rate": 1.7391304347826088e-06,
"loss": 2.8102,
"step": 2
},
{
"epoch": 0.06579849211788896,
"grad_norm": 0.4670841097831726,
"learning_rate": 2.6086956521739132e-06,
"loss": 2.8848,
"step": 3
},
{
"epoch": 0.08773132282385196,
"grad_norm": 0.6589818000793457,
"learning_rate": 3.4782608695652175e-06,
"loss": 3.0569,
"step": 4
},
{
"epoch": 0.10966415352981494,
"grad_norm": 0.47537556290626526,
"learning_rate": 4.347826086956522e-06,
"loss": 2.8093,
"step": 5
},
{
"epoch": 0.13159698423577793,
"grad_norm": 0.6465238928794861,
"learning_rate": 5.2173913043478265e-06,
"loss": 3.06,
"step": 6
},
{
"epoch": 0.15352981494174092,
"grad_norm": 0.4992265999317169,
"learning_rate": 6.086956521739132e-06,
"loss": 2.98,
"step": 7
},
{
"epoch": 0.17546264564770392,
"grad_norm": 0.5841557383537292,
"learning_rate": 6.956521739130435e-06,
"loss": 2.9831,
"step": 8
},
{
"epoch": 0.1973954763536669,
"grad_norm": 0.41789695620536804,
"learning_rate": 7.82608695652174e-06,
"loss": 2.8239,
"step": 9
},
{
"epoch": 0.21932830705962988,
"grad_norm": 0.43609240651130676,
"learning_rate": 8.695652173913044e-06,
"loss": 2.8341,
"step": 10
},
{
"epoch": 0.24126113776559288,
"grad_norm": 0.3185977041721344,
"learning_rate": 9.565217391304349e-06,
"loss": 2.7383,
"step": 11
},
{
"epoch": 0.26319396847155585,
"grad_norm": 0.43681928515434265,
"learning_rate": 1.0434782608695653e-05,
"loss": 2.9235,
"step": 12
},
{
"epoch": 0.2851267991775189,
"grad_norm": 0.3825719952583313,
"learning_rate": 1.1304347826086957e-05,
"loss": 2.6964,
"step": 13
},
{
"epoch": 0.30705962988348184,
"grad_norm": 0.5028628706932068,
"learning_rate": 1.2173913043478263e-05,
"loss": 2.8067,
"step": 14
},
{
"epoch": 0.3289924605894448,
"grad_norm": 0.5002133846282959,
"learning_rate": 1.3043478260869566e-05,
"loss": 2.8252,
"step": 15
},
{
"epoch": 0.35092529129540784,
"grad_norm": 0.3002101182937622,
"learning_rate": 1.391304347826087e-05,
"loss": 2.7313,
"step": 16
},
{
"epoch": 0.3728581220013708,
"grad_norm": 0.4624859392642975,
"learning_rate": 1.4782608695652174e-05,
"loss": 2.7972,
"step": 17
},
{
"epoch": 0.3947909527073338,
"grad_norm": 0.57045578956604,
"learning_rate": 1.565217391304348e-05,
"loss": 3.0034,
"step": 18
},
{
"epoch": 0.4167237834132968,
"grad_norm": 0.4259621500968933,
"learning_rate": 1.6521739130434785e-05,
"loss": 2.8309,
"step": 19
},
{
"epoch": 0.43865661411925977,
"grad_norm": 0.46009618043899536,
"learning_rate": 1.739130434782609e-05,
"loss": 2.8408,
"step": 20
},
{
"epoch": 0.46058944482522274,
"grad_norm": 0.5475765466690063,
"learning_rate": 1.8260869565217393e-05,
"loss": 2.8943,
"step": 21
},
{
"epoch": 0.48252227553118576,
"grad_norm": 0.7618455290794373,
"learning_rate": 1.9130434782608697e-05,
"loss": 2.9959,
"step": 22
},
{
"epoch": 0.5044551062371487,
"grad_norm": 0.6684309840202332,
"learning_rate": 2e-05,
"loss": 2.9095,
"step": 23
},
{
"epoch": 0.5263879369431117,
"grad_norm": 0.545538604259491,
"learning_rate": 1.9999729347501484e-05,
"loss": 2.8353,
"step": 24
},
{
"epoch": 0.5483207676490747,
"grad_norm": 0.6128362417221069,
"learning_rate": 1.9998917404656488e-05,
"loss": 2.8127,
"step": 25
},
{
"epoch": 0.5702535983550377,
"grad_norm": 0.7084450125694275,
"learning_rate": 1.9997564215415886e-05,
"loss": 2.9335,
"step": 26
},
{
"epoch": 0.5921864290610007,
"grad_norm": 0.5246658325195312,
"learning_rate": 1.9995669853028485e-05,
"loss": 2.8186,
"step": 27
},
{
"epoch": 0.6141192597669637,
"grad_norm": 0.7249352335929871,
"learning_rate": 1.9993234420037072e-05,
"loss": 2.8336,
"step": 28
},
{
"epoch": 0.6360520904729267,
"grad_norm": 0.5946571826934814,
"learning_rate": 1.999025804827285e-05,
"loss": 2.8233,
"step": 29
},
{
"epoch": 0.6579849211788896,
"grad_norm": 0.42332401871681213,
"learning_rate": 1.9986740898848306e-05,
"loss": 2.7259,
"step": 30
},
{
"epoch": 0.6799177518848526,
"grad_norm": 0.9102485775947571,
"learning_rate": 1.99826831621485e-05,
"loss": 2.9075,
"step": 31
},
{
"epoch": 0.7018505825908157,
"grad_norm": 0.6430304646492004,
"learning_rate": 1.997808505782075e-05,
"loss": 2.9084,
"step": 32
},
{
"epoch": 0.7237834132967786,
"grad_norm": 0.5709037780761719,
"learning_rate": 1.9972946834762732e-05,
"loss": 2.8107,
"step": 33
},
{
"epoch": 0.7457162440027416,
"grad_norm": 0.49835285544395447,
"learning_rate": 1.9967268771109037e-05,
"loss": 2.7338,
"step": 34
},
{
"epoch": 0.7676490747087046,
"grad_norm": 0.5714299082756042,
"learning_rate": 1.996105117421608e-05,
"loss": 2.7816,
"step": 35
},
{
"epoch": 0.7895819054146676,
"grad_norm": 0.48475122451782227,
"learning_rate": 1.9954294380645497e-05,
"loss": 2.7685,
"step": 36
},
{
"epoch": 0.8115147361206305,
"grad_norm": 0.5309097766876221,
"learning_rate": 1.9946998756145894e-05,
"loss": 2.7778,
"step": 37
},
{
"epoch": 0.8334475668265936,
"grad_norm": 0.45952990651130676,
"learning_rate": 1.9939164695633067e-05,
"loss": 2.7954,
"step": 38
},
{
"epoch": 0.8553803975325566,
"grad_norm": 0.48351016640663147,
"learning_rate": 1.9930792623168638e-05,
"loss": 2.6792,
"step": 39
},
{
"epoch": 0.8773132282385195,
"grad_norm": 0.4013523459434509,
"learning_rate": 1.992188299193706e-05,
"loss": 2.7983,
"step": 40
},
{
"epoch": 0.8992460589444825,
"grad_norm": 0.318466454744339,
"learning_rate": 1.9912436284221134e-05,
"loss": 2.5984,
"step": 41
},
{
"epoch": 0.9211788896504455,
"grad_norm": 0.3661440908908844,
"learning_rate": 1.9902453011375865e-05,
"loss": 2.8038,
"step": 42
},
{
"epoch": 0.9431117203564084,
"grad_norm": 0.4626653790473938,
"learning_rate": 1.98919337138008e-05,
"loss": 2.7242,
"step": 43
},
{
"epoch": 0.9650445510623715,
"grad_norm": 0.30195215344429016,
"learning_rate": 1.9880878960910772e-05,
"loss": 2.6637,
"step": 44
},
{
"epoch": 0.9869773817683345,
"grad_norm": 0.40314823389053345,
"learning_rate": 1.9869289351105087e-05,
"loss": 2.7072,
"step": 45
},
{
"epoch": 1.0,
"grad_norm": 0.4138323962688446,
"learning_rate": 1.9857165511735105e-05,
"loss": 2.6019,
"step": 46
},
{
"epoch": 1.021932830705963,
"grad_norm": 0.29498204588890076,
"learning_rate": 1.9844508099070313e-05,
"loss": 2.5943,
"step": 47
},
{
"epoch": 1.043865661411926,
"grad_norm": 0.29517868161201477,
"learning_rate": 1.9831317798262787e-05,
"loss": 2.6963,
"step": 48
},
{
"epoch": 1.065798492117889,
"grad_norm": 0.29591700434684753,
"learning_rate": 1.98175953233101e-05,
"loss": 2.7676,
"step": 49
},
{
"epoch": 1.0877313228238519,
"grad_norm": 0.2561758756637573,
"learning_rate": 1.980334141701667e-05,
"loss": 2.629,
"step": 50
},
{
"epoch": 1.1096641535298148,
"grad_norm": 0.2604333460330963,
"learning_rate": 1.978855685095358e-05,
"loss": 2.7115,
"step": 51
},
{
"epoch": 1.1315969842357778,
"grad_norm": 0.4252321422100067,
"learning_rate": 1.977324242541677e-05,
"loss": 2.6442,
"step": 52
},
{
"epoch": 1.153529814941741,
"grad_norm": 0.23818732798099518,
"learning_rate": 1.9757398969383752e-05,
"loss": 2.7172,
"step": 53
},
{
"epoch": 1.175462645647704,
"grad_norm": 0.16449472308158875,
"learning_rate": 1.974102734046872e-05,
"loss": 2.6405,
"step": 54
},
{
"epoch": 1.197395476353667,
"grad_norm": 0.25156456232070923,
"learning_rate": 1.9724128424876117e-05,
"loss": 2.6376,
"step": 55
},
{
"epoch": 1.21932830705963,
"grad_norm": 0.1786828637123108,
"learning_rate": 1.9706703137352695e-05,
"loss": 2.5754,
"step": 56
},
{
"epoch": 1.2412611377655929,
"grad_norm": 0.21457381546497345,
"learning_rate": 1.968875242113798e-05,
"loss": 2.6309,
"step": 57
},
{
"epoch": 1.2631939684715559,
"grad_norm": 0.16352739930152893,
"learning_rate": 1.9670277247913205e-05,
"loss": 2.5966,
"step": 58
},
{
"epoch": 1.2851267991775188,
"grad_norm": 0.19751280546188354,
"learning_rate": 1.965127861774873e-05,
"loss": 2.7516,
"step": 59
},
{
"epoch": 1.3070596298834818,
"grad_norm": 0.14813798666000366,
"learning_rate": 1.96317575590499e-05,
"loss": 2.6464,
"step": 60
},
{
"epoch": 1.3289924605894448,
"grad_norm": 0.15344278514385223,
"learning_rate": 1.9611715128501378e-05,
"loss": 2.7081,
"step": 61
},
{
"epoch": 1.350925291295408,
"grad_norm": 0.1399158239364624,
"learning_rate": 1.9591152411009942e-05,
"loss": 2.6382,
"step": 62
},
{
"epoch": 1.3728581220013707,
"grad_norm": 0.1705392748117447,
"learning_rate": 1.9570070519645767e-05,
"loss": 2.6442,
"step": 63
},
{
"epoch": 1.3947909527073339,
"grad_norm": 0.16903157532215118,
"learning_rate": 1.9548470595582166e-05,
"loss": 2.5898,
"step": 64
},
{
"epoch": 1.4167237834132969,
"grad_norm": 0.13648824393749237,
"learning_rate": 1.9526353808033827e-05,
"loss": 2.5725,
"step": 65
},
{
"epoch": 1.4386566141192598,
"grad_norm": 0.129099041223526,
"learning_rate": 1.9503721354193507e-05,
"loss": 2.6863,
"step": 66
},
{
"epoch": 1.4605894448252228,
"grad_norm": 0.12659573554992676,
"learning_rate": 1.948057445916724e-05,
"loss": 2.6224,
"step": 67
},
{
"epoch": 1.4825222755311858,
"grad_norm": 0.1909603327512741,
"learning_rate": 1.9456914375908026e-05,
"loss": 2.6401,
"step": 68
},
{
"epoch": 1.5044551062371487,
"grad_norm": 0.25878530740737915,
"learning_rate": 1.9432742385147988e-05,
"loss": 2.6704,
"step": 69
},
{
"epoch": 1.5263879369431117,
"grad_norm": 0.13662640750408173,
"learning_rate": 1.9408059795329073e-05,
"loss": 2.6154,
"step": 70
},
{
"epoch": 1.5483207676490747,
"grad_norm": 0.13560184836387634,
"learning_rate": 1.9382867942532195e-05,
"loss": 2.6227,
"step": 71
},
{
"epoch": 1.5702535983550376,
"grad_norm": 0.1390749216079712,
"learning_rate": 1.9357168190404937e-05,
"loss": 2.6884,
"step": 72
},
{
"epoch": 1.5921864290610008,
"grad_norm": 0.12690310180187225,
"learning_rate": 1.9330961930087724e-05,
"loss": 2.674,
"step": 73
},
{
"epoch": 1.6141192597669636,
"grad_norm": 0.09753144532442093,
"learning_rate": 1.9304250580138524e-05,
"loss": 2.6209,
"step": 74
},
{
"epoch": 1.6360520904729268,
"grad_norm": 0.12032578140497208,
"learning_rate": 1.9277035586456056e-05,
"loss": 2.6915,
"step": 75
},
{
"epoch": 1.6579849211788895,
"grad_norm": 0.16928426921367645,
"learning_rate": 1.9249318422201524e-05,
"loss": 2.6132,
"step": 76
},
{
"epoch": 1.6799177518848527,
"grad_norm": 0.09693765640258789,
"learning_rate": 1.9221100587718884e-05,
"loss": 2.6633,
"step": 77
},
{
"epoch": 1.7018505825908157,
"grad_norm": 0.16512952744960785,
"learning_rate": 1.919238361045362e-05,
"loss": 2.6585,
"step": 78
},
{
"epoch": 1.7237834132967786,
"grad_norm": 0.14026080071926117,
"learning_rate": 1.916316904487005e-05,
"loss": 2.7693,
"step": 79
},
{
"epoch": 1.7457162440027416,
"grad_norm": 0.14885276556015015,
"learning_rate": 1.9133458472367216e-05,
"loss": 2.7144,
"step": 80
},
{
"epoch": 1.7676490747087046,
"grad_norm": 0.11436336487531662,
"learning_rate": 1.9103253501193256e-05,
"loss": 2.5687,
"step": 81
},
{
"epoch": 1.7895819054146676,
"grad_norm": 0.24450720846652985,
"learning_rate": 1.9072555766358346e-05,
"loss": 2.5209,
"step": 82
},
{
"epoch": 1.8115147361206305,
"grad_norm": 0.19669459760189056,
"learning_rate": 1.904136692954622e-05,
"loss": 2.5729,
"step": 83
},
{
"epoch": 1.8334475668265937,
"grad_norm": 0.1368049532175064,
"learning_rate": 1.900968867902419e-05,
"loss": 2.6268,
"step": 84
},
{
"epoch": 1.8553803975325565,
"grad_norm": 0.1277005970478058,
"learning_rate": 1.89775227295518e-05,
"loss": 2.6601,
"step": 85
},
{
"epoch": 1.8773132282385196,
"grad_norm": 0.09511096775531769,
"learning_rate": 1.8944870822287957e-05,
"loss": 2.5918,
"step": 86
},
{
"epoch": 1.8992460589444824,
"grad_norm": 0.06676003336906433,
"learning_rate": 1.891173472469672e-05,
"loss": 2.5671,
"step": 87
},
{
"epoch": 1.9211788896504456,
"grad_norm": 0.13803276419639587,
"learning_rate": 1.8878116230451615e-05,
"loss": 2.6257,
"step": 88
},
{
"epoch": 1.9431117203564083,
"grad_norm": 0.10050716996192932,
"learning_rate": 1.884401715933853e-05,
"loss": 2.6772,
"step": 89
},
{
"epoch": 1.9650445510623715,
"grad_norm": 0.1283024400472641,
"learning_rate": 1.8809439357157226e-05,
"loss": 2.6121,
"step": 90
},
{
"epoch": 1.9869773817683345,
"grad_norm": 0.13763798773288727,
"learning_rate": 1.8774384695621407e-05,
"loss": 2.568,
"step": 91
},
{
"epoch": 2.0,
"grad_norm": 0.22271642088890076,
"learning_rate": 1.8738855072257428e-05,
"loss": 2.5865,
"step": 92
},
{
"epoch": 2.021932830705963,
"grad_norm": 0.11131051927804947,
"learning_rate": 1.8702852410301556e-05,
"loss": 2.5144,
"step": 93
},
{
"epoch": 2.043865661411926,
"grad_norm": 0.11161550134420395,
"learning_rate": 1.8666378658595863e-05,
"loss": 2.5182,
"step": 94
},
{
"epoch": 2.065798492117889,
"grad_norm": 0.08552844822406769,
"learning_rate": 1.8629435791482765e-05,
"loss": 2.6402,
"step": 95
},
{
"epoch": 2.087731322823852,
"grad_norm": 0.11428907513618469,
"learning_rate": 1.8592025808698116e-05,
"loss": 2.6265,
"step": 96
},
{
"epoch": 2.109664153529815,
"grad_norm": 0.09980299323797226,
"learning_rate": 1.8554150735262975e-05,
"loss": 2.623,
"step": 97
},
{
"epoch": 2.131596984235778,
"grad_norm": 0.08849837630987167,
"learning_rate": 1.8515812621373998e-05,
"loss": 2.627,
"step": 98
},
{
"epoch": 2.153529814941741,
"grad_norm": 0.09756634384393692,
"learning_rate": 1.8477013542292446e-05,
"loss": 2.6233,
"step": 99
},
{
"epoch": 2.1754626456477038,
"grad_norm": 0.07598986476659775,
"learning_rate": 1.8437755598231857e-05,
"loss": 2.6633,
"step": 100
},
{
"epoch": 2.197395476353667,
"grad_norm": 0.18077166378498077,
"learning_rate": 1.8398040914244363e-05,
"loss": 2.6253,
"step": 101
},
{
"epoch": 2.2193283070596297,
"grad_norm": 0.14006111025810242,
"learning_rate": 1.8357871640105648e-05,
"loss": 2.6086,
"step": 102
},
{
"epoch": 2.241261137765593,
"grad_norm": 0.08927474915981293,
"learning_rate": 1.8317249950198598e-05,
"loss": 2.4877,
"step": 103
},
{
"epoch": 2.2631939684715556,
"grad_norm": 0.08106345683336258,
"learning_rate": 1.8276178043395588e-05,
"loss": 2.6523,
"step": 104
},
{
"epoch": 2.285126799177519,
"grad_norm": 0.09083954989910126,
"learning_rate": 1.8234658142939454e-05,
"loss": 2.6456,
"step": 105
},
{
"epoch": 2.307059629883482,
"grad_norm": 0.07053636014461517,
"learning_rate": 1.8192692496323158e-05,
"loss": 2.6306,
"step": 106
},
{
"epoch": 2.3289924605894448,
"grad_norm": 0.06668028235435486,
"learning_rate": 1.8150283375168112e-05,
"loss": 2.5175,
"step": 107
},
{
"epoch": 2.350925291295408,
"grad_norm": 0.13295046985149384,
"learning_rate": 1.8107433075101254e-05,
"loss": 2.473,
"step": 108
},
{
"epoch": 2.3728581220013707,
"grad_norm": 0.09093775600194931,
"learning_rate": 1.8064143915630723e-05,
"loss": 2.547,
"step": 109
},
{
"epoch": 2.394790952707334,
"grad_norm": 0.18302415311336517,
"learning_rate": 1.8020418240020362e-05,
"loss": 2.5652,
"step": 110
},
{
"epoch": 2.4167237834132966,
"grad_norm": 0.07250549644231796,
"learning_rate": 1.7976258415162836e-05,
"loss": 2.5828,
"step": 111
},
{
"epoch": 2.43865661411926,
"grad_norm": 0.3828829526901245,
"learning_rate": 1.7931666831451536e-05,
"loss": 2.5063,
"step": 112
},
{
"epoch": 2.4605894448252226,
"grad_norm": 0.5052575469017029,
"learning_rate": 1.7886645902651166e-05,
"loss": 2.6184,
"step": 113
},
{
"epoch": 2.4825222755311858,
"grad_norm": 0.07245208323001862,
"learning_rate": 1.7841198065767107e-05,
"loss": 2.5883,
"step": 114
},
{
"epoch": 2.504455106237149,
"grad_norm": 0.05822201445698738,
"learning_rate": 1.779532578091347e-05,
"loss": 2.6,
"step": 115
},
{
"epoch": 2.5263879369431117,
"grad_norm": 0.08909498155117035,
"learning_rate": 1.7749031531179962e-05,
"loss": 2.549,
"step": 116
},
{
"epoch": 2.5483207676490744,
"grad_norm": 0.07801003754138947,
"learning_rate": 1.7702317822497457e-05,
"loss": 2.5591,
"step": 117
},
{
"epoch": 2.5702535983550376,
"grad_norm": 0.2434745877981186,
"learning_rate": 1.7655187183502344e-05,
"loss": 2.6557,
"step": 118
},
{
"epoch": 2.592186429061001,
"grad_norm": 0.06782998889684677,
"learning_rate": 1.7607642165399665e-05,
"loss": 2.5268,
"step": 119
},
{
"epoch": 2.6141192597669636,
"grad_norm": 0.0920061469078064,
"learning_rate": 1.755968534182501e-05,
"loss": 2.5052,
"step": 120
},
{
"epoch": 2.6360520904729268,
"grad_norm": 0.0655408501625061,
"learning_rate": 1.7511319308705198e-05,
"loss": 2.5784,
"step": 121
},
{
"epoch": 2.6579849211788895,
"grad_norm": 0.13081596791744232,
"learning_rate": 1.746254668411778e-05,
"loss": 2.5305,
"step": 122
},
{
"epoch": 2.6799177518848527,
"grad_norm": 0.3778601884841919,
"learning_rate": 1.7413370108149288e-05,
"loss": 2.5846,
"step": 123
},
{
"epoch": 2.701850582590816,
"grad_norm": 0.07771953195333481,
"learning_rate": 1.7363792242752354e-05,
"loss": 2.563,
"step": 124
},
{
"epoch": 2.7237834132967786,
"grad_norm": 0.06316570937633514,
"learning_rate": 1.731381577160161e-05,
"loss": 2.5705,
"step": 125
},
{
"epoch": 2.7457162440027414,
"grad_norm": 0.10998786240816116,
"learning_rate": 1.726344339994841e-05,
"loss": 2.6364,
"step": 126
},
{
"epoch": 2.7676490747087046,
"grad_norm": 0.08724892884492874,
"learning_rate": 1.7212677854474402e-05,
"loss": 2.4682,
"step": 127
},
{
"epoch": 2.7895819054146678,
"grad_norm": 0.10323834419250488,
"learning_rate": 1.7161521883143936e-05,
"loss": 2.6225,
"step": 128
},
{
"epoch": 2.8115147361206305,
"grad_norm": 0.07787463814020157,
"learning_rate": 1.7109978255055295e-05,
"loss": 2.5786,
"step": 129
},
{
"epoch": 2.8334475668265937,
"grad_norm": 0.07260199636220932,
"learning_rate": 1.705804976029083e-05,
"loss": 2.678,
"step": 130
},
{
"epoch": 2.8553803975325565,
"grad_norm": 0.07246023416519165,
"learning_rate": 1.7005739209765906e-05,
"loss": 2.4259,
"step": 131
},
{
"epoch": 2.8773132282385196,
"grad_norm": 0.09231323003768921,
"learning_rate": 1.6953049435076768e-05,
"loss": 2.6071,
"step": 132
},
{
"epoch": 2.8992460589444824,
"grad_norm": 0.08409086614847183,
"learning_rate": 1.6899983288347248e-05,
"loss": 2.4503,
"step": 133
},
{
"epoch": 2.9211788896504456,
"grad_norm": 0.10303400456905365,
"learning_rate": 1.6846543642074382e-05,
"loss": 2.5125,
"step": 134
},
{
"epoch": 2.9431117203564083,
"grad_norm": 0.053891055285930634,
"learning_rate": 1.679273338897293e-05,
"loss": 2.6059,
"step": 135
},
{
"epoch": 2.9650445510623715,
"grad_norm": 0.08152981102466583,
"learning_rate": 1.6738555441818785e-05,
"loss": 2.62,
"step": 136
},
{
"epoch": 2.9869773817683347,
"grad_norm": 0.06032385304570198,
"learning_rate": 1.668401273329129e-05,
"loss": 2.534,
"step": 137
},
{
"epoch": 3.0,
"grad_norm": 0.08430641144514084,
"learning_rate": 1.6629108215814523e-05,
"loss": 2.5903,
"step": 138
},
{
"epoch": 3.021932830705963,
"grad_norm": 0.0818452313542366,
"learning_rate": 1.6573844861397444e-05,
"loss": 2.7053,
"step": 139
},
{
"epoch": 3.043865661411926,
"grad_norm": 0.11237948387861252,
"learning_rate": 1.6518225661473045e-05,
"loss": 2.4826,
"step": 140
},
{
"epoch": 3.065798492117889,
"grad_norm": 0.0827915221452713,
"learning_rate": 1.6462253626736413e-05,
"loss": 2.5895,
"step": 141
},
{
"epoch": 3.087731322823852,
"grad_norm": 0.11660678684711456,
"learning_rate": 1.6405931786981753e-05,
"loss": 2.5654,
"step": 142
},
{
"epoch": 3.109664153529815,
"grad_norm": 0.09041120111942291,
"learning_rate": 1.63492631909384e-05,
"loss": 2.5197,
"step": 143
},
{
"epoch": 3.131596984235778,
"grad_norm": 0.06614544242620468,
"learning_rate": 1.629225090610577e-05,
"loss": 2.5294,
"step": 144
},
{
"epoch": 3.153529814941741,
"grad_norm": 0.04271303862333298,
"learning_rate": 1.6234898018587336e-05,
"loss": 2.5046,
"step": 145
},
{
"epoch": 3.1754626456477038,
"grad_norm": 0.08055449277162552,
"learning_rate": 1.6177207632923558e-05,
"loss": 2.5966,
"step": 146
},
{
"epoch": 3.197395476353667,
"grad_norm": 0.06253743171691895,
"learning_rate": 1.6119182871923834e-05,
"loss": 2.5158,
"step": 147
},
{
"epoch": 3.2193283070596297,
"grad_norm": 0.08224272727966309,
"learning_rate": 1.606082687649748e-05,
"loss": 2.6654,
"step": 148
},
{
"epoch": 3.241261137765593,
"grad_norm": 0.09529576450586319,
"learning_rate": 1.6002142805483686e-05,
"loss": 2.5656,
"step": 149
},
{
"epoch": 3.2631939684715556,
"grad_norm": 0.04647281765937805,
"learning_rate": 1.5943133835480536e-05,
"loss": 2.5524,
"step": 150
}
],
"logging_steps": 1,
"max_steps": 450,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.2639143951466496e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}