{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.26430186054599203,
"eval_steps": 95,
"global_step": 190,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0013910624239262736,
"grad_norm": 57.481915217399184,
"learning_rate": 5.263157894736842e-06,
"loss": 4.0986,
"step": 1
},
{
"epoch": 0.0013910624239262736,
"eval_loss": 4.090541362762451,
"eval_runtime": 142.8408,
"eval_samples_per_second": 1.4,
"eval_steps_per_second": 0.7,
"step": 1
},
{
"epoch": 0.0027821248478525473,
"grad_norm": 55.2862073256691,
"learning_rate": 1.0526315789473684e-05,
"loss": 4.1421,
"step": 2
},
{
"epoch": 0.004173187271778821,
"grad_norm": 55.47340218339537,
"learning_rate": 1.5789473684210526e-05,
"loss": 4.1421,
"step": 3
},
{
"epoch": 0.0055642496957050945,
"grad_norm": 56.9856868028425,
"learning_rate": 2.105263157894737e-05,
"loss": 3.8228,
"step": 4
},
{
"epoch": 0.006955312119631368,
"grad_norm": 42.20672933294221,
"learning_rate": 2.6315789473684212e-05,
"loss": 3.5493,
"step": 5
},
{
"epoch": 0.008346374543557642,
"grad_norm": 31.332671300322296,
"learning_rate": 3.157894736842105e-05,
"loss": 3.2282,
"step": 6
},
{
"epoch": 0.009737436967483916,
"grad_norm": 35.093913062959864,
"learning_rate": 3.6842105263157895e-05,
"loss": 2.8776,
"step": 7
},
{
"epoch": 0.011128499391410189,
"grad_norm": 30.77232028891992,
"learning_rate": 4.210526315789474e-05,
"loss": 2.4868,
"step": 8
},
{
"epoch": 0.012519561815336464,
"grad_norm": 23.333354067825713,
"learning_rate": 4.736842105263158e-05,
"loss": 2.2354,
"step": 9
},
{
"epoch": 0.013910624239262736,
"grad_norm": 23.169980979193532,
"learning_rate": 5.2631578947368424e-05,
"loss": 2.1108,
"step": 10
},
{
"epoch": 0.01530168666318901,
"grad_norm": 49.58166411656502,
"learning_rate": 5.789473684210527e-05,
"loss": 1.6841,
"step": 11
},
{
"epoch": 0.016692749087115284,
"grad_norm": 65.3736167733946,
"learning_rate": 6.31578947368421e-05,
"loss": 1.4719,
"step": 12
},
{
"epoch": 0.018083811511041558,
"grad_norm": 54.8694395482998,
"learning_rate": 6.842105263157895e-05,
"loss": 1.3789,
"step": 13
},
{
"epoch": 0.019474873934967833,
"grad_norm": 24.218654599322196,
"learning_rate": 7.368421052631579e-05,
"loss": 1.2604,
"step": 14
},
{
"epoch": 0.020865936358894107,
"grad_norm": 28.815008657964423,
"learning_rate": 7.894736842105263e-05,
"loss": 1.1025,
"step": 15
},
{
"epoch": 0.022256998782820378,
"grad_norm": 23.828950081066402,
"learning_rate": 8.421052631578948e-05,
"loss": 0.9609,
"step": 16
},
{
"epoch": 0.023648061206746653,
"grad_norm": 8.100016954630625,
"learning_rate": 8.947368421052632e-05,
"loss": 0.8187,
"step": 17
},
{
"epoch": 0.025039123630672927,
"grad_norm": 5.803159862791972,
"learning_rate": 9.473684210526316e-05,
"loss": 0.6315,
"step": 18
},
{
"epoch": 0.0264301860545992,
"grad_norm": 4.013593116152618,
"learning_rate": 0.0001,
"loss": 0.5044,
"step": 19
},
{
"epoch": 0.027821248478525473,
"grad_norm": 3.5932241384347323,
"learning_rate": 0.00010526315789473685,
"loss": 0.4627,
"step": 20
},
{
"epoch": 0.029212310902451747,
"grad_norm": 3.348669532045955,
"learning_rate": 0.0001105263157894737,
"loss": 0.383,
"step": 21
},
{
"epoch": 0.03060337332637802,
"grad_norm": 2.00227796318112,
"learning_rate": 0.00011578947368421053,
"loss": 0.2781,
"step": 22
},
{
"epoch": 0.03199443575030429,
"grad_norm": 2.5417842762288867,
"learning_rate": 0.00012105263157894738,
"loss": 0.2456,
"step": 23
},
{
"epoch": 0.03338549817423057,
"grad_norm": 1.9711651000722923,
"learning_rate": 0.0001263157894736842,
"loss": 0.2142,
"step": 24
},
{
"epoch": 0.03477656059815684,
"grad_norm": 0.8970659213386247,
"learning_rate": 0.00013157894736842108,
"loss": 0.1834,
"step": 25
},
{
"epoch": 0.036167623022083116,
"grad_norm": 1.2108714072709537,
"learning_rate": 0.0001368421052631579,
"loss": 0.1679,
"step": 26
},
{
"epoch": 0.03755868544600939,
"grad_norm": 1.142669286718837,
"learning_rate": 0.00014210526315789474,
"loss": 0.1674,
"step": 27
},
{
"epoch": 0.038949747869935665,
"grad_norm": 1.336108532018483,
"learning_rate": 0.00014736842105263158,
"loss": 0.1477,
"step": 28
},
{
"epoch": 0.04034081029386194,
"grad_norm": 1.1813897424962707,
"learning_rate": 0.00015263157894736845,
"loss": 0.1232,
"step": 29
},
{
"epoch": 0.041731872717788214,
"grad_norm": 0.9570372121797639,
"learning_rate": 0.00015789473684210527,
"loss": 0.1354,
"step": 30
},
{
"epoch": 0.04312293514171448,
"grad_norm": 0.7746296553935763,
"learning_rate": 0.0001631578947368421,
"loss": 0.1332,
"step": 31
},
{
"epoch": 0.044513997565640756,
"grad_norm": 0.8805659567275839,
"learning_rate": 0.00016842105263157895,
"loss": 0.1264,
"step": 32
},
{
"epoch": 0.04590505998956703,
"grad_norm": 0.632878393506056,
"learning_rate": 0.0001736842105263158,
"loss": 0.1168,
"step": 33
},
{
"epoch": 0.047296122413493305,
"grad_norm": 0.8113076955440898,
"learning_rate": 0.00017894736842105264,
"loss": 0.1202,
"step": 34
},
{
"epoch": 0.04868718483741958,
"grad_norm": 0.5370417178894353,
"learning_rate": 0.00018421052631578948,
"loss": 0.0973,
"step": 35
},
{
"epoch": 0.050078247261345854,
"grad_norm": 0.7576013652590657,
"learning_rate": 0.00018947368421052632,
"loss": 0.1193,
"step": 36
},
{
"epoch": 0.05146930968527213,
"grad_norm": 0.43345436869612397,
"learning_rate": 0.00019473684210526317,
"loss": 0.1201,
"step": 37
},
{
"epoch": 0.0528603721091984,
"grad_norm": 0.5698956636180441,
"learning_rate": 0.0002,
"loss": 0.1032,
"step": 38
},
{
"epoch": 0.05425143453312467,
"grad_norm": 0.4321632422837542,
"learning_rate": 0.00019999578095183124,
"loss": 0.1024,
"step": 39
},
{
"epoch": 0.055642496957050945,
"grad_norm": 0.4635185005058889,
"learning_rate": 0.00019998312416333227,
"loss": 0.1152,
"step": 40
},
{
"epoch": 0.05703355938097722,
"grad_norm": 0.632963924999249,
"learning_rate": 0.00019996203070249516,
"loss": 0.1098,
"step": 41
},
{
"epoch": 0.058424621804903494,
"grad_norm": 0.6090390432507673,
"learning_rate": 0.00019993250234920636,
"loss": 0.1007,
"step": 42
},
{
"epoch": 0.05981568422882977,
"grad_norm": 0.37302557990999896,
"learning_rate": 0.0001998945415950969,
"loss": 0.1025,
"step": 43
},
{
"epoch": 0.06120674665275604,
"grad_norm": 0.4532125483802837,
"learning_rate": 0.00019984815164333163,
"loss": 0.1081,
"step": 44
},
{
"epoch": 0.06259780907668232,
"grad_norm": 0.41760638820645163,
"learning_rate": 0.00019979333640833947,
"loss": 0.1012,
"step": 45
},
{
"epoch": 0.06398887150060859,
"grad_norm": 0.444653137436646,
"learning_rate": 0.00019973010051548275,
"loss": 0.1063,
"step": 46
},
{
"epoch": 0.06537993392453487,
"grad_norm": 0.402931797262621,
"learning_rate": 0.000199658449300667,
"loss": 0.1125,
"step": 47
},
{
"epoch": 0.06677099634846113,
"grad_norm": 0.36319084051345213,
"learning_rate": 0.00019957838880989078,
"loss": 0.1089,
"step": 48
},
{
"epoch": 0.06816205877238742,
"grad_norm": 0.4234051915735979,
"learning_rate": 0.00019948992579873538,
"loss": 0.0954,
"step": 49
},
{
"epoch": 0.06955312119631368,
"grad_norm": 0.4377818054629029,
"learning_rate": 0.00019939306773179497,
"loss": 0.1224,
"step": 50
},
{
"epoch": 0.07094418362023996,
"grad_norm": 0.44410748361632796,
"learning_rate": 0.0001992878227820465,
"loss": 0.1067,
"step": 51
},
{
"epoch": 0.07233524604416623,
"grad_norm": 0.42230766014405663,
"learning_rate": 0.00019917419983016025,
"loss": 0.0987,
"step": 52
},
{
"epoch": 0.0737263084680925,
"grad_norm": 0.5226065230162815,
"learning_rate": 0.00019905220846375032,
"loss": 0.0887,
"step": 53
},
{
"epoch": 0.07511737089201878,
"grad_norm": 0.5319022813543092,
"learning_rate": 0.00019892185897656578,
"loss": 0.1,
"step": 54
},
{
"epoch": 0.07650843331594505,
"grad_norm": 0.3425765043506125,
"learning_rate": 0.00019878316236762196,
"loss": 0.098,
"step": 55
},
{
"epoch": 0.07789949573987133,
"grad_norm": 0.514819329146875,
"learning_rate": 0.00019863613034027224,
"loss": 0.0973,
"step": 56
},
{
"epoch": 0.0792905581637976,
"grad_norm": 0.39896545913697684,
"learning_rate": 0.00019848077530122083,
"loss": 0.0972,
"step": 57
},
{
"epoch": 0.08068162058772388,
"grad_norm": 0.34279296752627336,
"learning_rate": 0.0001983171103594755,
"loss": 0.093,
"step": 58
},
{
"epoch": 0.08207268301165015,
"grad_norm": 0.47443560517829403,
"learning_rate": 0.0001981451493252418,
"loss": 0.099,
"step": 59
},
{
"epoch": 0.08346374543557643,
"grad_norm": 0.2900117090081372,
"learning_rate": 0.0001979649067087574,
"loss": 0.0918,
"step": 60
},
{
"epoch": 0.0848548078595027,
"grad_norm": 0.2857162843888324,
"learning_rate": 0.00019777639771906795,
"loss": 0.0918,
"step": 61
},
{
"epoch": 0.08624587028342896,
"grad_norm": 0.35890054118986736,
"learning_rate": 0.00019757963826274357,
"loss": 0.0884,
"step": 62
},
{
"epoch": 0.08763693270735524,
"grad_norm": 0.43217741701531326,
"learning_rate": 0.0001973746449425368,
"loss": 0.0998,
"step": 63
},
{
"epoch": 0.08902799513128151,
"grad_norm": 0.2786604479535725,
"learning_rate": 0.0001971614350559814,
"loss": 0.1033,
"step": 64
},
{
"epoch": 0.0904190575552078,
"grad_norm": 0.3337063135618657,
"learning_rate": 0.00019694002659393305,
"loss": 0.0904,
"step": 65
},
{
"epoch": 0.09181011997913406,
"grad_norm": 0.4040733407860712,
"learning_rate": 0.0001967104382390511,
"loss": 0.0931,
"step": 66
},
{
"epoch": 0.09320118240306034,
"grad_norm": 0.4114917483532629,
"learning_rate": 0.00019647268936422206,
"loss": 0.1004,
"step": 67
},
{
"epoch": 0.09459224482698661,
"grad_norm": 0.35362892982845173,
"learning_rate": 0.00019622680003092503,
"loss": 0.0854,
"step": 68
},
{
"epoch": 0.09598330725091289,
"grad_norm": 0.3192988295451842,
"learning_rate": 0.0001959727909875389,
"loss": 0.093,
"step": 69
},
{
"epoch": 0.09737436967483916,
"grad_norm": 0.3444301394937438,
"learning_rate": 0.00019571068366759143,
"loss": 0.0851,
"step": 70
},
{
"epoch": 0.09876543209876543,
"grad_norm": 0.327068524632321,
"learning_rate": 0.00019544050018795075,
"loss": 0.0886,
"step": 71
},
{
"epoch": 0.10015649452269171,
"grad_norm": 0.2689917323964736,
"learning_rate": 0.0001951622633469592,
"loss": 0.087,
"step": 72
},
{
"epoch": 0.10154755694661798,
"grad_norm": 0.34135891760226605,
"learning_rate": 0.00019487599662250943,
"loss": 0.0922,
"step": 73
},
{
"epoch": 0.10293861937054426,
"grad_norm": 0.33730658236238953,
"learning_rate": 0.00019458172417006347,
"loss": 0.0873,
"step": 74
},
{
"epoch": 0.10432968179447052,
"grad_norm": 0.30924996627362245,
"learning_rate": 0.00019427947082061432,
"loss": 0.0933,
"step": 75
},
{
"epoch": 0.1057207442183968,
"grad_norm": 0.2853028980014277,
"learning_rate": 0.00019396926207859084,
"loss": 0.0947,
"step": 76
},
{
"epoch": 0.10711180664232307,
"grad_norm": 0.30335052470844565,
"learning_rate": 0.0001936511241197055,
"loss": 0.0863,
"step": 77
},
{
"epoch": 0.10850286906624934,
"grad_norm": 0.3291801673836605,
"learning_rate": 0.0001933250837887457,
"loss": 0.0904,
"step": 78
},
{
"epoch": 0.10989393149017562,
"grad_norm": 0.3045906501477626,
"learning_rate": 0.0001929911685973088,
"loss": 0.0782,
"step": 79
},
{
"epoch": 0.11128499391410189,
"grad_norm": 0.3463504019884883,
"learning_rate": 0.00019264940672148018,
"loss": 0.0957,
"step": 80
},
{
"epoch": 0.11267605633802817,
"grad_norm": 0.48475824530469097,
"learning_rate": 0.0001922998269994563,
"loss": 0.0998,
"step": 81
},
{
"epoch": 0.11406711876195444,
"grad_norm": 0.3347226966629457,
"learning_rate": 0.0001919424589291108,
"loss": 0.0899,
"step": 82
},
{
"epoch": 0.11545818118588072,
"grad_norm": 0.26519200345508975,
"learning_rate": 0.00019157733266550575,
"loss": 0.0977,
"step": 83
},
{
"epoch": 0.11684924360980699,
"grad_norm": 0.2921124084264725,
"learning_rate": 0.00019120447901834706,
"loss": 0.0825,
"step": 84
},
{
"epoch": 0.11824030603373327,
"grad_norm": 0.3489674437133695,
"learning_rate": 0.00019082392944938466,
"loss": 0.1015,
"step": 85
},
{
"epoch": 0.11963136845765954,
"grad_norm": 0.31722397694583876,
"learning_rate": 0.00019043571606975777,
"loss": 0.0894,
"step": 86
},
{
"epoch": 0.1210224308815858,
"grad_norm": 0.2955550980399664,
"learning_rate": 0.00019003987163728535,
"loss": 0.085,
"step": 87
},
{
"epoch": 0.12241349330551209,
"grad_norm": 0.2634355198377348,
"learning_rate": 0.00018963642955370201,
"loss": 0.0746,
"step": 88
},
{
"epoch": 0.12380455572943835,
"grad_norm": 0.27722503829642603,
"learning_rate": 0.0001892254238618394,
"loss": 0.0848,
"step": 89
},
{
"epoch": 0.12519561815336464,
"grad_norm": 0.35391362553378075,
"learning_rate": 0.00018880688924275378,
"loss": 0.0894,
"step": 90
},
{
"epoch": 0.12658668057729092,
"grad_norm": 0.3038040208253159,
"learning_rate": 0.00018838086101279945,
"loss": 0.0856,
"step": 91
},
{
"epoch": 0.12797774300121717,
"grad_norm": 0.26778899531444406,
"learning_rate": 0.0001879473751206489,
"loss": 0.091,
"step": 92
},
{
"epoch": 0.12936880542514345,
"grad_norm": 0.28007185258910006,
"learning_rate": 0.00018750646814425938,
"loss": 0.0952,
"step": 93
},
{
"epoch": 0.13075986784906973,
"grad_norm": 0.34452572989487623,
"learning_rate": 0.00018705817728778624,
"loss": 0.0969,
"step": 94
},
{
"epoch": 0.13215093027299601,
"grad_norm": 0.25095910646088126,
"learning_rate": 0.00018660254037844388,
"loss": 0.0934,
"step": 95
},
{
"epoch": 0.13215093027299601,
"eval_loss": 0.12252213805913925,
"eval_runtime": 134.0804,
"eval_samples_per_second": 1.492,
"eval_steps_per_second": 0.746,
"step": 95
},
{
"epoch": 0.13354199269692227,
"grad_norm": 0.2656299334713188,
"learning_rate": 0.00018613959586331362,
"loss": 0.0775,
"step": 96
},
{
"epoch": 0.13493305512084855,
"grad_norm": 0.3101848899761308,
"learning_rate": 0.00018566938280609966,
"loss": 0.0907,
"step": 97
},
{
"epoch": 0.13632411754477483,
"grad_norm": 0.20489097047659305,
"learning_rate": 0.00018519194088383273,
"loss": 0.0837,
"step": 98
},
{
"epoch": 0.13771517996870108,
"grad_norm": 0.20921809158838175,
"learning_rate": 0.0001847073103835222,
"loss": 0.0797,
"step": 99
},
{
"epoch": 0.13910624239262737,
"grad_norm": 0.34554065473236023,
"learning_rate": 0.00018421553219875658,
"loss": 0.0933,
"step": 100
},
{
"epoch": 0.14049730481655365,
"grad_norm": 0.2673616667778317,
"learning_rate": 0.00018371664782625287,
"loss": 0.1026,
"step": 101
},
{
"epoch": 0.14188836724047993,
"grad_norm": 0.19513622356148747,
"learning_rate": 0.00018321069936235503,
"loss": 0.0865,
"step": 102
},
{
"epoch": 0.14327942966440618,
"grad_norm": 0.2578837270272787,
"learning_rate": 0.00018269772949948182,
"loss": 0.0787,
"step": 103
},
{
"epoch": 0.14467049208833246,
"grad_norm": 0.3093953814557645,
"learning_rate": 0.0001821777815225245,
"loss": 0.0786,
"step": 104
},
{
"epoch": 0.14606155451225875,
"grad_norm": 0.2688836734240716,
"learning_rate": 0.0001816508993051943,
"loss": 0.0833,
"step": 105
},
{
"epoch": 0.147452616936185,
"grad_norm": 0.3619162434045556,
"learning_rate": 0.00018111712730632022,
"loss": 0.1117,
"step": 106
},
{
"epoch": 0.14884367936011128,
"grad_norm": 0.3200444785679587,
"learning_rate": 0.00018057651056609784,
"loss": 0.1029,
"step": 107
},
{
"epoch": 0.15023474178403756,
"grad_norm": 0.18877470060569954,
"learning_rate": 0.00018002909470228842,
"loss": 0.0778,
"step": 108
},
{
"epoch": 0.15162580420796384,
"grad_norm": 0.2771754241309713,
"learning_rate": 0.00017947492590637,
"loss": 0.088,
"step": 109
},
{
"epoch": 0.1530168666318901,
"grad_norm": 0.2858330892223555,
"learning_rate": 0.00017891405093963938,
"loss": 0.0965,
"step": 110
},
{
"epoch": 0.15440792905581638,
"grad_norm": 0.2592357748952338,
"learning_rate": 0.00017834651712926662,
"loss": 0.0771,
"step": 111
},
{
"epoch": 0.15579899147974266,
"grad_norm": 0.24744291594123116,
"learning_rate": 0.0001777723723643014,
"loss": 0.0867,
"step": 112
},
{
"epoch": 0.1571900539036689,
"grad_norm": 0.24974876526771628,
"learning_rate": 0.0001771916650916321,
"loss": 0.0726,
"step": 113
},
{
"epoch": 0.1585811163275952,
"grad_norm": 0.2594525140511626,
"learning_rate": 0.0001766044443118978,
"loss": 0.0801,
"step": 114
},
{
"epoch": 0.15997217875152148,
"grad_norm": 0.23549697860413413,
"learning_rate": 0.00017601075957535364,
"loss": 0.0828,
"step": 115
},
{
"epoch": 0.16136324117544776,
"grad_norm": 0.2608815695894903,
"learning_rate": 0.00017541066097768963,
"loss": 0.093,
"step": 116
},
{
"epoch": 0.162754303599374,
"grad_norm": 0.307954460594231,
"learning_rate": 0.00017480419915580356,
"loss": 0.0913,
"step": 117
},
{
"epoch": 0.1641453660233003,
"grad_norm": 0.22128217162262856,
"learning_rate": 0.00017419142528352817,
"loss": 0.0809,
"step": 118
},
{
"epoch": 0.16553642844722657,
"grad_norm": 0.27170941306905455,
"learning_rate": 0.00017357239106731317,
"loss": 0.0902,
"step": 119
},
{
"epoch": 0.16692749087115286,
"grad_norm": 0.3455641589787572,
"learning_rate": 0.0001729471487418621,
"loss": 0.0942,
"step": 120
},
{
"epoch": 0.1683185532950791,
"grad_norm": 0.2907489840537234,
"learning_rate": 0.00017231575106572467,
"loss": 0.0934,
"step": 121
},
{
"epoch": 0.1697096157190054,
"grad_norm": 0.2040675032776725,
"learning_rate": 0.00017167825131684513,
"loss": 0.088,
"step": 122
},
{
"epoch": 0.17110067814293167,
"grad_norm": 0.33490402178677914,
"learning_rate": 0.0001710347032880664,
"loss": 0.0896,
"step": 123
},
{
"epoch": 0.17249174056685793,
"grad_norm": 0.27733157306473977,
"learning_rate": 0.00017038516128259115,
"loss": 0.0868,
"step": 124
},
{
"epoch": 0.1738828029907842,
"grad_norm": 0.22580639535744193,
"learning_rate": 0.00016972968010939954,
"loss": 0.0918,
"step": 125
},
{
"epoch": 0.1752738654147105,
"grad_norm": 0.4041049399719925,
"learning_rate": 0.00016906831507862443,
"loss": 0.0935,
"step": 126
},
{
"epoch": 0.17666492783863677,
"grad_norm": 0.2633243288406605,
"learning_rate": 0.00016840112199688432,
"loss": 0.0899,
"step": 127
},
{
"epoch": 0.17805599026256302,
"grad_norm": 0.18845032947894733,
"learning_rate": 0.00016772815716257412,
"loss": 0.0757,
"step": 128
},
{
"epoch": 0.1794470526864893,
"grad_norm": 0.2916581222604507,
"learning_rate": 0.00016704947736111492,
"loss": 0.0934,
"step": 129
},
{
"epoch": 0.1808381151104156,
"grad_norm": 0.3843496027129121,
"learning_rate": 0.00016636513986016213,
"loss": 0.0976,
"step": 130
},
{
"epoch": 0.18222917753434184,
"grad_norm": 0.2471262926169607,
"learning_rate": 0.00016567520240477344,
"loss": 0.0884,
"step": 131
},
{
"epoch": 0.18362023995826812,
"grad_norm": 0.24727367186979754,
"learning_rate": 0.000164979723212536,
"loss": 0.0874,
"step": 132
},
{
"epoch": 0.1850113023821944,
"grad_norm": 0.33843248537614945,
"learning_rate": 0.00016427876096865394,
"loss": 0.0848,
"step": 133
},
{
"epoch": 0.18640236480612069,
"grad_norm": 0.31104140922553697,
"learning_rate": 0.00016357237482099684,
"loss": 0.0873,
"step": 134
},
{
"epoch": 0.18779342723004694,
"grad_norm": 0.23469264072995824,
"learning_rate": 0.0001628606243751082,
"loss": 0.0883,
"step": 135
},
{
"epoch": 0.18918448965397322,
"grad_norm": 0.24173966890136098,
"learning_rate": 0.00016214356968917648,
"loss": 0.0779,
"step": 136
},
{
"epoch": 0.1905755520778995,
"grad_norm": 0.32612892204831334,
"learning_rate": 0.0001614212712689668,
"loss": 0.0883,
"step": 137
},
{
"epoch": 0.19196661450182578,
"grad_norm": 0.32980505598105636,
"learning_rate": 0.00016069379006271566,
"loss": 0.089,
"step": 138
},
{
"epoch": 0.19335767692575204,
"grad_norm": 0.23330500427300094,
"learning_rate": 0.00015996118745598817,
"loss": 0.0894,
"step": 139
},
{
"epoch": 0.19474873934967832,
"grad_norm": 0.2491189963902772,
"learning_rate": 0.00015922352526649803,
"loss": 0.0844,
"step": 140
},
{
"epoch": 0.1961398017736046,
"grad_norm": 0.25349298825030675,
"learning_rate": 0.00015848086573889137,
"loss": 0.0828,
"step": 141
},
{
"epoch": 0.19753086419753085,
"grad_norm": 0.2622420285657173,
"learning_rate": 0.00015773327153949465,
"loss": 0.0938,
"step": 142
},
{
"epoch": 0.19892192662145713,
"grad_norm": 0.2451281816103279,
"learning_rate": 0.00015698080575102661,
"loss": 0.0932,
"step": 143
},
{
"epoch": 0.20031298904538342,
"grad_norm": 0.2792069573192707,
"learning_rate": 0.00015622353186727544,
"loss": 0.0911,
"step": 144
},
{
"epoch": 0.2017040514693097,
"grad_norm": 0.3789846509130343,
"learning_rate": 0.00015546151378774086,
"loss": 0.0909,
"step": 145
},
{
"epoch": 0.20309511389323595,
"grad_norm": 0.28195731508101923,
"learning_rate": 0.00015469481581224272,
"loss": 0.0943,
"step": 146
},
{
"epoch": 0.20448617631716223,
"grad_norm": 0.206437401775988,
"learning_rate": 0.0001539235026354946,
"loss": 0.0791,
"step": 147
},
{
"epoch": 0.20587723874108851,
"grad_norm": 0.2633605036911312,
"learning_rate": 0.0001531476393416456,
"loss": 0.0905,
"step": 148
},
{
"epoch": 0.20726830116501477,
"grad_norm": 0.19077873120368308,
"learning_rate": 0.00015236729139878782,
"loss": 0.0802,
"step": 149
},
{
"epoch": 0.20865936358894105,
"grad_norm": 0.18796420991081844,
"learning_rate": 0.00015158252465343242,
"loss": 0.084,
"step": 150
},
{
"epoch": 0.21005042601286733,
"grad_norm": 0.20416107931180086,
"learning_rate": 0.00015079340532495343,
"loss": 0.0875,
"step": 151
},
{
"epoch": 0.2114414884367936,
"grad_norm": 0.2193281922198467,
"learning_rate": 0.00015000000000000001,
"loss": 0.0744,
"step": 152
},
{
"epoch": 0.21283255086071987,
"grad_norm": 0.23044847983325983,
"learning_rate": 0.00014920237562687785,
"loss": 0.075,
"step": 153
},
{
"epoch": 0.21422361328464615,
"grad_norm": 0.25126501089690145,
"learning_rate": 0.0001484005995098999,
"loss": 0.0808,
"step": 154
},
{
"epoch": 0.21561467570857243,
"grad_norm": 0.2198268163423245,
"learning_rate": 0.00014759473930370736,
"loss": 0.0845,
"step": 155
},
{
"epoch": 0.21700573813249868,
"grad_norm": 0.1914711978442268,
"learning_rate": 0.0001467848630075608,
"loss": 0.0755,
"step": 156
},
{
"epoch": 0.21839680055642496,
"grad_norm": 0.21872090855210674,
"learning_rate": 0.00014597103895960226,
"loss": 0.092,
"step": 157
},
{
"epoch": 0.21978786298035125,
"grad_norm": 0.2683036687709274,
"learning_rate": 0.00014515333583108896,
"loss": 0.0841,
"step": 158
},
{
"epoch": 0.22117892540427753,
"grad_norm": 0.23553724619883026,
"learning_rate": 0.0001443318226205986,
"loss": 0.0746,
"step": 159
},
{
"epoch": 0.22256998782820378,
"grad_norm": 0.2266632081232852,
"learning_rate": 0.00014350656864820733,
"loss": 0.0944,
"step": 160
},
{
"epoch": 0.22396105025213006,
"grad_norm": 0.250403713309342,
"learning_rate": 0.00014267764354964038,
"loss": 0.0885,
"step": 161
},
{
"epoch": 0.22535211267605634,
"grad_norm": 0.28822601640189355,
"learning_rate": 0.00014184511727039612,
"loss": 0.0809,
"step": 162
},
{
"epoch": 0.22674317509998262,
"grad_norm": 0.31961441071369967,
"learning_rate": 0.00014100906005984403,
"loss": 0.0865,
"step": 163
},
{
"epoch": 0.22813423752390888,
"grad_norm": 0.2693185549520274,
"learning_rate": 0.00014016954246529696,
"loss": 0.0908,
"step": 164
},
{
"epoch": 0.22952529994783516,
"grad_norm": 0.26679641451386776,
"learning_rate": 0.0001393266353260583,
"loss": 0.0819,
"step": 165
},
{
"epoch": 0.23091636237176144,
"grad_norm": 0.26218139806332996,
"learning_rate": 0.00013848040976744457,
"loss": 0.083,
"step": 166
},
{
"epoch": 0.2323074247956877,
"grad_norm": 0.23814008030441985,
"learning_rate": 0.00013763093719478358,
"loss": 0.0847,
"step": 167
},
{
"epoch": 0.23369848721961398,
"grad_norm": 0.21432464182252345,
"learning_rate": 0.00013677828928738934,
"loss": 0.0745,
"step": 168
},
{
"epoch": 0.23508954964354026,
"grad_norm": 0.30123225510109747,
"learning_rate": 0.00013592253799251376,
"loss": 0.0856,
"step": 169
},
{
"epoch": 0.23648061206746654,
"grad_norm": 0.19055454583960363,
"learning_rate": 0.00013506375551927547,
"loss": 0.0776,
"step": 170
},
{
"epoch": 0.2378716744913928,
"grad_norm": 0.212116148921165,
"learning_rate": 0.00013420201433256689,
"loss": 0.0886,
"step": 171
},
{
"epoch": 0.23926273691531907,
"grad_norm": 0.19167177362209964,
"learning_rate": 0.00013333738714693956,
"loss": 0.0842,
"step": 172
},
{
"epoch": 0.24065379933924536,
"grad_norm": 0.1955913122364729,
"learning_rate": 0.00013246994692046836,
"loss": 0.0747,
"step": 173
},
{
"epoch": 0.2420448617631716,
"grad_norm": 0.26435669820247026,
"learning_rate": 0.00013159976684859527,
"loss": 0.0738,
"step": 174
},
{
"epoch": 0.2434359241870979,
"grad_norm": 0.25940085895801324,
"learning_rate": 0.00013072692035795305,
"loss": 0.0953,
"step": 175
},
{
"epoch": 0.24482698661102417,
"grad_norm": 0.2123950826677995,
"learning_rate": 0.00012985148110016947,
"loss": 0.0875,
"step": 176
},
{
"epoch": 0.24621804903495045,
"grad_norm": 0.23814242981401218,
"learning_rate": 0.0001289735229456525,
"loss": 0.0952,
"step": 177
},
{
"epoch": 0.2476091114588767,
"grad_norm": 0.2591655333691455,
"learning_rate": 0.00012809311997735696,
"loss": 0.0929,
"step": 178
},
{
"epoch": 0.249000173882803,
"grad_norm": 0.2583368537549298,
"learning_rate": 0.00012721034648453353,
"loss": 0.0867,
"step": 179
},
{
"epoch": 0.25039123630672927,
"grad_norm": 0.21967625985685135,
"learning_rate": 0.00012632527695645993,
"loss": 0.0954,
"step": 180
},
{
"epoch": 0.25178229873065555,
"grad_norm": 0.2893619780495792,
"learning_rate": 0.00012543798607615565,
"loss": 0.0727,
"step": 181
},
{
"epoch": 0.25317336115458183,
"grad_norm": 0.2619468516121317,
"learning_rate": 0.00012454854871407994,
"loss": 0.0918,
"step": 182
},
{
"epoch": 0.25456442357850806,
"grad_norm": 0.2373057358885362,
"learning_rate": 0.00012365703992181425,
"loss": 0.0818,
"step": 183
},
{
"epoch": 0.25595548600243434,
"grad_norm": 0.25279252463420376,
"learning_rate": 0.00012276353492572935,
"loss": 0.0924,
"step": 184
},
{
"epoch": 0.2573465484263606,
"grad_norm": 0.2274489920314379,
"learning_rate": 0.0001218681091206376,
"loss": 0.0813,
"step": 185
},
{
"epoch": 0.2587376108502869,
"grad_norm": 0.20263131443643456,
"learning_rate": 0.00012097083806343103,
"loss": 0.0824,
"step": 186
},
{
"epoch": 0.2601286732742132,
"grad_norm": 0.18889107460185536,
"learning_rate": 0.00012007179746670592,
"loss": 0.0935,
"step": 187
},
{
"epoch": 0.26151973569813947,
"grad_norm": 0.23637605187786753,
"learning_rate": 0.00011917106319237386,
"loss": 0.0825,
"step": 188
},
{
"epoch": 0.26291079812206575,
"grad_norm": 0.18377805798032137,
"learning_rate": 0.00011826871124526071,
"loss": 0.0826,
"step": 189
},
{
"epoch": 0.26430186054599203,
"grad_norm": 0.2308900423110225,
"learning_rate": 0.00011736481776669306,
"loss": 0.0863,
"step": 190
},
{
"epoch": 0.26430186054599203,
"eval_loss": 0.11765718460083008,
"eval_runtime": 135.3729,
"eval_samples_per_second": 1.477,
"eval_steps_per_second": 0.739,
"step": 190
}
],
"logging_steps": 1,
"max_steps": 380,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 95,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.393066669768704e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}