{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.832340815391663,
  "eval_steps": 500,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01832340815391663,
      "grad_norm": 0.06011037901043892,
      "learning_rate": 4e-05,
      "loss": 1.296,
      "step": 10
    },
    {
      "epoch": 0.03664681630783326,
      "grad_norm": 0.05856110155582428,
      "learning_rate": 8e-05,
      "loss": 1.3316,
      "step": 20
    },
    {
      "epoch": 0.054970224461749886,
      "grad_norm": 0.0607464499771595,
      "learning_rate": 0.00012,
      "loss": 1.2794,
      "step": 30
    },
    {
      "epoch": 0.07329363261566652,
      "grad_norm": 0.06632011383771896,
      "learning_rate": 0.00016,
      "loss": 1.3129,
      "step": 40
    },
    {
      "epoch": 0.09161704076958314,
      "grad_norm": 0.06631691753864288,
      "learning_rate": 0.0002,
      "loss": 1.2741,
      "step": 50
    },
    {
      "epoch": 0.10994044892349977,
      "grad_norm": 0.056466877460479736,
      "learning_rate": 0.00019998035748930052,
      "loss": 1.2717,
      "step": 60
    },
    {
      "epoch": 0.1282638570774164,
      "grad_norm": 0.05860245227813721,
      "learning_rate": 0.00019992143767376668,
      "loss": 1.2091,
      "step": 70
    },
    {
      "epoch": 0.14658726523133303,
      "grad_norm": 0.06553175300359726,
      "learning_rate": 0.00019982326370006058,
      "loss": 1.1926,
      "step": 80
    },
    {
      "epoch": 0.16491067338524965,
      "grad_norm": 0.07061401754617691,
      "learning_rate": 0.00019968587413584876,
      "loss": 1.1767,
      "step": 90
    },
    {
      "epoch": 0.1832340815391663,
      "grad_norm": 0.07183243334293365,
      "learning_rate": 0.000199509322954651,
      "loss": 1.1183,
      "step": 100
    },
    {
      "epoch": 0.2015574896930829,
      "grad_norm": 0.06944898515939713,
      "learning_rate": 0.00019929367951463655,
      "loss": 1.0868,
      "step": 110
    },
    {
      "epoch": 0.21988089784699955,
      "grad_norm": 0.06642703711986542,
      "learning_rate": 0.00019903902853137703,
      "loss": 1.048,
      "step": 120
    },
    {
      "epoch": 0.23820430600091616,
      "grad_norm": 0.06603793054819107,
      "learning_rate": 0.00019874547004456562,
      "loss": 1.0195,
      "step": 130
    },
    {
      "epoch": 0.2565277141548328,
      "grad_norm": 0.06488285213708878,
      "learning_rate": 0.00019841311937871675,
      "loss": 1.0014,
      "step": 140
    },
    {
      "epoch": 0.2748511223087494,
      "grad_norm": 0.05940372124314308,
      "learning_rate": 0.0001980421070978606,
      "loss": 0.9943,
      "step": 150
    },
    {
      "epoch": 0.29317453046266606,
      "grad_norm": 0.059967171400785446,
      "learning_rate": 0.00019763257895425113,
      "loss": 0.9349,
      "step": 160
    },
    {
      "epoch": 0.3114979386165827,
      "grad_norm": 0.0554397851228714,
      "learning_rate": 0.0001971846958311071,
      "loss": 0.9045,
      "step": 170
    },
    {
      "epoch": 0.3298213467704993,
      "grad_norm": 0.055131904780864716,
      "learning_rate": 0.00019669863367940935,
      "loss": 0.8799,
      "step": 180
    },
    {
      "epoch": 0.34814475492441593,
      "grad_norm": 0.04358826205134392,
      "learning_rate": 0.00019617458344877816,
      "loss": 0.8504,
      "step": 190
    },
    {
      "epoch": 0.3664681630783326,
      "grad_norm": 0.04535752162337303,
      "learning_rate": 0.00019561275101245883,
      "loss": 0.828,
      "step": 200
    },
    {
      "epoch": 0.3847915712322492,
      "grad_norm": 0.04672062397003174,
      "learning_rate": 0.00019501335708644414,
      "loss": 0.8114,
      "step": 210
    },
    {
      "epoch": 0.4031149793861658,
      "grad_norm": 0.04161343351006508,
      "learning_rate": 0.00019437663714276618,
      "loss": 0.846,
      "step": 220
    },
    {
      "epoch": 0.42143838754008245,
      "grad_norm": 0.03887801244854927,
      "learning_rate": 0.0001937028413169911,
      "loss": 0.7911,
      "step": 230
    },
    {
      "epoch": 0.4397617956939991,
      "grad_norm": 0.03659196197986603,
      "learning_rate": 0.00019299223430995323,
      "loss": 0.7669,
      "step": 240
    },
    {
      "epoch": 0.45808520384791573,
      "grad_norm": 0.03447382524609566,
      "learning_rate": 0.00019224509528376738,
      "loss": 0.782,
      "step": 250
    },
    {
      "epoch": 0.4764086120018323,
      "grad_norm": 0.028725607320666313,
      "learning_rate": 0.00019146171775215982,
      "loss": 0.7183,
      "step": 260
    },
    {
      "epoch": 0.49473202015574896,
      "grad_norm": 0.027673941105604172,
      "learning_rate": 0.0001906424094651615,
      "loss": 0.7018,
      "step": 270
    },
    {
      "epoch": 0.5130554283096656,
      "grad_norm": 0.10227353870868683,
      "learning_rate": 0.00018978749228820826,
      "loss": 0.72,
      "step": 280
    },
    {
      "epoch": 0.5313788364635822,
      "grad_norm": 0.022650673985481262,
      "learning_rate": 0.00018889730207569607,
      "loss": 0.6936,
      "step": 290
    },
    {
      "epoch": 0.5497022446174988,
      "grad_norm": 0.023469725623726845,
      "learning_rate": 0.00018797218853904037,
      "loss": 0.6765,
      "step": 300
    },
    {
      "epoch": 0.5680256527714155,
      "grad_norm": 0.018101360648870468,
      "learning_rate": 0.000187012515109292,
      "loss": 0.6799,
      "step": 310
    },
    {
      "epoch": 0.5863490609253321,
      "grad_norm": 0.016794538125395775,
      "learning_rate": 0.00018601865879436317,
      "loss": 0.6732,
      "step": 320
    },
    {
      "epoch": 0.6046724690792488,
      "grad_norm": 0.017263714224100113,
      "learning_rate": 0.00018499101003091993,
      "loss": 0.6695,
      "step": 330
    },
    {
      "epoch": 0.6229958772331654,
      "grad_norm": 0.016381224617362022,
      "learning_rate": 0.0001839299725309989,
      "loss": 0.6928,
      "step": 340
    },
    {
      "epoch": 0.641319285387082,
      "grad_norm": 0.015325487591326237,
      "learning_rate": 0.00018283596312340891,
      "loss": 0.6622,
      "step": 350
    },
    {
      "epoch": 0.6596426935409986,
      "grad_norm": 0.014056784100830555,
      "learning_rate": 0.0001817094115899799,
      "loss": 0.7612,
      "step": 360
    },
    {
      "epoch": 0.6779661016949152,
      "grad_norm": 0.015031951479613781,
      "learning_rate": 0.00018055076049672283,
      "loss": 0.6596,
      "step": 370
    },
    {
      "epoch": 0.6962895098488319,
      "grad_norm": 0.01640532910823822,
      "learning_rate": 0.00017936046501996762,
      "loss": 0.6837,
      "step": 380
    },
    {
      "epoch": 0.7146129180027485,
      "grad_norm": 0.01830482669174671,
      "learning_rate": 0.000178138992767547,
      "loss": 0.6812,
      "step": 390
    },
    {
      "epoch": 0.7329363261566652,
      "grad_norm": 0.0472831092774868,
      "learning_rate": 0.00017688682359509678,
      "loss": 0.674,
      "step": 400
    },
    {
      "epoch": 0.7512597343105818,
      "grad_norm": 0.012456170283257961,
      "learning_rate": 0.00017560444941754427,
      "loss": 0.6518,
      "step": 410
    },
    {
      "epoch": 0.7695831424644984,
      "grad_norm": 0.01401186641305685,
      "learning_rate": 0.0001742923740158595,
      "loss": 0.6418,
      "step": 420
    },
    {
      "epoch": 0.7879065506184151,
      "grad_norm": 0.015530922450125217,
      "learning_rate": 0.00017295111283914487,
      "loss": 0.6465,
      "step": 430
    },
    {
      "epoch": 0.8062299587723316,
      "grad_norm": 0.01402275450527668,
      "learning_rate": 0.0001715811928021406,
      "loss": 0.6642,
      "step": 440
    },
    {
      "epoch": 0.8245533669262483,
      "grad_norm": 0.01176263578236103,
      "learning_rate": 0.0001701831520782264,
      "loss": 0.6336,
      "step": 450
    },
    {
      "epoch": 0.8428767750801649,
      "grad_norm": 0.013003438711166382,
      "learning_rate": 0.00016875753988799982,
      "loss": 0.6469,
      "step": 460
    },
    {
      "epoch": 0.8612001832340815,
      "grad_norm": 0.011523702181875706,
      "learning_rate": 0.00016730491628351487,
      "loss": 0.6434,
      "step": 470
    },
    {
      "epoch": 0.8795235913879982,
      "grad_norm": 0.011919384822249413,
      "learning_rate": 0.00016582585192826543,
      "loss": 0.6588,
      "step": 480
    },
    {
      "epoch": 0.8978469995419148,
      "grad_norm": 0.013994649983942509,
      "learning_rate": 0.00016432092787299992,
      "loss": 0.6315,
      "step": 490
    },
    {
      "epoch": 0.9161704076958315,
      "grad_norm": 0.013580686412751675,
      "learning_rate": 0.00016279073532745553,
      "loss": 0.6782,
      "step": 500
    },
    {
      "epoch": 0.934493815849748,
      "grad_norm": 0.01364163402467966,
      "learning_rate": 0.00016123587542810118,
      "loss": 0.6334,
      "step": 510
    },
    {
      "epoch": 0.9528172240036646,
      "grad_norm": 0.013080372475087643,
      "learning_rate": 0.0001596569590019811,
      "loss": 0.6233,
      "step": 520
    },
    {
      "epoch": 0.9711406321575813,
      "grad_norm": 0.056398555636405945,
      "learning_rate": 0.00015805460632675112,
      "loss": 0.6557,
      "step": 530
    },
    {
      "epoch": 0.9894640403114979,
      "grad_norm": 0.012467793188989162,
      "learning_rate": 0.00015642944688700264,
      "loss": 0.6315,
      "step": 540
    },
    {
      "epoch": 1.0077874484654146,
      "grad_norm": 0.012495579198002815,
      "learning_rate": 0.00015478211912696929,
      "loss": 0.6177,
      "step": 550
    },
    {
      "epoch": 1.026110856619331,
      "grad_norm": 0.010633349418640137,
      "learning_rate": 0.00015311327019971413,
      "loss": 0.644,
      "step": 560
    },
    {
      "epoch": 1.0444342647732479,
      "grad_norm": 0.012223353609442711,
      "learning_rate": 0.00015142355571289533,
      "loss": 0.6502,
      "step": 570
    },
    {
      "epoch": 1.0627576729271644,
      "grad_norm": 0.012305443175137043,
      "learning_rate": 0.00014971363947121065,
      "loss": 0.6185,
      "step": 580
    },
    {
      "epoch": 1.0810810810810811,
      "grad_norm": 0.017129750922322273,
      "learning_rate": 0.0001479841932156215,
      "loss": 0.6154,
      "step": 590
    },
    {
      "epoch": 1.0994044892349977,
      "grad_norm": 0.013129614293575287,
      "learning_rate": 0.0001462358963594595,
      "loss": 0.614,
      "step": 600
    },
    {
      "epoch": 1.1177278973889144,
      "grad_norm": 0.01199612207710743,
      "learning_rate": 0.00014446943572151867,
      "loss": 0.6128,
      "step": 610
    },
    {
      "epoch": 1.136051305542831,
      "grad_norm": 0.012518757954239845,
      "learning_rate": 0.00014268550525623868,
      "loss": 0.6169,
      "step": 620
    },
    {
      "epoch": 1.1543747136967477,
      "grad_norm": 0.01321893185377121,
      "learning_rate": 0.00014088480578108454,
      "loss": 0.6402,
      "step": 630
    },
    {
      "epoch": 1.1726981218506642,
      "grad_norm": 0.012497123330831528,
      "learning_rate": 0.00013906804470123038,
      "loss": 0.613,
      "step": 640
    },
    {
      "epoch": 1.1910215300045808,
      "grad_norm": 0.01103185210376978,
      "learning_rate": 0.00013723593573165523,
      "loss": 0.6114,
      "step": 650
    },
    {
      "epoch": 1.2093449381584975,
      "grad_norm": 0.012833209708333015,
      "learning_rate": 0.00013538919861675979,
      "loss": 0.617,
      "step": 660
    },
    {
      "epoch": 1.227668346312414,
      "grad_norm": 0.058991171419620514,
      "learning_rate": 0.0001335285588476148,
      "loss": 0.6298,
      "step": 670
    },
    {
      "epoch": 1.2459917544663308,
      "grad_norm": 0.013424506410956383,
      "learning_rate": 0.00013165474737695184,
      "loss": 0.6488,
      "step": 680
    },
    {
      "epoch": 1.2643151626202473,
      "grad_norm": 0.01241598092019558,
      "learning_rate": 0.00012976850033200805,
      "loss": 0.6088,
      "step": 690
    },
    {
      "epoch": 1.2826385707741639,
      "grad_norm": 0.012560844421386719,
      "learning_rate": 0.00012787055872533865,
      "loss": 0.6032,
      "step": 700
    },
    {
      "epoch": 1.3009619789280806,
      "grad_norm": 0.010990115813910961,
      "learning_rate": 0.00012596166816371005,
      "loss": 0.6282,
      "step": 710
    },
    {
      "epoch": 1.3192853870819974,
      "grad_norm": 0.01151216309517622,
      "learning_rate": 0.00012404257855518782,
      "loss": 0.6297,
      "step": 720
    },
    {
      "epoch": 1.337608795235914,
      "grad_norm": 0.011924243532121181,
      "learning_rate": 0.0001221140438145353,
      "loss": 0.6044,
      "step": 730
    },
    {
      "epoch": 1.3559322033898304,
      "grad_norm": 0.013133584521710873,
      "learning_rate": 0.00012017682156703807,
      "loss": 0.6107,
      "step": 740
    },
    {
      "epoch": 1.3742556115437472,
      "grad_norm": 0.014631664380431175,
      "learning_rate": 0.00011823167285087063,
      "loss": 0.6213,
      "step": 750
    },
    {
      "epoch": 1.3925790196976637,
      "grad_norm": 0.011716130189597607,
      "learning_rate": 0.00011627936181812234,
      "loss": 0.6179,
      "step": 760
    },
    {
      "epoch": 1.4109024278515805,
      "grad_norm": 0.013568080961704254,
      "learning_rate": 0.00011432065543460015,
      "loss": 0.5965,
      "step": 770
    },
    {
      "epoch": 1.429225836005497,
      "grad_norm": 0.012749516405165195,
      "learning_rate": 0.00011235632317852605,
      "loss": 0.6128,
      "step": 780
    },
    {
      "epoch": 1.4475492441594136,
      "grad_norm": 0.011930575594305992,
      "learning_rate": 0.00011038713673824715,
      "loss": 0.6117,
      "step": 790
    },
    {
      "epoch": 1.4658726523133303,
      "grad_norm": 0.013386845588684082,
      "learning_rate": 0.00010841386970907785,
      "loss": 0.6186,
      "step": 800
    },
    {
      "epoch": 1.4841960604672468,
      "grad_norm": 0.012542261742055416,
      "learning_rate": 0.00010643729728939292,
      "loss": 0.5909,
      "step": 810
    },
    {
      "epoch": 1.5025194686211636,
      "grad_norm": 0.010874781757593155,
      "learning_rate": 0.0001044581959760903,
      "loss": 0.5903,
      "step": 820
    },
    {
      "epoch": 1.5208428767750801,
      "grad_norm": 0.010801080614328384,
      "learning_rate": 0.00010247734325954447,
      "loss": 0.5929,
      "step": 830
    },
    {
      "epoch": 1.5391662849289967,
      "grad_norm": 0.012022151611745358,
      "learning_rate": 0.00010049551731816902,
      "loss": 0.6117,
      "step": 840
    },
    {
      "epoch": 1.5574896930829134,
      "grad_norm": 0.011683526448905468,
      "learning_rate": 9.851349671270909e-05,
      "loss": 0.6283,
      "step": 850
    },
    {
      "epoch": 1.5758131012368302,
      "grad_norm": 0.01242094673216343,
      "learning_rate": 9.653206008038364e-05,
      "loss": 0.5901,
      "step": 860
    },
    {
      "epoch": 1.5941365093907467,
      "grad_norm": 0.011935061775147915,
      "learning_rate": 9.455198582899774e-05,
      "loss": 0.5848,
      "step": 870
    },
    {
      "epoch": 1.6124599175446632,
      "grad_norm": 0.01208607666194439,
      "learning_rate": 9.257405183114473e-05,
      "loss": 0.5912,
      "step": 880
    },
    {
      "epoch": 1.63078332569858,
      "grad_norm": 0.01227467879652977,
      "learning_rate": 9.059903511861891e-05,
      "loss": 0.5859,
      "step": 890
    },
    {
      "epoch": 1.6491067338524965,
      "grad_norm": 0.0517101027071476,
      "learning_rate": 8.862771157715847e-05,
      "loss": 0.607,
      "step": 900
    },
    {
      "epoch": 1.6674301420064133,
      "grad_norm": 0.010776874609291553,
      "learning_rate": 8.666085564163852e-05,
      "loss": 0.5788,
      "step": 910
    },
    {
      "epoch": 1.6857535501603298,
      "grad_norm": 0.013487796299159527,
      "learning_rate": 8.469923999183411e-05,
      "loss": 0.5766,
      "step": 920
    },
    {
      "epoch": 1.7040769583142463,
      "grad_norm": 0.011671481654047966,
      "learning_rate": 8.274363524887315e-05,
      "loss": 0.5976,
      "step": 930
    },
    {
      "epoch": 1.722400366468163,
      "grad_norm": 0.01118433102965355,
      "learning_rate": 8.079480967249737e-05,
      "loss": 0.6021,
      "step": 940
    },
    {
      "epoch": 1.7407237746220798,
      "grad_norm": 0.013788875192403793,
      "learning_rate": 7.88535288592514e-05,
      "loss": 0.579,
      "step": 950
    },
    {
      "epoch": 1.7590471827759964,
      "grad_norm": 0.013310333713889122,
      "learning_rate": 7.692055544171823e-05,
      "loss": 0.5979,
      "step": 960
    },
    {
      "epoch": 1.777370590929913,
      "grad_norm": 0.053230684250593185,
      "learning_rate": 7.49966487889185e-05,
      "loss": 0.5906,
      "step": 970
    },
    {
      "epoch": 1.7956939990838294,
      "grad_norm": 0.013503102585673332,
      "learning_rate": 7.308256470799256e-05,
      "loss": 0.6061,
      "step": 980
    },
    {
      "epoch": 1.8140174072377462,
      "grad_norm": 0.01030020508915186,
      "learning_rate": 7.117905514728107e-05,
      "loss": 0.5776,
      "step": 990
    },
    {
      "epoch": 1.832340815391663,
      "grad_norm": 0.01204043161123991,
      "learning_rate": 6.928686790092235e-05,
      "loss": 0.584,
      "step": 1000
    }
  ],
  "logging_steps": 10,
  "max_steps": 1635,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 7.3780357496832e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}