{
  "best_metric": 2.2496395111083984,
  "best_model_checkpoint": "./yue_finetuned/checkpoint-900",
  "epoch": 2.99290780141844,
  "eval_steps": 100,
  "global_step": 951,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03152088258471237,
      "grad_norm": 1.0658416748046875,
      "learning_rate": 6.896551724137931e-05,
      "loss": 6.814,
      "step": 10
    },
    {
      "epoch": 0.06304176516942474,
      "grad_norm": 2.7176384925842285,
      "learning_rate": 0.00013793103448275863,
      "loss": 6.4042,
      "step": 20
    },
    {
      "epoch": 0.09456264775413711,
      "grad_norm": 1.2143285274505615,
      "learning_rate": 0.0001997830802603037,
      "loss": 4.8034,
      "step": 30
    },
    {
      "epoch": 0.12608353033884948,
      "grad_norm": 1.1138572692871094,
      "learning_rate": 0.00019761388286334056,
      "loss": 3.8628,
      "step": 40
    },
    {
      "epoch": 0.15760441292356187,
      "grad_norm": 0.7702319025993347,
      "learning_rate": 0.00019544468546637745,
      "loss": 3.3674,
      "step": 50
    },
    {
      "epoch": 0.18912529550827423,
      "grad_norm": 0.4743233919143677,
      "learning_rate": 0.00019327548806941433,
      "loss": 3.1137,
      "step": 60
    },
    {
      "epoch": 0.22064617809298662,
      "grad_norm": 0.4091566205024719,
      "learning_rate": 0.0001911062906724512,
      "loss": 2.8457,
      "step": 70
    },
    {
      "epoch": 0.25216706067769895,
      "grad_norm": 0.39599528908729553,
      "learning_rate": 0.00018893709327548808,
      "loss": 2.6799,
      "step": 80
    },
    {
      "epoch": 0.28368794326241137,
      "grad_norm": 0.36687061190605164,
      "learning_rate": 0.00018676789587852494,
      "loss": 2.7454,
      "step": 90
    },
    {
      "epoch": 0.31520882584712373,
      "grad_norm": 0.5227386951446533,
      "learning_rate": 0.00018459869848156183,
      "loss": 2.7114,
      "step": 100
    },
    {
      "epoch": 0.31520882584712373,
      "eval_loss": 2.610507011413574,
      "eval_runtime": 1093.6124,
      "eval_samples_per_second": 0.517,
      "eval_steps_per_second": 0.259,
      "step": 100
    },
    {
      "epoch": 0.3467297084318361,
      "grad_norm": 0.41864681243896484,
      "learning_rate": 0.0001824295010845987,
      "loss": 2.6228,
      "step": 110
    },
    {
      "epoch": 0.37825059101654845,
      "grad_norm": 0.4326777756214142,
      "learning_rate": 0.00018026030368763557,
      "loss": 2.4956,
      "step": 120
    },
    {
      "epoch": 0.4097714736012608,
      "grad_norm": 0.41428521275520325,
      "learning_rate": 0.00017809110629067246,
      "loss": 2.5776,
      "step": 130
    },
    {
      "epoch": 0.44129235618597323,
      "grad_norm": 0.41970375180244446,
      "learning_rate": 0.00017592190889370934,
      "loss": 2.5208,
      "step": 140
    },
    {
      "epoch": 0.4728132387706856,
      "grad_norm": 0.45651790499687195,
      "learning_rate": 0.0001737527114967462,
      "loss": 2.4897,
      "step": 150
    },
    {
      "epoch": 0.5043341213553979,
      "grad_norm": 0.44131532311439514,
      "learning_rate": 0.0001715835140997831,
      "loss": 2.5342,
      "step": 160
    },
    {
      "epoch": 0.5358550039401103,
      "grad_norm": 0.5060796737670898,
      "learning_rate": 0.00016941431670281998,
      "loss": 2.5113,
      "step": 170
    },
    {
      "epoch": 0.5673758865248227,
      "grad_norm": 0.4486294686794281,
      "learning_rate": 0.00016724511930585684,
      "loss": 2.5553,
      "step": 180
    },
    {
      "epoch": 0.598896769109535,
      "grad_norm": 0.45511698722839355,
      "learning_rate": 0.00016507592190889372,
      "loss": 2.4714,
      "step": 190
    },
    {
      "epoch": 0.6304176516942475,
      "grad_norm": 0.49814826250076294,
      "learning_rate": 0.00016290672451193058,
      "loss": 2.4529,
      "step": 200
    },
    {
      "epoch": 0.6304176516942475,
      "eval_loss": 2.440239906311035,
      "eval_runtime": 28.5385,
      "eval_samples_per_second": 19.798,
      "eval_steps_per_second": 9.916,
      "step": 200
    },
    {
      "epoch": 0.6619385342789598,
      "grad_norm": 0.5318950414657593,
      "learning_rate": 0.00016073752711496747,
      "loss": 2.5319,
      "step": 210
    },
    {
      "epoch": 0.6934594168636722,
      "grad_norm": 0.6193626523017883,
      "learning_rate": 0.00015856832971800435,
      "loss": 2.4968,
      "step": 220
    },
    {
      "epoch": 0.7249802994483846,
      "grad_norm": 0.5659565329551697,
      "learning_rate": 0.00015639913232104121,
      "loss": 2.4063,
      "step": 230
    },
    {
      "epoch": 0.7565011820330969,
      "grad_norm": 0.4969359338283539,
      "learning_rate": 0.0001542299349240781,
      "loss": 2.4599,
      "step": 240
    },
    {
      "epoch": 0.7880220646178093,
      "grad_norm": 0.5683227777481079,
      "learning_rate": 0.000152060737527115,
      "loss": 2.4214,
      "step": 250
    },
    {
      "epoch": 0.8195429472025216,
      "grad_norm": 0.5504446029663086,
      "learning_rate": 0.00014989154013015185,
      "loss": 2.3597,
      "step": 260
    },
    {
      "epoch": 0.851063829787234,
      "grad_norm": 0.6029040217399597,
      "learning_rate": 0.00014772234273318873,
      "loss": 2.4472,
      "step": 270
    },
    {
      "epoch": 0.8825847123719465,
      "grad_norm": 0.5572171807289124,
      "learning_rate": 0.0001455531453362256,
      "loss": 2.3552,
      "step": 280
    },
    {
      "epoch": 0.9141055949566588,
      "grad_norm": 0.6272566914558411,
      "learning_rate": 0.00014338394793926248,
      "loss": 2.4717,
      "step": 290
    },
    {
      "epoch": 0.9456264775413712,
      "grad_norm": 0.6014458537101746,
      "learning_rate": 0.00014121475054229936,
      "loss": 2.4543,
      "step": 300
    },
    {
      "epoch": 0.9456264775413712,
      "eval_loss": 2.3703479766845703,
      "eval_runtime": 28.5951,
      "eval_samples_per_second": 19.759,
      "eval_steps_per_second": 9.897,
      "step": 300
    },
    {
      "epoch": 0.9771473601260835,
      "grad_norm": 0.6145516037940979,
      "learning_rate": 0.00013904555314533622,
      "loss": 2.4109,
      "step": 310
    },
    {
      "epoch": 1.0063041765169425,
      "grad_norm": 0.6649049520492554,
      "learning_rate": 0.0001368763557483731,
      "loss": 2.4834,
      "step": 320
    },
    {
      "epoch": 1.037825059101655,
      "grad_norm": 0.6200373768806458,
      "learning_rate": 0.00013470715835141,
      "loss": 2.3862,
      "step": 330
    },
    {
      "epoch": 1.0693459416863673,
      "grad_norm": 0.5306605100631714,
      "learning_rate": 0.00013253796095444686,
      "loss": 2.3856,
      "step": 340
    },
    {
      "epoch": 1.1008668242710795,
      "grad_norm": 0.5235586762428284,
      "learning_rate": 0.00013036876355748374,
      "loss": 2.4574,
      "step": 350
    },
    {
      "epoch": 1.132387706855792,
      "grad_norm": 0.5824336409568787,
      "learning_rate": 0.00012819956616052063,
      "loss": 2.4134,
      "step": 360
    },
    {
      "epoch": 1.1639085894405043,
      "grad_norm": 0.5699276328086853,
      "learning_rate": 0.0001260303687635575,
      "loss": 2.323,
      "step": 370
    },
    {
      "epoch": 1.1954294720252168,
      "grad_norm": 0.6251928806304932,
      "learning_rate": 0.00012386117136659438,
      "loss": 2.4463,
      "step": 380
    },
    {
      "epoch": 1.226950354609929,
      "grad_norm": 0.6418842673301697,
      "learning_rate": 0.00012169197396963123,
      "loss": 2.406,
      "step": 390
    },
    {
      "epoch": 1.2584712371946414,
      "grad_norm": 0.5105799436569214,
      "learning_rate": 0.00011952277657266812,
      "loss": 2.351,
      "step": 400
    },
    {
      "epoch": 1.2584712371946414,
      "eval_loss": 2.3272600173950195,
      "eval_runtime": 28.699,
      "eval_samples_per_second": 19.687,
      "eval_steps_per_second": 9.861,
      "step": 400
    },
    {
      "epoch": 1.2899921197793538,
      "grad_norm": 0.6160938143730164,
      "learning_rate": 0.000117353579175705,
      "loss": 2.3101,
      "step": 410
    },
    {
      "epoch": 1.3215130023640662,
      "grad_norm": 0.6558836698532104,
      "learning_rate": 0.00011518438177874187,
      "loss": 2.3196,
      "step": 420
    },
    {
      "epoch": 1.3530338849487786,
      "grad_norm": 0.6231864094734192,
      "learning_rate": 0.00011301518438177874,
      "loss": 2.3871,
      "step": 430
    },
    {
      "epoch": 1.384554767533491,
      "grad_norm": 0.5435105562210083,
      "learning_rate": 0.00011084598698481563,
      "loss": 2.3389,
      "step": 440
    },
    {
      "epoch": 1.4160756501182032,
      "grad_norm": 0.5544334650039673,
      "learning_rate": 0.0001086767895878525,
      "loss": 2.2573,
      "step": 450
    },
    {
      "epoch": 1.4475965327029157,
      "grad_norm": 0.6390591859817505,
      "learning_rate": 0.00010650759219088937,
      "loss": 2.4043,
      "step": 460
    },
    {
      "epoch": 1.479117415287628,
      "grad_norm": 0.6744779348373413,
      "learning_rate": 0.00010433839479392625,
      "loss": 2.2799,
      "step": 470
    },
    {
      "epoch": 1.5106382978723403,
      "grad_norm": 0.6263852715492249,
      "learning_rate": 0.00010216919739696313,
      "loss": 2.2408,
      "step": 480
    },
    {
      "epoch": 1.5421591804570527,
      "grad_norm": 0.5550678968429565,
      "learning_rate": 0.0001,
      "loss": 2.405,
      "step": 490
    },
    {
      "epoch": 1.573680063041765,
      "grad_norm": 0.6078540086746216,
      "learning_rate": 9.783080260303689e-05,
      "loss": 2.3524,
      "step": 500
    },
    {
      "epoch": 1.573680063041765,
      "eval_loss": 2.3018314838409424,
      "eval_runtime": 28.7339,
      "eval_samples_per_second": 19.663,
      "eval_steps_per_second": 9.849,
      "step": 500
    },
    {
      "epoch": 1.6052009456264775,
      "grad_norm": 0.6397316455841064,
      "learning_rate": 9.566160520607375e-05,
      "loss": 2.3176,
      "step": 510
    },
    {
      "epoch": 1.63672182821119,
      "grad_norm": 0.5407721996307373,
      "learning_rate": 9.349240780911064e-05,
      "loss": 2.2957,
      "step": 520
    },
    {
      "epoch": 1.6682427107959024,
      "grad_norm": 0.5824908018112183,
      "learning_rate": 9.132321041214751e-05,
      "loss": 2.2554,
      "step": 530
    },
    {
      "epoch": 1.6997635933806148,
      "grad_norm": 0.6469699740409851,
      "learning_rate": 8.91540130151844e-05,
      "loss": 2.3222,
      "step": 540
    },
    {
      "epoch": 1.731284475965327,
      "grad_norm": 0.5473052263259888,
      "learning_rate": 8.698481561822126e-05,
      "loss": 2.339,
      "step": 550
    },
    {
      "epoch": 1.7628053585500394,
      "grad_norm": 0.6402776837348938,
      "learning_rate": 8.481561822125814e-05,
      "loss": 2.2986,
      "step": 560
    },
    {
      "epoch": 1.7943262411347518,
      "grad_norm": 0.6581558585166931,
      "learning_rate": 8.264642082429502e-05,
      "loss": 2.3298,
      "step": 570
    },
    {
      "epoch": 1.825847123719464,
      "grad_norm": 0.623936653137207,
      "learning_rate": 8.047722342733189e-05,
      "loss": 2.3413,
      "step": 580
    },
    {
      "epoch": 1.8573680063041764,
      "grad_norm": 0.674414873123169,
      "learning_rate": 7.830802603036876e-05,
      "loss": 2.3229,
      "step": 590
    },
    {
      "epoch": 1.8888888888888888,
      "grad_norm": 0.7090038657188416,
      "learning_rate": 7.613882863340565e-05,
      "loss": 2.2195,
      "step": 600
    },
    {
      "epoch": 1.8888888888888888,
      "eval_loss": 2.280967950820923,
      "eval_runtime": 29.0942,
      "eval_samples_per_second": 19.42,
      "eval_steps_per_second": 9.727,
      "step": 600
    },
    {
      "epoch": 1.9204097714736013,
      "grad_norm": 0.6059763431549072,
      "learning_rate": 7.396963123644252e-05,
      "loss": 2.3873,
      "step": 610
    },
    {
      "epoch": 1.9519306540583137,
      "grad_norm": 0.6435537934303284,
      "learning_rate": 7.18004338394794e-05,
      "loss": 2.3815,
      "step": 620
    },
    {
      "epoch": 1.983451536643026,
      "grad_norm": 0.6055386066436768,
      "learning_rate": 6.963123644251627e-05,
      "loss": 2.2811,
      "step": 630
    },
    {
      "epoch": 2.012608353033885,
      "grad_norm": 0.6461377739906311,
      "learning_rate": 6.746203904555315e-05,
      "loss": 2.268,
      "step": 640
    },
    {
      "epoch": 2.0441292356185974,
      "grad_norm": 0.6220865249633789,
      "learning_rate": 6.529284164859003e-05,
      "loss": 2.2531,
      "step": 650
    },
    {
      "epoch": 2.07565011820331,
      "grad_norm": 0.7532601356506348,
      "learning_rate": 6.31236442516269e-05,
      "loss": 2.3595,
      "step": 660
    },
    {
      "epoch": 2.107171000788022,
      "grad_norm": 0.6474109888076782,
      "learning_rate": 6.0954446854663785e-05,
      "loss": 2.2329,
      "step": 670
    },
    {
      "epoch": 2.1386918833727346,
      "grad_norm": 0.6826472878456116,
      "learning_rate": 5.878524945770065e-05,
      "loss": 2.2751,
      "step": 680
    },
    {
      "epoch": 2.1702127659574466,
      "grad_norm": 0.6584506630897522,
      "learning_rate": 5.661605206073753e-05,
      "loss": 2.3674,
      "step": 690
    },
    {
      "epoch": 2.201733648542159,
      "grad_norm": 0.7180963158607483,
      "learning_rate": 5.4446854663774404e-05,
      "loss": 2.2365,
      "step": 700
    },
    {
      "epoch": 2.201733648542159,
      "eval_loss": 2.2675509452819824,
      "eval_runtime": 29.0656,
      "eval_samples_per_second": 19.439,
      "eval_steps_per_second": 9.737,
      "step": 700
    },
    {
      "epoch": 2.2332545311268714,
      "grad_norm": 0.5672284960746765,
      "learning_rate": 5.2277657266811284e-05,
      "loss": 2.2923,
      "step": 710
    },
    {
      "epoch": 2.264775413711584,
      "grad_norm": 0.6805603504180908,
      "learning_rate": 5.010845986984816e-05,
      "loss": 2.3719,
      "step": 720
    },
    {
      "epoch": 2.2962962962962963,
      "grad_norm": 0.5467990040779114,
      "learning_rate": 4.793926247288503e-05,
      "loss": 2.2015,
      "step": 730
    },
    {
      "epoch": 2.3278171788810087,
      "grad_norm": 0.6205419301986694,
      "learning_rate": 4.577006507592191e-05,
      "loss": 2.2902,
      "step": 740
    },
    {
      "epoch": 2.359338061465721,
      "grad_norm": 0.7138726711273193,
      "learning_rate": 4.360086767895879e-05,
      "loss": 2.2435,
      "step": 750
    },
    {
      "epoch": 2.3908589440504335,
      "grad_norm": 0.6206688284873962,
      "learning_rate": 4.143167028199567e-05,
      "loss": 2.2338,
      "step": 760
    },
    {
      "epoch": 2.422379826635146,
      "grad_norm": 0.5491693615913391,
      "learning_rate": 3.926247288503254e-05,
      "loss": 2.3459,
      "step": 770
    },
    {
      "epoch": 2.453900709219858,
      "grad_norm": 0.7104467153549194,
      "learning_rate": 3.7093275488069415e-05,
      "loss": 2.2649,
      "step": 780
    },
    {
      "epoch": 2.4854215918045703,
      "grad_norm": 0.7069652080535889,
      "learning_rate": 3.4924078091106294e-05,
      "loss": 2.2289,
      "step": 790
    },
    {
      "epoch": 2.5169424743892828,
      "grad_norm": 0.7097485661506653,
      "learning_rate": 3.275488069414317e-05,
      "loss": 2.2986,
      "step": 800
    },
    {
      "epoch": 2.5169424743892828,
      "eval_loss": 2.2567555904388428,
      "eval_runtime": 28.9859,
      "eval_samples_per_second": 19.492,
      "eval_steps_per_second": 9.763,
      "step": 800
    },
    {
      "epoch": 2.548463356973995,
      "grad_norm": 0.6493655443191528,
      "learning_rate": 3.058568329718005e-05,
      "loss": 2.3154,
      "step": 810
    },
    {
      "epoch": 2.5799842395587076,
      "grad_norm": 0.5883040428161621,
      "learning_rate": 2.841648590021692e-05,
      "loss": 2.3223,
      "step": 820
    },
    {
      "epoch": 2.61150512214342,
      "grad_norm": 0.716276228427887,
      "learning_rate": 2.6247288503253796e-05,
      "loss": 2.2199,
      "step": 830
    },
    {
      "epoch": 2.6430260047281324,
      "grad_norm": 0.5637353658676147,
      "learning_rate": 2.4078091106290673e-05,
      "loss": 2.4102,
      "step": 840
    },
    {
      "epoch": 2.674546887312845,
      "grad_norm": 0.673851728439331,
      "learning_rate": 2.190889370932755e-05,
      "loss": 2.2731,
      "step": 850
    },
    {
      "epoch": 2.7060677698975573,
      "grad_norm": 0.6776373386383057,
      "learning_rate": 1.9739696312364425e-05,
      "loss": 2.1956,
      "step": 860
    },
    {
      "epoch": 2.7375886524822697,
      "grad_norm": 0.7007333636283875,
      "learning_rate": 1.75704989154013e-05,
      "loss": 2.2303,
      "step": 870
    },
    {
      "epoch": 2.769109535066982,
      "grad_norm": 0.6189983487129211,
      "learning_rate": 1.540130151843818e-05,
      "loss": 2.2987,
      "step": 880
    },
    {
      "epoch": 2.8006304176516945,
      "grad_norm": 0.6148092150688171,
      "learning_rate": 1.3232104121475056e-05,
      "loss": 2.2527,
      "step": 890
    },
    {
      "epoch": 2.8321513002364065,
      "grad_norm": 0.6943378448486328,
      "learning_rate": 1.1062906724511932e-05,
      "loss": 2.3205,
      "step": 900
    },
    {
      "epoch": 2.8321513002364065,
      "eval_loss": 2.2496395111083984,
      "eval_runtime": 29.1023,
      "eval_samples_per_second": 19.414,
      "eval_steps_per_second": 9.724,
      "step": 900
    },
    {
      "epoch": 2.863672182821119,
      "grad_norm": 0.6751441359519958,
      "learning_rate": 8.893709327548807e-06,
      "loss": 2.2436,
      "step": 910
    },
    {
      "epoch": 2.8951930654058313,
      "grad_norm": 0.669208824634552,
      "learning_rate": 6.724511930585684e-06,
      "loss": 2.2762,
      "step": 920
    },
    {
      "epoch": 2.9267139479905437,
      "grad_norm": 0.6294866800308228,
      "learning_rate": 4.55531453362256e-06,
      "loss": 2.2291,
      "step": 930
    },
    {
      "epoch": 2.958234830575256,
      "grad_norm": 0.6172659397125244,
      "learning_rate": 2.386117136659436e-06,
      "loss": 2.2527,
      "step": 940
    },
    {
      "epoch": 2.9897557131599686,
      "grad_norm": 0.6540963649749756,
      "learning_rate": 2.1691973969631237e-07,
      "loss": 2.2856,
      "step": 950
    }
  ],
  "logging_steps": 10,
  "max_steps": 951,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.610585377989755e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}