|
{ |
|
"best_metric": 0.08991385996341705, |
|
"best_model_checkpoint": "./fine_tuned_meta_encoder_20240823_155731/checkpoint-1100", |
|
"epoch": 2.9042904290429044, |
|
"eval_steps": 100, |
|
"global_step": 1100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.026402640264026403, |
|
"grad_norm": 1.2209537029266357, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3132, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.052805280528052806, |
|
"grad_norm": 0.8953336477279663, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5263, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07920792079207921, |
|
"grad_norm": 0.4192950427532196, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2848, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.10561056105610561, |
|
"grad_norm": 0.27445298433303833, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2399, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.132013201320132, |
|
"grad_norm": 0.3270169794559479, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2224, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.15841584158415842, |
|
"grad_norm": 0.27454960346221924, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2188, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1848184818481848, |
|
"grad_norm": 0.3361058235168457, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1904, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.21122112211221122, |
|
"grad_norm": 0.2786653935909271, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1913, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2376237623762376, |
|
"grad_norm": 0.2567278742790222, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1818, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.264026402640264, |
|
"grad_norm": 0.26846471428871155, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1652, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.264026402640264, |
|
"eval_loss": 0.17934565246105194, |
|
"eval_runtime": 35.5662, |
|
"eval_samples_per_second": 18.951, |
|
"eval_steps_per_second": 4.752, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.29042904290429045, |
|
"grad_norm": 0.3001416027545929, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1711, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.31683168316831684, |
|
"grad_norm": 0.27367591857910156, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1649, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3432343234323432, |
|
"grad_norm": 0.2346794754266739, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1721, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3696369636963696, |
|
"grad_norm": 0.2470114529132843, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1599, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.39603960396039606, |
|
"grad_norm": 0.2749585807323456, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1648, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.42244224422442245, |
|
"grad_norm": 0.23965956270694733, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1584, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.44884488448844884, |
|
"grad_norm": 0.2817020118236542, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1567, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4752475247524752, |
|
"grad_norm": 0.3235905170440674, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1494, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5016501650165016, |
|
"grad_norm": 0.2556610405445099, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1494, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.528052805280528, |
|
"grad_norm": 0.26074209809303284, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1496, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.528052805280528, |
|
"eval_loss": 0.15303318202495575, |
|
"eval_runtime": 35.5044, |
|
"eval_samples_per_second": 18.984, |
|
"eval_steps_per_second": 4.76, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5544554455445545, |
|
"grad_norm": 0.24563810229301453, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1523, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5808580858085809, |
|
"grad_norm": 0.2592566907405853, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1453, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6072607260726073, |
|
"grad_norm": 0.3210408389568329, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1414, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6336633663366337, |
|
"grad_norm": 0.2894737720489502, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1479, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6600660066006601, |
|
"grad_norm": 0.307718425989151, |
|
"learning_rate": 0.0002, |
|
"loss": 0.143, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6864686468646864, |
|
"grad_norm": 0.2861904799938202, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1422, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7128712871287128, |
|
"grad_norm": 0.2835065424442291, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1381, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7392739273927392, |
|
"grad_norm": 0.36195576190948486, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1364, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7656765676567657, |
|
"grad_norm": 0.36926591396331787, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1364, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7920792079207921, |
|
"grad_norm": 0.2543686628341675, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1295, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7920792079207921, |
|
"eval_loss": 0.13678725063800812, |
|
"eval_runtime": 35.5612, |
|
"eval_samples_per_second": 18.953, |
|
"eval_steps_per_second": 4.752, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8184818481848185, |
|
"grad_norm": 0.2578878402709961, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1373, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.8448844884488449, |
|
"grad_norm": 0.2869018018245697, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1365, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8712871287128713, |
|
"grad_norm": 0.2525601089000702, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1263, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8976897689768977, |
|
"grad_norm": 0.33318400382995605, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1342, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.9240924092409241, |
|
"grad_norm": 0.26829567551612854, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1217, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9504950495049505, |
|
"grad_norm": 0.29827719926834106, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1251, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.976897689768977, |
|
"grad_norm": 0.2644639313220978, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1172, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.0033003300330032, |
|
"grad_norm": 0.2535085082054138, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1275, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.0297029702970297, |
|
"grad_norm": 0.2713930606842041, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1162, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.056105610561056, |
|
"grad_norm": 0.26092982292175293, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1209, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.056105610561056, |
|
"eval_loss": 0.12458474934101105, |
|
"eval_runtime": 35.5248, |
|
"eval_samples_per_second": 18.973, |
|
"eval_steps_per_second": 4.757, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0825082508250825, |
|
"grad_norm": 0.3101023733615875, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1206, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.108910891089109, |
|
"grad_norm": 0.2629012167453766, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1165, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.1353135313531353, |
|
"grad_norm": 0.2954414188861847, |
|
"learning_rate": 0.0002, |
|
"loss": 0.121, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.1617161716171618, |
|
"grad_norm": 0.3219696879386902, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1155, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.188118811881188, |
|
"grad_norm": 0.2916986346244812, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1109, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.2145214521452146, |
|
"grad_norm": 0.2607131600379944, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1125, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.2409240924092408, |
|
"grad_norm": 0.25215160846710205, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1155, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.2673267326732673, |
|
"grad_norm": 0.2367628961801529, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1119, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.2937293729372938, |
|
"grad_norm": 0.2621775269508362, |
|
"learning_rate": 0.0002, |
|
"loss": 0.106, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.3201320132013201, |
|
"grad_norm": 0.27597054839134216, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1071, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.3201320132013201, |
|
"eval_loss": 0.11538028717041016, |
|
"eval_runtime": 35.6246, |
|
"eval_samples_per_second": 18.92, |
|
"eval_steps_per_second": 4.744, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.3465346534653464, |
|
"grad_norm": 0.3191552758216858, |
|
"learning_rate": 0.0002, |
|
"loss": 0.11, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.372937293729373, |
|
"grad_norm": 0.32491299510002136, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1072, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.3993399339933994, |
|
"grad_norm": 0.3281764090061188, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1141, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.4257425742574257, |
|
"grad_norm": 0.24705563485622406, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1026, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.4521452145214522, |
|
"grad_norm": 0.24219994246959686, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1058, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.4785478547854787, |
|
"grad_norm": 0.2313147485256195, |
|
"learning_rate": 0.0002, |
|
"loss": 0.104, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.504950495049505, |
|
"grad_norm": 0.34662193059921265, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1007, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.5313531353135312, |
|
"grad_norm": 0.2655913829803467, |
|
"learning_rate": 0.0002, |
|
"loss": 0.102, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.5577557755775577, |
|
"grad_norm": 0.2477431297302246, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1073, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.5841584158415842, |
|
"grad_norm": 0.2847592532634735, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1049, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.5841584158415842, |
|
"eval_loss": 0.10829836875200272, |
|
"eval_runtime": 35.5648, |
|
"eval_samples_per_second": 18.951, |
|
"eval_steps_per_second": 4.752, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.6105610561056105, |
|
"grad_norm": 0.24044634401798248, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1003, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.636963696369637, |
|
"grad_norm": 0.2383195161819458, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1023, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.6633663366336635, |
|
"grad_norm": 0.2620280981063843, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0978, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.6897689768976898, |
|
"grad_norm": 0.27225416898727417, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1055, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.716171617161716, |
|
"grad_norm": 0.2959238588809967, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1015, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.7425742574257426, |
|
"grad_norm": 0.2089966982603073, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1024, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.768976897689769, |
|
"grad_norm": 0.23935158550739288, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1002, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.7953795379537953, |
|
"grad_norm": 0.2478724718093872, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1016, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.8217821782178216, |
|
"grad_norm": 0.25742414593696594, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1016, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.8481848184818483, |
|
"grad_norm": 0.2547028064727783, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0913, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.8481848184818483, |
|
"eval_loss": 0.10148681700229645, |
|
"eval_runtime": 35.5107, |
|
"eval_samples_per_second": 18.98, |
|
"eval_steps_per_second": 4.759, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.8745874587458746, |
|
"grad_norm": 0.2965940237045288, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0989, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.900990099009901, |
|
"grad_norm": 0.22955147922039032, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0936, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.9273927392739274, |
|
"grad_norm": 0.2257162481546402, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0893, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.953795379537954, |
|
"grad_norm": 0.274249404668808, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0983, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.9801980198019802, |
|
"grad_norm": 0.25331228971481323, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0956, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.0066006600660065, |
|
"grad_norm": 0.2801855802536011, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0948, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.033003300330033, |
|
"grad_norm": 0.24239327013492584, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0874, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.0594059405940595, |
|
"grad_norm": 0.292223185300827, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0874, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.0858085808580857, |
|
"grad_norm": 0.2646115720272064, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0857, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.112211221122112, |
|
"grad_norm": 0.22307829558849335, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0867, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.112211221122112, |
|
"eval_loss": 0.09895669668912888, |
|
"eval_runtime": 35.4576, |
|
"eval_samples_per_second": 19.009, |
|
"eval_steps_per_second": 4.766, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.1386138613861387, |
|
"grad_norm": 0.2283334881067276, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0861, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.165016501650165, |
|
"grad_norm": 0.24495747685432434, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0865, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.1914191419141913, |
|
"grad_norm": 0.2823265790939331, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0851, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.217821782178218, |
|
"grad_norm": 0.29882529377937317, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0883, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.2442244224422443, |
|
"grad_norm": 0.2550313174724579, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0872, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.2706270627062706, |
|
"grad_norm": 0.24576956033706665, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0871, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.297029702970297, |
|
"grad_norm": 0.2293216437101364, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0852, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.3234323432343236, |
|
"grad_norm": 0.2510163486003876, |
|
"learning_rate": 0.0002, |
|
"loss": 0.087, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.34983498349835, |
|
"grad_norm": 0.22960515320301056, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0829, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.376237623762376, |
|
"grad_norm": 0.26182201504707336, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0788, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.376237623762376, |
|
"eval_loss": 0.09652868658304214, |
|
"eval_runtime": 35.4522, |
|
"eval_samples_per_second": 19.012, |
|
"eval_steps_per_second": 4.767, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.4026402640264024, |
|
"grad_norm": 0.22264224290847778, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0867, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.429042904290429, |
|
"grad_norm": 0.24472057819366455, |
|
"learning_rate": 0.0002, |
|
"loss": 0.085, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.4554455445544554, |
|
"grad_norm": 0.21409814059734344, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0902, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.4818481848184817, |
|
"grad_norm": 0.2275090515613556, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0833, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.5082508250825084, |
|
"grad_norm": 0.24984212219715118, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0875, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.5346534653465347, |
|
"grad_norm": 0.2913050949573517, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0821, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.561056105610561, |
|
"grad_norm": 0.30677589774131775, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0835, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.5874587458745877, |
|
"grad_norm": 0.2401341199874878, |
|
"learning_rate": 0.0002, |
|
"loss": 0.088, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.613861386138614, |
|
"grad_norm": 0.25704678893089294, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0838, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.6402640264026402, |
|
"grad_norm": 0.20528723299503326, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0839, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.6402640264026402, |
|
"eval_loss": 0.09263003617525101, |
|
"eval_runtime": 35.5469, |
|
"eval_samples_per_second": 18.961, |
|
"eval_steps_per_second": 4.754, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 0.2680225968360901, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0877, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.693069306930693, |
|
"grad_norm": 0.24241389334201813, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0793, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.7194719471947195, |
|
"grad_norm": 0.2348719835281372, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0857, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.745874587458746, |
|
"grad_norm": 0.25451603531837463, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0877, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.772277227722772, |
|
"grad_norm": 0.28436726331710815, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0829, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.798679867986799, |
|
"grad_norm": 0.269794225692749, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0826, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.825082508250825, |
|
"grad_norm": 0.22040368616580963, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0814, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.8514851485148514, |
|
"grad_norm": 0.2117665708065033, |
|
"learning_rate": 0.0002, |
|
"loss": 0.084, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.877887788778878, |
|
"grad_norm": 0.21133095026016235, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0838, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.9042904290429044, |
|
"grad_norm": 0.25484949350357056, |
|
"learning_rate": 0.0002, |
|
"loss": 0.082, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.9042904290429044, |
|
"eval_loss": 0.08991385996341705, |
|
"eval_runtime": 35.6202, |
|
"eval_samples_per_second": 18.922, |
|
"eval_steps_per_second": 4.744, |
|
"step": 1100 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1134, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.001 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.017625090355036e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|