{ "best_metric": null, "best_model_checkpoint": null, "epoch": 39.95179987797437, "eval_steps": 100.0, "global_step": 32760, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.6101281269066504, "grad_norm": 109.53021240234375, "learning_rate": 1.188e-06, "loss": 53.9873, "step": 500 }, { "epoch": 1.0, "eval_cer": 1.1283783783783783, "eval_loss": 19.713674545288086, "eval_runtime": 142.3027, "eval_samples_per_second": 48.067, "eval_steps_per_second": 6.008, "eval_wer": 1.0, "step": 820 }, { "epoch": 1.2196461256863942, "grad_norm": 96.87247467041016, "learning_rate": 2.3880000000000003e-06, "loss": 20.2826, "step": 1000 }, { "epoch": 1.8297742525930445, "grad_norm": 75.96553802490234, "learning_rate": 3.588e-06, "loss": 16.2788, "step": 1500 }, { "epoch": 2.0, "eval_cer": 1.1284080132764343, "eval_loss": 12.832098960876465, "eval_runtime": 145.6125, "eval_samples_per_second": 46.974, "eval_steps_per_second": 5.872, "eval_wer": 1.0, "step": 1640 }, { "epoch": 2.4392922513727884, "grad_norm": 39.1208381652832, "learning_rate": 4.788e-06, "loss": 11.6832, "step": 2000 }, { "epoch": 3.0, "eval_cer": 1.1284080132764343, "eval_loss": 5.618701934814453, "eval_runtime": 142.6056, "eval_samples_per_second": 47.964, "eval_steps_per_second": 5.996, "eval_wer": 1.0, "step": 2460 }, { "epoch": 3.048810250152532, "grad_norm": 27.440048217773438, "learning_rate": 5.988e-06, "loss": 7.1726, "step": 2500 }, { "epoch": 3.6589383770591826, "grad_norm": 5.054373264312744, "learning_rate": 7.1880000000000005e-06, "loss": 4.3059, "step": 3000 }, { "epoch": 4.0, "eval_cer": 1.1284080132764343, "eval_loss": 3.3938002586364746, "eval_runtime": 143.6745, "eval_samples_per_second": 47.608, "eval_steps_per_second": 5.951, "eval_wer": 1.0, "step": 3280 }, { "epoch": 4.268456375838926, "grad_norm": 1.1531486511230469, "learning_rate": 8.388e-06, "loss": 3.4228, "step": 3500 }, { "epoch": 4.878584502745577, "grad_norm": 1.089985966682434, "learning_rate": 9.588e-06, "loss": 3.1294, "step": 4000 }, { "epoch": 5.0, "eval_cer": 1.1284080132764343, "eval_loss": 2.8853955268859863, "eval_runtime": 142.5876, "eval_samples_per_second": 47.971, "eval_steps_per_second": 5.996, "eval_wer": 1.0, "step": 4100 }, { "epoch": 5.48810250152532, "grad_norm": 2.860553503036499, "learning_rate": 1.0787999999999999e-05, "loss": 2.6448, "step": 4500 }, { "epoch": 6.0, "eval_cer": 1.1018551446183025, "eval_loss": 1.4072996377944946, "eval_runtime": 143.5621, "eval_samples_per_second": 47.645, "eval_steps_per_second": 5.956, "eval_wer": 1.0, "step": 4920 }, { "epoch": 6.097620500305064, "grad_norm": 4.616542816162109, "learning_rate": 1.1988000000000001e-05, "loss": 1.7348, "step": 5000 }, { "epoch": 6.707748627211714, "grad_norm": 4.588995933532715, "learning_rate": 1.3188e-05, "loss": 1.022, "step": 5500 }, { "epoch": 7.0, "eval_cer": 1.033576339497392, "eval_loss": 0.6322615146636963, "eval_runtime": 146.5646, "eval_samples_per_second": 46.669, "eval_steps_per_second": 5.834, "eval_wer": 1.0, "step": 5740 }, { "epoch": 7.317266625991458, "grad_norm": 3.685751438140869, "learning_rate": 1.4388000000000002e-05, "loss": 0.6784, "step": 6000 }, { "epoch": 7.927394752898109, "grad_norm": 5.702773094177246, "learning_rate": 1.5588e-05, "loss": 0.4449, "step": 6500 }, { "epoch": 8.0, "eval_cer": 1.0403330962541488, "eval_loss": 0.3109678328037262, "eval_runtime": 145.6018, "eval_samples_per_second": 46.977, "eval_steps_per_second": 5.872, "eval_wer": 0.9988304093567252, "step": 6560 }, { "epoch": 8.536912751677852, "grad_norm": 7.984670162200928, "learning_rate": 1.6788e-05, "loss": 0.305, "step": 7000 }, { "epoch": 9.0, "eval_cer": 1.0484826932195352, "eval_loss": 0.23479318618774414, "eval_runtime": 140.3606, "eval_samples_per_second": 48.732, "eval_steps_per_second": 6.091, "eval_wer": 0.9991228070175439, "step": 7380 }, { "epoch": 9.146430750457595, "grad_norm": 2.9451138973236084, "learning_rate": 1.7988e-05, "loss": 0.2291, "step": 7500 }, { "epoch": 9.756558877364247, "grad_norm": 8.89615535736084, "learning_rate": 1.9188e-05, "loss": 0.1895, "step": 8000 }, { "epoch": 10.0, "eval_cer": 1.0229670459933617, "eval_loss": 0.11090246587991714, "eval_runtime": 134.2378, "eval_samples_per_second": 50.954, "eval_steps_per_second": 6.369, "eval_wer": 0.9988304093567252, "step": 8200 }, { "epoch": 10.36607687614399, "grad_norm": 5.290111541748047, "learning_rate": 2.0388e-05, "loss": 0.1634, "step": 8500 }, { "epoch": 10.97620500305064, "grad_norm": 6.373618125915527, "learning_rate": 2.1588e-05, "loss": 0.1437, "step": 9000 }, { "epoch": 11.0, "eval_cer": 1.0221076339497392, "eval_loss": 0.09310559928417206, "eval_runtime": 142.4966, "eval_samples_per_second": 48.001, "eval_steps_per_second": 6.0, "eval_wer": 0.9989766081871345, "step": 9020 }, { "epoch": 11.585723001830385, "grad_norm": 7.480563640594482, "learning_rate": 2.2788000000000003e-05, "loss": 0.1258, "step": 9500 }, { "epoch": 12.0, "eval_cer": 1.0258119962067331, "eval_loss": 0.0827784314751625, "eval_runtime": 143.9999, "eval_samples_per_second": 47.5, "eval_steps_per_second": 5.938, "eval_wer": 0.9988304093567252, "step": 9840 }, { "epoch": 12.195241000610128, "grad_norm": 8.04295825958252, "learning_rate": 2.3988e-05, "loss": 0.1275, "step": 10000 }, { "epoch": 12.805369127516778, "grad_norm": 1.162207007408142, "learning_rate": 2.5188e-05, "loss": 0.1175, "step": 10500 }, { "epoch": 13.0, "eval_cer": 1.0232337600758654, "eval_loss": 0.08141058683395386, "eval_runtime": 145.6362, "eval_samples_per_second": 46.966, "eval_steps_per_second": 5.871, "eval_wer": 0.9991228070175439, "step": 10660 }, { "epoch": 13.414887126296522, "grad_norm": 3.6727077960968018, "learning_rate": 2.6388000000000002e-05, "loss": 0.1083, "step": 11000 }, { "epoch": 14.0, "eval_cer": 1.0193515884305357, "eval_loss": 0.04150446131825447, "eval_runtime": 145.6958, "eval_samples_per_second": 46.947, "eval_steps_per_second": 5.868, "eval_wer": 0.9988304093567252, "step": 11480 }, { "epoch": 14.024405125076266, "grad_norm": 0.8475801348686218, "learning_rate": 2.7585600000000002e-05, "loss": 0.1029, "step": 11500 }, { "epoch": 14.634533251982916, "grad_norm": 2.529123067855835, "learning_rate": 2.87856e-05, "loss": 0.0974, "step": 12000 }, { "epoch": 15.0, "eval_cer": 1.0239153627311521, "eval_loss": 0.0653274655342102, "eval_runtime": 140.8698, "eval_samples_per_second": 48.555, "eval_steps_per_second": 6.069, "eval_wer": 0.9989766081871345, "step": 12300 }, { "epoch": 15.24405125076266, "grad_norm": 4.318971157073975, "learning_rate": 2.99856e-05, "loss": 0.1027, "step": 12500 }, { "epoch": 15.854179377669311, "grad_norm": 5.327524662017822, "learning_rate": 2.9956013070043084e-05, "loss": 0.0967, "step": 13000 }, { "epoch": 16.0, "eval_cer": 1.0200331910858227, "eval_loss": 0.04946194589138031, "eval_runtime": 129.206, "eval_samples_per_second": 52.939, "eval_steps_per_second": 6.617, "eval_wer": 0.9991228070175439, "step": 13120 }, { "epoch": 16.463697376449055, "grad_norm": 3.569943904876709, "learning_rate": 2.9822174136311704e-05, "loss": 0.087, "step": 13500 }, { "epoch": 17.0, "eval_cer": 1.0223743480322427, "eval_loss": 0.06013474613428116, "eval_runtime": 137.3873, "eval_samples_per_second": 49.786, "eval_steps_per_second": 6.223, "eval_wer": 0.9989766081871345, "step": 13940 }, { "epoch": 17.0732153752288, "grad_norm": 6.857547283172607, "learning_rate": 2.9599814696946643e-05, "loss": 0.0885, "step": 14000 }, { "epoch": 17.683343502135447, "grad_norm": 0.8758929371833801, "learning_rate": 2.9289379955813937e-05, "loss": 0.0798, "step": 14500 }, { "epoch": 18.0, "eval_cer": 1.0217816500711236, "eval_loss": 0.054376766085624695, "eval_runtime": 139.8471, "eval_samples_per_second": 48.911, "eval_steps_per_second": 6.114, "eval_wer": 0.9989766081871345, "step": 14760 }, { "epoch": 18.29286150091519, "grad_norm": 3.2238011360168457, "learning_rate": 2.8893968733503523e-05, "loss": 0.0765, "step": 15000 }, { "epoch": 18.902989627821842, "grad_norm": 5.363631725311279, "learning_rate": 2.84143727148899e-05, "loss": 0.0719, "step": 15500 }, { "epoch": 19.0, "eval_cer": 1.0190848743480323, "eval_loss": 0.04263332858681679, "eval_runtime": 125.432, "eval_samples_per_second": 54.532, "eval_steps_per_second": 6.816, "eval_wer": 0.9989766081871345, "step": 15580 }, { "epoch": 19.512507626601586, "grad_norm": 0.8454777002334595, "learning_rate": 2.785418066112353e-05, "loss": 0.0731, "step": 16000 }, { "epoch": 20.0, "eval_cer": 1.0208333333333333, "eval_loss": 0.058695003390312195, "eval_runtime": 139.9224, "eval_samples_per_second": 48.884, "eval_steps_per_second": 6.111, "eval_wer": 0.9991228070175439, "step": 16400 }, { "epoch": 20.12202562538133, "grad_norm": 1.441945195198059, "learning_rate": 2.7216758309791792e-05, "loss": 0.0686, "step": 16500 }, { "epoch": 20.73215375228798, "grad_norm": 1.6329078674316406, "learning_rate": 2.6505935412410244e-05, "loss": 0.0693, "step": 17000 }, { "epoch": 21.0, "eval_cer": 1.0221965386439071, "eval_loss": 0.060316070914268494, "eval_runtime": 126.6576, "eval_samples_per_second": 54.004, "eval_steps_per_second": 6.75, "eval_wer": 0.9988304093567252, "step": 17220 }, { "epoch": 21.341671751067725, "grad_norm": 2.643927574157715, "learning_rate": 2.5725982724566367e-05, "loss": 0.0665, "step": 17500 }, { "epoch": 21.951799877974373, "grad_norm": 6.064180850982666, "learning_rate": 2.4881586346429215e-05, "loss": 0.0614, "step": 18000 }, { "epoch": 22.0, "eval_cer": 1.0191145092460883, "eval_loss": 0.03614737093448639, "eval_runtime": 143.2008, "eval_samples_per_second": 47.765, "eval_steps_per_second": 5.971, "eval_wer": 0.9988304093567252, "step": 18040 }, { "epoch": 22.561317876754117, "grad_norm": 0.2802174389362335, "learning_rate": 2.3977819567791885e-05, "loss": 0.0582, "step": 18500 }, { "epoch": 23.0, "eval_cer": 1.0173364153627311, "eval_loss": 0.03316638618707657, "eval_runtime": 137.4557, "eval_samples_per_second": 49.761, "eval_steps_per_second": 6.22, "eval_wer": 0.9988304093567252, "step": 18860 }, { "epoch": 23.17083587553386, "grad_norm": 4.600666522979736, "learning_rate": 2.302011238680703e-05, "loss": 0.0614, "step": 19000 }, { "epoch": 23.780964002440513, "grad_norm": 3.028637647628784, "learning_rate": 2.2014218885552525e-05, "loss": 0.0535, "step": 19500 }, { "epoch": 24.0, "eval_cer": 1.0172475106685632, "eval_loss": 0.03474853187799454, "eval_runtime": 142.7947, "eval_samples_per_second": 47.901, "eval_steps_per_second": 5.988, "eval_wer": 0.9988304093567252, "step": 19680 }, { "epoch": 24.390482001220256, "grad_norm": 0.652704656124115, "learning_rate": 2.096618265844089e-05, "loss": 0.0509, "step": 20000 }, { "epoch": 25.0, "grad_norm": 1.3256402015686035, "learning_rate": 1.988230050118496e-05, "loss": 0.0467, "step": 20500 }, { "epoch": 25.0, "eval_cer": 1.0179587482219061, "eval_loss": 0.033395446836948395, "eval_runtime": 143.6182, "eval_samples_per_second": 47.626, "eval_steps_per_second": 5.953, "eval_wer": 0.9988304093567252, "step": 20500 }, { "epoch": 25.610128126906652, "grad_norm": 2.4054596424102783, "learning_rate": 1.876908457848333e-05, "loss": 0.0456, "step": 21000 }, { "epoch": 26.0, "eval_cer": 1.0163880986249407, "eval_loss": 0.028318189084529877, "eval_runtime": 136.1666, "eval_samples_per_second": 50.233, "eval_steps_per_second": 6.279, "eval_wer": 0.9988304093567252, "step": 21320 }, { "epoch": 26.219646125686396, "grad_norm": 2.3439478874206543, "learning_rate": 1.7635513102937044e-05, "loss": 0.041, "step": 21500 }, { "epoch": 26.829774252593044, "grad_norm": 4.5279974937438965, "learning_rate": 1.6486170220352805e-05, "loss": 0.0389, "step": 22000 }, { "epoch": 27.0, "eval_cer": 1.0172178757705073, "eval_loss": 0.03608373552560806, "eval_runtime": 133.2593, "eval_samples_per_second": 51.329, "eval_steps_per_second": 6.416, "eval_wer": 0.9988304093567252, "step": 22140 }, { "epoch": 27.439292251372787, "grad_norm": 0.17273186147212982, "learning_rate": 1.5325608410059234e-05, "loss": 0.04, "step": 22500 }, { "epoch": 28.0, "eval_cer": 1.016714082503556, "eval_loss": 0.025806378573179245, "eval_runtime": 139.0344, "eval_samples_per_second": 49.196, "eval_steps_per_second": 6.15, "eval_wer": 0.9988304093567252, "step": 22960 }, { "epoch": 28.04881025015253, "grad_norm": 0.3772958219051361, "learning_rate": 1.4163090284146517e-05, "loss": 0.0383, "step": 23000 }, { "epoch": 28.658938377059183, "grad_norm": 2.2773993015289307, "learning_rate": 1.3005600466773616e-05, "loss": 0.0348, "step": 23500 }, { "epoch": 29.0, "eval_cer": 1.0176031294452348, "eval_loss": 0.03283878415822983, "eval_runtime": 134.9976, "eval_samples_per_second": 50.668, "eval_steps_per_second": 6.333, "eval_wer": 0.9989766081871345, "step": 23780 }, { "epoch": 29.268456375838927, "grad_norm": 0.017471132799983025, "learning_rate": 1.186009337109073e-05, "loss": 0.0356, "step": 24000 }, { "epoch": 29.878584502745575, "grad_norm": 0.18886829912662506, "learning_rate": 1.0733451415837331e-05, "loss": 0.0343, "step": 24500 }, { "epoch": 30.0, "eval_cer": 1.016239924134661, "eval_loss": 0.027611197903752327, "eval_runtime": 136.0309, "eval_samples_per_second": 50.283, "eval_steps_per_second": 6.285, "eval_wer": 0.9988304093567252, "step": 24600 }, { "epoch": 30.48810250152532, "grad_norm": 0.37121689319610596, "learning_rate": 9.634615680568962e-06, "loss": 0.0323, "step": 25000 }, { "epoch": 31.0, "eval_cer": 1.0165066382171646, "eval_loss": 0.029725994914770126, "eval_runtime": 136.5772, "eval_samples_per_second": 50.082, "eval_steps_per_second": 6.26, "eval_wer": 0.9988304093567252, "step": 25420 }, { "epoch": 31.097620500305062, "grad_norm": 0.04804076626896858, "learning_rate": 8.56578623342252e-06, "loss": 0.0298, "step": 25500 }, { "epoch": 31.707748627211714, "grad_norm": 3.0204851627349854, "learning_rate": 7.535614733981355e-06, "loss": 0.0283, "step": 26000 }, { "epoch": 32.0, "eval_cer": 1.0165362731152205, "eval_loss": 0.029090913012623787, "eval_runtime": 146.0361, "eval_samples_per_second": 46.838, "eval_steps_per_second": 5.855, "eval_wer": 0.9988304093567252, "step": 26240 }, { "epoch": 32.31726662599146, "grad_norm": 0.10753169655799866, "learning_rate": 6.550290643366546e-06, "loss": 0.0297, "step": 26500 }, { "epoch": 32.92739475289811, "grad_norm": 0.41164013743400574, "learning_rate": 5.615733971162722e-06, "loss": 0.0275, "step": 27000 }, { "epoch": 33.0, "eval_cer": 1.0160917496443813, "eval_loss": 0.0251940805464983, "eval_runtime": 140.9294, "eval_samples_per_second": 48.535, "eval_steps_per_second": 6.067, "eval_wer": 0.9988304093567252, "step": 27060 }, { "epoch": 33.53691275167785, "grad_norm": 0.8919934630393982, "learning_rate": 4.737559706904321e-06, "loss": 0.0256, "step": 27500 }, { "epoch": 34.0, "eval_cer": 1.0164473684210527, "eval_loss": 0.024480827152729034, "eval_runtime": 145.8102, "eval_samples_per_second": 46.91, "eval_steps_per_second": 5.864, "eval_wer": 0.9988304093567252, "step": 27880 }, { "epoch": 34.1464307504576, "grad_norm": 0.016134992241859436, "learning_rate": 3.921044084178765e-06, "loss": 0.0265, "step": 28000 }, { "epoch": 34.75655887736425, "grad_norm": 0.048243045806884766, "learning_rate": 3.171092880038068e-06, "loss": 0.0241, "step": 28500 }, { "epoch": 35.0, "eval_cer": 1.0159139402560455, "eval_loss": 0.023953931406140327, "eval_runtime": 141.1272, "eval_samples_per_second": 48.467, "eval_steps_per_second": 6.058, "eval_wer": 0.9988304093567252, "step": 28700 }, { "epoch": 35.36607687614399, "grad_norm": 0.09765101969242096, "learning_rate": 2.4922119401819466e-06, "loss": 0.0243, "step": 29000 }, { "epoch": 35.97620500305064, "grad_norm": 1.2347666025161743, "learning_rate": 1.8896100834437107e-06, "loss": 0.0237, "step": 29500 }, { "epoch": 36.0, "eval_cer": 1.0165659080132765, "eval_loss": 0.0277857668697834, "eval_runtime": 141.5894, "eval_samples_per_second": 48.309, "eval_steps_per_second": 6.039, "eval_wer": 0.9988304093567252, "step": 29520 }, { "epoch": 36.58572300183038, "grad_norm": 2.3807311058044434, "learning_rate": 1.3654632704576153e-06, "loss": 0.0238, "step": 30000 }, { "epoch": 37.0, "eval_cer": 1.0162991939307728, "eval_loss": 0.027532441541552544, "eval_runtime": 145.7443, "eval_samples_per_second": 46.932, "eval_steps_per_second": 5.866, "eval_wer": 0.9988304093567252, "step": 30340 }, { "epoch": 37.195241000610125, "grad_norm": 1.2338793277740479, "learning_rate": 9.221049871134596e-07, "loss": 0.0237, "step": 30500 }, { "epoch": 37.80536912751678, "grad_norm": 0.02291560173034668, "learning_rate": 5.633293037499832e-07, "loss": 0.022, "step": 31000 }, { "epoch": 38.0, "eval_cer": 1.0162991939307728, "eval_loss": 0.024687692523002625, "eval_runtime": 136.7797, "eval_samples_per_second": 50.007, "eval_steps_per_second": 6.251, "eval_wer": 0.9988304093567252, "step": 31160 }, { "epoch": 38.41488712629652, "grad_norm": 1.2824342250823975, "learning_rate": 2.912918111057888e-07, "loss": 0.0184, "step": 31500 }, { "epoch": 39.0, "eval_cer": 1.0162695590327169, "eval_loss": 0.02619771659374237, "eval_runtime": 143.9756, "eval_samples_per_second": 47.508, "eval_steps_per_second": 5.939, "eval_wer": 0.9988304093567252, "step": 31980 }, { "epoch": 39.024405125076264, "grad_norm": 0.024750633165240288, "learning_rate": 1.0762696080869105e-07, "loss": 0.0228, "step": 32000 }, { "epoch": 39.634533251982916, "grad_norm": 0.007463211193680763, "learning_rate": 1.3438245287707985e-08, "loss": 0.0199, "step": 32500 }, { "epoch": 39.95179987797437, "eval_cer": 1.0160324798482694, "eval_loss": 0.024437865242362022, "eval_runtime": 142.7998, "eval_samples_per_second": 47.899, "eval_steps_per_second": 5.987, "eval_wer": 0.9988304093567252, "step": 32760 }, { "epoch": 39.95179987797437, "step": 32760, "total_flos": 3.0824437935703523e+19, "train_loss": 1.9907507799775026, "train_runtime": 53433.8738, "train_samples_per_second": 39.256, "train_steps_per_second": 0.613 } ], "logging_steps": 500, "max_steps": 32760, "num_input_tokens_seen": 0, "num_train_epochs": 40, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.0824437935703523e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }