{ "best_metric": 0.3642527461051941, "best_model_checkpoint": "./working/checkpoint-9000", "epoch": 107.14285714285714, "eval_steps": 1000, "global_step": 9000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.19, "grad_norm": 5.874335765838623, "learning_rate": 1.5e-06, "loss": 12.5691, "step": 100 }, { "epoch": 2.38, "grad_norm": 11.808794975280762, "learning_rate": 2.9850000000000002e-06, "loss": 9.8251, "step": 200 }, { "epoch": 3.57, "grad_norm": 10.97357177734375, "learning_rate": 4.485e-06, "loss": 6.5329, "step": 300 }, { "epoch": 4.76, "grad_norm": 10.752958297729492, "learning_rate": 5.985e-06, "loss": 5.2405, "step": 400 }, { "epoch": 5.95, "grad_norm": 5.865973949432373, "learning_rate": 7.485e-06, "loss": 4.629, "step": 500 }, { "epoch": 7.14, "grad_norm": 4.192840099334717, "learning_rate": 8.985e-06, "loss": 4.2437, "step": 600 }, { "epoch": 8.33, "grad_norm": 3.2876224517822266, "learning_rate": 1.0485e-05, "loss": 3.9784, "step": 700 }, { "epoch": 9.52, "grad_norm": 1.2407236099243164, "learning_rate": 1.1985000000000001e-05, "loss": 3.7904, "step": 800 }, { "epoch": 10.71, "grad_norm": 0.6437963247299194, "learning_rate": 1.3485e-05, "loss": 3.6682, "step": 900 }, { "epoch": 11.9, "grad_norm": 0.3295372724533081, "learning_rate": 1.4985e-05, "loss": 3.5325, "step": 1000 }, { "epoch": 11.9, "eval_cer": 0.926601333602748, "eval_loss": 3.4897494316101074, "eval_runtime": 10.3726, "eval_samples_per_second": 32.393, "eval_steps_per_second": 4.049, "eval_wer": 1.0, "step": 1000 }, { "epoch": 13.1, "grad_norm": 0.61485356092453, "learning_rate": 1.6485e-05, "loss": 3.4934, "step": 1100 }, { "epoch": 14.29, "grad_norm": 0.4778901934623718, "learning_rate": 1.7985e-05, "loss": 3.4662, "step": 1200 }, { "epoch": 15.48, "grad_norm": 0.56944739818573, "learning_rate": 1.9485e-05, "loss": 3.4413, "step": 1300 }, { "epoch": 16.67, "grad_norm": 0.37235376238822937, "learning_rate": 2.0985e-05, "loss": 3.4089, "step": 1400 }, { "epoch": 17.86, "grad_norm": 0.4277956187725067, "learning_rate": 2.2485000000000002e-05, "loss": 3.3384, "step": 1500 }, { "epoch": 19.05, "grad_norm": 0.50371253490448, "learning_rate": 2.3985e-05, "loss": 3.2455, "step": 1600 }, { "epoch": 20.24, "grad_norm": 1.1383576393127441, "learning_rate": 2.5485e-05, "loss": 3.0681, "step": 1700 }, { "epoch": 21.43, "grad_norm": 0.8668686747550964, "learning_rate": 2.6985e-05, "loss": 2.7949, "step": 1800 }, { "epoch": 22.62, "grad_norm": 1.0731563568115234, "learning_rate": 2.8485000000000003e-05, "loss": 2.4866, "step": 1900 }, { "epoch": 23.81, "grad_norm": 1.3317248821258545, "learning_rate": 2.9985000000000002e-05, "loss": 2.1973, "step": 2000 }, { "epoch": 23.81, "eval_cer": 0.24030107092341887, "eval_loss": 1.1350404024124146, "eval_runtime": 10.3527, "eval_samples_per_second": 32.455, "eval_steps_per_second": 4.057, "eval_wer": 0.839647119875454, "step": 2000 }, { "epoch": 25.0, "grad_norm": 2.0239453315734863, "learning_rate": 2.962875e-05, "loss": 1.9821, "step": 2100 }, { "epoch": 26.19, "grad_norm": 1.464921236038208, "learning_rate": 2.925375e-05, "loss": 1.853, "step": 2200 }, { "epoch": 27.38, "grad_norm": 1.6508703231811523, "learning_rate": 2.887875e-05, "loss": 1.7547, "step": 2300 }, { "epoch": 28.57, "grad_norm": 1.3476407527923584, "learning_rate": 2.850375e-05, "loss": 1.7171, "step": 2400 }, { "epoch": 29.76, "grad_norm": 1.2977994680404663, "learning_rate": 2.812875e-05, "loss": 1.6498, "step": 2500 }, { "epoch": 30.95, 
"grad_norm": 1.8536533117294312, "learning_rate": 2.775375e-05, "loss": 1.5965, "step": 2600 }, { "epoch": 32.14, "grad_norm": 1.7063647508621216, "learning_rate": 2.7378750000000003e-05, "loss": 1.5744, "step": 2700 }, { "epoch": 33.33, "grad_norm": 1.613274097442627, "learning_rate": 2.700375e-05, "loss": 1.5483, "step": 2800 }, { "epoch": 34.52, "grad_norm": 1.6182752847671509, "learning_rate": 2.662875e-05, "loss": 1.5076, "step": 2900 }, { "epoch": 35.71, "grad_norm": 1.9291083812713623, "learning_rate": 2.6253750000000003e-05, "loss": 1.4762, "step": 3000 }, { "epoch": 35.71, "eval_cer": 0.15634471610426348, "eval_loss": 0.527005672454834, "eval_runtime": 10.3175, "eval_samples_per_second": 32.566, "eval_steps_per_second": 4.071, "eval_wer": 0.6844836533471718, "step": 3000 }, { "epoch": 36.9, "grad_norm": 1.7794642448425293, "learning_rate": 2.587875e-05, "loss": 1.4752, "step": 3100 }, { "epoch": 38.1, "grad_norm": 1.7410004138946533, "learning_rate": 2.550375e-05, "loss": 1.4586, "step": 3200 }, { "epoch": 39.29, "grad_norm": 1.6410831212997437, "learning_rate": 2.512875e-05, "loss": 1.4172, "step": 3300 }, { "epoch": 40.48, "grad_norm": 2.350106954574585, "learning_rate": 2.475375e-05, "loss": 1.3751, "step": 3400 }, { "epoch": 41.67, "grad_norm": 1.7794309854507446, "learning_rate": 2.437875e-05, "loss": 1.3516, "step": 3500 }, { "epoch": 42.86, "grad_norm": 1.8536804914474487, "learning_rate": 2.400375e-05, "loss": 1.338, "step": 3600 }, { "epoch": 44.05, "grad_norm": 2.043091058731079, "learning_rate": 2.362875e-05, "loss": 1.31, "step": 3700 }, { "epoch": 45.24, "grad_norm": 2.556605577468872, "learning_rate": 2.325375e-05, "loss": 1.2737, "step": 3800 }, { "epoch": 46.43, "grad_norm": 2.167360544204712, "learning_rate": 2.2878750000000002e-05, "loss": 1.2706, "step": 3900 }, { "epoch": 47.62, "grad_norm": 1.933358907699585, "learning_rate": 2.2503750000000003e-05, "loss": 1.2409, "step": 4000 }, { "epoch": 47.62, "eval_cer": 0.140280864821176, "eval_loss": 0.41946256160736084, "eval_runtime": 10.4076, "eval_samples_per_second": 32.284, "eval_steps_per_second": 4.036, "eval_wer": 0.6331084587441619, "step": 4000 }, { "epoch": 48.81, "grad_norm": 2.1927788257598877, "learning_rate": 2.212875e-05, "loss": 1.2476, "step": 4100 }, { "epoch": 50.0, "grad_norm": 3.7042958736419678, "learning_rate": 2.175375e-05, "loss": 1.2211, "step": 4200 }, { "epoch": 51.19, "grad_norm": 2.503298282623291, "learning_rate": 2.13825e-05, "loss": 1.1974, "step": 4300 }, { "epoch": 52.38, "grad_norm": 2.378753423690796, "learning_rate": 2.101125e-05, "loss": 1.19, "step": 4400 }, { "epoch": 53.57, "grad_norm": 6.145068645477295, "learning_rate": 2.063625e-05, "loss": 1.1734, "step": 4500 }, { "epoch": 54.76, "grad_norm": 2.2741551399230957, "learning_rate": 2.026125e-05, "loss": 1.1664, "step": 4600 }, { "epoch": 55.95, "grad_norm": 3.21976900100708, "learning_rate": 1.988625e-05, "loss": 1.1555, "step": 4700 }, { "epoch": 57.14, "grad_norm": 3.06923508644104, "learning_rate": 1.951125e-05, "loss": 1.1391, "step": 4800 }, { "epoch": 58.33, "grad_norm": 1.8809341192245483, "learning_rate": 1.9136249999999998e-05, "loss": 1.1271, "step": 4900 }, { "epoch": 59.52, "grad_norm": 2.041844367980957, "learning_rate": 1.876125e-05, "loss": 1.1241, "step": 5000 }, { "epoch": 59.52, "eval_cer": 0.13785613255203072, "eval_loss": 0.38446417450904846, "eval_runtime": 10.417, "eval_samples_per_second": 32.255, "eval_steps_per_second": 4.032, "eval_wer": 0.63622210690192, "step": 5000 }, { "epoch": 60.71, 
"grad_norm": 2.3043465614318848, "learning_rate": 1.838625e-05, "loss": 1.1042, "step": 5100 }, { "epoch": 61.9, "grad_norm": 3.682835340499878, "learning_rate": 1.801125e-05, "loss": 1.0921, "step": 5200 }, { "epoch": 63.1, "grad_norm": 4.466809272766113, "learning_rate": 1.7636250000000002e-05, "loss": 1.095, "step": 5300 }, { "epoch": 64.29, "grad_norm": 2.615339994430542, "learning_rate": 1.726125e-05, "loss": 1.1057, "step": 5400 }, { "epoch": 65.48, "grad_norm": 3.483346700668335, "learning_rate": 1.688625e-05, "loss": 1.0553, "step": 5500 }, { "epoch": 66.67, "grad_norm": 2.141965866088867, "learning_rate": 1.651125e-05, "loss": 1.0656, "step": 5600 }, { "epoch": 67.86, "grad_norm": 2.2111611366271973, "learning_rate": 1.613625e-05, "loss": 1.0673, "step": 5700 }, { "epoch": 69.05, "grad_norm": 2.071429491043091, "learning_rate": 1.576125e-05, "loss": 1.0632, "step": 5800 }, { "epoch": 70.24, "grad_norm": 4.86116886138916, "learning_rate": 1.538625e-05, "loss": 1.0447, "step": 5900 }, { "epoch": 71.43, "grad_norm": 3.3076369762420654, "learning_rate": 1.5011250000000001e-05, "loss": 1.024, "step": 6000 }, { "epoch": 71.43, "eval_cer": 0.13548191553849262, "eval_loss": 0.3715837895870209, "eval_runtime": 10.3955, "eval_samples_per_second": 32.322, "eval_steps_per_second": 4.04, "eval_wer": 0.6320705760249092, "step": 6000 }, { "epoch": 72.62, "grad_norm": 2.3496508598327637, "learning_rate": 1.463625e-05, "loss": 1.0379, "step": 6100 }, { "epoch": 73.81, "grad_norm": 2.621004343032837, "learning_rate": 1.426125e-05, "loss": 1.0515, "step": 6200 }, { "epoch": 75.0, "grad_norm": 5.240926742553711, "learning_rate": 1.388625e-05, "loss": 1.0253, "step": 6300 }, { "epoch": 76.19, "grad_norm": 2.9943532943725586, "learning_rate": 1.351125e-05, "loss": 1.0131, "step": 6400 }, { "epoch": 77.38, "grad_norm": 2.475804328918457, "learning_rate": 1.3136250000000001e-05, "loss": 1.0227, "step": 6500 }, { "epoch": 78.57, "grad_norm": 2.5056631565093994, "learning_rate": 1.2761250000000001e-05, "loss": 1.0025, "step": 6600 }, { "epoch": 79.76, "grad_norm": 3.9102323055267334, "learning_rate": 1.238625e-05, "loss": 1.0181, "step": 6700 }, { "epoch": 80.95, "grad_norm": 3.3800106048583984, "learning_rate": 1.201125e-05, "loss": 0.9892, "step": 6800 }, { "epoch": 82.14, "grad_norm": 2.165987014770508, "learning_rate": 1.164e-05, "loss": 0.9822, "step": 6900 }, { "epoch": 83.33, "grad_norm": 2.426816463470459, "learning_rate": 1.1265e-05, "loss": 0.9922, "step": 7000 }, { "epoch": 83.33, "eval_cer": 0.13310769852495455, "eval_loss": 0.3727741539478302, "eval_runtime": 10.3942, "eval_samples_per_second": 32.326, "eval_steps_per_second": 4.041, "eval_wer": 0.628956927867151, "step": 7000 }, { "epoch": 84.52, "grad_norm": 3.203552007675171, "learning_rate": 1.089e-05, "loss": 0.9995, "step": 7100 }, { "epoch": 85.71, "grad_norm": 2.827246904373169, "learning_rate": 1.0515e-05, "loss": 0.9683, "step": 7200 }, { "epoch": 86.9, "grad_norm": 2.9927895069122314, "learning_rate": 1.0140000000000001e-05, "loss": 0.9864, "step": 7300 }, { "epoch": 88.1, "grad_norm": 2.151737928390503, "learning_rate": 9.765e-06, "loss": 0.9744, "step": 7400 }, { "epoch": 89.29, "grad_norm": 2.5920581817626953, "learning_rate": 9.39e-06, "loss": 0.9794, "step": 7500 }, { "epoch": 90.48, "grad_norm": 3.2127621173858643, "learning_rate": 9.015e-06, "loss": 0.9622, "step": 7600 }, { "epoch": 91.67, "grad_norm": 3.541879892349243, "learning_rate": 8.64e-06, "loss": 0.9624, "step": 7700 }, { "epoch": 92.86, "grad_norm": 
2.827958345413208, "learning_rate": 8.265000000000001e-06, "loss": 0.9637, "step": 7800 }, { "epoch": 94.05, "grad_norm": 2.648591995239258, "learning_rate": 7.89e-06, "loss": 0.9684, "step": 7900 }, { "epoch": 95.24, "grad_norm": 4.264640808105469, "learning_rate": 7.515e-06, "loss": 0.9432, "step": 8000 }, { "epoch": 95.24, "eval_cer": 0.13209739341281068, "eval_loss": 0.3648131787776947, "eval_runtime": 10.4236, "eval_samples_per_second": 32.234, "eval_steps_per_second": 4.029, "eval_wer": 0.6170212765957447, "step": 8000 }, { "epoch": 96.43, "grad_norm": 2.8603055477142334, "learning_rate": 7.14e-06, "loss": 0.9576, "step": 8100 }, { "epoch": 97.62, "grad_norm": 2.931117296218872, "learning_rate": 6.7650000000000005e-06, "loss": 0.9579, "step": 8200 }, { "epoch": 98.81, "grad_norm": 3.449780225753784, "learning_rate": 6.39e-06, "loss": 0.9535, "step": 8300 }, { "epoch": 100.0, "grad_norm": 4.3435468673706055, "learning_rate": 6.015000000000001e-06, "loss": 0.9463, "step": 8400 }, { "epoch": 101.19, "grad_norm": 2.2839837074279785, "learning_rate": 5.64e-06, "loss": 0.9413, "step": 8500 }, { "epoch": 102.38, "grad_norm": 3.1021485328674316, "learning_rate": 5.2649999999999996e-06, "loss": 0.9436, "step": 8600 }, { "epoch": 103.57, "grad_norm": 2.9421229362487793, "learning_rate": 4.890000000000001e-06, "loss": 0.939, "step": 8700 }, { "epoch": 104.76, "grad_norm": 2.0578436851501465, "learning_rate": 4.515e-06, "loss": 0.9338, "step": 8800 }, { "epoch": 105.95, "grad_norm": 3.5860297679901123, "learning_rate": 4.14e-06, "loss": 0.9315, "step": 8900 }, { "epoch": 107.14, "grad_norm": 2.1002159118652344, "learning_rate": 3.765e-06, "loss": 0.9279, "step": 9000 }, { "epoch": 107.14, "eval_cer": 0.13245100020206102, "eval_loss": 0.3642527461051941, "eval_runtime": 10.5077, "eval_samples_per_second": 31.976, "eval_steps_per_second": 3.997, "eval_wer": 0.6248053969901401, "step": 9000 } ], "logging_steps": 100, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 120, "save_steps": 1000, "total_flos": 2.7115316842745717e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }
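The state above is easiest to read programmatically rather than by eye. As a minimal sketch (not part of the original log), the snippet below loads a file like this one, splits `log_history` into training entries (which carry `loss`, logged every 100 steps per `logging_steps`) and evaluation entries (which carry `eval_loss`, `eval_wer`, `eval_cer`, logged every 1000 steps per `eval_steps`), and plots them. The path "./working/trainer_state.json" is an assumed location for illustration; only standard `json` and `matplotlib` calls are used.

# Sketch: visualize the trainer_state.json shown above.
# Assumes the JSON has been saved to "./working/trainer_state.json" (hypothetical path).
import json

import matplotlib.pyplot as plt

with open("./working/trainer_state.json") as f:
    state = json.load(f)

# Training entries have a "loss" key; evaluation entries have "eval_loss" etc.
train = [e for e in state["log_history"] if "loss" in e]
evals = [e for e in state["log_history"] if "eval_loss" in e]

fig, (ax_loss, ax_err) = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: training vs. evaluation loss over global steps.
ax_loss.plot([e["step"] for e in train], [e["loss"] for e in train], label="train loss")
ax_loss.plot([e["step"] for e in evals], [e["eval_loss"] for e in evals], label="eval loss")
ax_loss.set_xlabel("step")
ax_loss.set_ylabel("loss")
ax_loss.legend()

# Right panel: word and character error rates at each eval checkpoint.
ax_err.plot([e["step"] for e in evals], [e["eval_wer"] for e in evals], label="eval WER")
ax_err.plot([e["step"] for e in evals], [e["eval_cer"] for e in evals], label="eval CER")
ax_err.set_xlabel("step")
ax_err.set_ylabel("error rate")
ax_err.legend()

fig.tight_layout()
plt.show()

Read this way, the log shows the loss dropping steeply up to roughly step 2000 and then flattening, with the best eval_loss (0.3643, the "best_metric" recorded for checkpoint-9000) reached at step 9000 while WER plateaus around 0.62.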