diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..a1864f6d71855ba41d8133ddf3533b03b4544613 --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +*/*__pycache__* +*/checkpoint*/ +*/data*/ +*/mdls*/ +*/model* +*__pycache__* +checkpoint*/ +data*/ +mdls*/ +input*/ +output*/ +model* diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..ca795558511c7a57d3a08e73ea8432b36bc91c0d --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,4 @@ +{ + "</s>": 31, + "<s>": 30 +} diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e149dea5139e8252b5ac1290ed7e44ac4991cc4b --- /dev/null +++ b/config.json @@ -0,0 +1,109 @@ +{ + "_name_or_path": "facebook/wav2vec2-large-lv60", + "activation_dropout": 0.0, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForCTC" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 768, + "contrastive_logits_temperature": 0.1, + "conv_bias": true, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "mean", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.0, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + 
"mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "wav2vec2", + "num_adapter_layers": 3, + "num_attention_heads": 16, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 24, + "num_negatives": 100, + "output_hidden_size": 1024, + "pad_token_id": 29, + "proj_codevector_dim": 768, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.42.0.dev0", + "use_weighted_layer_sum": false, + "vocab_size": 32, + "xvector_output_dim": 512 +} diff --git a/demo.4gram.py b/demo.4gram.py new file mode 100644 index 0000000000000000000000000000000000000000..cc925bf8eff04c9aec630d875dd2d08549445782 --- /dev/null +++ b/demo.4gram.py @@ -0,0 +1,22 @@ +# import +import librosa +from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM + +# load the processor +processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm") +model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h") + +# load the audio data (use your own wav file here!) 
+input_audio, sr = librosa.load('my_wav_file.wav', sr=16000) + +# tokenize +input_values = processor(input_audio, return_tensors="pt", padding="longest").input_values + +# retrieve logits +logits = model(input_values).logits + +# decode using n-gram +transcription = processor.batch_decode(logits.detach().numpy()).text + +# print the output +print(transcription) diff --git a/demo.nolm.py b/demo.nolm.py new file mode 100644 index 0000000000000000000000000000000000000000..9cbb25dc8a14f0cfe0c4945ede71b73a4ce4807f --- /dev/null +++ b/demo.nolm.py @@ -0,0 +1,22 @@ +# import +import librosa, torch +from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer + +# load the tokenizer and model +tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-large-960h") +model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h") + +# load the audio data (use your own wav file here!) +input_audio, sr = librosa.load('my_wav_file.wav', sr=16000) + +# tokenize +input_values = tokenizer(input_audio, return_tensors="pt", padding="longest").input_values + +# retrieve logits +logits = model(input_values).logits + +# take argmax and decode +transcription = tokenizer.batch_decode(torch.argmax(logits, dim=-1)) + +# print the output +print(transcription) diff --git a/hub/version.txt b/hub/version.txt new file mode 100644 index 0000000000000000000000000000000000000000..56a6051ca2b02b04ef92d5150c9ef600403cb1de --- /dev/null +++ b/hub/version.txt @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/modules/__init__.py b/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/preprocessor_config.json b/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..67dee51bf073271c3a11d2a8768d86288a0d9f83 --- /dev/null +++ b/preprocessor_config.json @@ -0,0 +1,10 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + 
"feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "processor_class": "Wav2Vec2Processor", + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run.ami.log b/run.ami.log new file mode 100644 index 0000000000000000000000000000000000000000..5c430c4b5572195cbda7400badc4e9420d5b0c40 --- /dev/null +++ b/run.ami.log @@ -0,0 +1,23038 @@ +/opt/conda/lib/python3.12/site-packages/transformers/training_args.py:1483: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +05/25/2024 17:57:49 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True +05/25/2024 17:57:49 - INFO - __main__ - Training/evaluation parameters TrainingArguments( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +batch_eval_metrics=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=None, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=True, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_steps=1000, +eval_strategy=IntervalStrategy.STEPS, +evaluation_strategy=steps, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 
+fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=1, +gradient_checkpointing=True, +gradient_checkpointing_kwargs=None, +greater_is_better=None, +group_by_length=True, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=0.0003, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=./runs/May25_17-57-49_tz579-raptorlake, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1.0, +logging_strategy=IntervalStrategy.STEPS, +lr_scheduler_kwargs={}, +lr_scheduler_type=SchedulerType.LINEAR, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_train_epochs=2.0, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=./, +overwrite_output_dir=True, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=16, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=True, +report_to=['tensorboard'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=./, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=400, +save_strategy=IntervalStrategy.STEPS, +save_total_limit=3, +seed=42, +skip_memory_metrics=True, +split_batches=None, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, 
+tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=500, +weight_decay=0.0, +) +/opt/conda/lib/python3.12/site-packages/datasets/load.py:1486: FutureWarning: The repository for edinburghcstr/ami contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/edinburghcstr/ami +You can avoid this message in future by passing the argument `trust_remote_code=True`. +Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`. + warnings.warn( +loading configuration file config.json from cache at /home/Work/common_huggingface/hub/models--facebook--wav2vec2-large-lv60/snapshots/0cde644b64dac88d8416bec1c92a4099b850ba0b/config.json +Model config Wav2Vec2Config { + "_name_or_path": "facebook/wav2vec2-large-lv60", + "activation_dropout": 0.1, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForPreTraining" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 768, + "contrastive_logits_temperature": 0.1, + "conv_bias": true, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.1, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.1, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 
0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "wav2vec2", + "num_adapter_layers": 3, + "num_attention_heads": 16, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 24, + "num_negatives": 100, + "output_hidden_size": 1024, + "pad_token_id": 0, + "proj_codevector_dim": 768, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "transformers_version": "4.42.0.dev0", + "use_weighted_layer_sum": false, + "vocab_size": 32, + "xvector_output_dim": 512 +} + + Map: 0%| | 0/108502 [00:00', 'eos_token': '', 'unk_token': '[UNK]', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=True), added_tokens_decoder={ + 28: AddedToken("[UNK]", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False), + 29: AddedToken("[PAD]", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False), + 30: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), + 31: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), +} + +{ + "processor_class": "Wav2Vec2Processor" +} + +Using auto half precision backend +The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. +***** Running training ***** + Num examples = 102,201 + Num Epochs = 2 + Instantaneous batch size per device = 16 + Total train batch size (w. 
parallel, distributed & accumulation) = 16 + Gradient Accumulation steps = 1 + Total optimization steps = 12,776 + Number of trainable parameters = 311,261,344 + 0%| | 0/12776 [00:00 the best checkpoint (2000 steps) +## was then taken. +## MAKE SURE TO DO HYPER-PARAMETER TUNING TO GET BETTER RESULTS +python run_speech_recognition_ctc.py \ + --token="${HF_TOKEN}" \ + --dataset_name="edinburghcstr/ami" \ + --model_name_or_path="facebook/wav2vec2-large-lv60" \ + --dataset_config_name="ihm" \ + --train_split_name="train" \ + --eval_split_name="validation" \ + --output_dir="./" \ + --preprocessing_num_workers="16" \ + --overwrite_output_dir \ + --num_train_epochs="2" \ + --per_device_train_batch_size="16" \ + --per_device_eval_batch_size="16" \ + --gradient_accumulation_steps="1" \ + --learning_rate="3e-4" \ + --warmup_steps="500" \ + --evaluation_strategy="steps" \ + --text_column_name="text" \ + --min_duration_in_seconds="0.25" \ + --save_steps="400" \ + --eval_steps="1000" \ + --logging_steps="1" \ + --layerdrop="0.0" \ + --save_total_limit="3" \ + --freeze_feature_encoder \ + --gradient_checkpointing \ + --chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” \ + --fp16 \ + --group_by_length \ + --push_to_hub \ + --do_eval \ + --do_train --do_eval diff --git a/run.timit.log b/run.timit.log new file mode 100644 index 0000000000000000000000000000000000000000..c17c70d1b2aaf1684c11d594a599ccaae7c392db --- /dev/null +++ b/run.timit.log @@ -0,0 +1,8730 @@ +/opt/conda/lib/python3.12/site-packages/transformers/training_args.py:1483: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. 
Use `eval_strategy` instead + warnings.warn( +05/24/2024 13:33:16 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True +05/24/2024 13:33:16 - INFO - __main__ - Training/evaluation parameters TrainingArguments( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +batch_eval_metrics=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=None, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=True, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_steps=100, +eval_strategy=IntervalStrategy.STEPS, +evaluation_strategy=steps, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=1, +gradient_checkpointing=False, +gradient_checkpointing_kwargs=None, +greater_is_better=None, +group_by_length=True, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, 
+label_smoothing_factor=0.0, +learning_rate=0.0001, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=./wav2vec2-base-timit-fine-tuned/runs/May24_13-33-16_tz579-raptorlake, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=10, +logging_strategy=IntervalStrategy.STEPS, +lr_scheduler_kwargs={}, +lr_scheduler_type=SchedulerType.LINEAR, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_train_epochs=20.0, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=./wav2vec2-base-timit-fine-tuned, +overwrite_output_dir=True, +past_index=-1, +per_device_eval_batch_size=1, +per_device_train_batch_size=32, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=True, +report_to=['tensorboard'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=./wav2vec2-base-timit-fine-tuned, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=400, +save_strategy=IntervalStrategy.STEPS, +save_total_limit=3, +seed=42, +skip_memory_metrics=True, +split_batches=None, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=1000, +weight_decay=0.005, +) +/opt/conda/lib/python3.12/site-packages/datasets/load.py:1486: FutureWarning: The repository for timit_asr contains custom code which must be executed to correctly load the dataset. 
You can inspect the repository content at https://hf.co/datasets/timit_asr +You can avoid this message in future by passing the argument `trust_remote_code=True`. +Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`. + warnings.warn( + Downloading builder script: 0%| | 0.00/7.48k [00:00', 'eos_token': '', 'unk_token': '[UNK]', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=True), added_tokens_decoder={ + 27: AddedToken("[UNK]", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False), + 28: AddedToken("[PAD]", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False), + 29: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), + 30: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), +} + +{ + "processor_class": "Wav2Vec2Processor" +} + +Using auto half precision backend +The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. +***** Running training ***** + Num examples = 3,696 + Num Epochs = 20 + Instantaneous batch size per device = 32 + Total train batch size (w. 
parallel, distributed & accumulation) = 32 + Gradient Accumulation steps = 1 + Total optimization steps = 2,320 + Number of trainable parameters = 90,195,103 + 0%| | 0/2320 [00:00, +ignore_data_skip=False, +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=0.0001, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=./wav2vec2-base-timit-fine-tuned/runs/May19_22-08-09_tz579-raptorlake, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=10, +logging_strategy=IntervalStrategy.STEPS, +lr_scheduler_kwargs={}, +lr_scheduler_type=SchedulerType.LINEAR, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_train_epochs=20.0, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=./wav2vec2-base-timit-fine-tuned, +overwrite_output_dir=True, +past_index=-1, +per_device_eval_batch_size=1, +per_device_train_batch_size=32, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=True, +report_to=['tensorboard'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=./wav2vec2-base-timit-fine-tuned, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=400, +save_strategy=IntervalStrategy.STEPS, +save_total_limit=3, +seed=42, +skip_memory_metrics=True, +split_batches=None, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, 
+warmup_ratio=0.0, +warmup_steps=1000, +weight_decay=0.005, +) +/opt/conda/lib/python3.12/site-packages/datasets/load.py:1486: FutureWarning: The repository for timit_asr contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/timit_asr +You can avoid this message in future by passing the argument `trust_remote_code=True`. +Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`. + warnings.warn( +/opt/conda/lib/python3.12/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--wav2vec2-base/snapshots/0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8/config.json +/opt/conda/lib/python3.12/site-packages/transformers/configuration_utils.py:364: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`. 
+ warnings.warn( +Model config Wav2Vec2Config { + "_name_or_path": "facebook/wav2vec2-base", + "activation_dropout": 0.0, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForPreTraining" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 256, + "contrastive_logits_temperature": 0.1, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.1, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "freeze_feat_extract_train": true, + "gradient_checkpointing": true, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "no_mask_channel_overlap": false, + "no_mask_time_overlap": false, + "num_adapter_layers": 3, + "num_attention_heads": 12, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 12, + "num_negatives": 100, + 
"output_hidden_size": 768, + "pad_token_id": 0, + "proj_codevector_dim": 256, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "transformers_version": "4.42.0.dev0", + "use_weighted_layer_sum": false, + "vocab_size": 32, + "xvector_output_dim": 512 +} + +Map: 100%|███████████████████████████████████████████████████████████████████████████████| 3696/3696 [00:00<00:00, 258999.36 examples/s] +Map: 100%|███████████████████████████████████████████████████████████████████████████████| 1344/1344 [00:00<00:00, 582229.35 examples/s] +`use_fast` is set to `True` but the tokenizer class does not have a fast version. Falling back to the slow version. +loading file vocab.json +loading file tokenizer_config.json +loading file added_tokens.json +loading file special_tokens_map.json +loading file tokenizer.json +Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 
+loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--facebook--wav2vec2-base/snapshots/0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8/preprocessor_config.json +loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--wav2vec2-base/snapshots/0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8/config.json +Model config Wav2Vec2Config { + "_name_or_path": "facebook/wav2vec2-base", + "activation_dropout": 0.0, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForPreTraining" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 256, + "contrastive_logits_temperature": 0.1, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.1, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "freeze_feat_extract_train": true, + "gradient_checkpointing": true, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + 
"model_type": "wav2vec2", + "no_mask_channel_overlap": false, + "no_mask_time_overlap": false, + "num_adapter_layers": 3, + "num_attention_heads": 12, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 12, + "num_negatives": 100, + "output_hidden_size": 768, + "pad_token_id": 0, + "proj_codevector_dim": 256, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "transformers_version": "4.42.0.dev0", + "use_weighted_layer_sum": false, + "vocab_size": 32, + "xvector_output_dim": 512 +} + +Feature extractor Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 16000 +} + +loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--facebook--wav2vec2-base/snapshots/0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8/pytorch_model.bin +Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForCTC: ['project_hid.bias', 'project_hid.weight', 'project_q.bias', 'project_q.weight', 'quantizer.codevectors', 'quantizer.weight_proj.bias', 'quantizer.weight_proj.weight', 'wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v'] +- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). 
+- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). +Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. +Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json +tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json +Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json +added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json +Configuration saved in ./wav2vec2-base-timit-fine-tuned/config.json +loading configuration file ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json +loading configuration file ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json +loading configuration file ./wav2vec2-base-timit-fine-tuned/config.json +Model config Wav2Vec2Config { + "_name_or_path": "./wav2vec2-base-timit-fine-tuned", + "activation_dropout": 0.0, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForPreTraining" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 256, + "contrastive_logits_temperature": 0.1, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "mean", + 
"ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "freeze_feat_extract_train": true, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "no_mask_channel_overlap": false, + "no_mask_time_overlap": false, + "num_adapter_layers": 3, + "num_attention_heads": 12, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 12, + "num_negatives": 100, + "output_hidden_size": 768, + "pad_token_id": 28, + "proj_codevector_dim": 256, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "transformers_version": "4.42.0.dev0", + "use_weighted_layer_sum": false, + "vocab_size": 31, + "xvector_output_dim": 512 +} + +loading configuration file ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json +Feature extractor Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + 
"sampling_rate": 16000 +} + +loading file vocab.json +loading file tokenizer_config.json +loading file added_tokens.json +loading file special_tokens_map.json +loading file tokenizer.json +Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +Processor Wav2Vec2Processor: +- feature_extractor: Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 16000 +} + +- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='./wav2vec2-base-timit-fine-tuned', vocab_size=29, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': '[UNK]', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=True), added_tokens_decoder={ + 27: AddedToken("[UNK]", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False), + 28: AddedToken("[PAD]", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False), + 29: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), + 30: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), +} + +{ + "processor_class": "Wav2Vec2Processor" +} + +Using auto half precision backend +The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. +***** Running training ***** + Num examples = 3,696 + Num Epochs = 20 + Instantaneous batch size per device = 32 + Total train batch size (w. 
parallel, distributed & accumulation) = 32 + Gradient Accumulation steps = 1 + Total optimization steps = 2,320 + Number of trainable parameters = 90,195,103 + 0%|▎ | 7/2320 [00:10<48:36, 1.26s/it]/opt/conda/lib/python3.12/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at /home/conda/feedstock_root/build_artifacts/libtorch_1715567101190/work/aten/src/ATen/native/cudnn/Conv_v8.cpp:919.) + return F.conv1d(input, weight, bias, self.stride, +{'loss': 9.1142, 'grad_norm': 9.595185279846191, 'learning_rate': 9e-07, 'epoch': 0.09} +{'loss': 8.3446, 'grad_norm': 9.732986450195312, 'learning_rate': 1.9e-06, 'epoch': 0.17} +{'loss': 8.6592, 'grad_norm': 14.272214889526367, 'learning_rate': 2.8000000000000003e-06, 'epoch': 0.26} +{'loss': 7.6985, 'grad_norm': 15.0160493850708, 'learning_rate': 3.8e-06, 'epoch': 0.34} +{'loss': 6.9688, 'grad_norm': 16.610979080200195, 'learning_rate': 4.800000000000001e-06, 'epoch': 0.43} +{'loss': 6.232, 'grad_norm': 17.26924705505371, 'learning_rate': 5.8e-06, 'epoch': 0.52} +{'loss': 4.7271, 'grad_norm': 11.347734451293945, 'learning_rate': 6.800000000000001e-06, 'epoch': 0.6} +{'loss': 3.7919, 'grad_norm': 4.237112045288086, 'learning_rate': 7.8e-06, 'epoch': 0.69} +{'loss': 3.3967, 'grad_norm': 1.8833028078079224, 'learning_rate': 8.8e-06, 'epoch': 0.78} +{'loss': 3.1618, 'grad_norm': 1.3788093328475952, 'learning_rate': 9.800000000000001e-06, 'epoch': 0.86} + 4%|████▏ | 100/2320 [01:39<33:07, 1.12it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 3.1117007732391357, 'eval_wer': 1.0, 'eval_runtime': 40.0512, 'eval_samples_per_second': 33.557, 'eval_steps_per_second': 33.557, 'epoch': 0.86} +{'loss': 3.0865, 'grad_norm': 1.729278802871704, 'learning_rate': 1.08e-05, 'epoch': 0.95} +{'loss': 3.0809, 'grad_norm': 1.905969500541687, 'learning_rate': 1.18e-05, 'epoch': 1.03} +{'loss': 3.0346, 'grad_norm': 0.8360918760299683, 'learning_rate': 1.2800000000000001e-05, 'epoch': 1.12} +{'loss': 3.0106, 'grad_norm': 0.7653716206550598, 'learning_rate': 1.3800000000000002e-05, 'epoch': 1.21} +{'loss': 3.0165, 'grad_norm': 0.94779372215271, 'learning_rate': 1.48e-05, 'epoch': 1.29} +{'loss': 3.0, 'grad_norm': 0.8457741737365723, 'learning_rate': 1.58e-05, 'epoch': 1.38} +{'loss': 2.9903, 'grad_norm': 1.4369837045669556, 'learning_rate': 1.6800000000000002e-05, 'epoch': 1.47} +{'loss': 2.9852, 'grad_norm': 1.8290436267852783, 'learning_rate': 1.78e-05, 'epoch': 1.55} +{'loss': 2.99, 'grad_norm': 1.1530190706253052, 'learning_rate': 1.88e-05, 'epoch': 1.64} +{'loss': 2.9798, 'grad_norm': 1.1261711120605469, 'learning_rate': 1.9800000000000004e-05, 'epoch': 1.72} + 9%|████████▎ | 200/2320 [03:52<24:28, 1.44it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 2.9736363887786865, 'eval_wer': 1.0, 'eval_runtime': 39.6236, 'eval_samples_per_second': 33.919, 'eval_steps_per_second': 33.919, 'epoch': 1.72} +{'loss': 2.9718, 'grad_norm': 0.903380811214447, 'learning_rate': 2.08e-05, 'epoch': 1.81} +{'loss': 2.9766, 'grad_norm': 0.4889620244503021, 'learning_rate': 2.18e-05, 'epoch': 1.9} +{'loss': 2.9658, 'grad_norm': 1.3861790895462036, 'learning_rate': 2.2800000000000002e-05, 'epoch': 1.98} +{'loss': 2.9588, 'grad_norm': 0.7976490259170532, 'learning_rate': 2.38e-05, 'epoch': 2.07} +{'loss': 2.9523, 'grad_norm': 0.698798418045044, 'learning_rate': 2.48e-05, 'epoch': 2.16} +{'loss': 2.9496, 'grad_norm': 1.0858148336410522, 'learning_rate': 2.58e-05, 'epoch': 2.24} +{'loss': 2.9421, 'grad_norm': 0.5658290386199951, 'learning_rate': 2.6800000000000004e-05, 'epoch': 2.33} +{'loss': 2.9427, 'grad_norm': 0.5713534355163574, 'learning_rate': 2.7800000000000005e-05, 'epoch': 2.41} +{'loss': 2.9228, 'grad_norm': 0.7386118769645691, 'learning_rate': 2.88e-05, 'epoch': 2.5} +{'loss': 2.9144, 'grad_norm': 0.767816960811615, 'learning_rate': 2.98e-05, 'epoch': 2.59} + 13%|████████████▍ | 300/2320 [06:10<33:46, 1.00s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 2.9074809551239014, 'eval_wer': 1.0, 'eval_runtime': 39.8997, 'eval_samples_per_second': 33.684, 'eval_steps_per_second': 33.684, 'epoch': 2.59} +{'loss': 2.8965, 'grad_norm': 0.8676608204841614, 'learning_rate': 3.08e-05, 'epoch': 2.67} +{'loss': 2.8815, 'grad_norm': 1.6954621076583862, 'learning_rate': 3.18e-05, 'epoch': 2.76} +{'loss': 2.855, 'grad_norm': 1.1631884574890137, 'learning_rate': 3.2800000000000004e-05, 'epoch': 2.84} +{'loss': 2.781, 'grad_norm': 1.625454306602478, 'learning_rate': 3.38e-05, 'epoch': 2.93} +{'loss': 2.7756, 'grad_norm': 2.0763564109802246, 'learning_rate': 3.48e-05, 'epoch': 3.02} +{'loss': 2.6458, 'grad_norm': 2.036031723022461, 'learning_rate': 3.58e-05, 'epoch': 3.1} +{'loss': 2.5189, 'grad_norm': 1.366801142692566, 'learning_rate': 3.68e-05, 'epoch': 3.19} +{'loss': 2.433, 'grad_norm': 2.034527540206909, 'learning_rate': 3.7800000000000004e-05, 'epoch': 3.28} +{'loss': 2.2885, 'grad_norm': 3.8338165283203125, 'learning_rate': 3.88e-05, 'epoch': 3.36} +{'loss': 2.1714, 'grad_norm': 2.3443217277526855, 'learning_rate': 3.9800000000000005e-05, 'epoch': 3.45} + 17%|████████████████▌ | 400/2320 [08:24<23:08, 1.38it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 2.0944502353668213, 'eval_wer': 1.0325047801147227, 'eval_runtime': 39.7668, 'eval_samples_per_second': 33.797, 'eval_steps_per_second': 33.797, 'epoch': 3.45} + 17%|████████████████▌ | 400/2320 [09:04<23:08, 1.38it/sSaving model checkpoint to ./wav2vec2-base-timit-fine-tuned/checkpoint-400 +Configuration saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-400/config.json +Model weights saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-400/model.safetensors +Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-400/preprocessor_config.json +tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-400/tokenizer_config.json +Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-400/special_tokens_map.json +added tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-400/added_tokens.json +Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json +tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json +Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json +added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json + 17%|████████████████▏ | 401/2320 [09:06<6:52:25, 12.90s/it]/opt/conda/lib/python3.12/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at /home/conda/feedstock_root/build_artifacts/libtorch_1715567101190/work/aten/src/ATen/native/cudnn/Conv_v8.cpp:919.) 
+ return F.conv1d(input, weight, bias, self.stride, +{'loss': 2.0881, 'grad_norm': 4.349735260009766, 'learning_rate': 4.08e-05, 'epoch': 3.53} +{'loss': 1.9522, 'grad_norm': 2.450747489929199, 'learning_rate': 4.18e-05, 'epoch': 3.62} +{'loss': 1.8395, 'grad_norm': 2.2519729137420654, 'learning_rate': 4.2800000000000004e-05, 'epoch': 3.71} +{'loss': 1.7525, 'grad_norm': 2.693664789199829, 'learning_rate': 4.38e-05, 'epoch': 3.79} +{'loss': 1.6222, 'grad_norm': 1.9744929075241089, 'learning_rate': 4.4800000000000005e-05, 'epoch': 3.88} +{'loss': 1.5397, 'grad_norm': 3.802494764328003, 'learning_rate': 4.58e-05, 'epoch': 3.97} +{'loss': 1.4376, 'grad_norm': 2.301044225692749, 'learning_rate': 4.6800000000000006e-05, 'epoch': 4.05} +{'loss': 1.2829, 'grad_norm': 2.279372215270996, 'learning_rate': 4.78e-05, 'epoch': 4.14} +{'loss': 1.1976, 'grad_norm': 3.314736843109131, 'learning_rate': 4.88e-05, 'epoch': 4.22} +{'loss': 1.1579, 'grad_norm': 2.434694290161133, 'learning_rate': 4.9800000000000004e-05, 'epoch': 4.31} + 22%|████████████████████▋ | 500/2320 [10:43<34:53, 1.15s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 1.045101284980774, 'eval_wer': 0.8299189656742239, 'eval_runtime': 39.7455, 'eval_samples_per_second': 33.815, 'eval_steps_per_second': 33.815, 'epoch': 4.31} +{'loss': 1.0684, 'grad_norm': 1.8384031057357788, 'learning_rate': 5.08e-05, 'epoch': 4.4} +{'loss': 1.0319, 'grad_norm': 3.599148988723755, 'learning_rate': 5.1800000000000005e-05, 'epoch': 4.48} +{'loss': 0.9179, 'grad_norm': 2.066476583480835, 'learning_rate': 5.28e-05, 'epoch': 4.57} +{'loss': 0.8838, 'grad_norm': 2.2173750400543213, 'learning_rate': 5.380000000000001e-05, 'epoch': 4.66} +{'loss': 0.8991, 'grad_norm': 2.427091121673584, 'learning_rate': 5.4800000000000004e-05, 'epoch': 4.74} +{'loss': 0.8, 'grad_norm': 2.7432241439819336, 'learning_rate': 5.580000000000001e-05, 'epoch': 4.83} +{'loss': 0.7803, 'grad_norm': 3.254221200942993, 'learning_rate': 5.68e-05, 'epoch': 4.91} +{'loss': 0.8205, 'grad_norm': 4.457448482513428, 'learning_rate': 5.7799999999999995e-05, 'epoch': 5.0} +{'loss': 0.6703, 'grad_norm': 3.1023166179656982, 'learning_rate': 5.88e-05, 'epoch': 5.09} +{'loss': 0.6087, 'grad_norm': 2.5916504859924316, 'learning_rate': 5.9800000000000003e-05, 'epoch': 5.17} + 26%|████████████████████████▊ | 600/2320 [12:58<23:53, 1.20it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 0.6753795146942139, 'eval_wer': 0.6440863152144223, 'eval_runtime': 39.7485, 'eval_samples_per_second': 33.813, 'eval_steps_per_second': 33.813, 'epoch': 5.17} +{'loss': 0.6569, 'grad_norm': 2.1707613468170166, 'learning_rate': 6.08e-05, 'epoch': 5.26} +{'loss': 0.5627, 'grad_norm': 2.4291555881500244, 'learning_rate': 6.18e-05, 'epoch': 5.34} +{'loss': 0.5381, 'grad_norm': 2.249617338180542, 'learning_rate': 6.280000000000001e-05, 'epoch': 5.43} +{'loss': 0.6338, 'grad_norm': 1.6661946773529053, 'learning_rate': 6.38e-05, 'epoch': 5.52} +{'loss': 0.5181, 'grad_norm': 2.60294771194458, 'learning_rate': 6.48e-05, 'epoch': 5.6} +{'loss': 0.5189, 'grad_norm': 3.3003089427948, 'learning_rate': 6.58e-05, 'epoch': 5.69} +{'loss': 0.564, 'grad_norm': 1.880764126777649, 'learning_rate': 6.680000000000001e-05, 'epoch': 5.78} +{'loss': 0.4729, 'grad_norm': 2.0575127601623535, 'learning_rate': 6.780000000000001e-05, 'epoch': 5.86} +{'loss': 0.4899, 'grad_norm': 2.5159761905670166, 'learning_rate': 6.879999999999999e-05, 'epoch': 5.95} +{'loss': 0.481, 'grad_norm': 1.4463504552841187, 'learning_rate': 6.98e-05, 'epoch': 6.03} + 30%|████████████████████████████▉ | 700/2320 [15:14<36:18, 1.34s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 0.5275412201881409, 'eval_wer': 0.5760721114449604, 'eval_runtime': 39.9601, 'eval_samples_per_second': 33.634, 'eval_steps_per_second': 33.634, 'epoch': 6.03} +{'loss': 0.3865, 'grad_norm': 1.788765549659729, 'learning_rate': 7.08e-05, 'epoch': 6.12} +{'loss': 0.3726, 'grad_norm': 1.862762212753296, 'learning_rate': 7.18e-05, 'epoch': 6.21} +{'loss': 0.4116, 'grad_norm': 1.6512093544006348, 'learning_rate': 7.280000000000001e-05, 'epoch': 6.29} +{'loss': 0.3779, 'grad_norm': 2.098067045211792, 'learning_rate': 7.38e-05, 'epoch': 6.38} +{'loss': 0.3728, 'grad_norm': 3.3030078411102295, 'learning_rate': 7.48e-05, 'epoch': 6.47} +{'loss': 0.4047, 'grad_norm': 2.1799120903015137, 'learning_rate': 7.58e-05, 'epoch': 6.55} +{'loss': 0.313, 'grad_norm': 1.862434983253479, 'learning_rate': 7.680000000000001e-05, 'epoch': 6.64} +{'loss': 0.4052, 'grad_norm': 6.29113245010376, 'learning_rate': 7.780000000000001e-05, 'epoch': 6.72} +{'loss': 0.3218, 'grad_norm': 1.4220325946807861, 'learning_rate': 7.88e-05, 'epoch': 6.81} +{'loss': 0.3072, 'grad_norm': 2.586819648742676, 'learning_rate': 7.98e-05, 'epoch': 6.9} + 34%|█████████████████████████████████ | 800/2320 [17:30<20:39, 1.23it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 0.4836220443248749, 'eval_wer': 0.5264499681325685, 'eval_runtime': 39.8762, 'eval_samples_per_second': 33.704, 'eval_steps_per_second': 33.704, 'epoch': 6.9} + 34%|█████████████████████████████████ | 800/2320 [18:10<20:39, 1.23it/sSaving model checkpoint to ./wav2vec2-base-timit-fine-tuned/checkpoint-800 +Configuration saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-800/config.json +Model weights saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-800/model.safetensors +Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-800/preprocessor_config.json +tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-800/tokenizer_config.json +Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-800/special_tokens_map.json +added tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-800/added_tokens.json +Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json +tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json +Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json +added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json +{'loss': 0.3862, 'grad_norm': 1.6589460372924805, 'learning_rate': 8.080000000000001e-05, 'epoch': 6.98} +{'loss': 0.2938, 'grad_norm': 1.7299175262451172, 'learning_rate': 8.18e-05, 'epoch': 7.07} +{'loss': 0.249, 'grad_norm': 2.0545098781585693, 'learning_rate': 8.28e-05, 'epoch': 7.16} + 36%|██████████████████████████████████▋ | 837/2320 [18:46<17:32, 1.41it/s]/opt/conda/lib/python3.12/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at 
/home/conda/feedstock_root/build_artifacts/libtorch_1715567101190/work/aten/src/ATen/native/cudnn/Conv_v8.cpp:919.) + return F.conv1d(input, weight, bias, self.stride, +{'loss': 0.3202, 'grad_norm': 24.935670852661133, 'learning_rate': 8.38e-05, 'epoch': 7.24} +{'loss': 0.2803, 'grad_norm': 2.497840642929077, 'learning_rate': 8.48e-05, 'epoch': 7.33} +{'loss': 0.2473, 'grad_norm': 2.698636531829834, 'learning_rate': 8.58e-05, 'epoch': 7.41} +{'loss': 0.3223, 'grad_norm': 1.4561227560043335, 'learning_rate': 8.680000000000001e-05, 'epoch': 7.5} +{'loss': 0.2481, 'grad_norm': 1.7760556936264038, 'learning_rate': 8.78e-05, 'epoch': 7.59} +{'loss': 0.2545, 'grad_norm': 2.308103084564209, 'learning_rate': 8.88e-05, 'epoch': 7.67} +{'loss': 0.332, 'grad_norm': 1.4128385782241821, 'learning_rate': 8.98e-05, 'epoch': 7.76} + 39%|█████████████████████████████████████▏ | 900/2320 [19:48<29:47, 1.26s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 0.44030094146728516, 'eval_wer': 0.5233542747883092, 'eval_runtime': 39.9401, 'eval_samples_per_second': 33.65, 'eval_steps_per_second': 33.65, 'epoch': 7.76} +{'loss': 0.2411, 'grad_norm': 1.7903906106948853, 'learning_rate': 9.080000000000001e-05, 'epoch': 7.84} +{'loss': 0.2707, 'grad_norm': 2.0804216861724854, 'learning_rate': 9.180000000000001e-05, 'epoch': 7.93} +{'loss': 0.3186, 'grad_norm': 1.4420605897903442, 'learning_rate': 9.28e-05, 'epoch': 8.02} +{'loss': 0.1937, 'grad_norm': 2.2910854816436768, 'learning_rate': 9.38e-05, 'epoch': 8.1} +{'loss': 0.2321, 'grad_norm': 3.5892796516418457, 'learning_rate': 9.48e-05, 'epoch': 8.19} +{'loss': 0.2868, 'grad_norm': 1.6509956121444702, 'learning_rate': 9.58e-05, 'epoch': 8.28} +{'loss': 0.2004, 'grad_norm': 1.6983604431152344, 'learning_rate': 9.680000000000001e-05, 'epoch': 8.36} +{'loss': 0.2025, 'grad_norm': 2.061176061630249, 'learning_rate': 9.78e-05, 'epoch': 8.45} +{'loss': 0.2598, 'grad_norm': 1.7732270956039429, 'learning_rate': 9.88e-05, 'epoch': 8.53} +{'loss': 0.1876, 'grad_norm': 1.8335466384887695, 'learning_rate': 9.98e-05, 'epoch': 8.62} + 43%|████████████████████████████████████████▉ | 1000/2320 [22:05<20:18, 1.08it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 0.4757933020591736, 'eval_wer': 0.5221706273331512, 'eval_runtime': 39.8291, 'eval_samples_per_second': 33.744, 'eval_steps_per_second': 33.744, 'epoch': 8.62} +{'loss': 0.2456, 'grad_norm': 2.52902889251709, 'learning_rate': 9.939393939393939e-05, 'epoch': 8.71} +{'loss': 0.2499, 'grad_norm': 1.7294162511825562, 'learning_rate': 9.863636363636364e-05, 'epoch': 8.79} +{'loss': 0.1854, 'grad_norm': 21.9121150970459, 'learning_rate': 9.787878787878789e-05, 'epoch': 8.88} +{'loss': 0.2576, 'grad_norm': 3.9164559841156006, 'learning_rate': 9.712121212121212e-05, 'epoch': 8.97} +{'loss': 0.2118, 'grad_norm': 1.239221215248108, 'learning_rate': 9.636363636363637e-05, 'epoch': 9.05} +{'loss': 0.1577, 'grad_norm': 3.1416544914245605, 'learning_rate': 9.560606060606061e-05, 'epoch': 9.14} +{'loss': 0.2092, 'grad_norm': 2.4253621101379395, 'learning_rate': 9.484848484848486e-05, 'epoch': 9.22} +{'loss': 0.1876, 'grad_norm': 1.194345474243164, 'learning_rate': 9.40909090909091e-05, 'epoch': 9.31} +{'loss': 0.1546, 'grad_norm': 2.411029100418091, 'learning_rate': 9.333333333333334e-05, 'epoch': 9.4} +{'loss': 0.2232, 'grad_norm': 3.246082067489624, 'learning_rate': 9.257575757575758e-05, 'epoch': 9.48} + 47%|█████████████████████████████████████████████ | 1100/2320 [24:18<14:01, 1.45it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 0.45077577233314514, 'eval_wer': 0.48921059819721385, 'eval_runtime': 39.9221, 'eval_samples_per_second': 33.666, 'eval_steps_per_second': 33.666, 'epoch': 9.48} +{'loss': 0.1777, 'grad_norm': 1.3427454233169556, 'learning_rate': 9.181818181818183e-05, 'epoch': 9.57} +{'loss': 0.1646, 'grad_norm': 1.5090447664260864, 'learning_rate': 9.106060606060606e-05, 'epoch': 9.66} +{'loss': 0.225, 'grad_norm': 1.3060975074768066, 'learning_rate': 9.030303030303031e-05, 'epoch': 9.74} +{'loss': 0.1552, 'grad_norm': 1.3011540174484253, 'learning_rate': 8.954545454545455e-05, 'epoch': 9.83} +{'loss': 0.1715, 'grad_norm': 1.9938538074493408, 'learning_rate': 8.87878787878788e-05, 'epoch': 9.91} +{'loss': 0.2092, 'grad_norm': 3.334385395050049, 'learning_rate': 8.803030303030304e-05, 'epoch': 10.0} +{'loss': 0.14, 'grad_norm': 1.011092185974121, 'learning_rate': 8.727272727272727e-05, 'epoch': 10.09} +{'loss': 0.1512, 'grad_norm': 2.517902135848999, 'learning_rate': 8.651515151515152e-05, 'epoch': 10.17} +{'loss': 0.1846, 'grad_norm': 1.2418378591537476, 'learning_rate': 8.575757575757576e-05, 'epoch': 10.26} +{'loss': 0.1332, 'grad_norm': 1.5885329246520996, 'learning_rate': 8.5e-05, 'epoch': 10.34} + 52%|█████████████████████████████████████████████████▏ | 1200/2320 [26:37<18:40, 1.00s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 0.4394075274467468, 'eval_wer': 0.4740052808886461, 'eval_runtime': 39.9367, 'eval_samples_per_second': 33.653, 'eval_steps_per_second': 33.653, 'epoch': 10.34} + 52%|█████████████████████████████████████████████████▏ | 1200/2320 [27:17<18:40, 1.00s/itSaving model checkpoint to ./wav2vec2-base-timit-fine-tuned/checkpoint-1200 +Configuration saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1200/config.json +Model weights saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1200/model.safetensors +Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1200/preprocessor_config.json +tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1200/tokenizer_config.json +Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1200/special_tokens_map.json +added tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1200/added_tokens.json +Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json +tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json +Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json +added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json +{'loss': 0.1485, 'grad_norm': 1.2539469003677368, 'learning_rate': 8.424242424242424e-05, 'epoch': 10.43} +{'loss': 0.1988, 'grad_norm': 1.357601284980774, 'learning_rate': 8.348484848484849e-05, 'epoch': 10.52} + 53%|██████████████████████████████████████████████████▏ | 1227/2320 [27:45<19:01, 1.04s/it]/opt/conda/lib/python3.12/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at 
/home/conda/feedstock_root/build_artifacts/libtorch_1715567101190/work/aten/src/ATen/native/cudnn/Conv_v8.cpp:919.) + return F.conv1d(input, weight, bias, self.stride, +{'loss': 0.137, 'grad_norm': 2.0564587116241455, 'learning_rate': 8.272727272727273e-05, 'epoch': 10.6} +{'loss': 0.1245, 'grad_norm': 2.48364520072937, 'learning_rate': 8.196969696969698e-05, 'epoch': 10.69} +{'loss': 0.1602, 'grad_norm': 1.015891671180725, 'learning_rate': 8.121212121212121e-05, 'epoch': 10.78} +{'loss': 0.1215, 'grad_norm': 1.1023950576782227, 'learning_rate': 8.045454545454546e-05, 'epoch': 10.86} +{'loss': 0.1621, 'grad_norm': 2.703427791595459, 'learning_rate': 7.96969696969697e-05, 'epoch': 10.95} +{'loss': 0.1651, 'grad_norm': 1.1821691989898682, 'learning_rate': 7.893939393939395e-05, 'epoch': 11.03} +{'loss': 0.1066, 'grad_norm': 0.930283784866333, 'learning_rate': 7.818181818181818e-05, 'epoch': 11.12} +{'loss': 0.1085, 'grad_norm': 1.6548758745193481, 'learning_rate': 7.742424242424243e-05, 'epoch': 11.21} + 56%|█████████████████████████████████████████████████████▏ | 1300/2320 [28:53<12:42, 1.34it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 0.4466467499732971, 'eval_wer': 0.46207775653282346, 'eval_runtime': 39.8633, 'eval_samples_per_second': 33.715, 'eval_steps_per_second': 33.715, 'epoch': 11.21} +{'loss': 0.1418, 'grad_norm': 1.1760716438293457, 'learning_rate': 7.666666666666667e-05, 'epoch': 11.29} +{'loss': 0.1133, 'grad_norm': 2.1062755584716797, 'learning_rate': 7.59090909090909e-05, 'epoch': 11.38} +{'loss': 0.1318, 'grad_norm': 2.67399001121521, 'learning_rate': 7.515151515151515e-05, 'epoch': 11.47} +{'loss': 0.1474, 'grad_norm': 1.0049142837524414, 'learning_rate': 7.439393939393939e-05, 'epoch': 11.55} +{'loss': 0.0908, 'grad_norm': 1.586559772491455, 'learning_rate': 7.363636363636364e-05, 'epoch': 11.64} +{'loss': 0.1521, 'grad_norm': 3.784040927886963, 'learning_rate': 7.287878787878788e-05, 'epoch': 11.72} +{'loss': 0.1163, 'grad_norm': 1.125501275062561, 'learning_rate': 7.212121212121213e-05, 'epoch': 11.81} +{'loss': 0.1109, 'grad_norm': 2.1989808082580566, 'learning_rate': 7.136363636363636e-05, 'epoch': 11.9} +{'loss': 0.152, 'grad_norm': 1.1287301778793335, 'learning_rate': 7.060606060606061e-05, 'epoch': 11.98} +{'loss': 0.098, 'grad_norm': 1.538678765296936, 'learning_rate': 6.984848484848485e-05, 'epoch': 12.07} + 60%|█████████████████████████████████████████████████████████▎ | 1400/2320 [31:12<18:06, 1.18s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 0.42302384972572327, 'eval_wer': 0.44933078393881454, 'eval_runtime': 40.1773, 'eval_samples_per_second': 33.452, 'eval_steps_per_second': 33.452, 'epoch': 12.07} +{'loss': 0.092, 'grad_norm': 1.400772213935852, 'learning_rate': 6.90909090909091e-05, 'epoch': 12.16} +{'loss': 0.1649, 'grad_norm': 3.6780846118927, 'learning_rate': 6.833333333333333e-05, 'epoch': 12.24} +{'loss': 0.091, 'grad_norm': 1.5424057245254517, 'learning_rate': 6.757575757575758e-05, 'epoch': 12.33} +{'loss': 0.0869, 'grad_norm': 1.4868180751800537, 'learning_rate': 6.681818181818183e-05, 'epoch': 12.41} +{'loss': 0.1499, 'grad_norm': 1.1947145462036133, 'learning_rate': 6.606060606060607e-05, 'epoch': 12.5} +{'loss': 0.0954, 'grad_norm': 1.0430784225463867, 'learning_rate': 6.530303030303032e-05, 'epoch': 12.59} +{'loss': 0.1032, 'grad_norm': 2.4261584281921387, 'learning_rate': 6.454545454545455e-05, 'epoch': 12.67} +{'loss': 0.1158, 'grad_norm': 1.033467411994934, 'learning_rate': 6.37878787878788e-05, 'epoch': 12.76} +{'loss': 0.0864, 'grad_norm': 1.1535651683807373, 'learning_rate': 6.303030303030302e-05, 'epoch': 12.84} +{'loss': 0.1219, 'grad_norm': 1.28826105594635, 'learning_rate': 6.227272727272727e-05, 'epoch': 12.93} + 65%|█████████████████████████████████████████████████████████████▍ | 1500/2320 [33:26<10:01, 1.36it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 0.418023020029068, 'eval_wer': 0.44596194118182647, 'eval_runtime': 40.2192, 'eval_samples_per_second': 33.417, 'eval_steps_per_second': 33.417, 'epoch': 12.93} +{'loss': 0.1289, 'grad_norm': 1.055411458015442, 'learning_rate': 6.151515151515151e-05, 'epoch': 13.02} +{'loss': 0.0776, 'grad_norm': 1.1269094944000244, 'learning_rate': 6.075757575757576e-05, 'epoch': 13.1} +{'loss': 0.0871, 'grad_norm': 1.7149118185043335, 'learning_rate': 6e-05, 'epoch': 13.19} +{'loss': 0.1087, 'grad_norm': 1.7456856966018677, 'learning_rate': 5.9242424242424244e-05, 'epoch': 13.28} +{'loss': 0.0821, 'grad_norm': 1.3434715270996094, 'learning_rate': 5.848484848484849e-05, 'epoch': 13.36} +{'loss': 0.0878, 'grad_norm': 2.103512763977051, 'learning_rate': 5.772727272727273e-05, 'epoch': 13.45} +{'loss': 0.1044, 'grad_norm': 1.240224838256836, 'learning_rate': 5.696969696969697e-05, 'epoch': 13.53} +{'loss': 0.0753, 'grad_norm': 0.7336703538894653, 'learning_rate': 5.6212121212121215e-05, 'epoch': 13.62} +{'loss': 0.1059, 'grad_norm': 2.293342351913452, 'learning_rate': 5.545454545454546e-05, 'epoch': 13.71} +{'loss': 0.1021, 'grad_norm': 1.1853971481323242, 'learning_rate': 5.46969696969697e-05, 'epoch': 13.79} + 69%|█████████████████████████████████████████████████████████████████▌ | 1600/2320 [35:45<13:55, 1.16s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 0.41785839200019836, 'eval_wer': 0.4405900027314941, 'eval_runtime': 40.2906, 'eval_samples_per_second': 33.358, 'eval_steps_per_second': 33.358, 'epoch': 13.79} + 69%|█████████████████████████████████████████████████████████████████▌ | 1600/2320 [36:25<13:55, 1.16s/itSaving model checkpoint to ./wav2vec2-base-timit-fine-tuned/checkpoint-1600 +Configuration saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1600/config.json +Model weights saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1600/model.safetensors +Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1600/preprocessor_config.json +tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1600/tokenizer_config.json +Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1600/special_tokens_map.json +added tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-1600/added_tokens.json +Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json +tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json +Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json +added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json +Deleting older checkpoint [wav2vec2-base-timit-fine-tuned/checkpoint-400] due to args.save_total_limit +{'loss': 0.0648, 'grad_norm': 1.331200361251831, 'learning_rate': 5.393939393939394e-05, 'epoch': 13.88} +{'loss': 0.1121, 'grad_norm': 2.28397536277771, 'learning_rate': 5.3181818181818186e-05, 'epoch': 13.97} +{'loss': 0.0725, 'grad_norm': 0.9436893463134766, 'learning_rate': 5.242424242424243e-05, 'epoch': 14.05} +{'loss': 0.0691, 'grad_norm': 1.6113288402557373, 'learning_rate': 5.166666666666667e-05, 'epoch': 14.14} +{'loss': 0.0979, 'grad_norm': 2.479888439178467, 'learning_rate': 5.090909090909091e-05, 'epoch': 14.22} +{'loss': 
0.0909, 'grad_norm': 1.006616473197937, 'learning_rate': 5.015151515151515e-05, 'epoch': 14.31} + 72%|████████████████████████████████████████████████████████████████████ | 1663/2320 [37:27<11:20, 1.04s/it]/opt/conda/lib/python3.12/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at /home/conda/feedstock_root/build_artifacts/libtorch_1715567101190/work/aten/src/ATen/native/cudnn/Conv_v8.cpp:919.) + return F.conv1d(input, weight, bias, self.stride, +{'loss': 0.0761, 'grad_norm': 1.4571704864501953, 'learning_rate': 4.93939393939394e-05, 'epoch': 14.4} +{'loss': 0.0862, 'grad_norm': 1.5729875564575195, 'learning_rate': 4.863636363636364e-05, 'epoch': 14.48} +{'loss': 0.0646, 'grad_norm': 1.2180376052856445, 'learning_rate': 4.787878787878788e-05, 'epoch': 14.57} +{'loss': 0.0741, 'grad_norm': 1.7464072704315186, 'learning_rate': 4.712121212121212e-05, 'epoch': 14.66} + 73%|█████████████████████████████████████████████████████████████████████▌ | 1700/2320 [38:02<08:27, 1.22it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 0.4113341271877289, 'eval_wer': 0.4309387234817445, 'eval_runtime': 40.2841, 'eval_samples_per_second': 33.363, 'eval_steps_per_second': 33.363, 'epoch': 14.66} +{'loss': 0.1315, 'grad_norm': 0.8571386337280273, 'learning_rate': 4.6439393939393944e-05, 'epoch': 14.74} +{'loss': 0.0603, 'grad_norm': 1.331377387046814, 'learning_rate': 4.5681818181818186e-05, 'epoch': 14.83} +{'loss': 0.0796, 'grad_norm': 1.5398732423782349, 'learning_rate': 4.492424242424242e-05, 'epoch': 14.91} +{'loss': 0.085, 'grad_norm': 3.689671754837036, 'learning_rate': 4.4166666666666665e-05, 'epoch': 15.0} +{'loss': 0.0544, 'grad_norm': 1.132613182067871, 'learning_rate': 4.340909090909091e-05, 'epoch': 15.09} +{'loss': 0.0601, 'grad_norm': 1.5951859951019287, 'learning_rate': 4.265151515151515e-05, 'epoch': 15.17} +{'loss': 0.097, 'grad_norm': 0.5179944634437561, 'learning_rate': 4.189393939393939e-05, 'epoch': 15.26} +{'loss': 0.0596, 'grad_norm': 0.9744370579719543, 'learning_rate': 4.113636363636364e-05, 'epoch': 15.34} +{'loss': 0.0677, 'grad_norm': 1.8794275522232056, 'learning_rate': 4.0378787878787885e-05, 'epoch': 15.43} +{'loss': 0.0896, 'grad_norm': 0.748386025428772, 'learning_rate': 3.962121212121213e-05, 'epoch': 15.52} + 78%|█████████████████████████████████████████████████████████████████████████▋ | 1800/2320 [40:18<11:05, 1.28s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 0.43920788168907166, 'eval_wer': 0.4307566238732587, 'eval_runtime': 40.1997, 'eval_samples_per_second': 33.433, 'eval_steps_per_second': 33.433, 'epoch': 15.52} +{'loss': 0.0604, 'grad_norm': 0.9639837145805359, 'learning_rate': 3.8863636363636364e-05, 'epoch': 15.6} +{'loss': 0.0711, 'grad_norm': 1.9640839099884033, 'learning_rate': 3.810606060606061e-05, 'epoch': 15.69} +{'loss': 0.0867, 'grad_norm': 1.4438735246658325, 'learning_rate': 3.734848484848485e-05, 'epoch': 15.78} +{'loss': 0.0605, 'grad_norm': 1.0062426328659058, 'learning_rate': 3.659090909090909e-05, 'epoch': 15.86} +{'loss': 0.0662, 'grad_norm': 1.6331523656845093, 'learning_rate': 3.5833333333333335e-05, 'epoch': 15.95} +{'loss': 0.0765, 'grad_norm': 0.8070217370986938, 'learning_rate': 3.507575757575758e-05, 'epoch': 16.03} +{'loss': 0.0537, 'grad_norm': 1.4137670993804932, 'learning_rate': 3.431818181818182e-05, 'epoch': 16.12} +{'loss': 0.0684, 'grad_norm': 1.5437769889831543, 'learning_rate': 3.356060606060606e-05, 'epoch': 16.21} +{'loss': 0.0744, 'grad_norm': 0.90281081199646, 'learning_rate': 3.2803030303030305e-05, 'epoch': 16.29} +{'loss': 0.0492, 'grad_norm': 1.139837622642517, 'learning_rate': 3.204545454545455e-05, 'epoch': 16.38} + 82%|█████████████████████████████████████████████████████████████████████████████▊ | 1900/2320 [42:36<06:26, 1.09it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 0.4201890528202057, 'eval_wer': 0.4313029226987162, 'eval_runtime': 40.1502, 'eval_samples_per_second': 33.474, 'eval_steps_per_second': 33.474, 'epoch': 16.38} +{'loss': 0.0652, 'grad_norm': 1.679457426071167, 'learning_rate': 3.128787878787879e-05, 'epoch': 16.47} +{'loss': 0.0649, 'grad_norm': 0.6661111116409302, 'learning_rate': 3.0530303030303034e-05, 'epoch': 16.55} +{'loss': 0.0469, 'grad_norm': 1.1774355173110962, 'learning_rate': 2.9772727272727273e-05, 'epoch': 16.64} +{'loss': 0.0752, 'grad_norm': 1.783923864364624, 'learning_rate': 2.901515151515152e-05, 'epoch': 16.72} +{'loss': 0.0519, 'grad_norm': 1.176321268081665, 'learning_rate': 2.825757575757576e-05, 'epoch': 16.81} +{'loss': 0.0547, 'grad_norm': 1.3150608539581299, 'learning_rate': 2.7500000000000004e-05, 'epoch': 16.9} +{'loss': 0.0799, 'grad_norm': 0.983769953250885, 'learning_rate': 2.674242424242424e-05, 'epoch': 16.98} +{'loss': 0.0577, 'grad_norm': 0.996890127658844, 'learning_rate': 2.5984848484848483e-05, 'epoch': 17.07} +{'loss': 0.0515, 'grad_norm': 2.3034253120422363, 'learning_rate': 2.5227272727272726e-05, 'epoch': 17.16} +{'loss': 0.0759, 'grad_norm': 3.7528610229492188, 'learning_rate': 2.4469696969696972e-05, 'epoch': 17.24} + 86%|█████████████████████████████████████████████████████████████████████████████████▉ | 2000/2320 [44:50<03:48, 1.40it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 0.43480169773101807, 'eval_wer': 0.4207411454065374, 'eval_runtime': 40.017, 'eval_samples_per_second': 33.586, 'eval_steps_per_second': 33.586, 'epoch': 17.24} + 86%|█████████████████████████████████████████████████████████████████████████████████▉ | 2000/2320 [45:30<03:48, 1.40it/sSaving model checkpoint to ./wav2vec2-base-timit-fine-tuned/checkpoint-2000 +Configuration saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-2000/config.json +Model weights saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-2000/model.safetensors +Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-2000/preprocessor_config.json +tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-2000/tokenizer_config.json +Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-2000/special_tokens_map.json +added tokens file saved in ./wav2vec2-base-timit-fine-tuned/checkpoint-2000/added_tokens.json +Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json +tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json +Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json +added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json +Deleting older checkpoint [wav2vec2-base-timit-fine-tuned/checkpoint-800] due to args.save_total_limit +{'loss': 0.0419, 'grad_norm': 0.6646668314933777, 'learning_rate': 2.3712121212121214e-05, 'epoch': 17.33} +{'loss': 0.0595, 'grad_norm': 1.3250740766525269, 'learning_rate': 2.2954545454545457e-05, 'epoch': 17.41} +{'loss': 0.0691, 'grad_norm': 0.8094995021820068, 'learning_rate': 2.21969696969697e-05, 'epoch': 17.5} +{'loss': 0.052, 'grad_norm': 0.846946120262146, 'learning_rate': 2.143939393939394e-05, 'epoch': 17.59} +{'loss': 0.0565, 'grad_norm': 1.652417540550232, 'learning_rate': 2.0681818181818182e-05, 'epoch': 
17.67} +{'loss': 0.0745, 'grad_norm': 1.0080279111862183, 'learning_rate': 1.9924242424242425e-05, 'epoch': 17.76} + 89%|████████████████████████████████████████████████████████████████████████████████████▌ | 2064/2320 [46:36<04:53, 1.15s/it]/opt/conda/lib/python3.12/site-packages/torch/nn/modules/conv.py:306: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at /home/conda/feedstock_root/build_artifacts/libtorch_1715567101190/work/aten/src/ATen/native/cudnn/Conv_v8.cpp:919.) + return F.conv1d(input, weight, bias, self.stride, +{'loss': 0.0513, 'grad_norm': 0.7252691388130188, 'learning_rate': 1.9166666666666667e-05, 'epoch': 17.84} +{'loss': 0.055, 'grad_norm': 1.58548903465271, 'learning_rate': 1.840909090909091e-05, 'epoch': 17.93} +{'loss': 0.0658, 'grad_norm': 0.6634634733200073, 'learning_rate': 1.7651515151515153e-05, 'epoch': 18.02} +{'loss': 0.0406, 'grad_norm': 1.1495524644851685, 'learning_rate': 1.6893939393939395e-05, 'epoch': 18.1} + 91%|█████████████████████████████████████████████████████████████████████████████████████▉ | 2100/2320 [47:11<03:46, 1.03s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 0.44191813468933105, 'eval_wer': 0.42046799599380863, 'eval_runtime': 40.0967, 'eval_samples_per_second': 33.519, 'eval_steps_per_second': 33.519, 'epoch': 18.1} +{'loss': 0.0381, 'grad_norm': 0.9788354635238647, 'learning_rate': 1.6136363636363638e-05, 'epoch': 18.19} +{'loss': 0.071, 'grad_norm': 1.093633770942688, 'learning_rate': 1.5378787878787877e-05, 'epoch': 18.28} +{'loss': 0.0439, 'grad_norm': 0.7164376974105835, 'learning_rate': 1.4621212121212122e-05, 'epoch': 18.36} +{'loss': 0.0481, 'grad_norm': 0.9887032508850098, 'learning_rate': 1.3863636363636364e-05, 'epoch': 18.45} +{'loss': 0.0571, 'grad_norm': 0.45052286982536316, 'learning_rate': 1.3106060606060607e-05, 'epoch': 18.53} +{'loss': 0.0452, 'grad_norm': 1.167181134223938, 'learning_rate': 1.234848484848485e-05, 'epoch': 18.62} +{'loss': 0.0643, 'grad_norm': 1.378661870956421, 'learning_rate': 1.159090909090909e-05, 'epoch': 18.71} +{'loss': 0.0587, 'grad_norm': 0.854932963848114, 'learning_rate': 1.0833333333333334e-05, 'epoch': 18.79} +{'loss': 0.0395, 'grad_norm': 0.8007526397705078, 'learning_rate': 1.0075757575757576e-05, 'epoch': 18.88} +{'loss': 0.074, 'grad_norm': 3.317830801010132, 'learning_rate': 9.318181818181819e-06, 'epoch': 18.97} + 95%|██████████████████████████████████████████████████████████████████████████████████████████ | 2200/2320 [49:24<01:19, 1.51it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 0.43061742186546326, 'eval_wer': 0.420012746972594, 'eval_runtime': 40.0034, 'eval_samples_per_second': 33.597, 'eval_steps_per_second': 33.597, 'epoch': 18.97} +{'loss': 0.046, 'grad_norm': 0.7710875272750854, 'learning_rate': 8.56060606060606e-06, 'epoch': 19.05} +{'loss': 0.0394, 'grad_norm': 0.5200530886650085, 'learning_rate': 7.803030303030304e-06, 'epoch': 19.14} +{'loss': 0.0582, 'grad_norm': 1.3544327020645142, 'learning_rate': 7.045454545454545e-06, 'epoch': 19.22} +{'loss': 0.0606, 'grad_norm': 0.8653574585914612, 'learning_rate': 6.287878787878789e-06, 'epoch': 19.31} +{'loss': 0.0367, 'grad_norm': 1.5852700471878052, 'learning_rate': 5.530303030303031e-06, 'epoch': 19.4} +{'loss': 0.0782, 'grad_norm': 2.2167246341705322, 'learning_rate': 4.772727272727273e-06, 'epoch': 19.48} +{'loss': 0.0416, 'grad_norm': 0.5891330242156982, 'learning_rate': 4.015151515151515e-06, 'epoch': 19.57} +{'loss': 0.0515, 'grad_norm': 1.1137330532073975, 'learning_rate': 3.257575757575758e-06, 'epoch': 19.66} +{'loss': 0.0512, 'grad_norm': 0.8132285475730896, 'learning_rate': 2.5e-06, 'epoch': 19.74} +{'loss': 0.0378, 'grad_norm': 0.7994781136512756, 'learning_rate': 1.7424242424242427e-06, 'epoch': 19.83} + 99%|██████████████████████████████████████████████████████████████████████████████████████████████▏| 2300/2320 [51:43<00:20, 1.02s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
+***** Running Evaluation ***** + Num examples = 1344 + Batch size = 1 +{'eval_loss': 0.4273350238800049, 'eval_wer': 0.41728125284530637, 'eval_runtime': 40.0934, 'eval_samples_per_second': 33.522, 'eval_steps_per_second': 33.522, 'epoch': 19.83} +{'loss': 0.0489, 'grad_norm': 0.9775754809379578, 'learning_rate': 9.848484848484847e-07, 'epoch': 19.91} +{'loss': 0.0554, 'grad_norm': 0.8857516050338745, 'learning_rate': 2.2727272727272726e-07, 'epoch': 20.0} +100%|███████████████████████████████████████████████████████████████████████████████████████████████| 2320/2320 [52:39<00:00, 1.41it/s] + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +{'train_runtime': 3159.4128, 'train_samples_per_second': 23.397, 'train_steps_per_second': 0.734, 'train_loss': 0.8618391515622879, 'epoch': 20.0} +100%|███████████████████████████████████████████████████████████████████████████████████████████████| 2320/2320 [52:39<00:00, 1.36s/it] +Saving model checkpoint to ./wav2vec2-base-timit-fine-tuned +Configuration saved in ./wav2vec2-base-timit-fine-tuned/config.json +Model weights saved in ./wav2vec2-base-timit-fine-tuned/model.safetensors +Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json +tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json +Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json +added tokens file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json +Saving model checkpoint to ./wav2vec2-base-timit-fine-tuned +Configuration saved in ./wav2vec2-base-timit-fine-tuned/config.json +Model weights saved in ./wav2vec2-base-timit-fine-tuned/model.safetensors +Feature extractor saved in ./wav2vec2-base-timit-fine-tuned/preprocessor_config.json +tokenizer config file saved in ./wav2vec2-base-timit-fine-tuned/tokenizer_config.json +Special tokens file saved in ./wav2vec2-base-timit-fine-tuned/special_tokens_map.json +added tokens 
file saved in ./wav2vec2-base-timit-fine-tuned/added_tokens.json +events.out.tfevents.1716174523.tz579-raptorlake.65634.0: 100%|██████████████████████████████████████| 63.2k/63.2k [00:00<00:00, 232kB/s] +model.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████| 378M/378M [03:30<00:00, 1.79MB/s] +Upload 2 LFS files: 100%|████████████████████████████████████████████████████████████████████████████████| 2/2 [03:31<00:00, 105.69s/it] +***** train metrics *****████████████████████████████████████████ | 1/2 [03:31<03:31, 211.39s/it] + epoch = 20.0 + total_flos = 2000175347GF + train_loss = 0.8618 + train_runtime = 0:52:39.41 + train_samples = 3696 + train_samples_per_second = 23.397 + train_steps_per_second = 0.734 +05/19/2024 23:04:57 - INFO - __main__ - *** Evaluate *** +The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`, you can safely ignore this message. 
#!/usr/bin/env bash
# Fine-tune facebook/wav2vec2-base on the TIMIT ASR dataset with
# run_speech_recognition_ctc.py (expected in the current directory).
#
# Inputs read from the environment/filesystem:
#   /home/huggingface.token  - file holding the Hugging Face access token
#   HF_HOME                  - Hugging Face cache directory (set below)
#
# Schedule configured below: 20 epochs, evaluation every 100 steps, a checkpoint
# every 400 steps (at most 3 kept), and the result is pushed to the Hub.

# Stop on the first failing command or on use of an unset variable.
set -eu

# Assign first, export second: `export VAR=$(cmd)` would hide a failing `cat`
# behind the (always successful) exit status of `export`.
HF_TOKEN="$(cat /home/huggingface.token)"
export HF_TOKEN
export HF_HOME="/home/Work/common_huggingface"

# Every character after --chars_to_ignore is single-quoted so the shell passes
# it through literally; an unquoted `?` is a glob pattern and would be replaced
# by any single-character file name found in the working directory.
# NOTE(review): the last entry appears to be the Unicode replacement character;
# confirm it is intentional and not an encoding artefact.
python run_speech_recognition_ctc.py \
    --token="${HF_TOKEN}" \
    --dataset_name="timit_asr" \
    --dataset_path="/home/Work_/common_darpa/Timit_data/data" \
    --model_name_or_path="facebook/wav2vec2-base" \
    --overwrite_output_dir \
    --output_dir="./wav2vec2-base-timit-fine-tuned" \
    --train_split_name="train" \
    --num_train_epochs="20" \
    --per_device_train_batch_size="32" \
    --per_device_eval_batch_size="1" \
    --weight_decay="0.005" \
    --learning_rate="1e-4" \
    --warmup_steps="1000" \
    --evaluation_strategy="steps" \
    --text_column_name="text" \
    --save_steps="400" \
    --eval_steps="100" \
    --logging_steps="10" \
    --layerdrop="0.0" \
    --save_total_limit="3" \
    --freeze_feature_encoder \
    --chars_to_ignore ',' '?' '.' '!' '-' ';' ':' '"' '“' '%' '‘' '”' '�' \
    --fp16 \
    --group_by_length \
    --push_to_hub \
    --do_train --do_eval
def list_field(default=None, metadata=None):
    """
    Build a dataclass field for a list-valued argument.

    Args:
        default: The default list, or `None` for "no default list".
        metadata: Metadata mapping attached to the field (e.g. the `help` text).

    Returns:
        A `dataclasses.field` whose default factory yields `None` when `default` is `None`,
        otherwise a new list with the same items as `default`.
    """

    def _fresh_default():
        # Hand out a new list on every call so instances never share (and cannot
        # accidentally mutate) one common default object.
        return None if default is None else list(default)

    return field(default_factory=_fresh_default, metadata=metadata)


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.

    Only `model_name_or_path` is required; every other field has a default.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    tokenizer_name_or_path: Optional[str] = field(
        default=None,
        metadata={"help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"},
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    freeze_feature_encoder: bool = field(
        default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
    )
    attention_dropout: float = field(
        default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
    )
    activation_dropout: float = field(
        default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
    )
    feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
    hidden_dropout: float = field(
        default=0.0,
        metadata={
            "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
        },
    )
    final_dropout: float = field(
        default=0.0,
        metadata={"help": "The dropout probability for the final projection layer."},
    )
    mask_time_prob: float = field(
        default=0.05,
        metadata={
            "help": (
                "Probability of each feature vector along the time axis to be chosen as the start of the vector "
                "span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature "
                "vectors will be masked along the time axis."
            )
        },
    )
    mask_time_length: int = field(
        default=10,
        metadata={"help": "Length of vector span to mask along the time axis."},
    )
    mask_feature_prob: float = field(
        default=0.0,
        metadata={
            "help": (
                # Fixed: the adjacent literals used to join as "vectorspan", and the last
                # sentence said "time axis" although this option masks along the feature axis.
                "Probability of each feature vector along the feature axis to be chosen as the start of the vector "
                "span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` "
                "feature bins will be masked along the feature axis."
            )
        },
    )
    mask_feature_length: int = field(
        default=10,
        metadata={"help": "Length of vector span to mask along the feature axis."},
    )
    layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
    ctc_loss_reduction: Optional[str] = field(
        default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
    )
    ctc_zero_infinity: Optional[bool] = field(
        default=False,
        metadata={
            "help": "Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly"
            " occur when the inputs are too short to be aligned to the targets."
        },
    )
    add_adapter: Optional[bool] = field(
        default=False,
        metadata={
            # Fixed: a missing space made the two literals join as "veryuseful".
            "help": "Whether a convolutional attention network should be stacked on top of the Wav2Vec2Bert Encoder. Can be very"
            " useful to downsample the output length."
        },
    )
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.

    Only `dataset_name` is required; every other field has a default.
    """

    dataset_name: str = field(
        # Fixed: this help text used to be a verbatim copy of the `dataset_config_name` one.
        metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    # Annotated Optional because the default is None.
    dataset_path: Optional[str] = field(
        default=None, metadata={"help": "The path of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    train_split_name: str = field(
        default="train+validation",
        metadata={
            "help": (
                "The name of the training data set split to use (via the datasets library). Defaults to "
                "'train+validation'"
            )
        },
    )
    eval_split_name: str = field(
        default="test",
        metadata={
            "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
        },
    )
    audio_column_name: str = field(
        default="audio",
        metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
    )
    text_column_name: str = field(
        default="text",
        metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of validation examples to this "
                "value if set."
            )
        },
    )
    chars_to_ignore: Optional[List[str]] = list_field(
        default=None,
        metadata={"help": "A list of characters to remove from the transcripts."},
    )
    eval_metrics: List[str] = list_field(
        default=["wer"],
        metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
    )
    max_duration_in_seconds: float = field(
        default=20.0,
        metadata={
            "help": (
                # Fixed: the closing reference mixed a quote with a backtick.
                "Filter audio files that are longer than `max_duration_in_seconds` seconds to"
                " `max_duration_in_seconds`"
            )
        },
    )
    min_duration_in_seconds: float = field(
        default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
    )
    preprocessing_only: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to only do data preprocessing and skip training. This is especially useful when data"
                " preprocessing errors out in distributed training due to timeout. In this case, one should run the"
                " preprocessing in a non-distributed setup with `preprocessing_only=True` so that the cached datasets"
                " can consequently be loaded in distributed training"
            )
        },
    )
    token: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
            )
        },
    )
    # NOTE(review): deliberately left as plain `bool` even though the default is None.
    # Switching to Optional[bool] may change how HfArgumentParser exposes this flag on
    # the command line - confirm before touching the annotation.
    use_auth_token: bool = field(
        default=None,
        metadata={
            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )
    trust_remote_code: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
                "execute code present on the Hub on your local machine."
            )
        },
    )
    unk_token: str = field(
        default="[UNK]",
        metadata={"help": "The unk token for the tokenizer"},
    )
    pad_token: str = field(
        default="[PAD]",
        metadata={"help": "The padding token for the tokenizer"},
    )
    word_delimiter_token: str = field(
        default="|",
        metadata={"help": "The word delimiter token for the tokenizer"},
    )
    phoneme_language: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                # Fixed: the text used to read "should be used be passed".
                "The target language that should be"
                " passed to the tokenizer for tokenization. Note that"
                " this is only relevant if the model classifies the"
                " input audio to a sequence of phoneme sequences."
            )
        },
    )
@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the model inputs and the CTC labels of a batch.

    Inputs and labels are padded by two separate ``processor.pad`` calls since they have to be of different
    lengths and need different padding methods.

    Args:
        processor (:class:`~transformers.AutoProcessor`)
            The processor used for processing the data. Its ``pad`` method pads both the inputs and the labels.
        padding (:obj:`bool` or :obj:`str`, `optional`, defaults to :obj:`'longest'`):
            Padding strategy, forwarded unchanged to ``processor.pad`` for both the inputs and the labels.
            NOTE(review): the accepted values (e.g. :obj:`True` / :obj:`'longest'`, :obj:`'max_length'`,
            :obj:`False` / :obj:`'do_not_pad'`) are defined by the processor, not by this class.
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set, forwarded to ``processor.pad`` when padding the inputs.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            If set, forwarded to ``processor.pad`` when padding the labels.
        feature_extractor_input_name (:obj:`str`, `optional`, defaults to :obj:`'input_values'`):
            Key under which every feature dict stores the model input; the same key is used for the
            dicts handed to ``processor.pad``.
    """

    processor: AutoProcessor
    padding: Union[bool, str] = "longest"
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None
    feature_extractor_input_name: Optional[str] = "input_values"

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """
        Pad a list of examples into one batch.

        Args:
            features: One dict per example. Each must contain the key named by
                ``feature_extractor_input_name`` and a ``"labels"`` entry.

        Returns:
            The padded input batch returned by ``processor.pad`` with an added ``"labels"`` tensor in which
            every padded position is ``-100``. If the batch has an ``"attention_mask"``, it is cast to
            ``torch.long``.
        """
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_name = self.feature_extractor_input_name
        input_features = [{input_name: feature[input_name]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        labels_batch = self.processor.pad(
            labels=label_features,
            padding=self.padding,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # replace padding with -100 to ignore loss correctly; positions whose
        # attention mask is not 1 are the padded ones
        labels = labels_batch["input_ids"].masked_fill(labels_batch["attention_mask"].ne(1), -100)

        batch["labels"] = labels
        if "attention_mask" in batch:
            batch["attention_mask"] = batch["attention_mask"].to(torch.long)

        return batch
+ vocab_set = functools.reduce( + lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), vocabs.values() + ) + + vocab_dict = {v: k for k, v in enumerate(sorted(vocab_set))} + + # replace white space with delimiter token + if word_delimiter_token is not None: + vocab_dict[word_delimiter_token] = vocab_dict[" "] + del vocab_dict[" "] + + # add unk and pad token + if unk_token is not None: + vocab_dict[unk_token] = len(vocab_dict) + + if pad_token is not None: + vocab_dict[pad_token] = len(vocab_dict) + + return vocab_dict + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if data_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if data_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + data_args.token = data_args.use_auth_token + + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The + # information sent is the one passed as arguments along with your Python/PyTorch versions. + send_example_telemetry("run_speech_recognition_ctc", model_args, data_args) + + # Detecting last checkpoint. 
+ last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" + ) + # Set the verbosity to info of the Transformers logger (on main process only): + if is_main_process(training_args.local_rank): + transformers.utils.logging.set_verbosity_info() + logger.info("Training/evaluation parameters %s", training_args) + + # Set seed before initializing model. + set_seed(training_args.seed) + + # 1. 
First, let's load the dataset + raw_datasets = DatasetDict() + + if training_args.do_train: + raw_datasets["train"] = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + data_dir=data_args.dataset_path, + split=data_args.train_split_name, + token=data_args.token, + ) + + if data_args.audio_column_name not in raw_datasets["train"].column_names: + raise ValueError( + f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'." + " Make sure to set `--audio_column_name` to the correct audio column - one of" + f" {', '.join(raw_datasets['train'].column_names)}." + ) + + if data_args.text_column_name not in raw_datasets["train"].column_names: + raise ValueError( + f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. " + "Make sure to set `--text_column_name` to the correct text column - one of " + f"{', '.join(raw_datasets['train'].column_names)}." + ) + + if data_args.max_train_samples is not None: + raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples)) + + if training_args.do_eval: + raw_datasets["eval"] = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + data_dir=data_args.dataset_path, + split=data_args.eval_split_name, + token=data_args.token, + ) + + if data_args.max_eval_samples is not None: + raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples)) + + # 2. We remove some special characters from the datasets + # that make training complicated and do not help in transcribing the speech + # E.g. 
characters, such as `,` and `.` do not really have an acoustic characteristic + # that could be easily picked up by the model + chars_to_ignore_regex = ( + f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None + ) + text_column_name = data_args.text_column_name + + def remove_special_characters(batch): + if chars_to_ignore_regex is not None: + batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " " + else: + batch["target_text"] = batch[text_column_name].lower() + " " + return batch + + with training_args.main_process_first(desc="dataset map special characters removal"): + raw_datasets = raw_datasets.map( + remove_special_characters, + remove_columns=[text_column_name], + desc="remove special characters from datasets", + ) + + # save special tokens for tokenizer + word_delimiter_token = data_args.word_delimiter_token + unk_token = data_args.unk_token + pad_token = data_args.pad_token + + # 3. Next, let's load the config as we might need it to create + # the tokenizer + # load config + config = AutoConfig.from_pretrained( + model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + token=data_args.token, + trust_remote_code=data_args.trust_remote_code, + ) + + # 4. 
Next, if no tokenizer file is defined, + # we create the vocabulary of the model by extracting all unique characters from + # the training and evaluation datasets + # We need to make sure that only first rank saves vocabulary + # make sure all processes wait until vocab is created + tokenizer_name_or_path = model_args.tokenizer_name_or_path + tokenizer_kwargs = {} + if tokenizer_name_or_path is None: + # save vocab in training output dir + tokenizer_name_or_path = training_args.output_dir + + vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json") + + with training_args.main_process_first(): + if training_args.overwrite_output_dir and os.path.isfile(vocab_file): + try: + os.remove(vocab_file) + except OSError: + # in shared file-systems it might be the case that + # two processes try to delete the vocab file at the some time + pass + + with training_args.main_process_first(desc="dataset map vocabulary creation"): + if not os.path.isfile(vocab_file): + os.makedirs(tokenizer_name_or_path, exist_ok=True) + vocab_dict = create_vocabulary_from_data( + raw_datasets, + word_delimiter_token=word_delimiter_token, + unk_token=unk_token, + pad_token=pad_token, + ) + + # save vocab dict to be loaded into tokenizer + with open(vocab_file, "w") as file: + json.dump(vocab_dict, file) + + # if tokenizer has just been created + # it is defined by `tokenizer_class` if present in config else by `model_type` + tokenizer_kwargs = { + "config": config if config.tokenizer_class is not None else None, + "tokenizer_type": config.model_type if config.tokenizer_class is None else None, + "unk_token": unk_token, + "pad_token": pad_token, + "word_delimiter_token": word_delimiter_token, + } + + # 5. Now we can instantiate the feature extractor, tokenizer and model + # Note for distributed training, the .from_pretrained methods guarantee that only + # one local process can concurrently download model & vocab. 
+ + # load feature_extractor and tokenizer + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name_or_path, + token=data_args.token, + trust_remote_code=data_args.trust_remote_code, + **tokenizer_kwargs, + ) + feature_extractor = AutoFeatureExtractor.from_pretrained( + model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + token=data_args.token, + trust_remote_code=data_args.trust_remote_code, + ) + + # adapt config + config.update( + { + "feat_proj_dropout": model_args.feat_proj_dropout, + "attention_dropout": model_args.attention_dropout, + "hidden_dropout": model_args.hidden_dropout, + "final_dropout": model_args.final_dropout, + "mask_time_prob": model_args.mask_time_prob, + "mask_time_length": model_args.mask_time_length, + "mask_feature_prob": model_args.mask_feature_prob, + "mask_feature_length": model_args.mask_feature_length, + "gradient_checkpointing": training_args.gradient_checkpointing, + "layerdrop": model_args.layerdrop, + "ctc_loss_reduction": model_args.ctc_loss_reduction, + "ctc_zero_infinity": model_args.ctc_zero_infinity, + "pad_token_id": tokenizer.pad_token_id, + "vocab_size": len(tokenizer), + "activation_dropout": model_args.activation_dropout, + "add_adapter": model_args.add_adapter, + } + ) + + # create model + model = AutoModelForCTC.from_pretrained( + model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + config=config, + token=data_args.token, + trust_remote_code=data_args.trust_remote_code, + ) + + # freeze encoder + if model_args.freeze_feature_encoder: + model.freeze_feature_encoder() + + # 6. 
Now we preprocess the datasets including loading the audio, resampling and normalization + # Thankfully, `datasets` takes care of automatically loading and resampling the audio, + # so that we just need to set the correct target sampling rate and normalize the input + # via the `feature_extractor` + + # make sure that dataset decodes audio with correct sampling rate + dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate + if dataset_sampling_rate != feature_extractor.sampling_rate: + raw_datasets = raw_datasets.cast_column( + data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate) + ) + + # derive max & min input length for sample rate & max duration + max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate + min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate + audio_column_name = data_args.audio_column_name + num_workers = data_args.preprocessing_num_workers + feature_extractor_input_name = feature_extractor.model_input_names[0] + + # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification + phoneme_language = data_args.phoneme_language + + # Preprocessing the datasets. + # We need to read the audio files as arrays and tokenize the targets. 
+ def prepare_dataset(batch): + # load audio + sample = batch[audio_column_name] + + inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"]) + batch[feature_extractor_input_name] = getattr(inputs, feature_extractor_input_name)[0] + # take length of raw audio waveform + batch["input_length"] = len(sample["array"].squeeze()) + + # encode targets + additional_kwargs = {} + if phoneme_language is not None: + additional_kwargs["phonemizer_lang"] = phoneme_language + + batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids + return batch + + with training_args.main_process_first(desc="dataset map preprocessing"): + vectorized_datasets = raw_datasets.map( + prepare_dataset, + remove_columns=next(iter(raw_datasets.values())).column_names, + num_proc=num_workers, + desc="preprocess datasets", + ) + + def is_audio_in_length_range(length): + return length > min_input_length and length < max_input_length + + # filter data that is shorter than min_input_length + vectorized_datasets = vectorized_datasets.filter( + is_audio_in_length_range, + num_proc=num_workers, + input_columns=["input_length"], + ) + + # 7. Next, we can prepare the training. + # Let's use word error rate (WER) as our evaluation metric, + # instantiate a data collator and the trainer + + # Define evaluation metrics during training, *i.e.* word error rate, character error rate + eval_metrics = {metric: evaluate.load(metric, cache_dir=model_args.cache_dir) for metric in data_args.eval_metrics} + + # for large datasets it is advised to run the preprocessing on a + # single machine first with ``args.preprocessing_only`` since there will mostly likely + # be a timeout when running the script in distributed mode. + # In a second step ``args.preprocessing_only`` can then be set to `False` to load the + # cached dataset + if data_args.preprocessing_only: + logger.info(f"Data preprocessing finished. 
Files cached at {vectorized_datasets.cache_files}") + return + + # For languages like Chinese with large vocabulary size, we need to discard logits + # and only keep the argmax, otherwise we run out of memory during evaluation. + def preprocess_logits_for_metrics(logits, labels): + pred_ids = torch.argmax(logits, dim=-1) + return pred_ids, labels + + def compute_metrics(pred): + pred_ids = pred.predictions[0] + pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id + + pred_str = tokenizer.batch_decode(pred_ids) + # we do not want to group tokens when computing the metrics + label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False) + + metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()} + + return metrics + + # Now save everything to be able to create a single processor later + # make sure all processes wait until data is saved + with training_args.main_process_first(): + # only the main process saves them + if is_main_process(training_args.local_rank): + # save feature extractor, tokenizer and config + feature_extractor.save_pretrained(training_args.output_dir) + tokenizer.save_pretrained(training_args.output_dir) + config.save_pretrained(training_args.output_dir) + + try: + processor = AutoProcessor.from_pretrained(training_args.output_dir) + except (OSError, KeyError): + warnings.warn( + "Loading a processor from a feature extractor config that does not" + " include a `processor_class` attribute is deprecated and will be removed in v5. 
Please add the following " + " attribute to your `preprocessor_config.json` file to suppress this warning: " + " `'processor_class': 'Wav2Vec2Processor'`", + FutureWarning, + ) + processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir) + + # Instantiate custom data collator + data_collator = DataCollatorCTCWithPadding( + processor=processor, feature_extractor_input_name=feature_extractor_input_name + ) + + # Initialize Trainer + trainer = Trainer( + model=model, + data_collator=data_collator, + args=training_args, + compute_metrics=compute_metrics, + train_dataset=vectorized_datasets["train"] if training_args.do_train else None, + eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, + tokenizer=processor, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # 8. Finally, we can start training + + # Training + if training_args.do_train: + # use last checkpoint if exist + if last_checkpoint is not None: + checkpoint = last_checkpoint + elif os.path.isdir(model_args.model_name_or_path): + checkpoint = model_args.model_name_or_path + else: + checkpoint = None + + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() + + metrics = train_result.metrics + max_train_samples = ( + data_args.max_train_samples + if data_args.max_train_samples is not None + else len(vectorized_datasets["train"]) + ) + metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"])) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + results = {} + if training_args.do_eval: + logger.info("*** Evaluate ***") + metrics = trainer.evaluate() + max_eval_samples = ( + data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"]) + ) + metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"])) + + trainer.log_metrics("eval", metrics) + 
trainer.save_metrics("eval", metrics) + + # Write model card and (optionally) push to hub + config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na" + kwargs = { + "finetuned_from": model_args.model_name_or_path, + "tasks": "automatic-speech-recognition", + "tags": ["automatic-speech-recognition", data_args.dataset_name], + "dataset_args": ( + f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split:" + f" {data_args.eval_split_name}" + ), + "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}", + } + if "common_voice" in data_args.dataset_name: + kwargs["language"] = config_name + + if training_args.push_to_hub: + trainer.push_to_hub(**kwargs) + else: + trainer.create_model_card(**kwargs) + + return results + + +if __name__ == "__main__": + main() diff --git a/run_speech_recognition_ctc.py. b/run_speech_recognition_ctc.py. new file mode 100644 index 0000000000000000000000000000000000000000..d80d470b4308b24fcdc8c7867ed011c3dfb588a0 --- /dev/null +++ b/run_speech_recognition_ctc.py. @@ -0,0 +1,835 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def list_field(default=None, metadata=None):
    """Build a dataclass field whose default is a list-like value.

    The value is supplied through ``default_factory``. The factory returns a
    shallow copy on every call, so dataclass instances never share the
    caller's ``default`` object and cannot mutate it (or each other's value)
    by accident.

    Args:
        default: Default value of the field, typically a list or ``None``.
        metadata: Mapping forwarded unchanged to ``dataclasses.field``.

    Returns:
        The field object produced by ``dataclasses.field``.
    """
    import copy  # function-scope import keeps this helper self-contained

    def _fresh_default():
        # copy.copy(None) is None, so the "no default" case is unchanged
        return copy.copy(default)

    return field(default_factory=_fresh_default, metadata=metadata)
+ """ + + model_name_or_path: str = field( + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} + ) + tokenizer_name_or_path: Optional[str] = field( + default=None, + metadata={"help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"}, + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + freeze_feature_encoder: bool = field( + default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."} + ) + attention_dropout: float = field( + default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."} + ) + activation_dropout: float = field( + default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."} + ) + feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."}) + hidden_dropout: float = field( + default=0.0, + metadata={ + "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler." + }, + ) + final_dropout: float = field( + default=0.0, + metadata={"help": "The dropout probability for the final projection layer."}, + ) + mask_time_prob: float = field( + default=0.05, + metadata={ + "help": ( + "Probability of each feature vector along the time axis to be chosen as the start of the vector " + "span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature " + "vectors will be masked along the time axis." + ) + }, + ) + mask_time_length: int = field( + default=10, + metadata={"help": "Length of vector span to mask along the time axis."}, + ) + mask_feature_prob: float = field( + default=0.0, + metadata={ + "help": ( + "Probability of each feature vector along the feature axis to be chosen as the start of the vectorspan" + " to be masked. 
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line. Every field's ``help`` metadata is its command-line description.
    """

    dataset_name: str = field(
        metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    # defaults to None, hence Optional
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    train_split_name: str = field(
        default="train+validation",
        metadata={
            "help": (
                "The name of the training data set split to use (via the datasets library). Defaults to "
                "'train+validation'"
            )
        },
    )
    eval_split_name: str = field(
        default="test",
        metadata={
            "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
        },
    )
    audio_column_name: str = field(
        default="audio",
        metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
    )
    text_column_name: str = field(
        default="text",
        metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of validation examples to this "
                "value if set."
            )
        },
    )
    chars_to_ignore: Optional[List[str]] = list_field(
        default=None,
        metadata={"help": "A list of characters to remove from the transcripts."},
    )
    eval_metrics: List[str] = list_field(
        default=["wer"],
        metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
    )
    max_duration_in_seconds: float = field(
        default=20.0,
        metadata={"help": "Filter out audio files that are longer than `max_duration_in_seconds` seconds"},
    )
    min_duration_in_seconds: float = field(
        default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
    )
    preprocessing_only: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to only do data preprocessing and skip training. This is especially useful when data"
                " preprocessing errors out in distributed training due to timeout. In this case, one should run the"
                " preprocessing in a non-distributed setup with `preprocessing_only=True` so that the cached datasets"
                " can consequently be loaded in distributed training"
            )
        },
    )
    # defaults to None, hence Optional
    token: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
            )
        },
    )
    # NOTE(review): deliberately left as plain `bool` although the default is None.
    # HfArgumentParser appears to parse `bool` and `Optional[bool]` fields differently,
    # so changing the annotation could change the CLI flag form - confirm before touching.
    use_auth_token: bool = field(
        default=None,
        metadata={
            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )
    trust_remote_code: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
                "execute code present on the Hub on your local machine."
            )
        },
    )
    unk_token: str = field(
        default="[UNK]",
        metadata={"help": "The unk token for the tokenizer"},
    )
    pad_token: str = field(
        default="[PAD]",
        metadata={"help": "The padding token for the tokenizer"},
    )
    word_delimiter_token: str = field(
        default="|",
        metadata={"help": "The word delimiter token for the tokenizer"},
    )
    phoneme_language: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The target language that should be"
                " passed to the tokenizer for tokenization. Note that"
                " this is only relevant if the model classifies the"
                " input audio to a sequence of phoneme sequences."
            )
        },
    )
+ padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding index) + among: + * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the + maximum acceptable input length for the model if that argument is not provided. + * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of + different lengths). + max_length (:obj:`int`, `optional`): + Maximum length of the ``input_values`` of the returned list and optionally padding length (see above). + max_length_labels (:obj:`int`, `optional`): + Maximum length of the ``labels`` returned list and optionally padding length (see above). + pad_to_multiple_of (:obj:`int`, `optional`): + If set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= + 7.5 (Volta). 
+ """ + + processor: AutoProcessor + padding: Union[bool, str] = "longest" + pad_to_multiple_of: Optional[int] = None + pad_to_multiple_of_labels: Optional[int] = None + feature_extractor_input_name: Optional[str] = "input_values" + + def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: + # split inputs and labels since they have to be of different lengths and need + # different padding methods + input_features = [ + {self.feature_extractor_input_name: feature[self.feature_extractor_input_name]} for feature in features + ] + label_features = [{"input_ids": feature["labels"]} for feature in features] + + batch = self.processor.pad( + input_features, + padding=self.padding, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors="pt", + ) + + labels_batch = self.processor.pad( + labels=label_features, + padding=self.padding, + pad_to_multiple_of=self.pad_to_multiple_of_labels, + return_tensors="pt", + ) + + # replace padding with -100 to ignore loss correctly + labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) + + batch["labels"] = labels + if "attention_mask" in batch: + batch["attention_mask"] = batch["attention_mask"].to(torch.long) + + return batch + + +def create_vocabulary_from_data( + datasets: DatasetDict, + word_delimiter_token: Optional[str] = None, + unk_token: Optional[str] = None, + pad_token: Optional[str] = None, +): + # Given training and test labels create vocabulary + def extract_all_chars(batch): + all_text = " ".join(batch["target_text"]) + vocab = list(set(all_text)) + return {"vocab": [vocab], "all_text": [all_text]} + + vocabs = datasets.map( + extract_all_chars, + batched=True, + batch_size=-1, + keep_in_memory=True, + remove_columns=datasets["train"].column_names, + ) + + # take union of all unique characters in each dataset + vocab_set = functools.reduce( + lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), 
def create_vocabulary_from_data(
    datasets: "DatasetDict",
    word_delimiter_token: Optional[str] = None,
    unk_token: Optional[str] = None,
    pad_token: Optional[str] = None,
):
    """Build a character-level vocabulary from the ``target_text`` column of every split.

    Args:
        datasets: Mapping of split name to dataset. Must provide ``map`` and ``values``,
            and every split must expose ``column_names`` and a ``target_text`` column.
        word_delimiter_token: If given, takes over the id of the space character. When no
            space occurs in the data the token is appended with the next free id.
        unk_token: If given, appended to the vocabulary with the next free id.
        pad_token: If given, appended to the vocabulary after ``unk_token``.

    Returns:
        A dict mapping each character / special token to an integer id. Characters are
        numbered in sorted order starting at 0.
    """

    # Given training and test labels create vocabulary
    def extract_all_chars(batch):
        all_text = " ".join(batch["target_text"])
        vocab = list(set(all_text))
        return {"vocab": [vocab], "all_text": [all_text]}

    # Use the first available split for the column list, so that eval-only runs
    # (no "train" split) do not fail with a KeyError.
    first_split = next(iter(datasets.values()))
    vocabs = datasets.map(
        extract_all_chars,
        batched=True,
        batch_size=-1,
        keep_in_memory=True,
        remove_columns=first_split.column_names,
    )

    # take union of all unique characters in each dataset
    # (an explicit accumulator works for any number of splits: one, two or more)
    vocab_set = set()
    for split_vocab in vocabs.values():
        vocab_set.update(split_vocab["vocab"][0])

    vocab_dict = {v: k for k, v in enumerate(sorted(vocab_set))}

    # replace white space with delimiter token; `pop` with a default keeps this safe
    # when no space was seen and when the delimiter itself is a space
    if word_delimiter_token is not None:
        vocab_dict[word_delimiter_token] = vocab_dict.pop(" ", len(vocab_dict))

    # add unk and pad token
    if unk_token is not None:
        vocab_dict[unk_token] = len(vocab_dict)

    if pad_token is not None:
        vocab_dict[pad_token] = len(vocab_dict)

    return vocab_dict


def main():
    """Fine-tune and/or evaluate a CTC speech-recognition model.

    Steps, in order: parse arguments, detect a resumable checkpoint, configure logging,
    load the requested dataset splits, normalise the transcripts, create the vocabulary
    if no tokenizer was given, load tokenizer / feature extractor / model, preprocess the
    audio, then train and/or evaluate and finally write a model card or push to the hub.

    Returns:
        The ``results`` dict, which is created empty and never filled in this function.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.use_auth_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
            FutureWarning,
        )
        if data_args.token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        data_args.token = data_args.use_auth_token

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_speech_recognition_ctc", model_args, data_args)

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # 1. First, let's load the dataset
    raw_datasets = DatasetDict()

    if training_args.do_train:
        raw_datasets["train"] = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            split=data_args.train_split_name,
            token=data_args.token,
        )

        if data_args.audio_column_name not in raw_datasets["train"].column_names:
            raise ValueError(
                f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'."
                " Make sure to set `--audio_column_name` to the correct audio column - one of"
                f" {', '.join(raw_datasets['train'].column_names)}."
            )

        if data_args.text_column_name not in raw_datasets["train"].column_names:
            raise ValueError(
                f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
                "Make sure to set `--text_column_name` to the correct text column - one of "
                f"{', '.join(raw_datasets['train'].column_names)}."
            )

        if data_args.max_train_samples is not None:
            raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))

    if training_args.do_eval:
        raw_datasets["eval"] = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            split=data_args.eval_split_name,
            token=data_args.token,
        )

        if data_args.max_eval_samples is not None:
            raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))

    # 2. We remove some special characters from the datasets
    # that make training complicated and do not help in transcribing the speech
    # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
    # that could be easily picked up by the model
    # NOTE(review): the entries are concatenated unescaped into a regex character class, so
    # characters such as `-`, `]` or `^` are interpreted as regex syntax - confirm this is intended.
    chars_to_ignore_regex = (
        f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
    )
    text_column_name = data_args.text_column_name

    def remove_special_characters(batch):
        # lower-case the transcript and append a trailing space as word separator
        if chars_to_ignore_regex is not None:
            batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
        else:
            batch["target_text"] = batch[text_column_name].lower() + " "
        return batch

    with training_args.main_process_first(desc="dataset map special characters removal"):
        raw_datasets = raw_datasets.map(
            remove_special_characters,
            remove_columns=[text_column_name],
            desc="remove special characters from datasets",
        )

    # save special tokens for tokenizer
    word_delimiter_token = data_args.word_delimiter_token
    unk_token = data_args.unk_token
    pad_token = data_args.pad_token

    # 3. Next, let's load the config as we might need it to create
    # the tokenizer
    # load config
    config = AutoConfig.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        token=data_args.token,
        trust_remote_code=data_args.trust_remote_code,
    )

    # 4. Next, if no tokenizer file is defined,
    # we create the vocabulary of the model by extracting all unique characters from
    # the training and evaluation datasets
    # We need to make sure that only first rank saves vocabulary
    # make sure all processes wait until vocab is created
    tokenizer_name_or_path = model_args.tokenizer_name_or_path
    tokenizer_kwargs = {}
    if tokenizer_name_or_path is None:
        # save vocab in training output dir
        tokenizer_name_or_path = training_args.output_dir

        vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json")

        with training_args.main_process_first():
            if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
                try:
                    os.remove(vocab_file)
                except OSError:
                    # in shared file-systems it might be the case that
                    # two processes try to delete the vocab file at the same time
                    pass

        with training_args.main_process_first(desc="dataset map vocabulary creation"):
            if not os.path.isfile(vocab_file):
                os.makedirs(tokenizer_name_or_path, exist_ok=True)
                vocab_dict = create_vocabulary_from_data(
                    raw_datasets,
                    word_delimiter_token=word_delimiter_token,
                    unk_token=unk_token,
                    pad_token=pad_token,
                )

                # save vocab dict to be loaded into tokenizer
                with open(vocab_file, "w") as file:
                    json.dump(vocab_dict, file)

        # if tokenizer has just been created
        # it is defined by `tokenizer_class` if present in config else by `model_type`
        tokenizer_kwargs = {
            "config": config if config.tokenizer_class is not None else None,
            "tokenizer_type": config.model_type if config.tokenizer_class is None else None,
            "unk_token": unk_token,
            "pad_token": pad_token,
            "word_delimiter_token": word_delimiter_token,
        }

    # 5. Now we can instantiate the feature extractor, tokenizer and model
    # Note for distributed training, the .from_pretrained methods guarantee that only
    # one local process can concurrently download model & vocab.

    # load feature_extractor and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_name_or_path,
        token=data_args.token,
        trust_remote_code=data_args.trust_remote_code,
        **tokenizer_kwargs,
    )
    feature_extractor = AutoFeatureExtractor.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        token=data_args.token,
        trust_remote_code=data_args.trust_remote_code,
    )

    # adapt config
    config.update(
        {
            "feat_proj_dropout": model_args.feat_proj_dropout,
            "attention_dropout": model_args.attention_dropout,
            "hidden_dropout": model_args.hidden_dropout,
            "final_dropout": model_args.final_dropout,
            "mask_time_prob": model_args.mask_time_prob,
            "mask_time_length": model_args.mask_time_length,
            "mask_feature_prob": model_args.mask_feature_prob,
            "mask_feature_length": model_args.mask_feature_length,
            "gradient_checkpointing": training_args.gradient_checkpointing,
            "layerdrop": model_args.layerdrop,
            "ctc_loss_reduction": model_args.ctc_loss_reduction,
            "ctc_zero_infinity": model_args.ctc_zero_infinity,
            "pad_token_id": tokenizer.pad_token_id,
            "vocab_size": len(tokenizer),
            "activation_dropout": model_args.activation_dropout,
            "add_adapter": model_args.add_adapter,
        }
    )

    # create model
    model = AutoModelForCTC.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        config=config,
        token=data_args.token,
        trust_remote_code=data_args.trust_remote_code,
    )

    # freeze encoder
    if model_args.freeze_feature_encoder:
        model.freeze_feature_encoder()

    # 6. Now we preprocess the datasets including loading the audio, resampling and normalization
    # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
    # so that we just need to set the correct target sampling rate and normalize the input
    # via the `feature_extractor`

    # make sure that dataset decodes audio with correct sampling rate
    dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
    if dataset_sampling_rate != feature_extractor.sampling_rate:
        raw_datasets = raw_datasets.cast_column(
            data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
        )

    # derive max & min input length for sample rate & max duration
    max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
    min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
    audio_column_name = data_args.audio_column_name
    num_workers = data_args.preprocessing_num_workers
    feature_extractor_input_name = feature_extractor.model_input_names[0]

    # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
    phoneme_language = data_args.phoneme_language

    # Preprocessing the datasets.
    # We need to read the audio files as arrays and tokenize the targets.
    def prepare_dataset(batch):
        # load audio
        sample = batch[audio_column_name]

        inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
        batch[feature_extractor_input_name] = getattr(inputs, feature_extractor_input_name)[0]
        # take length of raw audio waveform
        batch["input_length"] = len(sample["array"].squeeze())

        # encode targets
        additional_kwargs = {}
        if phoneme_language is not None:
            additional_kwargs["phonemizer_lang"] = phoneme_language

        batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids
        return batch

    with training_args.main_process_first(desc="dataset map preprocessing"):
        vectorized_datasets = raw_datasets.map(
            prepare_dataset,
            remove_columns=next(iter(raw_datasets.values())).column_names,
            num_proc=num_workers,
            desc="preprocess datasets",
        )

        def is_audio_in_length_range(length):
            return length > min_input_length and length < max_input_length

        # filter data that is shorter than min_input_length
        vectorized_datasets = vectorized_datasets.filter(
            is_audio_in_length_range,
            num_proc=num_workers,
            input_columns=["input_length"],
        )

    # 7. Next, we can prepare the training.
    # Let's use word error rate (WER) as our evaluation metric,
    # instantiate a data collator and the trainer

    # Define evaluation metrics during training, *i.e.* word error rate, character error rate
    eval_metrics = {metric: evaluate.load(metric, cache_dir=model_args.cache_dir) for metric in data_args.eval_metrics}

    # for large datasets it is advised to run the preprocessing on a
    # single machine first with ``args.preprocessing_only`` since there will most likely
    # be a timeout when running the script in distributed mode.
    # In a second step ``args.preprocessing_only`` can then be set to `False` to load the
    # cached dataset
    if data_args.preprocessing_only:
        logger.info(f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}")
        return

    # For languages like Chinese with large vocabulary size, we need to discard logits
    # and only keep the argmax, otherwise we run out of memory during evaluation.
    def preprocess_logits_for_metrics(logits, labels):
        pred_ids = torch.argmax(logits, dim=-1)
        return pred_ids, labels

    def compute_metrics(pred):
        pred_ids = pred.predictions[0]
        # -100 marks ignored label positions; map them back to the pad id before decoding
        pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id

        pred_str = tokenizer.batch_decode(pred_ids)
        # we do not want to group tokens when computing the metrics
        label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)

        metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()}

        return metrics

    # Now save everything to be able to create a single processor later
    # make sure all processes wait until data is saved
    with training_args.main_process_first():
        # only the main process saves them
        if is_main_process(training_args.local_rank):
            # save feature extractor, tokenizer and config
            feature_extractor.save_pretrained(training_args.output_dir)
            tokenizer.save_pretrained(training_args.output_dir)
            config.save_pretrained(training_args.output_dir)

    try:
        processor = AutoProcessor.from_pretrained(training_args.output_dir)
    except (OSError, KeyError):
        warnings.warn(
            "Loading a processor from a feature extractor config that does not"
            " include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following "
            " attribute to your `preprocessor_config.json` file to suppress this warning: "
            " `'processor_class': 'Wav2Vec2Processor'`",
            FutureWarning,
        )
        processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir)

    # Instantiate custom data collator
    data_collator = DataCollatorCTCWithPadding(
        processor=processor, feature_extractor_input_name=feature_extractor_input_name
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        data_collator=data_collator,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
        eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
        tokenizer=processor,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )

    # 8. Finally, we can start training

    # Training
    if training_args.do_train:
        # use last checkpoint if exist
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None

        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()

        metrics = train_result.metrics
        max_train_samples = (
            data_args.max_train_samples
            if data_args.max_train_samples is not None
            else len(vectorized_datasets["train"])
        )
        metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"]))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()
        max_eval_samples = (
            data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"])
        )
        metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"]))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Write model card and (optionally) push to hub
    config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
    kwargs = {
        "finetuned_from": model_args.model_name_or_path,
        "tasks": "automatic-speech-recognition",
        "tags": ["automatic-speech-recognition", data_args.dataset_name],
        "dataset_args": (
            f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split:"
            f" {data_args.eval_split_name}"
        ),
        "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
    }
    if "common_voice" in data_args.dataset_name:
        kwargs["language"] = config_name

    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)

    return results


if __name__ == "__main__":
    main()
a/runs/May24_16-00-52_tz579-raptorlake/events.out.tfevents.1716585087.tz579-raptorlake.23058.0 b/runs/May24_16-00-52_tz579-raptorlake/events.out.tfevents.1716585087.tz579-raptorlake.23058.0 new file mode 100644 index 0000000000000000000000000000000000000000..ba5ecdacfae1d8bb18368b55ad7a06a00f83c24e --- /dev/null +++ b/runs/May24_16-00-52_tz579-raptorlake/events.out.tfevents.1716585087.tz579-raptorlake.23058.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f56de21e92a371394b396346e90360ea59c7a18cf2c51d858605250a065be8d4 +size 6192 diff --git a/runs/May24_16-12-34_tz579-raptorlake/events.out.tfevents.1716585779.tz579-raptorlake.23433.0 b/runs/May24_16-12-34_tz579-raptorlake/events.out.tfevents.1716585779.tz579-raptorlake.23433.0 new file mode 100644 index 0000000000000000000000000000000000000000..cb242dd4117fa14ef13a698133d4f18d578de3f0 --- /dev/null +++ b/runs/May24_16-12-34_tz579-raptorlake/events.out.tfevents.1716585779.tz579-raptorlake.23433.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17633b209ac376da406114e746fca837219aa83d66d90d0aaf0816712bc41868 +size 6192 diff --git a/runs/May24_16-38-27_tz579-raptorlake/events.out.tfevents.1716587350.tz579-raptorlake.23924.0 b/runs/May24_16-38-27_tz579-raptorlake/events.out.tfevents.1716587350.tz579-raptorlake.23924.0 new file mode 100644 index 0000000000000000000000000000000000000000..4a3b60a3282a1306e8b47d624403363472be07db --- /dev/null +++ b/runs/May24_16-38-27_tz579-raptorlake/events.out.tfevents.1716587350.tz579-raptorlake.23924.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:074142b6f3a30c0127fde6336a36a48e016a9bad0bd48ad40f6d74dec816ce41 +size 6192 diff --git a/runs/May24_16-51-07_tz579-raptorlake/events.out.tfevents.1716588108.tz579-raptorlake.24192.0 b/runs/May24_16-51-07_tz579-raptorlake/events.out.tfevents.1716588108.tz579-raptorlake.24192.0 new file mode 100644 index 
0000000000000000000000000000000000000000..abae3f5a84b31d6a88a1125a171cad691230d42c --- /dev/null +++ b/runs/May24_16-51-07_tz579-raptorlake/events.out.tfevents.1716588108.tz579-raptorlake.24192.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2079110f867071fbc495cd436b0a1360f44dbf8fcc3c41a0217289c6def7d32c +size 6604 diff --git a/runs/May24_17-08-47_tz579-raptorlake/events.out.tfevents.1716589182.tz579-raptorlake.24529.0 b/runs/May24_17-08-47_tz579-raptorlake/events.out.tfevents.1716589182.tz579-raptorlake.24529.0 new file mode 100644 index 0000000000000000000000000000000000000000..0aa5ba05d19ee2750c3ad93c1d121a48a5d7bc41 --- /dev/null +++ b/runs/May24_17-08-47_tz579-raptorlake/events.out.tfevents.1716589182.tz579-raptorlake.24529.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cca4979180e1ca2d32ca5566488d6178248d8aa0bd1caf46ed9f6285c6d11f7d +size 6811 diff --git a/runs/May24_17-20-23_tz579-raptorlake/events.out.tfevents.1716589861.tz579-raptorlake.26175.0 b/runs/May24_17-20-23_tz579-raptorlake/events.out.tfevents.1716589861.tz579-raptorlake.26175.0 new file mode 100644 index 0000000000000000000000000000000000000000..6357f53b8f3e915f78858b89ca08255630a0b6ca --- /dev/null +++ b/runs/May24_17-20-23_tz579-raptorlake/events.out.tfevents.1716589861.tz579-raptorlake.26175.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:301dd9f07f5890a3822f9edc73ca7e5b85cd3e4d20c17b4e2068856f1b89aaf2 +size 6604 diff --git a/runs/May24_17-36-29_tz579-raptorlake/events.out.tfevents.1716590831.tz579-raptorlake.28308.0 b/runs/May24_17-36-29_tz579-raptorlake/events.out.tfevents.1716590831.tz579-raptorlake.28308.0 new file mode 100644 index 0000000000000000000000000000000000000000..9680a34b9983ecb4177d17cc34650abb8ae4dc79 --- /dev/null +++ b/runs/May24_17-36-29_tz579-raptorlake/events.out.tfevents.1716590831.tz579-raptorlake.28308.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:34d45df376f452d62294b65ea3032948330a86c9f276f172cc5c542030498b41 +size 6604 diff --git a/runs/May25_17-16-21_tz579-raptorlake/events.out.tfevents.1716676030.tz579-raptorlake.8078.0 b/runs/May25_17-16-21_tz579-raptorlake/events.out.tfevents.1716676030.tz579-raptorlake.8078.0 new file mode 100644 index 0000000000000000000000000000000000000000..563f5b2dbf4df5b58aff5eb1936edb1a998ae19c --- /dev/null +++ b/runs/May25_17-16-21_tz579-raptorlake/events.out.tfevents.1716676030.tz579-raptorlake.8078.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5c9eba6b7b28381ba8f17287db0093268fa02468f97b2be4a3d643ef2cb185d +size 158868 diff --git a/runs/May25_17-29-56_tz579-raptorlake/events.out.tfevents.1716676963.tz579-raptorlake.9227.0 b/runs/May25_17-29-56_tz579-raptorlake/events.out.tfevents.1716676963.tz579-raptorlake.9227.0 new file mode 100644 index 0000000000000000000000000000000000000000..f1c3bb6ab7917589ad0e4657ac3760f7d162611b --- /dev/null +++ b/runs/May25_17-29-56_tz579-raptorlake/events.out.tfevents.1716676963.tz579-raptorlake.9227.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3f1c7a3afdbaaf70f0ab6eb3465364ec36b297ba72135f39057eeb3a379306d +size 21715 diff --git a/runs/May25_17-45-58_tz579-raptorlake/events.out.tfevents.1716677780.tz579-raptorlake.9961.0 b/runs/May25_17-45-58_tz579-raptorlake/events.out.tfevents.1716677780.tz579-raptorlake.9961.0 new file mode 100644 index 0000000000000000000000000000000000000000..6209113c80058aba6f9998de665b93ce0884c082 --- /dev/null +++ b/runs/May25_17-45-58_tz579-raptorlake/events.out.tfevents.1716677780.tz579-raptorlake.9961.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a39bb0aaef6ce9ea8f32d9ffba51fae1a21efd988361cd138b3bad7827ab8bc0 +size 16542 diff --git a/runs/May25_17-57-49_tz579-raptorlake/events.out.tfevents.1716678504.tz579-raptorlake.10764.0 b/runs/May25_17-57-49_tz579-raptorlake/events.out.tfevents.1716678504.tz579-raptorlake.10764.0 new 
file mode 100644 index 0000000000000000000000000000000000000000..43d8ecaee250c68a537a7b58279cf9a35c1abdf5 --- /dev/null +++ b/runs/May25_17-57-49_tz579-raptorlake/events.out.tfevents.1716678504.tz579-raptorlake.10764.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dc9f6dad36f586790d3e4754501f93d810f582a8e97a2dd84ae3cc2683a992e +size 2705590 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..7f1669a2a2a16dae6adcf40b222f836ed75cbd1e --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "[PAD]", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false + }, + "unk_token": { + "content": "[UNK]", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false + } +} diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e64faf013fe6444487563b1c509d8c5aed754e04 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,48 @@ +{ + "added_tokens_decoder": { + "28": { + "content": "[UNK]", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "29": { + "content": "[PAD]", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": true, + "do_lower_case": 
false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "[PAD]", + "processor_class": "Wav2Vec2Processor", + "replace_word_delimiter_char": " ", + "target_lang": null, + "tokenizer_class": "Wav2Vec2CTCTokenizer", + "unk_token": "[UNK]", + "word_delimiter_token": "|" +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cff707a60a35e3bfb31ab95bcbf9f33af13b882a --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ae2085582750eed1574146e140f321d91e803d129c7d445814e499b412abc85 +size 5048 diff --git a/vocab.json b/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..257f2d9b7c7fd70df0b1481775b2c18d0629fb48 --- /dev/null +++ b/vocab.json @@ -0,0 +1,32 @@ +{ + "@": 1, + "[PAD]": 29, + "[UNK]": 28, + "a": 2, + "b": 3, + "c": 4, + "d": 5, + "e": 6, + "f": 7, + "g": 8, + "h": 9, + "i": 10, + "j": 11, + "k": 12, + "l": 13, + "m": 14, + "n": 15, + "o": 16, + "p": 17, + "q": 18, + "r": 19, + "s": 20, + "t": 21, + "u": 22, + "v": 23, + "w": 24, + "x": 25, + "y": 26, + "z": 27, + "|": 0 +} diff --git a/wav2vec2-base-timit-fine-tuned./README.md b/wav2vec2-base-timit-fine-tuned./README.md new file mode 100644 index 0000000000000000000000000000000000000000..13eb3aaca9f12627ed3accd52e186af9a9ffda2c --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned./README.md @@ -0,0 +1,101 @@ +--- +license: apache-2.0 +base_model: facebook/wav2vec2-base +tags: +- automatic-speech-recognition +- timit_asr +- generated_from_trainer +datasets: +- timit_asr +metrics: +- wer +model-index: +- name: wav2vec2-base-timit-fine-tuned + results: + - task: + name: Automatic Speech Recognition + type: automatic-speech-recognition + dataset: + name: TIMIT_ASR - NA + type: timit_asr + config: clean + split: test + args: 'Config: na, Training split: train, Eval split: test' + metrics: + - name: Wer + type: wer + value: 
0.41728125284530637 +--- + + + +# wav2vec2-base-timit-fine-tuned + +This model is a fine-tuned version of [facebook/wav2vec2-base](https://huggingface.co./facebook/wav2vec2-base) on the TIMIT_ASR - NA dataset. +It achieves the following results on the evaluation set: +- Loss: 0.4275 +- Wer: 0.4173 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 0.0001 +- train_batch_size: 32 +- eval_batch_size: 1 +- seed: 42 +- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08 +- lr_scheduler_type: linear +- lr_scheduler_warmup_steps: 1000 +- num_epochs: 20.0 +- mixed_precision_training: Native AMP + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | Wer | +|:-------------:|:-------:|:----:|:---------------:|:------:| +| 3.1618 | 0.8621 | 100 | 3.1117 | 1.0 | +| 2.9798 | 1.7241 | 200 | 2.9736 | 1.0 | +| 2.9144 | 2.5862 | 300 | 2.9075 | 1.0 | +| 2.1714 | 3.4483 | 400 | 2.0945 | 1.0325 | +| 1.1579 | 4.3103 | 500 | 1.0451 | 0.8299 | +| 0.6087 | 5.1724 | 600 | 0.6754 | 0.6441 | +| 0.481 | 6.0345 | 700 | 0.5275 | 0.5761 | +| 0.3072 | 6.8966 | 800 | 0.4836 | 0.5264 | +| 0.332 | 7.7586 | 900 | 0.4403 | 0.5234 | +| 0.1876 | 8.6207 | 1000 | 0.4758 | 0.5222 | +| 0.2232 | 9.4828 | 1100 | 0.4508 | 0.4892 | +| 0.1332 | 10.3448 | 1200 | 0.4394 | 0.4740 | +| 0.1085 | 11.2069 | 1300 | 0.4466 | 0.4621 | +| 0.098 | 12.0690 | 1400 | 0.4230 | 0.4493 | +| 0.1219 | 12.9310 | 1500 | 0.4180 | 0.4460 | +| 0.1021 | 13.7931 | 1600 | 0.4179 | 0.4406 | +| 0.0741 | 14.6552 | 1700 | 0.4113 | 0.4309 | +| 0.0896 | 15.5172 | 1800 | 0.4392 | 0.4308 | +| 0.0492 | 16.3793 | 1900 | 0.4202 | 0.4313 | +| 0.0759 | 17.2414 | 2000 | 0.4348 | 0.4207 | +| 0.0406 | 18.1034 | 2100 | 0.4419 | 0.4205 | +| 0.074 | 18.9655 | 2200 | 0.4306 | 
0.4200 | +| 0.0378 | 19.8276 | 2300 | 0.4273 | 0.4173 | + + +### Framework versions + +- Transformers 4.42.0.dev0 +- Pytorch 2.3.0.post300 +- Datasets 2.19.1 +- Tokenizers 0.19.1 diff --git a/wav2vec2-base-timit-fine-tuned./added_tokens.json b/wav2vec2-base-timit-fine-tuned./added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..cc95e40ec61ca6dc6f02948ceaad78a75e854f3f --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned./added_tokens.json @@ -0,0 +1,4 @@ +{ + "": 30, + "": 29 +} diff --git a/wav2vec2-base-timit-fine-tuned./all_results.json b/wav2vec2-base-timit-fine-tuned./all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..1cdeada0b17066ccaf020082dd9d0cd268ca56c4 --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned./all_results.json @@ -0,0 +1,15 @@ +{ + "epoch": 20.0, + "eval_loss": 0.42749759554862976, + "eval_runtime": 39.6053, + "eval_samples": 1344, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 33.935, + "eval_wer": 0.41728125284530637, + "total_flos": 2.1476719263248095e+18, + "train_loss": 0.8618391515622879, + "train_runtime": 3159.4128, + "train_samples": 3696, + "train_samples_per_second": 23.397, + "train_steps_per_second": 0.734 +} \ No newline at end of file diff --git a/wav2vec2-base-timit-fine-tuned./config.json b/wav2vec2-base-timit-fine-tuned./config.json new file mode 100644 index 0000000000000000000000000000000000000000..3f36e3992ad6bb713cc84ca1334595fb49014d73 --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned./config.json @@ -0,0 +1,119 @@ +{ + "_name_or_path": "facebook/wav2vec2-base", + "activation_dropout": 0.0, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForCTC" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 256, + "contrastive_logits_temperature": 0.1, + "conv_bias": false, + 
"conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "mean", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "freeze_feat_extract_train": true, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "no_mask_channel_overlap": false, + "no_mask_time_overlap": false, + "num_adapter_layers": 3, + "num_attention_heads": 12, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 12, + "num_negatives": 100, + "output_hidden_size": 768, + "pad_token_id": 28, + "proj_codevector_dim": 256, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.42.0.dev0", + "use_weighted_layer_sum": false, + "vocab_size": 31, + "xvector_output_dim": 512 +} diff --git a/wav2vec2-base-timit-fine-tuned./eval_results.json 
b/wav2vec2-base-timit-fine-tuned./eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..5aa472fae5dd1008057fa12bfb31eff2da58243f --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned./eval_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 20.0, + "eval_loss": 0.42749759554862976, + "eval_runtime": 39.6053, + "eval_samples": 1344, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 33.935, + "eval_wer": 0.41728125284530637 +} \ No newline at end of file diff --git a/wav2vec2-base-timit-fine-tuned./preprocessor_config.json b/wav2vec2-base-timit-fine-tuned./preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c626b5517871d529f0ed94aded16d875d0dd4ea2 --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned./preprocessor_config.json @@ -0,0 +1,10 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "processor_class": "Wav2Vec2Processor", + "return_attention_mask": false, + "sampling_rate": 16000 +} diff --git a/wav2vec2-base-timit-fine-tuned./runs/May19_22-08-09_tz579-raptorlake/events.out.tfevents.1716174523.tz579-raptorlake.65634.0 b/wav2vec2-base-timit-fine-tuned./runs/May19_22-08-09_tz579-raptorlake/events.out.tfevents.1716174523.tz579-raptorlake.65634.0 new file mode 100644 index 0000000000000000000000000000000000000000..f1fc1b1cef312e2a4004f49e2c1958219b7f5495 --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned./runs/May19_22-08-09_tz579-raptorlake/events.out.tfevents.1716174523.tz579-raptorlake.65634.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1499de7f8d44ad8690a4fee9818a4ec46085f303e71f1d916a3979f95334b4f +size 63169 diff --git a/wav2vec2-base-timit-fine-tuned./runs/May19_22-08-09_tz579-raptorlake/events.out.tfevents.1716177937.tz579-raptorlake.65634.1 
b/wav2vec2-base-timit-fine-tuned./runs/May19_22-08-09_tz579-raptorlake/events.out.tfevents.1716177937.tz579-raptorlake.65634.1 new file mode 100644 index 0000000000000000000000000000000000000000..5a9f0ce63bd4940e48d719495812957132b35a23 --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned./runs/May19_22-08-09_tz579-raptorlake/events.out.tfevents.1716177937.tz579-raptorlake.65634.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:761f8f6656c0c227f5c72fd2abed63841c5757356b4cb775dfa24da593234fff +size 406 diff --git a/wav2vec2-base-timit-fine-tuned./special_tokens_map.json b/wav2vec2-base-timit-fine-tuned./special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..7f1669a2a2a16dae6adcf40b222f836ed75cbd1e --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned./special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "[PAD]", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false + }, + "unk_token": { + "content": "[UNK]", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false + } +} diff --git a/wav2vec2-base-timit-fine-tuned./tokenizer_config.json b/wav2vec2-base-timit-fine-tuned./tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fa10530c450758d82ee7c9c929838cf2ba1ac18e --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned./tokenizer_config.json @@ -0,0 +1,48 @@ +{ + "added_tokens_decoder": { + "27": { + "content": "[UNK]", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "28": { + "content": "[PAD]", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + 
"29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": true, + "do_lower_case": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "[PAD]", + "processor_class": "Wav2Vec2Processor", + "replace_word_delimiter_char": " ", + "target_lang": null, + "tokenizer_class": "Wav2Vec2CTCTokenizer", + "unk_token": "[UNK]", + "word_delimiter_token": "|" +} diff --git a/wav2vec2-base-timit-fine-tuned./train_results.json b/wav2vec2-base-timit-fine-tuned./train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..71c4b5ec42dca1b373fbf74579be3c6dc22cde86 --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned./train_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 20.0, + "total_flos": 2.1476719263248095e+18, + "train_loss": 0.8618391515622879, + "train_runtime": 3159.4128, + "train_samples": 3696, + "train_samples_per_second": 23.397, + "train_steps_per_second": 0.734 +} \ No newline at end of file diff --git a/wav2vec2-base-timit-fine-tuned./trainer_state.json b/wav2vec2-base-timit-fine-tuned./trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..647e0e055e0fc22ccc82219d6bc8a793b2ce7eed --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned./trainer_state.json @@ -0,0 +1,1873 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 20.0, + "eval_steps": 100, + "global_step": 2320, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08620689655172414, + "grad_norm": 9.595185279846191, + "learning_rate": 9e-07, + "loss": 9.1142, + "step": 10 + }, + { + "epoch": 0.1724137931034483, + "grad_norm": 9.732986450195312, + 
"learning_rate": 1.9e-06, + "loss": 8.3446, + "step": 20 + }, + { + "epoch": 0.25862068965517243, + "grad_norm": 14.272214889526367, + "learning_rate": 2.8000000000000003e-06, + "loss": 8.6592, + "step": 30 + }, + { + "epoch": 0.3448275862068966, + "grad_norm": 15.0160493850708, + "learning_rate": 3.8e-06, + "loss": 7.6985, + "step": 40 + }, + { + "epoch": 0.43103448275862066, + "grad_norm": 16.610979080200195, + "learning_rate": 4.800000000000001e-06, + "loss": 6.9688, + "step": 50 + }, + { + "epoch": 0.5172413793103449, + "grad_norm": 17.26924705505371, + "learning_rate": 5.8e-06, + "loss": 6.232, + "step": 60 + }, + { + "epoch": 0.603448275862069, + "grad_norm": 11.347734451293945, + "learning_rate": 6.800000000000001e-06, + "loss": 4.7271, + "step": 70 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 4.237112045288086, + "learning_rate": 7.8e-06, + "loss": 3.7919, + "step": 80 + }, + { + "epoch": 0.7758620689655172, + "grad_norm": 1.8833028078079224, + "learning_rate": 8.8e-06, + "loss": 3.3967, + "step": 90 + }, + { + "epoch": 0.8620689655172413, + "grad_norm": 1.3788093328475952, + "learning_rate": 9.800000000000001e-06, + "loss": 3.1618, + "step": 100 + }, + { + "epoch": 0.8620689655172413, + "eval_loss": 3.1117007732391357, + "eval_runtime": 40.0512, + "eval_samples_per_second": 33.557, + "eval_steps_per_second": 33.557, + "eval_wer": 1.0, + "step": 100 + }, + { + "epoch": 0.9482758620689655, + "grad_norm": 1.729278802871704, + "learning_rate": 1.08e-05, + "loss": 3.0865, + "step": 110 + }, + { + "epoch": 1.0344827586206897, + "grad_norm": 1.905969500541687, + "learning_rate": 1.18e-05, + "loss": 3.0809, + "step": 120 + }, + { + "epoch": 1.1206896551724137, + "grad_norm": 0.8360918760299683, + "learning_rate": 1.2800000000000001e-05, + "loss": 3.0346, + "step": 130 + }, + { + "epoch": 1.206896551724138, + "grad_norm": 0.7653716206550598, + "learning_rate": 1.3800000000000002e-05, + "loss": 3.0106, + "step": 140 + }, + { + "epoch": 1.293103448275862, + 
"grad_norm": 0.94779372215271, + "learning_rate": 1.48e-05, + "loss": 3.0165, + "step": 150 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.8457741737365723, + "learning_rate": 1.58e-05, + "loss": 3.0, + "step": 160 + }, + { + "epoch": 1.4655172413793103, + "grad_norm": 1.4369837045669556, + "learning_rate": 1.6800000000000002e-05, + "loss": 2.9903, + "step": 170 + }, + { + "epoch": 1.5517241379310345, + "grad_norm": 1.8290436267852783, + "learning_rate": 1.78e-05, + "loss": 2.9852, + "step": 180 + }, + { + "epoch": 1.6379310344827587, + "grad_norm": 1.1530190706253052, + "learning_rate": 1.88e-05, + "loss": 2.99, + "step": 190 + }, + { + "epoch": 1.7241379310344827, + "grad_norm": 1.1261711120605469, + "learning_rate": 1.9800000000000004e-05, + "loss": 2.9798, + "step": 200 + }, + { + "epoch": 1.7241379310344827, + "eval_loss": 2.9736363887786865, + "eval_runtime": 39.6236, + "eval_samples_per_second": 33.919, + "eval_steps_per_second": 33.919, + "eval_wer": 1.0, + "step": 200 + }, + { + "epoch": 1.8103448275862069, + "grad_norm": 0.903380811214447, + "learning_rate": 2.08e-05, + "loss": 2.9718, + "step": 210 + }, + { + "epoch": 1.896551724137931, + "grad_norm": 0.4889620244503021, + "learning_rate": 2.18e-05, + "loss": 2.9766, + "step": 220 + }, + { + "epoch": 1.9827586206896552, + "grad_norm": 1.3861790895462036, + "learning_rate": 2.2800000000000002e-05, + "loss": 2.9658, + "step": 230 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 0.7976490259170532, + "learning_rate": 2.38e-05, + "loss": 2.9588, + "step": 240 + }, + { + "epoch": 2.1551724137931036, + "grad_norm": 0.698798418045044, + "learning_rate": 2.48e-05, + "loss": 2.9523, + "step": 250 + }, + { + "epoch": 2.2413793103448274, + "grad_norm": 1.0858148336410522, + "learning_rate": 2.58e-05, + "loss": 2.9496, + "step": 260 + }, + { + "epoch": 2.3275862068965516, + "grad_norm": 0.5658290386199951, + "learning_rate": 2.6800000000000004e-05, + "loss": 2.9421, + "step": 270 + }, + { + "epoch": 
2.413793103448276, + "grad_norm": 0.5713534355163574, + "learning_rate": 2.7800000000000005e-05, + "loss": 2.9427, + "step": 280 + }, + { + "epoch": 2.5, + "grad_norm": 0.7386118769645691, + "learning_rate": 2.88e-05, + "loss": 2.9228, + "step": 290 + }, + { + "epoch": 2.586206896551724, + "grad_norm": 0.767816960811615, + "learning_rate": 2.98e-05, + "loss": 2.9144, + "step": 300 + }, + { + "epoch": 2.586206896551724, + "eval_loss": 2.9074809551239014, + "eval_runtime": 39.8997, + "eval_samples_per_second": 33.684, + "eval_steps_per_second": 33.684, + "eval_wer": 1.0, + "step": 300 + }, + { + "epoch": 2.6724137931034484, + "grad_norm": 0.8676608204841614, + "learning_rate": 3.08e-05, + "loss": 2.8965, + "step": 310 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 1.6954621076583862, + "learning_rate": 3.18e-05, + "loss": 2.8815, + "step": 320 + }, + { + "epoch": 2.844827586206897, + "grad_norm": 1.1631884574890137, + "learning_rate": 3.2800000000000004e-05, + "loss": 2.855, + "step": 330 + }, + { + "epoch": 2.9310344827586206, + "grad_norm": 1.625454306602478, + "learning_rate": 3.38e-05, + "loss": 2.781, + "step": 340 + }, + { + "epoch": 3.0172413793103448, + "grad_norm": 2.0763564109802246, + "learning_rate": 3.48e-05, + "loss": 2.7756, + "step": 350 + }, + { + "epoch": 3.103448275862069, + "grad_norm": 2.036031723022461, + "learning_rate": 3.58e-05, + "loss": 2.6458, + "step": 360 + }, + { + "epoch": 3.189655172413793, + "grad_norm": 1.366801142692566, + "learning_rate": 3.68e-05, + "loss": 2.5189, + "step": 370 + }, + { + "epoch": 3.2758620689655173, + "grad_norm": 2.034527540206909, + "learning_rate": 3.7800000000000004e-05, + "loss": 2.433, + "step": 380 + }, + { + "epoch": 3.3620689655172415, + "grad_norm": 3.8338165283203125, + "learning_rate": 3.88e-05, + "loss": 2.2885, + "step": 390 + }, + { + "epoch": 3.4482758620689653, + "grad_norm": 2.3443217277526855, + "learning_rate": 3.9800000000000005e-05, + "loss": 2.1714, + "step": 400 + }, + { + 
"epoch": 3.4482758620689653, + "eval_loss": 2.0944502353668213, + "eval_runtime": 39.7668, + "eval_samples_per_second": 33.797, + "eval_steps_per_second": 33.797, + "eval_wer": 1.0325047801147227, + "step": 400 + }, + { + "epoch": 3.5344827586206895, + "grad_norm": 4.349735260009766, + "learning_rate": 4.08e-05, + "loss": 2.0881, + "step": 410 + }, + { + "epoch": 3.6206896551724137, + "grad_norm": 2.450747489929199, + "learning_rate": 4.18e-05, + "loss": 1.9522, + "step": 420 + }, + { + "epoch": 3.706896551724138, + "grad_norm": 2.2519729137420654, + "learning_rate": 4.2800000000000004e-05, + "loss": 1.8395, + "step": 430 + }, + { + "epoch": 3.793103448275862, + "grad_norm": 2.693664789199829, + "learning_rate": 4.38e-05, + "loss": 1.7525, + "step": 440 + }, + { + "epoch": 3.8793103448275863, + "grad_norm": 1.9744929075241089, + "learning_rate": 4.4800000000000005e-05, + "loss": 1.6222, + "step": 450 + }, + { + "epoch": 3.9655172413793105, + "grad_norm": 3.802494764328003, + "learning_rate": 4.58e-05, + "loss": 1.5397, + "step": 460 + }, + { + "epoch": 4.051724137931035, + "grad_norm": 2.301044225692749, + "learning_rate": 4.6800000000000006e-05, + "loss": 1.4376, + "step": 470 + }, + { + "epoch": 4.137931034482759, + "grad_norm": 2.279372215270996, + "learning_rate": 4.78e-05, + "loss": 1.2829, + "step": 480 + }, + { + "epoch": 4.224137931034483, + "grad_norm": 3.314736843109131, + "learning_rate": 4.88e-05, + "loss": 1.1976, + "step": 490 + }, + { + "epoch": 4.310344827586207, + "grad_norm": 2.434694290161133, + "learning_rate": 4.9800000000000004e-05, + "loss": 1.1579, + "step": 500 + }, + { + "epoch": 4.310344827586207, + "eval_loss": 1.045101284980774, + "eval_runtime": 39.7455, + "eval_samples_per_second": 33.815, + "eval_steps_per_second": 33.815, + "eval_wer": 0.8299189656742239, + "step": 500 + }, + { + "epoch": 4.396551724137931, + "grad_norm": 1.8384031057357788, + "learning_rate": 5.08e-05, + "loss": 1.0684, + "step": 510 + }, + { + "epoch": 
4.482758620689655, + "grad_norm": 3.599148988723755, + "learning_rate": 5.1800000000000005e-05, + "loss": 1.0319, + "step": 520 + }, + { + "epoch": 4.568965517241379, + "grad_norm": 2.066476583480835, + "learning_rate": 5.28e-05, + "loss": 0.9179, + "step": 530 + }, + { + "epoch": 4.655172413793103, + "grad_norm": 2.2173750400543213, + "learning_rate": 5.380000000000001e-05, + "loss": 0.8838, + "step": 540 + }, + { + "epoch": 4.741379310344827, + "grad_norm": 2.427091121673584, + "learning_rate": 5.4800000000000004e-05, + "loss": 0.8991, + "step": 550 + }, + { + "epoch": 4.827586206896552, + "grad_norm": 2.7432241439819336, + "learning_rate": 5.580000000000001e-05, + "loss": 0.8, + "step": 560 + }, + { + "epoch": 4.913793103448276, + "grad_norm": 3.254221200942993, + "learning_rate": 5.68e-05, + "loss": 0.7803, + "step": 570 + }, + { + "epoch": 5.0, + "grad_norm": 4.457448482513428, + "learning_rate": 5.7799999999999995e-05, + "loss": 0.8205, + "step": 580 + }, + { + "epoch": 5.086206896551724, + "grad_norm": 3.1023166179656982, + "learning_rate": 5.88e-05, + "loss": 0.6703, + "step": 590 + }, + { + "epoch": 5.172413793103448, + "grad_norm": 2.5916504859924316, + "learning_rate": 5.9800000000000003e-05, + "loss": 0.6087, + "step": 600 + }, + { + "epoch": 5.172413793103448, + "eval_loss": 0.6753795146942139, + "eval_runtime": 39.7485, + "eval_samples_per_second": 33.813, + "eval_steps_per_second": 33.813, + "eval_wer": 0.6440863152144223, + "step": 600 + }, + { + "epoch": 5.258620689655173, + "grad_norm": 2.1707613468170166, + "learning_rate": 6.08e-05, + "loss": 0.6569, + "step": 610 + }, + { + "epoch": 5.344827586206897, + "grad_norm": 2.4291555881500244, + "learning_rate": 6.18e-05, + "loss": 0.5627, + "step": 620 + }, + { + "epoch": 5.431034482758621, + "grad_norm": 2.249617338180542, + "learning_rate": 6.280000000000001e-05, + "loss": 0.5381, + "step": 630 + }, + { + "epoch": 5.517241379310345, + "grad_norm": 1.6661946773529053, + "learning_rate": 6.38e-05, + 
"loss": 0.6338, + "step": 640 + }, + { + "epoch": 5.603448275862069, + "grad_norm": 2.60294771194458, + "learning_rate": 6.48e-05, + "loss": 0.5181, + "step": 650 + }, + { + "epoch": 5.689655172413794, + "grad_norm": 3.3003089427948, + "learning_rate": 6.58e-05, + "loss": 0.5189, + "step": 660 + }, + { + "epoch": 5.775862068965517, + "grad_norm": 1.880764126777649, + "learning_rate": 6.680000000000001e-05, + "loss": 0.564, + "step": 670 + }, + { + "epoch": 5.862068965517241, + "grad_norm": 2.0575127601623535, + "learning_rate": 6.780000000000001e-05, + "loss": 0.4729, + "step": 680 + }, + { + "epoch": 5.948275862068965, + "grad_norm": 2.5159761905670166, + "learning_rate": 6.879999999999999e-05, + "loss": 0.4899, + "step": 690 + }, + { + "epoch": 6.0344827586206895, + "grad_norm": 1.4463504552841187, + "learning_rate": 6.98e-05, + "loss": 0.481, + "step": 700 + }, + { + "epoch": 6.0344827586206895, + "eval_loss": 0.5275412201881409, + "eval_runtime": 39.9601, + "eval_samples_per_second": 33.634, + "eval_steps_per_second": 33.634, + "eval_wer": 0.5760721114449604, + "step": 700 + }, + { + "epoch": 6.120689655172414, + "grad_norm": 1.788765549659729, + "learning_rate": 7.08e-05, + "loss": 0.3865, + "step": 710 + }, + { + "epoch": 6.206896551724138, + "grad_norm": 1.862762212753296, + "learning_rate": 7.18e-05, + "loss": 0.3726, + "step": 720 + }, + { + "epoch": 6.293103448275862, + "grad_norm": 1.6512093544006348, + "learning_rate": 7.280000000000001e-05, + "loss": 0.4116, + "step": 730 + }, + { + "epoch": 6.379310344827586, + "grad_norm": 2.098067045211792, + "learning_rate": 7.38e-05, + "loss": 0.3779, + "step": 740 + }, + { + "epoch": 6.4655172413793105, + "grad_norm": 3.3030078411102295, + "learning_rate": 7.48e-05, + "loss": 0.3728, + "step": 750 + }, + { + "epoch": 6.551724137931035, + "grad_norm": 2.1799120903015137, + "learning_rate": 7.58e-05, + "loss": 0.4047, + "step": 760 + }, + { + "epoch": 6.637931034482759, + "grad_norm": 1.862434983253479, + 
"learning_rate": 7.680000000000001e-05, + "loss": 0.313, + "step": 770 + }, + { + "epoch": 6.724137931034483, + "grad_norm": 6.29113245010376, + "learning_rate": 7.780000000000001e-05, + "loss": 0.4052, + "step": 780 + }, + { + "epoch": 6.810344827586206, + "grad_norm": 1.4220325946807861, + "learning_rate": 7.88e-05, + "loss": 0.3218, + "step": 790 + }, + { + "epoch": 6.896551724137931, + "grad_norm": 2.586819648742676, + "learning_rate": 7.98e-05, + "loss": 0.3072, + "step": 800 + }, + { + "epoch": 6.896551724137931, + "eval_loss": 0.4836220443248749, + "eval_runtime": 39.8762, + "eval_samples_per_second": 33.704, + "eval_steps_per_second": 33.704, + "eval_wer": 0.5264499681325685, + "step": 800 + }, + { + "epoch": 6.982758620689655, + "grad_norm": 1.6589460372924805, + "learning_rate": 8.080000000000001e-05, + "loss": 0.3862, + "step": 810 + }, + { + "epoch": 7.068965517241379, + "grad_norm": 1.7299175262451172, + "learning_rate": 8.18e-05, + "loss": 0.2938, + "step": 820 + }, + { + "epoch": 7.155172413793103, + "grad_norm": 2.0545098781585693, + "learning_rate": 8.28e-05, + "loss": 0.249, + "step": 830 + }, + { + "epoch": 7.241379310344827, + "grad_norm": 24.935670852661133, + "learning_rate": 8.38e-05, + "loss": 0.3202, + "step": 840 + }, + { + "epoch": 7.327586206896552, + "grad_norm": 2.497840642929077, + "learning_rate": 8.48e-05, + "loss": 0.2803, + "step": 850 + }, + { + "epoch": 7.413793103448276, + "grad_norm": 2.698636531829834, + "learning_rate": 8.58e-05, + "loss": 0.2473, + "step": 860 + }, + { + "epoch": 7.5, + "grad_norm": 1.4561227560043335, + "learning_rate": 8.680000000000001e-05, + "loss": 0.3223, + "step": 870 + }, + { + "epoch": 7.586206896551724, + "grad_norm": 1.7760556936264038, + "learning_rate": 8.78e-05, + "loss": 0.2481, + "step": 880 + }, + { + "epoch": 7.672413793103448, + "grad_norm": 2.308103084564209, + "learning_rate": 8.88e-05, + "loss": 0.2545, + "step": 890 + }, + { + "epoch": 7.758620689655173, + "grad_norm": 
1.4128385782241821, + "learning_rate": 8.98e-05, + "loss": 0.332, + "step": 900 + }, + { + "epoch": 7.758620689655173, + "eval_loss": 0.44030094146728516, + "eval_runtime": 39.9401, + "eval_samples_per_second": 33.65, + "eval_steps_per_second": 33.65, + "eval_wer": 0.5233542747883092, + "step": 900 + }, + { + "epoch": 7.844827586206897, + "grad_norm": 1.7903906106948853, + "learning_rate": 9.080000000000001e-05, + "loss": 0.2411, + "step": 910 + }, + { + "epoch": 7.931034482758621, + "grad_norm": 2.0804216861724854, + "learning_rate": 9.180000000000001e-05, + "loss": 0.2707, + "step": 920 + }, + { + "epoch": 8.017241379310345, + "grad_norm": 1.4420605897903442, + "learning_rate": 9.28e-05, + "loss": 0.3186, + "step": 930 + }, + { + "epoch": 8.10344827586207, + "grad_norm": 2.2910854816436768, + "learning_rate": 9.38e-05, + "loss": 0.1937, + "step": 940 + }, + { + "epoch": 8.189655172413794, + "grad_norm": 3.5892796516418457, + "learning_rate": 9.48e-05, + "loss": 0.2321, + "step": 950 + }, + { + "epoch": 8.275862068965518, + "grad_norm": 1.6509956121444702, + "learning_rate": 9.58e-05, + "loss": 0.2868, + "step": 960 + }, + { + "epoch": 8.362068965517242, + "grad_norm": 1.6983604431152344, + "learning_rate": 9.680000000000001e-05, + "loss": 0.2004, + "step": 970 + }, + { + "epoch": 8.448275862068966, + "grad_norm": 2.061176061630249, + "learning_rate": 9.78e-05, + "loss": 0.2025, + "step": 980 + }, + { + "epoch": 8.53448275862069, + "grad_norm": 1.7732270956039429, + "learning_rate": 9.88e-05, + "loss": 0.2598, + "step": 990 + }, + { + "epoch": 8.620689655172415, + "grad_norm": 1.8335466384887695, + "learning_rate": 9.98e-05, + "loss": 0.1876, + "step": 1000 + }, + { + "epoch": 8.620689655172415, + "eval_loss": 0.4757933020591736, + "eval_runtime": 39.8291, + "eval_samples_per_second": 33.744, + "eval_steps_per_second": 33.744, + "eval_wer": 0.5221706273331512, + "step": 1000 + }, + { + "epoch": 8.706896551724139, + "grad_norm": 2.52902889251709, + "learning_rate": 
9.939393939393939e-05, + "loss": 0.2456, + "step": 1010 + }, + { + "epoch": 8.793103448275861, + "grad_norm": 1.7294162511825562, + "learning_rate": 9.863636363636364e-05, + "loss": 0.2499, + "step": 1020 + }, + { + "epoch": 8.879310344827585, + "grad_norm": 21.9121150970459, + "learning_rate": 9.787878787878789e-05, + "loss": 0.1854, + "step": 1030 + }, + { + "epoch": 8.96551724137931, + "grad_norm": 3.9164559841156006, + "learning_rate": 9.712121212121212e-05, + "loss": 0.2576, + "step": 1040 + }, + { + "epoch": 9.051724137931034, + "grad_norm": 1.239221215248108, + "learning_rate": 9.636363636363637e-05, + "loss": 0.2118, + "step": 1050 + }, + { + "epoch": 9.137931034482758, + "grad_norm": 3.1416544914245605, + "learning_rate": 9.560606060606061e-05, + "loss": 0.1577, + "step": 1060 + }, + { + "epoch": 9.224137931034482, + "grad_norm": 2.4253621101379395, + "learning_rate": 9.484848484848486e-05, + "loss": 0.2092, + "step": 1070 + }, + { + "epoch": 9.310344827586206, + "grad_norm": 1.194345474243164, + "learning_rate": 9.40909090909091e-05, + "loss": 0.1876, + "step": 1080 + }, + { + "epoch": 9.39655172413793, + "grad_norm": 2.411029100418091, + "learning_rate": 9.333333333333334e-05, + "loss": 0.1546, + "step": 1090 + }, + { + "epoch": 9.482758620689655, + "grad_norm": 3.246082067489624, + "learning_rate": 9.257575757575758e-05, + "loss": 0.2232, + "step": 1100 + }, + { + "epoch": 9.482758620689655, + "eval_loss": 0.45077577233314514, + "eval_runtime": 39.9221, + "eval_samples_per_second": 33.666, + "eval_steps_per_second": 33.666, + "eval_wer": 0.48921059819721385, + "step": 1100 + }, + { + "epoch": 9.568965517241379, + "grad_norm": 1.3427454233169556, + "learning_rate": 9.181818181818183e-05, + "loss": 0.1777, + "step": 1110 + }, + { + "epoch": 9.655172413793103, + "grad_norm": 1.5090447664260864, + "learning_rate": 9.106060606060606e-05, + "loss": 0.1646, + "step": 1120 + }, + { + "epoch": 9.741379310344827, + "grad_norm": 1.3060975074768066, + 
"learning_rate": 9.030303030303031e-05, + "loss": 0.225, + "step": 1130 + }, + { + "epoch": 9.827586206896552, + "grad_norm": 1.3011540174484253, + "learning_rate": 8.954545454545455e-05, + "loss": 0.1552, + "step": 1140 + }, + { + "epoch": 9.913793103448276, + "grad_norm": 1.9938538074493408, + "learning_rate": 8.87878787878788e-05, + "loss": 0.1715, + "step": 1150 + }, + { + "epoch": 10.0, + "grad_norm": 3.334385395050049, + "learning_rate": 8.803030303030304e-05, + "loss": 0.2092, + "step": 1160 + }, + { + "epoch": 10.086206896551724, + "grad_norm": 1.011092185974121, + "learning_rate": 8.727272727272727e-05, + "loss": 0.14, + "step": 1170 + }, + { + "epoch": 10.172413793103448, + "grad_norm": 2.517902135848999, + "learning_rate": 8.651515151515152e-05, + "loss": 0.1512, + "step": 1180 + }, + { + "epoch": 10.258620689655173, + "grad_norm": 1.2418378591537476, + "learning_rate": 8.575757575757576e-05, + "loss": 0.1846, + "step": 1190 + }, + { + "epoch": 10.344827586206897, + "grad_norm": 1.5885329246520996, + "learning_rate": 8.5e-05, + "loss": 0.1332, + "step": 1200 + }, + { + "epoch": 10.344827586206897, + "eval_loss": 0.4394075274467468, + "eval_runtime": 39.9367, + "eval_samples_per_second": 33.653, + "eval_steps_per_second": 33.653, + "eval_wer": 0.4740052808886461, + "step": 1200 + }, + { + "epoch": 10.431034482758621, + "grad_norm": 1.2539469003677368, + "learning_rate": 8.424242424242424e-05, + "loss": 0.1485, + "step": 1210 + }, + { + "epoch": 10.517241379310345, + "grad_norm": 1.357601284980774, + "learning_rate": 8.348484848484849e-05, + "loss": 0.1988, + "step": 1220 + }, + { + "epoch": 10.60344827586207, + "grad_norm": 2.0564587116241455, + "learning_rate": 8.272727272727273e-05, + "loss": 0.137, + "step": 1230 + }, + { + "epoch": 10.689655172413794, + "grad_norm": 2.48364520072937, + "learning_rate": 8.196969696969698e-05, + "loss": 0.1245, + "step": 1240 + }, + { + "epoch": 10.775862068965518, + "grad_norm": 1.015891671180725, + "learning_rate": 
8.121212121212121e-05, + "loss": 0.1602, + "step": 1250 + }, + { + "epoch": 10.862068965517242, + "grad_norm": 1.1023950576782227, + "learning_rate": 8.045454545454546e-05, + "loss": 0.1215, + "step": 1260 + }, + { + "epoch": 10.948275862068966, + "grad_norm": 2.703427791595459, + "learning_rate": 7.96969696969697e-05, + "loss": 0.1621, + "step": 1270 + }, + { + "epoch": 11.03448275862069, + "grad_norm": 1.1821691989898682, + "learning_rate": 7.893939393939395e-05, + "loss": 0.1651, + "step": 1280 + }, + { + "epoch": 11.120689655172415, + "grad_norm": 0.930283784866333, + "learning_rate": 7.818181818181818e-05, + "loss": 0.1066, + "step": 1290 + }, + { + "epoch": 11.206896551724139, + "grad_norm": 1.6548758745193481, + "learning_rate": 7.742424242424243e-05, + "loss": 0.1085, + "step": 1300 + }, + { + "epoch": 11.206896551724139, + "eval_loss": 0.4466467499732971, + "eval_runtime": 39.8633, + "eval_samples_per_second": 33.715, + "eval_steps_per_second": 33.715, + "eval_wer": 0.46207775653282346, + "step": 1300 + }, + { + "epoch": 11.293103448275861, + "grad_norm": 1.1760716438293457, + "learning_rate": 7.666666666666667e-05, + "loss": 0.1418, + "step": 1310 + }, + { + "epoch": 11.379310344827585, + "grad_norm": 2.1062755584716797, + "learning_rate": 7.59090909090909e-05, + "loss": 0.1133, + "step": 1320 + }, + { + "epoch": 11.46551724137931, + "grad_norm": 2.67399001121521, + "learning_rate": 7.515151515151515e-05, + "loss": 0.1318, + "step": 1330 + }, + { + "epoch": 11.551724137931034, + "grad_norm": 1.0049142837524414, + "learning_rate": 7.439393939393939e-05, + "loss": 0.1474, + "step": 1340 + }, + { + "epoch": 11.637931034482758, + "grad_norm": 1.586559772491455, + "learning_rate": 7.363636363636364e-05, + "loss": 0.0908, + "step": 1350 + }, + { + "epoch": 11.724137931034482, + "grad_norm": 3.784040927886963, + "learning_rate": 7.287878787878788e-05, + "loss": 0.1521, + "step": 1360 + }, + { + "epoch": 11.810344827586206, + "grad_norm": 1.125501275062561, + 
"learning_rate": 7.212121212121213e-05, + "loss": 0.1163, + "step": 1370 + }, + { + "epoch": 11.89655172413793, + "grad_norm": 2.1989808082580566, + "learning_rate": 7.136363636363636e-05, + "loss": 0.1109, + "step": 1380 + }, + { + "epoch": 11.982758620689655, + "grad_norm": 1.1287301778793335, + "learning_rate": 7.060606060606061e-05, + "loss": 0.152, + "step": 1390 + }, + { + "epoch": 12.068965517241379, + "grad_norm": 1.538678765296936, + "learning_rate": 6.984848484848485e-05, + "loss": 0.098, + "step": 1400 + }, + { + "epoch": 12.068965517241379, + "eval_loss": 0.42302384972572327, + "eval_runtime": 40.1773, + "eval_samples_per_second": 33.452, + "eval_steps_per_second": 33.452, + "eval_wer": 0.44933078393881454, + "step": 1400 + }, + { + "epoch": 12.155172413793103, + "grad_norm": 1.400772213935852, + "learning_rate": 6.90909090909091e-05, + "loss": 0.092, + "step": 1410 + }, + { + "epoch": 12.241379310344827, + "grad_norm": 3.6780846118927, + "learning_rate": 6.833333333333333e-05, + "loss": 0.1649, + "step": 1420 + }, + { + "epoch": 12.327586206896552, + "grad_norm": 1.5424057245254517, + "learning_rate": 6.757575757575758e-05, + "loss": 0.091, + "step": 1430 + }, + { + "epoch": 12.413793103448276, + "grad_norm": 1.4868180751800537, + "learning_rate": 6.681818181818183e-05, + "loss": 0.0869, + "step": 1440 + }, + { + "epoch": 12.5, + "grad_norm": 1.1947145462036133, + "learning_rate": 6.606060606060607e-05, + "loss": 0.1499, + "step": 1450 + }, + { + "epoch": 12.586206896551724, + "grad_norm": 1.0430784225463867, + "learning_rate": 6.530303030303032e-05, + "loss": 0.0954, + "step": 1460 + }, + { + "epoch": 12.672413793103448, + "grad_norm": 2.4261584281921387, + "learning_rate": 6.454545454545455e-05, + "loss": 0.1032, + "step": 1470 + }, + { + "epoch": 12.758620689655173, + "grad_norm": 1.033467411994934, + "learning_rate": 6.37878787878788e-05, + "loss": 0.1158, + "step": 1480 + }, + { + "epoch": 12.844827586206897, + "grad_norm": 1.1535651683807373, + 
"learning_rate": 6.303030303030302e-05, + "loss": 0.0864, + "step": 1490 + }, + { + "epoch": 12.931034482758621, + "grad_norm": 1.28826105594635, + "learning_rate": 6.227272727272727e-05, + "loss": 0.1219, + "step": 1500 + }, + { + "epoch": 12.931034482758621, + "eval_loss": 0.418023020029068, + "eval_runtime": 40.2192, + "eval_samples_per_second": 33.417, + "eval_steps_per_second": 33.417, + "eval_wer": 0.44596194118182647, + "step": 1500 + }, + { + "epoch": 13.017241379310345, + "grad_norm": 1.055411458015442, + "learning_rate": 6.151515151515151e-05, + "loss": 0.1289, + "step": 1510 + }, + { + "epoch": 13.10344827586207, + "grad_norm": 1.1269094944000244, + "learning_rate": 6.075757575757576e-05, + "loss": 0.0776, + "step": 1520 + }, + { + "epoch": 13.189655172413794, + "grad_norm": 1.7149118185043335, + "learning_rate": 6e-05, + "loss": 0.0871, + "step": 1530 + }, + { + "epoch": 13.275862068965518, + "grad_norm": 1.7456856966018677, + "learning_rate": 5.9242424242424244e-05, + "loss": 0.1087, + "step": 1540 + }, + { + "epoch": 13.362068965517242, + "grad_norm": 1.3434715270996094, + "learning_rate": 5.848484848484849e-05, + "loss": 0.0821, + "step": 1550 + }, + { + "epoch": 13.448275862068966, + "grad_norm": 2.103512763977051, + "learning_rate": 5.772727272727273e-05, + "loss": 0.0878, + "step": 1560 + }, + { + "epoch": 13.53448275862069, + "grad_norm": 1.240224838256836, + "learning_rate": 5.696969696969697e-05, + "loss": 0.1044, + "step": 1570 + }, + { + "epoch": 13.620689655172415, + "grad_norm": 0.7336703538894653, + "learning_rate": 5.6212121212121215e-05, + "loss": 0.0753, + "step": 1580 + }, + { + "epoch": 13.706896551724139, + "grad_norm": 2.293342351913452, + "learning_rate": 5.545454545454546e-05, + "loss": 0.1059, + "step": 1590 + }, + { + "epoch": 13.793103448275861, + "grad_norm": 1.1853971481323242, + "learning_rate": 5.46969696969697e-05, + "loss": 0.1021, + "step": 1600 + }, + { + "epoch": 13.793103448275861, + "eval_loss": 0.41785839200019836, 
+ "eval_runtime": 40.2906, + "eval_samples_per_second": 33.358, + "eval_steps_per_second": 33.358, + "eval_wer": 0.4405900027314941, + "step": 1600 + }, + { + "epoch": 13.879310344827585, + "grad_norm": 1.331200361251831, + "learning_rate": 5.393939393939394e-05, + "loss": 0.0648, + "step": 1610 + }, + { + "epoch": 13.96551724137931, + "grad_norm": 2.28397536277771, + "learning_rate": 5.3181818181818186e-05, + "loss": 0.1121, + "step": 1620 + }, + { + "epoch": 14.051724137931034, + "grad_norm": 0.9436893463134766, + "learning_rate": 5.242424242424243e-05, + "loss": 0.0725, + "step": 1630 + }, + { + "epoch": 14.137931034482758, + "grad_norm": 1.6113288402557373, + "learning_rate": 5.166666666666667e-05, + "loss": 0.0691, + "step": 1640 + }, + { + "epoch": 14.224137931034482, + "grad_norm": 2.479888439178467, + "learning_rate": 5.090909090909091e-05, + "loss": 0.0979, + "step": 1650 + }, + { + "epoch": 14.310344827586206, + "grad_norm": 1.006616473197937, + "learning_rate": 5.015151515151515e-05, + "loss": 0.0909, + "step": 1660 + }, + { + "epoch": 14.39655172413793, + "grad_norm": 1.4571704864501953, + "learning_rate": 4.93939393939394e-05, + "loss": 0.0761, + "step": 1670 + }, + { + "epoch": 14.482758620689655, + "grad_norm": 1.5729875564575195, + "learning_rate": 4.863636363636364e-05, + "loss": 0.0862, + "step": 1680 + }, + { + "epoch": 14.568965517241379, + "grad_norm": 1.2180376052856445, + "learning_rate": 4.787878787878788e-05, + "loss": 0.0646, + "step": 1690 + }, + { + "epoch": 14.655172413793103, + "grad_norm": 1.7464072704315186, + "learning_rate": 4.712121212121212e-05, + "loss": 0.0741, + "step": 1700 + }, + { + "epoch": 14.655172413793103, + "eval_loss": 0.4113341271877289, + "eval_runtime": 40.2841, + "eval_samples_per_second": 33.363, + "eval_steps_per_second": 33.363, + "eval_wer": 0.4309387234817445, + "step": 1700 + }, + { + "epoch": 14.741379310344827, + "grad_norm": 0.8571386337280273, + "learning_rate": 4.6439393939393944e-05, + "loss": 0.1315, 
+ "step": 1710 + }, + { + "epoch": 14.827586206896552, + "grad_norm": 1.331377387046814, + "learning_rate": 4.5681818181818186e-05, + "loss": 0.0603, + "step": 1720 + }, + { + "epoch": 14.913793103448276, + "grad_norm": 1.5398732423782349, + "learning_rate": 4.492424242424242e-05, + "loss": 0.0796, + "step": 1730 + }, + { + "epoch": 15.0, + "grad_norm": 3.689671754837036, + "learning_rate": 4.4166666666666665e-05, + "loss": 0.085, + "step": 1740 + }, + { + "epoch": 15.086206896551724, + "grad_norm": 1.132613182067871, + "learning_rate": 4.340909090909091e-05, + "loss": 0.0544, + "step": 1750 + }, + { + "epoch": 15.172413793103448, + "grad_norm": 1.5951859951019287, + "learning_rate": 4.265151515151515e-05, + "loss": 0.0601, + "step": 1760 + }, + { + "epoch": 15.258620689655173, + "grad_norm": 0.5179944634437561, + "learning_rate": 4.189393939393939e-05, + "loss": 0.097, + "step": 1770 + }, + { + "epoch": 15.344827586206897, + "grad_norm": 0.9744370579719543, + "learning_rate": 4.113636363636364e-05, + "loss": 0.0596, + "step": 1780 + }, + { + "epoch": 15.431034482758621, + "grad_norm": 1.8794275522232056, + "learning_rate": 4.0378787878787885e-05, + "loss": 0.0677, + "step": 1790 + }, + { + "epoch": 15.517241379310345, + "grad_norm": 0.748386025428772, + "learning_rate": 3.962121212121213e-05, + "loss": 0.0896, + "step": 1800 + }, + { + "epoch": 15.517241379310345, + "eval_loss": 0.43920788168907166, + "eval_runtime": 40.1997, + "eval_samples_per_second": 33.433, + "eval_steps_per_second": 33.433, + "eval_wer": 0.4307566238732587, + "step": 1800 + }, + { + "epoch": 15.60344827586207, + "grad_norm": 0.9639837145805359, + "learning_rate": 3.8863636363636364e-05, + "loss": 0.0604, + "step": 1810 + }, + { + "epoch": 15.689655172413794, + "grad_norm": 1.9640839099884033, + "learning_rate": 3.810606060606061e-05, + "loss": 0.0711, + "step": 1820 + }, + { + "epoch": 15.775862068965518, + "grad_norm": 1.4438735246658325, + "learning_rate": 3.734848484848485e-05, + "loss": 
0.0867, + "step": 1830 + }, + { + "epoch": 15.862068965517242, + "grad_norm": 1.0062426328659058, + "learning_rate": 3.659090909090909e-05, + "loss": 0.0605, + "step": 1840 + }, + { + "epoch": 15.948275862068966, + "grad_norm": 1.6331523656845093, + "learning_rate": 3.5833333333333335e-05, + "loss": 0.0662, + "step": 1850 + }, + { + "epoch": 16.03448275862069, + "grad_norm": 0.8070217370986938, + "learning_rate": 3.507575757575758e-05, + "loss": 0.0765, + "step": 1860 + }, + { + "epoch": 16.120689655172413, + "grad_norm": 1.4137670993804932, + "learning_rate": 3.431818181818182e-05, + "loss": 0.0537, + "step": 1870 + }, + { + "epoch": 16.20689655172414, + "grad_norm": 1.5437769889831543, + "learning_rate": 3.356060606060606e-05, + "loss": 0.0684, + "step": 1880 + }, + { + "epoch": 16.29310344827586, + "grad_norm": 0.90281081199646, + "learning_rate": 3.2803030303030305e-05, + "loss": 0.0744, + "step": 1890 + }, + { + "epoch": 16.379310344827587, + "grad_norm": 1.139837622642517, + "learning_rate": 3.204545454545455e-05, + "loss": 0.0492, + "step": 1900 + }, + { + "epoch": 16.379310344827587, + "eval_loss": 0.4201890528202057, + "eval_runtime": 40.1502, + "eval_samples_per_second": 33.474, + "eval_steps_per_second": 33.474, + "eval_wer": 0.4313029226987162, + "step": 1900 + }, + { + "epoch": 16.46551724137931, + "grad_norm": 1.679457426071167, + "learning_rate": 3.128787878787879e-05, + "loss": 0.0652, + "step": 1910 + }, + { + "epoch": 16.551724137931036, + "grad_norm": 0.6661111116409302, + "learning_rate": 3.0530303030303034e-05, + "loss": 0.0649, + "step": 1920 + }, + { + "epoch": 16.637931034482758, + "grad_norm": 1.1774355173110962, + "learning_rate": 2.9772727272727273e-05, + "loss": 0.0469, + "step": 1930 + }, + { + "epoch": 16.724137931034484, + "grad_norm": 1.783923864364624, + "learning_rate": 2.901515151515152e-05, + "loss": 0.0752, + "step": 1940 + }, + { + "epoch": 16.810344827586206, + "grad_norm": 1.176321268081665, + "learning_rate": 
2.825757575757576e-05, + "loss": 0.0519, + "step": 1950 + }, + { + "epoch": 16.896551724137932, + "grad_norm": 1.3150608539581299, + "learning_rate": 2.7500000000000004e-05, + "loss": 0.0547, + "step": 1960 + }, + { + "epoch": 16.982758620689655, + "grad_norm": 0.983769953250885, + "learning_rate": 2.674242424242424e-05, + "loss": 0.0799, + "step": 1970 + }, + { + "epoch": 17.06896551724138, + "grad_norm": 0.996890127658844, + "learning_rate": 2.5984848484848483e-05, + "loss": 0.0577, + "step": 1980 + }, + { + "epoch": 17.155172413793103, + "grad_norm": 2.3034253120422363, + "learning_rate": 2.5227272727272726e-05, + "loss": 0.0515, + "step": 1990 + }, + { + "epoch": 17.24137931034483, + "grad_norm": 3.7528610229492188, + "learning_rate": 2.4469696969696972e-05, + "loss": 0.0759, + "step": 2000 + }, + { + "epoch": 17.24137931034483, + "eval_loss": 0.43480169773101807, + "eval_runtime": 40.017, + "eval_samples_per_second": 33.586, + "eval_steps_per_second": 33.586, + "eval_wer": 0.4207411454065374, + "step": 2000 + }, + { + "epoch": 17.32758620689655, + "grad_norm": 0.6646668314933777, + "learning_rate": 2.3712121212121214e-05, + "loss": 0.0419, + "step": 2010 + }, + { + "epoch": 17.413793103448278, + "grad_norm": 1.3250740766525269, + "learning_rate": 2.2954545454545457e-05, + "loss": 0.0595, + "step": 2020 + }, + { + "epoch": 17.5, + "grad_norm": 0.8094995021820068, + "learning_rate": 2.21969696969697e-05, + "loss": 0.0691, + "step": 2030 + }, + { + "epoch": 17.586206896551722, + "grad_norm": 0.846946120262146, + "learning_rate": 2.143939393939394e-05, + "loss": 0.052, + "step": 2040 + }, + { + "epoch": 17.67241379310345, + "grad_norm": 1.652417540550232, + "learning_rate": 2.0681818181818182e-05, + "loss": 0.0565, + "step": 2050 + }, + { + "epoch": 17.75862068965517, + "grad_norm": 1.0080279111862183, + "learning_rate": 1.9924242424242425e-05, + "loss": 0.0745, + "step": 2060 + }, + { + "epoch": 17.844827586206897, + "grad_norm": 0.7252691388130188, + 
"learning_rate": 1.9166666666666667e-05, + "loss": 0.0513, + "step": 2070 + }, + { + "epoch": 17.93103448275862, + "grad_norm": 1.58548903465271, + "learning_rate": 1.840909090909091e-05, + "loss": 0.055, + "step": 2080 + }, + { + "epoch": 18.017241379310345, + "grad_norm": 0.6634634733200073, + "learning_rate": 1.7651515151515153e-05, + "loss": 0.0658, + "step": 2090 + }, + { + "epoch": 18.103448275862068, + "grad_norm": 1.1495524644851685, + "learning_rate": 1.6893939393939395e-05, + "loss": 0.0406, + "step": 2100 + }, + { + "epoch": 18.103448275862068, + "eval_loss": 0.44191813468933105, + "eval_runtime": 40.0967, + "eval_samples_per_second": 33.519, + "eval_steps_per_second": 33.519, + "eval_wer": 0.42046799599380863, + "step": 2100 + }, + { + "epoch": 18.189655172413794, + "grad_norm": 0.9788354635238647, + "learning_rate": 1.6136363636363638e-05, + "loss": 0.0381, + "step": 2110 + }, + { + "epoch": 18.275862068965516, + "grad_norm": 1.093633770942688, + "learning_rate": 1.5378787878787877e-05, + "loss": 0.071, + "step": 2120 + }, + { + "epoch": 18.362068965517242, + "grad_norm": 0.7164376974105835, + "learning_rate": 1.4621212121212122e-05, + "loss": 0.0439, + "step": 2130 + }, + { + "epoch": 18.448275862068964, + "grad_norm": 0.9887032508850098, + "learning_rate": 1.3863636363636364e-05, + "loss": 0.0481, + "step": 2140 + }, + { + "epoch": 18.53448275862069, + "grad_norm": 0.45052286982536316, + "learning_rate": 1.3106060606060607e-05, + "loss": 0.0571, + "step": 2150 + }, + { + "epoch": 18.620689655172413, + "grad_norm": 1.167181134223938, + "learning_rate": 1.234848484848485e-05, + "loss": 0.0452, + "step": 2160 + }, + { + "epoch": 18.70689655172414, + "grad_norm": 1.378661870956421, + "learning_rate": 1.159090909090909e-05, + "loss": 0.0643, + "step": 2170 + }, + { + "epoch": 18.79310344827586, + "grad_norm": 0.854932963848114, + "learning_rate": 1.0833333333333334e-05, + "loss": 0.0587, + "step": 2180 + }, + { + "epoch": 18.879310344827587, + 
"grad_norm": 0.8007526397705078, + "learning_rate": 1.0075757575757576e-05, + "loss": 0.0395, + "step": 2190 + }, + { + "epoch": 18.96551724137931, + "grad_norm": 3.317830801010132, + "learning_rate": 9.318181818181819e-06, + "loss": 0.074, + "step": 2200 + }, + { + "epoch": 18.96551724137931, + "eval_loss": 0.43061742186546326, + "eval_runtime": 40.0034, + "eval_samples_per_second": 33.597, + "eval_steps_per_second": 33.597, + "eval_wer": 0.420012746972594, + "step": 2200 + }, + { + "epoch": 19.051724137931036, + "grad_norm": 0.7710875272750854, + "learning_rate": 8.56060606060606e-06, + "loss": 0.046, + "step": 2210 + }, + { + "epoch": 19.137931034482758, + "grad_norm": 0.5200530886650085, + "learning_rate": 7.803030303030304e-06, + "loss": 0.0394, + "step": 2220 + }, + { + "epoch": 19.224137931034484, + "grad_norm": 1.3544327020645142, + "learning_rate": 7.045454545454545e-06, + "loss": 0.0582, + "step": 2230 + }, + { + "epoch": 19.310344827586206, + "grad_norm": 0.8653574585914612, + "learning_rate": 6.287878787878789e-06, + "loss": 0.0606, + "step": 2240 + }, + { + "epoch": 19.396551724137932, + "grad_norm": 1.5852700471878052, + "learning_rate": 5.530303030303031e-06, + "loss": 0.0367, + "step": 2250 + }, + { + "epoch": 19.482758620689655, + "grad_norm": 2.2167246341705322, + "learning_rate": 4.772727272727273e-06, + "loss": 0.0782, + "step": 2260 + }, + { + "epoch": 19.56896551724138, + "grad_norm": 0.5891330242156982, + "learning_rate": 4.015151515151515e-06, + "loss": 0.0416, + "step": 2270 + }, + { + "epoch": 19.655172413793103, + "grad_norm": 1.1137330532073975, + "learning_rate": 3.257575757575758e-06, + "loss": 0.0515, + "step": 2280 + }, + { + "epoch": 19.74137931034483, + "grad_norm": 0.8132285475730896, + "learning_rate": 2.5e-06, + "loss": 0.0512, + "step": 2290 + }, + { + "epoch": 19.82758620689655, + "grad_norm": 0.7994781136512756, + "learning_rate": 1.7424242424242427e-06, + "loss": 0.0378, + "step": 2300 + }, + { + "epoch": 19.82758620689655, 
+ "eval_loss": 0.4273350238800049, + "eval_runtime": 40.0934, + "eval_samples_per_second": 33.522, + "eval_steps_per_second": 33.522, + "eval_wer": 0.41728125284530637, + "step": 2300 + }, + { + "epoch": 19.913793103448278, + "grad_norm": 0.9775754809379578, + "learning_rate": 9.848484848484847e-07, + "loss": 0.0489, + "step": 2310 + }, + { + "epoch": 20.0, + "grad_norm": 0.8857516050338745, + "learning_rate": 2.2727272727272726e-07, + "loss": 0.0554, + "step": 2320 + }, + { + "epoch": 20.0, + "step": 2320, + "total_flos": 2.1476719263248095e+18, + "train_loss": 0.8618391515622879, + "train_runtime": 3159.4128, + "train_samples_per_second": 23.397, + "train_steps_per_second": 0.734 + } + ], + "logging_steps": 10, + "max_steps": 2320, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 400, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.1476719263248095e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/wav2vec2-base-timit-fine-tuned./training_args.bin b/wav2vec2-base-timit-fine-tuned./training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..705e4cb7fa647ea01e9e1a80ab8fef0ae79990c1 --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned./training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abed99ebdf15c43d2882e1b9d49f7e81da386dc7c0be97a54f7bddbea730415d +size 5112 diff --git a/wav2vec2-base-timit-fine-tuned./vocab.json b/wav2vec2-base-timit-fine-tuned./vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..4540c2af87902c582c6b65cb6a96c93e879a4090 --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned./vocab.json @@ -0,0 +1,31 @@ +{ + "[PAD]": 28, + "[UNK]": 27, + "a": 1, + "b": 2, + "c": 3, + "d": 4, + "e": 5, + "f": 6, + "g": 
7, + "h": 8, + "i": 9, + "j": 10, + "k": 11, + "l": 12, + "m": 13, + "n": 14, + "o": 15, + "p": 16, + "q": 17, + "r": 18, + "s": 19, + "t": 20, + "u": 21, + "v": 22, + "w": 23, + "x": 24, + "y": 25, + "z": 26, + "|": 0 +} diff --git a/wav2vec2-base-timit-fine-tuned/README.md b/wav2vec2-base-timit-fine-tuned/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b0b4c6d73a25b1c832460f8931e519dd6724ef83 --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned/README.md @@ -0,0 +1,101 @@ +--- +license: apache-2.0 +base_model: facebook/wav2vec2-base +tags: +- automatic-speech-recognition +- timit_asr +- generated_from_trainer +datasets: +- timit_asr +metrics: +- wer +model-index: +- name: wav2vec2-base-timit-fine-tuned + results: + - task: + name: Automatic Speech Recognition + type: automatic-speech-recognition + dataset: + name: TIMIT_ASR - NA + type: timit_asr + config: clean + split: test + args: 'Config: na, Training split: train, Eval split: test' + metrics: + - name: Wer + type: wer + value: 0.4090867704634435 +--- + + + +# wav2vec2-base-timit-fine-tuned + +This model is a fine-tuned version of [facebook/wav2vec2-base](https://huggingface.co./facebook/wav2vec2-base) on the TIMIT_ASR - NA dataset. 
+It achieves the following results on the evaluation set: +- Loss: 0.4218 +- Wer: 0.4091 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 0.0001 +- train_batch_size: 32 +- eval_batch_size: 1 +- seed: 42 +- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08 +- lr_scheduler_type: linear +- lr_scheduler_warmup_steps: 1000 +- num_epochs: 20.0 +- mixed_precision_training: Native AMP + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | Wer | +|:-------------:|:-------:|:----:|:---------------:|:------:| +| 3.1612 | 0.8621 | 100 | 3.1181 | 1.0 | +| 2.978 | 1.7241 | 200 | 2.9722 | 1.0 | +| 2.9185 | 2.5862 | 300 | 2.9098 | 1.0 | +| 2.1282 | 3.4483 | 400 | 2.0066 | 1.0247 | +| 1.1234 | 4.3103 | 500 | 1.0197 | 0.8393 | +| 0.602 | 5.1724 | 600 | 0.6714 | 0.6600 | +| 0.5032 | 6.0345 | 700 | 0.5285 | 0.5659 | +| 0.3101 | 6.8966 | 800 | 0.4819 | 0.5282 | +| 0.3432 | 7.7586 | 900 | 0.4653 | 0.5272 | +| 0.1922 | 8.6207 | 1000 | 0.4672 | 0.4918 | +| 0.2284 | 9.4828 | 1100 | 0.4834 | 0.4870 | +| 0.1372 | 10.3448 | 1200 | 0.4380 | 0.4727 | +| 0.1105 | 11.2069 | 1300 | 0.4509 | 0.4594 | +| 0.0992 | 12.0690 | 1400 | 0.4196 | 0.4544 | +| 0.1226 | 12.9310 | 1500 | 0.4237 | 0.4321 | +| 0.1013 | 13.7931 | 1600 | 0.4113 | 0.4298 | +| 0.0661 | 14.6552 | 1700 | 0.4038 | 0.4276 | +| 0.0901 | 15.5172 | 1800 | 0.4321 | 0.4225 | +| 0.053 | 16.3793 | 1900 | 0.4076 | 0.4236 | +| 0.0805 | 17.2414 | 2000 | 0.4336 | 0.4156 | +| 0.049 | 18.1034 | 2100 | 0.4193 | 0.4114 | +| 0.0717 | 18.9655 | 2200 | 0.4139 | 0.4091 | +| 0.0389 | 19.8276 | 2300 | 0.4216 | 0.4087 | + + +### Framework versions + +- Transformers 4.42.0.dev0 +- Pytorch 2.3.0a0+git71dd2de +- Datasets 2.19.1 +- Tokenizers 0.19.1 diff --git 
a/wav2vec2-base-timit-fine-tuned/added_tokens.json b/wav2vec2-base-timit-fine-tuned/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..cc95e40ec61ca6dc6f02948ceaad78a75e854f3f --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned/added_tokens.json @@ -0,0 +1,4 @@ +{ + "": 30, + "": 29 +} diff --git a/wav2vec2-base-timit-fine-tuned/all_results.json b/wav2vec2-base-timit-fine-tuned/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..c1af96b4a9e3e18a1115bc424db3f3b4a462b826 --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned/all_results.json @@ -0,0 +1,15 @@ +{ + "epoch": 20.0, + "eval_loss": 0.42176610231399536, + "eval_runtime": 39.428, + "eval_samples": 1344, + "eval_samples_per_second": 34.087, + "eval_steps_per_second": 34.087, + "eval_wer": 0.4090867704634435, + "total_flos": 2.1476719263248095e+18, + "train_loss": 0.8590125822430027, + "train_runtime": 3151.1477, + "train_samples": 3696, + "train_samples_per_second": 23.458, + "train_steps_per_second": 0.736 +} \ No newline at end of file diff --git a/wav2vec2-base-timit-fine-tuned/config.json b/wav2vec2-base-timit-fine-tuned/config.json new file mode 100644 index 0000000000000000000000000000000000000000..3f36e3992ad6bb713cc84ca1334595fb49014d73 --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned/config.json @@ -0,0 +1,119 @@ +{ + "_name_or_path": "facebook/wav2vec2-base", + "activation_dropout": 0.0, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForCTC" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 256, + "contrastive_logits_temperature": 0.1, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": 
"mean", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "freeze_feat_extract_train": true, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "no_mask_channel_overlap": false, + "no_mask_time_overlap": false, + "num_adapter_layers": 3, + "num_attention_heads": 12, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 12, + "num_negatives": 100, + "output_hidden_size": 768, + "pad_token_id": 28, + "proj_codevector_dim": 256, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.42.0.dev0", + "use_weighted_layer_sum": false, + "vocab_size": 31, + "xvector_output_dim": 512 +} diff --git a/wav2vec2-base-timit-fine-tuned/eval_results.json b/wav2vec2-base-timit-fine-tuned/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..0ba977ff9a7faa9fe52c752f18cf73efaaba8911 --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned/eval_results.json @@ 
-0,0 +1,9 @@ +{ + "epoch": 20.0, + "eval_loss": 0.42176610231399536, + "eval_runtime": 39.428, + "eval_samples": 1344, + "eval_samples_per_second": 34.087, + "eval_steps_per_second": 34.087, + "eval_wer": 0.4090867704634435 +} \ No newline at end of file diff --git a/wav2vec2-base-timit-fine-tuned/preprocessor_config.json b/wav2vec2-base-timit-fine-tuned/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c626b5517871d529f0ed94aded16d875d0dd4ea2 --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned/preprocessor_config.json @@ -0,0 +1,10 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "processor_class": "Wav2Vec2Processor", + "return_attention_mask": false, + "sampling_rate": 16000 +} diff --git a/wav2vec2-base-timit-fine-tuned/runs/May24_13-33-16_tz579-raptorlake/events.out.tfevents.1716575663.tz579-raptorlake.8629.0 b/wav2vec2-base-timit-fine-tuned/runs/May24_13-33-16_tz579-raptorlake/events.out.tfevents.1716575663.tz579-raptorlake.8629.0 new file mode 100644 index 0000000000000000000000000000000000000000..947a5bba77d7234b623526cf7c71d9fd9bd84609 --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned/runs/May24_13-33-16_tz579-raptorlake/events.out.tfevents.1716575663.tz579-raptorlake.8629.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b3eb131616f5351e9874ffdb5016b298b669748ac6ada590496ad43d4f8c552 +size 63169 diff --git a/wav2vec2-base-timit-fine-tuned/runs/May24_13-33-16_tz579-raptorlake/events.out.tfevents.1716578945.tz579-raptorlake.8629.1 b/wav2vec2-base-timit-fine-tuned/runs/May24_13-33-16_tz579-raptorlake/events.out.tfevents.1716578945.tz579-raptorlake.8629.1 new file mode 100644 index 0000000000000000000000000000000000000000..fe8eac4b991732b85a6d6586c3bd2ff604a2534c --- /dev/null +++ 
b/wav2vec2-base-timit-fine-tuned/runs/May24_13-33-16_tz579-raptorlake/events.out.tfevents.1716578945.tz579-raptorlake.8629.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c9ee27ee5a4f448bdea41fb3a7d1a43aa1359d8b96e5a3d4273a7e92ffa9066 +size 406 diff --git a/wav2vec2-base-timit-fine-tuned/special_tokens_map.json b/wav2vec2-base-timit-fine-tuned/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..7f1669a2a2a16dae6adcf40b222f836ed75cbd1e --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "[PAD]", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false + }, + "unk_token": { + "content": "[UNK]", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false + } +} diff --git a/wav2vec2-base-timit-fine-tuned/tokenizer_config.json b/wav2vec2-base-timit-fine-tuned/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fa10530c450758d82ee7c9c929838cf2ba1ac18e --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned/tokenizer_config.json @@ -0,0 +1,48 @@ +{ + "added_tokens_decoder": { + "27": { + "content": "[UNK]", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "28": { + "content": "[PAD]", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
} + }, + "bos_token": "", + "clean_up_tokenization_spaces": true, + "do_lower_case": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "[PAD]", + "processor_class": "Wav2Vec2Processor", + "replace_word_delimiter_char": " ", + "target_lang": null, + "tokenizer_class": "Wav2Vec2CTCTokenizer", + "unk_token": "[UNK]", + "word_delimiter_token": "|" +} diff --git a/wav2vec2-base-timit-fine-tuned/train_results.json b/wav2vec2-base-timit-fine-tuned/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..60aec3ec772286226cd5c83de23c799ed323ba18 --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned/train_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 20.0, + "total_flos": 2.1476719263248095e+18, + "train_loss": 0.8590125822430027, + "train_runtime": 3151.1477, + "train_samples": 3696, + "train_samples_per_second": 23.458, + "train_steps_per_second": 0.736 +} \ No newline at end of file diff --git a/wav2vec2-base-timit-fine-tuned/trainer_state.json b/wav2vec2-base-timit-fine-tuned/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6ea8eab3e505139e995c8c2d7d8aa3289a7d6023 --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned/trainer_state.json @@ -0,0 +1,1873 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 20.0, + "eval_steps": 100, + "global_step": 2320, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08620689655172414, + "grad_norm": 9.590513229370117, + "learning_rate": 9e-07, + "loss": 9.1142, + "step": 10 + }, + { + "epoch": 0.1724137931034483, + "grad_norm": 9.729034423828125, + "learning_rate": 1.9e-06, + "loss": 8.3444, + "step": 20 + }, + { + "epoch": 0.25862068965517243, + "grad_norm": 14.263296127319336, + "learning_rate": 2.8000000000000003e-06, + "loss": 8.6571, + "step": 30 + }, + { + "epoch": 0.3448275862068966, + "grad_norm": 15.165566444396973, + 
"learning_rate": 3.8e-06, + "loss": 7.6933, + "step": 40 + }, + { + "epoch": 0.43103448275862066, + "grad_norm": 16.664026260375977, + "learning_rate": 4.800000000000001e-06, + "loss": 6.9526, + "step": 50 + }, + { + "epoch": 0.5172413793103449, + "grad_norm": 17.301790237426758, + "learning_rate": 5.8e-06, + "loss": 6.2005, + "step": 60 + }, + { + "epoch": 0.603448275862069, + "grad_norm": 11.270517349243164, + "learning_rate": 6.800000000000001e-06, + "loss": 4.7052, + "step": 70 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 4.181836128234863, + "learning_rate": 7.8e-06, + "loss": 3.7839, + "step": 80 + }, + { + "epoch": 0.7758620689655172, + "grad_norm": 1.8710567951202393, + "learning_rate": 8.8e-06, + "loss": 3.3947, + "step": 90 + }, + { + "epoch": 0.8620689655172413, + "grad_norm": 1.407578706741333, + "learning_rate": 9.800000000000001e-06, + "loss": 3.1612, + "step": 100 + }, + { + "epoch": 0.8620689655172413, + "eval_loss": 3.118124485015869, + "eval_runtime": 39.9765, + "eval_samples_per_second": 33.62, + "eval_steps_per_second": 33.62, + "eval_wer": 1.0, + "step": 100 + }, + { + "epoch": 0.9482758620689655, + "grad_norm": 1.6558985710144043, + "learning_rate": 1.08e-05, + "loss": 3.0858, + "step": 110 + }, + { + "epoch": 1.0344827586206897, + "grad_norm": 1.8191890716552734, + "learning_rate": 1.18e-05, + "loss": 3.0791, + "step": 120 + }, + { + "epoch": 1.1206896551724137, + "grad_norm": 0.9610480070114136, + "learning_rate": 1.2800000000000001e-05, + "loss": 3.034, + "step": 130 + }, + { + "epoch": 1.206896551724138, + "grad_norm": 0.7719367742538452, + "learning_rate": 1.3800000000000002e-05, + "loss": 3.0098, + "step": 140 + }, + { + "epoch": 1.293103448275862, + "grad_norm": 0.7965385913848877, + "learning_rate": 1.48e-05, + "loss": 3.0138, + "step": 150 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.9387674927711487, + "learning_rate": 1.58e-05, + "loss": 2.9973, + "step": 160 + }, + { + "epoch": 1.4655172413793103, + 
"grad_norm": 0.8531718254089355, + "learning_rate": 1.6800000000000002e-05, + "loss": 2.9875, + "step": 170 + }, + { + "epoch": 1.5517241379310345, + "grad_norm": 0.5282021760940552, + "learning_rate": 1.78e-05, + "loss": 2.9905, + "step": 180 + }, + { + "epoch": 1.6379310344827587, + "grad_norm": 1.1362160444259644, + "learning_rate": 1.88e-05, + "loss": 2.9881, + "step": 190 + }, + { + "epoch": 1.7241379310344827, + "grad_norm": 1.0381989479064941, + "learning_rate": 1.9800000000000004e-05, + "loss": 2.978, + "step": 200 + }, + { + "epoch": 1.7241379310344827, + "eval_loss": 2.9721522331237793, + "eval_runtime": 39.4986, + "eval_samples_per_second": 34.026, + "eval_steps_per_second": 34.026, + "eval_wer": 1.0, + "step": 200 + }, + { + "epoch": 1.8103448275862069, + "grad_norm": 1.0819815397262573, + "learning_rate": 2.08e-05, + "loss": 2.9728, + "step": 210 + }, + { + "epoch": 1.896551724137931, + "grad_norm": 0.5016360878944397, + "learning_rate": 2.18e-05, + "loss": 2.9769, + "step": 220 + }, + { + "epoch": 1.9827586206896552, + "grad_norm": 1.476746678352356, + "learning_rate": 2.2800000000000002e-05, + "loss": 2.9638, + "step": 230 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 0.416255384683609, + "learning_rate": 2.38e-05, + "loss": 2.9566, + "step": 240 + }, + { + "epoch": 2.1551724137931036, + "grad_norm": 0.5918602347373962, + "learning_rate": 2.48e-05, + "loss": 2.9507, + "step": 250 + }, + { + "epoch": 2.2413793103448274, + "grad_norm": 0.8085893988609314, + "learning_rate": 2.58e-05, + "loss": 2.9469, + "step": 260 + }, + { + "epoch": 2.3275862068965516, + "grad_norm": 0.8484460115432739, + "learning_rate": 2.6800000000000004e-05, + "loss": 2.9438, + "step": 270 + }, + { + "epoch": 2.413793103448276, + "grad_norm": 0.4717480540275574, + "learning_rate": 2.7800000000000005e-05, + "loss": 2.9421, + "step": 280 + }, + { + "epoch": 2.5, + "grad_norm": 2.0097577571868896, + "learning_rate": 2.88e-05, + "loss": 2.9234, + "step": 290 + }, + { + 
"epoch": 2.586206896551724, + "grad_norm": 1.3988488912582397, + "learning_rate": 2.98e-05, + "loss": 2.9185, + "step": 300 + }, + { + "epoch": 2.586206896551724, + "eval_loss": 2.909769058227539, + "eval_runtime": 39.5502, + "eval_samples_per_second": 33.982, + "eval_steps_per_second": 33.982, + "eval_wer": 1.0, + "step": 300 + }, + { + "epoch": 2.6724137931034484, + "grad_norm": 2.138706922531128, + "learning_rate": 3.08e-05, + "loss": 2.9028, + "step": 310 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 0.854528546333313, + "learning_rate": 3.18e-05, + "loss": 2.8889, + "step": 320 + }, + { + "epoch": 2.844827586206897, + "grad_norm": 0.8741695284843445, + "learning_rate": 3.2800000000000004e-05, + "loss": 2.8698, + "step": 330 + }, + { + "epoch": 2.9310344827586206, + "grad_norm": 2.8528709411621094, + "learning_rate": 3.38e-05, + "loss": 2.7851, + "step": 340 + }, + { + "epoch": 3.0172413793103448, + "grad_norm": 2.0051960945129395, + "learning_rate": 3.48e-05, + "loss": 2.7537, + "step": 350 + }, + { + "epoch": 3.103448275862069, + "grad_norm": 1.6813557147979736, + "learning_rate": 3.58e-05, + "loss": 2.6125, + "step": 360 + }, + { + "epoch": 3.189655172413793, + "grad_norm": 2.679445266723633, + "learning_rate": 3.68e-05, + "loss": 2.4925, + "step": 370 + }, + { + "epoch": 3.2758620689655173, + "grad_norm": 1.8131614923477173, + "learning_rate": 3.7800000000000004e-05, + "loss": 2.4007, + "step": 380 + }, + { + "epoch": 3.3620689655172415, + "grad_norm": 2.168797016143799, + "learning_rate": 3.88e-05, + "loss": 2.2533, + "step": 390 + }, + { + "epoch": 3.4482758620689653, + "grad_norm": 2.1998753547668457, + "learning_rate": 3.9800000000000005e-05, + "loss": 2.1282, + "step": 400 + }, + { + "epoch": 3.4482758620689653, + "eval_loss": 2.0066494941711426, + "eval_runtime": 39.5789, + "eval_samples_per_second": 33.958, + "eval_steps_per_second": 33.958, + "eval_wer": 1.0246744969498316, + "step": 400 + }, + { + "epoch": 3.5344827586206895, + 
"grad_norm": 4.27025842666626, + "learning_rate": 4.08e-05, + "loss": 2.0277, + "step": 410 + }, + { + "epoch": 3.6206896551724137, + "grad_norm": 2.3157191276550293, + "learning_rate": 4.18e-05, + "loss": 1.8844, + "step": 420 + }, + { + "epoch": 3.706896551724138, + "grad_norm": 3.1167590618133545, + "learning_rate": 4.2800000000000004e-05, + "loss": 1.7777, + "step": 430 + }, + { + "epoch": 3.793103448275862, + "grad_norm": 4.118265151977539, + "learning_rate": 4.38e-05, + "loss": 1.7015, + "step": 440 + }, + { + "epoch": 3.8793103448275863, + "grad_norm": 2.6515376567840576, + "learning_rate": 4.4800000000000005e-05, + "loss": 1.5855, + "step": 450 + }, + { + "epoch": 3.9655172413793105, + "grad_norm": 3.6305439472198486, + "learning_rate": 4.58e-05, + "loss": 1.5015, + "step": 460 + }, + { + "epoch": 4.051724137931035, + "grad_norm": 1.8111392259597778, + "learning_rate": 4.6800000000000006e-05, + "loss": 1.3926, + "step": 470 + }, + { + "epoch": 4.137931034482759, + "grad_norm": 2.117809295654297, + "learning_rate": 4.78e-05, + "loss": 1.2363, + "step": 480 + }, + { + "epoch": 4.224137931034483, + "grad_norm": 2.7456188201904297, + "learning_rate": 4.88e-05, + "loss": 1.156, + "step": 490 + }, + { + "epoch": 4.310344827586207, + "grad_norm": 2.3859684467315674, + "learning_rate": 4.9800000000000004e-05, + "loss": 1.1234, + "step": 500 + }, + { + "epoch": 4.310344827586207, + "eval_loss": 1.0196667909622192, + "eval_runtime": 39.5574, + "eval_samples_per_second": 33.976, + "eval_steps_per_second": 33.976, + "eval_wer": 0.8392970955112447, + "step": 500 + }, + { + "epoch": 4.396551724137931, + "grad_norm": 2.3013603687286377, + "learning_rate": 5.08e-05, + "loss": 1.0335, + "step": 510 + }, + { + "epoch": 4.482758620689655, + "grad_norm": 5.266668796539307, + "learning_rate": 5.1800000000000005e-05, + "loss": 1.002, + "step": 520 + }, + { + "epoch": 4.568965517241379, + "grad_norm": 2.516477346420288, + "learning_rate": 5.28e-05, + "loss": 0.915, + "step": 530 
+ }, + { + "epoch": 4.655172413793103, + "grad_norm": 2.389324903488159, + "learning_rate": 5.380000000000001e-05, + "loss": 0.8828, + "step": 540 + }, + { + "epoch": 4.741379310344827, + "grad_norm": 2.378732204437256, + "learning_rate": 5.4800000000000004e-05, + "loss": 0.8822, + "step": 550 + }, + { + "epoch": 4.827586206896552, + "grad_norm": 2.126372814178467, + "learning_rate": 5.580000000000001e-05, + "loss": 0.7924, + "step": 560 + }, + { + "epoch": 4.913793103448276, + "grad_norm": 2.4130356311798096, + "learning_rate": 5.68e-05, + "loss": 0.7577, + "step": 570 + }, + { + "epoch": 5.0, + "grad_norm": 4.412507057189941, + "learning_rate": 5.7799999999999995e-05, + "loss": 0.8124, + "step": 580 + }, + { + "epoch": 5.086206896551724, + "grad_norm": 2.4421167373657227, + "learning_rate": 5.88e-05, + "loss": 0.6599, + "step": 590 + }, + { + "epoch": 5.172413793103448, + "grad_norm": 2.1765213012695312, + "learning_rate": 5.9800000000000003e-05, + "loss": 0.602, + "step": 600 + }, + { + "epoch": 5.172413793103448, + "eval_loss": 0.6714467406272888, + "eval_runtime": 39.6477, + "eval_samples_per_second": 33.899, + "eval_steps_per_second": 33.899, + "eval_wer": 0.6600200309569334, + "step": 600 + }, + { + "epoch": 5.258620689655173, + "grad_norm": 2.5929956436157227, + "learning_rate": 6.08e-05, + "loss": 0.6666, + "step": 610 + }, + { + "epoch": 5.344827586206897, + "grad_norm": 2.023226261138916, + "learning_rate": 6.18e-05, + "loss": 0.557, + "step": 620 + }, + { + "epoch": 5.431034482758621, + "grad_norm": 2.348583936691284, + "learning_rate": 6.280000000000001e-05, + "loss": 0.536, + "step": 630 + }, + { + "epoch": 5.517241379310345, + "grad_norm": 1.9607300758361816, + "learning_rate": 6.38e-05, + "loss": 0.618, + "step": 640 + }, + { + "epoch": 5.603448275862069, + "grad_norm": 2.136683940887451, + "learning_rate": 6.48e-05, + "loss": 0.5107, + "step": 650 + }, + { + "epoch": 5.689655172413794, + "grad_norm": 2.5568346977233887, + "learning_rate": 6.58e-05, 
+ "loss": 0.512, + "step": 660 + }, + { + "epoch": 5.775862068965517, + "grad_norm": 1.6805388927459717, + "learning_rate": 6.680000000000001e-05, + "loss": 0.5663, + "step": 670 + }, + { + "epoch": 5.862068965517241, + "grad_norm": 3.271713972091675, + "learning_rate": 6.780000000000001e-05, + "loss": 0.4748, + "step": 680 + }, + { + "epoch": 5.948275862068965, + "grad_norm": 12.830371856689453, + "learning_rate": 6.879999999999999e-05, + "loss": 0.4823, + "step": 690 + }, + { + "epoch": 6.0344827586206895, + "grad_norm": 1.5290584564208984, + "learning_rate": 6.98e-05, + "loss": 0.5032, + "step": 700 + }, + { + "epoch": 6.0344827586206895, + "eval_loss": 0.5284600257873535, + "eval_runtime": 39.6104, + "eval_samples_per_second": 33.931, + "eval_steps_per_second": 33.931, + "eval_wer": 0.5658745333697532, + "step": 700 + }, + { + "epoch": 6.120689655172414, + "grad_norm": 1.902786374092102, + "learning_rate": 7.08e-05, + "loss": 0.387, + "step": 710 + }, + { + "epoch": 6.206896551724138, + "grad_norm": 2.5520503520965576, + "learning_rate": 7.18e-05, + "loss": 0.3752, + "step": 720 + }, + { + "epoch": 6.293103448275862, + "grad_norm": 1.8766177892684937, + "learning_rate": 7.280000000000001e-05, + "loss": 0.422, + "step": 730 + }, + { + "epoch": 6.379310344827586, + "grad_norm": 1.9960404634475708, + "learning_rate": 7.38e-05, + "loss": 0.3703, + "step": 740 + }, + { + "epoch": 6.4655172413793105, + "grad_norm": 2.9510915279388428, + "learning_rate": 7.48e-05, + "loss": 0.3777, + "step": 750 + }, + { + "epoch": 6.551724137931035, + "grad_norm": 1.5135278701782227, + "learning_rate": 7.58e-05, + "loss": 0.3978, + "step": 760 + }, + { + "epoch": 6.637931034482759, + "grad_norm": 2.198090076446533, + "learning_rate": 7.680000000000001e-05, + "loss": 0.306, + "step": 770 + }, + { + "epoch": 6.724137931034483, + "grad_norm": 4.511343955993652, + "learning_rate": 7.780000000000001e-05, + "loss": 0.4127, + "step": 780 + }, + { + "epoch": 6.810344827586206, + "grad_norm": 
1.5059940814971924, + "learning_rate": 7.88e-05, + "loss": 0.3259, + "step": 790 + }, + { + "epoch": 6.896551724137931, + "grad_norm": 2.252959728240967, + "learning_rate": 7.98e-05, + "loss": 0.3101, + "step": 800 + }, + { + "epoch": 6.896551724137931, + "eval_loss": 0.48188281059265137, + "eval_runtime": 39.7499, + "eval_samples_per_second": 33.811, + "eval_steps_per_second": 33.811, + "eval_wer": 0.528179914413184, + "step": 800 + }, + { + "epoch": 6.982758620689655, + "grad_norm": 2.095763683319092, + "learning_rate": 8.080000000000001e-05, + "loss": 0.3926, + "step": 810 + }, + { + "epoch": 7.068965517241379, + "grad_norm": 1.679442048072815, + "learning_rate": 8.18e-05, + "loss": 0.3035, + "step": 820 + }, + { + "epoch": 7.155172413793103, + "grad_norm": 1.725831151008606, + "learning_rate": 8.28e-05, + "loss": 0.2546, + "step": 830 + }, + { + "epoch": 7.241379310344827, + "grad_norm": 4.802426338195801, + "learning_rate": 8.38e-05, + "loss": 0.3201, + "step": 840 + }, + { + "epoch": 7.327586206896552, + "grad_norm": 1.871408462524414, + "learning_rate": 8.48e-05, + "loss": 0.2891, + "step": 850 + }, + { + "epoch": 7.413793103448276, + "grad_norm": 2.0499789714813232, + "learning_rate": 8.58e-05, + "loss": 0.2513, + "step": 860 + }, + { + "epoch": 7.5, + "grad_norm": 1.3894842863082886, + "learning_rate": 8.680000000000001e-05, + "loss": 0.3401, + "step": 870 + }, + { + "epoch": 7.586206896551724, + "grad_norm": 1.56522798538208, + "learning_rate": 8.78e-05, + "loss": 0.2398, + "step": 880 + }, + { + "epoch": 7.672413793103448, + "grad_norm": 1.908964991569519, + "learning_rate": 8.88e-05, + "loss": 0.2656, + "step": 890 + }, + { + "epoch": 7.758620689655173, + "grad_norm": 1.5925772190093994, + "learning_rate": 8.98e-05, + "loss": 0.3432, + "step": 900 + }, + { + "epoch": 7.758620689655173, + "eval_loss": 0.46526795625686646, + "eval_runtime": 39.52, + "eval_samples_per_second": 34.008, + "eval_steps_per_second": 34.008, + "eval_wer": 0.5271783665665118, + 
"step": 900 + }, + { + "epoch": 7.844827586206897, + "grad_norm": 1.7011960744857788, + "learning_rate": 9.080000000000001e-05, + "loss": 0.2539, + "step": 910 + }, + { + "epoch": 7.931034482758621, + "grad_norm": 1.718232274055481, + "learning_rate": 9.180000000000001e-05, + "loss": 0.2653, + "step": 920 + }, + { + "epoch": 8.017241379310345, + "grad_norm": 1.6879725456237793, + "learning_rate": 9.28e-05, + "loss": 0.3313, + "step": 930 + }, + { + "epoch": 8.10344827586207, + "grad_norm": 2.626966714859009, + "learning_rate": 9.38e-05, + "loss": 0.2116, + "step": 940 + }, + { + "epoch": 8.189655172413794, + "grad_norm": 2.0977375507354736, + "learning_rate": 9.48e-05, + "loss": 0.2445, + "step": 950 + }, + { + "epoch": 8.275862068965518, + "grad_norm": 1.6694329977035522, + "learning_rate": 9.58e-05, + "loss": 0.2928, + "step": 960 + }, + { + "epoch": 8.362068965517242, + "grad_norm": 1.8011162281036377, + "learning_rate": 9.680000000000001e-05, + "loss": 0.2008, + "step": 970 + }, + { + "epoch": 8.448275862068966, + "grad_norm": 2.6468820571899414, + "learning_rate": 9.78e-05, + "loss": 0.2089, + "step": 980 + }, + { + "epoch": 8.53448275862069, + "grad_norm": 1.5480060577392578, + "learning_rate": 9.88e-05, + "loss": 0.2593, + "step": 990 + }, + { + "epoch": 8.620689655172415, + "grad_norm": 2.0152957439422607, + "learning_rate": 9.98e-05, + "loss": 0.1922, + "step": 1000 + }, + { + "epoch": 8.620689655172415, + "eval_loss": 0.46724727749824524, + "eval_runtime": 39.7331, + "eval_samples_per_second": 33.826, + "eval_steps_per_second": 33.826, + "eval_wer": 0.49175999271601567, + "step": 1000 + }, + { + "epoch": 8.706896551724139, + "grad_norm": 2.7271480560302734, + "learning_rate": 9.939393939393939e-05, + "loss": 0.2466, + "step": 1010 + }, + { + "epoch": 8.793103448275861, + "grad_norm": 1.4664121866226196, + "learning_rate": 9.863636363636364e-05, + "loss": 0.259, + "step": 1020 + }, + { + "epoch": 8.879310344827585, + "grad_norm": 2.0848429203033447, + 
"learning_rate": 9.787878787878789e-05, + "loss": 0.1964, + "step": 1030 + }, + { + "epoch": 8.96551724137931, + "grad_norm": 2.68035888671875, + "learning_rate": 9.712121212121212e-05, + "loss": 0.2528, + "step": 1040 + }, + { + "epoch": 9.051724137931034, + "grad_norm": 1.2563536167144775, + "learning_rate": 9.636363636363637e-05, + "loss": 0.2322, + "step": 1050 + }, + { + "epoch": 9.137931034482758, + "grad_norm": 1.8890479803085327, + "learning_rate": 9.560606060606061e-05, + "loss": 0.1597, + "step": 1060 + }, + { + "epoch": 9.224137931034482, + "grad_norm": 2.1262826919555664, + "learning_rate": 9.484848484848486e-05, + "loss": 0.2046, + "step": 1070 + }, + { + "epoch": 9.310344827586206, + "grad_norm": 1.9750289916992188, + "learning_rate": 9.40909090909091e-05, + "loss": 0.2035, + "step": 1080 + }, + { + "epoch": 9.39655172413793, + "grad_norm": 1.9659441709518433, + "learning_rate": 9.333333333333334e-05, + "loss": 0.1623, + "step": 1090 + }, + { + "epoch": 9.482758620689655, + "grad_norm": 3.0013935565948486, + "learning_rate": 9.257575757575758e-05, + "loss": 0.2284, + "step": 1100 + }, + { + "epoch": 9.482758620689655, + "eval_loss": 0.48344284296035767, + "eval_runtime": 39.5675, + "eval_samples_per_second": 33.967, + "eval_steps_per_second": 33.967, + "eval_wer": 0.48702540289538376, + "step": 1100 + }, + { + "epoch": 9.568965517241379, + "grad_norm": 1.5623351335525513, + "learning_rate": 9.181818181818183e-05, + "loss": 0.182, + "step": 1110 + }, + { + "epoch": 9.655172413793103, + "grad_norm": 1.5138722658157349, + "learning_rate": 9.106060606060606e-05, + "loss": 0.1741, + "step": 1120 + }, + { + "epoch": 9.741379310344827, + "grad_norm": 1.3799549341201782, + "learning_rate": 9.030303030303031e-05, + "loss": 0.2227, + "step": 1130 + }, + { + "epoch": 9.827586206896552, + "grad_norm": 1.6967511177062988, + "learning_rate": 8.954545454545455e-05, + "loss": 0.1617, + "step": 1140 + }, + { + "epoch": 9.913793103448276, + "grad_norm": 
2.4861340522766113, + "learning_rate": 8.87878787878788e-05, + "loss": 0.1712, + "step": 1150 + }, + { + "epoch": 10.0, + "grad_norm": 4.324175834655762, + "learning_rate": 8.803030303030304e-05, + "loss": 0.2196, + "step": 1160 + }, + { + "epoch": 10.086206896551724, + "grad_norm": 1.889611840248108, + "learning_rate": 8.727272727272727e-05, + "loss": 0.1392, + "step": 1170 + }, + { + "epoch": 10.172413793103448, + "grad_norm": 1.7731642723083496, + "learning_rate": 8.651515151515152e-05, + "loss": 0.147, + "step": 1180 + }, + { + "epoch": 10.258620689655173, + "grad_norm": 1.3258129358291626, + "learning_rate": 8.575757575757576e-05, + "loss": 0.1793, + "step": 1190 + }, + { + "epoch": 10.344827586206897, + "grad_norm": 2.139970302581787, + "learning_rate": 8.5e-05, + "loss": 0.1372, + "step": 1200 + }, + { + "epoch": 10.344827586206897, + "eval_loss": 0.43795427680015564, + "eval_runtime": 39.566, + "eval_samples_per_second": 33.969, + "eval_steps_per_second": 33.969, + "eval_wer": 0.4727305836292452, + "step": 1200 + }, + { + "epoch": 10.431034482758621, + "grad_norm": 2.1916654109954834, + "learning_rate": 8.424242424242424e-05, + "loss": 0.1442, + "step": 1210 + }, + { + "epoch": 10.517241379310345, + "grad_norm": 1.1983482837677002, + "learning_rate": 8.348484848484849e-05, + "loss": 0.1926, + "step": 1220 + }, + { + "epoch": 10.60344827586207, + "grad_norm": 1.9001710414886475, + "learning_rate": 8.272727272727273e-05, + "loss": 0.1368, + "step": 1230 + }, + { + "epoch": 10.689655172413794, + "grad_norm": 1.9396876096725464, + "learning_rate": 8.196969696969698e-05, + "loss": 0.131, + "step": 1240 + }, + { + "epoch": 10.775862068965518, + "grad_norm": 1.2405284643173218, + "learning_rate": 8.121212121212121e-05, + "loss": 0.162, + "step": 1250 + }, + { + "epoch": 10.862068965517242, + "grad_norm": 1.1841609477996826, + "learning_rate": 8.045454545454546e-05, + "loss": 0.1199, + "step": 1260 + }, + { + "epoch": 10.948275862068966, + "grad_norm": 
1.863655924797058, + "learning_rate": 7.96969696969697e-05, + "loss": 0.1571, + "step": 1270 + }, + { + "epoch": 11.03448275862069, + "grad_norm": 1.102363109588623, + "learning_rate": 7.893939393939395e-05, + "loss": 0.1672, + "step": 1280 + }, + { + "epoch": 11.120689655172415, + "grad_norm": 1.411650538444519, + "learning_rate": 7.818181818181818e-05, + "loss": 0.1004, + "step": 1290 + }, + { + "epoch": 11.206896551724139, + "grad_norm": 1.5993527173995972, + "learning_rate": 7.742424242424243e-05, + "loss": 0.1105, + "step": 1300 + }, + { + "epoch": 11.206896551724139, + "eval_loss": 0.45091673731803894, + "eval_runtime": 39.5057, + "eval_samples_per_second": 34.02, + "eval_steps_per_second": 34.02, + "eval_wer": 0.45943731220977874, + "step": 1300 + }, + { + "epoch": 11.293103448275861, + "grad_norm": 0.9610430598258972, + "learning_rate": 7.666666666666667e-05, + "loss": 0.1514, + "step": 1310 + }, + { + "epoch": 11.379310344827585, + "grad_norm": 1.1394908428192139, + "learning_rate": 7.59090909090909e-05, + "loss": 0.1087, + "step": 1320 + }, + { + "epoch": 11.46551724137931, + "grad_norm": 2.085495948791504, + "learning_rate": 7.515151515151515e-05, + "loss": 0.1328, + "step": 1330 + }, + { + "epoch": 11.551724137931034, + "grad_norm": 1.0012094974517822, + "learning_rate": 7.439393939393939e-05, + "loss": 0.1406, + "step": 1340 + }, + { + "epoch": 11.637931034482758, + "grad_norm": 1.3768068552017212, + "learning_rate": 7.363636363636364e-05, + "loss": 0.1068, + "step": 1350 + }, + { + "epoch": 11.724137931034482, + "grad_norm": 3.6142425537109375, + "learning_rate": 7.287878787878788e-05, + "loss": 0.1447, + "step": 1360 + }, + { + "epoch": 11.810344827586206, + "grad_norm": 1.16116201877594, + "learning_rate": 7.212121212121213e-05, + "loss": 0.1099, + "step": 1370 + }, + { + "epoch": 11.89655172413793, + "grad_norm": 1.4963595867156982, + "learning_rate": 7.136363636363636e-05, + "loss": 0.098, + "step": 1380 + }, + { + "epoch": 11.982758620689655, + 
"grad_norm": 1.1868886947631836, + "learning_rate": 7.060606060606061e-05, + "loss": 0.1603, + "step": 1390 + }, + { + "epoch": 12.068965517241379, + "grad_norm": 1.6679799556732178, + "learning_rate": 6.984848484848485e-05, + "loss": 0.0992, + "step": 1400 + }, + { + "epoch": 12.068965517241379, + "eval_loss": 0.41960862278938293, + "eval_runtime": 39.5319, + "eval_samples_per_second": 33.998, + "eval_steps_per_second": 33.998, + "eval_wer": 0.4544295729764181, + "step": 1400 + }, + { + "epoch": 12.155172413793103, + "grad_norm": 0.9624470472335815, + "learning_rate": 6.90909090909091e-05, + "loss": 0.092, + "step": 1410 + }, + { + "epoch": 12.241379310344827, + "grad_norm": 3.408127546310425, + "learning_rate": 6.833333333333333e-05, + "loss": 0.1716, + "step": 1420 + }, + { + "epoch": 12.327586206896552, + "grad_norm": 1.4163497686386108, + "learning_rate": 6.757575757575758e-05, + "loss": 0.0885, + "step": 1430 + }, + { + "epoch": 12.413793103448276, + "grad_norm": 1.494627594947815, + "learning_rate": 6.681818181818183e-05, + "loss": 0.0891, + "step": 1440 + }, + { + "epoch": 12.5, + "grad_norm": 1.1766793727874756, + "learning_rate": 6.606060606060607e-05, + "loss": 0.138, + "step": 1450 + }, + { + "epoch": 12.586206896551724, + "grad_norm": 1.2334414720535278, + "learning_rate": 6.530303030303032e-05, + "loss": 0.0946, + "step": 1460 + }, + { + "epoch": 12.672413793103448, + "grad_norm": 2.576019763946533, + "learning_rate": 6.454545454545455e-05, + "loss": 0.1043, + "step": 1470 + }, + { + "epoch": 12.758620689655173, + "grad_norm": 0.9484899640083313, + "learning_rate": 6.37878787878788e-05, + "loss": 0.1155, + "step": 1480 + }, + { + "epoch": 12.844827586206897, + "grad_norm": 1.0187716484069824, + "learning_rate": 6.303030303030302e-05, + "loss": 0.0828, + "step": 1490 + }, + { + "epoch": 12.931034482758621, + "grad_norm": 1.0715346336364746, + "learning_rate": 6.227272727272727e-05, + "loss": 0.1226, + "step": 1500 + }, + { + "epoch": 
12.931034482758621, + "eval_loss": 0.4237208664417267, + "eval_runtime": 39.6767, + "eval_samples_per_second": 33.874, + "eval_steps_per_second": 33.874, + "eval_wer": 0.4321223709369025, + "step": 1500 + }, + { + "epoch": 13.017241379310345, + "grad_norm": 0.8969095945358276, + "learning_rate": 6.151515151515151e-05, + "loss": 0.1272, + "step": 1510 + }, + { + "epoch": 13.10344827586207, + "grad_norm": 1.4273531436920166, + "learning_rate": 6.075757575757576e-05, + "loss": 0.0804, + "step": 1520 + }, + { + "epoch": 13.189655172413794, + "grad_norm": 1.5268211364746094, + "learning_rate": 6e-05, + "loss": 0.0907, + "step": 1530 + }, + { + "epoch": 13.275862068965518, + "grad_norm": 1.1891446113586426, + "learning_rate": 5.9242424242424244e-05, + "loss": 0.122, + "step": 1540 + }, + { + "epoch": 13.362068965517242, + "grad_norm": 1.2826952934265137, + "learning_rate": 5.848484848484849e-05, + "loss": 0.079, + "step": 1550 + }, + { + "epoch": 13.448275862068966, + "grad_norm": 1.4693151712417603, + "learning_rate": 5.772727272727273e-05, + "loss": 0.0953, + "step": 1560 + }, + { + "epoch": 13.53448275862069, + "grad_norm": 1.1638069152832031, + "learning_rate": 5.696969696969697e-05, + "loss": 0.1076, + "step": 1570 + }, + { + "epoch": 13.620689655172415, + "grad_norm": 2.33617901802063, + "learning_rate": 5.6212121212121215e-05, + "loss": 0.0712, + "step": 1580 + }, + { + "epoch": 13.706896551724139, + "grad_norm": 2.3127667903900146, + "learning_rate": 5.545454545454546e-05, + "loss": 0.0947, + "step": 1590 + }, + { + "epoch": 13.793103448275861, + "grad_norm": 0.9676732420921326, + "learning_rate": 5.46969696969697e-05, + "loss": 0.1013, + "step": 1600 + }, + { + "epoch": 13.793103448275861, + "eval_loss": 0.4112694263458252, + "eval_runtime": 39.48, + "eval_samples_per_second": 34.043, + "eval_steps_per_second": 34.043, + "eval_wer": 0.4298461258308295, + "step": 1600 + }, + { + "epoch": 13.879310344827585, + "grad_norm": 1.8281100988388062, + "learning_rate": 
5.393939393939394e-05, + "loss": 0.0696, + "step": 1610 + }, + { + "epoch": 13.96551724137931, + "grad_norm": 2.354693651199341, + "learning_rate": 5.3181818181818186e-05, + "loss": 0.1129, + "step": 1620 + }, + { + "epoch": 14.051724137931034, + "grad_norm": 1.0465914011001587, + "learning_rate": 5.242424242424243e-05, + "loss": 0.0744, + "step": 1630 + }, + { + "epoch": 14.137931034482758, + "grad_norm": 1.0246185064315796, + "learning_rate": 5.166666666666667e-05, + "loss": 0.0656, + "step": 1640 + }, + { + "epoch": 14.224137931034482, + "grad_norm": 2.083202838897705, + "learning_rate": 5.090909090909091e-05, + "loss": 0.0869, + "step": 1650 + }, + { + "epoch": 14.310344827586206, + "grad_norm": 1.176715612411499, + "learning_rate": 5.015151515151515e-05, + "loss": 0.0821, + "step": 1660 + }, + { + "epoch": 14.39655172413793, + "grad_norm": 1.6385036706924438, + "learning_rate": 4.93939393939394e-05, + "loss": 0.0695, + "step": 1670 + }, + { + "epoch": 14.482758620689655, + "grad_norm": 2.735548973083496, + "learning_rate": 4.863636363636364e-05, + "loss": 0.0875, + "step": 1680 + }, + { + "epoch": 14.568965517241379, + "grad_norm": 1.3907177448272705, + "learning_rate": 4.787878787878788e-05, + "loss": 0.0773, + "step": 1690 + }, + { + "epoch": 14.655172413793103, + "grad_norm": 1.582645297050476, + "learning_rate": 4.712121212121212e-05, + "loss": 0.0661, + "step": 1700 + }, + { + "epoch": 14.655172413793103, + "eval_loss": 0.4037941098213196, + "eval_runtime": 39.716, + "eval_samples_per_second": 33.84, + "eval_steps_per_second": 33.84, + "eval_wer": 0.42756988072475643, + "step": 1700 + }, + { + "epoch": 14.741379310344827, + "grad_norm": 1.1024818420410156, + "learning_rate": 4.636363636363636e-05, + "loss": 0.1285, + "step": 1710 + }, + { + "epoch": 14.827586206896552, + "grad_norm": 1.0617228746414185, + "learning_rate": 4.5606060606060606e-05, + "loss": 0.0638, + "step": 1720 + }, + { + "epoch": 14.913793103448276, + "grad_norm": 1.8958903551101685, + 
"learning_rate": 4.484848484848485e-05, + "loss": 0.0772, + "step": 1730 + }, + { + "epoch": 15.0, + "grad_norm": 2.5858068466186523, + "learning_rate": 4.409090909090909e-05, + "loss": 0.094, + "step": 1740 + }, + { + "epoch": 15.086206896551724, + "grad_norm": 0.6185240745544434, + "learning_rate": 4.3333333333333334e-05, + "loss": 0.0506, + "step": 1750 + }, + { + "epoch": 15.172413793103448, + "grad_norm": 1.9594557285308838, + "learning_rate": 4.257575757575758e-05, + "loss": 0.0667, + "step": 1760 + }, + { + "epoch": 15.258620689655173, + "grad_norm": 0.6401175260543823, + "learning_rate": 4.181818181818182e-05, + "loss": 0.0909, + "step": 1770 + }, + { + "epoch": 15.344827586206897, + "grad_norm": 1.2541193962097168, + "learning_rate": 4.106060606060606e-05, + "loss": 0.0573, + "step": 1780 + }, + { + "epoch": 15.431034482758621, + "grad_norm": 1.74609375, + "learning_rate": 4.0303030303030305e-05, + "loss": 0.0666, + "step": 1790 + }, + { + "epoch": 15.517241379310345, + "grad_norm": 1.3555305004119873, + "learning_rate": 3.954545454545455e-05, + "loss": 0.0901, + "step": 1800 + }, + { + "epoch": 15.517241379310345, + "eval_loss": 0.43212294578552246, + "eval_runtime": 40.1884, + "eval_samples_per_second": 33.443, + "eval_steps_per_second": 33.443, + "eval_wer": 0.42247109168715286, + "step": 1800 + }, + { + "epoch": 15.60344827586207, + "grad_norm": 0.8425617218017578, + "learning_rate": 3.878787878787879e-05, + "loss": 0.0567, + "step": 1810 + }, + { + "epoch": 15.689655172413794, + "grad_norm": 1.145479679107666, + "learning_rate": 3.803030303030303e-05, + "loss": 0.0682, + "step": 1820 + }, + { + "epoch": 15.775862068965518, + "grad_norm": 1.0084822177886963, + "learning_rate": 3.7272727272727276e-05, + "loss": 0.0847, + "step": 1830 + }, + { + "epoch": 15.862068965517242, + "grad_norm": 0.986998438835144, + "learning_rate": 3.651515151515152e-05, + "loss": 0.0623, + "step": 1840 + }, + { + "epoch": 15.948275862068966, + "grad_norm": 1.5837907791137695, 
+ "learning_rate": 3.575757575757576e-05, + "loss": 0.0719, + "step": 1850 + }, + { + "epoch": 16.03448275862069, + "grad_norm": 0.8865498900413513, + "learning_rate": 3.5e-05, + "loss": 0.0766, + "step": 1860 + }, + { + "epoch": 16.120689655172413, + "grad_norm": 1.3217542171478271, + "learning_rate": 3.424242424242424e-05, + "loss": 0.0598, + "step": 1870 + }, + { + "epoch": 16.20689655172414, + "grad_norm": 1.5297330617904663, + "learning_rate": 3.348484848484848e-05, + "loss": 0.0632, + "step": 1880 + }, + { + "epoch": 16.29310344827586, + "grad_norm": 0.6415010094642639, + "learning_rate": 3.272727272727273e-05, + "loss": 0.0732, + "step": 1890 + }, + { + "epoch": 16.379310344827587, + "grad_norm": 2.088172435760498, + "learning_rate": 3.1969696969696974e-05, + "loss": 0.053, + "step": 1900 + }, + { + "epoch": 16.379310344827587, + "eval_loss": 0.4076109826564789, + "eval_runtime": 40.2143, + "eval_samples_per_second": 33.421, + "eval_steps_per_second": 33.421, + "eval_wer": 0.42356368933806793, + "step": 1900 + }, + { + "epoch": 16.46551724137931, + "grad_norm": 1.5730756521224976, + "learning_rate": 3.121212121212122e-05, + "loss": 0.0722, + "step": 1910 + }, + { + "epoch": 16.551724137931036, + "grad_norm": 0.7452704906463623, + "learning_rate": 3.0454545454545456e-05, + "loss": 0.0743, + "step": 1920 + }, + { + "epoch": 16.637931034482758, + "grad_norm": 0.890084445476532, + "learning_rate": 2.96969696969697e-05, + "loss": 0.0462, + "step": 1930 + }, + { + "epoch": 16.724137931034484, + "grad_norm": 2.400942325592041, + "learning_rate": 2.893939393939394e-05, + "loss": 0.076, + "step": 1940 + }, + { + "epoch": 16.810344827586206, + "grad_norm": 1.0481088161468506, + "learning_rate": 2.818181818181818e-05, + "loss": 0.0577, + "step": 1950 + }, + { + "epoch": 16.896551724137932, + "grad_norm": 1.7372900247573853, + "learning_rate": 2.7424242424242424e-05, + "loss": 0.0539, + "step": 1960 + }, + { + "epoch": 16.982758620689655, + "grad_norm": 
0.7519413828849792, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.076, + "step": 1970 + }, + { + "epoch": 17.06896551724138, + "grad_norm": 1.1846277713775635, + "learning_rate": 2.590909090909091e-05, + "loss": 0.0553, + "step": 1980 + }, + { + "epoch": 17.155172413793103, + "grad_norm": 0.7746219038963318, + "learning_rate": 2.5151515151515155e-05, + "loss": 0.0617, + "step": 1990 + }, + { + "epoch": 17.24137931034483, + "grad_norm": 1.951456904411316, + "learning_rate": 2.4393939393939395e-05, + "loss": 0.0805, + "step": 2000 + }, + { + "epoch": 17.24137931034483, + "eval_loss": 0.43357911705970764, + "eval_runtime": 40.2275, + "eval_samples_per_second": 33.41, + "eval_steps_per_second": 33.41, + "eval_wer": 0.4156423563689338, + "step": 2000 + }, + { + "epoch": 17.32758620689655, + "grad_norm": 0.5796881318092346, + "learning_rate": 2.3636363636363637e-05, + "loss": 0.0466, + "step": 2010 + }, + { + "epoch": 17.413793103448278, + "grad_norm": 1.2703924179077148, + "learning_rate": 2.287878787878788e-05, + "loss": 0.0557, + "step": 2020 + }, + { + "epoch": 17.5, + "grad_norm": 1.242013692855835, + "learning_rate": 2.2121212121212123e-05, + "loss": 0.0719, + "step": 2030 + }, + { + "epoch": 17.586206896551722, + "grad_norm": 1.1339091062545776, + "learning_rate": 2.1363636363636362e-05, + "loss": 0.0488, + "step": 2040 + }, + { + "epoch": 17.67241379310345, + "grad_norm": 1.4959948062896729, + "learning_rate": 2.0606060606060608e-05, + "loss": 0.0522, + "step": 2050 + }, + { + "epoch": 17.75862068965517, + "grad_norm": 0.9912744760513306, + "learning_rate": 1.984848484848485e-05, + "loss": 0.0777, + "step": 2060 + }, + { + "epoch": 17.844827586206897, + "grad_norm": 1.1574323177337646, + "learning_rate": 1.9090909090909094e-05, + "loss": 0.0517, + "step": 2070 + }, + { + "epoch": 17.93103448275862, + "grad_norm": 1.5049959421157837, + "learning_rate": 1.8333333333333333e-05, + "loss": 0.0565, + "step": 2080 + }, + { + "epoch": 18.017241379310345, + 
"grad_norm": 0.6796301603317261, + "learning_rate": 1.7575757575757576e-05, + "loss": 0.0618, + "step": 2090 + }, + { + "epoch": 18.103448275862068, + "grad_norm": 0.8808640837669373, + "learning_rate": 1.6818181818181818e-05, + "loss": 0.049, + "step": 2100 + }, + { + "epoch": 18.103448275862068, + "eval_loss": 0.4193398654460907, + "eval_runtime": 40.1393, + "eval_samples_per_second": 33.483, + "eval_steps_per_second": 33.483, + "eval_wer": 0.41136301556951654, + "step": 2100 + }, + { + "epoch": 18.189655172413794, + "grad_norm": 1.1478749513626099, + "learning_rate": 1.606060606060606e-05, + "loss": 0.0467, + "step": 2110 + }, + { + "epoch": 18.275862068965516, + "grad_norm": 0.9449758529663086, + "learning_rate": 1.5303030303030304e-05, + "loss": 0.074, + "step": 2120 + }, + { + "epoch": 18.362068965517242, + "grad_norm": 0.7945724725723267, + "learning_rate": 1.4545454545454545e-05, + "loss": 0.0415, + "step": 2130 + }, + { + "epoch": 18.448275862068964, + "grad_norm": 1.1694364547729492, + "learning_rate": 1.3787878787878789e-05, + "loss": 0.0547, + "step": 2140 + }, + { + "epoch": 18.53448275862069, + "grad_norm": 0.7521646022796631, + "learning_rate": 1.3030303030303032e-05, + "loss": 0.0544, + "step": 2150 + }, + { + "epoch": 18.620689655172413, + "grad_norm": 0.9526033997535706, + "learning_rate": 1.2272727272727273e-05, + "loss": 0.0507, + "step": 2160 + }, + { + "epoch": 18.70689655172414, + "grad_norm": 1.0075325965881348, + "learning_rate": 1.1515151515151517e-05, + "loss": 0.0595, + "step": 2170 + }, + { + "epoch": 18.79310344827586, + "grad_norm": 0.6182655096054077, + "learning_rate": 1.0757575757575758e-05, + "loss": 0.0593, + "step": 2180 + }, + { + "epoch": 18.879310344827587, + "grad_norm": 0.5009527206420898, + "learning_rate": 1e-05, + "loss": 0.0415, + "step": 2190 + }, + { + "epoch": 18.96551724137931, + "grad_norm": 2.459338426589966, + "learning_rate": 9.242424242424244e-06, + "loss": 0.0717, + "step": 2200 + }, + { + "epoch": 
18.96551724137931, + "eval_loss": 0.4138683080673218, + "eval_runtime": 40.2569, + "eval_samples_per_second": 33.386, + "eval_steps_per_second": 33.386, + "eval_wer": 0.4090867704634435, + "step": 2200 + }, + { + "epoch": 19.051724137931036, + "grad_norm": 1.2094837427139282, + "learning_rate": 8.484848484848486e-06, + "loss": 0.0486, + "step": 2210 + }, + { + "epoch": 19.137931034482758, + "grad_norm": 1.1685006618499756, + "learning_rate": 7.727272727272727e-06, + "loss": 0.0376, + "step": 2220 + }, + { + "epoch": 19.224137931034484, + "grad_norm": 0.764283299446106, + "learning_rate": 6.969696969696971e-06, + "loss": 0.0598, + "step": 2230 + }, + { + "epoch": 19.310344827586206, + "grad_norm": 0.6357612013816833, + "learning_rate": 6.212121212121212e-06, + "loss": 0.0494, + "step": 2240 + }, + { + "epoch": 19.396551724137932, + "grad_norm": 2.4395949840545654, + "learning_rate": 5.4545454545454545e-06, + "loss": 0.0395, + "step": 2250 + }, + { + "epoch": 19.482758620689655, + "grad_norm": 2.8990790843963623, + "learning_rate": 4.696969696969697e-06, + "loss": 0.0643, + "step": 2260 + }, + { + "epoch": 19.56896551724138, + "grad_norm": 0.5647782683372498, + "learning_rate": 3.939393939393939e-06, + "loss": 0.0458, + "step": 2270 + }, + { + "epoch": 19.655172413793103, + "grad_norm": 0.6177101731300354, + "learning_rate": 3.1818181818181817e-06, + "loss": 0.0521, + "step": 2280 + }, + { + "epoch": 19.74137931034483, + "grad_norm": 0.9529480934143066, + "learning_rate": 2.4242424242424244e-06, + "loss": 0.0497, + "step": 2290 + }, + { + "epoch": 19.82758620689655, + "grad_norm": 0.7542730569839478, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.0389, + "step": 2300 + }, + { + "epoch": 19.82758620689655, + "eval_loss": 0.4216426610946655, + "eval_runtime": 39.4724, + "eval_samples_per_second": 34.049, + "eval_steps_per_second": 34.049, + "eval_wer": 0.4087225712464718, + "step": 2300 + }, + { + "epoch": 19.913793103448278, + "grad_norm": 0.9613261222839355, 
+ "learning_rate": 9.09090909090909e-07, + "loss": 0.0483, + "step": 2310 + }, + { + "epoch": 20.0, + "grad_norm": 1.2119308710098267, + "learning_rate": 1.5151515151515152e-07, + "loss": 0.0557, + "step": 2320 + }, + { + "epoch": 20.0, + "step": 2320, + "total_flos": 2.1476719263248095e+18, + "train_loss": 0.8590125822430027, + "train_runtime": 3151.1477, + "train_samples_per_second": 23.458, + "train_steps_per_second": 0.736 + } + ], + "logging_steps": 10, + "max_steps": 2320, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 400, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.1476719263248095e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/wav2vec2-base-timit-fine-tuned/training_args.bin b/wav2vec2-base-timit-fine-tuned/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eb6a781d906163b88d68b3864b5efa910ae6f2db --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e556b40b2db1dddea7c7a4a9293c1d434b0f2ab66de2fb1e7bc1d88cf629c80 +size 5112 diff --git a/wav2vec2-base-timit-fine-tuned/vocab.json b/wav2vec2-base-timit-fine-tuned/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..4540c2af87902c582c6b65cb6a96c93e879a4090 --- /dev/null +++ b/wav2vec2-base-timit-fine-tuned/vocab.json @@ -0,0 +1,31 @@ +{ + "[PAD]": 28, + "[UNK]": 27, + "a": 1, + "b": 2, + "c": 3, + "d": 4, + "e": 5, + "f": 6, + "g": 7, + "h": 8, + "i": 9, + "j": 10, + "k": 11, + "l": 12, + "m": 13, + "n": 14, + "o": 15, + "p": 16, + "q": 17, + "r": 18, + "s": 19, + "t": 20, + "u": 21, + "v": 22, + "w": 23, + "x": 24, + "y": 25, + "z": 26, + "|": 0 +}