NbAiLab
/

wav2vec2-large-voxrex-npsc-nst

@@ -1,8 +1,6 @@
 WANDB_ENTITY=NbAiLab WANDB_PROJECT=wav2vec2 python run_speech_recognition_ctc.py \
-        --dataset_name="NbAiLab/NST" \
         --model_name_or_path="KBLab/wav2vec2-large-voxrex" \
-        --hub_model_id="NbAiLab/wav2vec2-large-voxrex-nst" \
-        --dataset_config_name="no-close" \
         --output_dir="./" \
         --overwrite_output_dir \
         --num_train_epochs="15" \

 WANDB_ENTITY=NbAiLab WANDB_PROJECT=wav2vec2 python run_speech_recognition_ctc.py \
         --model_name_or_path="KBLab/wav2vec2-large-voxrex" \
+        --hub_model_id="NbAiLab/wav2vec2-large-voxrex-npsc-nst" \
         --output_dir="./" \
         --overwrite_output_dir \
         --num_train_epochs="15" \

run_speech_recognition_ctc.py CHANGED Viewed

@@ -47,13 +47,11 @@ from transformers.trainer_utils import get_last_checkpoint, is_main_process
 from transformers.utils import check_min_version
 from transformers.utils.versions import require_version
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 check_min_version("4.16.0.dev0")
 require_version("datasets>=1.13.3", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
 logger = logging.getLogger(__name__)
@@ -102,8 +100,8 @@ class ModelArguments:
         default=0.05,
         metadata={
             "help": "Probability of each feature vector along the time axis to be chosen as the start of the vector"
-            "span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
-            "vectors will be masked along the time axis."
         },
     )
     mask_time_length: int = field(
@@ -114,7 +112,7 @@ class ModelArguments:
         default=0.0,
         metadata={
             "help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector"
-            "span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis."
         },
     )
     mask_feature_length: int = field(
@@ -129,6 +127,7 @@ class ModelArguments:
         default=False, metadata={"help": "If True, will try yo aboud the CTC loss goinf to infinity."}
     )
 @dataclass
 class DataTrainingArguments:
     """
@@ -176,14 +175,14 @@ class DataTrainingArguments:
         default=None,
         metadata={
             "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
-            "value if set."
         },
     )
     max_eval_samples: Optional[int] = field(
         default=None,
         metadata={
             "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
-            "value if set."
         },
     )
     chars_to_ignore: Optional[List[str]] = list_field(
@@ -207,16 +206,16 @@ class DataTrainingArguments:
         default=False,
         metadata={
             "help": "Whether to only do data preprocessing and skip training. "
-            "This is especially useful when data preprocessing errors out in distributed training due to timeout. "
-            "In this case, one should run the preprocessing in a non-distributed setup with `preprocessing_only=True` "
-            "so that the cached datasets can consequently be loaded in distributed training"
         },
     )
     use_auth_token: bool = field(
         default=False,
         metadata={
             "help": "If :obj:`True`, will use the token generated when running"
-            ":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
         },
     )
     unk_token: str = field(
@@ -235,9 +234,9 @@ class DataTrainingArguments:
         default=None,
         metadata={
             "help": "The target language that should be used be"
-            " passed to the tokenizer for tokenization. Note that"
-            " this is only relevant if the model classifies the"
-            " input audio to a sequence of phoneme sequences."
         },
     )
@@ -303,10 +302,10 @@ class DataCollatorCTCWithPadding:
 def create_vocabulary_from_data(
-    datasets: DatasetDict,
-    word_delimiter_token: Optional[str] = None,
-    unk_token: Optional[str] = None,
-    pad_token: Optional[str] = None,
 ):
     # Given training and test labels create vocabulary
     def extract_all_chars(batch):
@@ -344,6 +343,85 @@ def create_vocabulary_from_data(
     return vocab_dict
 def main():
     # See all possible arguments in src/transformers/training_args.py
     # or by passing the --help flag to this script.
@@ -393,45 +471,10 @@ def main():
     # Set seed before initializing model.
     set_seed(training_args.seed)
-    # Pre-processing dataset
-    import re
-    def map_dataset(entry):
-        text = entry["text"].lower()
-        text = text.replace("(...Vær stille under dette opptaket...)", "")
-        text = re.sub('[áàâ]', 'a', text)
-        text = re.sub('[ä]', 'æ', text)
-        text = re.sub('[éèëê]', 'e', text)
-        text = re.sub('[íìïî]', 'i', text)
-        text = re.sub('[óòöô]', 'o', text)
-        text = re.sub('[ö]', 'ø', text)
-        text = re.sub('[ç]', 'c', text)
-        text = re.sub('[úùüû]', 'u', text)
-        # text = re.sub('\\(?=(Punktum|Komma|Utropstegn|Spørsmålstegn))', ' ', text)
-        text = re.sub('\s+', ' ', text)
-        return {"text": text}
-    def filter_dataset(entry):
-        if not (len(entry["text"]) <= len(entry["audio"]["array"]) // 320) and (len(entry["text"].strip()) >= 3):
-            return False  # Too short
-        if re.match(entry["type"], "pIW|CA"):
-            return False  # Spelling out words
-        return True
     # 1. First, let's load the dataset
-    raw_datasets = DatasetDict()
     if training_args.do_train:
-        raw_datasets["train"] = load_dataset(
-            data_args.dataset_name,
-            data_args.dataset_config_name,
-            split=data_args.train_split_name,
-            use_auth_token=data_args.use_auth_token,
-        ).shuffle()
-        raw_datasets["train"] = raw_datasets["train"].filter(filter_dataset)
-        raw_datasets["train"] = raw_datasets["train"].map(map_dataset)
         if data_args.audio_column_name not in raw_datasets["train"].column_names:
             raise ValueError(
                 f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
@@ -450,28 +493,18 @@ def main():
             raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
     if training_args.do_eval:
-        raw_datasets["eval"] = load_dataset(
-            data_args.dataset_name,
-            data_args.dataset_config_name,
-            split=data_args.eval_split_name,
-            use_auth_token=data_args.use_auth_token,
-        ).shuffle()
-        raw_datasets["eval"] = raw_datasets["eval"].filter(filter_dataset)
-        raw_datasets["eval"] = raw_datasets["eval"].map(map_dataset)
         if data_args.max_eval_samples is not None:
             raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
     # 2. We remove some special characters from the datasets
     # that make training complicated and do not help in transcribing the speech
     # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
     # that could be easily picked up by the model
-    #chars_to_ignore_regex = (
     #    f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
-    #)
     chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\–\_\\\+\#\/]'
     text_column_name = data_args.text_column_name
     def remove_special_characters(batch):

 from transformers.utils import check_min_version
 from transformers.utils.versions import require_version
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 check_min_version("4.16.0.dev0")
 require_version("datasets>=1.13.3", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
 logger = logging.getLogger(__name__)
         default=0.05,
         metadata={
             "help": "Probability of each feature vector along the time axis to be chosen as the start of the vector"
+                    "span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
+                    "vectors will be masked along the time axis."
         },
     )
     mask_time_length: int = field(
         default=0.0,
         metadata={
             "help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector"
+                    "span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis."
         },
     )
     mask_feature_length: int = field(
         default=False, metadata={"help": "If True, will try yo aboud the CTC loss goinf to infinity."}
     )
 @dataclass
 class DataTrainingArguments:
     """
         default=None,
         metadata={
             "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+                    "value if set."
         },
     )
     max_eval_samples: Optional[int] = field(
         default=None,
         metadata={
             "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
+                    "value if set."
         },
     )
     chars_to_ignore: Optional[List[str]] = list_field(
         default=False,
         metadata={
             "help": "Whether to only do data preprocessing and skip training. "
+                    "This is especially useful when data preprocessing errors out in distributed training due to timeout. "
+                    "In this case, one should run the preprocessing in a non-distributed setup with `preprocessing_only=True` "
+                    "so that the cached datasets can consequently be loaded in distributed training"
         },
     )
     use_auth_token: bool = field(
         default=False,
         metadata={
             "help": "If :obj:`True`, will use the token generated when running"
+                    ":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
         },
     )
     unk_token: str = field(
         default=None,
         metadata={
             "help": "The target language that should be used be"
+                    " passed to the tokenizer for tokenization. Note that"
+                    " this is only relevant if the model classifies the"
+                    " input audio to a sequence of phoneme sequences."
         },
     )
 def create_vocabulary_from_data(
+        datasets: DatasetDict,
+        word_delimiter_token: Optional[str] = None,
+        unk_token: Optional[str] = None,
+        pad_token: Optional[str] = None,
 ):
     # Given training and test labels create vocabulary
     def extract_all_chars(batch):
     return vocab_dict
+def make_dataset(seed=42):
+    # Pre-processing dataset
+    import re
+    def map_nst(entry):
+        text = entry["text"].lower()
+        text = text.replace("(...Vær stille under dette opptaket...)", "")
+        text = re.sub('[áàâ]', 'a', text)
+        text = re.sub('[ä]', 'æ', text)
+        text = re.sub('[éèëê]', 'e', text)
+        text = re.sub('[íìïî]', 'i', text)
+        text = re.sub('[óòöô]', 'o', text)
+        text = re.sub('[ö]', 'ø', text)
+        text = re.sub('[ç]', 'c', text)
+        text = re.sub('[úùüû]', 'u', text)
+        # text = re.sub('\\(?=(Punktum|Komma|Utropstegn|Spørsmålstegn))', ' ', text)
+        text = re.sub('\s+', ' ', text)
+        return {"text": text}
+    def filter_nst(entry):
+        if not ((len(entry["text"]) <= len(entry["audio"]["array"]) // 320) and (len(entry["text"].strip()) >= 3)):
+            return False  # Too short
+        if re.match(entry["type"], "pIW|CA"):
+            return False  # Spelling out words
+        return True
+    def filter_npsc(entry):
+        # False if there are digits in the text
+        if not ((len(entry["text"]) <= len(entry["audio"]["array"]) // 320) and (len(entry["text"].strip()) >= 3)):
+            return False  # Too short
+        if re.search("\d", entry["text"]):
+            return False
+        return True
+    def map_npsc(entry):
+        batch = {"text": entry["text"].lower()}
+        batch["text"] = re.sub('[áàâ]', 'a', batch["text"])
+        batch["text"] = re.sub('[ä]', 'æ', batch["text"])
+        batch["text"] = re.sub('[éèëê]', 'e', batch["text"])
+        batch["text"] = re.sub('[íìïî]', 'i', batch["text"])
+        batch["text"] = re.sub('[óòöô]', 'o', batch["text"])
+        batch["text"] = re.sub('[ö]', 'ø', batch["text"])
+        batch["text"] = re.sub('[ç]', 'c', batch["text"])
+        batch["text"] = re.sub('[úùüû]', 'u', batch["text"])
+        batch["text"] = re.sub('\s', ' ', batch["text"])
+        batch["text"] = re.sub('<ee>', 'eee', batch["text"])
+        batch["text"] = re.sub('<qq>', 'qqq', batch["text"])
+        batch["text"] = re.sub('<mm>', 'mmm', batch["text"])
+        batch["text"] = re.sub('<inaudible>', 'xxx', batch["text"])
+        # batch["text"] = re.sub('<inaudible>', '?', batch["text"])
+        if "<" in batch["text"]:
+            raise ValueError(batch["text"])
+        return batch
+    nst = datasets.load_dataset("NbAiLab/NST", "no-close")
+    npsc = datasets.load_dataset("NbAiLab/NPSC", "16K_mp3")
+    # TODO NST_hesitate
+    split = len(npsc["train"]) / (len(npsc["train"]) + len(npsc["validation"]))  # Use same train/val ratio as NPSC
+    nst_train = nst["train"].train_test_split(train_size=split, seed=seed)
+    nst["train"] = nst_train["train"]
+    nst["validation"] = nst_train["test"]
+    nst = nst.filter(filter_nst).map(map_nst).shuffle(seed=seed)
+    npsc = npsc.filter(filter_npsc).map(map_npsc).shuffle(seed=seed)
+    npsc_base = npsc.remove_columns([col for col in npsc["train"].column_names if col not in ["text", "audio"]])
+    nst_base = nst.remove_columns([col for col in nst["train"].column_names if col not in ["text", "audio"]])
+    combined = {}
+    for split in "train", "validation", "test":
+        probs = np.array([len(nst_base[split]), len(npsc_base[split])])  # Weight by number of examples
+        probs = (probs / probs.sum()).tolist()
+        comb = datasets.interleave_datasets([nst_base[split], npsc_base[split]], probabilities=probs, seed=seed)
+        combined[split] = comb
+    return datasets.DatasetDict(**combined)
 def main():
     # See all possible arguments in src/transformers/training_args.py
     # or by passing the --help flag to this script.
     # Set seed before initializing model.
     set_seed(training_args.seed)
     # 1. First, let's load the dataset
+    raw_datasets = make_dataset(seed=training_args.seed)
     if training_args.do_train:
         if data_args.audio_column_name not in raw_datasets["train"].column_names:
             raise ValueError(
                 f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
             raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
     if training_args.do_eval:
         if data_args.max_eval_samples is not None:
             raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
     # 2. We remove some special characters from the datasets
     # that make training complicated and do not help in transcribing the speech
     # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
     # that could be easily picked up by the model
+    # chars_to_ignore_regex = (
     #    f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
+    # )
     chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\–\_\\\+\#\/]'
     text_column_name = data_args.text_column_name
     def remove_special_characters(batch):