Spaces:

None
/

Examples

No application file

App Files Files Community

None committed on Mar 3, 2022

Commit

3b17866

1 Parent(s): fa2d879

Create SimpleDataset.py

Browse files

Simplest possible audio dataset for ASR?

Files changed (1) hide show

SimpleDataset.py +120 -0

SimpleDataset.py ADDED Viewed

	@@ -0,0 +1,120 @@

+# Lint as: python3
+"""Simple, minimal ASR dataset template."""
+import csv
+import os
+import datasets
+from datasets.tasks import AutomaticSpeechRecognition
+_CITATION = ""  # citation text handed to DatasetInfo; intentionally empty here
+_DESCRIPTION = """\
+This is a private dataset
+"""  # description text handed to DatasetInfo
+_URL = "https://localhost"  # homepage reported in DatasetInfo
+_DL_URL = "http://localhost:8000/data_simple.tgz"  # archive fetched by dl_manager; NOTE(review): localhost looks like a placeholder - confirm
+class SimpleTplConfig(datasets.BuilderConfig):
+    """BuilderConfig for LucerneTest."""
+    def __init__(self, name, **kwargs):
+        """
+        Args:
+          data_dir: `string`, the path to the folder containing the audio files
+            in the downloaded .tar.gz file.
+          citation: `string`, optional citation for the dataset.
+          url: `string`, url for information about the dataset.
+          **kwargs: keyword arguments forwarded to super.
+        """
+        self.num_of_voice = 100
+        description = f"Simple Dataset."
+        super(SimpleTplConfig, self).__init__(
+            name=name, version=datasets.Version("1.1.0", ""), description=description, **kwargs
+        )
+class SimpleTpl(datasets.GeneratorBasedBuilder):
+    """Simple Speech dataset."""
+    VERSION = datasets.Version("1.1.0")
+    #SimpleTplConfig(name="simpletpl")
+    DEFAULT_WRITER_BATCH_SIZE = 1000
+    BUILDER_CONFIGS = [
+        datasets.BuilderConfig(
+            name="main",
+            version=VERSION,
+            description="The simple dataset"
+        )
+    ]
+    def _info(self):
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "audio": datasets.Audio(sampling_rate=16000),
+                    "path": datasets.Value("string"),
+                    "sentence": datasets.Value("string"),
+                }
+            ),
+            supervised_keys=None,
+            homepage=_URL,
+            citation=_CITATION,
+            task_templates=[
+                AutomaticSpeechRecognition(
+                    audio_file_path_column="path",
+                    transcription_column="sentence")
+            ],
+        )
+    def _split_generators(self, dl_manager):
+        root_path = dl_manager.download_and_extract(_DL_URL)
+        root_path = os.path.join(root_path, "data_simple")
+        wav_path = os.path.join(root_path, "audio")
+        train_csv = os.path.join(root_path, "train.csv")
+        valid_csv = os.path.join(root_path, "valid.csv")
+        test_csv = os.path.join(root_path, "test.csv")
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={"wav_path": wav_path, "csv_path": train_csv}
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={"wav_path": wav_path, "csv_path": valid_csv}
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={"wav_path": wav_path, "csv_path": test_csv}
+            ),
+        ]
+    def _generate_examples(self, wav_path, csv_path):
+        """Generate examples from a Speech archive_path."""
+        with open(csv_path, encoding="utf-8") as csv_file:
+            csv_reader = csv.reader(
+                csv_file,
+                delimiter=",",
+                quotechar=None,
+                skipinitialspace=True
+            )
+            for idx,row in enumerate(csv_reader):
+                if idx == 0:
+                    continue
+                wav_path, sentence = row
+                example = {
+                    "path": wav_path,
+                    "audio": wav_path,
+                    "sentence": sentence,
+                }
+                yield wav_path, example