# Examples / SimpleDataset.py
# None's picture
# Create SimpleDataset.py
# 3b17866
# Lint as: python3
"""Simple, minimal ASR dataset template."""
import csv
import os
import datasets
from datasets.tasks import AutomaticSpeechRecognition
# Dataset-card metadata handed to `datasets.DatasetInfo` by the builder.
_CITATION = ""
_DESCRIPTION = "This is a private dataset\n"

# Homepage reported for the dataset.
_URL = "https://localhost"
# Archive that the builder downloads and extracts.
_DL_URL = "http://localhost:8000/data_simple.tgz"
class SimpleTplConfig(datasets.BuilderConfig):
    """BuilderConfig for the simple ASR dataset template.

    Attributes:
        num_of_voice: set to 100 on every instance; the builder code visible
            in this file does not read it.
    """

    def __init__(self, name, **kwargs):
        """Create a config pinned to version 1.1.0 with a fixed description.

        Args:
            name: `string`, name of the configuration.
            **kwargs: keyword arguments forwarded to super. `version` and
                `description` are already set here, so passing either of
                them raises a `TypeError` (duplicate keyword argument).
        """
        self.num_of_voice = 100
        description = "Simple Dataset."
        super().__init__(
            name=name,
            version=datasets.Version("1.1.0", ""),
            description=description,
            **kwargs,
        )
class SimpleTpl(datasets.GeneratorBasedBuilder):
    """Simple speech dataset: audio clips paired with their transcription.

    The archive at `_DL_URL` is expected to unpack into a `data_simple`
    folder that contains an `audio` directory plus `train.csv`, `valid.csv`
    and `test.csv`.
    """

    VERSION = datasets.Version("1.1.0")
    # NOTE: SimpleTplConfig is not wired in; a plain BuilderConfig is used.
    DEFAULT_WRITER_BATCH_SIZE = 1000
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="main",
            version=VERSION,
            description="The simple dataset",
        )
    ]

    def _info(self):
        """Describe the dataset: 16 kHz audio, its path and its sentence."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "audio": datasets.Audio(sampling_rate=16000),
                    "path": datasets.Value("string"),
                    "sentence": datasets.Value("string"),
                }
            ),
            supervised_keys=None,
            homepage=_URL,
            citation=_CITATION,
            task_templates=[
                AutomaticSpeechRecognition(
                    audio_file_path_column="path",
                    transcription_column="sentence",
                )
            ],
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive, then declare the three splits.

        Args:
            dl_manager: download manager used to fetch and unpack `_DL_URL`.

        Returns:
            A list with the train, validation and test `SplitGenerator`s.
            Every split shares the same audio directory and gets its own CSV.
        """
        root_path = dl_manager.download_and_extract(_DL_URL)
        root_path = os.path.join(root_path, "data_simple")
        wav_path = os.path.join(root_path, "audio")
        train_csv = os.path.join(root_path, "train.csv")
        valid_csv = os.path.join(root_path, "valid.csv")
        test_csv = os.path.join(root_path, "test.csv")
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"wav_path": wav_path, "csv_path": train_csv},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"wav_path": wav_path, "csv_path": valid_csv},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"wav_path": wav_path, "csv_path": test_csv},
            ),
        ]

    def _generate_examples(self, wav_path, csv_path):
        """Yield `(key, example)` pairs read from one split's CSV file.

        Args:
            wav_path: directory that holds the audio files of the archive.
            csv_path: CSV file whose first row is a header and whose other
                rows have two comma-separated fields: audio file, sentence.

        Yields:
            `(key, example)` where `key` is the audio file value exactly as
            written in the CSV and `example` has the "path", "audio" and
            "sentence" fields.

        Raises:
            ValueError: if a non-empty row does not have exactly two fields.
        """
        # newline="" lets the csv module do its own line-ending handling.
        with open(csv_path, encoding="utf-8", newline="") as csv_file:
            csv_reader = csv.reader(
                csv_file,
                delimiter=",",
                quotechar=None,
                skipinitialspace=True,
            )
            # The first row is the header; drop it.
            next(csv_reader, None)
            for row in csv_reader:
                if not row:
                    # The reader returns [] for blank lines; nothing to emit.
                    continue
                file_name, sentence = row
                # Locate the clip inside the audio directory. os.path.join
                # returns file_name unchanged when it is already absolute.
                audio_file = os.path.join(wav_path, file_name)
                if not os.path.isfile(audio_file):
                    # Fall back to the raw CSV value so rows that already
                    # carry a usable path keep working as before.
                    audio_file = file_name
                example = {
                    "path": audio_file,
                    "audio": audio_file,
                    "sentence": sentence,
                }
                yield file_name, example