|
|
|
"""Simple, minimal ASR dataset template.""" |
|
|
|
|
|
import csv |
|
import os |
|
|
|
import datasets |
|
from datasets.tasks import AutomaticSpeechRecognition |
|
|
|
|
|
_CITATION = "" |
|
|
|
_DESCRIPTION = """\ |
|
This is a private dataset |
|
""" |
|
|
|
_URL = "https://localhost" |
|
_DL_URL = "http://localhost:8000/data_simple.tgz" |
|
|
|
|
|
class SimpleTplConfig(datasets.BuilderConfig):
    """BuilderConfig for the simple ASR dataset template."""

    def __init__(self, name, **kwargs):
        """
        Args:
            name: `string`, name of the dataset configuration.
            **kwargs: keyword arguments forwarded to super.
        """
        # Hard-coded voice count; not read anywhere in this file —
        # NOTE(review): confirm whether this should be a constructor argument.
        self.num_of_voice = 100

        # Plain string: the original used an f-string with no placeholders.
        description = "Simple Dataset."
        super().__init__(
            name=name,
            version=datasets.Version("1.1.0", ""),
            description=description,
            **kwargs,
        )
|
|
|
class SimpleTpl(datasets.GeneratorBasedBuilder):
    """Simple Speech dataset.

    Downloads a .tgz archive containing an ``audio/`` directory plus
    ``train.csv`` / ``valid.csv`` / ``test.csv`` manifests, and yields
    (audio, path, sentence) examples for automatic speech recognition.
    """

    VERSION = datasets.Version("1.1.0")

    # Flush examples to the Arrow writer in batches to bound memory use.
    DEFAULT_WRITER_BATCH_SIZE = 1000

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="main",
            version=VERSION,
            description="The simple dataset",
        )
    ]

    def _info(self):
        """Return dataset metadata: feature schema, homepage, citation, ASR task template."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "audio": datasets.Audio(sampling_rate=16000),
                    "path": datasets.Value("string"),
                    "sentence": datasets.Value("string"),
                }
            ),
            supervised_keys=None,
            homepage=_URL,
            citation=_CITATION,
            task_templates=[
                AutomaticSpeechRecognition(
                    audio_file_path_column="path",
                    transcription_column="sentence",
                )
            ],
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive, then define the three splits.

        Every split shares the same ``audio/`` directory; each has its own
        CSV manifest in the archive root.
        """
        root_path = dl_manager.download_and_extract(_DL_URL)
        root_path = os.path.join(root_path, "data_simple")
        wav_path = os.path.join(root_path, "audio")

        split_manifests = [
            (datasets.Split.TRAIN, "train.csv"),
            (datasets.Split.VALIDATION, "valid.csv"),
            (datasets.Split.TEST, "test.csv"),
        ]
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "wav_path": wav_path,
                    "csv_path": os.path.join(root_path, csv_name),
                },
            )
            for split_name, csv_name in split_manifests
        ]

    def _generate_examples(self, wav_path, csv_path):
        """Yield (key, example) pairs from one split's CSV manifest.

        Args:
            wav_path: directory containing the audio files.
            csv_path: CSV file with a header row followed by
                ``filename,sentence`` rows.
        """
        with open(csv_path, encoding="utf-8") as csv_file:
            csv_reader = csv.reader(
                csv_file,
                delimiter=",",
                quotechar=None,  # disable quote handling entirely
                skipinitialspace=True,
            )
            next(csv_reader, None)  # skip the header row

            for idx, row in enumerate(csv_reader):
                filename, sentence = row
                # BUG FIX: the original unpacked the row into the `wav_path`
                # *parameter*, clobbering the audio directory and yielding
                # bare CSV filenames instead of resolvable paths. Join the
                # directory with the filename (a no-op if the CSV already
                # holds absolute paths, since os.path.join keeps an absolute
                # second component).
                audio_path = os.path.join(wav_path, filename)
                yield idx, {
                    "path": audio_path,
                    "audio": audio_path,
                    "sentence": sentence,
                }