Spaces:

None
/

Examples

No application file

App Files Files Community

Examples / SimpleDataset.py

None

Create SimpleDataset.py

3b17866 almost 3 years ago

raw

history blame contribute delete

3.67 kB

	# Lint as: python3
	"""Simple, minimal ASR dataset template."""


	import csv
	import os

	import datasets
	from datasets.tasks import AutomaticSpeechRecognition


	# Citation text reported in the dataset info (`citation=`); intentionally empty.
	_CITATION = ""

	# Human-readable summary reported in the dataset info (`description=`).
	_DESCRIPTION = """\
	This is a private dataset
	"""

	# Homepage reported in the dataset info (`homepage=`).
	_URL = "https://localhost"
	# Archive handed to the download manager in `_split_generators`; after
	# extraction the loader looks for a top-level `data_simple` folder inside it.
	_DL_URL = "http://localhost:8000/data_simple.tgz"


	class SimpleTplConfig(datasets.BuilderConfig):
	    """BuilderConfig for the simple ASR dataset template.

	    Every configuration created through this class is pinned to version
	    1.1.0 and carries the fixed description "Simple Dataset.".
	    """

	    def __init__(self, name, **kwargs):
	        """Create a named configuration.

	        Args:
	            name: `string`, name of the configuration.
	            **kwargs: keyword arguments forwarded to super. `version` and
	                `description` are set here, so passing either of them
	                raises a duplicate-keyword `TypeError`.
	        """
	        # NOTE(review): presumably the number of voices/speakers in the
	        # dataset; it is not read anywhere in the visible code -- confirm.
	        self.num_of_voice = 100

	        super().__init__(
	            name=name,
	            version=datasets.Version("1.1.0", ""),
	            description="Simple Dataset.",
	            **kwargs,
	        )

	class SimpleTpl(datasets.GeneratorBasedBuilder):
	    """Simple speech dataset: audio clips paired with their transcription.

	    The archive at `_DL_URL` is expected to unpack to a `data_simple/` folder
	    that holds an `audio/` directory plus `train.csv`, `valid.csv` and
	    `test.csv`.
	    """

	    VERSION = datasets.Version("1.1.0")

	    DEFAULT_WRITER_BATCH_SIZE = 1000
	    BUILDER_CONFIGS = [
	        datasets.BuilderConfig(
	            name="main",
	            version=VERSION,
	            description="The simple dataset",
	        )
	    ]

	    def _info(self):
	        """Describe the features, homepage, citation and ASR task template."""
	        return datasets.DatasetInfo(
	            description=_DESCRIPTION,
	            features=datasets.Features(
	                {
	                    "audio": datasets.Audio(sampling_rate=16000),
	                    "path": datasets.Value("string"),
	                    "sentence": datasets.Value("string"),
	                }
	            ),
	            supervised_keys=None,
	            homepage=_URL,
	            citation=_CITATION,
	            task_templates=[
	                AutomaticSpeechRecognition(
	                    audio_file_path_column="path",
	                    transcription_column="sentence",
	                )
	            ],
	        )

	    def _split_generators(self, dl_manager):
	        """Download and extract the archive, then declare the three splits.

	        Args:
	            dl_manager: download manager supplied by `datasets`; used to
	                fetch and unpack `_DL_URL`.

	        Returns:
	            A list with one `SplitGenerator` each for train, validation and
	            test. All of them share the same audio directory and differ only
	            in the CSV file they read.
	        """
	        root_path = os.path.join(
	            dl_manager.download_and_extract(_DL_URL), "data_simple"
	        )
	        wav_path = os.path.join(root_path, "audio")

	        split_files = (
	            (datasets.Split.TRAIN, "train.csv"),
	            (datasets.Split.VALIDATION, "valid.csv"),
	            (datasets.Split.TEST, "test.csv"),
	        )
	        return [
	            datasets.SplitGenerator(
	                name=split_name,
	                gen_kwargs={
	                    "wav_path": wav_path,
	                    "csv_path": os.path.join(root_path, csv_name),
	                },
	            )
	            for split_name, csv_name in split_files
	        ]

	    def _generate_examples(self, wav_path, csv_path):
	        """Yield `(key, example)` pairs read from one split's CSV file.

	        Args:
	            wav_path: directory that contains the audio files.
	            csv_path: path to a two-column CSV file (audio file, sentence).
	                Its first row is skipped as a header.

	        Yields:
	            Tuples of the audio file entry exactly as written in the CSV
	            (used as the example key) and a dict with the keys "path",
	            "audio" and "sentence".

	        Raises:
	            ValueError: if a non-empty row does not have exactly two fields.
	        """
	        # newline="" lets the csv module do its own line-ending handling.
	        with open(csv_path, encoding="utf-8", newline="") as csv_file:
	            csv_reader = csv.reader(
	                csv_file,
	                delimiter=",",
	                quotechar=None,
	                skipinitialspace=True,
	            )
	            # Drop the first row (header).
	            next(csv_reader, None)

	            for row in csv_reader:
	                if not row:
	                    # Blank lines come back as empty lists; skip them
	                    # instead of failing on the unpack below.
	                    continue
	                file_name, sentence = row
	                # Resolve the entry against the audio directory. The row
	                # value must not overwrite `wav_path`, otherwise the
	                # directory is lost. An absolute entry is returned
	                # unchanged by os.path.join.
	                audio_file = os.path.join(wav_path, file_name)
	                example = {
	                    "path": audio_file,
	                    "audio": audio_file,
	                    "sentence": sentence,
	                }

	                yield file_name, example