Bauyrjan committed on
Commit
ae63903
1 Parent(s): 55eb5a5

Upload utils.py

Browse files
Files changed (1) hide show
  1. utils.py +42 -0
utils.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datasets
2
+ import torchaudio
3
+ import re
4
+
5
+
6
def get_test_dataset(data_path: str = 'ISSAI_KSC_335RS_v1.1') -> "datasets.Dataset":
    """Build the test dataset with transcripts and audio attached.

    Reads ``{data_path}/Meta/test.csv`` (space-delimited) and, for every row,
    uses its ``uttID`` value to locate the matching transcript and FLAC file.

    Args:
        data_path: Root directory of the corpus. It must contain the
            ``Meta/test.csv`` index plus the ``Transcriptions`` and
            ``Audios_flac`` sub-directories.

    Returns:
        The dataset produced by ``datasets.load_dataset`` with four columns
        added to each row:

        * ``sentence`` - raw transcript text as read from disk;
        * ``text`` - normalized transcript (see ``process_text`` below);
        * ``speech`` - waveform returned by ``torchaudio.load``;
        * ``sampling_rate`` - sample rate returned by ``torchaudio.load``.
    """
    chars_to_ignore = ["f", "m"]
    # Escape every character so that adding regex metacharacters to the list
    # later cannot silently change the meaning of the character class.
    ignore_pattern = re.compile(
        "[" + "".join(re.escape(ch) for ch in chars_to_ignore) + "]"
    )
    # Single-pass replacement of what appear to be Latin/IPA look-alike
    # letters with their Cyrillic counterparts.
    lookalike_table = str.maketrans({'a': 'а', 'ə': 'ә', 'ɵ': 'ө'})

    def read_sentence(idx):
        """Return the transcript for utterance ``idx`` as a single string."""
        # Transcripts are non-ASCII text; decode explicitly as UTF-8 instead
        # of relying on the platform's default locale encoding.
        with open(f"{data_path}/Transcriptions/{idx}.txt", 'r', encoding='utf-8') as f:
            # Lines keep their trailing newline and are joined with a space.
            return ' '.join(f.readlines())

    def read_text(batch):
        """Attach the raw transcript to the row under ``sentence``."""
        batch["sentence"] = read_sentence(batch['uttID'])
        return batch

    def process_text(batch):
        """Attach the normalized transcript to the row under ``text``."""
        # Order matters: ignored characters are removed *before* lower-casing,
        # so their upper-case forms are kept (and then lower-cased).
        cleaned = ignore_pattern.sub("", batch["sentence"]).lower() + " "
        batch["text"] = cleaned.translate(lookalike_table)
        return batch

    def load_audio(batch):
        """Attach the decoded audio and its sample rate to the row."""
        path = f"{data_path}/Audios_flac/{batch['uttID']}.flac"
        speech_array, sr = torchaudio.load(path)
        batch["speech"] = speech_array
        batch["sampling_rate"] = sr
        return batch

    # The 'csv' builder puts a lone data file into the 'train' split, which is
    # why the test index is requested with split='train'.
    test_dataset = datasets.load_dataset(
        'csv',
        data_files=f"{data_path}/Meta/test.csv",
        delimiter=' ',
        split='train',
    )

    test_dataset = test_dataset.map(read_text)
    test_dataset = test_dataset.map(process_text)
    test_dataset = test_dataset.map(load_audio, num_proc=1)

    return test_dataset