Afegeix models
Browse files- Dockerfile +2 -2
- models/bsc/best_model.pth +3 -0
- models/bsc/config.json +262 -0
- models/bsc/speaker_map.json +10 -0
- models/bsc/speakers.pth +3 -0
- models/collectivat/catotron-ona-TTS-API-entry.json +10 -0
- models/collectivat/fast-speech_best_model.pth +3 -0
- models/collectivat/fast-speech_config.json +213 -0
- models/collectivat/ljspeech--hifigan_v2_config.json +158 -0
- models/collectivat/ljspeech--hifigan_v2_model_file.pth +3 -0
- models/piper/MODEL_CARD +15 -0
- models/piper/ca-upc_ona-x-low.onnx +3 -0
- models/piper/ca-upc_ona-x-low.onnx.json +409 -0
Dockerfile
CHANGED
@@ -12,6 +12,7 @@ RUN cd espeak-ng && \
|
|
12 |
|
13 |
COPY requirements.txt .
|
14 |
COPY app.py .
|
|
|
15 |
|
16 |
RUN pip install -r requirements.txt
|
17 |
|
@@ -20,7 +21,6 @@ RUN mkdir -p cache && chmod 777 cache
|
|
20 |
ENV NUMBA_CACHE_DIR=./cache
|
21 |
ENV MPLCONFIGDIR=./cache
|
22 |
|
23 |
-
|
24 |
EXPOSE 7860
|
25 |
|
26 |
-
CMD python app.py
|
|
|
12 |
|
13 |
COPY requirements.txt .
|
14 |
COPY app.py .
|
15 |
+
COPY models .
|
16 |
|
17 |
RUN pip install -r requirements.txt
|
18 |
|
|
|
21 |
ENV NUMBA_CACHE_DIR=./cache
|
22 |
ENV MPLCONFIGDIR=./cache
|
23 |
|
|
|
24 |
EXPOSE 7860
|
25 |
|
26 |
+
CMD ["python", "app.py"]
|
models/bsc/best_model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b15fa7d2052bada1cf421e49d2d03b00e95b49fcd0e42b7af1d92da2880cdecc
|
3 |
+
size 1038659133
|
models/bsc/config.json
ADDED
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"output_path": "/gpfs/projects/bsc88/speech/tts/TTS_v0.8.0/recipes/multispeaker/experiments_from_previous",
|
3 |
+
"logger_uri": null,
|
4 |
+
"run_name": "multispeaker_vits_ca_1e4_1e4_32",
|
5 |
+
"project_name": null,
|
6 |
+
"run_description": "\ud83d\udc38Coqui trainer run.",
|
7 |
+
"print_step": 25,
|
8 |
+
"plot_step": 100,
|
9 |
+
"model_param_stats": false,
|
10 |
+
"wandb_entity": null,
|
11 |
+
"dashboard_logger": "tensorboard",
|
12 |
+
"log_model_step": 1000,
|
13 |
+
"save_step": 1000,
|
14 |
+
"save_n_checkpoints": 5,
|
15 |
+
"save_checkpoints": true,
|
16 |
+
"save_all_best": true,
|
17 |
+
"save_best_after": 10000,
|
18 |
+
"target_loss": null,
|
19 |
+
"print_eval": true,
|
20 |
+
"test_delay_epochs": -1,
|
21 |
+
"run_eval": true,
|
22 |
+
"run_eval_steps": null,
|
23 |
+
"distributed_backend": "nccl",
|
24 |
+
"distributed_url": "tcp://localhost:54321",
|
25 |
+
"mixed_precision": false,
|
26 |
+
"epochs": 1000,
|
27 |
+
"batch_size": 16,
|
28 |
+
"eval_batch_size": 8,
|
29 |
+
"grad_clip": [
|
30 |
+
1000.0,
|
31 |
+
1000.0
|
32 |
+
],
|
33 |
+
"scheduler_after_epoch": true,
|
34 |
+
"lr": 0.001,
|
35 |
+
"optimizer": "AdamW",
|
36 |
+
"optimizer_params": {
|
37 |
+
"betas": [
|
38 |
+
0.8,
|
39 |
+
0.99
|
40 |
+
],
|
41 |
+
"eps": 1e-09,
|
42 |
+
"weight_decay": 0.01
|
43 |
+
},
|
44 |
+
"lr_scheduler": "",
|
45 |
+
"lr_scheduler_params": null,
|
46 |
+
"use_grad_scaler": false,
|
47 |
+
"cudnn_enable": true,
|
48 |
+
"cudnn_deterministic": false,
|
49 |
+
"cudnn_benchmark": false,
|
50 |
+
"training_seed": 54321,
|
51 |
+
"model": "vits",
|
52 |
+
"num_loader_workers": 4,
|
53 |
+
"num_eval_loader_workers": 4,
|
54 |
+
"use_noise_augment": false,
|
55 |
+
"audio": {
|
56 |
+
"fft_size": 1024,
|
57 |
+
"sample_rate": 22050,
|
58 |
+
"win_length": 1024,
|
59 |
+
"hop_length": 256,
|
60 |
+
"num_mels": 80,
|
61 |
+
"mel_fmin": 0,
|
62 |
+
"mel_fmax": null
|
63 |
+
},
|
64 |
+
"use_phonemes": true,
|
65 |
+
"phonemizer": "espeak",
|
66 |
+
"phoneme_language": "ca",
|
67 |
+
"compute_input_seq_cache": true,
|
68 |
+
"text_cleaner": "multilingual_cleaners",
|
69 |
+
"enable_eos_bos_chars": false,
|
70 |
+
"test_sentences_file": "",
|
71 |
+
"phoneme_cache_path": "/gpfs/projects/bsc88/speech/tts/TTS_v0.8.0/recipes/multispeaker/phoneme_cache",
|
72 |
+
"characters": {
|
73 |
+
"characters_class": "TTS.tts.utils.text.characters.IPAPhonemes",
|
74 |
+
"vocab_dict": null,
|
75 |
+
"pad": "<PAD>",
|
76 |
+
"eos": "<EOS>",
|
77 |
+
"bos": "<BOS>",
|
78 |
+
"blank": "<BLNK>",
|
79 |
+
"characters": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u02b2\u025a\u02de\u026b",
|
80 |
+
"punctuations": "!'(),-.:;? ",
|
81 |
+
"phonemes": null,
|
82 |
+
"is_unique": false,
|
83 |
+
"is_sorted": true
|
84 |
+
},
|
85 |
+
"add_blank": true,
|
86 |
+
"batch_group_size": 5,
|
87 |
+
"loss_masking": null,
|
88 |
+
"min_audio_len": 1,
|
89 |
+
"max_audio_len": Infinity,
|
90 |
+
"min_text_len": 1,
|
91 |
+
"max_text_len": 325,
|
92 |
+
"compute_f0": false,
|
93 |
+
"compute_linear_spec": true,
|
94 |
+
"precompute_num_workers": 0,
|
95 |
+
"start_by_longest": false,
|
96 |
+
"datasets": [
|
97 |
+
{
|
98 |
+
"formatter": "vctk_old",
|
99 |
+
"dataset_name": "vctk_old",
|
100 |
+
"path": "/gpfs/scratch/bsc88/bsc88474/data/multispeaker_ca",
|
101 |
+
"meta_file_train": "",
|
102 |
+
"ignored_speakers": [
|
103 |
+
"uri",
|
104 |
+
"09796",
|
105 |
+
"05450"
|
106 |
+
],
|
107 |
+
"language": "ca",
|
108 |
+
"meta_file_val": "",
|
109 |
+
"meta_file_attn_mask": ""
|
110 |
+
}
|
111 |
+
],
|
112 |
+
"test_sentences": [
|
113 |
+
[
|
114 |
+
"Per exemple, dels nostres bancs que inverteixen en armament de les nostres empreses."
|
115 |
+
],
|
116 |
+
[
|
117 |
+
"Preguntin-se si aix\u00f2 era necessari."
|
118 |
+
],
|
119 |
+
[
|
120 |
+
"La suposada ocultaci\u00f3 dels informes que advertien de risc s\u00edsmic."
|
121 |
+
],
|
122 |
+
[
|
123 |
+
"\u00c9s de 633 milions d'euros quan es far\u00e0 la publicaci\u00f3 detallada."
|
124 |
+
]
|
125 |
+
],
|
126 |
+
"eval_split_max_size": null,
|
127 |
+
"eval_split_size": 0.01,
|
128 |
+
"use_speaker_weighted_sampler": false,
|
129 |
+
"speaker_weighted_sampler_alpha": 1.0,
|
130 |
+
"use_language_weighted_sampler": false,
|
131 |
+
"language_weighted_sampler_alpha": 1.0,
|
132 |
+
"use_length_weighted_sampler": false,
|
133 |
+
"length_weighted_sampler_alpha": 1.0,
|
134 |
+
"model_args": {
|
135 |
+
"num_chars": 131,
|
136 |
+
"out_channels": 513,
|
137 |
+
"spec_segment_size": 32,
|
138 |
+
"hidden_channels": 192,
|
139 |
+
"hidden_channels_ffn_text_encoder": 768,
|
140 |
+
"num_heads_text_encoder": 2,
|
141 |
+
"num_layers_text_encoder": 6,
|
142 |
+
"kernel_size_text_encoder": 3,
|
143 |
+
"dropout_p_text_encoder": 0.1,
|
144 |
+
"dropout_p_duration_predictor": 0.5,
|
145 |
+
"kernel_size_posterior_encoder": 5,
|
146 |
+
"dilation_rate_posterior_encoder": 1,
|
147 |
+
"num_layers_posterior_encoder": 16,
|
148 |
+
"kernel_size_flow": 5,
|
149 |
+
"dilation_rate_flow": 1,
|
150 |
+
"num_layers_flow": 4,
|
151 |
+
"resblock_type_decoder": "1",
|
152 |
+
"resblock_kernel_sizes_decoder": [
|
153 |
+
3,
|
154 |
+
7,
|
155 |
+
11
|
156 |
+
],
|
157 |
+
"resblock_dilation_sizes_decoder": [
|
158 |
+
[
|
159 |
+
1,
|
160 |
+
3,
|
161 |
+
5
|
162 |
+
],
|
163 |
+
[
|
164 |
+
1,
|
165 |
+
3,
|
166 |
+
5
|
167 |
+
],
|
168 |
+
[
|
169 |
+
1,
|
170 |
+
3,
|
171 |
+
5
|
172 |
+
]
|
173 |
+
],
|
174 |
+
"upsample_rates_decoder": [
|
175 |
+
8,
|
176 |
+
8,
|
177 |
+
2,
|
178 |
+
2
|
179 |
+
],
|
180 |
+
"upsample_initial_channel_decoder": 512,
|
181 |
+
"upsample_kernel_sizes_decoder": [
|
182 |
+
16,
|
183 |
+
16,
|
184 |
+
4,
|
185 |
+
4
|
186 |
+
],
|
187 |
+
"periods_multi_period_discriminator": [
|
188 |
+
2,
|
189 |
+
3,
|
190 |
+
5,
|
191 |
+
7,
|
192 |
+
11
|
193 |
+
],
|
194 |
+
"use_sdp": true,
|
195 |
+
"noise_scale": 1.0,
|
196 |
+
"inference_noise_scale": 0.667,
|
197 |
+
"length_scale": 1.0,
|
198 |
+
"noise_scale_dp": 1.0,
|
199 |
+
"inference_noise_scale_dp": 1.0,
|
200 |
+
"max_inference_len": null,
|
201 |
+
"init_discriminator": true,
|
202 |
+
"use_spectral_norm_disriminator": false,
|
203 |
+
"use_speaker_embedding": true,
|
204 |
+
"num_speakers": 257,
|
205 |
+
"speakers_file": "/home/user/app/speakers.pth",
|
206 |
+
"d_vector_file": null,
|
207 |
+
"speaker_embedding_channels": 256,
|
208 |
+
"use_d_vector_file": false,
|
209 |
+
"d_vector_dim": 0,
|
210 |
+
"detach_dp_input": true,
|
211 |
+
"use_language_embedding": false,
|
212 |
+
"embedded_language_dim": 4,
|
213 |
+
"num_languages": 0,
|
214 |
+
"language_ids_file": null,
|
215 |
+
"use_speaker_encoder_as_loss": false,
|
216 |
+
"speaker_encoder_config_path": "",
|
217 |
+
"speaker_encoder_model_path": "",
|
218 |
+
"condition_dp_on_speaker": true,
|
219 |
+
"freeze_encoder": false,
|
220 |
+
"freeze_DP": false,
|
221 |
+
"freeze_PE": false,
|
222 |
+
"freeze_flow_decoder": false,
|
223 |
+
"freeze_waveform_decoder": false,
|
224 |
+
"encoder_sample_rate": null,
|
225 |
+
"interpolate_z": true,
|
226 |
+
"reinit_DP": false,
|
227 |
+
"reinit_text_encoder": false
|
228 |
+
},
|
229 |
+
"lr_gen": 0.0001,
|
230 |
+
"lr_disc": 0.0001,
|
231 |
+
"lr_scheduler_gen": "ExponentialLR",
|
232 |
+
"lr_scheduler_gen_params": {
|
233 |
+
"gamma": 0.999875,
|
234 |
+
"last_epoch": -1
|
235 |
+
},
|
236 |
+
"lr_scheduler_disc": "ExponentialLR",
|
237 |
+
"lr_scheduler_disc_params": {
|
238 |
+
"gamma": 0.999875,
|
239 |
+
"last_epoch": -1
|
240 |
+
},
|
241 |
+
"kl_loss_alpha": 1.0,
|
242 |
+
"disc_loss_alpha": 1.0,
|
243 |
+
"gen_loss_alpha": 1.0,
|
244 |
+
"feat_loss_alpha": 1.0,
|
245 |
+
"mel_loss_alpha": 45.0,
|
246 |
+
"dur_loss_alpha": 1.0,
|
247 |
+
"speaker_encoder_loss_alpha": 1.0,
|
248 |
+
"return_wav": true,
|
249 |
+
"use_weighted_sampler": false,
|
250 |
+
"weighted_sampler_attrs": null,
|
251 |
+
"weighted_sampler_multipliers": null,
|
252 |
+
"r": 1,
|
253 |
+
"num_speakers": 257,
|
254 |
+
"use_speaker_embedding": true,
|
255 |
+
"speakers_file": "/home/user/app/speakers.pth",
|
256 |
+
"speaker_embedding_channels": 256,
|
257 |
+
"language_ids_file": null,
|
258 |
+
"use_language_embedding": false,
|
259 |
+
"use_d_vector_file": false,
|
260 |
+
"d_vector_file": null,
|
261 |
+
"d_vector_dim": 0
|
262 |
+
}
|
models/bsc/speaker_map.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"f_cen_05": "05739",
|
3 |
+
"f_cen_81": "8162d651b6211f06f655a69cd7fdd383d6b4287e9ba132b9898ef9ac8687349e777626333d23bed93f9264aae965efb14ed650cb64fd0ad90494aff903eaef11",
|
4 |
+
"f_occ_31": "31535cb2ece4710d08fdbeefb6f8f75ed093fee4cf8573bd601d960f8c6156f0fd0a85712761691e86e31160b993ee0eacb10c4c8aed000cc394cf7c7d207a7e",
|
5 |
+
"f_occ_de": "dee065b956b99b10db4763759d64c41791af1a7e77f1864f90a2b0847a12633dcf9bc108db7eaf73cc8d0e750f5c37383a56cd77cc2276d3960104c6bebe6346",
|
6 |
+
"f_sep_31": "31e6f3a011661320b2e59b6f8be43f6db2243e9feabc2b9787c1413788e13eb0e5810bed983bf7ff66e46417d183a91ed50b3b9be9d89e4f51aada72293b9881",
|
7 |
+
"m_cen_08": "08935",
|
8 |
+
"m_occ_44": "30b1f81c579755895581259d79a8a5a3ca45b908b0bd14ad1c6418f39aa1e2f47cb4749c69b5440cdb92e3bafb772e19e7bc2b16d196b061addd173a1309e491",
|
9 |
+
"m_val_89": "896256329fbeb5b8116349c31d8a39a7d36d5f970d48558e1db5417d611e240e4dbf473f6e49137f7aa6116394b7deabb0bbec4a014896cdc9484ee91458117d"
|
10 |
+
}
|
models/bsc/speakers.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6dacda0b8dd3e111c5072f8f33c08b4a29b92ac79aaf22ceca912d01e7deb905
|
3 |
+
size 30191
|
models/collectivat/catotron-ona-TTS-API-entry.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"voice": "ona-fast-hifigan",
|
3 |
+
"lang": "ca",
|
4 |
+
"model_type": "coqui",
|
5 |
+
"tts_config_path": "fast-speech_config.json",
|
6 |
+
"tts_model_path": "fast-speech_best_model.pth",
|
7 |
+
"vocoder_config_path": "ljspeech--hifigan_v2_config.json",
|
8 |
+
"vocoder_model_path": "ljspeech--hifigan_v2_model_file.pth",
|
9 |
+
"load": true
|
10 |
+
}
|
models/collectivat/fast-speech_best_model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3a5aefb9f49f6172e34b816e1de8f5234012f0a9a05747973f6610e40869983f
|
3 |
+
size 457921637
|
models/collectivat/fast-speech_config.json
ADDED
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"output_path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron",
|
3 |
+
"logger_uri": null,
|
4 |
+
"run_name": "fast_pitch_ljspeech",
|
5 |
+
"project_name": null,
|
6 |
+
"run_description": "\ud83d\udc38Coqui trainer run.",
|
7 |
+
"print_step": 50,
|
8 |
+
"plot_step": 100,
|
9 |
+
"model_param_stats": false,
|
10 |
+
"wandb_entity": null,
|
11 |
+
"dashboard_logger": "tensorboard",
|
12 |
+
"log_model_step": null,
|
13 |
+
"save_step": 10000,
|
14 |
+
"save_n_checkpoints": 5,
|
15 |
+
"save_checkpoints": true,
|
16 |
+
"save_all_best": false,
|
17 |
+
"save_best_after": 1000,
|
18 |
+
"target_loss": null,
|
19 |
+
"print_eval": false,
|
20 |
+
"test_delay_epochs": -1,
|
21 |
+
"run_eval": true,
|
22 |
+
"run_eval_steps": null,
|
23 |
+
"distributed_backend": "nccl",
|
24 |
+
"distributed_url": "tcp://localhost:54321",
|
25 |
+
"mixed_precision": false,
|
26 |
+
"epochs": 1000,
|
27 |
+
"batch_size": 16,
|
28 |
+
"eval_batch_size": 16,
|
29 |
+
"grad_clip": 5.0,
|
30 |
+
"scheduler_after_epoch": true,
|
31 |
+
"lr": 0.0001,
|
32 |
+
"optimizer": "Adam",
|
33 |
+
"optimizer_params": {
|
34 |
+
"betas": [
|
35 |
+
0.9,
|
36 |
+
0.998
|
37 |
+
],
|
38 |
+
"weight_decay": 1e-06
|
39 |
+
},
|
40 |
+
"lr_scheduler": "NoamLR",
|
41 |
+
"lr_scheduler_params": {
|
42 |
+
"warmup_steps": 4000
|
43 |
+
},
|
44 |
+
"use_grad_scaler": false,
|
45 |
+
"cudnn_enable": true,
|
46 |
+
"cudnn_deterministic": false,
|
47 |
+
"cudnn_benchmark": false,
|
48 |
+
"training_seed": 54321,
|
49 |
+
"model": "fast_pitch",
|
50 |
+
"num_loader_workers": 8,
|
51 |
+
"num_eval_loader_workers": 4,
|
52 |
+
"use_noise_augment": false,
|
53 |
+
"audio": {
|
54 |
+
"fft_size": 1024,
|
55 |
+
"win_length": 1024,
|
56 |
+
"hop_length": 256,
|
57 |
+
"frame_shift_ms": null,
|
58 |
+
"frame_length_ms": null,
|
59 |
+
"stft_pad_mode": "reflect",
|
60 |
+
"sample_rate": 22050,
|
61 |
+
"resample": false,
|
62 |
+
"preemphasis": 0.0,
|
63 |
+
"ref_level_db": 20,
|
64 |
+
"do_sound_norm": false,
|
65 |
+
"log_func": "np.log",
|
66 |
+
"do_trim_silence": true,
|
67 |
+
"trim_db": 60.0,
|
68 |
+
"do_rms_norm": false,
|
69 |
+
"db_level": null,
|
70 |
+
"power": 1.5,
|
71 |
+
"griffin_lim_iters": 60,
|
72 |
+
"num_mels": 80,
|
73 |
+
"mel_fmin": 0.0,
|
74 |
+
"mel_fmax": 8000,
|
75 |
+
"spec_gain": 1.0,
|
76 |
+
"do_amp_to_db_linear": true,
|
77 |
+
"do_amp_to_db_mel": true,
|
78 |
+
"pitch_fmax": 640.0,
|
79 |
+
"pitch_fmin": 0.0,
|
80 |
+
"signal_norm": false,
|
81 |
+
"min_level_db": -100,
|
82 |
+
"symmetric_norm": true,
|
83 |
+
"max_norm": 4.0,
|
84 |
+
"clip_norm": true,
|
85 |
+
"stats_path": null
|
86 |
+
},
|
87 |
+
"use_phonemes": false,
|
88 |
+
"phonemizer": null,
|
89 |
+
"phoneme_language": "ca-es",
|
90 |
+
"compute_input_seq_cache": true,
|
91 |
+
"text_cleaner": "multilingual_cleaners",
|
92 |
+
"enable_eos_bos_chars": false,
|
93 |
+
"test_sentences_file": "",
|
94 |
+
"phoneme_cache_path": null,
|
95 |
+
"characters": {
|
96 |
+
"characters_class": "TTS.tts.utils.text.characters.Graphemes",
|
97 |
+
"vocab_dict": null,
|
98 |
+
"pad": "_",
|
99 |
+
"eos": "*",
|
100 |
+
"bos": "^",
|
101 |
+
"blank": null,
|
102 |
+
"characters": "A\u00c0\u00c1BC\u00c7DE\u00c9\u00c8FGHI\u00cd\u00cfJKLMNO\u00d3\u00d2PQRSTU\u00dc\u00daVWXYZa\u00e0\u00e1bc\u00e7de\u00e9\u00e8fghi\u00ed\u00efjklmno\u00f3\u00f2pqrstu\u00fc\u00favwxyz",
|
103 |
+
"punctuations": "!'(),-.:;?\u00b7 ",
|
104 |
+
"phonemes": "",
|
105 |
+
"is_unique": true,
|
106 |
+
"is_sorted": true
|
107 |
+
},
|
108 |
+
"add_blank": false,
|
109 |
+
"batch_group_size": 0,
|
110 |
+
"loss_masking": null,
|
111 |
+
"min_audio_len": 1,
|
112 |
+
"max_audio_len": Infinity,
|
113 |
+
"min_text_len": 1,
|
114 |
+
"max_text_len": Infinity,
|
115 |
+
"compute_f0": true,
|
116 |
+
"compute_linear_spec": false,
|
117 |
+
"precompute_num_workers": 4,
|
118 |
+
"start_by_longest": false,
|
119 |
+
"datasets": [
|
120 |
+
{
|
121 |
+
"name": "custom_turkish",
|
122 |
+
"path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/upc_ona",
|
123 |
+
"meta_file_train": "upc_ona_train.txt",
|
124 |
+
"ignored_speakers": null,
|
125 |
+
"language": "",
|
126 |
+
"meta_file_val": "",
|
127 |
+
"meta_file_attn_mask": ""
|
128 |
+
},
|
129 |
+
{
|
130 |
+
"name": "custom_turkish",
|
131 |
+
"path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/upc_ona",
|
132 |
+
"meta_file_train": "upc_ona_val.txt",
|
133 |
+
"ignored_speakers": null,
|
134 |
+
"language": "",
|
135 |
+
"meta_file_val": "",
|
136 |
+
"meta_file_attn_mask": ""
|
137 |
+
}
|
138 |
+
],
|
139 |
+
"test_sentences": [
|
140 |
+
"Hola Barcelona!",
|
141 |
+
"Escriviu al text."
|
142 |
+
],
|
143 |
+
"eval_split_max_size": null,
|
144 |
+
"eval_split_size": 0.01,
|
145 |
+
"use_speaker_weighted_sampler": false,
|
146 |
+
"speaker_weighted_sampler_alpha": 1.0,
|
147 |
+
"use_language_weighted_sampler": false,
|
148 |
+
"language_weighted_sampler_alpha": 1.0,
|
149 |
+
"use_length_weighted_sampler": false,
|
150 |
+
"length_weighted_sampler_alpha": 1.0,
|
151 |
+
"base_model": "forward_tts",
|
152 |
+
"model_args": {
|
153 |
+
"num_chars": 89,
|
154 |
+
"out_channels": 80,
|
155 |
+
"hidden_channels": 384,
|
156 |
+
"use_aligner": true,
|
157 |
+
"use_pitch": true,
|
158 |
+
"pitch_predictor_hidden_channels": 256,
|
159 |
+
"pitch_predictor_kernel_size": 3,
|
160 |
+
"pitch_predictor_dropout_p": 0.1,
|
161 |
+
"pitch_embedding_kernel_size": 3,
|
162 |
+
"duration_predictor_hidden_channels": 256,
|
163 |
+
"duration_predictor_kernel_size": 3,
|
164 |
+
"duration_predictor_dropout_p": 0.1,
|
165 |
+
"positional_encoding": true,
|
166 |
+
"poisitonal_encoding_use_scale": true,
|
167 |
+
"length_scale": 1,
|
168 |
+
"encoder_type": "fftransformer",
|
169 |
+
"encoder_params": {
|
170 |
+
"hidden_channels_ffn": 1024,
|
171 |
+
"num_heads": 1,
|
172 |
+
"num_layers": 6,
|
173 |
+
"dropout_p": 0.1
|
174 |
+
},
|
175 |
+
"decoder_type": "fftransformer",
|
176 |
+
"decoder_params": {
|
177 |
+
"hidden_channels_ffn": 1024,
|
178 |
+
"num_heads": 1,
|
179 |
+
"num_layers": 6,
|
180 |
+
"dropout_p": 0.1
|
181 |
+
},
|
182 |
+
"detach_duration_predictor": false,
|
183 |
+
"max_duration": 75,
|
184 |
+
"num_speakers": 1,
|
185 |
+
"use_speaker_embedding": false,
|
186 |
+
"speakers_file": null,
|
187 |
+
"use_d_vector_file": false,
|
188 |
+
"d_vector_dim": null,
|
189 |
+
"d_vector_file": null
|
190 |
+
},
|
191 |
+
"num_speakers": 0,
|
192 |
+
"speakers_file": null,
|
193 |
+
"use_speaker_embedding": false,
|
194 |
+
"use_d_vector_file": false,
|
195 |
+
"d_vector_file": false,
|
196 |
+
"d_vector_dim": 0,
|
197 |
+
"spec_loss_type": "mse",
|
198 |
+
"duration_loss_type": "mse",
|
199 |
+
"use_ssim_loss": true,
|
200 |
+
"ssim_loss_alpha": 1.0,
|
201 |
+
"spec_loss_alpha": 1.0,
|
202 |
+
"aligner_loss_alpha": 1.0,
|
203 |
+
"pitch_loss_alpha": 0.1,
|
204 |
+
"dur_loss_alpha": 0.1,
|
205 |
+
"binary_align_loss_alpha": 0.1,
|
206 |
+
"binary_loss_warmup_epochs": 150,
|
207 |
+
"min_seq_len": 13,
|
208 |
+
"max_seq_len": 500000,
|
209 |
+
"r": 1,
|
210 |
+
"f0_cache_path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/f0_cache",
|
211 |
+
"restore_path": "/home/twbgmy/.local/share/tts/tts_models--en--ljspeech--fast_pitch/model_file.pth",
|
212 |
+
"github_branch": "* dev"
|
213 |
+
}
|
models/collectivat/ljspeech--hifigan_v2_config.json
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"run_name": "hifigan",
|
3 |
+
"run_description": "universal hifigan trained on LibriTTS with no spectrogram normalization and using log() for scaling instead of log10()",
|
4 |
+
|
5 |
+
|
6 |
+
// AUDIO PARAMETERS
|
7 |
+
"audio":{
|
8 |
+
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectrogram frame.
|
9 |
+
"win_length": 1024, // stft window length in samples.
|
10 |
+
"hop_length": 256, // stft window hop length in samples.
|
11 |
+
"frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
|
12 |
+
"frame_shift_ms": null, // stft window hop length in ms. If null, 'hop_length' is used.
|
13 |
+
|
14 |
+
// Audio processing parameters
|
15 |
+
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
|
16 |
+
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis is applied.
|
17 |
+
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
18 |
+
"log_func": "np.log",
|
19 |
+
|
20 |
+
// Silence trimming
|
21 |
+
"do_trim_silence": false, // enable trimming of silence from audio as it is loaded. LJspeech (false), TWEB (false), Nancy (true)
|
22 |
+
"trim_db": 60, // threshold for trimming silence. Set this according to your dataset.
|
23 |
+
|
24 |
+
// MelSpectrogram parameters
|
25 |
+
"num_mels": 80, // size of the mel spec frame.
|
26 |
+
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
27 |
+
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
|
28 |
+
"spec_gain": 1.0, // scaling value applied after the log transform of the spectrogram.
|
29 |
+
|
30 |
+
// Normalization parameters
|
31 |
+
"signal_norm": false, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
|
32 |
+
"min_level_db": -100, // lower bound for normalization
|
33 |
+
"symmetric_norm": true, // move normalization to range [-1, 1]
|
34 |
+
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
35 |
+
"clip_norm": true, // clip normalized values into the range.
|
36 |
+
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. Scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored.
|
37 |
+
},
|
38 |
+
|
39 |
+
// DISTRIBUTED TRAINING
|
40 |
+
"distributed":{
|
41 |
+
"backend": "nccl",
|
42 |
+
"url": "tcp:\/\/localhost:54324"
|
43 |
+
},
|
44 |
+
|
45 |
+
// MODEL PARAMETERS
|
46 |
+
"use_pqmf": false,
|
47 |
+
|
48 |
+
// LOSS PARAMETERS
|
49 |
+
"use_stft_loss": false,
|
50 |
+
"use_subband_stft_loss": false,
|
51 |
+
"use_mse_gan_loss": true,
|
52 |
+
"use_hinge_gan_loss": false,
|
53 |
+
"use_feat_match_loss": true, // use only with melgan discriminators
|
54 |
+
"use_l1_spec_loss": true,
|
55 |
+
|
56 |
+
// loss weights
|
57 |
+
"stft_loss_weight": 0,
|
58 |
+
"subband_stft_loss_weight": 0,
|
59 |
+
"mse_G_loss_weight": 1,
|
60 |
+
"hinge_G_loss_weight": 0,
|
61 |
+
"feat_match_loss_weight": 10,
|
62 |
+
"l1_spec_loss_weight": 45,
|
63 |
+
|
64 |
+
// multiscale stft loss parameters
|
65 |
+
// "stft_loss_params": {
|
66 |
+
// "n_ffts": [1024, 2048, 512],
|
67 |
+
// "hop_lengths": [120, 240, 50],
|
68 |
+
// "win_lengths": [600, 1200, 240]
|
69 |
+
// },
|
70 |
+
|
71 |
+
"l1_spec_loss_params": {
|
72 |
+
"use_mel": true,
|
73 |
+
"sample_rate": 16000,
|
74 |
+
"n_fft": 1024,
|
75 |
+
"hop_length": 256,
|
76 |
+
"win_length": 1024,
|
77 |
+
"n_mels": 80,
|
78 |
+
"mel_fmin": 0.0,
|
79 |
+
"mel_fmax": null
|
80 |
+
},
|
81 |
+
|
82 |
+
"target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch
|
83 |
+
|
84 |
+
// DISCRIMINATOR
|
85 |
+
"discriminator_model": "hifigan_discriminator",
|
86 |
+
//"discriminator_model_params":{
|
87 |
+
// "peroids": [2, 3, 5, 7, 11],
|
88 |
+
// "base_channels": 16,
|
89 |
+
// "max_channels":512,
|
90 |
+
// "downsample_factors":[4, 4, 4]
|
91 |
+
//},
|
92 |
+
"steps_to_start_discriminator": 0, // steps required to start GAN training.
|
93 |
+
|
94 |
+
// GENERATOR
|
95 |
+
"generator_model": "hifigan_generator",
|
96 |
+
"generator_model_params": {
|
97 |
+
"resblock_type": "1",
|
98 |
+
"upsample_factors": [8,8,2,2],
|
99 |
+
"upsample_kernel_sizes": [16,16,4,4],
|
100 |
+
"upsample_initial_channel": 128,
|
101 |
+
"resblock_kernel_sizes": [3,7,11],
|
102 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]]
|
103 |
+
},
|
104 |
+
|
105 |
+
// DATASET
|
106 |
+
"data_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/vo_voice_quality_transformation/",
|
107 |
+
"feature_path": null,
|
108 |
+
// "feature_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/tacotron-DCA/",
|
109 |
+
"seq_len": 8192,
|
110 |
+
"pad_short": 2000,
|
111 |
+
"conv_pad": 0,
|
112 |
+
"use_noise_augment": false,
|
113 |
+
"use_cache": true,
|
114 |
+
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
|
115 |
+
|
116 |
+
// TRAINING
|
117 |
+
"batch_size": 16, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
|
118 |
+
|
119 |
+
// VALIDATION
|
120 |
+
"run_eval": true,
|
121 |
+
"test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
|
122 |
+
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
|
123 |
+
|
124 |
+
// OPTIMIZER
|
125 |
+
"epochs": 10000, // total number of epochs to train.
|
126 |
+
"wd": 0.0, // Weight decay weight.
|
127 |
+
"gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0
|
128 |
+
"disc_clip_grad": -1, // Discriminator gradient clipping threshold.
|
129 |
+
// "lr_scheduler_gen": "ExponentialLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
130 |
+
// "lr_scheduler_gen_params": {
|
131 |
+
// "gamma": 0.999,
|
132 |
+
// "last_epoch": -1
|
133 |
+
// },
|
134 |
+
// "lr_scheduler_disc": "ExponentialLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
135 |
+
// "lr_scheduler_disc_params": {
|
136 |
+
// "gamma": 0.999,
|
137 |
+
// "last_epoch": -1
|
138 |
+
// },
|
139 |
+
"lr_gen": 0.00001, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
140 |
+
"lr_disc": 0.00001,
|
141 |
+
|
142 |
+
// TENSORBOARD and LOGGING
|
143 |
+
"print_step": 25, // Number of steps between training log messages on the console.
|
144 |
+
"print_eval": false, // If True, it prints loss values for each step in eval run.
|
145 |
+
"save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints.
|
146 |
+
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
147 |
+
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
148 |
+
|
149 |
+
// DATA LOADING
|
150 |
+
"num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
151 |
+
"num_val_loader_workers": 4, // number of evaluation data loader processes.
|
152 |
+
"eval_split_size": 10,
|
153 |
+
|
154 |
+
// PATHS
|
155 |
+
"output_path": "/home/erogol/gdrive/Trainings/sam/"
|
156 |
+
}
|
157 |
+
|
158 |
+
|
models/collectivat/ljspeech--hifigan_v2_model_file.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4047e93886faa1aba11948efa71f59dcb0ec9117e286660e59b91892ef98d129
|
3 |
+
size 3794153
|
models/piper/MODEL_CARD
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Model card for upc_ona (x-low)
|
2 |
+
|
3 |
+
* Language: ca (Catalan)
|
4 |
+
* Speakers: 1
|
5 |
+
* Quality: x-low
|
6 |
+
* Samplerate: 16,000Hz
|
7 |
+
|
8 |
+
## Dataset
|
9 |
+
|
10 |
+
* URL: https://collectivat.cat/asr#upc-festcat-tts-corpora
|
11 |
+
* License: CC BY-SA 3.0 ES
|
12 |
+
|
13 |
+
## Training
|
14 |
+
|
15 |
+
Trained from scratch.
|
models/piper/ca-upc_ona-x-low.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:13661d26423e0c791823823a5971f4e1aaf644a62e65e0e94d299c0e70560e14
|
3 |
+
size 20628813
|
models/piper/ca-upc_ona-x-low.onnx.json
ADDED
@@ -0,0 +1,409 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio": {
|
3 |
+
"sample_rate": 16000
|
4 |
+
},
|
5 |
+
"espeak": {
|
6 |
+
"voice": "ca"
|
7 |
+
},
|
8 |
+
"inference": {
|
9 |
+
"noise_scale": 0.667,
|
10 |
+
"length_scale": 1,
|
11 |
+
"noise_w": 0.8
|
12 |
+
},
|
13 |
+
"phoneme_map": {},
|
14 |
+
"phoneme_id_map": {
|
15 |
+
"_": [
|
16 |
+
0
|
17 |
+
],
|
18 |
+
"^": [
|
19 |
+
1
|
20 |
+
],
|
21 |
+
"$": [
|
22 |
+
2
|
23 |
+
],
|
24 |
+
" ": [
|
25 |
+
3
|
26 |
+
],
|
27 |
+
"!": [
|
28 |
+
4
|
29 |
+
],
|
30 |
+
"'": [
|
31 |
+
5
|
32 |
+
],
|
33 |
+
"(": [
|
34 |
+
6
|
35 |
+
],
|
36 |
+
")": [
|
37 |
+
7
|
38 |
+
],
|
39 |
+
",": [
|
40 |
+
8
|
41 |
+
],
|
42 |
+
"-": [
|
43 |
+
9
|
44 |
+
],
|
45 |
+
".": [
|
46 |
+
10
|
47 |
+
],
|
48 |
+
":": [
|
49 |
+
11
|
50 |
+
],
|
51 |
+
";": [
|
52 |
+
12
|
53 |
+
],
|
54 |
+
"?": [
|
55 |
+
13
|
56 |
+
],
|
57 |
+
"a": [
|
58 |
+
14
|
59 |
+
],
|
60 |
+
"b": [
|
61 |
+
15
|
62 |
+
],
|
63 |
+
"c": [
|
64 |
+
16
|
65 |
+
],
|
66 |
+
"d": [
|
67 |
+
17
|
68 |
+
],
|
69 |
+
"e": [
|
70 |
+
18
|
71 |
+
],
|
72 |
+
"f": [
|
73 |
+
19
|
74 |
+
],
|
75 |
+
"h": [
|
76 |
+
20
|
77 |
+
],
|
78 |
+
"i": [
|
79 |
+
21
|
80 |
+
],
|
81 |
+
"j": [
|
82 |
+
22
|
83 |
+
],
|
84 |
+
"k": [
|
85 |
+
23
|
86 |
+
],
|
87 |
+
"l": [
|
88 |
+
24
|
89 |
+
],
|
90 |
+
"m": [
|
91 |
+
25
|
92 |
+
],
|
93 |
+
"n": [
|
94 |
+
26
|
95 |
+
],
|
96 |
+
"o": [
|
97 |
+
27
|
98 |
+
],
|
99 |
+
"p": [
|
100 |
+
28
|
101 |
+
],
|
102 |
+
"q": [
|
103 |
+
29
|
104 |
+
],
|
105 |
+
"r": [
|
106 |
+
30
|
107 |
+
],
|
108 |
+
"s": [
|
109 |
+
31
|
110 |
+
],
|
111 |
+
"t": [
|
112 |
+
32
|
113 |
+
],
|
114 |
+
"u": [
|
115 |
+
33
|
116 |
+
],
|
117 |
+
"v": [
|
118 |
+
34
|
119 |
+
],
|
120 |
+
"w": [
|
121 |
+
35
|
122 |
+
],
|
123 |
+
"x": [
|
124 |
+
36
|
125 |
+
],
|
126 |
+
"y": [
|
127 |
+
37
|
128 |
+
],
|
129 |
+
"z": [
|
130 |
+
38
|
131 |
+
],
|
132 |
+
"æ": [
|
133 |
+
39
|
134 |
+
],
|
135 |
+
"ç": [
|
136 |
+
40
|
137 |
+
],
|
138 |
+
"ð": [
|
139 |
+
41
|
140 |
+
],
|
141 |
+
"ø": [
|
142 |
+
42
|
143 |
+
],
|
144 |
+
"ħ": [
|
145 |
+
43
|
146 |
+
],
|
147 |
+
"ŋ": [
|
148 |
+
44
|
149 |
+
],
|
150 |
+
"œ": [
|
151 |
+
45
|
152 |
+
],
|
153 |
+
"ǀ": [
|
154 |
+
46
|
155 |
+
],
|
156 |
+
"ǁ": [
|
157 |
+
47
|
158 |
+
],
|
159 |
+
"ǂ": [
|
160 |
+
48
|
161 |
+
],
|
162 |
+
"ǃ": [
|
163 |
+
49
|
164 |
+
],
|
165 |
+
"ɐ": [
|
166 |
+
50
|
167 |
+
],
|
168 |
+
"ɑ": [
|
169 |
+
51
|
170 |
+
],
|
171 |
+
"ɒ": [
|
172 |
+
52
|
173 |
+
],
|
174 |
+
"ɓ": [
|
175 |
+
53
|
176 |
+
],
|
177 |
+
"ɔ": [
|
178 |
+
54
|
179 |
+
],
|
180 |
+
"ɕ": [
|
181 |
+
55
|
182 |
+
],
|
183 |
+
"ɖ": [
|
184 |
+
56
|
185 |
+
],
|
186 |
+
"ɗ": [
|
187 |
+
57
|
188 |
+
],
|
189 |
+
"ɘ": [
|
190 |
+
58
|
191 |
+
],
|
192 |
+
"ə": [
|
193 |
+
59
|
194 |
+
],
|
195 |
+
"ɚ": [
|
196 |
+
60
|
197 |
+
],
|
198 |
+
"ɛ": [
|
199 |
+
61
|
200 |
+
],
|
201 |
+
"ɜ": [
|
202 |
+
62
|
203 |
+
],
|
204 |
+
"ɞ": [
|
205 |
+
63
|
206 |
+
],
|
207 |
+
"ɟ": [
|
208 |
+
64
|
209 |
+
],
|
210 |
+
"ɠ": [
|
211 |
+
65
|
212 |
+
],
|
213 |
+
"ɡ": [
|
214 |
+
66
|
215 |
+
],
|
216 |
+
"ɢ": [
|
217 |
+
67
|
218 |
+
],
|
219 |
+
"ɣ": [
|
220 |
+
68
|
221 |
+
],
|
222 |
+
"ɤ": [
|
223 |
+
69
|
224 |
+
],
|
225 |
+
"ɥ": [
|
226 |
+
70
|
227 |
+
],
|
228 |
+
"ɦ": [
|
229 |
+
71
|
230 |
+
],
|
231 |
+
"ɧ": [
|
232 |
+
72
|
233 |
+
],
|
234 |
+
"ɨ": [
|
235 |
+
73
|
236 |
+
],
|
237 |
+
"ɪ": [
|
238 |
+
74
|
239 |
+
],
|
240 |
+
"ɫ": [
|
241 |
+
75
|
242 |
+
],
|
243 |
+
"ɬ": [
|
244 |
+
76
|
245 |
+
],
|
246 |
+
"ɭ": [
|
247 |
+
77
|
248 |
+
],
|
249 |
+
"ɮ": [
|
250 |
+
78
|
251 |
+
],
|
252 |
+
"ɯ": [
|
253 |
+
79
|
254 |
+
],
|
255 |
+
"ɰ": [
|
256 |
+
80
|
257 |
+
],
|
258 |
+
"ɱ": [
|
259 |
+
81
|
260 |
+
],
|
261 |
+
"ɲ": [
|
262 |
+
82
|
263 |
+
],
|
264 |
+
"ɳ": [
|
265 |
+
83
|
266 |
+
],
|
267 |
+
"ɴ": [
|
268 |
+
84
|
269 |
+
],
|
270 |
+
"ɵ": [
|
271 |
+
85
|
272 |
+
],
|
273 |
+
"ɶ": [
|
274 |
+
86
|
275 |
+
],
|
276 |
+
"ɸ": [
|
277 |
+
87
|
278 |
+
],
|
279 |
+
"ɹ": [
|
280 |
+
88
|
281 |
+
],
|
282 |
+
"ɺ": [
|
283 |
+
89
|
284 |
+
],
|
285 |
+
"ɻ": [
|
286 |
+
90
|
287 |
+
],
|
288 |
+
"ɽ": [
|
289 |
+
91
|
290 |
+
],
|
291 |
+
"ɾ": [
|
292 |
+
92
|
293 |
+
],
|
294 |
+
"ʀ": [
|
295 |
+
93
|
296 |
+
],
|
297 |
+
"ʁ": [
|
298 |
+
94
|
299 |
+
],
|
300 |
+
"ʂ": [
|
301 |
+
95
|
302 |
+
],
|
303 |
+
"ʃ": [
|
304 |
+
96
|
305 |
+
],
|
306 |
+
"ʄ": [
|
307 |
+
97
|
308 |
+
],
|
309 |
+
"ʈ": [
|
310 |
+
98
|
311 |
+
],
|
312 |
+
"ʉ": [
|
313 |
+
99
|
314 |
+
],
|
315 |
+
"ʊ": [
|
316 |
+
100
|
317 |
+
],
|
318 |
+
"ʋ": [
|
319 |
+
101
|
320 |
+
],
|
321 |
+
"ʌ": [
|
322 |
+
102
|
323 |
+
],
|
324 |
+
"ʍ": [
|
325 |
+
103
|
326 |
+
],
|
327 |
+
"ʎ": [
|
328 |
+
104
|
329 |
+
],
|
330 |
+
"ʏ": [
|
331 |
+
105
|
332 |
+
],
|
333 |
+
"ʐ": [
|
334 |
+
106
|
335 |
+
],
|
336 |
+
"ʑ": [
|
337 |
+
107
|
338 |
+
],
|
339 |
+
"ʒ": [
|
340 |
+
108
|
341 |
+
],
|
342 |
+
"ʔ": [
|
343 |
+
109
|
344 |
+
],
|
345 |
+
"ʕ": [
|
346 |
+
110
|
347 |
+
],
|
348 |
+
"ʘ": [
|
349 |
+
111
|
350 |
+
],
|
351 |
+
"ʙ": [
|
352 |
+
112
|
353 |
+
],
|
354 |
+
"ʛ": [
|
355 |
+
113
|
356 |
+
],
|
357 |
+
"ʜ": [
|
358 |
+
114
|
359 |
+
],
|
360 |
+
"ʝ": [
|
361 |
+
115
|
362 |
+
],
|
363 |
+
"ʟ": [
|
364 |
+
116
|
365 |
+
],
|
366 |
+
"ʡ": [
|
367 |
+
117
|
368 |
+
],
|
369 |
+
"ʢ": [
|
370 |
+
118
|
371 |
+
],
|
372 |
+
"ʲ": [
|
373 |
+
119
|
374 |
+
],
|
375 |
+
"ˈ": [
|
376 |
+
120
|
377 |
+
],
|
378 |
+
"ˌ": [
|
379 |
+
121
|
380 |
+
],
|
381 |
+
"ː": [
|
382 |
+
122
|
383 |
+
],
|
384 |
+
"ˑ": [
|
385 |
+
123
|
386 |
+
],
|
387 |
+
"˞": [
|
388 |
+
124
|
389 |
+
],
|
390 |
+
"β": [
|
391 |
+
125
|
392 |
+
],
|
393 |
+
"θ": [
|
394 |
+
126
|
395 |
+
],
|
396 |
+
"χ": [
|
397 |
+
127
|
398 |
+
],
|
399 |
+
"ᵻ": [
|
400 |
+
128
|
401 |
+
],
|
402 |
+
"ⱱ": [
|
403 |
+
129
|
404 |
+
]
|
405 |
+
},
|
406 |
+
"num_symbols": 130,
|
407 |
+
"num_speakers": 1,
|
408 |
+
"speaker_id_map": {}
|
409 |
+
}
|