dathudeptrai
committed on
Commit
•
732c107
1
Parent(s):
534a855
Update model
Browse files- README.md +93 -0
- config.yml +86 -0
- model.h5 +3 -0
- processor.json +1 -0
README.md
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
tags:
|
3 |
+
- tensorflowtts
|
4 |
+
- audio
|
5 |
+
- text-to-speech
|
6 |
+
- text-to-mel
|
7 |
+
language: fr
|
8 |
+
license: apache-2.0
|
9 |
+
datasets:
|
10 |
+
- synpaflex
|
11 |
+
widget:
|
12 |
+
- text: "Oh, je voudrais tant que tu te souviennes Des jours heureux quand nous étions amis"
|
13 |
+
---
|
14 |
+
|
15 |
+
# Tacotron 2 with Guided Attention trained on Synpaflex (Fr)
|
16 |
+
This repository provides a pretrained [Tacotron2](https://arxiv.org/abs/1712.05884) trained with [Guided Attention](https://arxiv.org/abs/1710.08969) on the Synpaflex dataset (Fr). For details of the model, we encourage you to read more about
|
17 |
+
[TensorFlowTTS](https://github.com/TensorSpeech/TensorFlowTTS).
|
18 |
+
|
19 |
+
|
20 |
+
## Install TensorFlowTTS
|
21 |
+
First of all, please install TensorFlowTTS with the following command:
|
22 |
+
```
|
23 |
+
pip install TensorFlowTTS
|
24 |
+
```
|
25 |
+
|
26 |
+
### Converting your Text to Mel Spectrogram
|
27 |
+
```python
|
28 |
+
import numpy as np
|
29 |
+
import soundfile as sf
|
30 |
+
import yaml
|
31 |
+
|
32 |
+
import tensorflow as tf
|
33 |
+
|
34 |
+
from tensorflow_tts.inference import AutoProcessor
|
35 |
+
from tensorflow_tts.inference import TFAutoModel
|
36 |
+
|
37 |
+
processor = AutoProcessor.from_pretrained("tensorspeech/tts-tacotron2-synpaflex-fr")
|
38 |
+
tacotron2 = TFAutoModel.from_pretrained("tensorspeech/tts-tacotron2-synpaflex-fr")
|
39 |
+
|
40 |
+
text = "Oh, je voudrais tant que tu te souviennes Des jours heureux quand nous étions amis"
|
41 |
+
|
42 |
+
input_ids = processor.text_to_sequence(text)
|
43 |
+
|
44 |
+
decoder_output, mel_outputs, stop_token_prediction, alignment_history = tacotron2.inference(
|
45 |
+
input_ids=tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
|
46 |
+
input_lengths=tf.convert_to_tensor([len(input_ids)], tf.int32),
|
47 |
+
speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32),
|
48 |
+
)
|
49 |
+
|
50 |
+
```
|
51 |
+
|
52 |
+
#### Referencing Tacotron 2
|
53 |
+
```
|
54 |
+
@article{DBLP:journals/corr/abs-1712-05884,
|
55 |
+
author = {Jonathan Shen and
|
56 |
+
Ruoming Pang and
|
57 |
+
Ron J. Weiss and
|
58 |
+
Mike Schuster and
|
59 |
+
Navdeep Jaitly and
|
60 |
+
Zongheng Yang and
|
61 |
+
Zhifeng Chen and
|
62 |
+
Yu Zhang and
|
63 |
+
Yuxuan Wang and
|
64 |
+
R. J. Skerry{-}Ryan and
|
65 |
+
Rif A. Saurous and
|
66 |
+
Yannis Agiomyrgiannakis and
|
67 |
+
Yonghui Wu},
|
68 |
+
title = {Natural {TTS} Synthesis by Conditioning WaveNet on Mel Spectrogram
|
69 |
+
Predictions},
|
70 |
+
journal = {CoRR},
|
71 |
+
volume = {abs/1712.05884},
|
72 |
+
year = {2017},
|
73 |
+
url = {http://arxiv.org/abs/1712.05884},
|
74 |
+
archivePrefix = {arXiv},
|
75 |
+
eprint = {1712.05884},
|
76 |
+
timestamp = {Thu, 28 Nov 2019 08:59:52 +0100},
|
77 |
+
biburl = {https://dblp.org/rec/journals/corr/abs-1712-05884.bib},
|
78 |
+
bibsource = {dblp computer science bibliography, https://dblp.org}
|
79 |
+
}
|
80 |
+
```
|
81 |
+
|
82 |
+
#### Referencing TensorFlowTTS
|
83 |
+
```
|
84 |
+
@misc{TFTTS,
|
85 |
+
author = {Minh Nguyen and Alejandro Miguel Velasquez and Erogol and Kuan Chen and Dawid Kobus and Takuya Ebata and
|
86 |
+
Trinh Le and Yunchao He},
|
87 |
+
title = {TensorflowTTS},
|
88 |
+
year = {2020},
|
89 |
+
publisher = {GitHub},
|
90 |
+
journal = {GitHub repository},
|
91 |
+
howpublished = {\url{https://github.com/TensorSpeech/TensorFlowTTS}},
|
92 |
+
}
|
93 |
+
```
|
config.yml
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# This is the hyperparameter configuration file for Tacotron2 v1.
|
2 |
+
# Please make sure this is adjusted for the synpaflex dataset. If you want to
|
3 |
+
# apply it to another dataset, you might need to carefully change some parameters.
|
4 |
+
# This configuration performs 200k iters but 65k iters is enough to get a good model.
|
5 |
+
|
6 |
+
###########################################################
|
7 |
+
# FEATURE EXTRACTION SETTING #
|
8 |
+
###########################################################
|
9 |
+
hop_size: 256 # Hop size.
|
10 |
+
format: "npy"
|
11 |
+
|
12 |
+
|
13 |
+
###########################################################
|
14 |
+
# NETWORK ARCHITECTURE SETTING #
|
15 |
+
###########################################################
|
16 |
+
model_type: "tacotron2"
|
17 |
+
|
18 |
+
tacotron2_params:
|
19 |
+
dataset: synpaflex
|
20 |
+
embedding_hidden_size: 512
|
21 |
+
initializer_range: 0.02
|
22 |
+
embedding_dropout_prob: 0.1
|
23 |
+
n_speakers: 1
|
24 |
+
n_conv_encoder: 5
|
25 |
+
encoder_conv_filters: 512
|
26 |
+
encoder_conv_kernel_sizes: 5
|
27 |
+
encoder_conv_activation: 'relu'
|
28 |
+
encoder_conv_dropout_rate: 0.5
|
29 |
+
encoder_lstm_units: 256
|
30 |
+
n_prenet_layers: 2
|
31 |
+
prenet_units: 256
|
32 |
+
prenet_activation: 'relu'
|
33 |
+
prenet_dropout_rate: 0.5
|
34 |
+
n_lstm_decoder: 1
|
35 |
+
reduction_factor: 1
|
36 |
+
decoder_lstm_units: 1024
|
37 |
+
attention_dim: 128
|
38 |
+
attention_filters: 32
|
39 |
+
attention_kernel: 31
|
40 |
+
n_mels: 80
|
41 |
+
n_conv_postnet: 5
|
42 |
+
postnet_conv_filters: 512
|
43 |
+
postnet_conv_kernel_sizes: 5
|
44 |
+
postnet_dropout_rate: 0.1
|
45 |
+
attention_type: "lsa"
|
46 |
+
|
47 |
+
###########################################################
|
48 |
+
# DATA LOADER SETTING #
|
49 |
+
###########################################################
|
50 |
+
batch_size: 32 # Batch size for each GPU, assuming that gradient_accumulation_steps == 1.
|
51 |
+
remove_short_samples: true # Whether to remove samples whose length is less than batch_max_steps.
|
52 |
+
allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
|
53 |
+
mel_length_threshold: 32 # remove all targets that have mel_length <= 32
|
54 |
+
is_shuffle: true # shuffle dataset after each epoch.
|
55 |
+
use_fixed_shapes: true # use_fixed_shapes for training (2x speed-up)
|
56 |
+
# refer (https://github.com/dathudeptrai/TensorflowTTS/issues/34#issuecomment-642309118)
|
57 |
+
|
58 |
+
###########################################################
|
59 |
+
# OPTIMIZER & SCHEDULER SETTING #
|
60 |
+
###########################################################
|
61 |
+
optimizer_params:
|
62 |
+
initial_learning_rate: 0.001
|
63 |
+
end_learning_rate: 0.00001
|
64 |
+
decay_steps: 150000 # < train_max_steps is recommended.
|
65 |
+
warmup_proportion: 0.02
|
66 |
+
weight_decay: 0.001
|
67 |
+
|
68 |
+
gradient_accumulation_steps: 1
|
69 |
+
var_train_expr: null # trainable variable expr (eg. 'embeddings|decoder_cell' )
|
70 |
+
# must be separated by |. If var_train_expr is null then we
|
71 |
+
# train all variables.
|
72 |
+
###########################################################
|
73 |
+
# INTERVAL SETTING #
|
74 |
+
###########################################################
|
75 |
+
train_max_steps: 200000 # Number of training steps.
|
76 |
+
save_interval_steps: 2000 # Interval steps to save checkpoint.
|
77 |
+
eval_interval_steps: 500 # Interval steps to evaluate the network.
|
78 |
+
log_interval_steps: 200 # Interval steps to record the training log.
|
79 |
+
start_schedule_teacher_forcing: 200001 # don't need to apply schedule teacher forcing.
|
80 |
+
start_ratio_value: 0.5 # start ratio of scheduled teacher forcing.
|
81 |
+
schedule_decay_steps: 50000 # decay step scheduled teacher forcing.
|
82 |
+
end_ratio_value: 0.0 # end ratio of scheduled teacher forcing.
|
83 |
+
###########################################################
|
84 |
+
# OTHER SETTING #
|
85 |
+
###########################################################
|
86 |
+
num_save_intermediate_results: 1 # Number of results to be saved as intermediate results.
|
model.h5
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7761e61d0dd3bbe9387ff6191d1507d9fd308d6117c8d3ec2f8151c6f9ea4470
|
3 |
+
size 127842184
|
processor.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"symbol_to_id": {"pad": 0, "!": 1, "/": 2, "'": 3, "(": 4, ")": 5, ",": 6, "-": 7, ".": 8, ":": 9, ";": 10, "?": 11, " ": 12, "A": 13, "B": 14, "C": 15, "D": 16, "E": 17, "F": 18, "G": 19, "H": 20, "I": 21, "J": 22, "K": 23, "L": 24, "M": 25, "N": 26, "O": 27, "P": 28, "Q": 29, "R": 30, "S": 31, "T": 32, "U": 33, "V": 34, "W": 35, "X": 36, "Y": 37, "Z": 38, "a": 39, "b": 40, "c": 41, "d": 42, "e": 43, "f": 44, "g": 45, "h": 46, "i": 47, "j": 48, "k": 49, "l": 50, "m": 51, "n": 52, "o": 53, "p": 54, "q": 55, "r": 56, "s": 57, "t": 58, "u": 59, "v": 60, "w": 61, "x": 62, "y": 63, "z": 64, "\u00e9": 65, "\u00e8": 66, "\u00e0": 67, "\u00f9": 68, "\u00e2": 69, "\u00ea": 70, "\u00ee": 71, "\u00f4": 72, "\u00fb": 73, "\u00e7": 74, "\u00e4": 75, "\u00eb": 76, "\u00ef": 77, "\u00f6": 78, "\u00fc": 79, "\u00ff": 80, "\u0153": 81, "\u00e6": 82, "eos": 83}, "id_to_symbol": {"0": "pad", "1": "!", "2": "/", "3": "'", "4": "(", "5": ")", "6": ",", "7": "-", "8": ".", "9": ":", "10": ";", "11": "?", "12": " ", "13": "A", "14": "B", "15": "C", "16": "D", "17": "E", "18": "F", "19": "G", "20": "H", "21": "I", "22": "J", "23": "K", "24": "L", "25": "M", "26": "N", "27": "O", "28": "P", "29": "Q", "30": "R", "31": "S", "32": "T", "33": "U", "34": "V", "35": "W", "36": "X", "37": "Y", "38": "Z", "39": "a", "40": "b", "41": "c", "42": "d", "43": "e", "44": "f", "45": "g", "46": "h", "47": "i", "48": "j", "49": "k", "50": "l", "51": "m", "52": "n", "53": "o", "54": "p", "55": "q", "56": "r", "57": "s", "58": "t", "59": "u", "60": "v", "61": "w", "62": "x", "63": "y", "64": "z", "65": "\u00e9", "66": "\u00e8", "67": "\u00e0", "68": "\u00f9", "69": "\u00e2", "70": "\u00ea", "71": "\u00ee", "72": "\u00f4", "73": "\u00fb", "74": "\u00e7", "75": "\u00e4", "76": "\u00eb", "77": "\u00ef", "78": "\u00f6", "79": "\u00fc", "80": "\u00ff", "81": "\u0153", "82": "\u00e6", "83": "eos"}, "speakers_map": {"synpaflex": 0}, "processor_name": "SynpaflexProcessor"}
|