reuben256 commited on
Commit
ae5f105
1 Parent(s): 5ef7d54

Upload 5 files

Browse files
Files changed (5) hide show
  1. README.md +69 -3
  2. config.json +5 -0
  3. gitattributes +34 -0
  4. hyperparams.yaml +64 -0
  5. model.ckpt +3 -0
README.md CHANGED
@@ -1,3 +1,69 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: "lg"
3
+ tags:
4
+ - text-to-speech
5
+ - TTS
6
+ - speech-synthesis
7
+ - Tacotron2
8
+ - speechbrain
9
+ license: "apache-2.0"
10
+ datasets:
11
+ - SALT-TTS
12
+ metrics:
13
+ - mos
14
+ ---
15
+
16
+ # Sunbird AI Text-to-Speech (TTS) model trained on Luganda text
17
+
18
+ ### Text-to-Speech (TTS) with Tacotron2 trained on Professional Studio Recordings
19
+
20
+ This repository provides all the necessary tools for Text-to-Speech (TTS) with SpeechBrain.
21
+
22
+ The pre-trained model takes a short text as input and produces a spectrogram as output. One can get the final waveform by applying a vocoder (e.g., HiFiGAN) on top of the generated spectrogram.
23
+
24
+
25
+ ### Install SpeechBrain
26
+
27
+ ```
28
+ pip install speechbrain
29
+ ```
30
+
31
+
32
+ ### Perform Text-to-Speech (TTS)
33
+
34
+ ```
35
+ import torchaudio
36
+ from speechbrain.pretrained import Tacotron2
37
+ from speechbrain.pretrained import HIFIGAN
38
+
39
+ # Initialize TTS (tacotron2) and Vocoder (HiFIGAN)
40
+ tacotron2 = Tacotron2.from_hparams(source="Sunbird/sunbird-lug-tts", savedir="tmpdir_tts")
41
+ hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_vocoder")
42
+
43
+ # Running the TTS
44
+ mel_output, mel_length, alignment = tacotron2.encode_text("Mbagaliza Christmass Enungi Nomwaka Omugya Gubaberere Gwamirembe")
45
+
46
+ # Running Vocoder (spectrogram-to-waveform)
47
+ waveforms = hifi_gan.decode_batch(mel_output)
48
+
49
+ # Save the waveform
50
+ torchaudio.save('example_TTS.wav',waveforms.squeeze(1), 22050)
51
+ ```
52
+
53
+ If you want to generate multiple sentences in one shot, you can do it in this way:
54
+
55
+ ```
56
+ from speechbrain.pretrained import Tacotron2
57
+ tacotron2 = Tacotron2.from_hparams(source="speechbrain/TTS_Tacotron2", savedir="tmpdir")
58
+
59
+ items = [
60
+ "Nsanyuse okukulaba",
61
+ "Erinnya lyo ggwe ani?",
62
+ "Mbagaliza Christmass Enungi Nomwaka Omugya Gubaberere Gwamirembe"
63
+ ]
64
+ mel_outputs, mel_lengths, alignments = tacotron2.encode_batch(items)
65
+
66
+ ```
67
+
68
+ ### Inference on GPU
69
+ To perform inference on the GPU, add `run_opts={"device":"cuda"}` when calling the `from_hparams` method.
config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "speechbrain_interface": "Tacotron2",
3
+ "vocoder_interface": "HiFIGAN",
4
+ "vocoder_model_id": "speechbrain/tts-hifigan-ljspeech"
5
+ }
gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.npy filter=lfs diff=lfs merge=lfs -text
14
+ *.npz filter=lfs diff=lfs merge=lfs -text
15
+ *.onnx filter=lfs diff=lfs merge=lfs -text
16
+ *.ot filter=lfs diff=lfs merge=lfs -text
17
+ *.parquet filter=lfs diff=lfs merge=lfs -text
18
+ *.pb filter=lfs diff=lfs merge=lfs -text
19
+ *.pickle filter=lfs diff=lfs merge=lfs -text
20
+ *.pkl filter=lfs diff=lfs merge=lfs -text
21
+ *.pt filter=lfs diff=lfs merge=lfs -text
22
+ *.pth filter=lfs diff=lfs merge=lfs -text
23
+ *.rar filter=lfs diff=lfs merge=lfs -text
24
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
25
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
26
+ *.tflite filter=lfs diff=lfs merge=lfs -text
27
+ *.tgz filter=lfs diff=lfs merge=lfs -text
28
+ *.wasm filter=lfs diff=lfs merge=lfs -text
29
+ *.xz filter=lfs diff=lfs merge=lfs -text
30
+ *.zip filter=lfs diff=lfs merge=lfs -text
31
+ *.zst filter=lfs diff=lfs merge=lfs -text
32
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
33
+ model.ckpt filter=lfs diff=lfs merge=lfs -text
34
+ optimizer.ckpt filter=lfs diff=lfs merge=lfs -text
hyperparams.yaml ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ mask_padding: True
2
+ n_mel_channels: 80
3
+ n_symbols: 148
4
+ symbols_embedding_dim: 512
5
+ encoder_kernel_size: 5
6
+ encoder_n_convolutions: 3
7
+ encoder_embedding_dim: 512
8
+ attention_rnn_dim: 1024
9
+ attention_dim: 128
10
+ attention_location_n_filters: 32
11
+ attention_location_kernel_size: 31
12
+ n_frames_per_step: 1
13
+ decoder_rnn_dim: 1024
14
+ prenet_dim: 256
15
+ max_decoder_steps: 1000
16
+ gate_threshold: 0.5
17
+ p_attention_dropout: 0.1
18
+ p_decoder_dropout: 0.1
19
+ postnet_embedding_dim: 512
20
+ postnet_kernel_size: 5
21
+ postnet_n_convolutions: 5
22
+ decoder_no_early_stopping: False
23
+ sample_rate: 22050
24
+
25
+ # Model
26
+ model: !new:speechbrain.lobes.models.Tacotron2.Tacotron2
27
+ mask_padding: !ref <mask_padding>
28
+ n_mel_channels: !ref <n_mel_channels>
29
+ # symbols
30
+ n_symbols: !ref <n_symbols>
31
+ symbols_embedding_dim: !ref <symbols_embedding_dim>
32
+ # encoder
33
+ encoder_kernel_size: !ref <encoder_kernel_size>
34
+ encoder_n_convolutions: !ref <encoder_n_convolutions>
35
+ encoder_embedding_dim: !ref <encoder_embedding_dim>
36
+ # attention
37
+ attention_rnn_dim: !ref <attention_rnn_dim>
38
+ attention_dim: !ref <attention_dim>
39
+ # attention location
40
+ attention_location_n_filters: !ref <attention_location_n_filters>
41
+ attention_location_kernel_size: !ref <attention_location_kernel_size>
42
+ # decoder
43
+ n_frames_per_step: !ref <n_frames_per_step>
44
+ decoder_rnn_dim: !ref <decoder_rnn_dim>
45
+ prenet_dim: !ref <prenet_dim>
46
+ max_decoder_steps: !ref <max_decoder_steps>
47
+ gate_threshold: !ref <gate_threshold>
48
+ p_attention_dropout: !ref <p_attention_dropout>
49
+ p_decoder_dropout: !ref <p_decoder_dropout>
50
+ # postnet
51
+ postnet_embedding_dim: !ref <postnet_embedding_dim>
52
+ postnet_kernel_size: !ref <postnet_kernel_size>
53
+ postnet_n_convolutions: !ref <postnet_n_convolutions>
54
+ decoder_no_early_stopping: !ref <decoder_no_early_stopping>
55
+
56
+ # Function that converts the text into a sequence of valid characters.
57
+ text_to_sequence: !name:speechbrain.utils.text_to_sequence.text_to_sequence
58
+
59
+ modules:
60
+ model: !ref <model>
61
+
62
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
63
+ loadables:
64
+ model: !ref <model>
model.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d974eb14aed03438e608ed80f7c9418b333a2d17c4f02f510ed9e4d74c75f214
3
+ size 112830206