divakaivan committed on
Commit
8bc0d0f
1 Parent(s): 5f151ed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -10
app.py CHANGED
@@ -6,12 +6,12 @@ import torch
6
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
7
 
8
 
9
- checkpoint = "divakaivan/glaswegian_tts"
10
  processor = SpeechT5Processor.from_pretrained(checkpoint)
11
  model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
12
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
13
 
14
- # .
15
  speaker_embeddings = {
16
  "BDL": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
17
  "CLB": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
@@ -62,14 +62,10 @@ title = "SpeechT5: Speech Synthesis"
62
  description = """
63
  The <b>SpeechT5</b> model is pre-trained on text as well as speech inputs, with targets that are also a mix of text and speech.
64
  By pre-training on text and speech at the same time, it learns unified representations for both, resulting in improved modeling capabilities.
65
-
66
  SpeechT5 can be fine-tuned for different speech tasks. This space demonstrates the <b>text-to-speech</b> (TTS) checkpoint for the English language.
67
-
68
  See also the <a href="https://huggingface.co/spaces/Matthijs/speecht5-asr-demo">speech recognition (ASR) demo</a>
69
  and the <a href="https://huggingface.co/spaces/Matthijs/speecht5-vc-demo">voice conversion demo</a>.
70
-
71
  Refer to <a href="https://colab.research.google.com/drive/1i7I5pzBcU3WDFarDnzweIj4-sVVoIUFJ">this Colab notebook</a> to learn how to fine-tune the SpeechT5 TTS model on your own dataset or language.
72
-
73
  <b>How to use:</b> Enter some English text and choose a speaker. The output is a mel spectrogram, which is converted to a mono 16 kHz waveform by the
74
  HiFi-GAN vocoder. Because the model always applies random dropout, each attempt will give slightly different results.
75
  The <em>Surprise Me!</em> option creates a completely randomized speaker.
@@ -77,11 +73,9 @@ The <em>Surprise Me!</em> option creates a completely randomized speaker.
77
 
78
  article = """
79
  <div style='margin:20px auto;'>
80
-
81
  <p>References: <a href="https://arxiv.org/abs/2110.07205">SpeechT5 paper</a> |
82
  <a href="https://github.com/microsoft/SpeechT5/">original GitHub</a> |
83
  <a href="https://huggingface.co/mechanicalsea/speecht5-tts">original weights</a></p>
84
-
85
  <pre>
86
  @article{Ao2021SpeechT5,
87
  title = {SpeechT5: Unified-Modal Encoder-Decoder Pre-training for Spoken Language Processing},
@@ -92,9 +86,7 @@ article = """
92
  year={2021}
93
  }
94
  </pre>
95
-
96
  <p>Speaker embeddings were generated from <a href="http://www.festvox.org/cmu_arctic/">CMU ARCTIC</a> using <a href="https://huggingface.co/mechanicalsea/speecht5-vc/blob/main/manifest/utils/prep_cmu_arctic_spkemb.py">this script</a>.</p>
97
-
98
  </div>
99
  """
100
 
@@ -111,6 +103,15 @@ gr.Interface(
111
  fn=predict,
112
  inputs=[
113
  gr.Text(label="Input Text"),
 
 
 
 
 
 
 
 
 
114
  ],
115
  outputs=[
116
  gr.Audio(label="Generated Speech", type="numpy"),
 
6
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
7
 
8
 
9
+ checkpoint = "microsoft/speecht5_tts"
10
  processor = SpeechT5Processor.from_pretrained(checkpoint)
11
  model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
12
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
13
 
14
+
15
  speaker_embeddings = {
16
  "BDL": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
17
  "CLB": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
 
62
  description = """
63
  The <b>SpeechT5</b> model is pre-trained on text as well as speech inputs, with targets that are also a mix of text and speech.
64
  By pre-training on text and speech at the same time, it learns unified representations for both, resulting in improved modeling capabilities.
 
65
  SpeechT5 can be fine-tuned for different speech tasks. This space demonstrates the <b>text-to-speech</b> (TTS) checkpoint for the English language.
 
66
  See also the <a href="https://huggingface.co/spaces/Matthijs/speecht5-asr-demo">speech recognition (ASR) demo</a>
67
  and the <a href="https://huggingface.co/spaces/Matthijs/speecht5-vc-demo">voice conversion demo</a>.
 
68
  Refer to <a href="https://colab.research.google.com/drive/1i7I5pzBcU3WDFarDnzweIj4-sVVoIUFJ">this Colab notebook</a> to learn how to fine-tune the SpeechT5 TTS model on your own dataset or language.
 
69
  <b>How to use:</b> Enter some English text and choose a speaker. The output is a mel spectrogram, which is converted to a mono 16 kHz waveform by the
70
  HiFi-GAN vocoder. Because the model always applies random dropout, each attempt will give slightly different results.
71
  The <em>Surprise Me!</em> option creates a completely randomized speaker.
 
73
 
74
  article = """
75
  <div style='margin:20px auto;'>
 
76
  <p>References: <a href="https://arxiv.org/abs/2110.07205">SpeechT5 paper</a> |
77
  <a href="https://github.com/microsoft/SpeechT5/">original GitHub</a> |
78
  <a href="https://huggingface.co/mechanicalsea/speecht5-tts">original weights</a></p>
 
79
  <pre>
80
  @article{Ao2021SpeechT5,
81
  title = {SpeechT5: Unified-Modal Encoder-Decoder Pre-training for Spoken Language Processing},
 
86
  year={2021}
87
  }
88
  </pre>
 
89
  <p>Speaker embeddings were generated from <a href="http://www.festvox.org/cmu_arctic/">CMU ARCTIC</a> using <a href="https://huggingface.co/mechanicalsea/speecht5-vc/blob/main/manifest/utils/prep_cmu_arctic_spkemb.py">this script</a>.</p>
 
90
  </div>
91
  """
92
 
 
103
  fn=predict,
104
  inputs=[
105
  gr.Text(label="Input Text"),
106
+ gr.Radio(label="Speaker", choices=[
107
+ "BDL (male)",
108
+ "CLB (female)",
109
+ "KSP (male)",
110
+ "RMS (male)",
111
+ "SLT (female)",
112
+ "Surprise Me!"
113
+ ],
114
+ value="BDL (male)"),
115
  ],
116
  outputs=[
117
  gr.Audio(label="Generated Speech", type="numpy"),