divakaivan committed
Commit 6742dfa
1 Parent(s): 4f76169

Update app.py

Files changed (1):
  app.py  +3 -21
app.py CHANGED
@@ -8,18 +8,7 @@ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5Hif
 #.
 checkpoint = "microsoft/speecht5_tts"
 processor = SpeechT5Processor.from_pretrained(checkpoint)
-model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-
-
-speaker_embeddings = {
-    "BDL": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
-    "CLB": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
-    "KSP": "spkemb/cmu_us_ksp_arctic-wav-arctic_b0087.npy",
-    "RMS": "spkemb/cmu_us_rms_arctic-wav-arctic_b0353.npy",
-    "SLT": "spkemb/cmu_us_slt_arctic-wav-arctic_a0508.npy",
-}
-
+model = SpeechT5ForTextToSpeech.from_pretrained("divakaivan/glaswegian_tts")
 
 from datasets import load_dataset, Audio
 
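After this hunk, app.py keeps loading the processor from the base checkpoint but takes the model weights from the fine-tuned divakaivan/glaswegian_tts repo, and the hard-coded CMU ARCTIC speaker-embedding table is gone. A minimal sketch of the resulting loading step (assuming the HiFi-GAN vocoder, whose from_pretrained call this hunk removes, is still created elsewhere in app.py, since predict() below still calls it):

    from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

    checkpoint = "microsoft/speecht5_tts"
    # Tokenizer/feature extractor still come from the base SpeechT5 checkpoint.
    processor = SpeechT5Processor.from_pretrained(checkpoint)
    # Acoustic model weights come from the fine-tuned Glaswegian TTS repo.
    model = SpeechT5ForTextToSpeech.from_pretrained("divakaivan/glaswegian_tts")
    # Spectrogram-to-waveform vocoder (assumed still loaded somewhere in app.py).
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")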
@@ -137,18 +126,11 @@ def predict(text, speaker):
 
     ### ### ###
     example = dataset['test'][11]
-    speaker_embedding = torch.tensor(example["speaker_embeddings"]).unsqueeze(0).to(device)
-
-    # Ensure the speaker_embedding has the correct dimensions
-    if speaker_embedding.dim() == 2:
-        speaker_embedding = speaker_embedding.unsqueeze(1).expand(-1, inputs["input_ids"].size(1), -1)
-    elif speaker_embedding.dim() == 3:
-        speaker_embedding = speaker_embedding.expand(-1, inputs["input_ids"].size(1), -1)
+    speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
 
-    spectrogram = model.generate_speech(inputs["input_ids"].to(device), speaker_embedding)
+    spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
     with torch.no_grad():
         speech = vocoder(spectrogram)
-    # speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)
 
     speech = (speech.numpy() * 32767).astype(np.int16)
     return (16000, speech)
 
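Taken together, the two hunks reduce synthesis inside predict() to: tokenize, pick one precomputed x-vector, generate a spectrogram, vocode. A self-contained sketch of that path (assumptions: processor, model, vocoder, and dataset are module-level globals as above, each dataset example carries a 512-dim "speaker_embeddings" vector, and synthesize() is a hypothetical stand-in for the relevant part of predict(text, speaker)):

    import numpy as np
    import torch

    def synthesize(text):
        # Tokenize the input text with the base processor.
        inputs = processor(text=text, return_tensors="pt")

        # SpeechT5 expects a (batch, 512) speaker x-vector; take one from a held-out example.
        example = dataset["test"][11]
        speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)

        # Text -> log-mel spectrogram with the fine-tuned model.
        spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)

        # Spectrogram -> waveform with HiFi-GAN.
        with torch.no_grad():
            speech = vocoder(spectrogram)

        # Float waveform in [-1, 1] -> 16-bit PCM; SpeechT5 outputs 16 kHz audio.
        speech = (speech.numpy() * 32767).astype(np.int16)
        return (16000, speech)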