emirhanbilgic committed on
Commit
05020c4
1 Parent(s): ff597d6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -9
app.py CHANGED
@@ -1,8 +1,10 @@
 
1
  import gradio as gr
2
  import torch
3
  from datasets import load_dataset
4
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
5
  import soundfile as sf
 
6
  import spaces
7
 
8
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -13,15 +15,28 @@ def load_models_and_data():
13
  model = SpeechT5ForTextToSpeech.from_pretrained("emirhanbilgic/speecht5_finetuned_emirhan_tr").to(device)
14
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
15
 
16
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
17
- speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
 
 
 
18
 
19
- return model, processor, vocoder, speaker_embeddings
20
 
21
- model, processor, vocoder, speaker_embeddings = load_models_and_data()
 
 
 
 
 
 
 
22
 
23
  @spaces.GPU(duration = 60)
24
- def text_to_speech(text):
 
 
 
25
  inputs = processor(text=text, return_tensors="pt").to(device)
26
  speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
27
  sf.write("output.wav", speech.cpu().numpy(), samplerate=16000)
@@ -29,10 +44,13 @@ def text_to_speech(text):
29
 
30
  iface = gr.Interface(
31
  fn=text_to_speech,
32
- inputs=gr.Textbox(label="Enter Turkish text to convert to speech"),
 
 
 
33
  outputs=gr.Audio(label="Generated Speech"),
34
- title="Turkish SpeechT5 Text-to-Speech Demo",
35
- description="Enter Turkish text and listen to the generated speech using the fine-tuned SpeechT5 model."
36
  )
37
 
38
- iface.launch()
 
1
+ import os
2
  import gradio as gr
3
  import torch
4
  from datasets import load_dataset
5
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
6
  import soundfile as sf
7
+ from speechbrain.pretrained import EncoderClassifier
8
  import spaces
9
 
10
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
15
  model = SpeechT5ForTextToSpeech.from_pretrained("emirhanbilgic/speecht5_finetuned_emirhan_tr").to(device)
16
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
17
 
18
+ speaker_model = EncoderClassifier.from_hparams(
19
+ source="speechbrain/spkrec-xvect-voxceleb",
20
+ run_opts={"device": device},
21
+ savedir=os.path.join("/tmp", "speechbrain/spkrec-xvect-voxceleb"),
22
+ )
23
 
24
+ return model, processor, vocoder, speaker_model
25
 
26
+ model, processor, vocoder, speaker_model = load_models_and_data()
27
+
def create_speaker_embedding(waveform):
    """Compute an L2-normalized speaker embedding for a reference recording.

    Args:
        waveform: Either raw mono audio samples (array-like or tensor), or the
            ``(sample_rate, samples)`` tuple that ``gr.Audio(type="numpy")``
            hands to callbacks. In the tuple form, ``samples`` may be integer
            PCM and may be shaped ``(samples,)`` or ``(samples, channels)``.

    Returns:
        numpy.ndarray: The embedding returned by the module-level
        ``speaker_model``, normalized along its last axis and squeezed.

    Raises:
        ValueError: If no audio was supplied or the recording has no samples.
    """
    import math

    # NOTE(review): the speechbrain x-vector encoder is documented as trained
    # on 16 kHz audio, and this app also writes its output at 16 kHz - confirm.
    target_rate = 16000

    if waveform is None:
        raise ValueError("No speaker audio was provided.")

    # Gradio delivers (sample_rate, samples); passing that tuple straight to
    # torch.tensor() fails, so split it apart first.
    sample_rate = None
    if (
        isinstance(waveform, tuple)
        and len(waveform) == 2
        and hasattr(waveform[1], "__len__")
    ):
        sample_rate, waveform = waveform

    if isinstance(waveform, torch.Tensor):
        samples = waveform.detach().cpu()
    else:
        samples = torch.tensor(waveform)

    if samples.numel() == 0:
        raise ValueError("Speaker audio is empty.")

    if samples.is_floating_point():
        samples = samples.to(torch.float32)
    else:
        # Integer PCM (e.g. int16) -> float32 in [-1, 1).
        full_scale = float(torch.iinfo(samples.dtype).max) + 1.0
        samples = samples.to(torch.float32) / full_scale

    if sample_rate is not None:
        # Only the Gradio tuple form has a known layout, so channel downmixing
        # and resampling are limited to it; raw inputs are passed through as-is.
        if samples.dim() == 2:
            samples = samples.mean(dim=1)

        source_rate = int(sample_rate)
        if source_rate != target_rate:
            from scipy.signal import resample_poly

            divisor = math.gcd(source_rate, target_rate)
            resampled = resample_poly(
                samples.numpy(),
                target_rate // divisor,
                source_rate // divisor,
            )
            samples = torch.tensor(resampled, dtype=torch.float32)

    with torch.no_grad():
        embedding = speaker_model.encode_batch(samples.unsqueeze(0))
        embedding = torch.nn.functional.normalize(embedding, dim=2)

    return embedding.squeeze().cpu().numpy()
34
 
35
  @spaces.GPU(duration = 60)
36
+ def text_to_speech(text, waveform):
37
+ speaker_embeddings = create_speaker_embedding(waveform)
38
+ speaker_embeddings = torch.tensor(speaker_embeddings).unsqueeze(0).to(device)
39
+
40
  inputs = processor(text=text, return_tensors="pt").to(device)
41
  speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
42
  sf.write("output.wav", speech.cpu().numpy(), samplerate=16000)
 
44
 
# Gradio UI: one text box for the sentence to synthesize and one audio upload
# that supplies the reference speaker; the callback's result is shown in an
# audio output component.
iface = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Enter Turkish text to convert to speech"),
        # Gradio 4 replaced the `source="upload"` keyword with
        # `sources=[...]`; the old spelling raises TypeError there.
        # NOTE(review): assumes the Space runs Gradio 4+ - confirm.
        gr.Audio(sources=["upload"], type="numpy", label="Upload Speaker Audio"),
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="Turkish SpeechT5 Text-to-Speech Demo with Custom Speaker Embeddings",
    description="Enter Turkish text and upload an audio file to generate speech using the fine-tuned SpeechT5 model with custom speaker embeddings.",
)

iface.launch()