ercaronte committed
Commit bd9c34f
1 Parent(s): 65ecf8c

Update app.py

Files changed (1)
  1. app.py +34 -25
app.py CHANGED
@@ -2,64 +2,73 @@ import gradio as gr
 import numpy as np
 import torch
 from datasets import load_dataset
-
 from transformers import pipeline
-from transformers import VitsModel, VitsTokenizer
-from transformers import SpeechT5Processor, SpeechT5Processor, SpeechT5HifiGan
-
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
-
 # load speech translation checkpoint
 asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
 
+
 def translate(audio):
     outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
     return outputs["text"]
 
+'''
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+
+# load text-to-speech checkpoint and speaker embeddings
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+
+model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+
+def synthesise_old(text):
+    inputs = processor(text=text, return_tensors="pt")
+    speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
+    return speech.cpu()
+
+
+def speech_to_speech_translation_old(audio):
+    translated_text = translate(audio)
+    synthesised_speech = synthesise_old(translated_text)
+    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
+    return 16000, synthesised_speech
+'''
+
+from transformers import VitsModel, VitsTokenizer
+
 
 # load translator to french
 en_fr_translator = pipeline("translation_en_to_fr")
 
-
 # load text-to-speech
 model_new = VitsModel.from_pretrained("facebook/mms-tts-fra")
 tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-fra")
 
-def synthesise_new(text):
+
+def synthesise(text):
     translation_to_french = en_fr_translator(text)
     french_text = translation_to_french[0]['translation_text']
 
     inputs = tokenizer(french_text, return_tensors="pt")
     input_ids = inputs["input_ids"]
-
+
     with torch.no_grad():
         outputs = model_new(input_ids)
-
+
     speech = outputs["waveform"]
     return speech
 
 
-# load text-to-speech checkpoint and speaker embeddings
-processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-
-model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
-
-embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-
-def synthesise(text):
-    inputs = processor(text=text, return_tensors="pt")
-    speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
-    return speech.cpu()
-
-
 def speech_to_speech_translation(audio):
     translated_text = translate(audio)
     synthesised_speech = synthesise(translated_text)
-    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
+    synthesised_speech = (synthesised_speech[0].numpy() * 32767).astype(np.int16)
     return 16000, synthesised_speech
 
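
The substantive fix in this commit is the added `[0]` in `speech_to_speech_translation`: `VitsModel` returns its waveform with a leading batch dimension, so the previous code handed Gradio a 2-D `(1, num_samples)` array. A minimal standalone sketch of the shape handling, using the same checkpoint the app loads (the sample sentence is illustrative only):

import numpy as np
import torch
from transformers import VitsModel, VitsTokenizer

model_new = VitsModel.from_pretrained("facebook/mms-tts-fra")
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-fra")

inputs = tokenizer("Bonjour tout le monde", return_tensors="pt")
with torch.no_grad():
    waveform = model_new(inputs["input_ids"]).waveform  # shape: (1, num_samples)

# Gradio's Audio component expects (sample_rate, 1-D int16 array),
# so the batch dimension must be indexed away before conversion.
audio = (waveform[0].numpy() * 32767).astype(np.int16)  # shape: (num_samples,)

# The hard-coded 16000 return value matches the checkpoint's own rate:
assert model_new.config.sampling_rate == 16000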
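
A note on why the app translates twice: Whisper's "translate" task only translates into English, so `translate` always yields English text, and the separate `translation_en_to_fr` pipeline bridges to French for the French-only MMS voice. (No model is named for that pipeline, so it falls back to the library default, `t5-base` at the time of writing.) A sketch of the text path, with `sample.wav` standing in for a real recording:

english_text = translate("sample.wav")  # hypothetical file; Whisper outputs English
french_text = en_fr_translator(english_text)[0]["translation_text"]
print(english_text, "->", french_text)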
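
Finally, the `import gradio as gr` visible in the hunk header implies the demo is wired up below this hunk, outside the diff. A hypothetical minimal interface that would exercise the updated function (the actual app's wiring may differ):

import gradio as gr

demo = gr.Interface(
    fn=speech_to_speech_translation,   # returns (16000, int16 waveform)
    inputs=gr.Audio(type="filepath"),  # the Whisper pipeline accepts a file path
    outputs=gr.Audio(),
    title="Speech-to-speech translation to French",
)
demo.launch()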