LPhilp1943 committed on
Commit
d8238c0
1 Parent(s): c95f875

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -63
app.py CHANGED
@@ -1,72 +1,54 @@
1
  import os
2
  import sys
3
- import subprocess
4
  import gradio as gr
5
- import torch
6
- import soundfile as sf
7
- from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
8
- import librosa
9
  from TTS.api import TTS
10
- from TTS.utils.manage import ModelManager
11
 
12
- def install_sentencepiece():
13
- try:
14
- # Attempting to install sentencepiece via pip
15
- subprocess.check_call([sys.executable, "-m", "pip", "install", "sentencepiece"])
16
- except subprocess.CalledProcessError:
17
- # Attempt to install sentencepiece via system package manager if pip install fails
18
- if os.name == "posix":
19
- os.system("sudo apt-get install -y libprotobuf10 protobuf-compiler libprotobuf-dev")
20
- os.system("sudo apt-get install -y libsentencepiece-dev")
21
- else:
22
- raise OSError("Automatic installation of SentencePiece is not supported on this OS")
23
-
24
- # Call the function to attempt installing SentencePiece
25
- install_sentencepiece()
26
-
27
- # Agreeing to Coqui TTS terms of service and setting up environment variables
28
  os.environ["COQUI_TOS_AGREED"] = "1"
29
- os.makedirs("output_audio", exist_ok=True)
30
 
31
- # Initialize ASR model
32
- asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
33
- asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
34
- asr_model.eval()
35
-
36
- # Dynamically list and select TTS model
37
- tts_manager = ModelManager()
38
  model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
39
  tts = TTS(model_name, gpu=False)
40
-
41
- def resample_audio(input_audio_path, target_sr=16000):
42
- waveform, sr = sf.read(input_audio_path)
43
- if sr != target_sr:
44
- waveform = librosa.resample(waveform, orig_sr=sr, target_sr=target_sr)
45
- return waveform
46
-
47
- def speech_to_text(input_audio_path):
48
- waveform = resample_audio(input_audio_path)
49
- input_values = asr_processor(waveform, return_tensors="pt").input_values
50
- with torch.no_grad():
51
- logits = asr_model(input_values).logits
52
- predicted_ids = torch.argmax(logits, dim=-1)
53
- transcription = asr_processor.batch_decode(predicted_ids)[0]
54
- return transcription.strip()
55
-
56
- def text_to_speech(text, speaker_wav_path, output_path="output_audio/output.wav"):
57
- if not text.strip():
58
- return "Empty text input."
59
- tts.tts_to_file(text=text, file_path=output_path, speaker_wav=speaker_wav_path)
60
- return output_path
61
-
62
- def speech_to_speech(input_audio, text_input=None):
63
- speaker_wav_path = input_audio
64
- if text_input is None:
65
- text_input = speech_to_text(input_audio)
66
- return text_to_speech(text_input, speaker_wav_path)
67
-
68
- iface = gr.Interface(fn=speech_to_speech,
69
- inputs=[gr.Audio(type="filepath"), gr.Textbox(optional=True)],
70
- outputs=gr.Audio())
71
- iface.launch()
72
-
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import sys
3
+ from fastapi import Request
4
  import gradio as gr
 
 
 
 
5
  from TTS.api import TTS
 
6
 
7
+ # Agree to Coqui TTS terms of service
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  os.environ["COQUI_TOS_AGREED"] = "1"
 
9
 
10
+ # Initialize TTS with the desired model
 
 
 
 
 
 
11
  model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
12
  tts = TTS(model_name, gpu=False)
13
+ tts.to("cpu") # Use CPU for inference
14
+
15
+ def predict(prompt, language, audio_file_path, use_mic, agree):
16
+ if not agree:
17
+ return "You must agree to the Terms & Condition!", None
18
+
19
+ if use_mic and not audio_file_path:
20
+ return "Please provide a microphone recording or disable the 'Use Microphone' option.", None
21
+
22
+ if len(prompt) < 2 or len(prompt) > 50000:
23
+ return "Prompt text length must be between 2 and 50000 characters.", None
24
+
25
+ speaker_wav = audio_file_path
26
+ output_path = "output.wav"
27
+
28
+ try:
29
+ tts.tts_to_file(text=prompt, file_path=output_path, speaker_wav=speaker_wav, language=language)
30
+ except Exception as e:
31
+ print(f"Error during TTS generation: {e}", file=sys.stderr)
32
+ return "An error occurred during TTS generation.", None
33
+
34
+ return gr.Audio(file_path=output_path), output_path
35
+
36
+ iface = gr.Interface(
37
+ fn=predict,
38
+ inputs=[
39
+ gr.Textbox(label="Text Prompt"),
40
+ gr.Dropdown(label="Language", choices=["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn"], value="en"),
41
+ gr.Audio(label="Reference Audio", type="filepath"),
42
+ gr.Checkbox(label="Use Microphone as Reference", value=False),
43
+ gr.Checkbox(label="Agree to Terms & Conditions", value=True),
44
+ ],
45
+ outputs=[gr.Audio(label="Synthesised Audio"), "text"],
46
+ title="XTTS Text-to-Speech",
47
+ description="A web interface for Coqui's TTS model to generate speech from text.",
48
+ examples=[
49
+ # Example inputs
50
+ ["Hello, World !", "en", "path/to/example_audio.wav", False, True],
51
+ ]
52
+ )
53
+
54
+ iface.launch(debug=True)