poemsforaphrodite committed on
Commit
396e7de
1 Parent(s): 5dbaa00

Upload openvoice_app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. openvoice_app.py +39 -113
openvoice_app.py CHANGED
@@ -18,7 +18,7 @@ device = 'cuda' if torch.cuda.is_available() else 'cpu'
18
  output_dir = 'outputs'
19
  os.makedirs(output_dir, exist_ok=True)
20
 
21
- # load models
22
  en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
23
  en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
24
  zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)
@@ -26,152 +26,80 @@ zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')
26
  tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
27
  tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
28
 
29
- # load speaker embeddings
30
  en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)
31
  en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)
32
  zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth').to(device)
33
 
34
- # This online demo mainly supports English and Chinese
35
  supported_languages = ['zh', 'en']
36
 
37
  def predict(prompt, style, audio_file_pth):
38
- # initialize an empty info
39
  text_hint = ''
40
 
41
- # set agree to True by default
42
- agree = True
43
-
44
- # first detect the input language
45
  language_predicted = langid.classify(prompt)[0].strip()
46
  print(f"Detected language: {language_predicted}")
47
 
48
  if language_predicted not in supported_languages:
49
- text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
50
- gr.Warning(
51
- f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}"
52
- )
53
-
54
- return (
55
- text_hint,
56
- None,
57
- None,
58
- )
59
-
60
  if language_predicted == "zh":
61
  tts_model = zh_base_speaker_tts
62
  source_se = zh_source_se
63
  language = 'Chinese'
64
- if style not in ['default']:
65
- text_hint += f"[ERROR] The style {style} is not supported for Chinese, which should be in ['default']\n"
66
- gr.Warning(f"The style {style} is not supported for Chinese, which should be in ['default']")
67
- return (
68
- text_hint,
69
- None,
70
- None,
71
- )
72
-
73
  else:
74
  tts_model = en_base_speaker_tts
75
- if style == 'default':
76
- source_se = en_source_default_se
77
- else:
78
- source_se = en_source_style_se
79
  language = 'English'
80
  if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']:
81
- text_hint += f"[ERROR] The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']\n"
82
- gr.Warning(f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']")
83
- return (
84
- text_hint,
85
- None,
86
- None,
87
- )
88
-
89
- speaker_wav = audio_file_pth
90
 
91
  if len(prompt) < 2:
92
- text_hint += f"[ERROR] Please give a longer prompt text \n"
93
- gr.Warning("Please give a longer prompt text")
94
- return (
95
- text_hint,
96
- None,
97
- None,
98
- )
99
  if len(prompt) > 200:
100
- text_hint += f"[ERROR] Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo and try for your usage \n"
101
- gr.Warning(
102
- "Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo for your usage"
103
- )
104
- return (
105
- text_hint,
106
- None,
107
- None,
108
- )
109
-
110
- # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
111
  try:
112
- target_se, audio_name = se_extractor.get_se(speaker_wav, tone_color_converter, target_dir='processed', vad=True)
113
- # base_speaker = f"{output_dir}/openai_source_output.mp3"
114
- # source_se, audio_name = se_extractor.get_se(base_speaker, tone_color_converter, vad=True)
115
  except Exception as e:
116
- text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
117
- gr.Warning(
118
- "[ERROR] Get target tone color error {str(e)} \n"
119
- )
120
- return (
121
- text_hint,
122
- None,
123
- None,
124
- )
125
 
126
  src_path = f'{output_dir}/tmp.wav'
127
  tts_model.tts(prompt, src_path, speaker=style, language=language)
128
 
129
  save_path = f'{output_dir}/output.wav'
130
- # Run the tone color converter
131
  encode_message = "@MyShell"
132
- tone_color_converter.convert(
133
- audio_src_path=src_path,
134
- src_se=source_se,
135
- tgt_se=target_se,
136
- output_path=save_path,
137
- message=encode_message)
138
-
139
- text_hint += f'''Get response successfully \n'''
140
-
141
- return (
142
- text_hint,
143
- save_path,
144
- speaker_wav,
145
- )
146
 
 
 
147
 
148
  title = "MyShell OpenVoice"
149
 
150
- examples = [
151
- [
152
- "今天天气真好,我们一起出去吃饭吧。",
153
- 'default',
154
- "resources/demo_speaker1.mp3",
155
- ],[
156
- "This audio is generated by open voice with a half-performance model.",
157
- 'whispering',
158
- "resources/demo_speaker2.mp3",
159
- ],
160
- [
161
- "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
162
- 'sad',
163
- "resources/demo_speaker0.mp3",
164
- ],
165
- ]
166
-
167
- # Define custom CSS to set the background color to white
168
- custom_css = """
169
- body {
170
- background-color: white !important;
171
- }
172
- """
173
-
174
- with gr.Blocks(css=custom_css, analytics_enabled=False) as demo:
175
  with gr.Row():
176
  with gr.Column():
177
  input_text_gr = gr.Textbox(
@@ -191,10 +119,8 @@ with gr.Blocks(css=custom_css, analytics_enabled=False) as demo:
191
  type="filepath",
192
  value="resources/demo_speaker2.mp3",
193
  )
194
-
195
  tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
196
 
197
-
198
  with gr.Column():
199
  out_text_gr = gr.Text(label="Info")
200
  audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
 
18
  output_dir = 'outputs'
19
  os.makedirs(output_dir, exist_ok=True)
20
 
21
+ # Load models
22
  en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
23
  en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
24
  zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)
 
26
  tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
27
  tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
28
 
29
+ # Load speaker embeddings
30
  en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)
31
  en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)
32
  zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth').to(device)
33
 
34
+ # Supported languages
35
  supported_languages = ['zh', 'en']
36
 
37
  def predict(prompt, style, audio_file_pth):
 
38
  text_hint = ''
39
 
40
+ # Detect the input language
 
 
 
41
  language_predicted = langid.classify(prompt)[0].strip()
42
  print(f"Detected language: {language_predicted}")
43
 
44
  if language_predicted not in supported_languages:
45
+ text_hint += f"[ERROR] The detected language {language_predicted} is not supported. Supported languages: {supported_languages}\n"
46
+ gr.Warning(f"The detected language {language_predicted} is not supported. Supported languages: {supported_languages}")
47
+ return text_hint, None, None
48
+
 
 
 
 
 
 
 
49
  if language_predicted == "zh":
50
  tts_model = zh_base_speaker_tts
51
  source_se = zh_source_se
52
  language = 'Chinese'
53
+ if style != 'default':
54
+ text_hint += f"[ERROR] The style {style} is not supported for Chinese. Supported style: 'default'\n"
55
+ gr.Warning(f"The style {style} is not supported for Chinese. Supported style: 'default'")
56
+ return text_hint, None, None
 
 
 
 
 
57
  else:
58
  tts_model = en_base_speaker_tts
59
+ source_se = en_source_default_se if style == 'default' else en_source_style_se
 
 
 
60
  language = 'English'
61
  if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']:
62
+ text_hint += f"[ERROR] The style {style} is not supported for English. Supported styles: ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']\n"
63
+ gr.Warning(f"The style {style} is not supported for English. Supported styles: ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']")
64
+ return text_hint, None, None
 
 
 
 
 
 
65
 
66
  if len(prompt) < 2:
67
+ text_hint += "[ERROR] Please provide a longer prompt text.\n"
68
+ gr.Warning("Please provide a longer prompt text.")
69
+ return text_hint, None, None
 
 
 
 
70
  if len(prompt) > 200:
71
+ text_hint += "[ERROR] Text length limited to 200 characters. Please try shorter text.\n"
72
+ gr.Warning("Text length limited to 200 characters. Please try shorter text.")
73
+ return text_hint, None, None
74
+
 
 
 
 
 
 
 
75
  try:
76
+ target_se, audio_name = se_extractor.get_se(audio_file_pth, tone_color_converter, target_dir='processed', vad=True)
 
 
77
  except Exception as e:
78
+ text_hint += f"[ERROR] Error extracting tone color: {str(e)}\n"
79
+ gr.Warning(f"[ERROR] Error extracting tone color: {str(e)}")
80
+ return text_hint, None, None
 
 
 
 
 
 
81
 
82
  src_path = f'{output_dir}/tmp.wav'
83
  tts_model.tts(prompt, src_path, speaker=style, language=language)
84
 
85
  save_path = f'{output_dir}/output.wav'
 
86
  encode_message = "@MyShell"
87
+ tone_color_converter.convert(audio_src_path=src_path, src_se=source_se, tgt_se=target_se, output_path=save_path, message=encode_message)
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
+ text_hint += "Response generated successfully.\n"
90
+ return text_hint, save_path, audio_file_pth
91
 
92
  title = "MyShell OpenVoice"
93
 
94
class WhiteTheme(gr.themes.Base):
    """Gradio theme that renders the demo on a white page with black text."""

    def __init__(self):
        super().__init__()
        # Theme values have to be assigned to Gradio's own theme variables,
        # which is what ``set()`` does. Ad-hoc attributes such as
        # ``body_background`` or ``text_color`` are not variables the Gradio
        # stylesheet reads, so assigning them leaves the page unchanged.
        # The ``*_dark`` variants are set too so the page stays white when the
        # browser requests dark mode.
        self.set(
            body_background_fill="white",
            body_background_fill_dark="white",
            block_background_fill="white",
            block_background_fill_dark="white",
            body_text_color="black",
            body_text_color_dark="black",
            body_text_color_subdued="black",
            body_text_color_subdued_dark="black",
        )
101
+
102
+ with gr.Blocks(theme=WhiteTheme()) as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  with gr.Row():
104
  with gr.Column():
105
  input_text_gr = gr.Textbox(
 
119
  type="filepath",
120
  value="resources/demo_speaker2.mp3",
121
  )
 
122
  tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
123
 
 
124
  with gr.Column():
125
  out_text_gr = gr.Text(label="Info")
126
  audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)