Spaces:
Running
Running
admin
committed on
Commit
·
4ee714e
1
Parent(s):
685acde
2 en
Browse files
app.py
CHANGED
@@ -101,17 +101,13 @@ def tts_fn(text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale):
|
|
101 |
|
102 |
def text_splitter(text: str):
|
103 |
punctuation = r"[。,;,!,?,〜,\n,\r,\t,.,!,;,?,~, ]"
|
104 |
-
# 使用正则表达式根据标点符号分割文本,并忽略重叠的分隔符
|
105 |
sentences = re.split(punctuation, text.strip())
|
106 |
-
# 过滤掉空字符串
|
107 |
return [sentence.strip() for sentence in sentences if sentence.strip()]
|
108 |
|
109 |
|
110 |
def concatenate_audios(audio_samples, sample_rate=44100):
|
111 |
half_second_silence = np.zeros(int(sample_rate / 2))
|
112 |
-
# 初始化最终的音频数组
|
113 |
final_audio = audio_samples[0]
|
114 |
-
# 遍历音频样本列表,并将它们连接起来,每个样本之间插入半秒钟的静音
|
115 |
for sample in audio_samples[1:]:
|
116 |
final_audio = np.concatenate((final_audio, half_second_silence, sample))
|
117 |
|
@@ -121,19 +117,18 @@ def concatenate_audios(audio_samples, sample_rate=44100):
|
|
121 |
|
122 |
def read_text(file_path: str):
|
123 |
try:
|
124 |
-
# 打开文件并读取内容
|
125 |
with open(file_path, "r", encoding="utf-8") as file:
|
126 |
content = file.read()
|
127 |
return content
|
128 |
|
129 |
except FileNotFoundError:
|
130 |
-
print(f"
|
131 |
|
132 |
except IOError:
|
133 |
-
print(f"
|
134 |
|
135 |
except Exception as e:
|
136 |
-
print(f"
|
137 |
|
138 |
|
139 |
def infer_tab1(text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale):
|
@@ -215,62 +210,98 @@ if __name__ == "__main__":
|
|
215 |
with gr.Blocks() as app:
|
216 |
gr.Markdown(
|
217 |
"""
|
218 |
-
<
|
219 |
-
欢迎使用此创空间, 此创空间基于 <a href="https://github.com/fishaudio/Bert-VITS2">Bert-vits2</a> 开源项目制作,完全免费。使用此创空间必须遵守当地相关法律法规,禁止用其从事任何违法犯罪活动。首次推理需耗时下载模型,还请耐心等待。另外,移至最底端有原理浅讲。
|
220 |
-
</center>
|
221 |
-
"""
|
222 |
)
|
223 |
|
224 |
-
with gr.Tab("
|
225 |
gr.Interface(
|
226 |
-
fn=infer_tab2,
|
227 |
inputs=[
|
228 |
-
gr.TextArea(
|
229 |
-
|
|
|
|
|
|
|
|
|
230 |
gr.Slider(
|
231 |
-
minimum=0,
|
232 |
-
|
|
|
|
|
|
|
|
|
233 |
gr.Slider(
|
234 |
-
minimum=0.1,
|
|
|
|
|
|
|
|
|
235 |
),
|
236 |
gr.Slider(
|
237 |
-
minimum=0.1,
|
|
|
|
|
|
|
|
|
238 |
),
|
239 |
gr.Slider(
|
240 |
-
minimum=0.1,
|
|
|
|
|
|
|
|
|
241 |
),
|
242 |
],
|
243 |
-
outputs=gr.Audio(label="
|
244 |
flagging_mode="never",
|
245 |
concurrency_limit=4,
|
246 |
)
|
247 |
|
248 |
-
with gr.Tab("
|
249 |
gr.Interface(
|
250 |
-
fn=infer_tab1, #
|
251 |
inputs=[
|
252 |
gr.components.File(
|
253 |
-
label="
|
254 |
type="filepath",
|
255 |
file_types=[".txt"],
|
256 |
),
|
257 |
-
gr.Dropdown(choices=speakers, value="莱依拉", label="
|
258 |
gr.Slider(
|
259 |
-
minimum=0,
|
260 |
-
|
|
|
|
|
|
|
|
|
261 |
gr.Slider(
|
262 |
-
minimum=0.1,
|
|
|
|
|
|
|
|
|
263 |
),
|
264 |
gr.Slider(
|
265 |
-
minimum=0.1,
|
|
|
|
|
|
|
|
|
266 |
),
|
267 |
gr.Slider(
|
268 |
-
minimum=0.1,
|
|
|
|
|
|
|
|
|
269 |
),
|
270 |
],
|
271 |
outputs=[
|
272 |
-
gr.Audio(label="
|
273 |
-
gr.TextArea(
|
|
|
|
|
|
|
274 |
],
|
275 |
flagging_mode="never",
|
276 |
concurrency_limit=4,
|
|
|
101 |
|
102 |
def text_splitter(text: str):
|
103 |
punctuation = r"[。,;,!,?,〜,\n,\r,\t,.,!,;,?,~, ]"
|
|
|
104 |
sentences = re.split(punctuation, text.strip())
|
|
|
105 |
return [sentence.strip() for sentence in sentences if sentence.strip()]
|
106 |
|
107 |
|
108 |
def concatenate_audios(audio_samples, sample_rate=44100):
|
109 |
half_second_silence = np.zeros(int(sample_rate / 2))
|
|
|
110 |
final_audio = audio_samples[0]
|
|
|
111 |
for sample in audio_samples[1:]:
|
112 |
final_audio = np.concatenate((final_audio, half_second_silence, sample))
|
113 |
|
|
|
117 |
|
118 |
def read_text(file_path: str):
|
119 |
try:
|
|
|
120 |
with open(file_path, "r", encoding="utf-8") as file:
|
121 |
content = file.read()
|
122 |
return content
|
123 |
|
124 |
except FileNotFoundError:
|
125 |
+
print(f"File Not Found: {file_path}")
|
126 |
|
127 |
except IOError:
|
128 |
+
print(f"An error occurred reading the file: {file_path}")
|
129 |
|
130 |
except Exception as e:
|
131 |
+
print(f"An unknown error has occurred: {e}")
|
132 |
|
133 |
|
134 |
def infer_tab1(text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale):
|
|
|
210 |
with gr.Blocks() as app:
|
211 |
gr.Markdown(
|
212 |
"""
|
213 |
+
Welcome to the Space, which is based on the open source project <a href="https://github.com/fishaudio/Bert-VITS2">Bert-vits2</a>, and moved to the bottom for an explanation of the principle. This Space must be used in accordance with local laws and regulations, prohibiting the use of it for any criminal activities."""
|
|
|
|
|
|
|
214 |
)
|
215 |
|
216 |
+
with gr.Tab("Input Mode"):
|
217 |
gr.Interface(
|
218 |
+
fn=infer_tab2,
|
219 |
inputs=[
|
220 |
+
gr.TextArea(
|
221 |
+
label="Please input the Simplified Chinese text",
|
222 |
+
placeholder="The first inference takes time to download the model, so be patient.",
|
223 |
+
show_copy_button=True,
|
224 |
+
),
|
225 |
+
gr.Dropdown(choices=speakers, value="莱依拉", label="Role"),
|
226 |
gr.Slider(
|
227 |
+
minimum=0,
|
228 |
+
maximum=1,
|
229 |
+
value=0.2,
|
230 |
+
step=0.1,
|
231 |
+
label="Modulation of intonation",
|
232 |
+
), # SDP/DP Mix Ratio
|
233 |
gr.Slider(
|
234 |
+
minimum=0.1,
|
235 |
+
maximum=2,
|
236 |
+
value=0.6,
|
237 |
+
step=0.1,
|
238 |
+
label="Emotional adjustment",
|
239 |
),
|
240 |
gr.Slider(
|
241 |
+
minimum=0.1,
|
242 |
+
maximum=2,
|
243 |
+
value=0.8,
|
244 |
+
step=0.1,
|
245 |
+
label="Phoneme length",
|
246 |
),
|
247 |
gr.Slider(
|
248 |
+
minimum=0.1,
|
249 |
+
maximum=2,
|
250 |
+
value=1,
|
251 |
+
step=0.1,
|
252 |
+
label="Output duration",
|
253 |
),
|
254 |
],
|
255 |
+
outputs=gr.Audio(label="Output Audio"),
|
256 |
flagging_mode="never",
|
257 |
concurrency_limit=4,
|
258 |
)
|
259 |
|
260 |
+
with gr.Tab("Upload Mode"):
|
261 |
gr.Interface(
|
262 |
+
fn=infer_tab1, # Use text_to_speech func
|
263 |
inputs=[
|
264 |
gr.components.File(
|
265 |
+
label="Please upload a simplified Chinese TXT",
|
266 |
type="filepath",
|
267 |
file_types=[".txt"],
|
268 |
),
|
269 |
+
gr.Dropdown(choices=speakers, value="莱依拉", label="Role"),
|
270 |
gr.Slider(
|
271 |
+
minimum=0,
|
272 |
+
maximum=1,
|
273 |
+
value=0.2,
|
274 |
+
step=0.1,
|
275 |
+
label="Modulation of intonation",
|
276 |
+
),
|
277 |
gr.Slider(
|
278 |
+
minimum=0.1,
|
279 |
+
maximum=2,
|
280 |
+
value=0.6,
|
281 |
+
step=0.1,
|
282 |
+
label="Emotional adjustment",
|
283 |
),
|
284 |
gr.Slider(
|
285 |
+
minimum=0.1,
|
286 |
+
maximum=2,
|
287 |
+
value=0.8,
|
288 |
+
step=0.1,
|
289 |
+
label="Phoneme length",
|
290 |
),
|
291 |
gr.Slider(
|
292 |
+
minimum=0.1,
|
293 |
+
maximum=2,
|
294 |
+
value=1,
|
295 |
+
step=0.1,
|
296 |
+
label="Output duration",
|
297 |
),
|
298 |
],
|
299 |
outputs=[
|
300 |
+
gr.Audio(label="Output Audio"),
|
301 |
+
gr.TextArea(
|
302 |
+
label="Result of TXT extraction",
|
303 |
+
show_copy_button=True,
|
304 |
+
),
|
305 |
],
|
306 |
flagging_mode="never",
|
307 |
concurrency_limit=4,
|