Mahiruoshi committed on
Commit
332dcef
1 Parent(s): ecec7dc

Upload audiobook.py

Browse files
Files changed (1) hide show
  1. audiobook.py +194 -0
audiobook.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import re
3
+ import numpy as np
4
+ import IPython.display as ipd
5
+ import torch
6
+ import commons
7
+ import utils
8
+ from models import SynthesizerTrn
9
+ from text.symbols import symbols
10
+ from text import text_to_sequence
11
+ import gradio as gr
12
+ import time
13
+ import json
14
+ import datetime
15
+ import os
16
+ import pickle
17
+ from scipy.io.wavfile import write
18
+ import librosa
19
+ import romajitable
20
+ from mel_processing import spectrogram_torch
21
+ import soundfile as sf
22
+ from scipy import signal
23
class VitsGradio:
    """Gradio app that reads a text file aloud with a VITS model.

    The UI built in ``__init__`` has two tabs: one uploads a text file and
    starts synthesis (``tts_fn``), the other selects the checkpoint, language,
    speaker and inference scales (``create_tts_fn``).  The settings tab has to
    be completed first because it loads the model that ``tts_fn`` uses.

    For every batch of sentences ``tts_fn`` writes one WAV file and one SRT
    subtitle file next to each other.
    """

    # Sample rate used when the model config does not declare one.
    _DEFAULT_SAMPLING_RATE = 22050
    # Fragments longer than this are split again at clause punctuation.
    _MAX_SENTENCE_LEN = 50
    # Number of sentences rendered into a single WAV/SRT pair.
    _SENTENCES_PER_FILE = 1000

    _ENGLISH_RE = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')

    # Sentence / clause delimiters.  The \uffXX entries are the fullwidth forms
    # of '!', ':', ';' and ',' that CJK prose normally uses; matching only the
    # ASCII forms leaves such text unsplit.
    _SENTENCE_SPLIT_RE = re.compile('。|!|\uff01|——|:|\uff1a|;|\uff1b|……')
    _CLAUSE_SPLIT_RE = re.compile('。|!|\uff01|——|,|\uff0c|:|\uff1a')

    # Applied in order to every fragment.  Whitespace must be removed before
    # '……' is collapsed so that "… …" becomes a single '。'.
    _FRAGMENT_REPLACEMENTS = (
        ('\n', ''),
        (' ', ''),
        ('……', '。'),
        ('…', '。'),
        ('还', '孩'),  # NOTE(review): presumably forces the "hai" reading - confirm
        ('“', ''),
        ('”', ''),
        ('!', '。'),
        ('\uff01', '。'),
        ('」', ''),
        ('「', ''),
    )

    # Opening brackets map to '<' and closing brackets to '>' so that
    # ``transfer`` can drop bracketed passages with a single regex.
    # \uff08 / \uff09 are the fullwidth parentheses.
    _BRACKET_TABLE = str.maketrans('【[(\uff08〔】])\uff09〕', '<' * 5 + '>' * 5)

    def __init__(self):
        """Discover available checkpoints and build the Gradio interface."""
        self.lan = ["中文", "日文", "自动"]
        # Only the immediate sub-directories of "checkpoints" are model folders
        # (create_tts_fn loads checkpoints/<name>/config.json), so do not
        # recurse.  A missing directory simply yields no models.
        self.modelPaths = list(next(os.walk("checkpoints"), (None, [], []))[1])
        # Avoid an IndexError when no checkpoint folder exists yet.
        default_model = self.modelPaths[0] if self.modelPaths else None

        with gr.Blocks() as self.Vits:
            with gr.Tab("小说合成"):
                with gr.Row():
                    with gr.Column():
                        with gr.Row():
                            with gr.Column():
                                self.Text = gr.File(label="Text")
                                self.audio_path = gr.TextArea(label="音频路径", lines=1, value='audiobook/chapter.wav')
                                btnbook = gr.Button("小说合成")
                                btnbook.click(self.tts_fn, inputs=[self.Text, self.audio_path])
            with gr.Tab("TTS设定"):
                with gr.Row():
                    with gr.Column():
                        with gr.Row():
                            with gr.Column():
                                self.input1 = gr.Dropdown(label="模型", choices=self.modelPaths, value=default_model, type="value")
                                self.input2 = gr.Dropdown(label="Language", choices=self.lan, value="自动", interactive=True)
                                self.input3 = gr.Dropdown(label="Speaker", choices=list(range(1001)), value=0, interactive=True)
                                self.input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.6)
                                self.input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.667)
                                self.input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
                                statusa = gr.TextArea()
                                btnVC = gr.Button("完成vits TTS端设定")
                                btnVC.click(
                                    self.create_tts_fn,
                                    inputs=[self.input1, self.input2, self.input3, self.input4, self.input5, self.input6],
                                    outputs=[statusa],
                                )

    def is_japanese(self, string):
        """Return True if *string* contains a code point strictly between U+3040 and U+30FF."""
        return any(0x3040 < ord(ch) < 0x30FF for ch in string)

    def is_english(self, string):
        """Return True if *string* is non-empty and made only of ASCII letters, digits, spaces and basic punctuation."""
        return self._ENGLISH_RE.fullmatch(string) is not None

    def get_text(self, text, hps, cleaned=False):
        """Convert *text* into a ``torch.LongTensor`` of symbol ids.

        Args:
            text: Text to convert (already wrapped in language tags by ``sle``).
            hps: Hyper-parameters providing ``symbols``, ``data.text_cleaners``
                and ``data.add_blank``.  ``None`` falls back to the loaded
                model's ``self.hps_ms``.
            cleaned: When true, no text cleaners are applied.
        """
        hps = self.hps_ms if hps is None else hps
        cleaners = [] if cleaned else hps.data.text_cleaners
        text_norm = text_to_sequence(text, hps.symbols, cleaners)
        if hps.data.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        return torch.LongTensor(text_norm)

    def get_label(self, text, label):
        """Strip every ``[label]`` marker from *text*.

        Returns:
            ``(found, text_without_marker)``.
        """
        marker = f'[{label}]'
        if marker in text:
            return True, text.replace(marker, '')
        return False, text

    def sle(self, language, text):
        """Wrap *text* in the ``[ZH]``/``[JA]`` tags for the chosen language.

        Newlines become '。' and spaces become ',' first.  With "自动" the tag
        is chosen by ``is_japanese``.  Returns ``None`` for any other
        *language* value.
        """
        text = text.replace('\n', '。').replace(' ', ',')
        if language == "中文":
            tag = "[ZH]"
        elif language == "日文":
            tag = "[JA]"
        elif language == "自动":
            tag = "[JA]" if self.is_japanese(text) else "[ZH]"
        else:
            return None
        return f"{tag}{text}{tag}"

    def create_tts_fn(self, path, input2, input3, n_scale=0.667, n_scale_w=0.8, l_scale=1):
        """Load the model in ``checkpoints/<path>`` and store the synthesis settings.

        Args:
            path: Name of the checkpoint folder below ``checkpoints``.
            input2: Language choice (one of ``self.lan``).
            input3: Speaker id; converted with ``int``.
            n_scale: ``noise_scale`` passed to inference.
            n_scale_w: ``noise_scale_w`` passed to inference.
            l_scale: ``length_scale`` passed to inference.

        Returns:
            ``'success'`` once the model is loaded, or an explanatory message
            when no model was selected.
        """
        if not path:
            return 'failed: no model selected (is the "checkpoints" directory empty?)'

        speaker_id = int(input3)
        dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        hps_ms = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
        n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0
        n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0
        net_g_ms = SynthesizerTrn(
            n_symbols,
            hps_ms.data.filter_length // 2 + 1,
            hps_ms.train.segment_size // hps_ms.data.hop_length,
            n_speakers=n_speakers,
            **hps_ms.model).to(dev)
        net_g_ms.eval()
        utils.load_checkpoint(f"checkpoints/{path}/model.pth", net_g_ms)

        # Publish the new state only after everything loaded, so a failed load
        # never leaves a half-initialised model on the instance.
        self.language = input2
        self.speaker_id = speaker_id
        self.n_scale = n_scale
        self.n_scale_w = n_scale_w
        self.l_scale = l_scale
        self.dev = dev
        self.hps_ms = hps_ms
        self.n_speakers = n_speakers
        self.n_symbols = n_symbols
        self.net_g_ms = net_g_ms
        return 'success'

    def _normalize_fragment(self, fragment):
        """Apply ``_FRAGMENT_REPLACEMENTS`` once (a single pass is already a fixed point)."""
        for old, new in self._FRAGMENT_REPLACEMENTS:
            fragment = fragment.replace(old, new)
        return fragment

    def transfer(self, text):
        """Split *text* into the list of sentences to synthesise.

        ``<...>`` passages are removed, each line is split at sentence
        punctuation, ASCII-only fragments are converted to katakana, and
        fragments are normalised.  Fragments of one character or less are
        dropped.  Fragments longer than ``_MAX_SENTENCE_LEN`` are split again
        at clause punctuation, with '。' appended to each resulting part.
        """
        text = re.sub("<[^>]*>", "", text)
        final_list = []
        for line in text.split('\n'):
            for fragment in self._SENTENCE_SPLIT_RE.split(line):
                if self.is_english(fragment):
                    fragment = romajitable.to_kana(fragment).katakana
                fragment = self._normalize_fragment(fragment)
                if len(fragment) <= 1:
                    continue
                if len(fragment) > self._MAX_SENTENCE_LEN:
                    final_list.extend(
                        clause + '。'
                        for clause in self._CLAUSE_SPLIT_RE.split(fragment)
                        if len(clause) > 1
                    )
                else:
                    final_list.append(fragment)
        return final_list

    @staticmethod
    def _srt_timestamp(delta):
        """Format a ``datetime.timedelta`` as an SRT time code ``HH:MM:SS,mmm``."""
        total_ms = delta // datetime.timedelta(milliseconds=1)
        hours, rest = divmod(total_ms, 3600000)
        minutes, rest = divmod(rest, 60000)
        seconds, millis = divmod(rest, 1000)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"

    def _synthesize(self, sentence):
        """Run inference for one sentence; return ``(audio, seconds_spent_in_infer)``."""
        with torch.no_grad():
            stn_tst = self.get_text(self.sle(self.language, sentence), self.hps_ms, cleaned=False)
            x_tst = stn_tst.unsqueeze(0).to(self.dev)
            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(self.dev)
            sid = torch.LongTensor([self.speaker_id]).to(self.dev)
            t1 = time.time()
            audio = self.net_g_ms.infer(
                x_tst, x_tst_lengths, sid=sid,
                noise_scale=self.n_scale, noise_scale_w=self.n_scale_w,
                length_scale=self.l_scale)[0][0, 0].data.cpu().float().numpy()
            t2 = time.time()
        return audio, t2 - t1

    def tts_fn(self, text, audio_path):
        """Synthesise an uploaded text file into numbered WAV + SRT files.

        Args:
            text: Uploaded file; either an object with a ``name`` attribute
                holding the path, or the path itself.  Read as UTF-8.
            audio_path: Output path such as ``audiobook/chapter.wav``.  Batch
                ``k`` is written to ``<stem>k.wav`` and ``<stem>k.srt``.

        Raises:
            ValueError: No file was uploaded.
            RuntimeError: ``create_tts_fn`` has not loaded a model yet.

        A sentence whose synthesis fails is reported and skipped, so one bad
        sentence does not abort the whole book.
        """
        if text is None:
            raise ValueError("no text file was uploaded")
        if not hasattr(self, "net_g_ms"):
            raise RuntimeError(
                "the TTS model is not loaded yet; complete the TTS settings tab first")

        with open(getattr(text, "name", text), "r", encoding="utf-8") as f:
            text = f.read()
        # Bracketed passages become <...> and are removed by transfer().
        sentences = self.transfer(text.translate(self._BRACKET_TABLE))

        audio_path = audio_path.strip()
        # Derive both file names from the stem, so a path without ".wav" does
        # not make the subtitle and audio files collide.
        stem = audio_path[:-len('.wav')] if audio_path.endswith('.wav') else audio_path
        out_dir = os.path.dirname(stem)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # NOTE(review): assumes the config exposes data.sampling_rate as an
        # attribute; falls back to 22050 Hz otherwise.
        sampling_rate = getattr(self.hps_ms.data, "sampling_rate", self._DEFAULT_SAMPLING_RATE)

        step = self._SENTENCES_PER_FILE
        for file_index, start in enumerate(range(0, len(sentences), step)):
            audio_fin = []
            t = datetime.timedelta(seconds=0)
            with open(f"{stem}{file_index}.srt", 'w', encoding='utf-8') as srt_file:
                for c, sentence in enumerate(sentences[start:start + step], start=1):
                    try:
                        audio, elapsed = self._synthesize(sentence)
                        # Audio is upsampled by 2 and written at twice the model rate.
                        resampled_audio_data = signal.resample(audio, len(audio) * 2)
                    except Exception as exc:
                        print(f"sentence {c} skipped: {exc!r}")
                        continue
                    print("第"+str(c)+"句的推理时间为:"+str(elapsed)+"s")
                    time_start = self._srt_timestamp(t)
                    t += datetime.timedelta(seconds=len(audio) / float(sampling_rate))
                    time_end = self._srt_timestamp(t)
                    print(time_end)
                    # Cue numbers are the 0-based position of the sentence in its batch.
                    srt_file.write(str(c-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence.replace('。','')+'\n\n')
                    audio_fin.append(resampled_audio_data)
            if audio_fin:
                sf.write(f"{stem}{file_index}.wav", np.concatenate(audio_fin), sampling_rate * 2, 'PCM_24')
            else:
                print(f"no audio produced for batch {file_index}; WAV file not written")
190
+
191
if __name__ == '__main__':
    # Script entry point: announce start-up, build the UI, then serve it with
    # Gradio's default launch options.
    print("开始部署")
    grVits = VitsGradio()
    blocks_app = grVits.Vits
    blocks_app.launch()