Plachta committed on
Commit
08a99f7
·
1 Parent(s): 35aaf1e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -18
app.py CHANGED
@@ -3,6 +3,9 @@ import json
3
  import os
4
  import re
5
  import tempfile
 
 
 
6
  import librosa
7
  import numpy as np
8
  import torch
@@ -42,7 +45,6 @@ gr.Audio.postprocess = audio_postprocess
42
 
43
  limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
44
  max_len = 150
45
- empty_audio = np.zeros(22050)
46
  languages = ['日本語', '简体中文', 'English']
47
  characters = ['0:特别周', '1:无声铃鹿', '2:东海帝王', '3:丸善斯基',
48
  '4:富士奇迹', '5:小栗帽', '6:黄金船', '7:伏特加',
@@ -75,41 +77,40 @@ def show_memory_info(hint):
75
 
76
 
77
  def get_text(text, hps):
78
- text_norm = text_to_sequence(text, hps.data.text_cleaners)
79
  if hps.data.add_blank:
80
  text_norm = commons.intersperse(text_norm, 0)
81
  text_norm = torch.LongTensor(text_norm)
82
  return text_norm
83
 
84
-
85
  hps = utils.get_hparams_from_file("./configs/uma87.json")
86
- net_g = SynthesizerTrn(
87
- len(symbols),
88
  hps.data.filter_length // 2 + 1,
89
  hps.train.segment_size // hps.data.hop_length,
90
  n_speakers=hps.data.n_speakers,
91
  **hps.model)
92
  _ = net_g.eval()
93
 
94
- _ = utils.load_checkpoint("pretrained_models/G_1153000.pth", net_g, None)
95
 
96
  def infer(text_raw, character, language, duration, noise_scale, noise_scale_w):
97
  # check character & duration parameter
98
  if language not in languages:
99
  print("Error: No such language\n")
100
- return "Error: No such language", (22050, empty_audio)
101
  if character not in characters:
102
  print("Error: No such character\n")
103
- return "Error: No such character", (22050, empty_audio)
104
  # check text length
105
  if limitation:
106
  text_len = len(re.sub("\[([A-Z]{2})\]", "", text_raw))
107
  if text_len > max_len:
108
  print(f"Refused: Text too long ({text_len}).")
109
- return "Error: Text is too long", (22050, empty_audio)
110
  if text_len == 0:
111
  print("Refused: Text length is zero.")
112
- return "Error: Please input text!", (22050, empty_audio)
113
  if language == '日本語':
114
  text = text_raw
115
  elif language == '简体中文':
@@ -121,11 +122,10 @@ def infer(text_raw, character, language, duration, noise_scale, noise_scale_w):
121
  with torch.no_grad():
122
  x_tst = stn_tst.unsqueeze(0)
123
  x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
124
- sid = torch.LongTensor([char_id])
125
- audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
126
- length_scale=duration)[0][0, 0].data.cpu().float().numpy()
127
  currentDateAndTime = datetime.now()
128
- print(f"\nCharacter {character} inference successful: {text}")
129
  if language != '日本語':
130
  print(f"translate from {language}: {text_raw}")
131
  show_memory_info(str(currentDateAndTime) + " infer调用后")
@@ -160,10 +160,8 @@ if __name__ == "__main__":
160
  "This synthesizer is created based on [VITS](https://arxiv.org/abs/2106.06103) model, trained on voice data extracted from mobile game Umamusume Pretty Derby \n\n"
161
  "这个合成器是基于VITS文本到语音模型,在从手游《賽馬娘:Pretty Derby》解包的语音数据上训练得到。\n\n"
162
  "[introduction video / 模型介绍视频](https://www.bilibili.com/video/BV1T84y1e7p5/?vd_source=6d5c00c796eff1cbbe25f1ae722c2f9f#reply607277701)\n\n"
163
- "Runtime Error: Memory Limit Exceeded has not been resolved yet.\n\n"
164
- "In case of space crash, You may duplicate this space or [open in Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing) to run it privately and without any queue.\n\n"
165
- "Runtime Error: Memory Limit Exceeded 问题仍然没有解决。\n\n"
166
- "作为备用选项,建议您复制该空间至私人空间运行或打开[Google Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing)在线运行。\n\n"
167
  "If your input language is not Japanese, it will be translated to Japanese by Google translator, but accuracy is not guaranteed.\n\n"
168
  "如果您的输入语言不是日语,则会由谷歌翻译自动翻译为日语,但是准确性不能保证。\n\n"
169
  )
 
3
  import os
4
  import re
5
  import tempfile
6
+ import logging
7
+ logging.getLogger('numba').setLevel(logging.WARNING)
8
+ import ONNXVITS_infer
9
  import librosa
10
  import numpy as np
11
  import torch
 
45
 
46
  limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
47
  max_len = 150
 
48
  languages = ['日本語', '简体中文', 'English']
49
  characters = ['0:特别周', '1:无声铃鹿', '2:东海帝王', '3:丸善斯基',
50
  '4:富士奇迹', '5:小栗帽', '6:黄金船', '7:伏特加',
 
77
 
78
 
79
  def get_text(text, hps):
80
+ text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
81
  if hps.data.add_blank:
82
  text_norm = commons.intersperse(text_norm, 0)
83
  text_norm = torch.LongTensor(text_norm)
84
  return text_norm
85
 
 
86
  hps = utils.get_hparams_from_file("./configs/uma87.json")
87
+ net_g = ONNXVITS_infer.SynthesizerTrn(
88
+ len(hps.symbols),
89
  hps.data.filter_length // 2 + 1,
90
  hps.train.segment_size // hps.data.hop_length,
91
  n_speakers=hps.data.n_speakers,
92
  **hps.model)
93
  _ = net_g.eval()
94
 
95
+ _ = utils.load_checkpoint("pretrained_models/G_1153000.pth", net_g)
96
 
97
  def infer(text_raw, character, language, duration, noise_scale, noise_scale_w):
98
  # check character & duration parameter
99
  if language not in languages:
100
  print("Error: No such language\n")
101
+ return "Error: No such language", None
102
  if character not in characters:
103
  print("Error: No such character\n")
104
+ return "Error: No such character", None
105
  # check text length
106
  if limitation:
107
  text_len = len(re.sub("\[([A-Z]{2})\]", "", text_raw))
108
  if text_len > max_len:
109
  print(f"Refused: Text too long ({text_len}).")
110
+ return "Error: Text is too long", None
111
  if text_len == 0:
112
  print("Refused: Text length is zero.")
113
+ return "Error: Please input text!", None
114
  if language == '日本語':
115
  text = text_raw
116
  elif language == '简体中文':
 
122
  with torch.no_grad():
123
  x_tst = stn_tst.unsqueeze(0)
124
  x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
125
+ sid = torch.LongTensor([0])
126
+ audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=duration)[0][0,0].data.float().numpy()
 
127
  currentDateAndTime = datetime.now()
128
+ print(f"Character {character} inference successful: {text}\n")
129
  if language != '日本語':
130
  print(f"translate from {language}: {text_raw}")
131
  show_memory_info(str(currentDateAndTime) + " infer调用后")
 
160
  "This synthesizer is created based on [VITS](https://arxiv.org/abs/2106.06103) model, trained on voice data extracted from mobile game Umamusume Pretty Derby \n\n"
161
  "这个合成器是基于VITS文本到语音模型,在从手游《賽馬娘:Pretty Derby》解包的语音数据上训练得到。\n\n"
162
  "[introduction video / 模型介绍视频](https://www.bilibili.com/video/BV1T84y1e7p5/?vd_source=6d5c00c796eff1cbbe25f1ae722c2f9f#reply607277701)\n\n"
163
+ "You may duplicate this space or [open in Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing) to run it privately and without any queue.\n\n"
164
+ "您可以复制该空间至私人空间运行或打开[Google Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing)在线运行。\n\n"
 
 
165
  "If your input language is not Japanese, it will be translated to Japanese by Google translator, but accuracy is not guaranteed.\n\n"
166
  "如果您的输入语言不是日语,则会由谷歌翻译自动翻译为日语,但是准确性不能保证。\n\n"
167
  )