hexgrad committed on
Commit
61044da
·
verified ·
1 Parent(s): 0798678

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -11
app.py CHANGED
@@ -17,11 +17,6 @@ device = 'cuda' if torch.cuda.is_available() else 'cpu'
17
  snapshot = snapshot_download(repo_id='hexgrad/kokoro', allow_patterns=['*.pt', '*.pth', '*.yml'], use_auth_token=os.environ['TOKEN'])
18
  config = yaml.safe_load(open(os.path.join(snapshot, 'config.yml')))
19
  model = build_model(config['model_params'])
20
- for key, value in model.items():
21
- for module in value.children():
22
- if isinstance(module, torch.nn.RNNBase):
23
- module.flatten_parameters()
24
-
25
  _ = [model[key].eval() for key in model]
26
  _ = [model[key].to(device) for key in model]
27
  for key, state_dict in torch.load(os.path.join(snapshot, 'net.pth'), map_location='cpu', weights_only=True)['net'].items():
@@ -50,6 +45,25 @@ def get_random_text(voice):
50
  def parens_to_angles(s):
51
  return s.replace('(', '«').replace(')', '»')
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  def normalize(text):
54
  # TODO: Custom text normalization rules?
55
  text = re.sub(r'\bD[Rr]\.(?= [A-Z])', 'Doctor', text)
@@ -63,7 +77,10 @@ def normalize(text):
63
  text = re.sub(r'[^\S \n]', ' ', text)
64
  text = re.sub(r' +', ' ', text)
65
  text = re.sub(r'(?<=\n) +(?=\n)', '', text)
 
66
  text = re.sub(r'(?<=\d),(?=\d)', '', text)
 
 
67
  return parens_to_angles(text).strip()
68
 
69
  phonemizers = dict(
@@ -428,17 +445,19 @@ with gr.Blocks() as about:
428
  gr.Markdown("""
429
  Kokoro is a frontier TTS model for its size. It has 80 million parameters,<sup>[1]</sup> uses a lean StyleTTS 2 architecture,<sup>[2]</sup> and was trained on high-quality data.
430
 
431
- The weights are currently private, but a free public demo is hosted at https://hf.co/spaces/hexgrad/Kokoro-TTS
 
 
 
 
 
 
432
 
433
  ### Compute
434
  The model was trained on 1x A100-class 80GB instances rented from [Vast.ai](https://cloud.vast.ai/?ref_id=79907).<sup>[3]</sup><br/>
435
  Vast was chosen over other compute providers due to its competitive on-demand hourly rates.<br/>
436
  The average hourly cost for the 1x A100-class 80GB VRAM instances used for training was below $1/hr — around half the quoted rates from other providers.
437
 
438
- ### Voice Stability
439
- ⭐ Starred voices are more stable. 🧪 Experimental voices are less stable.<br/>
440
- Unstable voices may be more likely to stumble or produce unnatural artifacts, especially on shorter texts.
441
-
442
  ### Licenses
443
  Inference code: MIT<br/>
444
  espeak-ng dependency: GPL-3.0<sup>[4]</sup><br/>
@@ -446,7 +465,7 @@ Random English texts: Unknown<sup>[5]</sup><br/>
446
  Random Japanese texts: CC0 public domain<sup>[6]</sup>
447
 
448
  ### References
449
- 1. Kokoro parameter count | https://hf.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#L37
450
  2. StyleTTS 2 | https://github.com/yl4579/StyleTTS2
451
  3. Vast.ai referral link | https://cloud.vast.ai/?ref_id=79907
452
  4. eSpeak NG | https://github.com/espeak-ng/espeak-ng
 
17
  snapshot = snapshot_download(repo_id='hexgrad/kokoro', allow_patterns=['*.pt', '*.pth', '*.yml'], use_auth_token=os.environ['TOKEN'])
18
  config = yaml.safe_load(open(os.path.join(snapshot, 'config.yml')))
19
  model = build_model(config['model_params'])
 
 
 
 
 
20
  _ = [model[key].eval() for key in model]
21
  _ = [model[key].to(device) for key in model]
22
  for key, state_dict in torch.load(os.path.join(snapshot, 'net.pth'), map_location='cpu', weights_only=True)['net'].items():
 
45
  def parens_to_angles(s):
46
  return s.replace('(', '«').replace(')', '»')
47
 
48
def split_num(num):
    """Rewrite a decimal or a four-digit number into a speakable form.

    This is the replacement callback passed to ``re.sub`` in ``normalize``
    for the pattern ``\\d*\\.\\d+|\\b\\d{4}\\b``, so ``num`` normally arrives
    as an ``re.Match``. A plain ``str`` holding the matched text is accepted
    as well.

    Decimals: the fractional digits are read one by one, e.g.
    ``'3.14' -> '3 point 1 4'``.

    Four-digit numbers are read like years, in two halves:
      * below 1100, or with ``year % 1000 < 10`` (e.g. 2005): unchanged
      * ``XX00`` outside that case (e.g. 1900): ``'19 hundred'``
      * ``XX0Y`` outside that case (e.g. 1905): ``'19 oh 5'``
      * everything else (e.g. 1984, 2024): ``'19 84'``, ``'20 24'``

    Returns the replacement string. Raises ``AssertionError`` if a
    non-decimal input is not exactly four digits.
    """
    if not isinstance(num, str):
        # re.sub calls the callback with a Match object, not the matched text.
        num = num.group()
    if '.' in num:
        # Decimal branch: only take it when there actually is a dot to split on.
        a, b = num.split('.')
        b = ' '.join(b)
        return f'{a} point {b}'
    assert num.isdigit() and len(num) == 4, num
    year = int(num)
    if year < 1100 or year % 1000 < 10:
        # Read as an ordinary number (e.g. 1066, 2000-2009).
        return num
    first_half = num[:2]
    second_half = num[2:]
    second_half_int = int(second_half)
    if 100 <= year % 1000 <= 999:
        if second_half == '00':
            return f'{first_half} hundred'
        elif second_half_int < 10:
            return f'{first_half} oh {second_half_int}'
    return ' '.join([first_half, second_half])
66
+
67
  def normalize(text):
68
  # TODO: Custom text normalization rules?
69
  text = re.sub(r'\bD[Rr]\.(?= [A-Z])', 'Doctor', text)
 
77
  text = re.sub(r'[^\S \n]', ' ', text)
78
  text = re.sub(r' +', ' ', text)
79
  text = re.sub(r'(?<=\n) +(?=\n)', '', text)
80
+ text = re.sub(r'\d*\.\d+|\b\d{4}\b', split_num, text)
81
  text = re.sub(r'(?<=\d),(?=\d)', '', text)
82
+ text = re.sub(r'(?<=\d)-(?=\d)', ' to ', text) # TODO: could be minus
83
+ text = re.sub(r'(?<=\d):(?=\d)', ' ', text)
84
  return parens_to_angles(text).strip()
85
 
86
  phonemizers = dict(
 
445
  gr.Markdown("""
446
  Kokoro is a frontier TTS model for its size. It has 80 million parameters,<sup>[1]</sup> uses a lean StyleTTS 2 architecture,<sup>[2]</sup> and was trained on high-quality data.
447
 
448
+ The weights are currently private, but a free public demo is hosted here, at `https://hf.co/spaces/hexgrad/Kokoro-TTS`
449
+
450
+ ### Will this be open sourced?
451
+ There currently isn't a release date scheduled for the weights. The inference code in this space is MIT licensed. The architecture was already published by Li et al, with MIT licensed code and pretrained weights.<sup>[2]</sup>
452
+
453
+ ### What does it mean if a voice is unstable?
454
+ An unstable voice is more likely to stumble or produce unnatural artifacts, especially on short or strange texts.
455
 
456
  ### Compute
457
  The model was trained on 1x A100-class 80GB instances rented from [Vast.ai](https://cloud.vast.ai/?ref_id=79907).<sup>[3]</sup><br/>
458
  Vast was chosen over other compute providers due to its competitive on-demand hourly rates.<br/>
459
  The average hourly cost for the 1x A100-class 80GB VRAM instances used for training was below $1/hr — around half the quoted rates from other providers.
460
 
 
 
 
 
461
  ### Licenses
462
  Inference code: MIT<br/>
463
  espeak-ng dependency: GPL-3.0<sup>[4]</sup><br/>
 
465
  Random Japanese texts: CC0 public domain<sup>[6]</sup>
466
 
467
  ### References
468
+ 1. Kokoro parameter count | https://hf.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#L31
469
  2. StyleTTS 2 | https://github.com/yl4579/StyleTTS2
470
  3. Vast.ai referral link | https://cloud.vast.ai/?ref_id=79907
471
  4. eSpeak NG | https://github.com/espeak-ng/espeak-ng