Spaces:
Running
on
Zero
Running
on
Zero
Upload app.py
Browse files
app.py
CHANGED
@@ -17,11 +17,6 @@ device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
|
17 |
snapshot = snapshot_download(repo_id='hexgrad/kokoro', allow_patterns=['*.pt', '*.pth', '*.yml'], use_auth_token=os.environ['TOKEN'])
|
18 |
config = yaml.safe_load(open(os.path.join(snapshot, 'config.yml')))
|
19 |
model = build_model(config['model_params'])
|
20 |
-
for key, value in model.items():
|
21 |
-
for module in value.children():
|
22 |
-
if isinstance(module, torch.nn.RNNBase):
|
23 |
-
module.flatten_parameters()
|
24 |
-
|
25 |
_ = [model[key].eval() for key in model]
|
26 |
_ = [model[key].to(device) for key in model]
|
27 |
for key, state_dict in torch.load(os.path.join(snapshot, 'net.pth'), map_location='cpu', weights_only=True)['net'].items():
|
@@ -50,6 +45,25 @@ def get_random_text(voice):
|
|
50 |
def parens_to_angles(s):
|
51 |
return s.replace('(', '«').replace(')', '»')
|
52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
def normalize(text):
|
54 |
# TODO: Custom text normalization rules?
|
55 |
text = re.sub(r'\bD[Rr]\.(?= [A-Z])', 'Doctor', text)
|
@@ -63,7 +77,10 @@ def normalize(text):
|
|
63 |
text = re.sub(r'[^\S \n]', ' ', text)
|
64 |
text = re.sub(r' +', ' ', text)
|
65 |
text = re.sub(r'(?<=\n) +(?=\n)', '', text)
|
|
|
66 |
text = re.sub(r'(?<=\d),(?=\d)', '', text)
|
|
|
|
|
67 |
return parens_to_angles(text).strip()
|
68 |
|
69 |
phonemizers = dict(
|
@@ -428,17 +445,19 @@ with gr.Blocks() as about:
|
|
428 |
gr.Markdown("""
|
429 |
Kokoro is a frontier TTS model for its size. It has 80 million parameters,<sup>[1]</sup> uses a lean StyleTTS 2 architecture,<sup>[2]</sup> and was trained on high-quality data.
|
430 |
|
431 |
-
The weights are currently private, but a free public demo is hosted at https://hf.co/spaces/hexgrad/Kokoro-TTS
|
|
|
|
|
|
|
|
|
|
|
|
|
432 |
|
433 |
### Compute
|
434 |
The model was trained on 1x A100-class 80GB instances rented from [Vast.ai](https://cloud.vast.ai/?ref_id=79907).<sup>[3]</sup><br/>
|
435 |
Vast was chosen over other compute providers due to its competitive on-demand hourly rates.<br/>
|
436 |
The average hourly cost for the 1x A100-class 80GB VRAM instances used for training was below $1/hr — around half the quoted rates from other providers.
|
437 |
|
438 |
-
### Voice Stability
|
439 |
-
⭐ Starred voices are more stable. 🧪 Experimental voices are less stable.<br/>
|
440 |
-
Unstable voices may be more likely to stumble or produce unnatural artifacts, especially on shorter texts.
|
441 |
-
|
442 |
### Licenses
|
443 |
Inference code: MIT<br/>
|
444 |
espeak-ng dependency: GPL-3.0<sup>[4]</sup><br/>
|
@@ -446,7 +465,7 @@ Random English texts: Unknown<sup>[5]</sup><br/>
|
|
446 |
Random Japanese texts: CC0 public domain<sup>[6]</sup>
|
447 |
|
448 |
### References
|
449 |
-
1. Kokoro parameter count | https://hf.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#
|
450 |
2. StyleTTS 2 | https://github.com/yl4579/StyleTTS2
|
451 |
3. Vast.ai referral link | https://cloud.vast.ai/?ref_id=79907
|
452 |
4. eSpeak NG | https://github.com/espeak-ng/espeak-ng
|
|
|
17 |
# Download only the weight/config files of the model repo. The access token is
# read from the TOKEN environment variable (KeyError if it is not set).
# NOTE(review): recent huggingface_hub releases deprecate `use_auth_token` in
# favour of `token` — confirm the installed version before switching.
snapshot = snapshot_download(repo_id='hexgrad/kokoro', allow_patterns=['*.pt', '*.pth', '*.yml'], use_auth_token=os.environ['TOKEN'])
# Use a context manager so the config file handle is closed deterministically
# instead of being left for the garbage collector.
with open(os.path.join(snapshot, 'config.yml')) as config_file:
    config = yaml.safe_load(config_file)
model = build_model(config['model_params'])
# Switch every entry of `model` to eval mode, then move each one to `device`.
# Plain loops: these calls are made for their side effects, not their results.
for key in model:
    model[key].eval()
for key in model:
    model[key].to(device)
|
22 |
for key, state_dict in torch.load(os.path.join(snapshot, 'net.pth'), map_location='cpu', weights_only=True)['net'].items():
|
|
|
45 |
def parens_to_angles(s):
    """Return `s` with every '(' replaced by '«' and every ')' replaced by '»'."""
    bracket_map = str.maketrans('()', '«»')
    return s.translate(bracket_map)
|
47 |
|
48 |
+
def split_num(num):
    """Rewrite a decimal number or a 4-digit year into a form that is read aloud naturally.

    Intended as the replacement callback of `re.sub`, which passes a match
    object; a plain string is accepted as well.

    Args:
        num: A `re.Match` (the matched text is taken from `.group()`) or a
            `str`. Either a decimal such as '3.14' / '.5', or exactly four
            digits such as '1999'.

    Returns:
        - Decimals: '<integer part> point <fraction digits separated by spaces>',
          e.g. '3.14' -> '3 point 1 4'.
        - Years below 1100, or whose last three digits are below 10
          (e.g. '2005', '3000'): returned unchanged.
        - Other years: split into two halves, e.g. '1999' -> '19 99',
          '1900' -> '19 hundred', '1905' -> '19 oh 5'.

    Raises:
        ValueError: if the text is neither a decimal nor exactly four digits.
    """
    # re.sub hands its callback a Match object, not the matched text.
    if not isinstance(num, str):
        num = num.group()
    if '.' in num:
        # Spell the fractional digits one by one.
        whole, _, fraction = num.partition('.')
        fraction = ' '.join(fraction)
        return f'{whole} point {fraction}'
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if not (num.isdigit() and len(num) == 4):
        raise ValueError(f'expected a decimal or a 4-digit number, got {num!r}')
    year = int(num)
    if year < 1100 or year % 1000 < 10:
        return num
    first_half = num[:2]
    second_half = num[2:]
    second_half_int = int(second_half)
    if 100 <= year % 1000 <= 999:
        if second_half == '00':
            return f'{first_half} hundred'
        elif second_half_int < 10:
            return f'{first_half} oh {second_half_int}'
    return ' '.join([first_half, second_half])
|
66 |
+
|
67 |
def normalize(text):
|
68 |
# TODO: Custom text normalization rules?
|
69 |
text = re.sub(r'\bD[Rr]\.(?= [A-Z])', 'Doctor', text)
|
|
|
77 |
text = re.sub(r'[^\S \n]', ' ', text)
|
78 |
text = re.sub(r' +', ' ', text)
|
79 |
text = re.sub(r'(?<=\n) +(?=\n)', '', text)
|
80 |
+
text = re.sub(r'\d*\.\d+|\b\d{4}\b', split_num, text)
|
81 |
text = re.sub(r'(?<=\d),(?=\d)', '', text)
|
82 |
+
text = re.sub(r'(?<=\d)-(?=\d)', ' to ', text) # TODO: could be minus
|
83 |
+
text = re.sub(r'(?<=\d):(?=\d)', ' ', text)
|
84 |
return parens_to_angles(text).strip()
|
85 |
|
86 |
phonemizers = dict(
|
|
|
445 |
gr.Markdown("""
|
446 |
Kokoro is a frontier TTS model for its size. It has 80 million parameters,<sup>[1]</sup> uses a lean StyleTTS 2 architecture,<sup>[2]</sup> and was trained on high-quality data.
|
447 |
|
448 |
+
The weights are currently private, but a free public demo is hosted here, at `https://hf.co/spaces/hexgrad/Kokoro-TTS`
|
449 |
+
|
450 |
+
### Will this be open sourced?
|
451 |
+
There currently isn't a release date scheduled for the weights. The inference code in this Space is MIT-licensed. The architecture was already published by Li et al., with MIT-licensed code and pretrained weights.<sup>[2]</sup>
|
452 |
+
|
453 |
+
### What does it mean if a voice is unstable?
|
454 |
+
An unstable voice is more likely to stumble or produce unnatural artifacts, especially on short or strange texts.
|
455 |
|
456 |
### Compute
|
457 |
The model was trained on 1x A100-class 80GB instances rented from [Vast.ai](https://cloud.vast.ai/?ref_id=79907).<sup>[3]</sup><br/>
|
458 |
Vast was chosen over other compute providers due to its competitive on-demand hourly rates.<br/>
|
459 |
The average hourly cost for the 1x A100-class 80GB VRAM instances used for training was below $1/hr — around half the quoted rates from other providers.
|
460 |
|
|
|
|
|
|
|
|
|
461 |
### Licenses
|
462 |
Inference code: MIT<br/>
|
463 |
espeak-ng dependency: GPL-3.0<sup>[4]</sup><br/>
|
|
|
465 |
Random Japanese texts: CC0 public domain<sup>[6]</sup>
|
466 |
|
467 |
### References
|
468 |
+
1. Kokoro parameter count | https://hf.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#L31
|
469 |
2. StyleTTS 2 | https://github.com/yl4579/StyleTTS2
|
470 |
3. Vast.ai referral link | https://cloud.vast.ai/?ref_id=79907
|
471 |
4. eSpeak NG | https://github.com/espeak-ng/espeak-ng
|