speecht5_tts_jvs_ver1_e20_openjtalk_longer_20230809-031157_tokenizer
speecht5_openjtalk_tokenizer.py
ADDED
@@ -0,0 +1,105 @@
import json
import re
from itertools import chain
from pathlib import Path
from typing import List, Optional

from transformers import SpeechT5Tokenizer
from transformers.models.speecht5.tokenization_speecht5 import (
    PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
)


def _g2p_with_np(text: str, np_list: str) -> List[str]:
    """Convert text to phonemes with pyopenjtalk, passing any character in
    np_list through as a literal token instead of feeding it to g2p."""
    from pyopenjtalk import g2p

    np_pattern = re.compile(f"([{re.escape(np_list)}])")

    return list(
        chain.from_iterable(
            [
                (chunk,) if chunk in np_list else g2p(chunk, kana=False, join=False)
                for chunk in np_pattern.split(text)
                if len(chunk) > 0
            ]
        )
    )


# Punctuation and symbols kept as literal tokens rather than sent to g2p.
NP_CHARACTERS = " !\"#$%&'()=~|`{+*}<>?_-^\\@[;:],./ !”#$%&’()=~|`{+*}<>?_ー^¥@「;:」、。・`"


class SpeechT5OpenjtalkTokenizer(SpeechT5Tokenizer):
    vocab_files_names = {"vocab_file": "spm_char.model"}
    pretrained_vocab_files_map = {}
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]
    label2id = {}

    def __init__(
        self,
        vocab_file,
        bos_token: str = "<s>",
        eos_token: str = "</s>",
        unk_token: str = "<unk>",
        pad_token: str = "<pad>",
        non_phoneme_characters: str = NP_CHARACTERS,
        **kwargs,
    ):
        # The parent class loads a sentencepiece model, which this tokenizer
        # replaces with a JSON vocabulary; the TypeError raised by loading a
        # None vocab_file is deliberately swallowed.
        try:
            super().__init__(
                vocab_file=None,
                bos_token=bos_token,
                eos_token=eos_token,
                unk_token=unk_token,
                pad_token=pad_token,
                **kwargs,
            )
        except TypeError:
            pass

        self.non_phoneme_characters = non_phoneme_characters

        self.id2label = {}
        if isinstance(vocab_file, str) and vocab_file.endswith(".json"):
            with open(vocab_file, encoding="utf-8") as f:
                self.label2id = json.load(f)
            self.id2label = {v: k for k, v in self.label2id.items()}

    @property
    def bos_token_id(self) -> Optional[int]:
        return super().bos_token_id

    @property
    def vocab_size(self):
        return len(self.label2id)

    def get_vocab(self):
        return self.label2id

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ):
        # Prepend the optional prefix to the file name, per the transformers
        # save_vocabulary convention.
        prefix = f"{filename_prefix}-" if filename_prefix else ""
        vocab_path = Path(save_directory) / f"{prefix}vocab.json"
        vocab_path.parent.mkdir(parents=True, exist_ok=True)
        with open(vocab_path, "w", encoding="utf-8") as f:
            json.dump(self.label2id, f, ensure_ascii=False, indent=2)
        return (str(vocab_path),)

    def _tokenize(self, text: str) -> List[str]:
        return _g2p_with_np(text, self.non_phoneme_characters)

    def _convert_token_to_id(self, token):
        return self.label2id.get(token, self.label2id.get(self.unk_token))

    def _convert_id_to_token(self, index):
        return self.id2label.get(index, self.unk_token)
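
For reference, a minimal usage sketch (not part of the commit), under these assumptions: pyopenjtalk is installed, the module is importable under its file name, and a vocab.json mapping phoneme tokens to ids (for example, the file written by save_vocabulary above) sits in the working directory. The phoneme sequence in the comment is indicative of Open JTalk g2p output; exact tokens depend on the pyopenjtalk version.

# Hypothetical usage sketch; the vocab path and example output are assumptions.
from speecht5_openjtalk_tokenizer import SpeechT5OpenjtalkTokenizer

tokenizer = SpeechT5OpenjtalkTokenizer("vocab.json")

# Japanese text becomes a phoneme sequence; punctuation listed in
# NP_CHARACTERS survives as literal tokens, keeping pause cues visible.
tokens = tokenizer.tokenize("こんにちは、世界。")
print(tokens)
# e.g. ['k', 'o', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '、', 's', 'e', 'k', 'a', 'i', '。']

ids = tokenizer.convert_tokens_to_ids(tokens)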