thewh1teagle committed on
Commit
8bf5be0
·
0 Parent(s):
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Phonemize in Hebrew
3
+ emoji: 🐢
4
+ colorFrom: red
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: "4.44.0"
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
app.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ uv sync
3
+ uv pip install "gradio>=5.15.0"
4
+ uv run gradio examples/editor.py
5
+ """
6
+
7
+ from mishkal import phonemize, normalize
8
+ import gradio as gr
9
+
10
+ default_text = """
11
+ כָּל עֶרֶב יָאִיר (הַשֵּׁם הַמָּלֵא וּמְקוֹם הָעֲבוֹדָה שֶׁלּוֹ שְׁמוּרִים בַּמַּעֲרֶכֶת) רָץ 20 קִילוֹמֶטֶר. הוּא מְסַפֵּר לִי שֶׁזֶּה מְנַקֶּה לוֹ אֶת הָרֹאשׁ אַחֲרֵי הָעֲבוֹדָה, "שָׁעָה וָחֵצִי בְּלִי עֲבוֹדָה, אִשָּׁה וִילָדִים" כְּמוֹ שֶׁהוּא מַגְדִּיר זֹאת. אֲבָל אַחֲרֵי הַמִּקְלַחַת הוּא מַתְחִיל בְּמָה שֶׁנִּתָּן לְכַנּוֹת הָעֲבוֹדָה הַשְּׁנִיָּה שֶׁלּוֹ: לִמְצֹא לוֹ קוֹלֵגוֹת חֲדָשׁוֹת לָעֲבוֹדָה, כִּי יָאִיר הוּא כַּנִּרְאֶה הַמֶּלֶךְ שֶׁל "חָבֵר מֵבִיא חָבֵר" בְּיִשְׂרָאֵל.
12
+ """
13
+
14
+ theme = gr.themes.Soft(font=[gr.themes.GoogleFont("Roboto")])
15
+
16
+
17
def on_submit_debug(text: str) -> str:
    """Return the phonemes of ``text`` followed by its normalized form.

    The text is phonemized with ``preserve_punctuation=True``; the result of
    ``normalize(text)`` is appended under a ``Normalized:`` heading so both
    can be inspected side by side.
    """
    sections = [
        phonemize(text, preserve_punctuation=True),
        normalize(text),
    ]
    return "\n\nNormalized:\n".join(sections)
21
+
22
+
23
def on_submit(text: str) -> str:
    """Phonemize ``text`` with ``preserve_punctuation=False``."""
    phonemes = phonemize(text, preserve_punctuation=False)
    return phonemes
25
+
26
+
27
+ with gr.Blocks(theme=theme) as demo:
28
+ text_input = gr.Textbox(
29
+ value=default_text, label="Text", rtl=True, elem_classes=["input"]
30
+ )
31
+ checkbox = gr.Checkbox(value=False, label="Enable Debug Mode")
32
+ phonemes_output = gr.Textbox(label="Phonemes")
33
+ submit_button = gr.Button("Create")
34
+
35
+ submit_button.click(
36
+ fn=lambda text, debug: on_submit_debug(text) if debug else on_submit(text),
37
+ inputs=[text_input, checkbox],
38
+ outputs=[phonemes_output],
39
+ )
40
+
41
+
42
+ if __name__ == "__main__":
43
+ demo.launch()
mishkal/__init__.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ High level phonemize functions
3
+ """
4
+
5
+ from .phonemize import Phonemizer
6
+ from .utils import normalize # noqa: F401
7
+ from typing import Callable
8
+
9
+ phonemizer = Phonemizer()
10
+
11
+
12
def phonemize(
    text: str,
    preserve_punctuation=True,
    preserve_stress=True,
    fallback: Callable[[str], str] = None,
) -> str:
    """Phonemize ``text`` with the shared module-level ``phonemizer``.

    Every keyword argument is forwarded unchanged to
    ``Phonemizer.phonemize``; this wrapper adds no processing of its own.
    """
    options = {
        "preserve_punctuation": preserve_punctuation,
        "preserve_stress": preserve_stress,
        "fallback": fallback,
    }
    return phonemizer.phonemize(text, **options)
mishkal/data/gold_9000.json ADDED
The diff for this file is too large to render. See raw diff
 
mishkal/data/kamatz_katan.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "כל": "kol"
3
+ }
mishkal/data/silver_8500.json ADDED
The diff for this file is too large to render. See raw diff
 
mishkal/data/silver_top_307.json ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "מִשְׁפַּחַה": "miʃpaˈxa",
3
+ "הַמִזְבֵּחַ": "hamizˈbeax",
4
+ "וַאַנַחְנו": "vehaˈnaxnu",
5
+ "בְּעִיקַר": "beiˈkar",
6
+ "בְּרֹאשׁוֹ": "beroˈʃo",
7
+ "לִקְרוֹת": "likˈrot",
8
+ "בְּמַהַלַךְ": "bemahalax",
9
+ "בְּפִי": "befi",
10
+ "הַסְפִינַה": "hasfina",
11
+ "נִמְצְאו": "nimtseu",
12
+ "גְדוֹלוֹת": "gdolot",
13
+ "וַיֵלֵךְ": "vajelex",
14
+ "בִּשְׁאַר": "biʃˈar",
15
+ "הַגְדוֹלִים": "hagdolim",
16
+ "הַכְּפַר": "hakfar",
17
+ "לְבַקֵשׁ": "levakeʃ",
18
+ "כַּלכַּךְ": "kolkax",
19
+ "הַעֵלְיוֹן": "haeljon",
20
+ "תְשׁובַה": "tsuva",
21
+ "לְמַחֹרַת": "lemaxorat",
22
+ "שְׁמַע": "ʃma",
23
+ "וַיִקְרַא": "vajikra",
24
+ "פְּרִי": "pri",
25
+ "בְּצַד": "betsad",
26
+ "נִזְכַּר": "nizkar",
27
+ "בַּזְמַן": "bizman",
28
+ "לִבְנוֹת": "livnot",
29
+ "שִׁבְעַת": "ʃivˈat",
30
+ "מִלְבַד": "milvad",
31
+ "מִדִבְרֵי": "midivrej",
32
+ "לְאַן": "lean",
33
+ "לְמַקוֹם": "lemakom",
34
+ "הַשְׁאַר": "haʃˈar",
35
+ "לְפַנֵיךַ": "lefanexa",
36
+ "הִתְחִילַה": "hitxila",
37
+ "תִרְאֵה": "tirˈe",
38
+ "הַשְׁנִיַה": "haʃnija",
39
+ "הֵבַנְתִי": "hevanti",
40
+ "בִּשְׁנֵי": "biʃnej",
41
+ "לְהַנִיחַ": "leanix",
42
+ "סַבְתַא": "savta",
43
+ "שְׁאֵלַה": "ʃeela",
44
+ "תִשְׁעַה": "tiʃah",
45
+ "ולְאַחַר": "uleaxar",
46
+ "הַטְרוֹיִים": "atrojim",
47
+ "שְׁמַה": "ʃma",
48
+ "בִּשְׁעַת": "biʃnat",
49
+ "לְעִתִים": "leitim",
50
+ "יַשְׁבַה": "jaʃva",
51
+ "בִּדְבַרִים": "bidvarim",
52
+ "ושְׁאַר": "uʃar",
53
+ "קַרְקַע": "karka",
54
+ "נִכְנְסַה": "nixnesa",
55
+ "הַרְחֵק": "harxek",
56
+ "בַּעְלַה": "baˈla",
57
+ "קַרְאו": "karˈu",
58
+ "שֵׁתִהְיֵה": "ʃetihje",
59
+ "הַשְׁנִיַיה": "haʃnija",
60
+ "וַיֹאמְרו": "vajomru",
61
+ "הַקְטַנִים": "haktanim",
62
+ "שְׁתַיִם": "ʃtaim",
63
+ "בְּאַחַת": "beaxat",
64
+ "מַרְאֵה": "marˈeh",
65
+ "בְּדַבַר": "bedavar",
66
+ "לְשַׁם": "leʃam",
67
+ "בְּמַה": "bema",
68
+ "בְּשֵׁל": "beʃel",
69
+ "נַתְנַה": "natna",
70
+ "טֻמְאַת": "tumˈat",
71
+ "מְבַקֵשׁ": "mevakeʃ",
72
+ "עַנְתַה": "anta",
73
+ "לְעַצְמִי": "leatsmi",
74
+ "הַלַכְתִי": "halaxti",
75
+ "הַמִשְׁפַּחַה": "hamiʃpaxa",
76
+ "הַבְּהֵמַה": "habehema",
77
+ "אַרְגַן": "argan",
78
+ "בְּשֵׁקֵט": "beʃeket",
79
+ "דְמֵי": "dmej",
80
+ "לְאוֹרֵךְ": "leorex",
81
+ "רְאובֵן": "reuven",
82
+ "נַתְנו": "natnu",
83
+ "הַרְחוֹב": "harexov",
84
+ "שֵׁצַרִיךְ": "ʃetsarix",
85
+ "לִחְיוֹת": "lixjot",
86
+ "שְׁטַר": "ʃtar",
87
+ "הִרְגִישׁ": "hirgiʃ",
88
+ "עַלְתַה": "alta",
89
+ "יַדְעַה": "jadˈa",
90
+ "בִּפְנִים": "bifnim",
91
+ "לְפֵתַע": "lefeta",
92
+ "הַכְּלִי": "hakli",
93
+ "שֵׁלַךְ": "ʃelax",
94
+ "שִׁמְעוֹן": "ʃimˈon",
95
+ "ובְנֵי": "uvnej",
96
+ "בְּעַד": "bead",
97
+ "בִּמְיוחַד": "bimjuxad",
98
+ "הוֹלְכִים": "holxim",
99
+ "לְיִשְׂרַאֵל": "lejisraˈel",
100
+ "מְקַבֵּל": "mekabel",
101
+ "שְׁאֵינַן": "ʃeejnan",
102
+ "דְבַר": "dvar",
103
+ "הַשְׁלִישִׁי": "haʃliʃi",
104
+ "הַבְּרִית": "habrit",
105
+ "לֵךְ": "lex",
106
+ "חַזְרַה": "xazra",
107
+ "דְבַרַיו": "dvarav",
108
+ "הִרְגִישַׁה": "hirgiʃa",
109
+ "לִקְרַאת": "likrat",
110
+ "צְרִיכִים": "tsrixim",
111
+ "טְפַחִים": "tfaxim",
112
+ "הִרְגַשְׁתִי": "hirgaʃti",
113
+ "נַפְלַה": "nafla",
114
+ "בְּיַחַד": "bejaxad",
115
+ "אַמַרְתַ": "amart",
116
+ "שֵׁאַנַחְנו": "ʃeanaxnu",
117
+ "הַמִשְׁנֵה": "amiʃna",
118
+ "הַקַרְקַע": "akarka",
119
+ "לְמַשַׁל": "lemaʃal",
120
+ "לִקְרוֹא": "likro",
121
+ "יַדְעו": "jadu",
122
+ "כְּלַפֵּי": "klapej",
123
+ "לְאַט": "leat",
124
+ "שְׁאִם": "ʃeim",
125
+ "לִשְׁמוֹעַ": "liʃmoa",
126
+ "מִמְךַ": "mimex",
127
+ "לִבְנֵי": "livnej",
128
+ "בְּגַדִים": "bgadim",
129
+ "דַוְוקַא": "davka",
130
+ "לִקְנוֹת": "liknot",
131
+ "אַרְצַה": "artsa",
132
+ "לְבֵין": "lvejn",
133
+ "בִּשְׁנַת": "biʃnat",
134
+ "יַשְׁבו": "jaʃvu",
135
+ "לְהַבִין": "lehavin",
136
+ "לְסַפֵּר": "lesaper",
137
+ "לְצַד": "letsad",
138
+ "בְּעַצְמוֹ": "beatsmo",
139
+ "בְּנִי": "bni",
140
+ "לְבַדוֹ": "levado",
141
+ "יְכוֹלִים": "jexolim",
142
+ "לְכַאן": "lekan",
143
+ "הַרִצְפַּה": "haritspa",
144
+ "ה��נְנִי": "hineni",
145
+ "שְׁנִיַה": "ʃnija",
146
+ "הַלְכַה": "alxa",
147
+ "בְּעֵצֵם": "beetsem",
148
+ "מִשְׁפַּט": "miʃpat",
149
+ "פְּנִימַה": "pnima",
150
+ "יְהודִי": "jeudi",
151
+ "מְלַאכַה": "melaxa",
152
+ "מְבַרֵךְ": "mevarex",
153
+ "עַמְדַה": "amda",
154
+ "לְאַחוֹר": "leaxor",
155
+ "בְּיִשְׂרַאֵל": "bejisraˈel",
156
+ "מִבְּלִי": "mibli",
157
+ "לִמְצוֹא": "limtso",
158
+ "מְדַבֵּר": "medaber",
159
+ "לְמִי": "lemi",
160
+ "שִׂמְחַה": "simxa",
161
+ "סַמוךְ": "samux",
162
+ "בְּסוֹף": "besof",
163
+ "תְרומַה": "truma",
164
+ "בְּאֵחַד": "beexad",
165
+ "מִצְוַת": "mitsvat",
166
+ "קַרְבַּן": "korban",
167
+ "לְקַבֵּל": "lekabel",
168
+ "קְטַנִים": "ktanim",
169
+ "לְרֵגַע": "lerega",
170
+ "הַמִשְׁפַּט": "hamiʃpat",
171
+ "צְרִיכִין": "tsrixin",
172
+ "יַכוֹלְתִי": "jaxolti",
173
+ "בְּעֵינַיו": "beejnav",
174
+ "כְּלוֹמַר": "klomar",
175
+ "בְּאֵמְצַע": "beemtsa",
176
+ "סְבִיב": "sviv",
177
+ "בִּרְשׁות": "birʃut",
178
+ "כַּנִרְאֵה": "kanire",
179
+ "גְדוֹלִים": "gdolim",
180
+ "הַשׁולְחַן": "haʃulxan",
181
+ "שַׁאַלְתִי": "ʃaˈhalti",
182
+ "לְבַסוֹף": "levasof",
183
+ "כְּדֵרֵךְ": "kederex",
184
+ "מְנַת": "menat",
185
+ "אֵצְלוֹ": "etslo",
186
+ "לְבַד": "levad",
187
+ "לְהַגִיעַ": "leagia",
188
+ "לְהַגִיד": "leagid",
189
+ "בְּרַכַה": "brexa",
190
+ "דַעְתוֹ": "dahato",
191
+ "מַסְפִּיק": "masˈpik",
192
+ "בְּשַׁעַת": "beʃat",
193
+ "עַמְדו": "amdu",
194
+ "יוֹדְעִים": "jodˈim",
195
+ "בְּעֵינֵי": "beeˈnej",
196
+ "וַיְדַבֵּר": "vajedaber",
197
+ "נִשְׁמַע": "niʃma",
198
+ "בְּלִבּוֹ": "belibo",
199
+ "בְּמֵשֵׁךְ": "bemeʃex",
200
+ "לְהוֹצִיא": "lehotsi",
201
+ "הַקְטַנַה": "haktana",
202
+ "לְכַךְ": "lexax",
203
+ "לְזֵה": "leze",
204
+ "בְּעֵת": "beˈet",
205
+ "לִשְׁתוֹת": "liʃtot",
206
+ "נִשְׁבַּע": "niʃba",
207
+ "לְפַחוֹת": "lefaxot",
208
+ "שְׁלוֹשִׁים": "ʃloʃim",
209
+ "לְעֵבֵר": "leˈever",
210
+ "מִצְוַה": "mitsva",
211
+ "בְּרֹאשׁ": "beroʃ",
212
+ "אֵתְכֵם": "etxem",
213
+ "בְּשֵׁם": "beʃem",
214
+ "סְתַם": "stam",
215
+ "בְּכַךְ": "bexax",
216
+ "מִלְחַמַה": "milxama",
217
+ "יַצְאו": "jatsˈu",
218
+ "נַפְשׁוֹ": "nafʃo",
219
+ "כְּלֵי": "klej",
220
+ "נַפְשִׁי": "nafʃi",
221
+ "מִצְווֹת": "mitsvot",
222
+ "יַלְדַה": "jalda",
223
+ "בְּאֵרֵץ": "beerets",
224
+ "בְּעוֹד": "beod",
225
+ "שֵׁיִהְיו": "ʃejihju",
226
+ "הַזְקֵנַה": "hazkena",
227
+ "תְחִלַה": "txila",
228
+ "מִצְרַיִם": "mitsraim",
229
+ "פִּתְאֹם": "pitˈom",
230
+ "אַרְבַּעִים": "arbaˈim",
231
+ "כְּשֵׁהוא": "kʃehu",
232
+ "פְּלוֹנִי": "ploni",
233
+ "בְּדֵרֵךְ": "bederex",
234
+ "הַלְכו": "halxu",
235
+ "הַמֵמְשַׁלַה": "hamemʃala",
236
+ "שַׁלוֹם": "ʃaˈlom",
237
+ "לְמַטַה": "lemata",
238
+ "כְּכַל": "kexol",
239
+ "שַׁלוֹם": "ʃaˈlom",
240
+ "בְּאוֹפֵן": "beofen",
241
+ "לְעַצְמוֹ": "leatsmo",
242
+ "שְׁמוֹ": "ʃmo",
243
+ "לְגַמְרֵי": "legamerej",
244
+ "שֵׁכְּבַר": "ʃekvar",
245
+ "עַצְמִי": "atsmi",
246
+ "שְׁלֹמֹה": "ʃlomo",
247
+ "בַּדֵרֵךְ": "baderex",
248
+ "הַגְדוֹלַה": "hagdola",
249
+ "בְּמַקוֹם": "bemakom",
250
+ "נִדְמֵה": "nidme",
251
+ "בְּאוֹתַה": "beota",
252
+ "רַגְלַיו": "raglav",
253
+ "יְהודַה": "jehuda",
254
+ "מְקוֹם": "mekom",
255
+ "כְּנֵגֵד": "keneged",
256
+ "בְּסֵדֵר": "beseder",
257
+ "חַשַׁבְתִי": "xaʃavti",
258
+ "יְלַדִים": "jeladim",
259
+ "רְשׁות": "reʃut",
260
+ "ובְכַל": "vebekol",
261
+ "בְּהֵמַה": "behema",
262
+ "יְכוֹלַה": "jexola",
263
+ "שִׁבְעַה": "ʃivˈa",
264
+ "לְהַבִיא": "lehavi",
265
+ "כְּאִלו": "keilu",
266
+ "שְׁנֵיהֵם": "ʃnejhem",
267
+ "בִּלְתִי": "bilti",
268
+ "כְּלִי": "kli",
269
+ "נִשְׁאַר": "niʃˈar",
270
+ "מִסְפַּר": "mispar",
271
+ "וכְבַר": "vekvar",
272
+ "בְּשַׁעַה": "beʃaˈa",
273
+ "צְרִיכַה": "tsrixa",
274
+ "הַמְדִינַה": "amedina",
275
+ "יְמֵי": "jemej",
276
+ "שֵׁבֵּאַרְנו": "ʃebaˈarnu",
277
+ "שֵׁהַיְתַה": "ʃeajta",
278
+ "דִבְרֵי": "divrej",
279
+ "בִּזְמַן": "bizman",
280
+ "לְשֵׁם": "leʃam",
281
+ "לְמַעְלַה": "lemala",
282
+ "שְׁנַיִם": "ʃnajim",
283
+ "בְּזֵה": "beze",
284
+ "עֵשְׂרֵה": "esre",
285
+ "טֻמְאַה": "tumˈa",
286
+ "שַׁמַעְתִי": "ʃamati",
287
+ "בִּמְקוֹם": "bimkom",
288
+ "בִּגְלַל": "biglal",
289
+ "שְׁאַר": "ʃˈar",
290
+ "פִּתְאוֹם": "pitom",
291
+ "יִהְיו": "jihju",
292
+ "הִתְחִיל": "hitxil",
293
+ "בַּלַיְלַה": "balajla",
294
+ "פְּעַמִים": "peamim",
295
+ "קְטַנַה": "ktana",
296
+ "בַּרוךְ": "barux",
297
+ "בִּכְלַל": "bixlal",
298
+ "יַדַעְתִי": "jaˈdati",
299
+ "עַצְמַה": "atsman",
300
+ "הַיְלַדִים": "hajeladim",
301
+ "בְּיַד": "bejad",
302
+ "בְּלֹא": "beˈlo",
303
+ "לְמַעַן": "lemaan",
304
+ "הַכְּנֵסֵת": "hakneset",
305
+ "לְלֹא": "lelo",
306
+ "בִּשְׁבִיל": "biʃvil",
307
+ "שְׁהִיא": "ʃehi",
308
+ "לְפַנַיו": "lefanav",
309
+ "שְׁהֵן": "ʃehen",
310
+ "כְּפִי": "kfi",
311
+ "לְדַבֵּר": "ledaber",
312
+ "בְּיַדוֹ": "bejado",
313
+ "הַבַּיְתַה": "habajta",
314
+ "בְּדִיוק": "bedijuk",
315
+ "אוֹמְרִים": "omrim",
316
+ "בְּנוֹ": "bno",
317
+ "בְּקוֹל": "bekol",
318
+ "לְבֵית": "levejt",
319
+ "בְּאוֹתוֹ": "beoto",
320
+ "בִּפְנֵי": "bifnej",
321
+ "אַרְבַּעַה": "arbaha",
322
+ "הַדְבַרִים": "advarim",
323
+ "שֵׁלְךַ": "ʃelxa",
324
+ "לְתוֹךְ": "letox",
325
+ "שְׁלוֹשַׁה": "ʃloˈʃa",
326
+ "כְּלַל": "klal",
327
+ "לְיַד": "lejad",
328
+ "לְפִיכַּךְ": "lefikax",
329
+ "שֵׁיִהְיֵה": "ʃejihje",
330
+ "נִכְנַס": "nixnas",
331
+ "הַזְמַן": "azman",
332
+ "אַרְבַּע": "arba",
333
+ "בִּלְבַד": "bilvad",
334
+ "נִמְצַא": "nimtsa",
335
+ "תִהְיֵה": "tihje",
336
+ "בְּיוֹתֵר": "bejoter",
337
+ "מֵלֵךְ": "melex",
338
+ "עֵשְׂרִים": "esrim",
339
+ "הִלְכּוֹת": "hilxot",
340
+ "גְדוֹלַה": "gdola",
341
+ "כְּגוֹן": "kegon",
342
+ "וַיְהִי": "vajehi",
343
+ "לְעוֹלַם": "leolam",
344
+ "אוֹתְךַ": "otxa",
345
+ "פְּנֵי": "pnej",
346
+ "כִּמְעַט": "kimat",
347
+ "הַלַךְ": "alax",
348
+ "הַלַיְלַה": "halajla",
349
+ "יְדֵי": "jedej",
350
+ "בְּבֵית": "bevejt",
351
+ "אַמְרו": "amru",
352
+ "מִתוֹךְ": "mitox",
353
+ "קְצַת": "ktsat",
354
+ "מְעַט": "meat",
355
+ "לְפִי": "leˈfi",
356
+ "נִרְאֵה": "nirhe",
357
+ "בְּיוֹם": "bejom",
358
+ "כְּאִילו": "keilu",
359
+ "לִרְאוֹת": "lirhot",
360
+ "כְּלום": "klum",
361
+ "מְאוֹד": "meod",
362
+ "הַיְיתַה": "haˈjta",
363
+ "שְׁתֵי": "ʃtej",
364
+ "אַנַחְנו": "anaxnu",
365
+ "לְכַל": "lekol",
366
+ "לַךְ": "lax",
367
+ "אַמַרְתִי": "amarti",
368
+ "עַכְשַׁיו": "axʃav",
369
+ "בְּלִי": "bli",
370
+ "הַרְבֵּה": "arbe",
371
+ "עַצְמוֹ": "atsmo",
372
+ "דְבַרִים": "dvarim",
373
+ "דֵרֵךְ": "derex",
374
+ "לְאַחַר": "leaxar",
375
+ "שְׁהוא": "ʃehu",
376
+ "בְּתוֹךְ": "betox",
377
+ "שְׁאֵין": "ʃeejn",
378
+ "מְאֹד": "meod",
379
+ "בְּנֵי": "bnej",
380
+ "זְמַן": "zman",
381
+ "שְׁנֵי": "ʃnej",
382
+ "לִהְיוֹת": "lihjot",
383
+ "יִהְיֵה": "jihje",
384
+ "הַמֵלֵךְ": "hamelex",
385
+ "צַרִיךְ": "tsarix",
386
+ "מִפְּנֵי": "mipnej",
387
+ "יִשְׂרַאֵל": "jisraˈel",
388
+ "לִפְנֵי": "lifnej",
389
+ "בְּכַל": "bexol",
390
+ "יְהוַה": "jeave",
391
+ "אַמְרַה": "amra",
392
+ "כְּבַר": "kvar",
393
+ "לְךַ": "lexa",
394
+ "כְּדֵי": "kedej",
395
+ "אַךְ": "ax",
396
+ "כְּמוֹ": "kmo",
397
+ "כַּךְ": "kax"
398
+ }
mishkal/data/symbols.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "₪": "ʃeˈkel",
3
+ "$": "doˈlar"
4
+ }
mishkal/expander/__init__.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Expand dates and numbers into words with niqqud
3
+ This happens before phonemization
4
+ """
5
+
6
+ from .numbers import num_to_word
7
+ from .dates import date_to_word
8
+ from .time_to_word import time_to_word
9
+ from .dictionary import Dictionary
10
+ from mishkal.log import log
11
+
12
+
13
class Expander:
    """Expand dictionary entries, dates, times and numbers found in text."""

    def __init__(self):
        self.dictionary = Dictionary()

    def expand_text(self, text: str):
        """Expand ``text`` token by token and return the re-joined result.

        The dictionary is applied to the whole text first. The result is then
        split on whitespace and each token is tried as a date, then as a time,
        then as a number; each later step only runs while the token is still
        unchanged. A token whose expansion raises is logged and kept as is.
        """
        text = self.dictionary.expand_text(text)

        words = []
        for source_word in text.split():
            try:
                word = date_to_word(source_word)
                if word == source_word:
                    word = time_to_word(word)
                if word == source_word:
                    word = num_to_word(word)
                words.append(word)
            except Exception as e:
                # Report the input token: `word` is unbound when the very first
                # expansion step raises, and stale from a previous iteration
                # otherwise.
                log.error(f"Failed to expand {source_word} with error: {e}")
                words.append(source_word)
        return " ".join(words)
mishkal/expander/dates.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from .numbers import num_to_word
3
+
4
+ # Mapping of month names in Hebrew with diacritics (Gregorian months)
5
+ MONTHS = {
6
+ 1: "יָנוּאָר",
7
+ 2: "פֶבְרוּאָר",
8
+ 3: "מֵרְץ",
9
+ 4: "אֵפְרִיל",
10
+ 5: "מַאי",
11
+ 6: "יוּנִי",
12
+ 7: "יוּלִי",
13
+ 8: "אוֹגֻסְט",
14
+ 9: "סֶפְּטֶמְבֶּר",
15
+ 10: "אוֹקְטוֹבֶּר",
16
+ 11: "נוֹבֶמְבֶּר",
17
+ 12: "דֶּצֶמְבֶּר",
18
+ }
19
+
20
+ # Mapping of day names in Hebrew with diacritics
21
+ DAYS = {
22
+ 0: "יוֹם רִאשׁוֹן",
23
+ 1: "יוֹם שֵׁנִי",
24
+ 2: "יוֹם שְׁלִישִׁי",
25
+ 3: "יוֹם רֵבִיעִי",
26
+ 4: "יוֹם חֲמִישִׁי",
27
+ 5: "יוֹם שִׁישִׁי",
28
+ 6: "יוֹם שַׁבָּת",
29
+ }
30
+
31
+
32
def date_to_word(word: str, include_day_name=False) -> str:
    """
    Convert a numeric date string into Hebrew words with diacritics.

    Accepted layouts are year-month-day and day-month-year, each written with
    "-", "." or "/" as the separator (for example 2024-01-31 or 31/01/2024).

    Args:
        word: Candidate date string.
        include_day_name: When true, prefix the result with the Hebrew name
            of the weekday followed by a comma.

    Returns:
        The spelled-out date, or ``word`` unchanged when it does not parse
        with any of the supported formats.
    """
    separators = ["-", ".", "/"]
    orders = [("%Y", "%m", "%d"), ("%d", "%m", "%Y")]
    date_formats = [sep.join(order) for order in orders for sep in separators]

    for date_format in date_formats:
        try:
            # Try parsing the word with each date format
            date_obj = datetime.strptime(word, date_format)

            # Convert month to Hebrew name with diacritics
            month_name = MONTHS[date_obj.month]
            day = num_to_word(str(date_obj.day))
            year = num_to_word(str(date_obj.year))

            text = f"{day} בֵּ{month_name} {year}"
            if include_day_name:
                # DAYS is keyed with Sunday as 0, while datetime.weekday()
                # counts from Monday as 0, so shift by one day.
                day_name = DAYS[(date_obj.weekday() + 1) % 7]
                text = f"{day_name}, {text}"
            return text
        except ValueError:
            continue
    return word
mishkal/expander/dictionary.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Dictionaries are JSON files of key/value word pairs (every *.json file under ../data)
3
+ """
4
+
5
+ from pathlib import Path
6
+ import json
7
+ import re
8
+ from mishkal.utils import remove_niqqud
9
+ from mishkal.utils import normalize
10
+ import unicodedata
11
+
12
+ files = Path(__file__).parent.joinpath("../data").glob("*.json")
13
+ # Sort in reverse order to prioritize the most recent and best
14
+ order = {"bronze": 1, "silver": 2, "gold": 3}
15
+ files = sorted(
16
+ files, key=lambda f: order.get(next((x for x in order if x in f.stem), ""), 0)
17
+ )
18
+
19
+
20
class Dictionary:
    """Lookup table merged from the JSON files listed in module-level ``files``."""

    def __init__(self):
        self.dict = {}
        self.load_dictionaries()

    def load_dictionaries(self):
        """Load every file in ``files`` into ``self.dict``.

        Keys are passed through ``normalize``; entries whose normalized key or
        whose value is empty are dropped. Files are merged in the order of
        ``files``, so later files override earlier ones on equal keys.
        """
        for path in files:
            with open(path, "r", encoding="utf-8") as handle:
                raw_entries: dict = json.load(handle)

            cleaned = {}
            for raw_key, value in raw_entries.items():
                key = normalize(raw_key)
                # Ensure not empty
                if not key or not value:
                    continue
                cleaned[key] = value
            self.dict.update(cleaned)

    def replace_hebrew_only_callback(self, match: re.Match[str]) -> str:
        """Return the dictionary value for a matched Hebrew span, if any.

        The span is NFD-decomposed and then looked up three ways, in order:
        as is, with niqqud removed, and after ``normalize``. The first
        non-empty hit wins; otherwise the decomposed span is returned.

        NOTE(review): an earlier comment said the niqqud-less comparison
        should apply only when the source has no niqqud, but the lookup is
        performed unconditionally - confirm the intended behaviour.
        """
        source: str = unicodedata.normalize("NFD", match.group(0))
        candidates = (source, remove_niqqud(source), normalize(source))
        for candidate in candidates:
            replacement = self.dict.get(candidate)
            if replacement:
                return replacement
        return source

    def replace_non_whitespace_callback(self, match: re.Match[str]) -> str:
        """Expand one whitespace-delimited token.

        Numeric tokens are returned untouched. A token found verbatim in the
        dictionary is replaced by its value. Otherwise each run of characters
        in the Hebrew range (plus space and apostrophe) inside the token is
        looked up separately.
        """
        token: str = match.group(0)
        if token.isnumeric():
            return token

        exact = self.dict.get(token)
        if exact:
            return exact

        # search by only ', space, regular niqqud, alphabet
        return re.sub(
            r"[\u05B0-\u05EB ']+", self.replace_hebrew_only_callback, token
        )

    def expand_text(self, text: str) -> str:
        """
        Apply the dictionary to every non-whitespace token of ``text``.

        TODO: if key doesn't have diacritics expand even diacritized words
        """
        return re.sub(r"\S+", self.replace_non_whitespace_callback, text)
mishkal/expander/number_names.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ See https://github.com/savoirfairelinux/num2words/blob/master/num2words/lang_HE.py
3
+ """
4
+
5
+ ZERO = {"אפס": "אֶפֶס"}
6
+
7
+
8
+ ONES = {
9
+ "אחת": "אַחַת",
10
+ "אחד": "אֶחָד",
11
+ "ראשונה": "רִאשׁוֹנָה",
12
+ "ראשון": "רִאשׁוֹן",
13
+ "ראשונות": "רִאשׁוֹנוֹת",
14
+ "ראשונים": "רִאשׁוֹנִים",
15
+ "שתיים": "שְׁתַּיִם",
16
+ "שניים": "שְׁנַיִם",
17
+ "שתי": "שְׁתֵּי",
18
+ "שני": "שְׁנֵי",
19
+ "שנייה": "שְׁנִיָּה",
20
+ "שניות": "שְׁנִיּוֹת",
21
+ "שלוש": "שָׁלוֹשׁ",
22
+ "שלושה": "שְׁלוֹשָׁה",
23
+ "שלושת": "שְׁלוֹשֶׁת",
24
+ "שלישית": "שְׁלִישִׁית",
25
+ "שלישי": "שְׁלִישִׁי",
26
+ "שלישיות": "שְׁלִישִׁיּוֹת",
27
+ "שלישיים": "שְׁלִישִׁיִּים",
28
+ "ארבע": "אַרְבַּע",
29
+ "ארבעה": "אַרְבַּעָה",
30
+ "ארבעת": "אַרְבַּעַת",
31
+ "רביעית": "רֵבִיעִית",
32
+ "רביעי": "רֵבִיעִי",
33
+ "רביעיות": "רֵבִיעִיוֹת",
34
+ "רביעיים": "רֵבִיעִיִּים",
35
+ "חמש": "חָמֵשׁ",
36
+ "חמישה": "חֲמִשָּׁה",
37
+ "חמשת": "חֲמֵשֶׁת",
38
+ "חמישית": "חֲמִישִּׁית",
39
+ "חמישי": "חֲמִישִּׁי",
40
+ "חמישיות": "חֲמִישִּׁיוֹת",
41
+ "חמישיים": "חֲמִישִּׁיִּים",
42
+ "שש": "שֵׁשׁ",
43
+ "שישה": "שִׁשָּׁה",
44
+ "ששת": "שֵׁשֶׁת",
45
+ "שישית": "שִׁשִּׁית",
46
+ "שישי": "שִׁשִּׁי",
47
+ "שישיות": "שִׁשִּׁיוֹת",
48
+ "שישיים": "שִׁשִּׁיִּים",
49
+ "שבע": "שֶׁבַע",
50
+ "שבעה": "שִׁבְעָה",
51
+ "שבעת": "שִׁבְעַת",
52
+ "שביעית": "שְׁבִיעִית",
53
+ "שביעי": "שְׁבִיעִי",
54
+ "שביעיות": "שְׁבִיעִיוֹת",
55
+ "שביעיים": "שְׁבִיעִיִּים",
56
+ "שמונה": "שְׁמוֹנֶה",
57
+ "שמונת": "שְׁמוֹנַת",
58
+ "שמינית": "שְׁמִינִית",
59
+ "שמיני": "שְׁמִינִי",
60
+ "שמיניות": "שְׁמִינִיוֹת",
61
+ "שמיניים": "שְׁמִינִיִּים",
62
+ "תשע": "תֵּשַׁע",
63
+ "תשעה": "תִּשְׁעָה",
64
+ "תשעת": "תִּשְׁעַת",
65
+ "תשיעית": "תְּשִׁיעִית",
66
+ "תשיעי": "תְּשִׁיעִי",
67
+ "תשיעיות": "תְּשִׁיעִיּוֹת",
68
+ "תשיעיים": "תְּשִׁיעִיִּים",
69
+ }
70
+
71
+
72
+ TENS = {
73
+ "עשר": "עֶשֶׂר",
74
+ "עשרה": "עֲשָׁרָה",
75
+ "עשרת": "עֲשֶׁרֶת",
76
+ "עשירית": "עֲשִׁירִית",
77
+ "עשירי": "עֲשִׁירִי",
78
+ "עשיריות": "עֲשִׁירִיוֹת",
79
+ "עשיריים": "עֲשִׁירִיִּים",
80
+ "שתים עשרה": "שְׁתֵּים עֶשְׂרֵה",
81
+ "שנים עשר": "שְׁנֵים עָשָׂר",
82
+ }
83
+
84
+
85
+ TWENTIES = {
86
+ "עשרים": "עֶשְׂרִים",
87
+ "שלושים": "שְׁלוֹשִׁים",
88
+ "ארבעים": "אַרְבָּעִים",
89
+ "חמישים": "חֲמִשִּׁים",
90
+ "שישים": "שִׁשִּׁים",
91
+ "שבעים": "שִׁבְעִים",
92
+ "שמונים": "שְׁמוֹנִים",
93
+ "תשעים": "תִּשְׁעִים",
94
+ }
95
+
96
+
97
+ HUNDREDS = {
98
+ "מאה": "מֵאָה",
99
+ "מאת": "מֵאַת",
100
+ "מאתיים": "מָאתַיִם",
101
+ "מאות": "מֵאוֹת",
102
+ }
103
+
104
+ THOUSANDS = {
105
+ "אלף": "אֶלֶף",
106
+ "אלפיים": "אַלְפַּיִם",
107
+ "אלפים": "אֲלָפִים",
108
+ "אלפי": "אַלְפִּי",
109
+ }
110
+
111
+
112
+ LARGE = {
113
+ "מיליון": "מִילְיוֹן",
114
+ "מיליוני": "מִילְיוֹנִי",
115
+ "מיליארד": "מִילְיַארְד",
116
+ "מיליארדי": "מִילְיַארְדִּי",
117
+ "טריליון": "טְרִילְיוֹן",
118
+ "טריליוני": "טְרִילְיוֹנִי",
119
+ "קוודריליון": "קוֹוַדְרִילְיוֹן",
120
+ "קוודריליוני": "קוֹוַדְרִילְיוֹנִי",
121
+ "קווינטיליון": "קוִוִּנְטִילְיוֹן",
122
+ "קווינטיליוני": "קוִוִּנְטִילְיוֹנִי",
123
+ "סקסטיליון": "סְקֶסְטִילְיוֹן",
124
+ "סקסטיליוני": "סְקֶסְטִילְיוֹנִי",
125
+ "ספטיליון": "סְפֶּטִילְיוֹן",
126
+ "ספטיליוני": "סְפֶּטִילְיוֹנִי",
127
+ "אוקטיליון": "אוֹקְטִילְיוֹן",
128
+ "אוקטיליוני": "אוֹקְטִילְיוֹנִי",
129
+ "נוניליון": "נוּנִילְיוֹן",
130
+ "נוניליוני": "נוּנִילְיוֹנִי",
131
+ "דסיליון": "דֶּסִילְיוֹן",
132
+ "דסיליוני": "דֶּסִילְיוֹנִי",
133
+ "אונדסיליון": "אוּנְדְסִילְיוֹן",
134
+ "אונדסיליוני": "אוּנְדְסִילְיוֹנִי",
135
+ "דואודסיליון": "דוּאודְסִילְיוֹן",
136
+ "דואודסיליוני": "דוּאודְסִילְיוֹנִי",
137
+ "טרדסיליון": "טֶרְדְסִילְיוֹן",
138
+ "טרדסיליוני": "טֶרְדְסִילְיוֹנִי",
139
+ "קווטואורדסיליון": "קוּוטְוָאורְדְסִילְיוֹן",
140
+ "קווטואורדסיליוני": "קוּוטְוָאורְדְסִילְיוֹנִי",
141
+ "קווינדסיליון": "קוִוִּנְדְסִילְיוֹן",
142
+ "קווינדסיליוני": "קוִוִּנְדְסִילְיוֹנִי",
143
+ "סקסדסיליון": "סֶקְסְדְסִילְיוֹן",
144
+ "סקסדסיליוני": "סֶקְסְדְסִילְיוֹנִי",
145
+ "ספטנדסיליון": "סְפֶּטַנְדְסִילְיוֹן",
146
+ "ספטנדסיליוני": "סְפֶּטַנְדְסִילְיוֹנִי",
147
+ "אוקטודסיליון": "אוֹקְטוֹדְסִילְיוֹן",
148
+ "אוקטודסיליוני": "אוֹקְטוֹדְסִילְיוֹנִי",
149
+ "נובמדסיליון": "נוֹבְמַדְסִילְיוֹן",
150
+ "נובמדסיליוני": "נוֹבְמַדְסִילְיוֹנִי",
151
+ "ויגינטיליון": "וִיגִּינְטִילְיוֹן",
152
+ "ויגינטיליוני": "וִיגִּינְטִילְיוֹנִי",
153
+ }
154
+
155
+
156
+ LETTERS = {
157
+ "ו": "וֵ",
158
+ "ה": "הַ",
159
+ }
160
+
161
+
162
+ CURRENCY = {
163
+ "שקל": "שֵׁקֶל",
164
+ "שקלים": "שְׁקָלִים",
165
+ "אגורה": "אֲגוֹרָה",
166
+ "אגורות": "אֲגוֹרוֹת",
167
+ "אירו": "אֵירוֹ",
168
+ "סנט": "סֵנְט",
169
+ "סנטים": "סֵנְטִים",
170
+ "דולר": "דוֹלָר",
171
+ "דולרים": "דוֹלָרִים",
172
+ }
173
+
174
+
175
+ POINTS = {
176
+ "מינוס": "מִינּוּס",
177
+ "נקודה": "נְקֻדָּה",
178
+ }
179
+
180
+ NUMBER_NAMES = {
181
+ **CURRENCY,
182
+ **HUNDREDS,
183
+ **LARGE,
184
+ **LETTERS,
185
+ **ONES,
186
+ **POINTS,
187
+ **TENS,
188
+ **THOUSANDS,
189
+ **TWENTIES,
190
+ **ZERO,
191
+ }
mishkal/expander/numbers.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import num2words
2
+ from .number_names import NUMBER_NAMES
3
+ import re
4
+
5
+
6
def add_diacritics(words: str):
    """Replace each number word in ``words`` with its pointed form.

    The input is split on whitespace. A word found in ``NUMBER_NAMES`` is
    replaced directly. A word whose first letter and remainder are both keys
    of ``NUMBER_NAMES`` (a one-letter prefix such as Vav or He attached to a
    number word) is replaced by the two pointed parts joined together. Any
    other word is kept unchanged.

    Returns:
        The processed words joined by single spaces.
    """
    new_words = []
    for word in words.split():
        pointed = NUMBER_NAMES.get(word)
        if pointed:
            new_words.append(pointed)
            continue

        # Prefixed form: both the prefix letter and the stem must be known.
        # Looking the prefix up with .get() avoids a KeyError when the stem
        # matches but the first letter is not a known prefix.
        pointed_prefix = NUMBER_NAMES.get(word[:1])
        pointed_stem = NUMBER_NAMES.get(word[1:])
        if pointed_prefix and pointed_stem:
            new_words.append(pointed_prefix + pointed_stem)
        else:
            new_words.append(word)
    return " ".join(new_words)
17
+
18
+
19
def num_to_word(maybe_number: str) -> str:
    """Spell out every run of digits in ``maybe_number`` as pointed Hebrew.

    Each ``\\d+`` match is converted with ``num2words`` (Hebrew, cardinal)
    and then passed through ``add_diacritics``; text around the digits is
    left in place.
    """

    def spell_out(digits_match):
        hebrew = num2words.num2words(
            digits_match.group(), lang="he", ordinal=False
        )
        return add_diacritics(hebrew)

    # Replace all whole numbers in the string
    return re.sub(r"\d+", spell_out, maybe_number)
mishkal/expander/time_to_word.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Convert time to words
3
+ TODO: fix zeros eg. 22:00
4
+ """
5
+
6
+ import re
7
+
8
+ PATTERNS = [
9
+ r"(\d{1,2})([apm]{2})", # AM/PM format
10
+ r"(\d{1,2}):(\d{2})", # HH:MM format
11
+ ]
12
+
13
+
14
def extract_time(match):
    """
    Turn a regex match of a time expression into Hebrew words.

    Two shapes are recognised at the start of the matched text: ``HH:MM`` and
    an hour followed by ``am``/``pm``. Anything else, or a time whose hour or
    minute is outside the range ``convert_to_word`` can render, is returned
    exactly as it was matched.

    Args:
        match: Match object produced by one of the time patterns.

    Returns:
        The spelled-out time, or the original matched text.
    """
    original = match.group(0)
    time_str = original.lower().strip()

    # Check for HH:MM format
    clock = re.match(r"(\d{1,2}):(\d{2})", time_str)
    if clock:
        h = int(clock.group(1))
        m = int(clock.group(2))
        # Out-of-range values would index past the word tables.
        if h > 24 or m > 59:
            return original
        return f"{convert_to_word(h, m)}"

    # Check for AM/PM format. Only the literal suffixes "am" and "pm" count;
    # other two-letter mixes of a/p/m (such as "mm") are not times.
    meridiem = re.match(r"(\d{1,2})(am|pm)", time_str)
    if meridiem:
        h = int(meridiem.group(1))
        period = meridiem.group(2)

        # Normalize to 24-hour format
        if period == "am" and h == 12:
            h = 0
        elif period == "pm" and h != 12:
            h += 12
        if h > 24:
            return original
        # Defaulting to 0 minutes when only hour is provided
        return f"{convert_to_word(h, 0)}"

    # Return original text if the format is not recognized
    return original
42
+
43
+
44
def convert_to_word(h, m):
    """Render a clock time as pointed Hebrew words on a 12-hour dial.

    Args:
        h: Hour; 0 is rendered as 12 and values above 12 have 12 subtracted.
        m: Minute. 0 yields the hour alone.

    Returns:
        The hour word, followed (when ``m`` is non-zero) by the minute words
        and the word for "minutes", separated by single spaces.

    Raises:
        IndexError: If the hour or minute falls outside the word tables.
    """
    hours = [
        "אֶפֶס",
        "אַחַת",
        "שְׁנַיִם",  # Swapped for vocab["shtey"] when it counts single-digit minutes
        "שָׁלוֹשׁ",
        "אַרְבַּע",
        "חָמֵשׁ",
        "שֵׁשׁ",
        "שֶׁבַע",
        "שְׁמוֹנֵה",
        "תֵּשַׁע",
        "עֵשֵׂר",
        "אַחַת עֶשְׂרֵה",
        "שְׁתֵּים עֶשְׂרֵה",
    ]

    tens = ["", "עֵשֵׂר", "עֶשְׂרִים", "שְׁלוֹשִׁים", "אַרְבָּעִים", "חֲמִשִּׁים"]

    ten_to_twenty = [
        "עֵשֵׂר",
        "אַחַת עֶשְׂרֵה",
        "שְׁתֵּים עֶשְׂרֵה",
        "שְׁלוֹשׁ עֶשְׂרֵה",
        "אַרְבַּע עֶשְׂרֵה",
        "חֲמֵשׁ עֶשְׂרֵה",
        "שֵׁשׁ עֶשְׂרֵה",
        "שְׁבַע עֶשְׂרֵה",
        "שְׁמוֹנֶה עֶשְׂרֵה",
        "תְּשַׁע עֶשְׂרֵה",
    ]

    vocab = {"minutes": "דַּקּוֹת", "and": "וֵ", "shtey": "שְׁתֵּי"}

    # Convert 0 hours to 12 (midnight)
    if h == 0:
        h = 12

    elif h > 12:
        h -= 12

    if m == 0:
        return f"{hours[h]}"

    elif 1 <= m <= 9:
        # Two minutes uses the dedicated "shtey" form instead of hours[2]
        minute_word = vocab["shtey"] if m == 2 else hours[m]
        return f"{hours[h]} {vocab['and']}{minute_word} {vocab['minutes']}"

    elif 10 <= m <= 19:
        return f"{hours[h]} {vocab['and']}{ten_to_twenty[m - 10]} {vocab['minutes']}"

    else:
        # Build the phrase from its non-empty parts so that round tens
        # (20, 30, ...) do not leave a double space where the units would go.
        parts = [hours[h], f"{vocab['and']}{tens[m // 10]}"]
        if m % 10 != 0:
            parts.append(f"{vocab['and']}{hours[m % 10]}")
        parts.append(vocab["minutes"])
        return " ".join(parts)
101
+
102
+
103
def time_to_word(text: str):
    """Replace each match of any pattern in ``PATTERNS`` via ``extract_time``."""
    combined_pattern = "|".join(PATTERNS)
    return re.sub(combined_pattern, extract_time, text)
mishkal/log.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import colorlog
4
+
5
+
6
def _create_logger():
    """
    Build the package logger with colorized level names.

    The level comes from the LOG_LEVEL environment variable (upper-cased);
    WARNING is used when the variable is unset or does not name an attribute
    of the ``logging`` module.
    Usage: LOG_LEVEL=DEBUG python <script.py>
    """
    level_colors = {
        "DEBUG": "blue",
        "INFO": "green",
        "WARNING": "yellow",
        "ERROR": "red",
        "CRITICAL": "red",
    }
    formatter = colorlog.ColoredFormatter(
        fmt="%(log_color)s%(levelname)-8s%(reset)s [%(filename)s:%(lineno)d] %(message)s",
        log_colors=level_colors,
    )

    # NOTE(review): the handler is created without an explicit stream; the
    # original comment said "stdout" - confirm which stream is intended.
    handler = colorlog.StreamHandler()
    handler.setFormatter(formatter)

    # Get log level from LOG_LEVEL environment variable
    requested_level = os.getenv("LOG_LEVEL", "WARNING").upper()
    logger = colorlog.getLogger(__package__)
    logger.setLevel(level=getattr(logging, requested_level, logging.WARNING))
    logger.addHandler(handler)
    return logger
33
+
34
+
35
+ log = _create_logger()
mishkal/phonemize.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The actual letters phonemization happens here.
3
+ Phonemes generated based on rules.
4
+
5
+ Early rules:
6
+ 1. Niqqud malle vowels
7
+ 2. Dagesh (custom beged kefet)
8
+ 3. Final letter without niqqud
9
+ 4. Final Het gnuva
10
+ 5. Geresh (Gimel, Tsadik, Zain)
11
+ 6. Shva nax and na
12
+ Revised rules:
13
+ 1. Consonants
14
+ 2. Niqqud
15
+
16
+ Reference:
17
+ - https://hebrew-academy.org.il/2020/08/11/איך-הוגים-את-השווא-הנע
18
+ - https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
19
+ - https://en.wikipedia.org/wiki/Help:IPA/Hebrew
20
+ """
21
+
22
+ from .vocab import LETTERS_NAMES_PHONEMES, Letter, Token
23
+ from mishkal import vocab, utils
24
+ from .expander import Expander
25
+ from mishkal.utils import normalize
26
+ import re
27
+ from typing import Callable
28
+
29
+
30
# Vav vowel
# Lookup tables used by Phonemizer.phonemize_hebrew for the letter vav.
# Keys are a vav (or a pair of vavs) followed by its niqqud, as produced by
# Letter.as_str(); values are the phonemes emitted for that spelling.
vavs = {
    # Two consecutive vavs whose as_str() values are identical
    "doubles_identical": {"וּוּ": "vu", "וֹוֹ": "vo"},
    # Two consecutive vavs where at least one of them carries no symbols
    "doubles": {
        "ווּ": "vu",
        "ווֹ": "vo",
    },
    # A single vav that is the first letter of the word
    "start": {
        "וַ": "va",
        "וְ": "ve",
        "וֵ": "ve",
        "וִ": "vi",
        "וֹ": "vo",
        "וּ": "u",
        "וֻ": "vu",
    },
    # A single vav anywhere after the first letter
    "middle": {
        "וַ": "va",
        "וְ": "v",
        "וֵ": "ve",
        "וִ": "vi",
        "וֹ": "o",
        "וּ": "u",
        "וֻ": "u",
    },
}
56
+
57
+
58
class Phonemizer:
    """
    Rule-based converter from Hebrew text with niqqud to a phoneme string.

    ``phonemize`` processes a whole text (optional fallback for Latin words,
    expansion through ``Expander``, per-word conversion, final filtering);
    ``phonemize_hebrew`` applies the letter-level rules to a single word.
    """

    def __init__(self):
        # Used by phonemize() for expand_text() and for its dictionary lookup
        self.expander = Expander()

    def phonemize(
        self,
        text: str,
        preserve_punctuation=True,
        preserve_stress=True,
        fallback: Callable[[str], str] = None,
    ) -> str:
        """
        Convert ``text`` to phonemes.

        Args:
            text: Input text.
            preserve_punctuation: When False, every character of
                ``vocab.PUNCTUATION`` except the space is dropped from the result.
            preserve_stress: When False, ``vocab.STRESS`` and
                ``vocab.SECONDARY_STRESS`` are dropped from the result.
            fallback: Optional callable applied to each run of ASCII letters
                that has no truthy entry in the expander dictionary. Every
                character it returns is added to the module-level
                ``vocab.SET_OUTPUT_CHARACTERS`` (a global side effect).

        Returns:
            The text with Hebrew runs replaced by phonemes, keeping only
            characters found in ``vocab.SET_OUTPUT_CHARACTERS``.
        """
        # TODO: is that enough? what if there's punctuation around? other chars?
        he_pattern = r"[\u05b0-\u05ea]+"
        fallback_pattern = r"[a-zA-Z]+"

        def fallback_replace_callback(match: re.Match):
            word = match.group(0)
            if self.expander.dictionary.dict.get(word):
                # skip
                # TODO: better API
                return word
            phonemes = fallback(word).strip()
            # TODO: check that it has only IPA?!
            # Whitelist the fallback's characters so the final filter keeps them
            for c in phonemes:
                vocab.SET_OUTPUT_CHARACTERS.add(c)
            return phonemes

        if fallback is not None:
            text = re.sub(fallback_pattern, fallback_replace_callback, text)
        text = self.expander.expand_text(text)
        # Tokens of every Hebrew word are collected here; the list is not returned
        tokens: list[Token] = []
        self.fallback = fallback

        def heb_replace_callback(match: re.Match):
            word = match.group(0)
            word = normalize(word)
            # Keep only known letters and niqqud before splitting into Letter objects
            word = "".join(
                i for i in word if i in vocab.SET_LETTERS or i in vocab.SET_NIQQUD
            )
            letters = utils.extract_letters(word)
            hebrew_tokens = self.phonemize_hebrew(letters)
            tokens.extend(hebrew_tokens)
            return "".join(i.phonemes for i in hebrew_tokens)

        text = re.sub(he_pattern, heb_replace_callback, text)

        if not preserve_punctuation:
            text = "".join(i for i in text if i not in vocab.PUNCTUATION or i == " ")
        if not preserve_stress:
            text = "".join(
                i for i in text if i not in [vocab.STRESS, vocab.SECONDARY_STRESS]
            )
        # Final filter: drop anything that is not a known output character
        text = "".join(i for i in text if i in vocab.SET_OUTPUT_CHARACTERS)
        return text

    def phonemize_hebrew(self, letters: list[Letter]) -> list[Token]:
        """
        Convert the letters of one word to tokens carrying their phonemes.

        The "early rules" are tried in order for each letter; the first one
        that matches appends a token and moves on. Letters matched by none of
        them fall through to the "revised rules" (consonant + niqqud lookup).

        Args:
            letters: Letters of a single word, e.g. from ``utils.extract_letters``.

        Returns:
            The tokens produced for the word, in order.
        """
        tokens: list[Token] = []
        i = 0
        while i < len(letters):
            cur = letters[i]
            prev = letters[i - 1] if i > 0 else None
            # NOTE: shadows the builtin ``next`` inside this method
            next = letters[i + 1] if i < len(letters) - 1 else None

            # early rules

            # Single letter name
            if not next and not prev and cur and not cur.symbols:
                token = Token(
                    cur.as_str(), LETTERS_NAMES_PHONEMES.get(cur.letter_str, "")
                )
                tokens.append(token)
                i += 1
                continue

            # Vav: every path inside this block ends with ``continue``
            if cur.letter_str == "ו":
                # special doubles
                if next and cur.as_str() == next.as_str():
                    phonemes = vavs["doubles_identical"].get(
                        cur.as_str() + next.as_str(), "vo"
                    )
                    tokens.append(Token(cur.as_str() + next.as_str(), phonemes))
                    i += 2
                    continue
                # doubles with one has no symbols
                if next and (
                    cur == "ו" and next == "ו" and (not cur.symbols or not next.symbols)
                ):
                    phonemes = vavs["doubles"].get(cur.as_str() + next.as_str())
                    if not phonemes:
                        # take the one with the symbols
                        letter = cur.as_str() if cur.symbols else next.as_str()
                        phonemes = vavs["middle"].get(letter, "v")
                    tokens.append(Token(cur.as_str() + next.as_str(), phonemes))
                    i += 2
                    continue
                # start
                if not prev:
                    phonemes = vavs["start"].get(cur.as_str(), "v")
                    tokens.append(Token(cur.as_str(), phonemes))
                    i += 1
                    continue
                # middle
                phonemes = vavs["middle"].get(cur.as_str(), "v")
                tokens.append(Token(cur.as_str(), phonemes))
                i += 1
                continue
            # Yod vowel
            if cur == "י" and prev and not cur.symbols:  # Yod without niqqud
                # Not possible to say ii
                if tokens[-1].phonemes.endswith("i"):
                    token = Token(prev.as_str() + cur.as_str(), "")
                    tokens.append(token)
                    i += 1
                    continue
                if not prev.symbols:
                    phoneme = vocab.VOWEL_I
                    token = Token(prev.as_str() + cur.as_str(), phoneme)
                    tokens.append(token)
                    i += 1
                    continue
                elif "\u05b4" in prev.symbols:  # Hirik
                    phoneme = ""
                    token = Token(cur.as_str(), phoneme)
                    tokens.append(token)
                    i += 1
                    continue

            # Some final letters can be silent
            if not next and cur.letter_str in "אהע" and not cur.symbols:
                phoneme = ""
                token = Token(cur.as_str(), phoneme)
                tokens.append(token)
                i += 1
                continue
            # Het gnuva
            if not next and cur == "ח" and "\u05b7" in cur.symbols:  # Patah
                phoneme = vocab.HET_GNUVA
                token = Token(cur.as_str(), phoneme)
                tokens.append(token)
                i += 1
                continue

            # Geresh
            if "'" in cur.symbols and cur.letter_str in ["ג", "ז", "צ"]:
                phoneme = (
                    vocab.GIMEL_OR_ZAIN_WITH_DAGESH
                    if cur.letter_str in ["ג", "ז"]
                    else vocab.TSADIK_WITH_DAGESH
                )
                phoneme += "".join(
                    [vocab.NIQQUD_PHONEMES.get(niqqud, "") for niqqud in cur.symbols]
                )
                # The token text includes the next letter, but only one letter is consumed
                token = Token(cur.as_str() + (next.as_str() if next else ""), phoneme)
                tokens.append(token)
                i += 1
                continue

            # Shva nax and na
            if "\u05b0" in cur.symbols:
                phoneme = vocab.LETTERS_PHONEMES.get(cur.letter_str, "")
                # First
                if not prev:
                    # NOTE(review): vav is fully handled by the vav block above,
                    # so this first branch looks unreachable - confirm
                    if cur.letter_str == "ו":
                        phoneme += vocab.VOWEL_E
                    elif cur.letter_str in "למנרי":
                        phoneme += vocab.VOWEL_E
                    elif next and next.letter_str in "אהע":  # Groni
                        phoneme += vocab.VOWEL_E
                # Middle
                else:
                    # After vav with dagesh nax
                    if prev and prev.letter_str == "ו" and "\u05bc" in prev.symbols:
                        phoneme += ""
                    # Double final shva(s) nax
                    elif i == len(letters) - 1 and prev and "\u05b0" in prev.symbols:
                        phoneme += ""
                    # NOTE(review): ``next`` is None whenever i is the last index,
                    # so this condition looks unreachable - confirm
                    elif i == len(letters) - 1 and next and "\u05b0" in next.symbols:
                        phoneme += ""
                    # Double shva same letter
                    elif next and next.letter_str == cur.letter_str:
                        phoneme += vocab.VOWEL_E
                    # Double shva
                    elif next and "\u05b0" in next.symbols:
                        phoneme += ""
                    # Previous nax
                    elif tokens:
                        if "\u05b0" in prev.symbols and not tokens[
                            -1
                        ].phonemes.endswith(vocab.VOWEL_E):
                            phoneme += vocab.VOWEL_E
                token = Token(cur.letter_str, phoneme)
                tokens.append(token)
                i += 1
                continue

            # revised rules
            # Consonant phoneme followed by the phoneme of each niqqud on the letter
            phoneme = vocab.LETTERS_PHONEMES.get(cur.letter_str, "")
            phoneme += "".join(
                [vocab.NIQQUD_PHONEMES.get(niqqud, "") for niqqud in cur.symbols]
            )
            token = Token(cur.letter_str, phoneme)
            tokens.append(token)
            i += 1
        return tokens
mishkal/utils.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from mishkal.phonemize import Letter
3
+ from mishkal import vocab
4
+ import unicodedata
5
+
6
# Regex substitutions (pattern -> replacement) applied by normalize(),
# in the order they are listed here.
NORMALIZE_PATTERNS = {
    # Alphabet followed by 1/2 symbols then dagesh. make dagesh first
    "([\u05d0-\u05ea])([\u05b0-\u05c7]{1,2})(\u05bc)": r"\1\3\2",
    # Remove a dagesh that directly follows any character other than the listed letters
    r"([^בכךפףו])(\u05bc)": r"\1",
}
11
+
12
+
13
def remove_niqqud(text: str):
    """Return ``text`` with every match of ``vocab.HE_NIQQUD_PATTERN`` deleted."""
    niqqud_pattern = vocab.HE_NIQQUD_PATTERN
    stripped = re.sub(niqqud_pattern, "", text)
    return stripped
15
+
16
+
17
def has_niqqud(text: str):
    """Tell whether ``text`` contains at least one match of ``vocab.HE_NIQQUD_PATTERN``."""
    first_match = re.search(vocab.HE_NIQQUD_PATTERN, text)
    if first_match is None:
        return False
    return True
19
+
20
+
21
def normalize(text: str) -> str:
    """
    Bring ``text`` into the canonical form used by the phonemizer.

    Steps, in order:
    1. Unicode-decompose the text (NFD).
    2. Apply every substitution of ``NORMALIZE_PATTERNS``.
    3. Replace each niqqud listed in ``vocab.NIQQUD_NORMALIZE`` by its
       canonical counterpart (eg. only Patah instead of Kamatz).
    4. Drop every character that is neither in ``vocab.SET_INPUT_CHARACTERS``
       nor in ``vocab.SET_OUTPUT_CHARACTERS``.
    """

    def _is_lexicon_char(ch: str) -> bool:
        # A character survives when either character set knows it
        return ch in vocab.SET_INPUT_CHARACTERS or ch in vocab.SET_OUTPUT_CHARACTERS

    result = unicodedata.normalize("NFD", text)

    for pattern, replacement in NORMALIZE_PATTERNS.items():
        result = re.sub(pattern, replacement, result)

    for variant, canonical in vocab.NIQQUD_NORMALIZE.items():
        result = result.replace(variant, canonical)

    return "".join(filter(_is_lexicon_char, result))
45
+
46
+
47
def extract_letters(word: str) -> list[Letter]:
    """
    Split ``word`` into ``Letter`` objects, each holding its attached symbols.

    The caller is expected to have already:
    - expanded dates, numbers, symbols and Rashey Tavot (acronyms) to words
    - converted known words and English words to phonemes
    - normalized the text using unicodedata.normalize('NFD')

    Only characters of ``vocab.SET_INPUT_CHARACTERS`` are considered; a letter
    (or a geresh) opens a new ``Letter`` and the symbols that follow it are
    attached to it. Everything else is ignored.
    """
    dagesh = "\u05bc"
    shin_dot = "\u05c1"
    sin_dot = "\u05c2"
    geresh = "'"

    # Normalize niqqud
    for variant, canonical in vocab.NIQQUD_NORMALIZE.items():
        word = word.replace(variant, canonical)
    # Remove non-lexicon characters
    word = "".join(ch for ch in word if ch in vocab.SET_INPUT_CHARACTERS)

    result = []
    length = len(word)
    pos = 0
    while pos < length:
        base = word[pos]
        pos += 1
        if base not in vocab.SET_LETTERS and base != geresh:
            # Not the start of a letter: skip it
            continue

        # Gather every symbol that directly follows the letter
        marks = []
        while pos < length and (
            word[pos] in vocab.SET_LETTER_SYMBOLS or word[pos] == geresh
        ):
            marks.append(word[pos])
            pos += 1

        # For these letters the dagesh becomes part of the letter itself
        if base in "בכפ" and dagesh in marks:
            base += dagesh
        # The dagesh symbol is kept only for vav
        if dagesh in marks and base not in "ו":
            marks.remove(dagesh)
        # Shin dot, then sin dot, move from the symbols onto the letter
        for dot in (shin_dot, sin_dot):
            if dot in marks:
                base += dot
                marks.remove(dot)

        result.append(Letter(base, set(marks)))
    return result
99
+
100
+
101
def get_unicode_names(text: str):
    """Return the Unicode name of each character of ``text`` ("?" when it has none)."""
    names = []
    for ch in text:
        names.append(unicodedata.name(ch, "?"))
    return names
mishkal/vocab.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ASCII IPA transcription of Hebrew consonants and vowels.
3
+ """
4
+
5
+ import unicodedata
6
+
7
+
8
class Letter:
    """
    A base letter together with the symbols (niqqud, geresh, ...) attached to it.

    Instances compare equal to a string that is either the bare letter or
    the letter followed by its symbols. Because ``__eq__`` is defined without
    ``__hash__``, instances are not hashable.
    """

    def __init__(self, letter_str: str, symbols: "list[str] | None" = None):
        """
        Args:
            letter_str: The letter itself.
            symbols: Symbols attached to the letter. Omitted or None means
                "no symbols"; any collection passed in is stored as-is.
        """
        self.letter_str = letter_str
        # Build a fresh list per instance: a literal ``[]`` default would be
        # one shared object mutated by every Letter created without symbols.
        self.symbols: list[str] = [] if symbols is None else symbols

    def __repr__(self):
        # Letter followed by the Unicode names of its symbols
        return f"{self.letter_str + ' ' + ', '.join(unicodedata.name(s) for s in self.symbols)}"

    def __eq__(self, other: str):
        # Match either the bare letter or the letter plus its symbols
        return (
            self.letter_str == other or self.letter_str + "".join(self.symbols) == other
        )

    def as_str(self):
        """Return the letter followed by its symbols as one string."""
        return self.letter_str + "".join(self.symbols)
23
+
24
+
25
class Token:
    """A piece of source text paired with the phonemes produced for it."""

    def __init__(self, token: str, phonemes: str = ""):
        # Phonemes produced for the text (empty when it is silent)
        self.phonemes: str = phonemes
        # Source text this token stands for
        self.token = token

    def __repr__(self):
        return "{} {}".format(self.token, self.phonemes)
32
+
33
+
34
# https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
# Regex for a whole Hebrew word delimited by word boundaries
HE_CHARS_PATTERN = (
    r"\b[\u05B0-\u05EA\u05F3\u0027]+\b"  # Chars including niqqud, geresh and en_geresh
)
# Regex for a single niqqud character (range U+05B0..U+05C7)
HE_NIQQUD_PATTERN = r"[\u05B0-\u05C7]"

# Punctuation characters accepted as input and output (includes the space)
PUNCTUATION = r"""- .,"':!?()"""

# Special
# Phonemes for gimel / zayin and for tsadik; the phonemizer's geresh rule
# emits them for letters carrying a geresh
GIMEL_OR_ZAIN_WITH_DAGESH = "dʒ"
TSADIK_WITH_DAGESH = "tʃ"
SHIN_WITH_POINT = "ʃ"
SIN_WITH_POINT = "s"
# Stress marks (U+02C8 and U+02CC)
STRESS = "\u02c8"
SECONDARY_STRESS = "\u02cc"
# Emitted for a word-final het carrying a patah
HET_GNUVA = "ax"
49
+ HET_GNUVA = "ax"
50
+
51
# Phonemes of the *name* of each letter, used when a word consists of a
# single letter without niqqud.
# Only characters of the output alphabet may be used here: the phonemizer
# drops any other character from its result. The "y" glide is therefore
# written "j", as in "hej" / "zajin" / "ajin".
LETTERS_NAMES_PHONEMES = {
    "א": "alef",  # Alef, glottal stop
    "ב": "bet",  # Bet
    "ג": "gimel",  # Gimel
    "ד": "dalet",  # Dalet
    "ה": "hej",  # He
    "ו": "vav",  # Vav
    "ז": "zajin",  # Zayin
    "ח": "xet",  # Het
    "ט": "tet",  # Tet
    "י": "jud",  # Yod
    "ך": "xaf sofit",  # Haf sofit
    "כ": "xaf",  # Haf
    "ל": "lamed",  # Lamed
    "ם": "mem sofit",  # Mem Sofit
    "מ": "mem",  # Mem
    "ן": "nun sofit",  # Nun Sofit
    "נ": "nun",  # Nun
    "ס": "samex",  # Samekh
    "ע": "ajin",  # Ayin, glottal stop
    "פ": "fej",  # Fey
    "ף": "fej sofit",  # Fey Sofit
    "ץ": "tsadik sofit",  # Tsadik sofit
    "צ": "tsadik",  # Tsadik
    "ק": "kuf",  # Kuf
    "ר": "rejiʃ",  # Resh
    "ש": "ʃin",  # Shin
    "ת": "taf",  # Taf
}
80
+
81
# Consonants
# Phoneme of each letter on its own; an empty string means the letter
# contributes no consonant. The keys of this table also define SET_LETTERS.
LETTERS_PHONEMES = {
    "א": "",  # Alef
    "ב": "v",  # Bet
    "ג": "g",  # Gimel
    "ד": "d",  # Dalet
    "ה": "h",  # He
    "ו": "v",  # Vav
    "ז": "z",  # Zayin
    "ח": "x",  # Het
    "ט": "t",  # Tet
    "י": "j",  # Yod
    "ך": "x",  # Haf sofit
    "כ": "x",  # Haf
    "ל": "l",  # Lamed
    "ם": "m",  # Mem Sofit
    "מ": "m",  # Mem
    "ן": "n",  # Nun Sofit
    "נ": "n",  # Nun
    "ס": "s",  # Samekh
    "ע": "",  # Ayin, only voweled
    "פ": "f",  # Fey
    "ף": "f",  # Fey Sofit
    "ץ": "ts",  # Tsadik sofit
    "צ": "ts",  # Tsadik
    "ק": "k",  # Kuf
    "ר": "r",  # Resh
    "ש": "ʃ",  # Shin
    "ת": "t",  # Taf
    # Beged Kefet
    # Two-character keys: the letter followed by its dagesh
    "בּ": "b",
    "כּ": "k",
    "פּ": "p",
    # Shin followed by its shin dot / sin dot
    "שׁ": "ʃ",
    "שׂ": "s",
}
117
+
118
# Vowels
VOWEL_A = "a"
VOWEL_E = "e"
VOWEL_I = "i"
VOWEL_O = "o"
VOWEL_U = "u"

# Vowel phoneme of each niqqud; niqqud missing from this table (e.g. shva,
# dagesh) contribute nothing when looked up with ``.get(niqqud, "")``
NIQQUD_PHONEMES = {
    "\u05b4": "i",  # Hiriq
    "\u05b5": "e",  # Tsere
    "\u05b7": "a",  # Patah
    "\u05b9": "o",  # Holam
    "\u05ba": "o",  # Holam haser for vav
    "\u05bb": "u",  # Qubuts
}

# Characters that utils.extract_letters attaches to the preceding letter
SET_LETTER_SYMBOLS = {
    "\u05b0",  # Shva
    "\u05b4",  # Hiriq
    "\u05b5",  # Tsere
    "\u05b7",  # Patah
    "\u05b9",  # Holam
    "\u05ba",  # Holam haser for vav
    "\u05bb",  # Qubuts
    "\u05bc",  # Dagesh
    "\u05c1",  # Shin dot
    "\u05c2",  # Sin dot
    "'",  # Geresh
}
147
+
148
"""
We're left with the following niqqud (10):
Shva, Hiriq, Tsere, Patah, Holam, Qubuts, Dagesh,
Holam haser for vav, Shin dot, Sin dot
"""
# Single-character replacements applied by utils.normalize and
# utils.extract_letters: each key is rewritten to its value
NIQQUD_NORMALIZE = {
    "\u05b1": "\u05b5",  # Hataf Segol -> Tsere
    "\u05b2": "\u05b7",  # Hataf Patah -> Patah
    "\u05b3": "\u05b9",  # Hataf Qamats -> Holam
    "\u05b6": "\u05b5",  # Segol -> Tsere
    # Kamatz -> Patah
    "\u05b8": "\u05b7",  # Qamats -> Patah
    "\u05c7": "\u05b9",  # Qamats Qatan -> Holam
    "\u05f3": "'",  # Hebrew geresh to regular geresh
}
163
+
164
+
165
# Every single character that may appear in the phonemizer's output.
# Membership is always tested one character at a time, so each phoneme
# string is split into its characters here; previously only the first
# constant was unpacked and multi-character strings such as "ts" (and the
# empty string) ended up as useless members.
# This must stay a mutable ``set``: the phonemizer adds the characters
# produced by its fallback callable at runtime.
SET_OUTPUT_CHARACTERS = {
    char
    for phoneme in (
        GIMEL_OR_ZAIN_WITH_DAGESH,
        TSADIK_WITH_DAGESH,
        SHIN_WITH_POINT,
        SIN_WITH_POINT,
        STRESS,
        SECONDARY_STRESS,
        *LETTERS_PHONEMES.values(),
        *NIQQUD_PHONEMES.values(),
        VOWEL_A,
        VOWEL_E,
        VOWEL_I,
        VOWEL_O,
        VOWEL_U,
        PUNCTUATION,
    )
    for char in phoneme
}
173
+
174
# Niqqud kept after normalization (see NIQQUD_NORMALIZE for the ones rewritten)
SET_NIQQUD = {
    # Shva, Hiriq, Tsere, Patah, Holam, Holam haser for vav, Qubuts, Dagesh, Shin dot, Sin dot
    "\u05b0",
    "\u05b4",
    "\u05b5",
    "\u05b7",
    "\u05b9",
    "\u05ba",
    "\u05bb",
    "\u05bc",
    "\u05c1",
    "\u05c2",
}
# Every key of LETTERS_PHONEMES, including the two-character dagesh / dot forms
SET_LETTERS = set(LETTERS_PHONEMES.keys())
SET_PUNCTUATION = set(PUNCTUATION)


# Set for fast lookup
# Letters, niqqud and punctuation; "'" is listed explicitly although
# PUNCTUATION already contains it
SET_INPUT_CHARACTERS = set(
    list(LETTERS_PHONEMES.keys()) + list(SET_NIQQUD) + list(PUNCTUATION) + ["'"]
)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio>=5.15.0
2
+ num2words
3
+ colorlog