Add required scripts
- .gitattributes +1 -0
- README.md +0 -0
- notes/.keep +0 -0
- notes/data_preparation.ipynb +626 -0
- notes/fa.tar.gz +3 -0
- src/dictionary.py +664 -0
- src/normalizer.py +227 -0
- src/requirements.txt +3 -0
- src/run_config.py +108 -0
- src/run_persian.sh +51 -0
- src/run_wav2vec2_pretrain_flax.py +638 -0
.gitattributes
CHANGED
@@ -14,3 +14,4 @@
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
File without changes
|
notes/.keep
ADDED
File without changes
|
notes/data_preparation.ipynb
ADDED
@@ -0,0 +1,626 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import os\n",
|
10 |
+
"import sys"
|
11 |
+
]
|
12 |
+
},
|
13 |
+
{
|
14 |
+
"cell_type": "code",
|
15 |
+
"execution_count": 5,
|
16 |
+
"metadata": {},
|
17 |
+
"outputs": [
|
18 |
+
{
|
19 |
+
"data": {
|
20 |
+
"text/plain": "['../src',\n '/Users/m3hrdadfi/Projects/HF/hfflax/hub/wav2vec2-base-persian/notes',\n '/Users/m3hrdadfi/.vscode/extensions/ms-toolsai.jupyter-2021.2.603412351/pythonFiles',\n '/Users/m3hrdadfi/.vscode/extensions/ms-toolsai.jupyter-2021.2.603412351/pythonFiles/lib/python',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python39.zip',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/lib-dynload',\n '',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/site-packages',\n '/Users/m3hrdadfi/Projects/Apps/zabanshenas',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/site-packages/IPython/extensions',\n '/Users/m3hrdadfi/.ipython']"
|
21 |
+
},
|
22 |
+
"execution_count": 5,
|
23 |
+
"metadata": {},
|
24 |
+
"output_type": "execute_result"
|
25 |
+
}
|
26 |
+
],
|
27 |
+
"source": [
|
28 |
+
"sys.path"
|
29 |
+
]
|
30 |
+
},
|
31 |
+
{
|
32 |
+
"cell_type": "code",
|
33 |
+
"execution_count": 4,
|
34 |
+
"metadata": {},
|
35 |
+
"outputs": [],
|
36 |
+
"source": [
|
37 |
+
"if \"../src\" not in sys.path:\n",
|
38 |
+
" sys.path.insert(0, \"../src\")"
|
39 |
+
]
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"cell_type": "code",
|
43 |
+
"execution_count": 6,
|
44 |
+
"metadata": {},
|
45 |
+
"outputs": [],
|
46 |
+
"source": [
|
47 |
+
"from normalizer import normalizer"
|
48 |
+
]
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"cell_type": "code",
|
52 |
+
"execution_count": 7,
|
53 |
+
"metadata": {},
|
54 |
+
"outputs": [
|
55 |
+
{
|
56 |
+
"name": "stdout",
|
57 |
+
"output_type": "stream",
|
58 |
+
"text": [
|
59 |
+
"سلام بر شما که میآیید و میآموزید که بیآرآیم \n",
|
60 |
+
"کتابهایمان میدانی کجاها ماههاس که کیهامون و کیهان دنبالههاشون برای بهای هستند \n",
|
61 |
+
"میانافزارهای امروزی نرمافزار سختافزار امروز نوشتافزارها \n",
|
62 |
+
"این کتاب بهترین در نوع شتر آسانتر هست \n",
|
63 |
+
"سه چیز هست که از پژوهش در این زمینه آموختهام \n"
|
64 |
+
]
|
65 |
+
}
|
66 |
+
],
|
67 |
+
"source": [
|
68 |
+
"input_text = \"سلام بر شما که میآیید و میآموزید که بیآرآیم\"\n",
|
69 |
+
"print(normalizer({\"sentence\": input_text}, return_dict=False))\n",
|
70 |
+
"\n",
|
71 |
+
"input_text = \"کتابهایمان میدانی کجاها ماههاس که کیهامون و کیهان دنبالههاشون برای بهای هستند.\"\n",
|
72 |
+
"print(normalizer({\"sentence\": input_text}, return_dict=False))\n",
|
73 |
+
"\n",
|
74 |
+
"input_text = \" میانافزارهای امروزی نرمافزار سخت افزار امروز نوشتافزار ها\"\n",
|
75 |
+
"print(normalizer({\"sentence\": input_text}, return_dict=False))\n",
|
76 |
+
"\n",
|
77 |
+
"input_text = \"این کتاب بهترین در نوع شتر آسانتر هست\"\n",
|
78 |
+
"print(normalizer({\"sentence\": input_text}, return_dict=False))\n",
|
79 |
+
"\n",
|
80 |
+
"input_text = \"سه چیز هست که از پژوهش در این زمینه آموختهام\"\n",
|
81 |
+
"print(normalizer({\"sentence\": input_text}, return_dict=False))"
|
82 |
+
]
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"cell_type": "code",
|
86 |
+
"execution_count": 12,
|
87 |
+
"metadata": {},
|
88 |
+
"outputs": [],
|
89 |
+
"source": [
|
90 |
+
"# !mkdir -p /home/m3hrdadfi/code/data\n",
|
91 |
+
"# %cd /home/m3hrdadfi/code/data\n",
|
92 |
+
"# !wget https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fa.tar.gz && tar -xzf fa.tar.gz\n",
|
93 |
+
"# %cd /home/m3hrdadfi/"
|
94 |
+
]
|
95 |
+
},
|
96 |
+
{
|
97 |
+
"cell_type": "code",
|
98 |
+
"execution_count": 13,
|
99 |
+
"metadata": {},
|
100 |
+
"outputs": [],
|
101 |
+
"source": [
|
102 |
+
"# import os\n",
|
103 |
+
"\n",
|
104 |
+
"# lang = \"fa\"\n",
|
105 |
+
"# abs_path_to_data = os.path.join(f\"/home/m3hrdadfi/code/data/{lang}/dataset\", f\"cv{lang}\", lang)\n",
|
106 |
+
"# save_path = \"/\".join(abs_path_to_data.split('/')[:-2])\n",
|
107 |
+
"# print(abs_path_to_data)\n",
|
108 |
+
"# print(save_path)\n",
|
109 |
+
"# print()\n",
|
110 |
+
"# !ls {save_path}\n",
|
111 |
+
"# !ls {abs_path_to_data}/*.tsv"
|
112 |
+
]
|
113 |
+
},
|
114 |
+
{
|
115 |
+
"cell_type": "code",
|
116 |
+
"execution_count": 14,
|
117 |
+
"metadata": {},
|
118 |
+
"outputs": [],
|
119 |
+
"source": [
|
120 |
+
"def normalizer_without_batch(text, pruning=False):\n",
|
121 |
+
" try:\n",
|
122 |
+
" batch = {\n",
|
123 |
+
" \"sentence\": text\n",
|
124 |
+
" }\n",
|
125 |
+
" text = normalizer(batch, return_dict=False)\n",
|
126 |
+
" \n",
|
127 |
+
" if pruning:\n",
|
128 |
+
" if not len(text.split()) > 3:\n",
|
129 |
+
" text = None\n",
|
130 |
+
" \n",
|
131 |
+
" except:\n",
|
132 |
+
" print(text)\n",
|
133 |
+
" text = None\n",
|
134 |
+
" \n",
|
135 |
+
" return text"
|
136 |
+
]
|
137 |
+
},
|
138 |
+
{
|
139 |
+
"cell_type": "code",
|
140 |
+
"execution_count": 15,
|
141 |
+
"metadata": {},
|
142 |
+
"outputs": [],
|
143 |
+
"source": [
|
144 |
+
"import pandas as pd\n",
|
145 |
+
"import numpy as np\n",
|
146 |
+
"from tqdm import tqdm"
|
147 |
+
]
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"cell_type": "code",
|
151 |
+
"execution_count": 16,
|
152 |
+
"metadata": {},
|
153 |
+
"outputs": [],
|
154 |
+
"source": [
|
155 |
+
"# test_df = pd.read_csv(f\"{abs_path_to_data}/test.tsv\", sep=\"\\t\")\n",
|
156 |
+
"\n",
|
157 |
+
"# print(f\"Step 0: {len(test_df)}\")\n",
|
158 |
+
"\n",
|
159 |
+
"# test_df[\"path\"] = abs_path_to_data + \"/clips/\" + test_df[\"path\"]\n",
|
160 |
+
"# test_df[\"status\"] = test_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n",
|
161 |
+
"# test_df = test_df.dropna(subset=[\"path\"])\n",
|
162 |
+
"# test_df = test_df.drop(\"status\", 1)\n",
|
163 |
+
"# print(f\"Step 1: {len(test_df)}\")\n",
|
164 |
+
"\n",
|
165 |
+
"# test_df[\"prev_sentence\"] = test_df[\"sentence\"]\n",
|
166 |
+
"# test_df[\"sentence\"] = test_df[\"sentence\"].apply(lambda t: normalizer_without_batch(t))\n",
|
167 |
+
"# test_df = test_df.dropna(subset=[\"sentence\"])\n",
|
168 |
+
"# print(f\"Step 2: {len(test_df)}\")\n",
|
169 |
+
"\n",
|
170 |
+
"# test_df = test_df[[\"prev_sentence\", \"sentence\", \"path\"]]\n",
|
171 |
+
"# test_df = test_df.drop_duplicates(subset=\"path\")\n",
|
172 |
+
"# print(f\"Step 3: {len(test_df)}\")\n",
|
173 |
+
"\n",
|
174 |
+
"# test_df = test_df.reset_index(drop=True)\n",
|
175 |
+
"# test_df.head()"
|
176 |
+
]
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"cell_type": "code",
|
180 |
+
"execution_count": 17,
|
181 |
+
"metadata": {},
|
182 |
+
"outputs": [],
|
183 |
+
"source": [
|
184 |
+
"# _train_df = pd.concat([\n",
|
185 |
+
"# pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n",
|
186 |
+
"# pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n",
|
187 |
+
"# ])\n",
|
188 |
+
"# print(len(_train_df))\n",
|
189 |
+
"\n",
|
190 |
+
"# train_df = pd.concat([\n",
|
191 |
+
"# pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n",
|
192 |
+
"# pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n",
|
193 |
+
"# pd.read_csv(f\"{abs_path_to_data}/validated.tsv\", sep=\"\\t\"),\n",
|
194 |
+
"# pd.read_csv(f\"{abs_path_to_data}/other.tsv\", sep=\"\\t\"),\n",
|
195 |
+
"# ])\n",
|
196 |
+
"# print(f\"Step 0: {len(train_df)}\")\n",
|
197 |
+
"\n",
|
198 |
+
"# train_df[\"path\"] = abs_path_to_data + \"/clips/\" + train_df[\"path\"]\n",
|
199 |
+
"# train_df[\"status\"] = train_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n",
|
200 |
+
"# train_df = train_df.dropna(subset=[\"path\"])\n",
|
201 |
+
"# train_df = train_df.drop(\"status\", 1)\n",
|
202 |
+
"# print(f\"Step 1: {len(train_df)}\")\n",
|
203 |
+
"\n",
|
204 |
+
"# train_df[\"prev_sentence\"] = train_df[\"sentence\"]\n",
|
205 |
+
"# train_df[\"sentence\"] = train_df[\"sentence\"].apply(lambda t: normalizer_without_batch(t, pruning=True))\n",
|
206 |
+
"# train_df = train_df.dropna(subset=[\"sentence\"])\n",
|
207 |
+
"# print(f\"Step 2: {len(train_df)}\")\n",
|
208 |
+
"\n",
|
209 |
+
"# train_df = train_df[[\"prev_sentence\", \"sentence\", \"path\"]]\n",
|
210 |
+
"# train_df = train_df.drop_duplicates(subset=\"path\")\n",
|
211 |
+
"# print(f\"Step 3: {len(train_df)}\")\n",
|
212 |
+
"\n",
|
213 |
+
"# train_df = train_df.sample(frac=1)\n",
|
214 |
+
"# train_df = train_df.reset_index(drop=True)\n",
|
215 |
+
"# train_df.head()"
|
216 |
+
]
|
217 |
+
},
|
218 |
+
{
|
219 |
+
"cell_type": "code",
|
220 |
+
"execution_count": 18,
|
221 |
+
"metadata": {},
|
222 |
+
"outputs": [],
|
223 |
+
"source": [
|
224 |
+
"# from tqdm import tqdm\n",
|
225 |
+
"\n",
|
226 |
+
"# testset_indices = []\n",
|
227 |
+
"\n",
|
228 |
+
"# for index, row in tqdm(test_df.iterrows(), total=len(test_df), position=0):\n",
|
229 |
+
"# _id = row[\"path\"]\n",
|
230 |
+
"# finder = train_df[train_df[\"path\"] == _id]\n",
|
231 |
+
"# if len(finder) > 0:\n",
|
232 |
+
"# testset_indices.extend(list(finder.index))\n",
|
233 |
+
"\n",
|
234 |
+
"# testset_indices = list(set(testset_indices))\n",
|
235 |
+
"# print(f\"Found #{len(testset_indices)} test data\")"
|
236 |
+
]
|
237 |
+
},
|
238 |
+
{
|
239 |
+
"cell_type": "code",
|
240 |
+
"execution_count": 19,
|
241 |
+
"metadata": {},
|
242 |
+
"outputs": [],
|
243 |
+
"source": [
|
244 |
+
"# print(len(train_df))\n",
|
245 |
+
"# train_df = train_df.drop(testset_indices)\n",
|
246 |
+
"# print(len(train_df))"
|
247 |
+
]
|
248 |
+
},
|
249 |
+
{
|
250 |
+
"cell_type": "code",
|
251 |
+
"execution_count": 20,
|
252 |
+
"metadata": {},
|
253 |
+
"outputs": [],
|
254 |
+
"source": [
|
255 |
+
"# import pandas as pd\n",
|
256 |
+
"\n",
|
257 |
+
"# df = pd.concat([train_df, test_df], axis=0)\n",
|
258 |
+
"# # df = validated_df.copy()\n",
|
259 |
+
"# print(df.info())\n",
|
260 |
+
"# # df[\"sentence\"] = df[\"prev_sentence\"].apply(lambda t: normalizer_without_batch(t))\n",
|
261 |
+
"# # df = df.dropna(subset=[\"sentence\"])\n",
|
262 |
+
"# # df[\"sentence_spell\"] = df[\"sentence\"].apply(lambda t: normalizer({\"sentence\": t}, is_spell_check=True, return_dict=False))\n",
|
263 |
+
"# df = df.reset_index(drop=True)\n",
|
264 |
+
"# print(df.info())\n",
|
265 |
+
"# df.head()"
|
266 |
+
]
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"cell_type": "code",
|
270 |
+
"execution_count": 21,
|
271 |
+
"metadata": {},
|
272 |
+
"outputs": [],
|
273 |
+
"source": [
|
274 |
+
"# import torchaudio\n",
|
275 |
+
"# import librosa\n",
|
276 |
+
"# import IPython.display as ipd\n",
|
277 |
+
"# import numpy as np\n",
|
278 |
+
"\n",
|
279 |
+
"# def load_audio(path):\n",
|
280 |
+
"# speech, sr = torchaudio.load(path)\n",
|
281 |
+
"# speech = speech[0].numpy().squeeze() \n",
|
282 |
+
"# speech = librosa.resample(np.asarray(speech), sr, 16_000)\n",
|
283 |
+
" \n",
|
284 |
+
"# print(speech.shape, sr)\n",
|
285 |
+
" \n",
|
286 |
+
"# ipd.display(ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000))"
|
287 |
+
]
|
288 |
+
},
|
289 |
+
{
|
290 |
+
"cell_type": "code",
|
291 |
+
"execution_count": 22,
|
292 |
+
"metadata": {},
|
293 |
+
"outputs": [],
|
294 |
+
"source": [
|
295 |
+
"# main_vocab = [\"ح\", \"چ\", \"ج\", \"ث\", \"ت\", \"پ\", \"ب\", \"آ\", \"ا\", \"ش\", \"س\", \"ژ\", \"ز\", \"ر\", \"ذ\", \"د\", \"خ\", \"ق\", \"ف\", \"غ\", \"ع\", \"ظ\", \"ط\", \"ض\", \"ص\", \"ی\", \"ه\", \"و\", \"ن\", \"م\", \"ل\", \"گ\", \"ک\"]\n",
|
296 |
+
"# text = \" \".join(df[\"sentence\"].values.tolist())\n",
|
297 |
+
"# vocab = list(sorted(set(text)))\n",
|
298 |
+
"\n",
|
299 |
+
"# for v in main_vocab:\n",
|
300 |
+
"# if v not in vocab:\n",
|
301 |
+
"# print(\"v\", v)\n",
|
302 |
+
"\n",
|
303 |
+
"# print(len(main_vocab), len(vocab))\n",
|
304 |
+
"# print(len(vocab), vocab)"
|
305 |
+
]
|
306 |
+
},
|
307 |
+
{
|
308 |
+
"cell_type": "code",
|
309 |
+
"execution_count": 23,
|
310 |
+
"metadata": {},
|
311 |
+
"outputs": [],
|
312 |
+
"source": [
|
313 |
+
"# import numpy as np\n",
|
314 |
+
"\n",
|
315 |
+
"\n",
|
316 |
+
"# idx = np.random.randint(0, len(df))\n",
|
317 |
+
"# # idx = 6140\n",
|
318 |
+
"# sample = df.iloc[idx]\n",
|
319 |
+
"# ipd.display(sample)\n",
|
320 |
+
"# # print(sample.iloc[idx][\"prev_sentence\"])\n",
|
321 |
+
"# print()\n",
|
322 |
+
"# print(sample[\"prev_sentence\"])\n",
|
323 |
+
"# print(sample[\"sentence\"])\n",
|
324 |
+
"# print()\n",
|
325 |
+
"# load_audio(sample[\"path\"])"
|
326 |
+
]
|
327 |
+
},
|
328 |
+
{
|
329 |
+
"cell_type": "code",
|
330 |
+
"execution_count": 24,
|
331 |
+
"metadata": {},
|
332 |
+
"outputs": [],
|
333 |
+
"source": [
|
334 |
+
"# new_train_df = train_df.copy()\n",
|
335 |
+
"# new_train_df[\"_path\"] = new_train_df[\"path\"]\n",
|
336 |
+
"# new_train_df[\"path\"] = new_train_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/code/data/fa/dataset/clips\", t.split(\"/\")[-1]))\n",
|
337 |
+
"# print(new_train_df.info())\n",
|
338 |
+
"# new_train_df.head()"
|
339 |
+
]
|
340 |
+
},
|
341 |
+
{
|
342 |
+
"cell_type": "code",
|
343 |
+
"execution_count": 25,
|
344 |
+
"metadata": {},
|
345 |
+
"outputs": [],
|
346 |
+
"source": [
|
347 |
+
"# new_test_df = test_df.copy()\n",
|
348 |
+
"# new_test_df[\"_path\"] = new_test_df[\"path\"]\n",
|
349 |
+
"# new_test_df[\"path\"] = new_test_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/code/data/fa/dataset/clips\", t.split(\"/\")[-1]))\n",
|
350 |
+
"# print(new_test_df.info())\n",
|
351 |
+
"# new_test_df.head()"
|
352 |
+
]
|
353 |
+
},
|
354 |
+
{
|
355 |
+
"cell_type": "code",
|
356 |
+
"execution_count": 26,
|
357 |
+
"metadata": {},
|
358 |
+
"outputs": [],
|
359 |
+
"source": [
|
360 |
+
"# import shutil\n",
|
361 |
+
"# from tqdm import tqdm"
|
362 |
+
]
|
363 |
+
},
|
364 |
+
{
|
365 |
+
"cell_type": "code",
|
366 |
+
"execution_count": 27,
|
367 |
+
"metadata": {},
|
368 |
+
"outputs": [],
|
369 |
+
"source": [
|
370 |
+
"# !mkdir -p {save_path}/clips\n",
|
371 |
+
"# !mkdir -p {save_path}/augs"
|
372 |
+
]
|
373 |
+
},
|
374 |
+
{
|
375 |
+
"cell_type": "code",
|
376 |
+
"execution_count": 28,
|
377 |
+
"metadata": {},
|
378 |
+
"outputs": [],
|
379 |
+
"source": [
|
380 |
+
"# for index, row in tqdm(new_train_df.iterrows(), position=0, total=len(new_train_df)):\n",
|
381 |
+
"# shutil.copy(row[\"_path\"], row[\"path\"])"
|
382 |
+
]
|
383 |
+
},
|
384 |
+
{
|
385 |
+
"cell_type": "code",
|
386 |
+
"execution_count": 29,
|
387 |
+
"metadata": {},
|
388 |
+
"outputs": [],
|
389 |
+
"source": [
|
390 |
+
"# for index, row in tqdm(new_test_df.iterrows(), position=0, total=len(new_test_df)):\n",
|
391 |
+
"# shutil.copy(row[\"_path\"], row[\"path\"])"
|
392 |
+
]
|
393 |
+
},
|
394 |
+
{
|
395 |
+
"cell_type": "code",
|
396 |
+
"execution_count": 30,
|
397 |
+
"metadata": {},
|
398 |
+
"outputs": [],
|
399 |
+
"source": [
|
400 |
+
"# # aug_train_df = new_train_df.copy()\n",
|
401 |
+
"# aug_train_df = new_train_df.sample(frac=0.1)\n",
|
402 |
+
"# aug_train_df = aug_train_df.reset_index(drop=True)\n",
|
403 |
+
"# aug_train_df[\"_path\"] = aug_train_df[\"path\"]\n",
|
404 |
+
"# aug_train_df[\"path\"] = aug_train_df[\"path\"].apply(lambda t: \"/\".join(t.split('.')[:-1]).replace(\"clips\", \"augs\") + \"_aug.mp3.wav\")\n",
|
405 |
+
"# print(aug_train_df.info())\n",
|
406 |
+
"# aug_train_df.head()"
|
407 |
+
]
|
408 |
+
},
|
409 |
+
{
|
410 |
+
"cell_type": "code",
|
411 |
+
"execution_count": 31,
|
412 |
+
"metadata": {},
|
413 |
+
"outputs": [],
|
414 |
+
"source": [
|
415 |
+
"# print(aug_train_df.iloc[0][\"_path\"])\n",
|
416 |
+
"# print(aug_train_df.iloc[0][\"path\"])"
|
417 |
+
]
|
418 |
+
},
|
419 |
+
{
|
420 |
+
"cell_type": "code",
|
421 |
+
"execution_count": 32,
|
422 |
+
"metadata": {},
|
423 |
+
"outputs": [],
|
424 |
+
"source": [
|
425 |
+
"# # augmentation\n",
|
426 |
+
"\n",
|
427 |
+
"# from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, Gain\n",
|
428 |
+
"# import numpy as np\n",
|
429 |
+
"# import soundfile as sf\n",
|
430 |
+
"\n",
|
431 |
+
"# augment = Compose([\n",
|
432 |
+
"# # AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n",
|
433 |
+
"# # PitchShift(min_semitones=-1, max_semitones=2, p=0.2),\n",
|
434 |
+
"# # Gain(min_gain_in_db=-6, max_gain_in_db=6, p=0.8)\n",
|
435 |
+
"# AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n",
|
436 |
+
"# TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),\n",
|
437 |
+
"# PitchShift(min_semitones=-4, max_semitones=4, p=0.5),\n",
|
438 |
+
"# ])\n",
|
439 |
+
"\n",
|
440 |
+
"# def augmented_speech_file_to_array_fn(in_path, out_path):\n",
|
441 |
+
"# speech_array, sampling_rate = torchaudio.load(in_path)\n",
|
442 |
+
"# speech_array = speech_array.squeeze().numpy()\n",
|
443 |
+
"# speech_array = augment(samples=speech_array, sample_rate=sampling_rate)\n",
|
444 |
+
"# sf.write(out_path, speech_array, sampling_rate, \"PCM_24\")"
|
445 |
+
]
|
446 |
+
},
|
447 |
+
{
|
448 |
+
"cell_type": "code",
|
449 |
+
"execution_count": 33,
|
450 |
+
"metadata": {},
|
451 |
+
"outputs": [],
|
452 |
+
"source": [
|
453 |
+
"# # for index, row in tqdm(aug_train_df.iterrows(), position=0, total=len(aug_train_df)):\n",
|
454 |
+
"# # augmented_speech_file_to_array_fn(row[\"_path\"], row[\"path\"])\n",
|
455 |
+
"# !ls"
|
456 |
+
]
|
457 |
+
},
|
458 |
+
{
|
459 |
+
"cell_type": "code",
|
460 |
+
"execution_count": 34,
|
461 |
+
"metadata": {},
|
462 |
+
"outputs": [],
|
463 |
+
"source": [
|
464 |
+
"# # new_train_aug_df = pd.concat([new_train_df, aug_train_df], axis=0)\n",
|
465 |
+
"# new_train_aug_df = new_train_df.copy()\n",
|
466 |
+
"# new_train_aug_df = new_train_aug_df.sample(frac=1)\n",
|
467 |
+
"# new_train_aug_df = new_train_aug_df.reset_index(drop=True)\n",
|
468 |
+
"# print(new_train_aug_df.info())\n",
|
469 |
+
"# new_train_aug_df.head()"
|
470 |
+
]
|
471 |
+
},
|
472 |
+
{
|
473 |
+
"cell_type": "code",
|
474 |
+
"execution_count": 35,
|
475 |
+
"metadata": {},
|
476 |
+
"outputs": [],
|
477 |
+
"source": [
|
478 |
+
"# new_train_df.to_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n",
|
479 |
+
"# new_train_aug_df.to_csv(f\"{save_path}/train_with_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n",
|
480 |
+
"# new_test_df.to_csv(f\"{save_path}/test.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)"
|
481 |
+
]
|
482 |
+
},
|
483 |
+
{
|
484 |
+
"cell_type": "code",
|
485 |
+
"execution_count": 36,
|
486 |
+
"metadata": {},
|
487 |
+
"outputs": [],
|
488 |
+
"source": [
|
489 |
+
"# new_train_df.count()"
|
490 |
+
]
|
491 |
+
},
|
492 |
+
{
|
493 |
+
"cell_type": "code",
|
494 |
+
"execution_count": 37,
|
495 |
+
"metadata": {},
|
496 |
+
"outputs": [],
|
497 |
+
"source": [
|
498 |
+
"# new_test_df.count()"
|
499 |
+
]
|
500 |
+
},
|
501 |
+
{
|
502 |
+
"cell_type": "code",
|
503 |
+
"execution_count": 38,
|
504 |
+
"metadata": {},
|
505 |
+
"outputs": [],
|
506 |
+
"source": [
|
507 |
+
"# import pandas as pd\n",
|
508 |
+
"\n",
|
509 |
+
"# import os\n",
|
510 |
+
"# from tqdm import tqdm"
|
511 |
+
]
|
512 |
+
},
|
513 |
+
{
|
514 |
+
"cell_type": "code",
|
515 |
+
"execution_count": 39,
|
516 |
+
"metadata": {},
|
517 |
+
"outputs": [],
|
518 |
+
"source": [
|
519 |
+
"# train_df = pd.read_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\")\n",
|
520 |
+
"# print(train_df.info())\n",
|
521 |
+
"# train_df.head()"
|
522 |
+
]
|
523 |
+
},
|
524 |
+
{
|
525 |
+
"cell_type": "code",
|
526 |
+
"execution_count": 40,
|
527 |
+
"metadata": {},
|
528 |
+
"outputs": [],
|
529 |
+
"source": [
|
530 |
+
"# test_df = pd.read_csv(f\"{save_path}/test.csv\", sep=\"\\t\")\n",
|
531 |
+
"# print(test_df.info())\n",
|
532 |
+
"# test_df.head()"
|
533 |
+
]
|
534 |
+
},
|
535 |
+
{
|
536 |
+
"cell_type": "code",
|
537 |
+
"execution_count": 41,
|
538 |
+
"metadata": {},
|
539 |
+
"outputs": [],
|
540 |
+
"source": [
|
541 |
+
"# non_existed_train = []\n",
|
542 |
+
"\n",
|
543 |
+
"# for index, row in tqdm(train_df.iterrows(), total=len(train_df), position=0):\n",
|
544 |
+
"# if not os.path.exists(row[\"path\"]):\n",
|
545 |
+
"# non_existed_train.extends(list(index))\n",
|
546 |
+
"# break"
|
547 |
+
]
|
548 |
+
},
|
549 |
+
{
|
550 |
+
"cell_type": "code",
|
551 |
+
"execution_count": 42,
|
552 |
+
"metadata": {},
|
553 |
+
"outputs": [],
|
554 |
+
"source": [
|
555 |
+
"# import numpy as np\n",
|
556 |
+
"\n",
|
557 |
+
"\n",
|
558 |
+
"# idx = np.random.randint(0, len(train_df))\n",
|
559 |
+
"# # idx = 6140\n",
|
560 |
+
"# sample = train_df.iloc[idx]\n",
|
561 |
+
"# ipd.display(sample)\n",
|
562 |
+
"# # print(sample.iloc[idx][\"prev_sentence\"])\n",
|
563 |
+
"# print()\n",
|
564 |
+
"# print(sample[\"prev_sentence\"])\n",
|
565 |
+
"# print(sample[\"sentence\"])\n",
|
566 |
+
"# print()\n",
|
567 |
+
"# load_audio(sample[\"path\"])"
|
568 |
+
]
|
569 |
+
},
|
570 |
+
{
|
571 |
+
"cell_type": "code",
|
572 |
+
"execution_count": 43,
|
573 |
+
"metadata": {},
|
574 |
+
"outputs": [],
|
575 |
+
"source": [
|
576 |
+
"# train_df_half = train_df.copy()\n",
|
577 |
+
"# print(train_df_half.shape)\n",
|
578 |
+
"# train_df_half = train_df_half.dropna()\n",
|
579 |
+
"# print(train_df_half.shape)\n",
|
580 |
+
"# train_df_half = train_df_half.drop_duplicates()\n",
|
581 |
+
"# print(train_df_half.shape)\n",
|
582 |
+
"\n",
|
583 |
+
"# train_df_half = train_df_half.sample(frac=0.5)\n",
|
584 |
+
"# train_df_half = train_df_half.reset_index(drop=True)\n",
|
585 |
+
"# print(train_df_half.shape)"
|
586 |
+
]
|
587 |
+
},
|
588 |
+
{
|
589 |
+
"cell_type": "code",
|
590 |
+
"execution_count": 44,
|
591 |
+
"metadata": {},
|
592 |
+
"outputs": [],
|
593 |
+
"source": [
|
594 |
+
"# train_df_half.to_csv(f\"{save_path}/train_no_aug_half.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)"
|
595 |
+
]
|
596 |
+
},
|
597 |
+
{
|
598 |
+
"cell_type": "code",
|
599 |
+
"execution_count": null,
|
600 |
+
"metadata": {},
|
601 |
+
"outputs": [],
|
602 |
+
"source": []
|
603 |
+
}
|
604 |
+
],
|
605 |
+
"metadata": {
|
606 |
+
"kernelspec": {
|
607 |
+
"display_name": "transformers",
|
608 |
+
"name": "transformers"
|
609 |
+
},
|
610 |
+
"language_info": {
|
611 |
+
"codemirror_mode": {
|
612 |
+
"name": "ipython",
|
613 |
+
"version": 3
|
614 |
+
},
|
615 |
+
"file_extension": ".py",
|
616 |
+
"mimetype": "text/x-python",
|
617 |
+
"name": "python",
|
618 |
+
"nbconvert_exporter": "python",
|
619 |
+
"pygments_lexer": "ipython3",
|
620 |
+
"version": "3.9.4"
|
621 |
+
},
|
622 |
+
"orig_nbformat": 2
|
623 |
+
},
|
624 |
+
"nbformat": 4,
|
625 |
+
"nbformat_minor": 2
|
626 |
+
}
|
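The preparation cells above are mostly committed in commented-out form. For readability, here is a minimal sketch of the flow those cells implement; the `../src` import, the Common Voice column names (`path`, `sentence`), the `normalizer(...)` call signature and the dataset directory are taken from the notebook itself, while the wrapper function and its name are only illustrative:

```python
import os
import sys

import pandas as pd

# Make src/normalizer.py importable, exactly as the notebook does.
if "../src" not in sys.path:
    sys.path.insert(0, "../src")

from normalizer import normalizer  # defined in src/normalizer.py below

# Assumed local layout of the extracted Common Voice fa archive (from the notebook).
abs_path_to_data = "/home/m3hrdadfi/code/data/fa/dataset/cvfa/fa"


def prepare_split(tsv_name: str) -> pd.DataFrame:
    """Load a Common Voice split, keep rows whose clip exists, normalize the text."""
    df = pd.read_csv(os.path.join(abs_path_to_data, tsv_name), sep="\t")

    # Absolute clip paths, then drop rows pointing at missing audio files.
    df["path"] = abs_path_to_data + "/clips/" + df["path"]
    df = df[df["path"].apply(os.path.exists)]

    # Keep the raw transcript alongside the normalized one.
    df["prev_sentence"] = df["sentence"]
    df["sentence"] = df["sentence"].apply(
        lambda t: normalizer({"sentence": t}, return_dict=False)
    )
    df = df.dropna(subset=["sentence"])

    df = df[["prev_sentence", "sentence", "path"]]
    return df.drop_duplicates(subset="path").reset_index(drop=True)


# e.g. test_df = prepare_split("test.tsv")
```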
notes/fa.tar.gz
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f3c53202d7d12dfe973604737fc11b0a50c9c94b85c4cae70fcc693fe2babb4
+size 7020110
src/dictionary.py
ADDED
@@ -0,0 +1,664 @@
1 |
+
dictionary_mapping = {
|
2 |
+
'ك': 'ک', 'دِ': 'د', 'بِ': 'ب', 'زِ': 'ز', 'ذِ': 'ذ', 'شِ': 'ش', 'سِ': 'س', 'ى': 'ی',
|
3 |
+
'ي': 'ی', 'أ': 'ا', 'ؤ': 'و', "ے": "ی", "ۀ": "ه", "ﭘ": "پ", "ﮐ": "ک", "ﯽ": "ی",
|
4 |
+
"ﺎ": "ا", "ﺑ": "ب", "ﺘ": "ت", "ﺧ": "خ", "ﺩ": "د", "ﺱ": "س", "ﻀ": "ض", "ﻌ": "ع",
|
5 |
+
"ﻟ": "ل", "ﻡ": "م", "ﻢ": "م", "ﻪ": "ه", "ﻮ": "و", 'ﺍ': "ا", 'ة': "ه",
|
6 |
+
'ﯾ': "ی", 'ﯿ': "ی", 'ﺒ': "ب", 'ﺖ': "ت", 'ﺪ': "د", 'ﺮ': "ر", 'ﺴ': "س", 'ﺷ': "ش",
|
7 |
+
'ﺸ': "ش", 'ﻋ': "ع", 'ﻤ': "م", 'ﻥ': "ن", 'ﻧ': "ن", 'ﻭ': "و", 'ﺭ': "ر", "ﮔ": "گ",
|
8 |
+
|
9 |
+
"a": "ای", "b": "بی", "c": "سی", "d": "دی", "e": "ایی", "f": "اف",
|
10 |
+
"g": "جی", "h": "اچ", "i": "آی", "j": "جی", "k": "کی", "l": "ال",
|
11 |
+
"m": "ام", "n": "ان", "o": "او", "p": "پی", "q": "کیو", "r": "آر",
|
12 |
+
"s": "اس", "t": "تی", "u": "یو", "v": "وی", "w": "دبلیو", "x": "اکس",
|
13 |
+
"y": "وای", "z": "زد ",
|
14 |
+
"\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
|
15 |
+
|
16 |
+
"نو آوریمان": "نوآوریمان",
|
17 |
+
"نو آوری مان": "نوآوریمان",
|
18 |
+
"نو آوریمان": "نوآوریمان",
|
19 |
+
" ا م ": "ام ",
|
20 |
+
" م ": "ام ",
|
21 |
+
"کنندهای": "کنندهای",
|
22 |
+
"ارائهای": "ارائهای",
|
23 |
+
"ایدهای": "ایدهای",
|
24 |
+
"ماسهای": "ماسهای",
|
25 |
+
"خامنهای": "خامنهای",
|
26 |
+
"قلهای": "قلهای",
|
27 |
+
"سیارهای": "سیارهای",
|
28 |
+
"کیسهای": "کیسهای",
|
29 |
+
"شانهای": "شانهای",
|
30 |
+
"غریبهای": "غریبهای",
|
31 |
+
"برنامهای": "برنامهای",
|
32 |
+
"سختگیرانهای": "سختگیرانهای",
|
33 |
+
"بهانهای": "بهانهای",
|
34 |
+
"زیرروالهای": "زیر روالهای",
|
35 |
+
"درهای": "درهای",
|
36 |
+
"آمادهای": "آمادهای",
|
37 |
+
"سادهای": "سادهای",
|
38 |
+
"سرمایهگذارهای": "سرمایه گذارهای",
|
39 |
+
"فوقالعادهای": "فوقالعادهای",
|
40 |
+
"حادثهای": "حادثهای",
|
41 |
+
"نویسندههای": "نویسندههای",
|
42 |
+
"علاقهای": "علاقهای",
|
43 |
+
"برجستهای": "برجستهای",
|
44 |
+
"جلگهای": "جلگهای",
|
45 |
+
"زندهای": "زندهای",
|
46 |
+
"فنآوریهای": "فناوریهای",
|
47 |
+
"سایهروشنهای": "سایه روشنهای",
|
48 |
+
"بیسابقهای": "بی سابقهای",
|
49 |
+
"فرضیهای": "فرضیهای",
|
50 |
+
"راهاندازهای": "راه اندازهای",
|
51 |
+
"بیشهای": "بیشهای",
|
52 |
+
"مقالهای": "مقالهای",
|
53 |
+
"دیگهای": "دیگهای",
|
54 |
+
"ماههاست": "ماه هاست",
|
55 |
+
"نرمافزارهای": "نرمافزارهای",
|
56 |
+
"کتابسوزانهای": "کتاب سوزانهای",
|
57 |
+
"سیستمعاملهای": "سیستم عاملهای",
|
58 |
+
"اسلحهای": "اسلحهای",
|
59 |
+
"وقفهای": "وقفهای",
|
60 |
+
"زمینهای": "زمینهای",
|
61 |
+
"حرامزادههای": "حرامزادههای",
|
62 |
+
"هزینهای": "هزینهای",
|
63 |
+
"انداختهای": "انداختهای",
|
64 |
+
"جسورانهای": "جسورانهای",
|
65 |
+
"فاجعهای": "فاجعهای",
|
66 |
+
"جامعهای": "جامعهای",
|
67 |
+
"پدیدهای": "پدیدهای",
|
68 |
+
"اغواگرانهای": "اغواگرانهای",
|
69 |
+
"تکانهای": "تکانهای",
|
70 |
+
"لولهای": "لولهای",
|
71 |
+
"نشانهای": "نشانهای",
|
72 |
+
"وسیلهای": "وسیلهای",
|
73 |
+
"آیندهای": "آیندهای",
|
74 |
+
"بردهای": "بردهای",
|
75 |
+
"سابقهای": "سابقهای",
|
76 |
+
"ناحیهای": "ناحیهای",
|
77 |
+
"تکاندهندهای": "تکان دهندهای",
|
78 |
+
"بودجهای": "بودجهای",
|
79 |
+
"روزانهای": "روزانهای",
|
80 |
+
"چارهای": "چارهای",
|
81 |
+
"انگیزهای": "انگیزهای",
|
82 |
+
"دادهای": "دادهای",
|
83 |
+
"عدهای": "عدهای",
|
84 |
+
"هفتهای": "هفتهای",
|
85 |
+
"منطقهای": "منطقهای",
|
86 |
+
"استارتآپهای": "استارتاپهای",
|
87 |
+
"سازهای": "سازهای",
|
88 |
+
"مجموعهای": "مجموعهای",
|
89 |
+
"فلسفهای": "فلسفهای",
|
90 |
+
"تذکردهندهای": "تذکر دهندهای",
|
91 |
+
"مصاحبهای": "مصابحهای",
|
92 |
+
"نمونهای": "نمونهای",
|
93 |
+
"قلمموهای": "قلم موهای",
|
94 |
+
"شبزندهداری": "شب زندهداری",
|
95 |
+
"خوردهباشد": "خورده باشد",
|
96 |
+
"داشتهباشید": "داشته باشید",
|
97 |
+
"فزایندهای": "فزایندهای",
|
98 |
+
"عمدهای": "عمدهای",
|
99 |
+
"بدیهایی": "بدیهای",
|
100 |
+
"نوشتهایم": "نوشتهایم",
|
101 |
+
"بنتالهدی": "بنت الهدی",
|
102 |
+
"نوشتهام": "نوشتهام",
|
103 |
+
"سرمایهگذاران": "سرمایه گذاران",
|
104 |
+
"خانهی": "خانهی",
|
105 |
+
"گستاخانهی": "گستاخانهی",
|
106 |
+
"گرفتهباشیم": "گرفته باشیم",
|
107 |
+
"خونهی": "خونهی",
|
108 |
+
"داشتهام": "داشتهام",
|
109 |
+
"رشتهام": "رشتهام",
|
110 |
+
"سرمایهگذارانشان": "سرمایه گذارانشان",
|
111 |
+
"ریشهکنی": "ریشهکنی",
|
112 |
+
"مودبانهتری": "مودبانهتری",
|
113 |
+
"برگردانشدهاند": "برگردان شدهاند",
|
114 |
+
"قرمهسبزی": "قرمهسبزی",
|
115 |
+
"راهجویی": "راه جویی",
|
116 |
+
"اماهیچوقت": "اما هیچوقت",
|
117 |
+
"آبوهوای": "آب و هوای",
|
118 |
+
"بقیهاش": "بقیهاش",
|
119 |
+
"طبقهبندی": "طبقهبندی",
|
120 |
+
"مردههان": "مرده هان",
|
121 |
+
"آمادهاند": "آمادهاند",
|
122 |
+
"نشدهاید": "نشدهاید",
|
123 |
+
"آگاهیرسانی": "آگاهی رسانی",
|
124 |
+
"نداشتهاند": "نداشتهاند",
|
125 |
+
"شکنانهترین": "شکنانهترین",
|
126 |
+
"اقدامهایی": "اقدامهایی",
|
127 |
+
"راهآهن": "راه آهن",
|
128 |
+
"شدهاند": "شدهاند",
|
129 |
+
"تازهترین": "تازهترین",
|
130 |
+
"روبهروی": "رو به روی",
|
131 |
+
"منحصربهفرد": "منحصر به فرد",
|
132 |
+
"سیزدهبدر": "سیزده بدر",
|
133 |
+
"برندهی": "برندهی",
|
134 |
+
"خانهاشتراکی": "خانه اشتراکی",
|
135 |
+
"دادههایی": "دادههایی",
|
136 |
+
"استفادهتر": "استفادهتر",
|
137 |
+
"گذرنامهتان": "گذرنامهتان",
|
138 |
+
"کهنترین": "کهنهترین",
|
139 |
+
"فرهنگسرا": "فرهنگسرا",
|
140 |
+
"آمادهاید": "آمادهاید",
|
141 |
+
"ویژهی": "ویژهی",
|
142 |
+
"غریزهات": "غریزهات",
|
143 |
+
"مادرشوهری": "مادر شوهری",
|
144 |
+
"نبودهام": "نبودهام",
|
145 |
+
"بودهاند": "بودهاند",
|
146 |
+
"وتنها": "و تنها",
|
147 |
+
"بداههکاری": "بداههکاری",
|
148 |
+
"سرمایهگذار": "سرمایه گذار",
|
149 |
+
"برنامهنویس": "برنامه نویس",
|
150 |
+
"مهنازخانم": "مهناز خانم",
|
151 |
+
"مواجهاند": "مواجهاند",
|
152 |
+
"توسعهاش": "توسعهاش",
|
153 |
+
"سینهام": "سینهام",
|
154 |
+
"سینهام": "سینهام",
|
155 |
+
"نمیخواهند": "نمیخواهند",
|
156 |
+
"فنآوریها": "فناوریها",
|
157 |
+
"دنبالهرو": "دنبالهرو",
|
158 |
+
"لبهی": "لبهی",
|
159 |
+
"اللهیار": "الله یار",
|
160 |
+
"ارزندهتر": "ارزندهتر",
|
161 |
+
"برههای": "برهای",
|
162 |
+
"پیادهسازی": "پیادهسازی",
|
163 |
+
"دهسالگی": "ده سالگی",
|
164 |
+
"رسانهای": "رسانهای",
|
165 |
+
"ریشسفیدها": "ریش سفیدها",
|
166 |
+
"چهجوری": "چه جوری",
|
167 |
+
"ویژگیهایی": "ویژگیهایی",
|
168 |
+
"میفهمیم": "میفهمیم",
|
169 |
+
"وبهم": "و بهم",
|
170 |
+
"قطرهای": "قطرهای",
|
171 |
+
"ازتنهایی": "از تنهایی",
|
172 |
+
"لطیفهای": "لطیفهای",
|
173 |
+
"باشهاومدم": "باشه اومدم",
|
174 |
+
"منحصربهفردترین": "منحصر به فردترین",
|
175 |
+
"کردهاند": "کردهاند",
|
176 |
+
"اندازهای": "اندازهای",
|
177 |
+
"بهرهبرداری": "بهره برداری",
|
178 |
+
"اماشوهرجان": "اما شوهر جان",
|
179 |
+
"خانوادهاش": "خانوادهاش",
|
180 |
+
"نشدهاند": "نشدهاند",
|
181 |
+
"نکردهایم": "نکردهایم",
|
182 |
+
"تخممرغهایش": "تخم مرغهایش",
|
183 |
+
"وظیفهش": "وظیفهاش",
|
184 |
+
"مشگینشهر": "مشگی شهر",
|
185 |
+
"توسعهدهندگانش": "توسعه دهندگانش",
|
186 |
+
"امینابراهیم": "امین ابراهیم",
|
187 |
+
"دربارهاش": "دربارهاش",
|
188 |
+
"میانافزارها": "میانافزارها",
|
189 |
+
"دیدهاند": "دیدهاند",
|
190 |
+
"خانوادهام": "خانوادهام",
|
191 |
+
"مایهی": "مایهی",
|
192 |
+
"نوشتهشدن": "نوشته شدن",
|
193 |
+
"راهحلهایشان": "راه حلهایشان",
|
194 |
+
"میهماننواز": "میهمان نواز",
|
195 |
+
"زیبندهی": "زیرندهی",
|
196 |
+
"راههایی": "راههایی",
|
197 |
+
"جربزهی": "جربزهی",
|
198 |
+
"بهجا": " به جا",
|
199 |
+
"بطورهمزمان": "به طور همزمان",
|
200 |
+
"فهمیدهبود": "فهمیده بود",
|
201 |
+
"دوربرگردانها": "دور برگردانها",
|
202 |
+
"شالودهی": "شالودهی",
|
203 |
+
"راهکاریی": "راهکاری",
|
204 |
+
"مخالفتهایی": "مخالفتهایی",
|
205 |
+
"چیزهاازشون": "چیزها ازشون",
|
206 |
+
"سکونتگاههای": "سکونت گاههای",
|
207 |
+
"سالهابود": "سالها بود",
|
208 |
+
"نمونهی": "نمونهی",
|
209 |
+
"سرمایهگذاری": "سرمایه گذاری",
|
210 |
+
"شبکهای": "شبکهای",
|
211 |
+
"خواهرشوهر": "خواهر شوهر",
|
212 |
+
"سرگیجهآور": "سرگیجه آور",
|
213 |
+
"آستانهی": "آستانهی",
|
214 |
+
"دادهاست": "داده است",
|
215 |
+
"مجسمهسازی": "مجسمه سازی",
|
216 |
+
"ماهرانهترین": "ماهرانهترین",
|
217 |
+
"پنجشنبههایی": "پنجشنبه شبهایی",
|
218 |
+
"نرفنهام": "نرفتهام",
|
219 |
+
"قورمهسبزی": "قورمه سبزی",
|
220 |
+
"گذارهای": "گذارهای",
|
221 |
+
"بندهخدا": "بنده خدا",
|
222 |
+
"روزنامهنگاران": "روزنامه نگاران",
|
223 |
+
"نقشهی": "نقشهی",
|
224 |
+
"حملهی": "حملهی",
|
225 |
+
"تکنیکهاست": "تکنیک هاست",
|
226 |
+
"نرمافزارهایمان": "نرمافرارهایمان",
|
227 |
+
"مادرشوهرم": "مادر شوهرم",
|
228 |
+
"ماهگیمون": "ماه گیمون",
|
229 |
+
"مادرشوهرمحترم": "مادر شوهر محترم",
|
230 |
+
"شوهرداری": "شوهر داری",
|
231 |
+
"سرمایهگذارها": "سرمایه گذارها",
|
232 |
+
"بهرهمند": "بهرهمند",
|
233 |
+
"درمانهایی": "درمانهایی",
|
234 |
+
"عامدانهتر": "عامدانهتر",
|
235 |
+
"تازهوارد": "تازه وارد",
|
236 |
+
"مونتهویدئو": "مونته ویدئو",
|
237 |
+
"ذائقهاش": "ذائقهاش",
|
238 |
+
"گوشهگیرتر": "گوشهگیرتر",
|
239 |
+
"دنبالهدار": "دنبالهدار",
|
240 |
+
"بیخانمانها": "بیخانمانها",
|
241 |
+
"سرمایهدارها": "سرمایهدارها",
|
242 |
+
"مادرشوهریم": "مادر شوهریم",
|
243 |
+
"صبحانهاش": "صبحانهاش",
|
244 |
+
"جنازهست": "جنازه است",
|
245 |
+
"شمارهات": "شمارهای",
|
246 |
+
"بهقدری": "به قدری",
|
247 |
+
"کیسهی": "کیسهی",
|
248 |
+
"کوششهایی": "کوششهایی",
|
249 |
+
"مادرشوهر": "مادر شوهر",
|
250 |
+
"رابطهی": "رابطهی",
|
251 |
+
"نوشتهاند": "نوشتهاند",
|
252 |
+
"کنجکاوانهی": "کنجکاوانهی",
|
253 |
+
"غیرمتعهد": "غیر متعهد",
|
254 |
+
"کردهای": "کردهای",
|
255 |
+
"وهمکارانم": "و همکارانم",
|
256 |
+
"گردهمآیی": "گردهمایی",
|
257 |
+
"اللهوردی": "الله وردی",
|
258 |
+
"صرفهجویی": "صرفه جویی",
|
259 |
+
"ماندهاند": "ماندهاند",
|
260 |
+
"برنامهنویسی": "برنامهنویسی",
|
261 |
+
"امینمهدی": "امین مهدی",
|
262 |
+
"سهامدارنی": "سهام دارانی",
|
263 |
+
"مسابقهی": "مسابقهی",
|
264 |
+
"ستارهشناسم": "ستار شناسم",
|
265 |
+
"گرفتهاند": "گرفتهاند",
|
266 |
+
"جامعهشان": "جامعهشان",
|
267 |
+
"بچهی": "بچهی",
|
268 |
+
"شیوهی": "شیوهی",
|
269 |
+
"بهکار": "به کار",
|
270 |
+
"بهتراست": "بهتر است",
|
271 |
+
"سروکلهشون": "سر و کلهشون",
|
272 |
+
"رسیدهمسرش": "رسید همسرش",
|
273 |
+
"پسراهل": "پسر اهل",
|
274 |
+
"پروژههای": "پروژههای",
|
275 |
+
"عاقلانهام": "عاقلانهام",
|
276 |
+
"گذاشتهاند": "گذاشتهاند",
|
277 |
+
"کردهام": "کردهام",
|
278 |
+
"اندازهگیری": "اندازه گیری",
|
279 |
+
"یاوهگویی": "یاوه گویی",
|
280 |
+
"سازمانهایی": "سازمانهایی",
|
281 |
+
"نمودهاند": "نمودهاند",
|
282 |
+
"تنهاییآور": "تنهایی آور",
|
283 |
+
"قراردهیم": "قرار دهیم",
|
284 |
+
"ازشوهرجان": "از شوهر جان",
|
285 |
+
"کرهجنوبی": "کره جنوبی",
|
286 |
+
"توهینآمیز": "توهین آمیز",
|
287 |
+
"فنآوریهایی": "فناوریهایی",
|
288 |
+
"داشتهاید": "داشتهاید",
|
289 |
+
"شدهایم": "شدهایم",
|
290 |
+
"نمیفهمم": "نمیفهمم",
|
291 |
+
"مثالهایی": "مثالهایی",
|
292 |
+
"رییسجمهور": "رییس جمهور",
|
293 |
+
"مجموعهی": "مجموعهی",
|
294 |
+
"درندهاند": "درندهاند",
|
295 |
+
"امابهش": "اما بهش",
|
296 |
+
"بازخواهند": "باز خواهند",
|
297 |
+
"برنامههایی": "برنامههایی",
|
298 |
+
"یهجا": "یه جا",
|
299 |
+
"زگیلهایی": "زگیلهایی",
|
300 |
+
"وسیلهی": "وسیلهی",
|
301 |
+
"بهمنیار": "بهمن یار",
|
302 |
+
"دادهام": "دادهام",
|
303 |
+
"بههنگام": "به هنگام",
|
304 |
+
"بهدروغ": "به دروغ",
|
305 |
+
"دورافتادهترین": "دور افتادهترین",
|
306 |
+
"نامهایی": "نامهایی",
|
307 |
+
"سهقسمتی": "سه قسمتی",
|
308 |
+
"توجهازچیدن": "توجه از چیدن",
|
309 |
+
"پیامرسانها": "پیام رسانها",
|
310 |
+
"بهمنزاد": "بهمن زاد",
|
311 |
+
"نشانههایی": "نشانههایی",
|
312 |
+
"راهحلهای": "راه حلهای",
|
313 |
+
"راهحلهایی": "راه حلهایی",
|
314 |
+
"راهحلهای": "راه حلهای",
|
315 |
+
"نظرخواهیها": "نظر خواهیها",
|
316 |
+
"نظرخواهیها": "نظر خواهیها",
|
317 |
+
"کندهی": "کندهی",
|
318 |
+
"حرامزادههای": "حرام زادههای",
|
319 |
+
"شبیهسازیهایی": "شبیه سازیهایی",
|
320 |
+
"مهارتهایی": "مهارتهایی",
|
321 |
+
"روبهرویشان": "رو به رویشان",
|
322 |
+
"برجستهترین": "برجستهترین",
|
323 |
+
"نمیفهمیدم": "نمیفهمیدم",
|
324 |
+
"دستگاههایی": "دستگاههایی",
|
325 |
+
"برادرشوهر": "برادر شوهر",
|
326 |
+
"گرسنهام": "گرستهام",
|
327 |
+
"گرسنههام": "گرستهام",
|
328 |
+
"قهوهخوری": "قهوه خوری",
|
329 |
+
"دادهاید": "دادهاید",
|
330 |
+
"بهآرامی": "به آرمانی",
|
331 |
+
"دانستنیهاست": "دانستنیهاست",
|
332 |
+
"بهراحتی": "به راحتی",
|
333 |
+
"ایدهپردازی": "ایدهپردازی",
|
334 |
+
"ریشسفیدهای": "ریش سفیدهای",
|
335 |
+
"خفهمون": "خفه مون",
|
336 |
+
"بهجای": "به جای",
|
337 |
+
"ریزخشونتها": "ریز خشونتها",
|
338 |
+
"ریزخشونتها": "ریز خشونتها",
|
339 |
+
"حساسیتهایی": "حساسیتهایی",
|
340 |
+
"پشتصحنهی": "پشت صحنهی",
|
341 |
+
"کلهی": "کلهی",
|
342 |
+
"تاشوهرم": "تا شوهرم",
|
343 |
+
"آیندهاش": "آیندهاش",
|
344 |
+
"پروانههایی": "پروانههایی",
|
345 |
+
"خوبیهایی": "خوبیهایی",
|
346 |
+
"نرمافزارها": "نرمافزارها",
|
347 |
+
"رساندهاند": "رساندهاند",
|
348 |
+
"سرمایهگذارنی": "سرمایه گذارانی",
|
349 |
+
"تکهچسبانی": "تکه چسبانی",
|
350 |
+
"بیتوجهی": "بی توجهی",
|
351 |
+
"جاهطلبی": "جاه طلبی",
|
352 |
+
"پرغلغلهتان": "پر غلغلهتان",
|
353 |
+
"خمینیشهر": "خمینی شهر",
|
354 |
+
"رشتهتوییت": "رشته توییت",
|
355 |
+
"موهبتهایی": "موهبتهایی",
|
356 |
+
"برنامهی": "برنامهی",
|
357 |
+
"مادرشوهردارم": "مادر شوهر داردم",
|
358 |
+
"سیاهپوستان": "سیاه پوستان",
|
359 |
+
"شرکتهایی": "شرکتهایی",
|
360 |
+
"نیاوردهاند": "نیاوردهاند",
|
361 |
+
"آنهم": "آن هم",
|
362 |
+
"شوهرداریم": "شوهر داریم",
|
363 |
+
"یکچهارم": "یک چهارم",
|
364 |
+
"پروندههاست": "پرونده هاست",
|
365 |
+
"برنامهت": "برنامهات",
|
366 |
+
"چروکیدهمان": "چروکیدهمان",
|
367 |
+
"زمینهسازی": "زمینه سازی",
|
368 |
+
"زدهاند": "زدهاند",
|
369 |
+
"اظهارنظرپرداختن": "اظهار نظر پرداختن",
|
370 |
+
"صلحطلبانهترین": "صلح طلبانهترین",
|
371 |
+
"بهغلط": "به غلط",
|
372 |
+
"ایدهآلم": "ایده آلم",
|
373 |
+
"سیاهکاران": "سیاه کاران",
|
374 |
+
"امیرابراهیم": "امیر ابراهیم",
|
375 |
+
"توسعهدهندگان": "توسعه دهندگان",
|
376 |
+
"لحظهی": "لحظهی",
|
377 |
+
"امینطاها": "امین طاها",
|
378 |
+
"بینالنهرین": "بین النهرین",
|
379 |
+
"نیمهوقت": "نیمه وقت",
|
380 |
+
"پیادهروی": "پیاده روی",
|
381 |
+
"آلودهاند": "آلودهاند",
|
382 |
+
"گریهکرد": "گره کرد",
|
383 |
+
"نعمتهایی": "نعمتهایی",
|
384 |
+
"مادرشوهرشماهم": "مادر شوهر شما هم",
|
385 |
+
"آشپزخونهاس": "آشپزخونهاس",
|
386 |
+
"مسابقهها": "مسابقهها",
|
387 |
+
"مسابقهای": "مسابقههای",
|
388 |
+
"برنامهریزی": "برنامهریزی",
|
389 |
+
"بازخواهید": "باز خواهید",
|
390 |
+
"جوییما": "جویی ما",
|
391 |
+
"آماده ایم": "آمادهایم",
|
392 |
+
"مدلسازی": "مدلسازی",
|
393 |
+
"درصورتیکه": "در صورتیکه",
|
394 |
+
"آمریکاییات": "آمریکاییات",
|
395 |
+
"مادریاش": "مادریاش",
|
396 |
+
"غافلگیرکننده": "غافلگیر کننده",
|
397 |
+
"پیکرتراشی": "پیکر تراشی",
|
398 |
+
"اذیتوآزار": "اذیت و آزار",
|
399 |
+
"امتیازاورترین": "امتیاز آور",
|
400 |
+
"جیکجیک": "جیک جیک",
|
401 |
+
"تاشب": "تا شب",
|
402 |
+
"کپیرایت": "کپی رایت",
|
403 |
+
"آنتیبادی": "آنتی بادی",
|
404 |
+
"عجیبتر": "عجیبتر",
|
405 |
+
"استانداردسازی": "استاندارد سازی",
|
406 |
+
"هشتادوهشت": "هشتاد و هشت",
|
407 |
+
"متنوعتر": "متنوعتر",
|
408 |
+
"منظورانجام": "منظور انجام",
|
409 |
+
"نگرانکنندهترین": "نگران کنندهترین",
|
410 |
+
"شگفتانگیز": "شگفت انگیز",
|
411 |
+
"رنگینپوست": "رنگین پو��ت",
|
412 |
+
"فارغ التحصیلان": "فارغالتحصیلان",
|
413 |
+
"ترسناکتر": "ترسناکتر",
|
414 |
+
"لا رامبلا": "لارامبلا",
|
415 |
+
"پرجمعیتترین": "پرجمعیتترین",
|
416 |
+
"درمیآیند": "درمیآیند",
|
417 |
+
"باشمالکی": "باشم الکی",
|
418 |
+
"وسیعتر": "وسیعتر",
|
419 |
+
"فاحشهخانه": "فاحشه خانه",
|
420 |
+
"بااحتیاط": "با احتیاط",
|
421 |
+
"قانعکننده": "قانعکننده",
|
422 |
+
"انعطافپذیری": "انعطافپذیری",
|
423 |
+
"بیتالمقدس": "بیتالمقدس",
|
424 |
+
"اوپناستریتمپ": "اوپن استریت مپ",
|
425 |
+
"روزابارونی": "روزا بارونی",
|
426 |
+
"محافظهکارانه": "محافظه کارانه",
|
427 |
+
"فوتبالدستی": "فوتبال دستی",
|
428 |
+
"توسعهدهنده": "توسعه دهنده",
|
429 |
+
"قانونگزاران": "قانون گزاران",
|
430 |
+
"العسریسرا": "العسر یسرا",
|
431 |
+
"خارقالعاده": "خارقالعاده",
|
432 |
+
"بیماریمزمن": "بیماری مزمن",
|
433 |
+
"بادوستانتان": "با دوستانتان",
|
434 |
+
"برابربیشتر": "برابر بیشتر",
|
435 |
+
"ارائهدهنده": "ارائه دهنده",
|
436 |
+
"طوفانزدگان": "طوفان زندگان",
|
437 |
+
"امینمحمد": "امین محمد",
|
438 |
+
"محیطزیست": "محیط زیست",
|
439 |
+
"شقیترینشان": "شقیترینشان",
|
440 |
+
"بودواقعا": "بود واقعا",
|
441 |
+
"نیویورکتایمز": "نیویورک تایمز",
|
442 |
+
"ریودوژانیرو": "ریو دو ژانیرو",
|
443 |
+
"مشترکالمنافع": "مشترکالمنافع",
|
444 |
+
"اسلایدسازم": "اسلاید سازم",
|
445 |
+
"نمیآوریدش": "نمیآوریدش",
|
446 |
+
"بینالملل": "بینالملل",
|
447 |
+
"مصرفکنندگان": "مصرف کنندگان",
|
448 |
+
"امینالدین": "امین الدین",
|
449 |
+
"امریکااینقدر": "امریکا اینقدر",
|
450 |
+
"بعضیاوقات": "بعضی اوقات",
|
451 |
+
"خاطربچه": "خاطر بچه",
|
452 |
+
"ایناکیلویی": "اینا کیلویی",
|
453 |
+
"ویکیپدیا": "ویکیپدیا",
|
454 |
+
"مافکرمیکنیم": "ما فکر میکنیم",
|
455 |
+
"انگلیسیزبان": "انگلیسی زبان",
|
456 |
+
"کلهشون": "کلهشون",
|
457 |
+
"آدمبزرگی": "آرم بزرگی",
|
458 |
+
"مر آت مر آه": "مرآت مرآت",
|
459 |
+
"آسیبزد": "آسیب زد",
|
460 |
+
"آیآرسی": "آی آرسی",
|
461 |
+
"آسیااقیانوسیه": "آسیا اقیانوسیه",
|
462 |
+
"آیای": "آیا",
|
463 |
+
"میانجنسی": "میان جنسی",
|
464 |
+
"میاننسلی": "میان نسلی",
|
465 |
+
"میانافزارها": "میان افزارها",
|
466 |
+
"آییننامه": "آییننامه",
|
467 |
+
"ارائهشده": "ارائهشده",
|
468 |
+
"اشپزخونه": "آشپزخونه",
|
469 |
+
"اماعلتشونمیپرسه": "اما علتشو نمیپرسه",
|
470 |
+
"امیدوارکننده": "امیدوار کننده",
|
471 |
+
"ایالاتمتحده": "ایالات متحده",
|
472 |
+
"بااینکه": "با اینکه",
|
473 |
+
"بلندپروازانه": "بلند پروازانه",
|
474 |
+
"بهترازاینه": "بهتر از اینه",
|
475 |
+
"بهدستآمده": "به دستآمده",
|
476 |
+
"بهوسیله": "به وسیله",
|
477 |
+
"بیادبانه": "بی ادبانه",
|
478 |
+
"بیاندازه": "بی اندازه",
|
479 |
+
"بیصبرانه": "بی صبرانه",
|
480 |
+
"بیفایده": "بی فایده",
|
481 |
+
"بیمهره": "بی مهره",
|
482 |
+
"بینظیره": "بی نظیره",
|
483 |
+
"تاریخزده": "تاریخ زده",
|
484 |
+
"تهرانزده": "تهران زده",
|
485 |
+
"تولیدشده": "تولید شده",
|
486 |
+
"تولیدکننده": "تولید کننده",
|
487 |
+
"تکمیلشده": "تکمیل شده",
|
488 |
+
"جاافتاده": "جا افتاده",
|
489 |
+
"جمعآوریکننده": "جمع آوری کننده",
|
490 |
+
"جورآدمیه": "جور آدمیه",
|
491 |
+
"حقالزحمه": "حق الزحمه",
|
492 |
+
"دخترونهتره": "دخترونه تره",
|
493 |
+
"دوپنجره": "دو پنجره",
|
494 |
+
"ذاتالریه": "ذاتالریه",
|
495 |
+
"راسالخیمه": "راسالخیمه",
|
496 |
+
"رنگماده": "رنگ ماده",
|
497 |
+
"سوئاستفاده": "سو استفاده",
|
498 |
+
"سواستفاده": "سو استفاده",
|
499 |
+
"شبهجزیره": "شبه جزیره",
|
500 |
+
"صادرکننده": "صادر کننده",
|
501 |
+
"ضررداره": "ضرر داره",
|
502 |
+
"عابرپیاده": "عابر پیاده",
|
503 |
+
"فوقالعاده": "فوقالعاده",
|
504 |
+
"قابلتوجه": "قابل توجه",
|
505 |
+
"قانعکننده": "قانع کننده",
|
506 |
+
"مادربیچاره": "مادر بیچاره",
|
507 |
+
"مشخصشده": "مشخص شده",
|
508 |
+
"مصرفکننده": "مصرف کننده",
|
509 |
+
"مصیبتزده": "مصیب تزده",
|
510 |
+
"ناامیدکننده": "ناامید کننده",
|
511 |
+
"نیمفاصله": "نیمفاصله",
|
512 |
+
"هماهنگکننده": "هماهنگ کننده",
|
513 |
+
"همهجانبه": "همه جانبه",
|
514 |
+
"واردکننده": "وارد کننده",
|
515 |
+
"وخوابگاه": "و خوابگاه",
|
516 |
+
"ودستگاه": "و دستگاه",
|
517 |
+
"وزردچوبه": "و زردچوبه",
|
518 |
+
"وپروانه": "و پروانه",
|
519 |
+
"پدرخوانده": "پدر خوانده",
|
520 |
+
"چاپشده": "چاپ شده",
|
521 |
+
"کردته": "کرد ته",
|
522 |
+
"کردندکه": "کردند که",
|
523 |
+
"یکطرفه": "یک طرفه",
|
524 |
+
"پایینتره": "پایینتره",
|
525 |
+
"اشتراکگذاری": "اشتراک گذاری",
|
526 |
+
"انحصارگراناند": "انحصار گراناند",
|
527 |
+
"خوشحالییییی": "خوشحالی",
|
528 |
+
"همتیمیهایشان": "هم تیمیهایشان",
|
529 |
+
"پایدارامباید": "پایدارام باید",
|
530 |
+
"پرجنبوجوشتر": "پر جنب و جوشتر",
|
531 |
+
"آبمروارید": "آب مروارید",
|
532 |
+
"آتشسوزی": "آتش سوزی",
|
533 |
+
"آتشنشانی": "آتشنشانی",
|
534 |
+
"آتشنشان": "آتشنشان",
|
535 |
+
"آرامشبخش": "آرامش بخش",
|
536 |
+
"آشناداشتن": "آشنا داشتن",
|
537 |
+
"آقاچیزی": "آقا چیزی",
|
538 |
+
"آموختهام": "آموختهام",
|
539 |
+
"آموزششان": "آموزششان",
|
540 |
+
"ازآنجا": "از آنجا",
|
541 |
+
"ازالان": "از الان",
|
542 |
+
"ازاینجا": "از اینجا",
|
543 |
+
"ازجیبش": "از جیبش",
|
544 |
+
"ازدستش": "از دستش",
|
545 |
+
"ازدیوار": "از دیوار",
|
546 |
+
"ازشغلشون": "از شغلشون",
|
547 |
+
"ازوقتی": "از وقتی",
|
548 |
+
"ازکسانی": "از کسانی",
|
549 |
+
"اسباببازی": "اسباب بازی",
|
550 |
+
"اسبسوار": "اسب سوار",
|
551 |
+
"اصیلزاده": "اصیل زاده",
|
552 |
+
"افتادهاید": "افتادهاید",
|
553 |
+
"الهام": "الهام",
|
554 |
+
"امااصلا": "اما اصلا",
|
555 |
+
"امااصلابه": "اما اصلا به",
|
556 |
+
"امااین": "اما این",
|
557 |
+
"امابعد": "اما بعد",
|
558 |
+
"امابعدیکی": "اما بعد یکی",
|
559 |
+
"اماجاذبه": "اما جاذبه",
|
560 |
+
"امرارمعاش": "امرار معاش",
|
561 |
+
"امکانپذیر": "امکان پذیر",
|
562 |
+
"انتهای": "انتهای",
|
563 |
+
"انتهایی": "انتهایی",
|
564 |
+
"ایزدبانوی": "ایزد بانوی",
|
565 |
+
"بااینحال": "با اینحال",
|
566 |
+
"باحتمال": "به احتمال",
|
567 |
+
"باحجاب": "با حجاب",
|
568 |
+
"باخنده": "با خنده",
|
569 |
+
"بادوستاش": "با دوستاش",
|
570 |
+
"بارمان": "بار مان",
|
571 |
+
"بازتر": "باز تر",
|
572 |
+
"باطعنه": "با طعنه",
|
573 |
+
"بافریاد": "با فریاد",
|
574 |
+
"بارگزاری": "بارگذاری",
|
575 |
+
"بالامنم": "بالا منم",
|
576 |
+
"بگیرمامان": "بگیر مامان",
|
577 |
+
"بیاحترامی": "بی احترامی",
|
578 |
+
"بیادبی": "بی ادبی",
|
579 |
+
"بیاعتنا": "بی اعتنا",
|
580 |
+
"بیدارباش": "بیدار باش",
|
581 |
+
"بیشازحد": "بیش از حد",
|
582 |
+
"بیمسئولیت": "بی مسئولیت",
|
583 |
+
"تاسفبار": "تاسف بار",
|
584 |
+
"تامشکلمون": "تا مشکلمون",
|
585 |
+
"تانقشه": "تا نقشه",
|
586 |
+
"تصمیمگیری": "تصمیم گیری",
|
587 |
+
"تقسیمبندی": "تقسیم بندی",
|
588 |
+
"تقصیرارو": "تقصیرا رو",
|
589 |
+
"جدیدابرای": "جدیدا برای",
|
590 |
+
"جعبهابزار": "جعبه ابزار",
|
591 |
+
"جلوتونو": "جلو تو نو",
|
592 |
+
"حاضردر": "حاضر در",
|
593 |
+
"حاضرنیست": "حاضر نیست",
|
594 |
+
"دستنخورده": "دست نخورده",
|
595 |
+
"دوامتیاز": "دو امتیاز",
|
596 |
+
"دوروزتمام": "دو روز تمام",
|
597 |
+
"شخصیسازی": "شخصیسازی",
|
598 |
+
"شدواجناس": "شد و اجناس",
|
599 |
+
"شوهردارم": "شوهر دارم",
|
600 |
+
"شوهرشماهم": "شوهر شما هم",
|
601 |
+
"شوهرمحترم": "شوهر محترم",
|
602 |
+
"شکلگیری": "شکل گیری",
|
603 |
+
"صخرهنوردی": "صخرهنوردی",
|
604 |
+
"صدوبیست": "صد و بیست",
|
605 |
+
"عقبنشینی": "عقب نشینی",
|
606 |
+
"عکسالعمل": "عکسالعمل",
|
607 |
+
"غرغرمیکنم": "غرغر میکنم",
|
608 |
+
"هزاربار": "هزار بار",
|
609 |
+
"هزارتومان": "هزار تومان",
|
610 |
+
"هزارجور": "هزار جور",
|
611 |
+
"هزاروسیصد": "هزار و سیصد",
|
612 |
+
"هممیهنان": "هم میهنان",
|
613 |
+
"هممیهنانش": "هم میهنانش",
|
614 |
+
"همنسلانش": "هم نسلانش",
|
615 |
+
"همهگیری": "همه گیری",
|
616 |
+
"هییییچ": "هیچ",
|
617 |
+
"وقتاخیلی": "وقتا خیلی",
|
618 |
+
"وقتابه": "وقتا به",
|
619 |
+
"وقتگذرانی": "وقت گذرانی",
|
620 |
+
"ومحکوم": "و محکوم",
|
621 |
+
"ومحیطها": "و محیطها",
|
622 |
+
"وکشورتان": "و کشورتان",
|
623 |
+
"ویکیمدیا": "ویکیمدی��",
|
624 |
+
"یهوگفت": "یهو گفت",
|
625 |
+
"اینجااز": "اینجا از",
|
626 |
+
}
|
627 |
+
fixator_dictionary = {
|
628 |
+
"بهای": "بهای",
|
629 |
+
"بهترین": "بهترین",
|
630 |
+
"آستر": "آستر",
|
631 |
+
"ارکستر": "ارکستر",
|
632 |
+
"انتر": "انتر",
|
633 |
+
"بستر": "بستر",
|
634 |
+
"بهتر": "بهتر",
|
635 |
+
"بهترتر": "بهترتر",
|
636 |
+
"توئیتر": "تویتتر",
|
637 |
+
"توییتر": "توییتر",
|
638 |
+
"تیتر": "تیتر",
|
639 |
+
"دختر": "دختر",
|
640 |
+
"دفتر": "دفتر",
|
641 |
+
"دلستر": "دلستر",
|
642 |
+
"دکتر": "دکتر",
|
643 |
+
"شتر": "شتر",
|
644 |
+
"لیتر": "لیتر",
|
645 |
+
"متر": "متر",
|
646 |
+
"هیپستر": "هیپستر",
|
647 |
+
"پیتر": "پیتر",
|
648 |
+
"چتر": "چتر",
|
649 |
+
"کمتر": "کمتر",
|
650 |
+
"گنگستر": "گنگستر",
|
651 |
+
"انگشتر": "انگشتر",
|
652 |
+
"سنتر": "سنتر",
|
653 |
+
"تویتتر": "توییتر",
|
654 |
+
"مادهشتر": "ماده شتر",
|
655 |
+
"ویترین": "ویترین",
|
656 |
+
"کرونومتر": "کرنومتر",
|
657 |
+
"کهتر": "کهتر",
|
658 |
+
"فیلتر": "فیلتر",
|
659 |
+
"الهام": "الهام",
|
660 |
+
"آلمان": "آلمان",
|
661 |
+
"انتهای": "انتهای",
|
662 |
+
"انتهایی": "انتهایی",
|
663 |
+
"آموختهام": "آموختهام",
|
664 |
+
}
|
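Both tables defined above are plain `str -> str` maps: `dictionary_mapping` is applied to whole sentences in a single regex pass, and `fixator_dictionary` is looked up per word to undo over-eager suffix splitting. Below is a minimal sketch of that regex pass, mirroring `multiple_replace` in src/normalizer.py; the sample input is only illustrative:

```python
import re

from dictionary import dictionary_mapping


def multiple_replace(text: str, mapping: dict) -> str:
    # One alternation over all keys; each match is replaced via the map.
    pattern = "|".join(map(re.escape, mapping.keys()))
    return re.sub(pattern, lambda m: mapping[m.group()], str(text))


# Arabic "ي" becomes Persian "ی", and Latin letters are spelled out in Persian.
print(multiple_replace("ويکی wiki", dictionary_mapping))
```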
src/normalizer.py
ADDED
@@ -0,0 +1,227 @@
1 |
+
from parsivar import Normalizer
|
2 |
+
from parsivar import SpellCheck
|
3 |
+
|
4 |
+
import num2fawords
|
5 |
+
import re
|
6 |
+
import string
|
7 |
+
|
8 |
+
from dictionary import dictionary_mapping, fixator_dictionary
|
9 |
+
|
10 |
+
_normalizer = Normalizer(half_space_char="\u200c", statistical_space_correction=True)
|
11 |
+
_spell = SpellCheck()
|
12 |
+
chars_to_ignore = [
|
13 |
+
",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
|
14 |
+
"#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬", 'ٔ', ",", "?",
|
15 |
+
".", "!", "-", ";", ":", '"', "“", "%", "‘", "”", "�", "–", "…", "_", "”", '“', '„',
|
16 |
+
'ā', 'š', 'ّ', 'ْ',
|
17 |
+
]
|
18 |
+
chars_to_ignore = chars_to_ignore + list(string.ascii_lowercase + string.digits)
|
19 |
+
chars_to_ignore = f"""[{"".join(chars_to_ignore)}]"""
|
20 |
+
zwnj = "\u200c"
|
21 |
+
silent_chars = ["ا", "د", "ذ", "ر", "ز", "و", "آ"] + [zwnj] + [" "]
|
22 |
+
|
23 |
+
|
24 |
+
def multiple_replace(text, chars_to_mapping):
|
25 |
+
pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
|
26 |
+
return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
|
27 |
+
|
28 |
+
|
29 |
+
def remove_special_characters(text, chars_to_ignore_regex):
|
30 |
+
text = re.sub(chars_to_ignore_regex, '', text).lower() + " "
|
31 |
+
return text
|
32 |
+
|
33 |
+
|
34 |
+
def convert_word_nums_to_text(word):
|
35 |
+
try:
|
36 |
+
word = int(word)
|
37 |
+
word = num2fawords.words(word)
|
38 |
+
except:
|
39 |
+
word = word
|
40 |
+
|
41 |
+
return word
|
42 |
+
|
43 |
+
|
44 |
+
def normalizer_at_word_level(text):
|
45 |
+
words = text.split()
|
46 |
+
_text = []
|
47 |
+
|
48 |
+
for word in words:
|
49 |
+
word = convert_word_nums_to_text(word)
|
50 |
+
word = fixator_dictionary.get(word, word)
|
51 |
+
|
52 |
+
_text.append(word)
|
53 |
+
|
54 |
+
return " ".join(_text) + " "
|
55 |
+
|
56 |
+
|
57 |
+
def finder(ss, s, starter=False):
|
58 |
+
found = []
|
59 |
+
for m in re.finditer(ss, s):
|
60 |
+
if starter:
|
61 |
+
found.append(m.start())
|
62 |
+
else:
|
63 |
+
found.append((m.start(), m.end()))
|
64 |
+
|
65 |
+
return found
|
66 |
+
|
67 |
+
|
68 |
+
def substring_replace(ss, s, start, end, stripped=True):
|
69 |
+
s_start = s[:start]
|
70 |
+
s_end = s[end:]
|
71 |
+
|
72 |
+
counter = 0
|
73 |
+
if stripped:
|
74 |
+
counter = 1 if s_start.endswith(" ") else counter
|
75 |
+
s_start = s_start.rstrip()
|
76 |
+
|
77 |
+
return s_start + ss + s_end, counter
|
78 |
+
|
79 |
+
|
80 |
+
def normalizer(
|
81 |
+
batch,
|
82 |
+
is_normalize=True,
|
83 |
+
is_spell_check=False,
|
84 |
+
return_dict=True,
|
85 |
+
filter_trivials=False,
|
86 |
+
remove_extra_space=False
|
87 |
+
):
|
88 |
+
text = batch["sentence"].lower().strip()
|
89 |
+
|
90 |
+
# Parsivar normalizer
|
91 |
+
if is_normalize:
|
92 |
+
text = _normalizer.normalize(text)
|
93 |
+
|
94 |
+
# Dictionary mapping
|
95 |
+
text = multiple_replace(text, dictionary_mapping)
|
96 |
+
text = re.sub(" +", " ", text)
|
97 |
+
|
98 |
+
# Remove specials
|
99 |
+
text = remove_special_characters(text, chars_to_ignore)
|
100 |
+
text = re.sub(" +", " ", text)
|
101 |
+
|
102 |
+
# Replace connected آ
|
103 |
+
special, pointer = "آ", int("0")
|
104 |
+
for f in sorted(finder(special, text, True)):
|
105 |
+
index = f + pointer - 1
|
106 |
+
if len(text) >= index:
|
107 |
+
if text[index] not in silent_chars:
|
108 |
+
new_text, extra_pointer = substring_replace(
|
109 |
+
f"{text[index]}{zwnj}", text, index, index + 1, stripped=True)
|
110 |
+
text = new_text
|
111 |
+
pointer += 1 + 1 - 1 - extra_pointer
|
112 |
+
|
113 |
+
# Replace connected ها
|
114 |
+
pointer = int("0")
|
115 |
+
special_list = [
|
116 |
+
# "ام", "ای", "است", "ایم", "اید", "اند",
|
117 |
+
"هایمان", "هایم", "هایت", "هایش",
|
118 |
+
"هایتان", "هایشان", "هام", "هات",
|
119 |
+
"هاتان", "هامون", "هامان", "هاش",
|
120 |
+
"هاتون", "هاشان", "هاشون",
|
121 |
+
"هایی", "های", "هاس", "ها"
|
122 |
+
]
|
123 |
+
for special in special_list:
|
124 |
+
pointer = 0
|
125 |
+
text = text
|
126 |
+
for f in sorted(finder(special, text, False)):
|
127 |
+
start, end = f[0] + pointer - 1, f[1] + pointer - 1
|
128 |
+
if len(text) >= (end + 1):
|
129 |
+
if len(text) == (end + 1):
|
130 |
+
new_text, extra_pointer = substring_replace(
|
131 |
+
f"{zwnj}{special}",
|
132 |
+
text,
|
133 |
+
start + 1,
|
134 |
+
end + 1,
|
135 |
+
stripped=True)
|
136 |
+
text = new_text
|
137 |
+
pointer += 1 + 1 - 1 - extra_pointer
|
138 |
+
else:
|
139 |
+
if text[end + 1] == " ":
|
140 |
+
new_text, extra_pointer = substring_replace(
|
141 |
+
f"{zwnj}{special}",
|
142 |
+
text,
|
143 |
+
start + 1,
|
144 |
+
end + 1,
|
145 |
+
stripped=True)
|
146 |
+
text = new_text
|
147 |
+
pointer += 1 + 1 - 1 - extra_pointer
|
148 |
+
|
149 |
+
special, pointer = "افزار", int("0")
|
150 |
+
for f in sorted(finder(special, text, False)):
|
151 |
+
start, end = f[0] + pointer - 1, f[1] + pointer - 1
|
152 |
+
|
153 |
+
if len(text) >= (end + 1):
|
154 |
+
new_text, extra_pointer = substring_replace(f"{zwnj}{special}", text, start + 1, end + 1, stripped=True)
|
155 |
+
text = new_text
|
156 |
+
pointer += 1 + 1 - 1 - extra_pointer
|
157 |
+
|
158 |
+
# Replace connected ها
|
159 |
+
pointer = int("0")
|
160 |
+
special_list = [
|
161 |
+
"ترین", "تر"
|
162 |
+
]
|
163 |
+
for special in special_list:
|
164 |
+
pointer = 0
|
165 |
+
text = text
|
166 |
+
for f in sorted(finder(special, text, False)):
|
167 |
+
start, end = f[0] + pointer - 1, f[1] + pointer - 1
|
168 |
+
if len(text) >= (end + 1):
|
169 |
+
if len(text) == (end + 1):
|
170 |
+
new_text, extra_pointer = substring_replace(
|
171 |
+
f"{zwnj}{special}",
|
172 |
+
text,
|
173 |
+
start + 1,
|
174 |
+
end + 1,
|
175 |
+
stripped=True)
|
176 |
+
text = new_text
|
177 |
+
pointer += 1 + 1 - 1 - extra_pointer
|
178 |
+
else:
|
179 |
+
if text[end + 1] == " ":
|
180 |
+
new_text, extra_pointer = substring_replace(
|
181 |
+
f"{zwnj}{special}",
|
182 |
+
text,
|
183 |
+
start + 1,
|
184 |
+
end + 1,
|
185 |
+
stripped=True)
|
186 |
+
text = new_text
|
187 |
+
pointer += 1 + 1 - 1 - extra_pointer
|
188 |
+
|
189 |
+
# Parsivar spell correction
|
190 |
+
if is_spell_check:
|
191 |
+
text = _normalizer.normalize(_spell.spell_corrector(text))
|
192 |
+
|
193 |
+
# Normalizer at word level
|
194 |
+
text = normalizer_at_word_level(text)
|
195 |
+
text = re.sub(" +", " ", text)
|
196 |
+
|
197 |
+
if remove_extra_space:
|
198 |
+
text = text.strip()
|
199 |
+
else:
|
200 |
+
text = text.strip() + " "
|
201 |
+
|
202 |
+
if filter_trivials:
|
203 |
+
if not len(text) > 2:
|
204 |
+
text = None
|
205 |
+
|
206 |
+
if not return_dict:
|
207 |
+
return text
|
208 |
+
|
209 |
+
batch["sentence"] = text
|
210 |
+
return batch
|
211 |
+
|
212 |
+
|
213 |
+
if __name__ == '__main__':
|
214 |
+
input_text = "سلام بر شما که میآیید و میآموزید که بیآرآیم"
|
215 |
+
print(normalizer({"sentence": input_text}, return_dict=False))
|
216 |
+
|
217 |
+
input_text = "کتابهایمان میدانی کجاها ماههاس که کیهامون و کیهان دنبالههاشون برای بهای هستند."
|
218 |
+
print(normalizer({"sentence": input_text}, return_dict=False))
|
219 |
+
|
220 |
+
input_text = " میانافزارهای امروزی نرمافزار سخت افزار امروز نوشتافزار ها"
|
221 |
+
print(normalizer({"sentence": input_text}, return_dict=False))
|
222 |
+
|
223 |
+
input_text = "این کتاب بهترین در نوع شتر آسانتر هست"
|
224 |
+
print(normalizer({"sentence": input_text}, return_dict=False))
|
225 |
+
|
226 |
+
input_text = "سه چیز هست که از پژوهش در این زمینه آموختهام"
|
227 |
+
print(normalizer({"sentence": input_text}, return_dict=False))
|
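
For reference, a minimal sketch (not part of the committed file) of how this cleaning function plugs into a Hugging Face dataset: `normalizer` maps a dict with a `sentence` key to a dict, so it can be passed straight to `Dataset.map`. It assumes the `datasets` library is installed and that the script is run from `src/` so `normalizer` is importable.

import sys

from datasets import Dataset

sys.path.append("src")  # assumption: repository root as working directory
from normalizer import normalizer

# Toy dataset with the `sentence` column the function expects.
ds = Dataset.from_dict({"sentence": ["این کتاب بهترین در نوع خود است"]})

# Each example dict goes in, a dict with the normalized `sentence` comes out.
ds = ds.map(normalizer)
print(ds[0]["sentence"])  # prints the normalized sentence
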
src/requirements.txt
ADDED
@@ -0,0 +1,3 @@
num2fawords
parsivar
tensorboard
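
A quick, hypothetical sanity check for these dependencies (the printed values are illustrative; `parsivar` downloads its resources on first use):

import num2fawords
from parsivar import Normalizer

print(num2fawords.words(25))                 # the number 25 spelled out in Persian
print(Normalizer().normalize("سخت افزار"))   # Parsivar space / half-space normalization
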
src/run_config.py
ADDED
@@ -0,0 +1,108 @@
import ast
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple
from transformers import (
    HfArgumentParser,
    Wav2Vec2Config,
    Wav2Vec2FeatureExtractor
)

logger = logging.getLogger(__name__)


@dataclass
class ConfigArguments:
    """
    Arguments specifying which config we are going to set up.
    """
    output_dir: str = field(
        default=".",
        metadata={"help": "The output directory where the config will be written."},
    )
    name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "The model checkpoint for weights initialization. "
                    "Don't set if you want to train a model from scratch."
        },
    )
    config_params: Optional[str] = field(
        default=None,
        metadata={"help": "Custom configuration for the specific `name_or_path`"}
    )
    feature_extractor_params: Optional[str] = field(
        default=None,
        metadata={"help": "Custom feature extractor configuration for the specific `name_or_path`"}
    )

    def __post_init__(self):
        if self.config_params:
            try:
                self.config_params = ast.literal_eval(self.config_params)
            except Exception as e:
                print(f"Your custom `config` parameters are not acceptable due to {e}")

        if self.feature_extractor_params:
            try:
                self.feature_extractor_params = ast.literal_eval(self.feature_extractor_params)
            except Exception as e:
                print(f"Your custom `feature_extractor` parameters are not acceptable due to {e}")


def main():
    parser = HfArgumentParser([ConfigArguments])
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        config_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))[0]
    else:
        config_args = parser.parse_args_into_dataclasses()[0]
    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO)
    logger.info(f"Setting up configuration {config_args.name_or_path} with extra params {config_args.config_params}")
    if config_args.config_params and isinstance(config_args.config_params, dict):
        config = Wav2Vec2Config.from_pretrained(
            config_args.name_or_path,
            **config_args.config_params
        )
    else:
        config = Wav2Vec2Config.from_pretrained(
            config_args.name_or_path,
            mask_time_length=10,
            mask_time_prob=0.05,
            diversity_loss_weight=0.1,
            num_negatives=100,
            do_stable_layer_norm=True,
            feat_extract_norm="layer",
            vocab_size=40
        )

    logger.info(f"Setting up feature_extractor {config_args.name_or_path} with extra params "
                f"{config_args.feature_extractor_params}")
    if config_args.feature_extractor_params and isinstance(config_args.feature_extractor_params, dict):
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            config_args.name_or_path,
            **config_args.feature_extractor_params
        )
    else:
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            config_args.name_or_path,
            return_attention_mask=True
        )
    logger.info(f"Your `config` is saved here: {config_args.output_dir}/config.json")
    config.save_pretrained(config_args.output_dir)

    logger.info(f"Your `feature_extractor` is saved here: {config_args.output_dir}/preprocessor_config.json")
    feature_extractor.save_pretrained(config_args.output_dir)


if __name__ == '__main__':
    main()
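
The `--config_params` / `--feature_extractor_params` flags take a Python-literal string that `__post_init__` converts to a dict with `ast.literal_eval`. A small sketch of that round trip (the dict contents below are example overrides, not values used by the repository):

import ast

config_params = "{'mask_time_prob': 0.05, 'mask_time_length': 10, 'vocab_size': 40}"
parsed = ast.literal_eval(config_params)
assert isinstance(parsed, dict) and parsed["vocab_size"] == 40
# `main()` then forwards the parsed dict as keyword overrides:
# Wav2Vec2Config.from_pretrained(name_or_path, **parsed)
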
src/run_persian.sh
ADDED
@@ -0,0 +1,51 @@
#!/bin/bash

export LC_ALL=C.UTF-8
export LANG=C.UTF-8

export OUTPUT_DIR=/home/m3hrdadfi/code/wav2vec2-base-persian
export MODEL_NAME_OR_PATH=/home/m3hrdadfi/code/wav2vec2-base-persian


export TRAIN_FILE=/home/m3hrdadfi/code/data/train.csv
export VALIDATION_FILE=/home/m3hrdadfi/code/data/test.csv
export SPEECH_FILE_COLUMN=path


#export MAX_EVAL_SAMPLES=5000
export PER_DEVICE_TRAIN_BATCH_SIZE=32
export PER_DEVICE_EVAL_BATCH_SIZE=32
#export GRADIENT_ACCUMULATION_STEPS=2
export NUM_TRAIN_EPOCHS=5.0
export LEARNING_RATE=5e-4
export WARMUP_STEPS=1000
#export LOGGING_STEPS=500
#export EVAL_STEPS=2500
#export SAVE_STEPS=2500
export PREPROCESSING_NUM_WORKERS=4
export MAX_DURATION_IN_SECONDS=20.0
export ADAM_BETA_1=0.9
export ADAM_BETA_2=0.98
export WEIGHT_DECAY=0.01
export D_TYPE=bfloat16
export PAD_TO_MULTIPLE_OF=16384

python src/run_wav2vec2_pretrain_flax.py \
    --output_dir="$OUTPUT_DIR" \
    --train_file="$TRAIN_FILE" \
    --validation_file="$VALIDATION_FILE" \
    --speech_file_column="$SPEECH_FILE_COLUMN" \
    --model_name_or_path="$MODEL_NAME_OR_PATH" \
    --per_device_train_batch_size=$PER_DEVICE_TRAIN_BATCH_SIZE \
    --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \
    --preprocessing_num_workers=$PREPROCESSING_NUM_WORKERS \
    --max_duration_in_seconds=$MAX_DURATION_IN_SECONDS \
    --num_train_epochs=$NUM_TRAIN_EPOCHS \
    --learning_rate=$LEARNING_RATE \
    --warmup_steps=$WARMUP_STEPS \
    --weight_decay=$WEIGHT_DECAY \
    --adam_beta1=$ADAM_BETA_1 \
    --adam_beta2=$ADAM_BETA_2 \
    --dtype="$D_TYPE" \
    --pad_to_multiple_of=$PAD_TO_MULTIPLE_OF \
    --push_to_hub
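
For orientation, the effective (global) batch size implied by these settings, assuming an 8-device host such as a TPU v3-8 (the device count is an assumption, not something the launcher fixes; the training script computes it with jax.device_count()):

per_device_train_batch_size = 32   # PER_DEVICE_TRAIN_BATCH_SIZE above
device_count = 8                   # assumption: e.g. a TPU v3-8
global_batch_size = per_device_train_batch_size * device_count
print(global_batch_size)           # 256 samples per optimizer step
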
src/run_wav2vec2_pretrain_flax.py
ADDED
@@ -0,0 +1,638 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Team All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Pre-training the library's Wav2Vec2 models with Flax.
"""
# You can also adapt this script to your own pretraining task. Pointers for this are left as comments.

import logging
import sys
import time
from dataclasses import field
from pathlib import Path
from typing import Dict, List, Optional, Union

import numpy as np
from datasets import DatasetDict, load_dataset
from tqdm import tqdm

import flax
import jax
import jax.numpy as jnp
import librosa
import optax
from flax import jax_utils, traverse_util
from flax.training import train_state
from flax.training.common_utils import get_metrics, onehot, shard
from transformers import (
    FlaxWav2Vec2ForPreTraining,
    HfArgumentParser,
    TrainingArguments,
    Wav2Vec2Config,
    Wav2Vec2FeatureExtractor,
    is_tensorboard_available,
)
from transformers.models.wav2vec2.modeling_flax_wav2vec2 import _compute_mask_indices, _sample_negative_indices

from normalizer import normalizer

logger = logging.getLogger(__name__)


@flax.struct.dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    freeze_feature_extractor: Optional[bool] = field(
        default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
    )
    gradient_checkpointing: Optional[bool] = field(
        default=False, metadata={"help": "Whether to use gradient checkpointing to save memory at the expense of a slower backward pass."}
    )
    verbose_logging: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to log verbose messages or not."},
    )
    max_gumbel_temperature: Optional[float] = field(
        default=2.0, metadata={"help": "Maximum temperature for gumbel softmax."}
    )
    min_gumbel_temperature: Optional[float] = field(
        default=0.1, metadata={"help": "Minimum temperature for gumbel softmax."}
    )
    gumbel_temperature_decay: Optional[float] = field(
        default=0.999995, metadata={"help": "Decay of gumbel temperature during training."}
    )
    dtype: Optional[str] = field(
        default="float32",
        metadata={
            "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
        },
    )


@flax.struct.dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.
    """

    dataset_name: str = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    train_split_name: Optional[str] = field(
        default="train",
        metadata={
            "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
        },
    )
    validation_split_name: Optional[str] = field(
        default="validation",
        metadata={
            "help": "The name of the validation data set split to use (via the datasets library). Defaults to 'validation'"
        },
    )
    train_file: Optional[str] = field(
        default=None, metadata={"help": "The input training data file (a csv or JSON file)."}
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate on (a csv or JSON file)."},
    )
    speech_file_column: Optional[str] = field(
        default="file",
        metadata={"help": "Column in the dataset that contains speech file path. Defaults to 'file'"},
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    validation_split_percentage: Optional[int] = field(
        default=5,
        metadata={
            "help": "The percentage of the train set used as validation set in case there's no validation split"
        },
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_duration_in_seconds: Optional[float] = field(
        default=20.0, metadata={"help": "Filter audio files that are longer than `max_duration_in_seconds` seconds"}
    )
    pad_to_multiple_of: Optional[int] = field(
        default=1024,
        metadata={
            "help": "If set will pad the sequence to a multiple of the provided value. This is important to avoid triggering recompilations on TPU"
        },
    )


@flax.struct.dataclass
class FlaxDataCollatorForWav2Vec2Pretraining:
    """
    Data collator that will dynamically pad the inputs received and prepare masked indices
    for self-supervised pretraining.

    Args:
        model (:class:`~transformers.FlaxWav2Vec2ForPreTraining`):
            The Wav2Vec2 model used for pretraining. The data collator needs to have access
            to config and ``_get_feat_extract_output_lengths`` function for correct padding.
        feature_extractor (:class:`~transformers.Wav2Vec2FeatureExtractor`):
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    model: FlaxWav2Vec2ForPreTraining
    feature_extractor: Wav2Vec2FeatureExtractor
    padding: Union[bool, str] = "longest"
    pad_to_multiple_of: Optional[int] = None
    max_length: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], np.ndarray]]]) -> Dict[str, np.ndarray]:
        # reformat list to dict and set to numpy format
        batch = self.feature_extractor.pad(
            features,
            max_length=self.max_length,
            padding=self.padding,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="np",
        )
        mask_indices_seq_length = self.model._get_feat_extract_output_lengths(batch["input_values"].shape[-1])

        # sample randomly masked indices
        batch["mask_time_indices"] = _compute_mask_indices(
            (batch["input_values"].shape[0], mask_indices_seq_length),
            self.model.config.mask_time_prob,
            self.model.config.mask_time_length,
            min_masks=2,
        )

        # sample indices to take for negative vectors
        batch["sampled_negative_indices"] = _sample_negative_indices(
            (batch["mask_time_indices"].shape + (self.model.config.proj_codevector_dim,)),
            self.model.config.num_negatives,
        )

        return batch


def configure_logger(model_args: ModelArguments, training_args: TrainingArguments):
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logging_level = logging.WARNING
    if model_args.verbose_logging:
        logging_level = logging.DEBUG
    logger.setLevel(logging_level)


def write_train_metric(summary_writer, train_metrics, train_time, step):
    summary_writer.scalar("train_time", train_time, step)

    train_metrics = get_metrics(train_metrics)
    for key, vals in train_metrics.items():
        tag = f"train_{key}"
        for i, val in enumerate(vals):
            summary_writer.scalar(tag, val, step - len(vals) + i + 1)


def write_eval_metric(summary_writer, eval_metrics, step):
    for metric_name, value in eval_metrics.items():
        summary_writer.scalar(f"eval_{metric_name}", value, step)


def generate_batch_splits(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndarray:
    num_samples = len(samples_idx)
    samples_to_remove = num_samples % batch_size

    if samples_to_remove != 0:
        samples_idx = samples_idx[:-samples_to_remove]
    sections_split = num_samples // batch_size
    batch_idx = np.split(samples_idx, sections_split)
    return batch_idx


def compute_contrastive_loss(
        quantized_features, transformer_features, negative_indices, mask_time_indices, logits_temp, num_negatives
):
    batch_size, sequence_length, hidden_size = quantized_features.shape

    # take negative vectors from sampled indices
    quantized_negatives = quantized_features.reshape(-1, hidden_size)[negative_indices.reshape(-1)]
    quantized_negatives = quantized_negatives.reshape(
        batch_size, sequence_length, num_negatives, hidden_size
    ).transpose(2, 0, 1, 3)

    target_features = jnp.concatenate([quantized_features[None, :], quantized_negatives], axis=0)
    loss_logits = optax.cosine_similarity(transformer_features, target_features)
    loss_logits = loss_logits / logits_temp

    neg_is_pos = (quantized_features == quantized_negatives).all(-1)
    neg_is_pos = jnp.concatenate([jnp.full((1,) + loss_logits.shape[1:], False), neg_is_pos], axis=0)

    # make sure incorrectly sampled vectors don't contribute to loss
    loss_logits = jnp.where(neg_is_pos, -1e9, loss_logits)

    predictions = loss_logits.transpose(2, 1, 0).reshape(-1, loss_logits.shape[0])
    targets = ((1 - mask_time_indices) * -100).transpose(1, 0).flatten()

    target_mask = jnp.where(targets >= 0, 1.0, 0.0)
    contrastive_loss = optax.softmax_cross_entropy(predictions, onehot(targets, predictions.shape[-1])) * target_mask

    contrastive_loss = contrastive_loss.sum()

    return contrastive_loss


def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))

    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    configure_logger(model_args, training_args)

    # Downloading and loading a dataset from the hub.
    if data_args.dataset_name:

        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)

        if "validation" not in datasets.keys():
            # make sure only "validation" and "train" keys remain
            datasets = DatasetDict()
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"{data_args.train_split_name}[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"{data_args.train_split_name}[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
            )
        else:
            # make sure only "validation" and "train" keys remain
            datasets = DatasetDict()
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split="validation",
                cache_dir=model_args.cache_dir,
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"{data_args.train_split_name}",
                cache_dir=model_args.cache_dir,
            )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files, delimiter="\t")

    # only normalized-inputs-training is supported
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        do_normalize=True
    )

    def prepare_dataset(batch):
        # check that all files have the correct sampling rate
        batch["speech"], _ = librosa.load(batch[data_args.speech_file_column], sr=feature_extractor.sampling_rate)
        return batch

    # load audio files into numpy arrays
    vectorized_datasets = datasets.map(
        prepare_dataset,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=datasets["train"].column_names
    )

    # filter audio files that are too long
    vectorized_datasets = vectorized_datasets.filter(
        lambda data: len(data["speech"]) < int(data_args.max_duration_in_seconds * feature_extractor.sampling_rate)
    )

    def normalize(batch):
        return feature_extractor(batch["speech"], sampling_rate=feature_extractor.sampling_rate)

    # normalize and transform to `BatchFeatures`
    vectorized_datasets = vectorized_datasets.map(
        normalize,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
        remove_columns=vectorized_datasets["train"].column_names,
    )

    # pretraining is only supported for "newer" stable layer norm architecture
    # apply_spec_augment has to be True, mask_feature_prob has to be 0.0
    config = Wav2Vec2Config.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        gradient_checkpointing=model_args.gradient_checkpointing,
    )

    if not config.do_stable_layer_norm or config.feat_extract_norm != "layer":
        raise ValueError(
            "PreTraining is only supported for ``config.do_stable_layer_norm=True`` and ``config.feat_extract_norm='layer'"
        )

    model = FlaxWav2Vec2ForPreTraining(
        config,
        seed=training_args.seed,
        dtype=getattr(jnp, model_args.dtype)
    )

    data_collator = FlaxDataCollatorForWav2Vec2Pretraining(
        model=model,
        feature_extractor=feature_extractor,
        pad_to_multiple_of=data_args.pad_to_multiple_of
    )

    # Enable tensorboard only on the master node
    has_tensorboard = is_tensorboard_available()
    if has_tensorboard and jax.process_index() == 0:
        try:
            from flax.metrics.tensorboard import SummaryWriter

            summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
        except ImportError as ie:
            has_tensorboard = False
            logger.warning(
                f"Unable to display metrics through TensorBoard because some packages are not installed: {ie}"
            )
    else:
        logger.warning(
            "Unable to display metrics through TensorBoard because the package is not installed. "
            "Please run pip install tensorboard to enable."
        )

    # Initialize our training
    rng = jax.random.PRNGKey(training_args.seed)
    dropout_rngs = jax.random.split(rng, jax.local_device_count())
    gumbel_rngs = jax.random.split(rng, jax.local_device_count())

    num_epochs = int(training_args.num_train_epochs)
    train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
    eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()

    num_train_steps = len(vectorized_datasets["train"]) // train_batch_size * num_epochs

    # Create learning rate schedule
    warmup_fn = optax.linear_schedule(
        init_value=0.0, end_value=training_args.learning_rate, transition_steps=training_args.warmup_steps
    )
    decay_fn = optax.linear_schedule(
        init_value=training_args.learning_rate,
        end_value=0,
        transition_steps=num_train_steps - training_args.warmup_steps,
    )
    linear_decay_lr_schedule_fn = optax.join_schedules(
        schedules=[warmup_fn, decay_fn], boundaries=[training_args.warmup_steps]
    )

    # We use Optax's "masking" functionality to not apply weight decay
    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
    # mask boolean with the same structure as the parameters.
    # The mask is True for parameters that should be decayed.
    def decay_mask_fn(params):
        flat_params = traverse_util.flatten_dict(params)
        flat_mask = {
            path: (path[-1] != "bias" and path[-2:] not in [("layer_norm", "scale"), ("final_layer_norm", "scale")])
            for path in flat_params
        }
        return traverse_util.unflatten_dict(flat_mask)

    # create adam optimizer
    adamw = optax.adamw(
        learning_rate=linear_decay_lr_schedule_fn,
        b1=training_args.adam_beta1,
        b2=training_args.adam_beta2,
        eps=training_args.adam_epsilon,
        weight_decay=training_args.weight_decay,
        mask=decay_mask_fn,
    )

    # Setup train state and define training hyper-parameters
    state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw)
    num_negatives = model.config.num_negatives
    contrastive_logits_temperature = model.config.contrastive_logits_temperature
    num_codevectors = model.config.num_codevectors_per_group * model.config.num_codevector_groups
    diversity_loss_weight = model.config.diversity_loss_weight

    # Define gradient update step fn
    def train_step(state, batch, dropout_rng, gumbel_rng):
        dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)
        gumbel_rng, new_gumbel_rng = jax.random.split(gumbel_rng)

        def loss_fn(params):
            negative_indices = batch.pop("sampled_negative_indices")

            gumbel_temperature = jnp.clip(
                model_args.max_gumbel_temperature * model_args.gumbel_temperature_decay ** state.step,
                a_min=model_args.min_gumbel_temperature,
            )

            outputs = state.apply_fn(
                **batch,
                gumbel_temperature=gumbel_temperature,
                params=params,
                dropout_rng=dropout_rng,
                gumbel_rng=gumbel_rng,
                train=True,
            )

            contrastive_loss = compute_contrastive_loss(
                outputs.projected_quantized_states,
                outputs.projected_states,
                negative_indices,
                batch["mask_time_indices"],
                contrastive_logits_temperature,
                num_negatives,
            )

            diversity_loss = (num_codevectors - outputs.codevector_perplexity) / num_codevectors
            loss = contrastive_loss + diversity_loss_weight * diversity_loss

            return loss

        grad_fn = jax.value_and_grad(loss_fn)
        loss, grad = grad_fn(state.params)
        grad = jax.lax.pmean(grad, "batch")
        new_state = state.apply_gradients(grads=grad)

        metrics = jax.lax.pmean(
            {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}, axis_name="batch"
        )

        return new_state, metrics, new_dropout_rng, new_gumbel_rng

    # Create parallel version of the train step
    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))

    # Define eval fn
    def eval_step(params, batch):
        negative_indices = batch.pop("sampled_negative_indices")

        outputs = model(**batch, params=params, train=False)

        contrastive_loss = compute_contrastive_loss(
            outputs.projected_quantized_states,
            outputs.projected_states,
            negative_indices,
            batch["mask_time_indices"],
            contrastive_logits_temperature,
            num_negatives,
        )

        diversity_loss = (num_codevectors - outputs.codevector_perplexity) / num_codevectors
        loss = contrastive_loss + diversity_loss_weight * diversity_loss

        # summarize metrics
        metrics = {"loss": loss.mean(), "codevector_perplexity": outputs.codevector_perplexity}
        metrics = jax.lax.pmean(metrics, axis_name="batch")

        return metrics

    p_eval_step = jax.pmap(eval_step, "batch", donate_argnums=(0,))

    # Replicate the train state on each device
    state = jax_utils.replicate(state)

    train_time = 0
    train_metrics = []
    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
    for epoch in epochs:
        # ======================== Training ================================
        train_start = time.time()

        # Create sampling rng
        rng, input_rng = jax.random.split(rng)

        # Generate an epoch by shuffling sampling indices from the train dataset
        num_train_samples = len(vectorized_datasets["train"])
        train_samples_idx = jax.random.permutation(input_rng, jnp.arange(num_train_samples))
        train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size)

        # Gather the indexes for creating the batch and do a training step
        for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
            samples = [vectorized_datasets["train"][int(idx)] for idx in batch_idx]
            model_inputs = data_collator(samples)
            model_inputs = shard(model_inputs.data)

            # Model forward
            state, train_metric, dropout_rngs, gumbel_rngs = p_train_step(
                state, model_inputs, dropout_rngs, gumbel_rngs
            )
            train_metrics.append(train_metric)

            cur_step = epoch * (num_train_samples // train_batch_size) + step

            if cur_step % training_args.logging_steps == 0 and cur_step > 0:
                # Save metrics
                train_metric = jax_utils.unreplicate(train_metric)
                train_time += time.time() - train_start
                if has_tensorboard and jax.process_index() == 0:
                    write_train_metric(summary_writer, train_metrics, train_time, cur_step)

                epochs.write(
                    f"Step... ({cur_step} | Loss: {train_metric['loss'].mean()}, Learning Rate: {train_metric['learning_rate'].mean()})"
                )

                train_metrics = []

        # ======================== Evaluating ==============================
        num_eval_samples = len(vectorized_datasets["validation"])
        eval_samples_idx = jnp.arange(num_eval_samples)
        eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)

        eval_metrics = []
        for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
            samples = [vectorized_datasets["validation"][int(idx)] for idx in batch_idx]
            model_inputs = data_collator(samples)

            # Model forward
            model_inputs = shard(model_inputs.data)
            metrics = p_eval_step(state.params, model_inputs)
            eval_metrics.append(metrics)

        # get eval metrics
        eval_metrics = get_metrics(eval_metrics)
        eval_metrics = jax.tree_map(jnp.mean, eval_metrics)

        # Update progress bar
        epochs.write(
            f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {eval_metrics['loss']}, Perplexity: {eval_metrics['codevector_perplexity']})"
        )

        # Save metrics
        if has_tensorboard and jax.process_index() == 0:
            cur_step = epoch * (len(vectorized_datasets["train"]) // train_batch_size)
            write_eval_metric(summary_writer, eval_metrics, cur_step)

        # save checkpoint after each epoch and push checkpoint to the hub
        if jax.process_index() == 0:
            params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
            model.save_pretrained(
                training_args.output_dir,
                params=params,
                push_to_hub=training_args.push_to_hub
            )


if __name__ == "__main__":
    main()
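
To visualize the learning-rate schedule that `main()` builds with Optax, here is a standalone sketch using the values from `run_persian.sh` and an assumed 10,000 total training steps (the real step count depends on the dataset size and global batch size):

import optax

learning_rate, warmup_steps, total_steps = 5e-4, 1000, 10_000  # total_steps is an assumption
warmup_fn = optax.linear_schedule(init_value=0.0, end_value=learning_rate, transition_steps=warmup_steps)
decay_fn = optax.linear_schedule(init_value=learning_rate, end_value=0.0, transition_steps=total_steps - warmup_steps)
schedule = optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[warmup_steps])

for step in (0, 500, 1000, 5500, 10_000):
    # ramps 0 -> 5e-4 over the warmup, then decays linearly back to 0
    print(step, float(schedule(step)))
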