diff --git "a/notes/data_preparation_pt.ipynb" "b/notes/data_preparation_pt.ipynb"
--- "a/notes/data_preparation_pt.ipynb"
+++ "b/notes/data_preparation_pt.ipynb"
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -12,85 +12,50 @@
},
{
"cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": "['../src',\n '/Users/m3hrdadfi/Projects/HF/hfflax/hub/wav2vec2-base-persian/notes',\n '/Users/m3hrdadfi/.vscode/extensions/ms-toolsai.jupyter-2021.2.603412351/pythonFiles',\n '/Users/m3hrdadfi/.vscode/extensions/ms-toolsai.jupyter-2021.2.603412351/pythonFiles/lib/python',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python39.zip',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/lib-dynload',\n '',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/site-packages',\n '/Users/m3hrdadfi/Projects/Apps/zabanshenas',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/site-packages/IPython/extensions',\n '/Users/m3hrdadfi/.ipython']"
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sys.path"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "if \"../src\" not in sys.path:\n",
- " sys.path.insert(0, \"../src\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
- "from normalizer import normalizer"
+ "# !mkdir -p /home/m3hrdadfi/code/data\n",
+ "# %cd /home/m3hrdadfi/code/data\n",
+ "# !wget https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fa.tar.gz && tar -xzf fa.tar.gz\n",
+ "# %cd /home/m3hrdadfi/"
]
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "سلام بر شما که میآیید و میآموزید که بیآرآیم \n",
- "کتابهایمان میدانی کجاها ماههاس که کیهامون و کیهان دنبالههاشون برای بهای هستند \n",
- "میانافزارهای امروزی نرمافزار سختافزار امروز نوشتافزارها \n",
- "این کتاب بهترین در نوع شتر آسانتر هست \n",
- "سه چیز هست که از پژوهش در این زمینه آموختهام \n"
+ "/home/m3hrdadfi/data/fa/cvfa/fa\n",
+ "/home/m3hrdadfi/data/fa\n",
+ "\n",
+ "cvfa fa.tar.gz\n",
+ "/home/m3hrdadfi/data/fa/cvfa/fa/dev.tsv\n",
+ "/home/m3hrdadfi/data/fa/cvfa/fa/invalidated.tsv\n",
+ "/home/m3hrdadfi/data/fa/cvfa/fa/other.tsv\n",
+ "/home/m3hrdadfi/data/fa/cvfa/fa/reported.tsv\n",
+ "/home/m3hrdadfi/data/fa/cvfa/fa/test.tsv\n",
+ "/home/m3hrdadfi/data/fa/cvfa/fa/train.tsv\n",
+ "/home/m3hrdadfi/data/fa/cvfa/fa/validated.tsv\n"
]
}
],
"source": [
- "input_text = \"سلام بر شما که میآیید و میآموزید که بیآرآیم\"\n",
- "print(normalizer({\"sentence\": input_text}, return_dict=False))\n",
- "\n",
- "input_text = \"کتابهایمان میدانی کجاها ماههاس که کیهامون و کیهان دنبالههاشون برای بهای هستند.\"\n",
- "print(normalizer({\"sentence\": input_text}, return_dict=False))\n",
- "\n",
- "input_text = \" میانافزارهای امروزی نرمافزار سخت افزار امروز نوشتافزار ها\"\n",
- "print(normalizer({\"sentence\": input_text}, return_dict=False))\n",
- "\n",
- "input_text = \"این کتاب بهترین در نوع شتر آسانتر هست\"\n",
- "print(normalizer({\"sentence\": input_text}, return_dict=False))\n",
+ "import os\n",
"\n",
- "input_text = \"سه چیز هست که از پژوهش در این زمینه آموختهام\"\n",
- "print(normalizer({\"sentence\": input_text}, return_dict=False))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [],
- "source": [
- "# !mkdir -p /home/m3hrdadfi/code/data\n",
- "# %cd /home/m3hrdadfi/code/data\n",
- "# !wget https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fa.tar.gz && tar -xzf fa.tar.gz\n",
- "# %cd /home/m3hrdadfi/"
+ "lang = \"fa\"\n",
+ "abs_path_to_data = os.path.join(f\"/home/m3hrdadfi/data/{lang}\", f\"cv{lang}\", lang)\n",
+ "save_path = \"/\".join(abs_path_to_data.split('/')[:-2])\n",
+ "print(abs_path_to_data)\n",
+ "print(save_path)\n",
+ "print()\n",
+ "!ls {save_path}\n",
+ "!ls {abs_path_to_data}/*.tsv"
]
},
{
@@ -98,48 +63,6 @@
"execution_count": 13,
"metadata": {},
"outputs": [],
- "source": [
- "# import os\n",
- "\n",
- "# lang = \"fa\"\n",
- "# abs_path_to_data = os.path.join(f\"/home/m3hrdadfi/code/data/{lang}/dataset\", f\"cv{lang}\", lang)\n",
- "# save_path = \"/\".join(abs_path_to_data.split('/')[:-2])\n",
- "# print(abs_path_to_data)\n",
- "# print(save_path)\n",
- "# print()\n",
- "# !ls {save_path}\n",
- "# !ls {abs_path_to_data}/*.tsv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [],
- "source": [
- "def normalizer_without_batch(text, pruning=False):\n",
- " try:\n",
- " batch = {\n",
- " \"sentence\": text\n",
- " }\n",
- " text = normalizer(batch, return_dict=False)\n",
- " \n",
- " if pruning:\n",
- " if not len(text.split()) > 3:\n",
- " text = None\n",
- " \n",
- " except:\n",
- " print(text)\n",
- " text = None\n",
- " \n",
- " return text"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
@@ -148,142 +71,235 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 17,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Step 0: 5213\n",
+ "Step 1: 5213\n",
+ "Step 2: 5213\n",
+ "Step 3: 5213\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": "
\n\n
\n \n \n | \n sentence | \n path | \n
\n \n \n \n 0 | \n از مهمونداری کنار بکشم | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 1 | \n برو از مهرداد بپرس. | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 2 | \n خب ، تو چیكار می كنی؟ | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 3 | \n مسقط پایتخت عمان در عربی به معنای محل سقوط است | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 4 | \n آه، نه اصلاُ! | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n
\n
",
+ "text/plain": " sentence \\\n0 از مهمونداری کنار بکشم \n1 برو از مهرداد بپرس. \n2 خب ، تو چیكار می كنی؟ \n3 مسقط پایتخت عمان در عربی به معنای محل سقوط است \n4 آه، نه اصلاُ! \n\n path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... "
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# test_df = pd.read_csv(f\"{abs_path_to_data}/test.tsv\", sep=\"\\t\")\n",
+ "test_df = pd.read_csv(f\"{abs_path_to_data}/test.tsv\", sep=\"\\t\")\n",
"\n",
- "# print(f\"Step 0: {len(test_df)}\")\n",
+ "print(f\"Step 0: {len(test_df)}\")\n",
"\n",
- "# test_df[\"path\"] = abs_path_to_data + \"/clips/\" + test_df[\"path\"]\n",
- "# test_df[\"status\"] = test_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n",
- "# test_df = test_df.dropna(subset=[\"path\"])\n",
- "# test_df = test_df.drop(\"status\", 1)\n",
- "# print(f\"Step 1: {len(test_df)}\")\n",
+ "test_df[\"path\"] = abs_path_to_data + \"/clips/\" + test_df[\"path\"]\n",
+ "test_df[\"status\"] = test_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n",
+ "test_df = test_df.dropna(subset=[\"path\"])\n",
+ "test_df = test_df.drop(\"status\", 1)\n",
+ "print(f\"Step 1: {len(test_df)}\")\n",
"\n",
- "# test_df[\"prev_sentence\"] = test_df[\"sentence\"]\n",
- "# test_df[\"sentence\"] = test_df[\"sentence\"].apply(lambda t: normalizer_without_batch(t))\n",
- "# test_df = test_df.dropna(subset=[\"sentence\"])\n",
- "# print(f\"Step 2: {len(test_df)}\")\n",
+ "test_df = test_df.dropna(subset=[\"sentence\"])\n",
+ "print(f\"Step 2: {len(test_df)}\")\n",
"\n",
- "# test_df = test_df[[\"prev_sentence\", \"sentence\", \"path\"]]\n",
- "# test_df = test_df.drop_duplicates(subset=\"path\")\n",
- "# print(f\"Step 3: {len(test_df)}\")\n",
+ "test_df = test_df[[\"sentence\", \"path\"]]\n",
+ "test_df = test_df.drop_duplicates(subset=\"path\")\n",
+ "print(f\"Step 3: {len(test_df)}\")\n",
"\n",
- "# test_df = test_df.reset_index(drop=True)\n",
- "# test_df.head()"
+ "test_df = test_df.reset_index(drop=True)\n",
+ "test_df.head()"
]
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 18,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "12806\n",
+ "Step 0: 286975\n",
+ "Step 1: 286975\n",
+ "Step 2: 286975\n",
+ "Step 3: 274169\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": "\n\n
\n \n \n | \n sentence | \n path | \n
\n \n \n \n 0 | \n رآس ، اینجا چه خبره ؟ | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 1 | \n ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 2 | \n دلم برای تو تنگ شده است. | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 3 | \n دارم اتاقم را تمیز میکنم. | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 4 | \n هاورد باهاتون صحبت کنم | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n
\n
",
+ "text/plain": " sentence \\\n0 رآس ، اینجا چه خبره ؟ \n1 ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ \n2 دلم برای تو تنگ شده است. \n3 دارم اتاقم را تمیز میکنم. \n4 هاورد باهاتون صحبت کنم \n\n path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... "
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# _train_df = pd.concat([\n",
- "# pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n",
- "# pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n",
- "# ])\n",
- "# print(len(_train_df))\n",
+ "_train_df = pd.concat([\n",
+ " pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n",
+ " pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n",
+ "])\n",
+ "print(len(_train_df))\n",
"\n",
- "# train_df = pd.concat([\n",
- "# pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n",
- "# pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n",
- "# pd.read_csv(f\"{abs_path_to_data}/validated.tsv\", sep=\"\\t\"),\n",
- "# pd.read_csv(f\"{abs_path_to_data}/other.tsv\", sep=\"\\t\"),\n",
- "# ])\n",
- "# print(f\"Step 0: {len(train_df)}\")\n",
+ "train_df = pd.concat([\n",
+ " pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n",
+ " pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n",
+ " pd.read_csv(f\"{abs_path_to_data}/validated.tsv\", sep=\"\\t\"),\n",
+ " pd.read_csv(f\"{abs_path_to_data}/other.tsv\", sep=\"\\t\"),\n",
+ "])\n",
+ "print(f\"Step 0: {len(train_df)}\")\n",
"\n",
- "# train_df[\"path\"] = abs_path_to_data + \"/clips/\" + train_df[\"path\"]\n",
- "# train_df[\"status\"] = train_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n",
- "# train_df = train_df.dropna(subset=[\"path\"])\n",
- "# train_df = train_df.drop(\"status\", 1)\n",
- "# print(f\"Step 1: {len(train_df)}\")\n",
+ "train_df[\"path\"] = abs_path_to_data + \"/clips/\" + train_df[\"path\"]\n",
+ "train_df[\"status\"] = train_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n",
+ "train_df = train_df.dropna(subset=[\"path\"])\n",
+ "train_df = train_df.drop(\"status\", 1)\n",
+ "print(f\"Step 1: {len(train_df)}\")\n",
"\n",
- "# train_df[\"prev_sentence\"] = train_df[\"sentence\"]\n",
- "# train_df[\"sentence\"] = train_df[\"sentence\"].apply(lambda t: normalizer_without_batch(t, pruning=True))\n",
- "# train_df = train_df.dropna(subset=[\"sentence\"])\n",
- "# print(f\"Step 2: {len(train_df)}\")\n",
+ "train_df = train_df.dropna(subset=[\"sentence\"])\n",
+ "print(f\"Step 2: {len(train_df)}\")\n",
"\n",
- "# train_df = train_df[[\"prev_sentence\", \"sentence\", \"path\"]]\n",
- "# train_df = train_df.drop_duplicates(subset=\"path\")\n",
- "# print(f\"Step 3: {len(train_df)}\")\n",
+ "train_df = train_df[[\"sentence\", \"path\"]]\n",
+ "train_df = train_df.drop_duplicates(subset=\"path\")\n",
+ "print(f\"Step 3: {len(train_df)}\")\n",
"\n",
- "# train_df = train_df.sample(frac=1)\n",
- "# train_df = train_df.reset_index(drop=True)\n",
- "# train_df.head()"
+ "train_df = train_df.sample(frac=1)\n",
+ "train_df = train_df.reset_index(drop=True)\n",
+ "train_df.head()"
]
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 19,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 5213/5213 [02:58<00:00, 29.27it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Found #5213 test data\n"
+ ]
+ }
+ ],
"source": [
- "# from tqdm import tqdm\n",
+ "from tqdm import tqdm\n",
"\n",
- "# testset_indices = []\n",
+ "testset_indices = []\n",
"\n",
- "# for index, row in tqdm(test_df.iterrows(), total=len(test_df), position=0):\n",
- "# _id = row[\"path\"]\n",
- "# finder = train_df[train_df[\"path\"] == _id]\n",
- "# if len(finder) > 0:\n",
- "# testset_indices.extend(list(finder.index))\n",
+ "for index, row in tqdm(test_df.iterrows(), total=len(test_df), position=0):\n",
+ " _id = row[\"path\"]\n",
+ " finder = train_df[train_df[\"path\"] == _id]\n",
+ " if len(finder) > 0:\n",
+ " testset_indices.extend(list(finder.index))\n",
"\n",
- "# testset_indices = list(set(testset_indices))\n",
- "# print(f\"Found #{len(testset_indices)} test data\")"
+ "testset_indices = list(set(testset_indices))\n",
+ "print(f\"Found #{len(testset_indices)} test data\")"
]
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 20,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "274169\n",
+ "268956\n"
+ ]
+ }
+ ],
"source": [
- "# print(len(train_df))\n",
- "# train_df = train_df.drop(testset_indices)\n",
- "# print(len(train_df))"
+ "print(len(train_df))\n",
+ "train_df = train_df.drop(testset_indices)\n",
+ "print(len(train_df))"
]
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 21,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Int64Index: 274169 entries, 0 to 5212\n",
+ "Data columns (total 2 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 sentence 274169 non-null object\n",
+ " 1 path 274169 non-null object\n",
+ "dtypes: object(2)\n",
+ "memory usage: 6.3+ MB\n",
+ "None\n",
+ "\n",
+ "RangeIndex: 274169 entries, 0 to 274168\n",
+ "Data columns (total 2 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 sentence 274169 non-null object\n",
+ " 1 path 274169 non-null object\n",
+ "dtypes: object(2)\n",
+ "memory usage: 4.2+ MB\n",
+ "None\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": "\n\n
\n \n \n | \n sentence | \n path | \n
\n \n \n \n 0 | \n رآس ، اینجا چه خبره ؟ | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 1 | \n ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 2 | \n دلم برای تو تنگ ��ده است. | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 3 | \n دارم اتاقم را تمیز میکنم. | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 4 | \n هاورد باهاتون صحبت کنم | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n
\n
",
+ "text/plain": " sentence \\\n0 رآس ، اینجا چه خبره ؟ \n1 ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ \n2 دلم برای تو تنگ شده است. \n3 دارم اتاقم را تمیز میکنم. \n4 هاورد باهاتون صحبت کنم \n\n path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... "
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# import pandas as pd\n",
+ "import pandas as pd\n",
"\n",
- "# df = pd.concat([train_df, test_df], axis=0)\n",
- "# # df = validated_df.copy()\n",
- "# print(df.info())\n",
- "# # df[\"sentence\"] = df[\"prev_sentence\"].apply(lambda t: normalizer_without_batch(t))\n",
- "# # df = df.dropna(subset=[\"sentence\"])\n",
- "# # df[\"sentence_spell\"] = df[\"sentence\"].apply(lambda t: normalizer({\"sentence\": t}, is_spell_check=True, return_dict=False))\n",
- "# df = df.reset_index(drop=True)\n",
- "# print(df.info())\n",
- "# df.head()"
+ "df = pd.concat([train_df, test_df], axis=0)\n",
+ "print(df.info())\n",
+ "df = df.reset_index(drop=True)\n",
+ "print(df.info())\n",
+ "df.head()"
]
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
- "# import torchaudio\n",
- "# import librosa\n",
- "# import IPython.display as ipd\n",
- "# import numpy as np\n",
+ "import torchaudio\n",
+ "import librosa\n",
+ "import IPython.display as ipd\n",
+ "import numpy as np\n",
"\n",
- "# def load_audio(path):\n",
- "# speech, sr = torchaudio.load(path)\n",
- "# speech = speech[0].numpy().squeeze() \n",
- "# speech = librosa.resample(np.asarray(speech), sr, 16_000)\n",
+ "def load_audio(path):\n",
+ " speech, sr = torchaudio.load(path)\n",
+ " speech = speech[0].numpy().squeeze() \n",
+ " speech = librosa.resample(np.asarray(speech), sr, 16_000)\n",
" \n",
- "# print(speech.shape, sr)\n",
+ " print(speech.shape, sr)\n",
" \n",
- "# ipd.display(ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000))"
+ " ipd.display(ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000))"
]
},
{
@@ -306,244 +322,566 @@
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 27,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": "sentence می توانید لطفاً سفر را برای من ترتیب دهید؟\npath /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...\nName: 95177, dtype: object"
+ },
+ "metadata": {
+ "transient": {}
+ },
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "می توانید لطفاً سفر را برای من ترتیب دهید؟\n",
+ "\n",
+ "(70272,) 48000\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": "\n \n ",
+ "text/plain": ""
+ },
+ "metadata": {
+ "transient": {}
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": "\n\n
\n \n \n | \n sentence | \n path | \n
\n \n \n \n 0 | \n رآس ، اینجا چه خبره ؟ | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 1 | \n ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 2 | \n دلم برای تو تنگ شده است. | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 3 | \n دارم اتاقم را تمیز میکنم. | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 4 | \n هاورد باهاتون صحبت کنم | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n
\n
",
+ "text/plain": " sentence \\\n0 رآس ، اینجا چه خبره ؟ \n1 ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ \n2 دلم برای تو تنگ شده است. \n3 دارم اتاقم را تمیز میکنم. \n4 هاورد باهاتون صحبت کنم \n\n path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... "
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# import numpy as np\n",
+ "import numpy as np\n",
"\n",
"\n",
- "# idx = np.random.randint(0, len(df))\n",
- "# # idx = 6140\n",
- "# sample = df.iloc[idx]\n",
- "# ipd.display(sample)\n",
- "# # print(sample.iloc[idx][\"prev_sentence\"])\n",
- "# print()\n",
- "# print(sample[\"prev_sentence\"])\n",
- "# print(sample[\"sentence\"])\n",
- "# print()\n",
- "# load_audio(sample[\"path\"])"
+ "idx = np.random.randint(0, len(df))\n",
+ "# idx = 6140\n",
+ "sample = df.iloc[idx]\n",
+ "ipd.display(sample)\n",
+ "\n",
+ "print()\n",
+ "print(sample[\"sentence\"])\n",
+ "print()\n",
+ "load_audio(sample[\"path\"])\n",
+ "\n",
+ "train_df.head()"
]
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 29,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Int64Index: 268956 entries, 0 to 274168\n",
+ "Data columns (total 3 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 sentence 268956 non-null object\n",
+ " 1 path 268956 non-null object\n",
+ " 2 _path 268956 non-null object\n",
+ "dtypes: object(3)\n",
+ "memory usage: 8.2+ MB\n",
+ "None\n",
+ "/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_voice_fa_20100079.mp3\n",
+ "/home/m3hrdadfi/data/fa/clips/common_voice_fa_20100079.mp3\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": "\n\n
\n \n \n | \n sentence | \n path | \n _path | \n
\n \n \n \n 0 | \n رآس ، اینجا چه خبره ؟ | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 1 | \n ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 2 | \n دلم برای تو تنگ شده است. | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 3 | \n دارم اتاقم را تمیز میکنم. | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 4 | \n هاورد باهاتون صحبت کنم | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n
\n
",
+ "text/plain": " sentence \\\n0 رآس ، اینجا چه خبره ؟ \n1 ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ \n2 دلم برای تو تنگ شده است. \n3 دارم اتاقم را تمیز میکنم. \n4 هاورد باهاتون صحبت کنم \n\n path \\\n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n\n _path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... "
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# new_train_df = train_df.copy()\n",
- "# new_train_df[\"_path\"] = new_train_df[\"path\"]\n",
- "# new_train_df[\"path\"] = new_train_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/code/data/fa/dataset/clips\", t.split(\"/\")[-1]))\n",
- "# print(new_train_df.info())\n",
- "# new_train_df.head()"
+ "new_train_df = train_df.copy()\n",
+ "new_train_df[\"_path\"] = new_train_df[\"path\"]\n",
+ "new_train_df[\"path\"] = new_train_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/data/fa/clips\", t.split(\"/\")[-1]))\n",
+ "print(new_train_df.info())\n",
+ "print(new_train_df.iloc[0][\"_path\"])\n",
+ "print(new_train_df.iloc[0][\"path\"])\n",
+ "new_train_df.head()"
]
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 30,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 5213 entries, 0 to 5212\n",
+ "Data columns (total 3 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 sentence 5213 non-null object\n",
+ " 1 path 5213 non-null object\n",
+ " 2 _path 5213 non-null object\n",
+ "dtypes: object(3)\n",
+ "memory usage: 122.3+ KB\n",
+ "None\n",
+ "/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_voice_fa_18325365.mp3\n",
+ "/home/m3hrdadfi/data/fa/clips/common_voice_fa_18325365.mp3\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": "\n\n
\n \n \n | \n sentence | \n path | \n _path | \n
\n \n \n \n 0 | \n از مهمونداری کنار بکشم | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 1 | \n برو از مهرداد بپرس. | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 2 | \n خب ، تو چیكار می كنی؟ | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 3 | \n مسقط پایتخت عمان در عربی به معنای محل سقوط است | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 4 | \n آه، نه اصلاُ! | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n
\n
",
+ "text/plain": " sentence \\\n0 از مهمونداری کنار بکشم \n1 برو از مهرداد بپرس. \n2 خب ، تو چیكار می كنی؟ \n3 مسقط پایتخت عمان در عربی به معنای محل سقوط است \n4 آه، نه اصلاُ! \n\n path \\\n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n\n _path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... "
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# new_test_df = test_df.copy()\n",
- "# new_test_df[\"_path\"] = new_test_df[\"path\"]\n",
- "# new_test_df[\"path\"] = new_test_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/code/data/fa/dataset/clips\", t.split(\"/\")[-1]))\n",
- "# print(new_test_df.info())\n",
- "# new_test_df.head()"
+ "new_test_df = test_df.copy()\n",
+ "new_test_df[\"_path\"] = new_test_df[\"path\"]\n",
+ "new_test_df[\"path\"] = new_test_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/data/fa/clips\", t.split(\"/\")[-1]))\n",
+ "print(new_test_df.info())\n",
+ "print(new_test_df.iloc[0][\"_path\"])\n",
+ "print(new_test_df.iloc[0][\"path\"])\n",
+ "new_test_df.head()"
]
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
- "# import shutil\n",
- "# from tqdm import tqdm"
+ "import shutil\n",
+ "from tqdm import tqdm"
]
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 32,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/m3hrdadfi/data/fa\n"
+ ]
+ }
+ ],
"source": [
- "# !mkdir -p {save_path}/clips\n",
- "# !mkdir -p {save_path}/augs"
+ "print(save_path)"
]
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
- "# for index, row in tqdm(new_train_df.iterrows(), position=0, total=len(new_train_df)):\n",
- "# shutil.copy(row[\"_path\"], row[\"path\"])"
+ "!mkdir -p {save_path}/clips\n",
+ "!mkdir -p {save_path}/augs"
]
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 34,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 268956/268956 [02:40<00:00, 1675.19it/s]\n"
+ ]
+ }
+ ],
"source": [
- "# for index, row in tqdm(new_test_df.iterrows(), position=0, total=len(new_test_df)):\n",
- "# shutil.copy(row[\"_path\"], row[\"path\"])"
+ "for index, row in tqdm(new_train_df.iterrows(), position=0, total=len(new_train_df)):\n",
+ " shutil.copy(row[\"_path\"], row[\"path\"])"
]
},
{
"cell_type": "code",
- "execution_count": 30,
+ "execution_count": 35,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 5213/5213 [00:01<00:00, 4777.79it/s]\n"
+ ]
+ }
+ ],
"source": [
- "# # aug_train_df = new_train_df.copy()\n",
- "# aug_train_df = new_train_df.sample(frac=0.1)\n",
- "# aug_train_df = aug_train_df.reset_index(drop=True)\n",
- "# aug_train_df[\"_path\"] = aug_train_df[\"path\"]\n",
- "# aug_train_df[\"path\"] = aug_train_df[\"path\"].apply(lambda t: \"/\".join(t.split('.')[:-1]).replace(\"clips\", \"augs\") + \"_aug.mp3.wav\")\n",
- "# print(aug_train_df.info())\n",
- "# aug_train_df.head()"
+ "for index, row in tqdm(new_test_df.iterrows(), position=0, total=len(new_test_df)):\n",
+ " shutil.copy(row[\"_path\"], row[\"path\"])"
]
},
{
"cell_type": "code",
- "execution_count": 31,
+ "execution_count": 36,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 26896 entries, 0 to 26895\n",
+ "Data columns (total 3 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 sentence 26896 non-null object\n",
+ " 1 path 26896 non-null object\n",
+ " 2 _path 26896 non-null object\n",
+ "dtypes: object(3)\n",
+ "memory usage: 630.5+ KB\n",
+ "None\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": "\n\n
\n \n \n | \n sentence | \n path | \n _path | \n
\n \n \n \n 0 | \n کدامیک ارزان تر است؟ | \n /home/m3hrdadfi/data/fa/augs/common_voice_fa_2... | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n
\n \n 1 | \n آیا قرمز را بیشتر از آبی دوست داری؟ | \n /home/m3hrdadfi/data/fa/augs/common_voice_fa_2... | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n
\n \n 2 | \n من می خوام کمک کنم | \n /home/m3hrdadfi/data/fa/augs/common_voice_fa_1... | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n
\n \n 3 | \n در آفریقای جنوبی، برنامهای به نام دختران تکنو هست | \n /home/m3hrdadfi/data/fa/augs/common_voice_fa_1... | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n
\n \n 4 | \n حالا، این موضوع به ما فرصت ایجاد چند سناریو را... | \n /home/m3hrdadfi/data/fa/augs/common_voice_fa_1... | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n
\n \n
\n
",
+ "text/plain": " sentence \\\n0 کدامیک ارزان تر است؟ \n1 آیا قرمز را بیشتر از آبی دوست داری؟ \n2 من می خوام کمک کنم \n3 در آفریقای جنوبی، برنامهای به نام دختران تکنو هست \n4 حالا، این موضوع به ما فرصت ایجاد چند سناریو را... \n\n path \\\n0 /home/m3hrdadfi/data/fa/augs/common_voice_fa_2... \n1 /home/m3hrdadfi/data/fa/augs/common_voice_fa_2... \n2 /home/m3hrdadfi/data/fa/augs/common_voice_fa_1... \n3 /home/m3hrdadfi/data/fa/augs/common_voice_fa_1... \n4 /home/m3hrdadfi/data/fa/augs/common_voice_fa_1... \n\n _path \n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... "
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# print(aug_train_df.iloc[0][\"_path\"])\n",
- "# print(aug_train_df.iloc[0][\"path\"])"
+ "# aug_train_df = new_train_df.copy()\n",
+ "aug_train_df = new_train_df.sample(frac=0.1)\n",
+ "aug_train_df = aug_train_df.reset_index(drop=True)\n",
+ "aug_train_df[\"_path\"] = aug_train_df[\"path\"]\n",
+ "aug_train_df[\"path\"] = aug_train_df[\"path\"].apply(lambda t: \"/\".join(t.split('.')[:-1]).replace(\"clips\", \"augs\") + \"_aug.mp3.wav\")\n",
+ "print(aug_train_df.info())\n",
+ "aug_train_df.head()"
]
},
{
"cell_type": "code",
- "execution_count": 32,
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/m3hrdadfi/data/fa/clips/common_voice_fa_20109281.mp3\n",
+ "/home/m3hrdadfi/data/fa/augs/common_voice_fa_20109281_aug.mp3.wav\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(aug_train_df.iloc[0][\"_path\"])\n",
+ "print(aug_train_df.iloc[0][\"path\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
- "# # augmentation\n",
+ "# augmentation\n",
"\n",
- "# from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, Gain\n",
- "# import numpy as np\n",
- "# import soundfile as sf\n",
+ "from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, Gain\n",
+ "import numpy as np\n",
+ "import soundfile as sf\n",
"\n",
- "# augment = Compose([\n",
- "# # AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n",
- "# # PitchShift(min_semitones=-1, max_semitones=2, p=0.2),\n",
- "# # Gain(min_gain_in_db=-6, max_gain_in_db=6, p=0.8)\n",
+ "augment = Compose([\n",
"# AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n",
- "# TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),\n",
- "# PitchShift(min_semitones=-4, max_semitones=4, p=0.5),\n",
- "# ])\n",
+ "# PitchShift(min_semitones=-1, max_semitones=2, p=0.2),\n",
+ "# Gain(min_gain_in_db=-6, max_gain_in_db=6, p=0.8)\n",
+ " AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n",
+ " TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),\n",
+ " PitchShift(min_semitones=-4, max_semitones=4, p=0.5),\n",
+ "])\n",
"\n",
- "# def augmented_speech_file_to_array_fn(in_path, out_path):\n",
- "# speech_array, sampling_rate = torchaudio.load(in_path)\n",
- "# speech_array = speech_array.squeeze().numpy()\n",
- "# speech_array = augment(samples=speech_array, sample_rate=sampling_rate)\n",
- "# sf.write(out_path, speech_array, sampling_rate, \"PCM_24\")"
+ "def augmented_speech_file_to_array_fn(in_path, out_path):\n",
+ " speech_array, sampling_rate = torchaudio.load(in_path)\n",
+ " speech_array = speech_array.squeeze().numpy()\n",
+ " speech_array = augment(samples=speech_array, sample_rate=sampling_rate)\n",
+ " sf.write(out_path, speech_array, sampling_rate, \"PCM_24\")"
]
},
{
"cell_type": "code",
- "execution_count": 33,
+ "execution_count": 40,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 26896/26896 [1:18:09<00:00, 5.74it/s]\n"
+ ]
+ }
+ ],
"source": [
- "# # for index, row in tqdm(aug_train_df.iterrows(), position=0, total=len(aug_train_df)):\n",
- "# # augmented_speech_file_to_array_fn(row[\"_path\"], row[\"path\"])\n",
+ "for index, row in tqdm(aug_train_df.iterrows(), position=0, total=len(aug_train_df)):\n",
+ " augmented_speech_file_to_array_fn(row[\"_path\"], row[\"path\"])\n",
"# !ls"
]
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": 42,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 295852 entries, 0 to 295851\n",
+ "Data columns (total 3 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 sentence 295852 non-null object\n",
+ " 1 path 295852 non-null object\n",
+ " 2 _path 295852 non-null object\n",
+ "dtypes: object(3)\n",
+ "memory usage: 6.8+ MB\n",
+ "None\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": "\n\n
\n \n \n | \n sentence | \n path | \n _path | \n
\n \n \n \n 0 | \n اما دیدم نه،هیچ جوره نمیتونم ببخشمش به خدا گفت... | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 1 | \n برای امروز. | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 2 | \n چون اگر میدانیم چیزی که بیگناه در نظر میگیریم ... | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 3 | \n ضمیر من را بدانید -- آقا، خانم، ایشان | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 4 | \n تا تقویت و تکثیرشان کنیم | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n
\n
",
+ "text/plain": " sentence \\\n0 اما دیدم نه،هیچ جوره نمیتونم ببخشمش به خدا گفت... \n1 برای امروز. \n2 چون اگر میدانیم چیزی که بیگناه در نظر میگیریم ... \n3 ضمیر من را بدانید -- آقا، خانم، ایشان \n4 تا تقویت و تکثیرشان کنیم \n\n path \\\n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n\n _path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... "
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# # new_train_aug_df = pd.concat([new_train_df, aug_train_df], axis=0)\n",
+ "new_train_aug_df = pd.concat([new_train_df, aug_train_df], axis=0)\n",
"# new_train_aug_df = new_train_df.copy()\n",
- "# new_train_aug_df = new_train_aug_df.sample(frac=1)\n",
- "# new_train_aug_df = new_train_aug_df.reset_index(drop=True)\n",
- "# print(new_train_aug_df.info())\n",
- "# new_train_aug_df.head()"
+ "new_train_aug_df = new_train_aug_df.sample(frac=1)\n",
+ "new_train_aug_df = new_train_aug_df.reset_index(drop=True)\n",
+ "print(new_train_aug_df.info())\n",
+ "new_train_aug_df.head()"
]
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": 43,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": "'/home/m3hrdadfi/data/fa'"
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# new_train_df.to_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n",
- "# new_train_aug_df.to_csv(f\"{save_path}/train_with_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n",
- "# new_test_df.to_csv(f\"{save_path}/test.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)"
+ "save_path"
]
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
- "# new_train_df.count()"
+ "new_train_df.to_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n",
+ "new_train_aug_df.to_csv(f\"{save_path}/train_with_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n",
+ "new_test_df.to_csv(f\"{save_path}/test.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)"
]
},
{
"cell_type": "code",
- "execution_count": 37,
+ "execution_count": 45,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": "sentence 268956\npath 268956\n_path 268956\ndtype: int64"
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# new_test_df.count()"
+ "new_train_df.count()"
]
},
{
"cell_type": "code",
- "execution_count": 38,
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": "sentence 5213\npath 5213\n_path 5213\ndtype: int64"
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "new_test_df.count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
- "# import pandas as pd\n",
+ "import pandas as pd\n",
"\n",
- "# import os\n",
- "# from tqdm import tqdm"
+ "import os\n",
+ "from tqdm import tqdm"
]
},
{
"cell_type": "code",
- "execution_count": 39,
+ "execution_count": 48,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 268956 entries, 0 to 268955\n",
+ "Data columns (total 3 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 sentence 268956 non-null object\n",
+ " 1 path 268956 non-null object\n",
+ " 2 _path 268956 non-null object\n",
+ "dtypes: object(3)\n",
+ "memory usage: 6.2+ MB\n",
+ "None\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": "\n\n
\n \n \n | \n sentence | \n path | \n _path | \n
\n \n \n \n 0 | \n رآس ، اینجا چه خبره ؟ | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 1 | \n ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 2 | \n دلم برای تو تنگ شده است. | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 3 | \n دارم اتاقم را تمیز میکنم. | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 4 | \n هاورد باهاتون صحبت کنم | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n
\n
",
+ "text/plain": " sentence \\\n0 رآس ، اینجا چه خبره ؟ \n1 ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ \n2 دلم برای تو تنگ شده است. \n3 دارم اتاقم را تمیز میکنم. \n4 هاورد باهاتون صحبت کنم \n\n path \\\n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n\n _path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... "
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# train_df = pd.read_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\")\n",
- "# print(train_df.info())\n",
- "# train_df.head()"
+ "train_df = pd.read_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\")\n",
+ "print(train_df.info())\n",
+ "train_df.head()"
]
},
{
"cell_type": "code",
- "execution_count": 40,
+ "execution_count": 49,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 5213 entries, 0 to 5212\n",
+ "Data columns (total 3 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 sentence 5213 non-null object\n",
+ " 1 path 5213 non-null object\n",
+ " 2 _path 5213 non-null object\n",
+ "dtypes: object(3)\n",
+ "memory usage: 122.3+ KB\n",
+ "None\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": "\n\n
\n \n \n | \n sentence | \n path | \n _path | \n
\n \n \n \n 0 | \n از مهمونداری کنار بکشم | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 1 | \n برو از مهرداد بپرس. | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 2 | \n خب ، تو چیكار می كنی؟ | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 3 | \n مسقط پایتخت عمان در عربی به معنای محل سقوط است | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 4 | \n آه، نه اصلاُ! | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n
\n
",
+ "text/plain": " sentence \\\n0 از مهمونداری کنار بکشم \n1 برو از مهرداد بپرس. \n2 خب ، تو چیكار می كنی؟ \n3 مسقط پایتخت عمان در عربی به معنای محل سقوط است \n4 آه، نه اصلاُ! \n\n path \\\n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n\n _path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... "
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# test_df = pd.read_csv(f\"{save_path}/test.csv\", sep=\"\\t\")\n",
- "# print(test_df.info())\n",
- "# test_df.head()"
+ "test_df = pd.read_csv(f\"{save_path}/test.csv\", sep=\"\\t\")\n",
+ "print(test_df.info())\n",
+ "test_df.head()"
]
},
{
"cell_type": "code",
- "execution_count": 41,
+ "execution_count": 50,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 268956/268956 [00:11<00:00, 24344.12it/s]\n"
+ ]
+ }
+ ],
"source": [
- "# non_existed_train = []\n",
+ "non_existed_train = []\n",
"\n",
- "# for index, row in tqdm(train_df.iterrows(), total=len(train_df), position=0):\n",
- "# if not os.path.exists(row[\"path\"]):\n",
- "# non_existed_train.extends(list(index))\n",
- "# break"
+ "for index, row in tqdm(train_df.iterrows(), total=len(train_df), position=0):\n",
+ " if not os.path.exists(row[\"path\"]):\n",
+ " non_existed_train.extends(list(index))\n",
+ " break"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": "[]"
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "non_existed_train"
]
},
{
@@ -604,8 +942,13 @@
],
"metadata": {
"kernelspec": {
- "display_name": "transformers",
- "name": "transformers"
+ "display_name": "Python 3.8.10 ('jax-env': venv)",
+ "metadata": {
+ "interpreter": {
+ "hash": "d26705e03f37deada2a9ba7d9c91760e1381e108d31e47ed80b202768ffcaf62"
+ }
+ },
+ "name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -617,7 +960,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.4"
+ "version": "3.8.10"
},
"orig_nbformat": 2
},