diff --git "a/notes/data_preparation_pt.ipynb" "b/notes/data_preparation_pt.ipynb" --- "a/notes/data_preparation_pt.ipynb" +++ "b/notes/data_preparation_pt.ipynb" @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -12,85 +12,50 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": "['../src',\n '/Users/m3hrdadfi/Projects/HF/hfflax/hub/wav2vec2-base-persian/notes',\n '/Users/m3hrdadfi/.vscode/extensions/ms-toolsai.jupyter-2021.2.603412351/pythonFiles',\n '/Users/m3hrdadfi/.vscode/extensions/ms-toolsai.jupyter-2021.2.603412351/pythonFiles/lib/python',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python39.zip',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/lib-dynload',\n '',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/site-packages',\n '/Users/m3hrdadfi/Projects/Apps/zabanshenas',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/site-packages/IPython/extensions',\n '/Users/m3hrdadfi/.ipython']" - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sys.path" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "if \"../src\" not in sys.path:\n", - " sys.path.insert(0, \"../src\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ - "from normalizer import normalizer" + "# !mkdir -p /home/m3hrdadfi/code/data\n", + "# %cd /home/m3hrdadfi/code/data\n", + "# !wget https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fa.tar.gz && tar -xzf fa.tar.gz\n", + "# %cd /home/m3hrdadfi/" ] }, { "cell_type": "code", - "execution_count": 7, + 
"execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "سلام بر شما که می‌آیید و می‌آموزید که بی‌آرآیم \n", - "کتاب‌هایمان میدانی کجا‌ها ماه‌هاس که کی‌هامون و کیهان دنباله‌هاشون برای بهای هستند \n", - "میان‌‌افزار‌های امروزی نرم‌افزار سخت‌افزار امروز نوشت‌افزار‌ها \n", - "این کتاب بهترین در نوع شتر آسان‌تر هست \n", - "سه چیز هست که از پژوهش در این زمینه آموخته‌ام \n" + "/home/m3hrdadfi/data/fa/cvfa/fa\n", + "/home/m3hrdadfi/data/fa\n", + "\n", + "cvfa fa.tar.gz\n", + "/home/m3hrdadfi/data/fa/cvfa/fa/dev.tsv\n", + "/home/m3hrdadfi/data/fa/cvfa/fa/invalidated.tsv\n", + "/home/m3hrdadfi/data/fa/cvfa/fa/other.tsv\n", + "/home/m3hrdadfi/data/fa/cvfa/fa/reported.tsv\n", + "/home/m3hrdadfi/data/fa/cvfa/fa/test.tsv\n", + "/home/m3hrdadfi/data/fa/cvfa/fa/train.tsv\n", + "/home/m3hrdadfi/data/fa/cvfa/fa/validated.tsv\n" ] } ], "source": [ - "input_text = \"سلام بر شما که میآیید و میآموزید که بیآرآیم\"\n", - "print(normalizer({\"sentence\": input_text}, return_dict=False))\n", - "\n", - "input_text = \"کتابهایمان میدانی کجاها ماههاس که کیهامون و کیهان دنبالههاشون برای بهای هستند.\"\n", - "print(normalizer({\"sentence\": input_text}, return_dict=False))\n", - "\n", - "input_text = \" میانافزارهای امروزی نرمافزار سخت افزار امروز نوشتافزار ها\"\n", - "print(normalizer({\"sentence\": input_text}, return_dict=False))\n", - "\n", - "input_text = \"این کتاب بهترین در نوع شتر آسانتر هست\"\n", - "print(normalizer({\"sentence\": input_text}, return_dict=False))\n", + "import os\n", "\n", - "input_text = \"سه چیز هست که از پژوهش در این زمینه آموختهام\"\n", - "print(normalizer({\"sentence\": input_text}, return_dict=False))" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "# !mkdir -p /home/m3hrdadfi/code/data\n", - "# %cd /home/m3hrdadfi/code/data\n", - "# !wget 
https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fa.tar.gz && tar -xzf fa.tar.gz\n", - "# %cd /home/m3hrdadfi/" + "lang = \"fa\"\n", + "abs_path_to_data = os.path.join(f\"/home/m3hrdadfi/data/{lang}\", f\"cv{lang}\", lang)\n", + "save_path = \"/\".join(abs_path_to_data.split('/')[:-2])\n", + "print(abs_path_to_data)\n", + "print(save_path)\n", + "print()\n", + "!ls {save_path}\n", + "!ls {abs_path_to_data}/*.tsv" ] }, { @@ -98,48 +63,6 @@ "execution_count": 13, "metadata": {}, "outputs": [], - "source": [ - "# import os\n", - "\n", - "# lang = \"fa\"\n", - "# abs_path_to_data = os.path.join(f\"/home/m3hrdadfi/code/data/{lang}/dataset\", f\"cv{lang}\", lang)\n", - "# save_path = \"/\".join(abs_path_to_data.split('/')[:-2])\n", - "# print(abs_path_to_data)\n", - "# print(save_path)\n", - "# print()\n", - "# !ls {save_path}\n", - "# !ls {abs_path_to_data}/*.tsv" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "def normalizer_without_batch(text, pruning=False):\n", - " try:\n", - " batch = {\n", - " \"sentence\": text\n", - " }\n", - " text = normalizer(batch, return_dict=False)\n", - " \n", - " if pruning:\n", - " if not len(text.split()) > 3:\n", - " text = None\n", - " \n", - " except:\n", - " print(text)\n", - " text = None\n", - " \n", - " return text" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", @@ -148,142 +71,235 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step 0: 5213\n", + "Step 1: 5213\n", + "Step 2: 5213\n", + "Step 3: 5213\n" + ] + }, + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sentencepath
0از مهمونداری کنار بکشم/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
1برو از مهرداد بپرس./home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
2خب ، تو چیكار می كنی؟/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
3مسقط پایتخت عمان در عربی به معنای محل سقوط است/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
4آه، نه اصلاُ!/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
\n
", + "text/plain": " sentence \\\n0 از مهمونداری کنار بکشم \n1 برو از مهرداد بپرس. \n2 خب ، تو چیكار می كنی؟ \n3 مسقط پایتخت عمان در عربی به معنای محل سقوط است \n4 آه، نه اصلاُ! \n\n path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... " + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# test_df = pd.read_csv(f\"{abs_path_to_data}/test.tsv\", sep=\"\\t\")\n", + "test_df = pd.read_csv(f\"{abs_path_to_data}/test.tsv\", sep=\"\\t\")\n", "\n", - "# print(f\"Step 0: {len(test_df)}\")\n", + "print(f\"Step 0: {len(test_df)}\")\n", "\n", - "# test_df[\"path\"] = abs_path_to_data + \"/clips/\" + test_df[\"path\"]\n", - "# test_df[\"status\"] = test_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n", - "# test_df = test_df.dropna(subset=[\"path\"])\n", - "# test_df = test_df.drop(\"status\", 1)\n", - "# print(f\"Step 1: {len(test_df)}\")\n", + "test_df[\"path\"] = abs_path_to_data + \"/clips/\" + test_df[\"path\"]\n", + "test_df[\"status\"] = test_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n", + "test_df = test_df.dropna(subset=[\"path\"])\n", + "test_df = test_df.drop(\"status\", 1)\n", + "print(f\"Step 1: {len(test_df)}\")\n", "\n", - "# test_df[\"prev_sentence\"] = test_df[\"sentence\"]\n", - "# test_df[\"sentence\"] = test_df[\"sentence\"].apply(lambda t: normalizer_without_batch(t))\n", - "# test_df = test_df.dropna(subset=[\"sentence\"])\n", - "# print(f\"Step 2: {len(test_df)}\")\n", + "test_df = test_df.dropna(subset=[\"sentence\"])\n", + "print(f\"Step 2: {len(test_df)}\")\n", "\n", - "# test_df = test_df[[\"prev_sentence\", \"sentence\", \"path\"]]\n", - "# test_df = test_df.drop_duplicates(subset=\"path\")\n", - "# print(f\"Step 3: 
{len(test_df)}\")\n", + "test_df = test_df[[\"sentence\", \"path\"]]\n", + "test_df = test_df.drop_duplicates(subset=\"path\")\n", + "print(f\"Step 3: {len(test_df)}\")\n", "\n", - "# test_df = test_df.reset_index(drop=True)\n", - "# test_df.head()" + "test_df = test_df.reset_index(drop=True)\n", + "test_df.head()" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "12806\n", + "Step 0: 286975\n", + "Step 1: 286975\n", + "Step 2: 286975\n", + "Step 3: 274169\n" + ] + }, + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sentencepath
0رآس ، اینجا چه خبره ؟/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
1ممکن است آن را تعمیر کنید وقتی منتظر هستم؟/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
2دلم برای تو تنگ شده است./home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
3دارم اتاقم را تمیز میکنم./home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
4هاورد باهاتون صحبت کنم/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
\n
", + "text/plain": " sentence \\\n0 رآس ، اینجا چه خبره ؟ \n1 ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ \n2 دلم برای تو تنگ شده است. \n3 دارم اتاقم را تمیز میکنم. \n4 هاورد باهاتون صحبت کنم \n\n path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... " + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# _train_df = pd.concat([\n", - "# pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n", - "# pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n", - "# ])\n", - "# print(len(_train_df))\n", + "_train_df = pd.concat([\n", + " pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n", + " pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n", + "])\n", + "print(len(_train_df))\n", "\n", - "# train_df = pd.concat([\n", - "# pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n", - "# pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n", - "# pd.read_csv(f\"{abs_path_to_data}/validated.tsv\", sep=\"\\t\"),\n", - "# pd.read_csv(f\"{abs_path_to_data}/other.tsv\", sep=\"\\t\"),\n", - "# ])\n", - "# print(f\"Step 0: {len(train_df)}\")\n", + "train_df = pd.concat([\n", + " pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n", + " pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n", + " pd.read_csv(f\"{abs_path_to_data}/validated.tsv\", sep=\"\\t\"),\n", + " pd.read_csv(f\"{abs_path_to_data}/other.tsv\", sep=\"\\t\"),\n", + "])\n", + "print(f\"Step 0: {len(train_df)}\")\n", "\n", - "# train_df[\"path\"] = abs_path_to_data + \"/clips/\" + train_df[\"path\"]\n", - "# train_df[\"status\"] = train_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n", - "# train_df = train_df.dropna(subset=[\"path\"])\n", - "# 
train_df = train_df.drop(\"status\", 1)\n", - "# print(f\"Step 1: {len(train_df)}\")\n", + "train_df[\"path\"] = abs_path_to_data + \"/clips/\" + train_df[\"path\"]\n", + "train_df[\"status\"] = train_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n", + "train_df = train_df.dropna(subset=[\"path\"])\n", + "train_df = train_df.drop(\"status\", 1)\n", + "print(f\"Step 1: {len(train_df)}\")\n", "\n", - "# train_df[\"prev_sentence\"] = train_df[\"sentence\"]\n", - "# train_df[\"sentence\"] = train_df[\"sentence\"].apply(lambda t: normalizer_without_batch(t, pruning=True))\n", - "# train_df = train_df.dropna(subset=[\"sentence\"])\n", - "# print(f\"Step 2: {len(train_df)}\")\n", + "train_df = train_df.dropna(subset=[\"sentence\"])\n", + "print(f\"Step 2: {len(train_df)}\")\n", "\n", - "# train_df = train_df[[\"prev_sentence\", \"sentence\", \"path\"]]\n", - "# train_df = train_df.drop_duplicates(subset=\"path\")\n", - "# print(f\"Step 3: {len(train_df)}\")\n", + "train_df = train_df[[\"sentence\", \"path\"]]\n", + "train_df = train_df.drop_duplicates(subset=\"path\")\n", + "print(f\"Step 3: {len(train_df)}\")\n", "\n", - "# train_df = train_df.sample(frac=1)\n", - "# train_df = train_df.reset_index(drop=True)\n", - "# train_df.head()" + "train_df = train_df.sample(frac=1)\n", + "train_df = train_df.reset_index(drop=True)\n", + "train_df.head()" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 5213/5213 [02:58<00:00, 29.27it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found #5213 test data\n" + ] + } + ], "source": [ - "# from tqdm import tqdm\n", + "from tqdm import tqdm\n", "\n", - "# testset_indices = []\n", + "testset_indices = []\n", "\n", - "# for index, row in tqdm(test_df.iterrows(), total=len(test_df), position=0):\n", - "# _id = 
row[\"path\"]\n", - "# finder = train_df[train_df[\"path\"] == _id]\n", - "# if len(finder) > 0:\n", - "# testset_indices.extend(list(finder.index))\n", + "for index, row in tqdm(test_df.iterrows(), total=len(test_df), position=0):\n", + " _id = row[\"path\"]\n", + " finder = train_df[train_df[\"path\"] == _id]\n", + " if len(finder) > 0:\n", + " testset_indices.extend(list(finder.index))\n", "\n", - "# testset_indices = list(set(testset_indices))\n", - "# print(f\"Found #{len(testset_indices)} test data\")" + "testset_indices = list(set(testset_indices))\n", + "print(f\"Found #{len(testset_indices)} test data\")" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "274169\n", + "268956\n" + ] + } + ], "source": [ - "# print(len(train_df))\n", - "# train_df = train_df.drop(testset_indices)\n", - "# print(len(train_df))" + "print(len(train_df))\n", + "train_df = train_df.drop(testset_indices)\n", + "print(len(train_df))" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 274169 entries, 0 to 5212\n", + "Data columns (total 2 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 sentence 274169 non-null object\n", + " 1 path 274169 non-null object\n", + "dtypes: object(2)\n", + "memory usage: 6.3+ MB\n", + "None\n", + "\n", + "RangeIndex: 274169 entries, 0 to 274168\n", + "Data columns (total 2 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 sentence 274169 non-null object\n", + " 1 path 274169 non-null object\n", + "dtypes: object(2)\n", + "memory usage: 4.2+ MB\n", + "None\n" + ] + }, + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sentencepath
0رآس ، اینجا چه خبره ؟/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
1ممکن است آن را تعمیر کنید وقتی منتظر هستم؟/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
2دلم برای تو تنگ شده است./home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
3دارم اتاقم را تمیز میکنم./home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
4هاورد باهاتون صحبت کنم/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
\n
", + "text/plain": " sentence \\\n0 رآس ، اینجا چه خبره ؟ \n1 ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ \n2 دلم برای تو تنگ شده است. \n3 دارم اتاقم را تمیز میکنم. \n4 هاورد باهاتون صحبت کنم \n\n path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... " + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# import pandas as pd\n", + "import pandas as pd\n", "\n", - "# df = pd.concat([train_df, test_df], axis=0)\n", - "# # df = validated_df.copy()\n", - "# print(df.info())\n", - "# # df[\"sentence\"] = df[\"prev_sentence\"].apply(lambda t: normalizer_without_batch(t))\n", - "# # df = df.dropna(subset=[\"sentence\"])\n", - "# # df[\"sentence_spell\"] = df[\"sentence\"].apply(lambda t: normalizer({\"sentence\": t}, is_spell_check=True, return_dict=False))\n", - "# df = df.reset_index(drop=True)\n", - "# print(df.info())\n", - "# df.head()" + "df = pd.concat([train_df, test_df], axis=0)\n", + "print(df.info())\n", + "df = df.reset_index(drop=True)\n", + "print(df.info())\n", + "df.head()" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ - "# import torchaudio\n", - "# import librosa\n", - "# import IPython.display as ipd\n", - "# import numpy as np\n", + "import torchaudio\n", + "import librosa\n", + "import IPython.display as ipd\n", + "import numpy as np\n", "\n", - "# def load_audio(path):\n", - "# speech, sr = torchaudio.load(path)\n", - "# speech = speech[0].numpy().squeeze() \n", - "# speech = librosa.resample(np.asarray(speech), sr, 16_000)\n", + "def load_audio(path):\n", + " speech, sr = torchaudio.load(path)\n", + " speech = speech[0].numpy().squeeze() \n", + " speech = librosa.resample(np.asarray(speech), sr, 16_000)\n", 
" \n", - "# print(speech.shape, sr)\n", + " print(speech.shape, sr)\n", " \n", - "# ipd.display(ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000))" + " ipd.display(ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000))" ] }, { @@ -306,244 +322,566 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 27, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": "sentence می توانید لطفاً سفر را برای من ترتیب دهید؟\npath /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...\nName: 95177, dtype: object" + }, + "metadata": { + "transient": {} + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "می توانید لطفاً سفر را برای من ترتیب دهید؟\n", + "\n", + "(70272,) 48000\n" + ] + }, + { + "data": { + "text/html": "\n \n ", + "text/plain": "" + }, + "metadata": { + "transient": {} + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sentencepath
0رآس ، اینجا چه خبره ؟/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
1ممکن است آن را تعمیر کنید وقتی منتظر هستم؟/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
2دلم برای تو تنگ شده است./home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
3دارم اتاقم را تمیز میکنم./home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
4هاورد باهاتون صحبت کنم/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
\n
", + "text/plain": " sentence \\\n0 رآس ، اینجا چه خبره ؟ \n1 ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ \n2 دلم برای تو تنگ شده است. \n3 دارم اتاقم را تمیز میکنم. \n4 هاورد باهاتون صحبت کنم \n\n path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... " + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# import numpy as np\n", + "import numpy as np\n", "\n", "\n", - "# idx = np.random.randint(0, len(df))\n", - "# # idx = 6140\n", - "# sample = df.iloc[idx]\n", - "# ipd.display(sample)\n", - "# # print(sample.iloc[idx][\"prev_sentence\"])\n", - "# print()\n", - "# print(sample[\"prev_sentence\"])\n", - "# print(sample[\"sentence\"])\n", - "# print()\n", - "# load_audio(sample[\"path\"])" + "idx = np.random.randint(0, len(df))\n", + "# idx = 6140\n", + "sample = df.iloc[idx]\n", + "ipd.display(sample)\n", + "\n", + "print()\n", + "print(sample[\"sentence\"])\n", + "print()\n", + "load_audio(sample[\"path\"])\n", + "\n", + "train_df.head()" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 29, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 268956 entries, 0 to 274168\n", + "Data columns (total 3 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 sentence 268956 non-null object\n", + " 1 path 268956 non-null object\n", + " 2 _path 268956 non-null object\n", + "dtypes: object(3)\n", + "memory usage: 8.2+ MB\n", + "None\n", + "/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_voice_fa_20100079.mp3\n", + "/home/m3hrdadfi/data/fa/clips/common_voice_fa_20100079.mp3\n" + ] + }, + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sentencepath_path
0رآس ، اینجا چه خبره ؟/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
1ممکن است آن را تعمیر کنید وقتی منتظر هستم؟/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
2دلم برای تو تنگ شده است./home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
3دارم اتاقم را تمیز میکنم./home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
4هاورد باهاتون صحبت کنم/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
\n
", + "text/plain": " sentence \\\n0 رآس ، اینجا چه خبره ؟ \n1 ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ \n2 دلم برای تو تنگ شده است. \n3 دارم اتاقم را تمیز میکنم. \n4 هاورد باهاتون صحبت کنم \n\n path \\\n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n\n _path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... " + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# new_train_df = train_df.copy()\n", - "# new_train_df[\"_path\"] = new_train_df[\"path\"]\n", - "# new_train_df[\"path\"] = new_train_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/code/data/fa/dataset/clips\", t.split(\"/\")[-1]))\n", - "# print(new_train_df.info())\n", - "# new_train_df.head()" + "new_train_df = train_df.copy()\n", + "new_train_df[\"_path\"] = new_train_df[\"path\"]\n", + "new_train_df[\"path\"] = new_train_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/data/fa/clips\", t.split(\"/\")[-1]))\n", + "print(new_train_df.info())\n", + "print(new_train_df.iloc[0][\"_path\"])\n", + "print(new_train_df.iloc[0][\"path\"])\n", + "new_train_df.head()" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 30, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 5213 entries, 0 to 5212\n", + "Data columns (total 3 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 sentence 5213 non-null object\n", + " 1 path 5213 non-null object\n", + " 2 
_path 5213 non-null object\n", + "dtypes: object(3)\n", + "memory usage: 122.3+ KB\n", + "None\n", + "/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_voice_fa_18325365.mp3\n", + "/home/m3hrdadfi/data/fa/clips/common_voice_fa_18325365.mp3\n" + ] + }, + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sentencepath_path
0از مهمونداری کنار بکشم/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
1برو از مهرداد بپرس./home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
2خب ، تو چیكار می كنی؟/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
3مسقط پایتخت عمان در عربی به معنای محل سقوط است/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
4آه، نه اصلاُ!/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
\n
", + "text/plain": " sentence \\\n0 از مهمونداری کنار بکشم \n1 برو از مهرداد بپرس. \n2 خب ، تو چیكار می كنی؟ \n3 مسقط پایتخت عمان در عربی به معنای محل سقوط است \n4 آه، نه اصلاُ! \n\n path \\\n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n\n _path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... " + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# new_test_df = test_df.copy()\n", - "# new_test_df[\"_path\"] = new_test_df[\"path\"]\n", - "# new_test_df[\"path\"] = new_test_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/code/data/fa/dataset/clips\", t.split(\"/\")[-1]))\n", - "# print(new_test_df.info())\n", - "# new_test_df.head()" + "new_test_df = test_df.copy()\n", + "new_test_df[\"_path\"] = new_test_df[\"path\"]\n", + "new_test_df[\"path\"] = new_test_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/data/fa/clips\", t.split(\"/\")[-1]))\n", + "print(new_test_df.info())\n", + "print(new_test_df.iloc[0][\"_path\"])\n", + "print(new_test_df.iloc[0][\"path\"])\n", + "new_test_df.head()" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ - "# import shutil\n", - "# from tqdm import tqdm" + "import shutil\n", + "from tqdm import tqdm" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 32, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/m3hrdadfi/data/fa\n" + ] + } + ], "source": [ - "# 
!mkdir -p {save_path}/clips\n", - "# !mkdir -p {save_path}/augs" + "print(save_path)" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ - "# for index, row in tqdm(new_train_df.iterrows(), position=0, total=len(new_train_df)):\n", - "# shutil.copy(row[\"_path\"], row[\"path\"])" + "!mkdir -p {save_path}/clips\n", + "!mkdir -p {save_path}/augs" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 34, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 268956/268956 [02:40<00:00, 1675.19it/s]\n" + ] + } + ], "source": [ - "# for index, row in tqdm(new_test_df.iterrows(), position=0, total=len(new_test_df)):\n", - "# shutil.copy(row[\"_path\"], row[\"path\"])" + "for index, row in tqdm(new_train_df.iterrows(), position=0, total=len(new_train_df)):\n", + " shutil.copy(row[\"_path\"], row[\"path\"])" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 35, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 5213/5213 [00:01<00:00, 4777.79it/s]\n" + ] + } + ], "source": [ - "# # aug_train_df = new_train_df.copy()\n", - "# aug_train_df = new_train_df.sample(frac=0.1)\n", - "# aug_train_df = aug_train_df.reset_index(drop=True)\n", - "# aug_train_df[\"_path\"] = aug_train_df[\"path\"]\n", - "# aug_train_df[\"path\"] = aug_train_df[\"path\"].apply(lambda t: \"/\".join(t.split('.')[:-1]).replace(\"clips\", \"augs\") + \"_aug.mp3.wav\")\n", - "# print(aug_train_df.info())\n", - "# aug_train_df.head()" + "for index, row in tqdm(new_test_df.iterrows(), position=0, total=len(new_test_df)):\n", + " shutil.copy(row[\"_path\"], row[\"path\"])" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 36, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", 
+ "text": [ + "\n", + "RangeIndex: 26896 entries, 0 to 26895\n", + "Data columns (total 3 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 sentence 26896 non-null object\n", + " 1 path 26896 non-null object\n", + " 2 _path 26896 non-null object\n", + "dtypes: object(3)\n", + "memory usage: 630.5+ KB\n", + "None\n" + ] + }, + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sentencepath_path
0کدامیک ارزان تر است؟/home/m3hrdadfi/data/fa/augs/common_voice_fa_2.../home/m3hrdadfi/data/fa/clips/common_voice_fa_...
1آیا قرمز را بیشتر از آبی دوست داری؟/home/m3hrdadfi/data/fa/augs/common_voice_fa_2.../home/m3hrdadfi/data/fa/clips/common_voice_fa_...
2من می خوام کمک کنم/home/m3hrdadfi/data/fa/augs/common_voice_fa_1.../home/m3hrdadfi/data/fa/clips/common_voice_fa_...
3در آفریقای جنوبی، برنامهای به نام دختران تکنو هست/home/m3hrdadfi/data/fa/augs/common_voice_fa_1.../home/m3hrdadfi/data/fa/clips/common_voice_fa_...
4حالا، این موضوع به ما فرصت ایجاد چند سناریو را.../home/m3hrdadfi/data/fa/augs/common_voice_fa_1.../home/m3hrdadfi/data/fa/clips/common_voice_fa_...
\n
", + "text/plain": " sentence \\\n0 کدامیک ارزان تر است؟ \n1 آیا قرمز را بیشتر از آبی دوست داری؟ \n2 من می خوام کمک کنم \n3 در آفریقای جنوبی، برنامهای به نام دختران تکنو هست \n4 حالا، این موضوع به ما فرصت ایجاد چند سناریو را... \n\n path \\\n0 /home/m3hrdadfi/data/fa/augs/common_voice_fa_2... \n1 /home/m3hrdadfi/data/fa/augs/common_voice_fa_2... \n2 /home/m3hrdadfi/data/fa/augs/common_voice_fa_1... \n3 /home/m3hrdadfi/data/fa/augs/common_voice_fa_1... \n4 /home/m3hrdadfi/data/fa/augs/common_voice_fa_1... \n\n _path \n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... " + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# print(aug_train_df.iloc[0][\"_path\"])\n", - "# print(aug_train_df.iloc[0][\"path\"])" + "# aug_train_df = new_train_df.copy()\n", + "aug_train_df = new_train_df.sample(frac=0.1)\n", + "aug_train_df = aug_train_df.reset_index(drop=True)\n", + "aug_train_df[\"_path\"] = aug_train_df[\"path\"]\n", + "aug_train_df[\"path\"] = aug_train_df[\"path\"].apply(lambda t: \"/\".join(t.split('.')[:-1]).replace(\"clips\", \"augs\") + \"_aug.mp3.wav\")\n", + "print(aug_train_df.info())\n", + "aug_train_df.head()" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/m3hrdadfi/data/fa/clips/common_voice_fa_20109281.mp3\n", + "/home/m3hrdadfi/data/fa/augs/common_voice_fa_20109281_aug.mp3.wav\n" + ] + } + ], + "source": [ + "print(aug_train_df.iloc[0][\"_path\"])\n", + "print(aug_train_df.iloc[0][\"path\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ - "# # augmentation\n", + "# augmentation\n", "\n", - "# from 
audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, Gain\n", - "# import numpy as np\n", - "# import soundfile as sf\n", + "from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, Gain\n", + "import numpy as np\n", + "import soundfile as sf\n", "\n", - "# augment = Compose([\n", - "# # AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n", - "# # PitchShift(min_semitones=-1, max_semitones=2, p=0.2),\n", - "# # Gain(min_gain_in_db=-6, max_gain_in_db=6, p=0.8)\n", + "augment = Compose([\n", "# AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n", - "# TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),\n", - "# PitchShift(min_semitones=-4, max_semitones=4, p=0.5),\n", - "# ])\n", + "# PitchShift(min_semitones=-1, max_semitones=2, p=0.2),\n", + "# Gain(min_gain_in_db=-6, max_gain_in_db=6, p=0.8)\n", + " AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n", + " TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),\n", + " PitchShift(min_semitones=-4, max_semitones=4, p=0.5),\n", + "])\n", "\n", - "# def augmented_speech_file_to_array_fn(in_path, out_path):\n", - "# speech_array, sampling_rate = torchaudio.load(in_path)\n", - "# speech_array = speech_array.squeeze().numpy()\n", - "# speech_array = augment(samples=speech_array, sample_rate=sampling_rate)\n", - "# sf.write(out_path, speech_array, sampling_rate, \"PCM_24\")" + "def augmented_speech_file_to_array_fn(in_path, out_path):\n", + " speech_array, sampling_rate = torchaudio.load(in_path)\n", + " speech_array = speech_array.squeeze().numpy()\n", + " speech_array = augment(samples=speech_array, sample_rate=sampling_rate)\n", + " sf.write(out_path, speech_array, sampling_rate, \"PCM_24\")" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 40, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 26896/26896 
[1:18:09<00:00, 5.74it/s]\n" + ] + } + ], "source": [ - "# # for index, row in tqdm(aug_train_df.iterrows(), position=0, total=len(aug_train_df)):\n", - "# # augmented_speech_file_to_array_fn(row[\"_path\"], row[\"path\"])\n", + "for index, row in tqdm(aug_train_df.iterrows(), position=0, total=len(aug_train_df)):\n", + " augmented_speech_file_to_array_fn(row[\"_path\"], row[\"path\"])\n", "# !ls" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 42, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 295852 entries, 0 to 295851\n", + "Data columns (total 3 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 sentence 295852 non-null object\n", + " 1 path 295852 non-null object\n", + " 2 _path 295852 non-null object\n", + "dtypes: object(3)\n", + "memory usage: 6.8+ MB\n", + "None\n" + ] + }, + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sentencepath_path
0اما دیدم نه،هیچ جوره نمیتونم ببخشمش به خدا گفت.../home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
1برای امروز./home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
2چون اگر میدانیم چیزی که بیگناه در نظر میگیریم .../home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
3ضمیر من را بدانید -- آقا، خانم، ایشان/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
4تا تقویت و تکثیرشان کنیم/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
\n
", + "text/plain": " sentence \\\n0 اما دیدم نه،هیچ جوره نمیتونم ببخشمش به خدا گفت... \n1 برای امروز. \n2 چون اگر میدانیم چیزی که بیگناه در نظر میگیریم ... \n3 ضمیر من را بدانید -- آقا، خانم، ایشان \n4 تا تقویت و تکثیرشان کنیم \n\n path \\\n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n\n _path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... " + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# # new_train_aug_df = pd.concat([new_train_df, aug_train_df], axis=0)\n", + "new_train_aug_df = pd.concat([new_train_df, aug_train_df], axis=0)\n", "# new_train_aug_df = new_train_df.copy()\n", - "# new_train_aug_df = new_train_aug_df.sample(frac=1)\n", - "# new_train_aug_df = new_train_aug_df.reset_index(drop=True)\n", - "# print(new_train_aug_df.info())\n", - "# new_train_aug_df.head()" + "new_train_aug_df = new_train_aug_df.sample(frac=1)\n", + "new_train_aug_df = new_train_aug_df.reset_index(drop=True)\n", + "print(new_train_aug_df.info())\n", + "new_train_aug_df.head()" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 43, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": "'/home/m3hrdadfi/data/fa'" + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# new_train_df.to_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n", - "# new_train_aug_df.to_csv(f\"{save_path}/train_with_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n", - 
"# new_test_df.to_csv(f\"{save_path}/test.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)" + "save_path" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ - "# new_train_df.count()" + "new_train_df.to_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n", + "new_train_aug_df.to_csv(f\"{save_path}/train_with_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n", + "new_test_df.to_csv(f\"{save_path}/test.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 45, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": "sentence 268956\npath 268956\n_path 268956\ndtype: int64" + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# new_test_df.count()" + "new_train_df.count()" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": "sentence 5213\npath 5213\n_path 5213\ndtype: int64" + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_test_df.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ - "# import pandas as pd\n", + "import pandas as pd\n", "\n", - "# import os\n", - "# from tqdm import tqdm" + "import os\n", + "from tqdm import tqdm" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 48, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 268956 entries, 0 to 268955\n", + "Data columns (total 3 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 sentence 268956 non-null object\n", + " 1 path 268956 non-null object\n", + " 2 _path 268956 non-null 
object\n", + "dtypes: object(3)\n", + "memory usage: 6.2+ MB\n", + "None\n" + ] + }, + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sentencepath_path
0رآس ، اینجا چه خبره ؟/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
1ممکن است آن را تعمیر کنید وقتی منتظر هستم؟/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
2دلم برای تو تنگ شده است./home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
3دارم اتاقم را تمیز میکنم./home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
4هاورد باهاتون صحبت کنم/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
\n
", + "text/plain": " sentence \\\n0 رآس ، اینجا چه خبره ؟ \n1 ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ \n2 دلم برای تو تنگ شده است. \n3 دارم اتاقم را تمیز میکنم. \n4 هاورد باهاتون صحبت کنم \n\n path \\\n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n\n _path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... " + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# train_df = pd.read_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\")\n", - "# print(train_df.info())\n", - "# train_df.head()" + "train_df = pd.read_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\")\n", + "print(train_df.info())\n", + "train_df.head()" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 49, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 5213 entries, 0 to 5212\n", + "Data columns (total 3 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 sentence 5213 non-null object\n", + " 1 path 5213 non-null object\n", + " 2 _path 5213 non-null object\n", + "dtypes: object(3)\n", + "memory usage: 122.3+ KB\n", + "None\n" + ] + }, + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sentencepath_path
0از مهمونداری کنار بکشم/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
1برو از مهرداد بپرس./home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
2خب ، تو چیكار می كنی؟/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
3مسقط پایتخت عمان در عربی به معنای محل سقوط است/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
4آه، نه اصلاُ!/home/m3hrdadfi/data/fa/clips/common_voice_fa_.../home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...
\n
", + "text/plain": " sentence \\\n0 از مهمونداری کنار بکشم \n1 برو از مهرداد بپرس. \n2 خب ، تو چیكار می كنی؟ \n3 مسقط پایتخت عمان در عربی به معنای محل سقوط است \n4 آه، نه اصلاُ! \n\n path \\\n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n\n _path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... " + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# test_df = pd.read_csv(f\"{save_path}/test.csv\", sep=\"\\t\")\n", - "# print(test_df.info())\n", - "# test_df.head()" + "test_df = pd.read_csv(f\"{save_path}/test.csv\", sep=\"\\t\")\n", + "print(test_df.info())\n", + "test_df.head()" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 50, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 268956/268956 [00:11<00:00, 24344.12it/s]\n" + ] + } + ], "source": [ - "# non_existed_train = []\n", + "non_existed_train = []\n", "\n", - "# for index, row in tqdm(train_df.iterrows(), total=len(train_df), position=0):\n", - "# if not os.path.exists(row[\"path\"]):\n", - "# non_existed_train.extends(list(index))\n", - "# break" + "for index, row in tqdm(train_df.iterrows(), total=len(train_df), position=0):\n", + " if not os.path.exists(row[\"path\"]):\n", + " non_existed_train.extends(list(index))\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": "[]" + }, + "execution_count": 51, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "non_existed_train" ] }, { @@ -604,8 +942,13 @@ ], "metadata": { "kernelspec": { - "display_name": "transformers", - "name": "transformers" + "display_name": "Python 3.8.10 ('jax-env': venv)", + "metadata": { + "interpreter": { + "hash": "d26705e03f37deada2a9ba7d9c91760e1381e108d31e47ed80b202768ffcaf62" + } + }, + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -617,7 +960,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.4" + "version": "3.8.10" }, "orig_nbformat": 2 },