{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# !mkdir -p /home/m3hrdadfi/code/data\n",
"# %cd /home/m3hrdadfi/code/data\n",
"# !wget https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fa.tar.gz && tar -xzf fa.tar.gz\n",
"# %cd /home/m3hrdadfi/"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/m3hrdadfi/data/fa/cvfa/fa\n",
"/home/m3hrdadfi/data/fa\n",
"\n",
"cvfa fa.tar.gz\n",
"/home/m3hrdadfi/data/fa/cvfa/fa/dev.tsv\n",
"/home/m3hrdadfi/data/fa/cvfa/fa/invalidated.tsv\n",
"/home/m3hrdadfi/data/fa/cvfa/fa/other.tsv\n",
"/home/m3hrdadfi/data/fa/cvfa/fa/reported.tsv\n",
"/home/m3hrdadfi/data/fa/cvfa/fa/test.tsv\n",
"/home/m3hrdadfi/data/fa/cvfa/fa/train.tsv\n",
"/home/m3hrdadfi/data/fa/cvfa/fa/validated.tsv\n"
]
}
],
"source": [
"import os\n",
"\n",
"lang = \"fa\"\n",
"abs_path_to_data = os.path.join(f\"/home/m3hrdadfi/data/{lang}\", f\"cv{lang}\", lang)\n",
"save_path = \"/\".join(abs_path_to_data.split('/')[:-2])\n",
"print(abs_path_to_data)\n",
"print(save_path)\n",
"print()\n",
"!ls {save_path}\n",
"!ls {abs_path_to_data}/*.tsv"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Step 0: 5213\n",
"Step 1: 5213\n",
"Step 2: 5213\n",
"Step 3: 5213\n"
]
},
{
"data": {
"text/html": "
\n\n
\n \n \n | \n sentence | \n path | \n
\n \n \n \n 0 | \n از مهمونداری کنار بکشم | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 1 | \n برو از مهرداد بپرس. | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 2 | \n خب ، تو چیكار می كنی؟ | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 3 | \n مسقط پایتخت عمان در عربی به معنای محل سقوط است | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 4 | \n آه، نه اصلاُ! | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n
\n
",
"text/plain": " sentence \\\n0 از مهمونداری کنار بکشم \n1 برو از مهرداد بپرس. \n2 خب ، تو چیكار می كنی؟ \n3 مسقط پایتخت عمان در عربی به معنای محل سقوط است \n4 آه، نه اصلاُ! \n\n path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... "
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_df = pd.read_csv(f\"{abs_path_to_data}/test.tsv\", sep=\"\\t\")\n",
"\n",
"print(f\"Step 0: {len(test_df)}\")\n",
"\n",
"test_df[\"path\"] = abs_path_to_data + \"/clips/\" + test_df[\"path\"]\n",
"test_df[\"status\"] = test_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n",
"test_df = test_df.dropna(subset=[\"path\"])\n",
"test_df = test_df.drop(\"status\", 1)\n",
"print(f\"Step 1: {len(test_df)}\")\n",
"\n",
"test_df = test_df.dropna(subset=[\"sentence\"])\n",
"print(f\"Step 2: {len(test_df)}\")\n",
"\n",
"test_df = test_df[[\"sentence\", \"path\"]]\n",
"test_df = test_df.drop_duplicates(subset=\"path\")\n",
"print(f\"Step 3: {len(test_df)}\")\n",
"\n",
"test_df = test_df.reset_index(drop=True)\n",
"test_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"12806\n",
"Step 0: 286975\n",
"Step 1: 286975\n",
"Step 2: 286975\n",
"Step 3: 274169\n"
]
},
{
"data": {
"text/html": "\n\n
\n \n \n | \n sentence | \n path | \n
\n \n \n \n 0 | \n رآس ، اینجا چه خبره ؟ | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 1 | \n ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 2 | \n دلم برای تو تنگ شده است. | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 3 | \n دارم اتاقم را تمیز میکنم. | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 4 | \n هاورد باهاتون صحبت کنم | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n
\n
",
"text/plain": " sentence \\\n0 رآس ، اینجا چه خبره ؟ \n1 ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ \n2 دلم برای تو تنگ شده است. \n3 دارم اتاقم را تمیز میکنم. \n4 هاورد باهاتون صحبت کنم \n\n path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... "
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"_train_df = pd.concat([\n",
" pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n",
" pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n",
"])\n",
"print(len(_train_df))\n",
"\n",
"train_df = pd.concat([\n",
" pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n",
" pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n",
" pd.read_csv(f\"{abs_path_to_data}/validated.tsv\", sep=\"\\t\"),\n",
" pd.read_csv(f\"{abs_path_to_data}/other.tsv\", sep=\"\\t\"),\n",
"])\n",
"print(f\"Step 0: {len(train_df)}\")\n",
"\n",
"train_df[\"path\"] = abs_path_to_data + \"/clips/\" + train_df[\"path\"]\n",
"train_df[\"status\"] = train_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n",
"train_df = train_df.dropna(subset=[\"path\"])\n",
"train_df = train_df.drop(\"status\", 1)\n",
"print(f\"Step 1: {len(train_df)}\")\n",
"\n",
"train_df = train_df.dropna(subset=[\"sentence\"])\n",
"print(f\"Step 2: {len(train_df)}\")\n",
"\n",
"train_df = train_df[[\"sentence\", \"path\"]]\n",
"train_df = train_df.drop_duplicates(subset=\"path\")\n",
"print(f\"Step 3: {len(train_df)}\")\n",
"\n",
"train_df = train_df.sample(frac=1)\n",
"train_df = train_df.reset_index(drop=True)\n",
"train_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 5213/5213 [02:58<00:00, 29.27it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found #5213 test data\n"
]
}
],
"source": [
"from tqdm import tqdm\n",
"\n",
"testset_indices = []\n",
"\n",
"for index, row in tqdm(test_df.iterrows(), total=len(test_df), position=0):\n",
" _id = row[\"path\"]\n",
" finder = train_df[train_df[\"path\"] == _id]\n",
" if len(finder) > 0:\n",
" testset_indices.extend(list(finder.index))\n",
"\n",
"testset_indices = list(set(testset_indices))\n",
"print(f\"Found #{len(testset_indices)} test data\")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"274169\n",
"268956\n"
]
}
],
"source": [
"print(len(train_df))\n",
"train_df = train_df.drop(testset_indices)\n",
"print(len(train_df))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Int64Index: 274169 entries, 0 to 5212\n",
"Data columns (total 2 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 sentence 274169 non-null object\n",
" 1 path 274169 non-null object\n",
"dtypes: object(2)\n",
"memory usage: 6.3+ MB\n",
"None\n",
"\n",
"RangeIndex: 274169 entries, 0 to 274168\n",
"Data columns (total 2 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 sentence 274169 non-null object\n",
" 1 path 274169 non-null object\n",
"dtypes: object(2)\n",
"memory usage: 4.2+ MB\n",
"None\n"
]
},
{
"data": {
"text/html": "\n\n
\n \n \n | \n sentence | \n path | \n
\n \n \n \n 0 | \n رآس ، اینجا چه خبره ؟ | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 1 | \n ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 2 | \n دلم برای تو تنگ شده است. | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 3 | \n دارم اتاقم را تمیز میکنم. | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 4 | \n هاورد باهاتون صحبت کنم | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n
\n
",
"text/plain": " sentence \\\n0 رآس ، اینجا چه خبره ؟ \n1 ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ \n2 دلم برای تو تنگ شده است. \n3 دارم اتاقم را تمیز میکنم. \n4 هاورد باهاتون صحبت کنم \n\n path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... "
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.concat([train_df, test_df], axis=0)\n",
"print(df.info())\n",
"df = df.reset_index(drop=True)\n",
"print(df.info())\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"import torchaudio\n",
"import librosa\n",
"import IPython.display as ipd\n",
"import numpy as np\n",
"\n",
"def load_audio(path):\n",
" speech, sr = torchaudio.load(path)\n",
" speech = speech[0].numpy().squeeze() \n",
" speech = librosa.resample(np.asarray(speech), sr, 16_000)\n",
" \n",
" print(speech.shape, sr)\n",
" \n",
" ipd.display(ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# main_vocab = [\"ح\", \"چ\", \"ج\", \"ث\", \"ت\", \"پ\", \"ب\", \"آ\", \"ا\", \"ش\", \"س\", \"ژ\", \"ز\", \"ر\", \"ذ\", \"د\", \"خ\", \"ق\", \"ف\", \"غ\", \"ع\", \"ظ\", \"ط\", \"ض\", \"ص\", \"ی\", \"ه\", \"و\", \"ن\", \"م\", \"ل\", \"گ\", \"ک\"]\n",
"# text = \" \".join(df[\"sentence\"].values.tolist())\n",
"# vocab = list(sorted(set(text)))\n",
"\n",
"# for v in main_vocab:\n",
"# if v not in vocab:\n",
"# print(\"v\", v)\n",
"\n",
"# print(len(main_vocab), len(vocab))\n",
"# print(len(vocab), vocab)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "sentence می توانید لطفاً سفر را برای من ترتیب دهید؟\npath /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v...\nName: 95177, dtype: object"
},
"metadata": {
"transient": {}
},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"می توانید لطفاً سفر را برای من ترتیب دهید؟\n",
"\n",
"(70272,) 48000\n"
]
},
{
"data": {
"text/html": "\n \n ",
"text/plain": ""
},
"metadata": {
"transient": {}
},
"output_type": "display_data"
},
{
"data": {
"text/html": "\n\n
\n \n \n | \n sentence | \n path | \n
\n \n \n \n 0 | \n رآس ، اینجا چه خبره ؟ | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 1 | \n ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 2 | \n دلم برای تو تنگ شده است. | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 3 | \n دارم اتاقم را تمیز میکنم. | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 4 | \n هاورد باهاتون صحبت کنم | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n
\n
",
"text/plain": " sentence \\\n0 رآس ، اینجا چه خبره ؟ \n1 ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ \n2 دلم برای تو تنگ شده است. \n3 دارم اتاقم را تمیز میکنم. \n4 هاورد باهاتون صحبت کنم \n\n path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... "
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"\n",
"\n",
"idx = np.random.randint(0, len(df))\n",
"# idx = 6140\n",
"sample = df.iloc[idx]\n",
"ipd.display(sample)\n",
"\n",
"print()\n",
"print(sample[\"sentence\"])\n",
"print()\n",
"load_audio(sample[\"path\"])\n",
"\n",
"train_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Int64Index: 268956 entries, 0 to 274168\n",
"Data columns (total 3 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 sentence 268956 non-null object\n",
" 1 path 268956 non-null object\n",
" 2 _path 268956 non-null object\n",
"dtypes: object(3)\n",
"memory usage: 8.2+ MB\n",
"None\n",
"/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_voice_fa_20100079.mp3\n",
"/home/m3hrdadfi/data/fa/clips/common_voice_fa_20100079.mp3\n"
]
},
{
"data": {
"text/html": "\n\n
\n \n \n | \n sentence | \n path | \n _path | \n
\n \n \n \n 0 | \n رآس ، اینجا چه خبره ؟ | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 1 | \n ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 2 | \n دلم برای تو تنگ شده است. | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 3 | \n دارم اتاقم را تمیز میکنم. | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 4 | \n هاورد باهاتون صحبت کنم | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n
\n
",
"text/plain": " sentence \\\n0 رآس ، اینجا چه خبره ؟ \n1 ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ \n2 دلم برای تو تنگ شده است. \n3 دارم اتاقم را تمیز میکنم. \n4 هاورد باهاتون صحبت کنم \n\n path \\\n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n\n _path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... "
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_train_df = train_df.copy()\n",
"new_train_df[\"_path\"] = new_train_df[\"path\"]\n",
"new_train_df[\"path\"] = new_train_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/data/fa/clips\", t.split(\"/\")[-1]))\n",
"print(new_train_df.info())\n",
"print(new_train_df.iloc[0][\"_path\"])\n",
"print(new_train_df.iloc[0][\"path\"])\n",
"new_train_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 5213 entries, 0 to 5212\n",
"Data columns (total 3 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 sentence 5213 non-null object\n",
" 1 path 5213 non-null object\n",
" 2 _path 5213 non-null object\n",
"dtypes: object(3)\n",
"memory usage: 122.3+ KB\n",
"None\n",
"/home/m3hrdadfi/data/fa/cvfa/fa/clips/common_voice_fa_18325365.mp3\n",
"/home/m3hrdadfi/data/fa/clips/common_voice_fa_18325365.mp3\n"
]
},
{
"data": {
"text/html": "\n\n
\n \n \n | \n sentence | \n path | \n _path | \n
\n \n \n \n 0 | \n از مهمونداری کنار بکشم | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 1 | \n برو از مهرداد بپرس. | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 2 | \n خب ، تو چیكار می كنی؟ | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 3 | \n مسقط پایتخت عمان در عربی به معنای محل سقوط است | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 4 | \n آه، نه اصلاُ! | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n
\n
",
"text/plain": " sentence \\\n0 از مهمونداری کنار بکشم \n1 برو از مهرداد بپرس. \n2 خب ، تو چیكار می كنی؟ \n3 مسقط پایتخت عمان در عربی به معنای محل سقوط است \n4 آه، نه اصلاُ! \n\n path \\\n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n\n _path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... "
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_test_df = test_df.copy()\n",
"new_test_df[\"_path\"] = new_test_df[\"path\"]\n",
"new_test_df[\"path\"] = new_test_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/data/fa/clips\", t.split(\"/\")[-1]))\n",
"print(new_test_df.info())\n",
"print(new_test_df.iloc[0][\"_path\"])\n",
"print(new_test_df.iloc[0][\"path\"])\n",
"new_test_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"import shutil\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/m3hrdadfi/data/fa\n"
]
}
],
"source": [
"print(save_path)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"!mkdir -p {save_path}/clips\n",
"!mkdir -p {save_path}/augs"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 268956/268956 [02:40<00:00, 1675.19it/s]\n"
]
}
],
"source": [
"for index, row in tqdm(new_train_df.iterrows(), position=0, total=len(new_train_df)):\n",
" shutil.copy(row[\"_path\"], row[\"path\"])"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 5213/5213 [00:01<00:00, 4777.79it/s]\n"
]
}
],
"source": [
"for index, row in tqdm(new_test_df.iterrows(), position=0, total=len(new_test_df)):\n",
" shutil.copy(row[\"_path\"], row[\"path\"])"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 26896 entries, 0 to 26895\n",
"Data columns (total 3 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 sentence 26896 non-null object\n",
" 1 path 26896 non-null object\n",
" 2 _path 26896 non-null object\n",
"dtypes: object(3)\n",
"memory usage: 630.5+ KB\n",
"None\n"
]
},
{
"data": {
"text/html": "\n\n
\n \n \n | \n sentence | \n path | \n _path | \n
\n \n \n \n 0 | \n کدامیک ارزان تر است؟ | \n /home/m3hrdadfi/data/fa/augs/common_voice_fa_2... | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n
\n \n 1 | \n آیا قرمز را بیشتر از آبی دوست داری؟ | \n /home/m3hrdadfi/data/fa/augs/common_voice_fa_2... | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n
\n \n 2 | \n من می خوام کمک کنم | \n /home/m3hrdadfi/data/fa/augs/common_voice_fa_1... | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n
\n \n 3 | \n در آفریقای جنوبی، برنامهای به نام دختران تکنو هست | \n /home/m3hrdadfi/data/fa/augs/common_voice_fa_1... | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n
\n \n 4 | \n حالا، این موضوع به ما فرصت ایجاد چند سناریو را... | \n /home/m3hrdadfi/data/fa/augs/common_voice_fa_1... | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n
\n \n
\n
",
"text/plain": " sentence \\\n0 کدامیک ارزان تر است؟ \n1 آیا قرمز را بیشتر از آبی دوست داری؟ \n2 من می خوام کمک کنم \n3 در آفریقای جنوبی، برنامهای به نام دختران تکنو هست \n4 حالا، این موضوع به ما فرصت ایجاد چند سناریو را... \n\n path \\\n0 /home/m3hrdadfi/data/fa/augs/common_voice_fa_2... \n1 /home/m3hrdadfi/data/fa/augs/common_voice_fa_2... \n2 /home/m3hrdadfi/data/fa/augs/common_voice_fa_1... \n3 /home/m3hrdadfi/data/fa/augs/common_voice_fa_1... \n4 /home/m3hrdadfi/data/fa/augs/common_voice_fa_1... \n\n _path \n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... "
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# aug_train_df = new_train_df.copy()\n",
"aug_train_df = new_train_df.sample(frac=0.1)\n",
"aug_train_df = aug_train_df.reset_index(drop=True)\n",
"aug_train_df[\"_path\"] = aug_train_df[\"path\"]\n",
"aug_train_df[\"path\"] = aug_train_df[\"path\"].apply(lambda t: \"/\".join(t.split('.')[:-1]).replace(\"clips\", \"augs\") + \"_aug.mp3.wav\")\n",
"print(aug_train_df.info())\n",
"aug_train_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/m3hrdadfi/data/fa/clips/common_voice_fa_20109281.mp3\n",
"/home/m3hrdadfi/data/fa/augs/common_voice_fa_20109281_aug.mp3.wav\n"
]
}
],
"source": [
"print(aug_train_df.iloc[0][\"_path\"])\n",
"print(aug_train_df.iloc[0][\"path\"])"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"# augmentation\n",
"\n",
"from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, Gain\n",
"import numpy as np\n",
"import soundfile as sf\n",
"\n",
"augment = Compose([\n",
"# AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n",
"# PitchShift(min_semitones=-1, max_semitones=2, p=0.2),\n",
"# Gain(min_gain_in_db=-6, max_gain_in_db=6, p=0.8)\n",
" AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n",
" TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),\n",
" PitchShift(min_semitones=-4, max_semitones=4, p=0.5),\n",
"])\n",
"\n",
"def augmented_speech_file_to_array_fn(in_path, out_path):\n",
" speech_array, sampling_rate = torchaudio.load(in_path)\n",
" speech_array = speech_array.squeeze().numpy()\n",
" speech_array = augment(samples=speech_array, sample_rate=sampling_rate)\n",
" sf.write(out_path, speech_array, sampling_rate, \"PCM_24\")"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 26896/26896 [1:18:09<00:00, 5.74it/s]\n"
]
}
],
"source": [
"for index, row in tqdm(aug_train_df.iterrows(), position=0, total=len(aug_train_df)):\n",
" augmented_speech_file_to_array_fn(row[\"_path\"], row[\"path\"])\n",
"# !ls"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 295852 entries, 0 to 295851\n",
"Data columns (total 3 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 sentence 295852 non-null object\n",
" 1 path 295852 non-null object\n",
" 2 _path 295852 non-null object\n",
"dtypes: object(3)\n",
"memory usage: 6.8+ MB\n",
"None\n"
]
},
{
"data": {
"text/html": "\n\n
\n \n \n | \n sentence | \n path | \n _path | \n
\n \n \n \n 0 | \n اما دیدم نه،هیچ جوره نمیتونم ببخشمش به خدا گفت... | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 1 | \n برای امروز. | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 2 | \n چون اگر میدانیم چیزی که بیگناه در نظر میگیریم ... | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 3 | \n ضمیر من را بدانید -- آقا، خانم، ایشان | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 4 | \n تا تقویت و تکثیرشان کنیم | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n
\n
",
"text/plain": " sentence \\\n0 اما دیدم نه،هیچ جوره نمیتونم ببخشمش به خدا گفت... \n1 برای امروز. \n2 چون اگر میدانیم چیزی که بیگناه در نظر میگیریم ... \n3 ضمیر من را بدانید -- آقا، خانم، ایشان \n4 تا تقویت و تکثیرشان کنیم \n\n path \\\n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n\n _path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... "
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_train_aug_df = pd.concat([new_train_df, aug_train_df], axis=0)\n",
"# new_train_aug_df = new_train_df.copy()\n",
"new_train_aug_df = new_train_aug_df.sample(frac=1)\n",
"new_train_aug_df = new_train_aug_df.reset_index(drop=True)\n",
"print(new_train_aug_df.info())\n",
"new_train_aug_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "'/home/m3hrdadfi/data/fa'"
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"save_path"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"new_train_df.to_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n",
"new_train_aug_df.to_csv(f\"{save_path}/train_with_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n",
"new_test_df.to_csv(f\"{save_path}/test.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "sentence 268956\npath 268956\n_path 268956\ndtype: int64"
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_train_df.count()"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "sentence 5213\npath 5213\n_path 5213\ndtype: int64"
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_test_df.count()"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"import os\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 268956 entries, 0 to 268955\n",
"Data columns (total 3 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 sentence 268956 non-null object\n",
" 1 path 268956 non-null object\n",
" 2 _path 268956 non-null object\n",
"dtypes: object(3)\n",
"memory usage: 6.2+ MB\n",
"None\n"
]
},
{
"data": {
"text/html": "\n\n
\n \n \n | \n sentence | \n path | \n _path | \n
\n \n \n \n 0 | \n رآس ، اینجا چه خبره ؟ | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 1 | \n ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 2 | \n دلم برای تو تنگ شده است. | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 3 | \n دارم اتاقم را تمیز میکنم. | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 4 | \n هاورد باهاتون صحبت کنم | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n
\n
",
"text/plain": " sentence \\\n0 رآس ، اینجا چه خبره ؟ \n1 ممکن است آن را تعمیر کنید وقتی منتظر هستم؟ \n2 دلم برای تو تنگ شده است. \n3 دارم اتاقم را تمیز میکنم. \n4 هاورد باهاتون صحبت کنم \n\n path \\\n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n\n _path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... "
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df = pd.read_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\")\n",
"print(train_df.info())\n",
"train_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 5213 entries, 0 to 5212\n",
"Data columns (total 3 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 sentence 5213 non-null object\n",
" 1 path 5213 non-null object\n",
" 2 _path 5213 non-null object\n",
"dtypes: object(3)\n",
"memory usage: 122.3+ KB\n",
"None\n"
]
},
{
"data": {
"text/html": "\n\n
\n \n \n | \n sentence | \n path | \n _path | \n
\n \n \n \n 0 | \n از مهمونداری کنار بکشم | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 1 | \n برو از مهرداد بپرس. | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 2 | \n خب ، تو چیكار می كنی؟ | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 3 | \n مسقط پایتخت عمان در عربی به معنای محل سقوط است | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n 4 | \n آه، نه اصلاُ! | \n /home/m3hrdadfi/data/fa/clips/common_voice_fa_... | \n /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... | \n
\n \n
\n
",
"text/plain": " sentence \\\n0 از مهمونداری کنار بکشم \n1 برو از مهرداد بپرس. \n2 خب ، تو چیكار می كنی؟ \n3 مسقط پایتخت عمان در عربی به معنای محل سقوط است \n4 آه، نه اصلاُ! \n\n path \\\n0 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n1 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n2 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n3 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n4 /home/m3hrdadfi/data/fa/clips/common_voice_fa_... \n\n _path \n0 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n1 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n2 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n3 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... \n4 /home/m3hrdadfi/data/fa/cvfa/fa/clips/common_v... "
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_df = pd.read_csv(f\"{save_path}/test.csv\", sep=\"\\t\")\n",
"print(test_df.info())\n",
"test_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 268956/268956 [00:11<00:00, 24344.12it/s]\n"
]
}
],
"source": [
"non_existed_train = []\n",
"\n",
"for index, row in tqdm(train_df.iterrows(), total=len(train_df), position=0):\n",
" if not os.path.exists(row[\"path\"]):\n",
" non_existed_train.extends(list(index))\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "[]"
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"non_existed_train"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"# import numpy as np\n",
"\n",
"\n",
"# idx = np.random.randint(0, len(train_df))\n",
"# # idx = 6140\n",
"# sample = train_df.iloc[idx]\n",
"# ipd.display(sample)\n",
"# # print(sample.iloc[idx][\"prev_sentence\"])\n",
"# print()\n",
"# print(sample[\"prev_sentence\"])\n",
"# print(sample[\"sentence\"])\n",
"# print()\n",
"# load_audio(sample[\"path\"])"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"# train_df_half = train_df.copy()\n",
"# print(train_df_half.shape)\n",
"# train_df_half = train_df_half.dropna()\n",
"# print(train_df_half.shape)\n",
"# train_df_half = train_df_half.drop_duplicates()\n",
"# print(train_df_half.shape)\n",
"\n",
"# train_df_half = train_df_half.sample(frac=0.5)\n",
"# train_df_half = train_df_half.reset_index(drop=True)\n",
"# print(train_df_half.shape)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"# train_df_half.to_csv(f\"{save_path}/train_no_aug_half.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.10 ('jax-env': venv)",
"metadata": {
"interpreter": {
"hash": "d26705e03f37deada2a9ba7d9c91760e1381e108d31e47ed80b202768ffcaf62"
}
},
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"orig_nbformat": 2
},
"nbformat": 4,
"nbformat_minor": 2
}