dh-mc committed
Commit c755e09
Parent: 778bfcb

refined English translations and Chinese prompt

competition/02_Translation.ipynb CHANGED
@@ -1 +1 @@
- {"cells":[{"cell_type":"code","execution_count":1,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"0ea8b46b-839b-445b-8043-ccdf4e920ace","showTitle":false,"title":""},"id":"YLH80COBzi_F"},"outputs":[],"source":["%load_ext autoreload\n","%autoreload 2"]},{"cell_type":"code","execution_count":2,"metadata":{"id":"63B5exAuzq4M"},"outputs":[],"source":["from pathlib import Path\n","\n","try:\n"," from google.colab import drive\n"," drive.mount('/content/drive')\n"," workding_dir = \"/content/drive/MyDrive/logical-reasoning/\"\n","except ModuleNotFoundError:\n"," workding_dir = str(Path.cwd().parent)"]},{"cell_type":"code","execution_count":3,"metadata":{"executionInfo":{"elapsed":368,"status":"ok","timestamp":1719461634865,"user":{"displayName":"Donghao Huang","userId":"00463591218503521679"},"user_tz":-480},"id":"zFulf0bg0H-9","outputId":"debdd535-c828-40b9-efc0-8a180e5830dd"},"outputs":[{"name":"stdout","output_type":"stream","text":["workding dir: /home/inflaton/code/projects/courses/logical-reasoning\n"]}],"source":["import os\n","import sys\n","\n","os.chdir(workding_dir)\n","sys.path.append(workding_dir)\n","print(\"workding dir:\", workding_dir)"]},{"cell_type":"code","execution_count":4,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"9f67ec60-2f24-411c-84eb-0dd664b44775","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":589,"status":"ok","timestamp":1719462011879,"user":{"displayName":"Donghao Huang","userId":"00463591218503521679"},"user_tz":-480},"id":"DIUiweYYzi_I","outputId":"e16e9247-9077-4b0c-f8ea-17059f05a1c4"},"outputs":[{"name":"stdout","output_type":"stream","text":["loading env vars from: /home/inflaton/code/projects/courses/logical-reasoning/.env\n"]},{"data":{"text/plain":["True"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["from dotenv import find_dotenv, load_dotenv\n","\n","found_dotenv = find_dotenv(\".env\")\n","\n","if len(found_dotenv) == 0:\n"," found_dotenv = find_dotenv(\".env.example\")\n","print(f\"loading env vars from: {found_dotenv}\")\n","load_dotenv(found_dotenv, override=True)"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[{"data":{"text/plain":["(5, 5, 5)"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["import pandas as pd\n","\n","\n","df_dev = pd.read_csv(\"datasets/mgtv/dev.csv\")\n","len(df_dev[\"title\"].value_counts()), len(df_dev[\"puzzle\"].value_counts()), len(\n"," df_dev[\"truth\"].value_counts()\n",")"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["[nltk_data] Downloading package wordnet to /home/inflaton/nltk_data...\n","[nltk_data] Package wordnet is already up-to-date!\n","[nltk_data] Downloading package punkt to /home/inflaton/nltk_data...\n","[nltk_data] Package punkt is already up-to-date!\n","[nltk_data] Downloading package omw-1.4 to /home/inflaton/nltk_data...\n","[nltk_data] Package omw-1.4 is already up-to-date!\n"]}],"source":["from llm_toolkit.translation_utils import translate\n","import pandas as pd\n","\n","\n","def translate_df(df, cache_path=None):\n"," if cache_path and os.path.exists(cache_path):\n"," cache_df = pd.read_csv(cache_path)\n"," else:\n"," cache_df = pd.DataFrame(columns=[\"chinese\", \"english\"])\n","\n"," cache_dict = {k: v for k, v in zip(cache_df[\"chinese\"], cache_df[\"english\"])}\n","\n"," df[\"text\"] = 
df[\"text\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"title\"] = df[\"title\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"label\"] = df[\"label\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"puzzle\"] = df[\"puzzle\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"truth\"] = df[\"truth\"].apply(lambda x: translate(x, cache_dict))\n","\n"," if cache_path:\n"," for k in cache_df[\"chinese\"]:\n"," if k in cache_dict:\n"," del cache_dict[k]\n","\n"," if k in cache_dict:\n"," new_data = {\"chinese\": k, \"english\": cache_dict[k]}\n"," new_row_df = pd.DataFrame([new_data])\n"," df_cache = pd.concat(\n"," [df_cache, new_row_df],\n"," ignore_index=True,\n"," )\n","\n"," cache_df.to_csv(cache_path, index=False)\n","\n"," return df"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[],"source":["df_dev = translate_df(df_dev, \"datasets/mgtv/unique_translations.csv\")"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[],"source":["df_dev.to_csv(\"datasets/mgtv/dev_en.csv\", index=False)"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[],"source":["import pandas as pd\n","\n","df = pd.read_csv(\"datasets/mgtv/train.csv\")"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[],"source":["df = translate_df(df, \"datasets/mgtv/unique_translations.csv\")"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[],"source":["df.to_csv(\"datasets/mgtv/train_en.csv\", index=False)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"data":{"text/plain":["label\n","No 11783\n","Yes 6591\n","It doesn't matter 5076\n","The method of interrogation was wrong 921\n","Correct answer. 629\n","Name: count, dtype: int64"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["df[\"label\"].value_counts()"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>text</th>\n"," <th>label</th>\n"," <th>answer</th>\n"," <th>title</th>\n"," <th>puzzle</th>\n"," <th>truth</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>Did the thief believe in the gods?</td>\n"," <td>No</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>Did they steal the pumpkins to ensure a bounti...</td>\n"," <td>No</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>The villagers like pumpkins too.</td>\n"," <td>It doesn't matter</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>People in the village need to use pumpkins as ...</td>\n"," <td>No</td>\n"," <td>NaN</td>\n"," <td>The Mystery of 
the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>Were they stolen from the village?</td>\n"," <td>Yes</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" text label \\\n","0 Did the thief believe in the gods? No \n","1 Did they steal the pumpkins to ensure a bounti... No \n","2 The villagers like pumpkins too. It doesn't matter \n","3 People in the village need to use pumpkins as ... No \n","4 Were they stolen from the village? Yes \n","\n"," answer title \\\n","0 NaN The Mystery of the Vanishing Pumpkins \n","1 NaN The Mystery of the Vanishing Pumpkins \n","2 NaN The Mystery of the Vanishing Pumpkins \n","3 NaN The Mystery of the Vanishing Pumpkins \n","4 NaN The Mystery of the Vanishing Pumpkins \n","\n"," puzzle \\\n","0 In the village of Zhen, there is a legend that... \n","1 In the village of Zhen, there is a legend that... \n","2 In the village of Zhen, there is a legend that... \n","3 In the village of Zhen, there is a legend that... \n","4 In the village of Zhen, there is a legend that... \n","\n"," truth \n","0 The truth turned out to be related to an old f... \n","1 The truth turned out to be related to an old f... \n","2 The truth turned out to be related to an old f... \n","3 The truth turned out to be related to an old f... \n","4 The truth turned out to be related to an old f... "]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["df.head()"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[],"source":["df_cn = pd.read_csv(\"datasets/mgtv/train.csv\")\n","df_cache = pd.read_csv(\"datasets/mgtv/unique_translations.csv\")"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"data":{"text/plain":["Index(['text', 'label', 'answer', 'title', 'puzzle', 'truth'], dtype='object')"]},"execution_count":16,"metadata":{},"output_type":"execute_result"}],"source":["df_cn.columns"]},{"cell_type":"code","execution_count":17,"metadata":{},"outputs":[{"data":{"text/plain":["0"]},"execution_count":17,"metadata":{},"output_type":"execute_result"}],"source":["count = 0\n","for col in [\"text\", \"title\", \"puzzle\", \"truth\"]:\n"," for c in df_cn[col].unique():\n"," if c not in df_cache[\"chinese\"].values:\n"," # print(c)\n"," loc = df_cn.loc[df_cn[col] == c, col]\n"," first_occurrence_index = loc.index[\n"," 0\n"," ] # Get the index of the first occurrence\n"," # print(f\"First occurrence at index: {first_occurrence_index}\")\n"," row_cn = df_cn.iloc[first_occurrence_index][col]\n"," row_en = df.iloc[first_occurrence_index][col]\n"," new_data = {\"chinese\": row_cn, \"english\": row_en}\n"," new_row_df = pd.DataFrame([new_data])\n"," df_cache = pd.concat(\n"," [df_cache, new_row_df],\n"," ignore_index=True,\n"," )\n"," count += 1\n","\n","count"]},{"cell_type":"code","execution_count":18,"metadata":{},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: 
right;\">\n"," <th></th>\n"," <th>chinese</th>\n"," <th>english</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":["Empty DataFrame\n","Columns: [chinese, english]\n","Index: []"]},"execution_count":18,"metadata":{},"output_type":"execute_result"}],"source":["import re\n","\n","# Function to check if an English translation contains Chinese characters\n","def contains_chinese(text):\n"," return bool(re.search(r\"[\\u4e00-\\u9fff]\", str(text)))\n","\n","\n","# Apply the function to the English column to find rows with partial Chinese text\n","partial_translations = df_cache[df_cache[\"english\"].apply(contains_chinese)]\n","\n","partial_translations.head()"]},{"cell_type":"code","execution_count":19,"metadata":{},"outputs":[{"data":{"text/plain":["(0, 2)"]},"execution_count":19,"metadata":{},"output_type":"execute_result"}],"source":["partial_translations.shape"]}],"metadata":{"accelerator":"GPU","application/vnd.databricks.v1+notebook":{"dashboards":[],"environmentMetadata":null,"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"07_MAC_+_Qwen2-7B-Instructi_Unsloth_train","widgets":{}},"colab":{"gpuType":"T4","provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"}},"nbformat":4,"nbformat_minor":0}
 
+ {"cells":[{"cell_type":"code","execution_count":1,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"0ea8b46b-839b-445b-8043-ccdf4e920ace","showTitle":false,"title":""},"id":"YLH80COBzi_F"},"outputs":[],"source":["%load_ext autoreload\n","%autoreload 2"]},{"cell_type":"code","execution_count":2,"metadata":{"id":"63B5exAuzq4M"},"outputs":[],"source":["from pathlib import Path\n","\n","try:\n"," from google.colab import drive\n"," drive.mount('/content/drive')\n"," workding_dir = \"/content/drive/MyDrive/logical-reasoning/\"\n","except ModuleNotFoundError:\n"," workding_dir = str(Path.cwd().parent)"]},{"cell_type":"code","execution_count":3,"metadata":{"executionInfo":{"elapsed":368,"status":"ok","timestamp":1719461634865,"user":{"displayName":"Donghao Huang","userId":"00463591218503521679"},"user_tz":-480},"id":"zFulf0bg0H-9","outputId":"debdd535-c828-40b9-efc0-8a180e5830dd"},"outputs":[{"name":"stdout","output_type":"stream","text":["workding dir: /Users/inflaton/Library/CloudStorage/[email protected]/My Drive/logical-reasoning\n"]}],"source":["import os\n","import sys\n","\n","os.chdir(workding_dir)\n","sys.path.append(workding_dir)\n","print(\"workding dir:\", workding_dir)"]},{"cell_type":"code","execution_count":4,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"9f67ec60-2f24-411c-84eb-0dd664b44775","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":589,"status":"ok","timestamp":1719462011879,"user":{"displayName":"Donghao Huang","userId":"00463591218503521679"},"user_tz":-480},"id":"DIUiweYYzi_I","outputId":"e16e9247-9077-4b0c-f8ea-17059f05a1c4"},"outputs":[{"name":"stdout","output_type":"stream","text":["loading env vars from: /Users/inflaton/Library/CloudStorage/[email protected]/My Drive/logical-reasoning/.env\n"]},{"data":{"text/plain":["True"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["from dotenv import find_dotenv, load_dotenv\n","\n","found_dotenv = find_dotenv(\".env\")\n","\n","if len(found_dotenv) == 0:\n"," found_dotenv = find_dotenv(\".env.example\")\n","print(f\"loading env vars from: {found_dotenv}\")\n","load_dotenv(found_dotenv, override=True)"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[{"data":{"text/plain":["(5, 5, 5)"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["import pandas as pd\n","\n","\n","df_dev = pd.read_csv(\"datasets/mgtv/dev.csv\")\n","len(df_dev[\"title\"].value_counts()), len(df_dev[\"puzzle\"].value_counts()), len(\n"," df_dev[\"truth\"].value_counts()\n",")"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading /Users/inflaton/Library/CloudStorage/[email protected]/My Drive/logical-reasoning/llm_toolkit/translation_utils.py\n"]},{"name":"stderr","output_type":"stream","text":["[nltk_data] Downloading package wordnet to\n","[nltk_data] /Users/inflaton/nltk_data...\n","[nltk_data] Package wordnet is already up-to-date!\n","[nltk_data] Downloading package punkt to /Users/inflaton/nltk_data...\n","[nltk_data] Package punkt is already up-to-date!\n","[nltk_data] Downloading package omw-1.4 to\n","[nltk_data] /Users/inflaton/nltk_data...\n","[nltk_data] Package omw-1.4 is already up-to-date!\n"]}],"source":["from llm_toolkit.translation_utils import translate\n","import pandas as pd\n","\n","\n","def translate_df(df, cache_path=None):\n"," if cache_path 
and os.path.exists(cache_path):\n"," cache_df = pd.read_csv(cache_path)\n"," else:\n"," cache_df = pd.DataFrame(columns=[\"chinese\", \"english\"])\n","\n"," cache_dict = {k: v for k, v in zip(cache_df[\"chinese\"], cache_df[\"english\"])}\n","\n"," df[\"text\"] = df[\"text\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"title\"] = df[\"title\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"label\"] = df[\"label\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"puzzle\"] = df[\"puzzle\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"truth\"] = df[\"truth\"].apply(lambda x: translate(x, cache_dict))\n","\n"," if cache_path:\n"," for k in cache_df[\"chinese\"]:\n"," if k in cache_dict:\n"," del cache_dict[k]\n","\n"," if k in cache_dict:\n"," new_data = {\"chinese\": k, \"english\": cache_dict[k]}\n"," new_row_df = pd.DataFrame([new_data])\n"," df_cache = pd.concat(\n"," [df_cache, new_row_df],\n"," ignore_index=True,\n"," )\n","\n"," cache_df.to_csv(cache_path, index=False)\n","\n"," return df"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[],"source":["df_dev = translate_df(df_dev, \"datasets/mgtv/unique_translations.csv\")"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[],"source":["df_dev.to_csv(\"datasets/mgtv/dev_en.csv\", index=False)"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[],"source":["import pandas as pd\n","\n","df = pd.read_csv(\"datasets/mgtv/train.csv\")"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[],"source":["df = translate_df(df, \"datasets/mgtv/unique_translations.csv\")"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[],"source":["df.to_csv(\"datasets/mgtv/train_en.csv\", index=False)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"data":{"text/plain":["label\n","No 11783\n","Yes 6591\n","Unimportant 5076\n","Incorrect questioning 921\n","Correct answer 629\n","Name: count, dtype: int64"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["df[\"label\"].value_counts()"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>text</th>\n"," <th>label</th>\n"," <th>answer</th>\n"," <th>title</th>\n"," <th>puzzle</th>\n"," <th>truth</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>Did the thief believe in the gods?</td>\n"," <td>No</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>Did they steal the pumpkins to ensure a bounti...</td>\n"," <td>No</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>The villagers like pumpkins too.</td>\n"," <td>Unimportant</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend 
that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>People in the village need to use pumpkins as ...</td>\n"," <td>No</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>Were they stolen from the village?</td>\n"," <td>Yes</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" text label answer \\\n","0 Did the thief believe in the gods? No NaN \n","1 Did they steal the pumpkins to ensure a bounti... No NaN \n","2 The villagers like pumpkins too. Unimportant NaN \n","3 People in the village need to use pumpkins as ... No NaN \n","4 Were they stolen from the village? Yes NaN \n","\n"," title \\\n","0 The Mystery of the Vanishing Pumpkins \n","1 The Mystery of the Vanishing Pumpkins \n","2 The Mystery of the Vanishing Pumpkins \n","3 The Mystery of the Vanishing Pumpkins \n","4 The Mystery of the Vanishing Pumpkins \n","\n"," puzzle \\\n","0 In the village of Zhen, there is a legend that... \n","1 In the village of Zhen, there is a legend that... \n","2 In the village of Zhen, there is a legend that... \n","3 In the village of Zhen, there is a legend that... \n","4 In the village of Zhen, there is a legend that... \n","\n"," truth \n","0 The truth turned out to be related to an old f... \n","1 The truth turned out to be related to an old f... \n","2 The truth turned out to be related to an old f... \n","3 The truth turned out to be related to an old f... \n","4 The truth turned out to be related to an old f... 
"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["df.head()"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[],"source":["df_cn = pd.read_csv(\"datasets/mgtv/train.csv\")\n","df_cache = pd.read_csv(\"datasets/mgtv/unique_translations.csv\")"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"data":{"text/plain":["Index(['text', 'label', 'answer', 'title', 'puzzle', 'truth'], dtype='object')"]},"execution_count":16,"metadata":{},"output_type":"execute_result"}],"source":["df_cn.columns"]},{"cell_type":"code","execution_count":17,"metadata":{},"outputs":[{"data":{"text/plain":["0"]},"execution_count":17,"metadata":{},"output_type":"execute_result"}],"source":["count = 0\n","for col in [\"text\", \"title\", \"puzzle\", \"truth\"]:\n"," for c in df_cn[col].unique():\n"," if c not in df_cache[\"chinese\"].values:\n"," # print(c)\n"," loc = df_cn.loc[df_cn[col] == c, col]\n"," first_occurrence_index = loc.index[\n"," 0\n"," ] # Get the index of the first occurrence\n"," # print(f\"First occurrence at index: {first_occurrence_index}\")\n"," row_cn = df_cn.iloc[first_occurrence_index][col]\n"," row_en = df.iloc[first_occurrence_index][col]\n"," new_data = {\"chinese\": row_cn, \"english\": row_en}\n"," new_row_df = pd.DataFrame([new_data])\n"," df_cache = pd.concat(\n"," [df_cache, new_row_df],\n"," ignore_index=True,\n"," )\n"," count += 1\n","\n","count"]},{"cell_type":"code","execution_count":18,"metadata":{},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>chinese</th>\n"," <th>english</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":["Empty DataFrame\n","Columns: [chinese, english]\n","Index: []"]},"execution_count":18,"metadata":{},"output_type":"execute_result"}],"source":["import re\n","\n","# Function to check if an English translation contains Chinese characters\n","def contains_chinese(text):\n"," return bool(re.search(r\"[\\u4e00-\\u9fff]\", str(text)))\n","\n","\n","# Apply the function to the English column to find rows with partial Chinese text\n","partial_translations = df_cache[df_cache[\"english\"].apply(contains_chinese)]\n","\n","partial_translations.head()"]},{"cell_type":"code","execution_count":19,"metadata":{},"outputs":[{"data":{"text/plain":["(0, 2)"]},"execution_count":19,"metadata":{},"output_type":"execute_result"}],"source":["partial_translations.shape"]}],"metadata":{"accelerator":"GPU","application/vnd.databricks.v1+notebook":{"dashboards":[],"environmentMetadata":null,"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"07_MAC_+_Qwen2-7B-Instructi_Unsloth_train","widgets":{}},"colab":{"gpuType":"T4","provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"}},"nbformat":4,"nbformat_minor":0}
competition/03_EDA_en.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
datasets/mgtv/dev_en.csv CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:5c5af721070a488f493b7119391fccf3a36b9a0e856549ebda44f49fd3714425
- size 2937933
+ oid sha256:2e1acd1da529afa3ac720faa34b6aa4af9ac400c8541c4098311a35bb5ed3846
+ size 2936318
datasets/mgtv/train_en.csv CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:f367d02068d0a185bd8628b2a6acf10d8e314413a4e942fe5d7bef2e86bc7808
- size 23602552
+ oid sha256:0d1ebae37034fe568e24a0ea5b166f0a2e38489a890c9971980336684de7131c
+ size 23556731
datasets/mgtv/unique_translations.csv CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:125827afa7850c08d70b82b14481085400017b2c671a46b40a9c781b4bac8323
- size 1619192
+ oid sha256:1e84d985ddb2974831e549de74def6a16bff41e6eed39b38d8d78529a6615289
+ size 1619169
llm_toolkit/eval.py CHANGED
@@ -2,8 +2,6 @@ import os
 import sys
 import torch
 from dotenv import find_dotenv, load_dotenv
- from llamafactory.chat import ChatModel
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
 found_dotenv = find_dotenv(".env")
 
@@ -16,6 +14,7 @@ path = os.path.dirname(found_dotenv)
 print(f"Adding {path} to sys.path")
 sys.path.append(path)
 
+ from llm_toolkit.translation_engine import *
 from llm_toolkit.translation_utils import *
 
 model_name = os.getenv("MODEL_NAME")
@@ -26,48 +25,6 @@ results_path = os.getenv("RESULTS_PATH")
 
 print(model_name, adapter_name_or_path, load_in_4bit, data_path, results_path)
 
-
- def load_model(
-     model_name,
-     max_seq_length=2048,
-     dtype=None,
-     load_in_4bit=False,
-     adapter_name_or_path=None,
- ):
-     print(f"loading model: {model_name}")
-
-     if adapter_name_or_path:
-         template = "llama3" if "llama-3" in model_name.lower() else "chatml"
-
-         args = dict(
-             model_name_or_path=model_name,
-             adapter_name_or_path=adapter_name_or_path,  # load the saved LoRA adapters
-             template=template,  # same to the one in training
-             finetuning_type="lora",  # same to the one in training
-             quantization_bit=4,  # load 4-bit quantized model
-         )
-         chat_model = ChatModel(args)
-         return chat_model.engine.model, chat_model.engine.tokenizer
-
-     tokenizer = AutoTokenizer.from_pretrained(model_name)
-     bnb_config = BitsAndBytesConfig(
-         load_in_4bit=True,
-         bnb_4bit_quant_type="nf4",
-         bnb_4bit_use_double_quant=False,
-         bnb_4bit_compute_dtype=torch.bfloat16,
-     )
-
-     model = AutoModelForCausalLM.from_pretrained(
-         model_name,
-         quantization_config=bnb_config,
-         # attn_implementation="flash_attention_2",
-         trust_remote_code=True,
-         device_map="auto",
-     )
-
-     return model, tokenizer
-
-
 gpu_stats = torch.cuda.get_device_properties(0)
 start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
 max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
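The removed local load_model duplicated loading logic; the wildcard import of llm_toolkit.translation_engine added in the second hunk suggests an equivalent loader now lives in that module, which is not part of this commit. A minimal sketch of what that module would need to expose for eval.py to keep working (the module contents and signature below are assumptions, not shown in the diff):

# Hypothetical llm_toolkit/translation_engine.py -- not shown in this commit;
# only the wildcard import in eval.py implies something like this exists.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

def load_model(model_name, load_in_4bit=False):
    # Same 4-bit NF4 pattern that was removed from eval.py above.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config if load_in_4bit else None,
        trust_remote_code=True,
        device_map="auto",
    )
    return model, tokenizer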
llm_toolkit/eval_logical_reasoning.py CHANGED
@@ -49,6 +49,8 @@ if len(sys.argv) > 1:
 # create new dataset exluding those idx
 datasets["test"] = datasets["test"].select(range(num))
 
+ print(datasets["test"].to_pandas().head(1))
+
 print("Evaluating model: " + model_name)
 predictions = eval_model(model, tokenizer, datasets["test"])
 
llm_toolkit/llm_utils.py CHANGED
@@ -42,6 +42,11 @@ def load_model(
         torch_dtype=dtype,
         trust_remote_code=True,
         device_map="auto",
+     ) if load_in_4bit else AutoModelForCausalLM.from_pretrained(
+         model_name,
+         torch_dtype=dtype,
+         trust_remote_code=True,
+         device_map="auto",
     )
 
     return model, tokenizer
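The change turns the single from_pretrained call into a conditional expression, which is hard to read across a diff. An equivalent if/else form of what load_model now does (a sketch only; the 4-bit branch's earlier arguments sit above this hunk and are elided here):

if load_in_4bit:
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        # ... 4-bit-specific arguments from the lines above this hunk ...
        torch_dtype=dtype,
        trust_remote_code=True,
        device_map="auto",
    )
else:
    # Full-precision fallback added by this commit.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=dtype,
        trust_remote_code=True,
        device_map="auto",
    )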
llm_toolkit/logical_reasoning_utils.py CHANGED
@@ -58,21 +58,25 @@ def load_logical_reasoning_dataset(data_path, tokenizer=None):
     )
 
     if tokenizer:
-         reasoning_prompt = """你是一个逻辑游戏的主持人。游戏规则如下:
-
- 1. 参与者会得到一个谜题。
- 2. 参与者可以通过提问来获取线索,尝试解开谜题。
- 3. 对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。
- 4. 回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。
- 5. 参与者需要根据回答来推理,并最终找出谜题的正确答案。
+         reasoning_prompt = """你是一个情景猜谜游戏的主持人。游戏规则如下:
+
+ 1. 参与者会得到一个谜面,谜面会描述一个简单又难以理解的事件。
+ 2. 主持人知道谜底,谜底是谜面的答案。
+ 3. 参与者可以询问任何封闭式问题来找寻事件的真相。
+ 4. 对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。各回答的判断标准如下:
+     - 若谜面和谜底能找到问题的答案,回答:是或者不是
+     - 若谜面和谜底不能直接或者间接推断出问题的答案,回答:不重要
+     - 若参与者提问不是一个封闭式问题或者问题难以理解,回答:问法错误
+     - 若参与者提问基本还原了谜底真相,回答:回答正确
+ 5. 回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。
 
 请严格按照这些规则回答参与者提出的问题。
 
- 谜题: {}
+ **谜面:** {}
 
- 实际情况: {}
+ **谜底:** {}
 
- 参与者提出的问题: {}
+ **参与者提出的问题:** {}
 """
     def formatting_prompts_func(examples):
         inputs = examples["text"]