refined English translations and Chinese prompt
- competition/02_Translation.ipynb +1 -1
- competition/03_EDA_en.ipynb +0 -0
- datasets/mgtv/dev_en.csv +2 -2
- datasets/mgtv/train_en.csv +2 -2
- datasets/mgtv/unique_translations.csv +2 -2
- llm_toolkit/eval.py +1 -44
- llm_toolkit/eval_logical_reasoning.py +2 -0
- llm_toolkit/llm_utils.py +5 -0
- llm_toolkit/logical_reasoning_utils.py +14 -10
competition/02_Translation.ipynb
CHANGED
@@ -1 +1 @@
-
{"cells":[{"cell_type":"code","execution_count":1,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"0ea8b46b-839b-445b-8043-ccdf4e920ace","showTitle":false,"title":""},"id":"YLH80COBzi_F"},"outputs":[],"source":["%load_ext autoreload\n","%autoreload 2"]},{"cell_type":"code","execution_count":2,"metadata":{"id":"63B5exAuzq4M"},"outputs":[],"source":["from pathlib import Path\n","\n","try:\n"," from google.colab import drive\n"," drive.mount('/content/drive')\n"," workding_dir = \"/content/drive/MyDrive/logical-reasoning/\"\n","except ModuleNotFoundError:\n"," workding_dir = str(Path.cwd().parent)"]},{"cell_type":"code","execution_count":3,"metadata":{"executionInfo":{"elapsed":368,"status":"ok","timestamp":1719461634865,"user":{"displayName":"Donghao Huang","userId":"00463591218503521679"},"user_tz":-480},"id":"zFulf0bg0H-9","outputId":"debdd535-c828-40b9-efc0-8a180e5830dd"},"outputs":[{"name":"stdout","output_type":"stream","text":["workding dir: /home/inflaton/code/projects/courses/logical-reasoning\n"]}],"source":["import os\n","import sys\n","\n","os.chdir(workding_dir)\n","sys.path.append(workding_dir)\n","print(\"workding dir:\", workding_dir)"]},{"cell_type":"code","execution_count":4,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"9f67ec60-2f24-411c-84eb-0dd664b44775","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":589,"status":"ok","timestamp":1719462011879,"user":{"displayName":"Donghao Huang","userId":"00463591218503521679"},"user_tz":-480},"id":"DIUiweYYzi_I","outputId":"e16e9247-9077-4b0c-f8ea-17059f05a1c4"},"outputs":[{"name":"stdout","output_type":"stream","text":["loading env vars from: /home/inflaton/code/projects/courses/logical-reasoning/.env\n"]},{"data":{"text/plain":["True"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["from dotenv import find_dotenv, load_dotenv\n","\n","found_dotenv = find_dotenv(\".env\")\n","\n","if len(found_dotenv) == 0:\n"," found_dotenv = find_dotenv(\".env.example\")\n","print(f\"loading env vars from: {found_dotenv}\")\n","load_dotenv(found_dotenv, override=True)"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[{"data":{"text/plain":["(5, 5, 5)"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["import pandas as pd\n","\n","\n","df_dev = pd.read_csv(\"datasets/mgtv/dev.csv\")\n","len(df_dev[\"title\"].value_counts()), len(df_dev[\"puzzle\"].value_counts()), len(\n"," df_dev[\"truth\"].value_counts()\n",")"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["[nltk_data] Downloading package wordnet to /home/inflaton/nltk_data...\n","[nltk_data] Package wordnet is already up-to-date!\n","[nltk_data] Downloading package punkt to /home/inflaton/nltk_data...\n","[nltk_data] Package punkt is already up-to-date!\n","[nltk_data] Downloading package omw-1.4 to /home/inflaton/nltk_data...\n","[nltk_data] Package omw-1.4 is already up-to-date!\n"]}],"source":["from llm_toolkit.translation_utils import translate\n","import pandas as pd\n","\n","\n","def translate_df(df, cache_path=None):\n"," if cache_path and os.path.exists(cache_path):\n"," cache_df = pd.read_csv(cache_path)\n"," else:\n"," cache_df = pd.DataFrame(columns=[\"chinese\", \"english\"])\n","\n"," cache_dict = {k: v for k, v in zip(cache_df[\"chinese\"], cache_df[\"english\"])}\n","\n"," df[\"text\"] = 
df[\"text\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"title\"] = df[\"title\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"label\"] = df[\"label\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"puzzle\"] = df[\"puzzle\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"truth\"] = df[\"truth\"].apply(lambda x: translate(x, cache_dict))\n","\n"," if cache_path:\n"," for k in cache_df[\"chinese\"]:\n"," if k in cache_dict:\n"," del cache_dict[k]\n","\n"," if k in cache_dict:\n"," new_data = {\"chinese\": k, \"english\": cache_dict[k]}\n"," new_row_df = pd.DataFrame([new_data])\n"," df_cache = pd.concat(\n"," [df_cache, new_row_df],\n"," ignore_index=True,\n"," )\n","\n"," cache_df.to_csv(cache_path, index=False)\n","\n"," return df"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[],"source":["df_dev = translate_df(df_dev, \"datasets/mgtv/unique_translations.csv\")"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[],"source":["df_dev.to_csv(\"datasets/mgtv/dev_en.csv\", index=False)"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[],"source":["import pandas as pd\n","\n","df = pd.read_csv(\"datasets/mgtv/train.csv\")"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[],"source":["df = translate_df(df, \"datasets/mgtv/unique_translations.csv\")"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[],"source":["df.to_csv(\"datasets/mgtv/train_en.csv\", index=False)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"data":{"text/plain":["label\n","No 11783\n","Yes 6591\n","It doesn't matter 5076\n","The method of interrogation was wrong 921\n","Correct answer. 629\n","Name: count, dtype: int64"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["df[\"label\"].value_counts()"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>text</th>\n"," <th>label</th>\n"," <th>answer</th>\n"," <th>title</th>\n"," <th>puzzle</th>\n"," <th>truth</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>Did the thief believe in the gods?</td>\n"," <td>No</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>Did they steal the pumpkins to ensure a bounti...</td>\n"," <td>No</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>The villagers like pumpkins too.</td>\n"," <td>It doesn't matter</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>People in the village need to use pumpkins as ...</td>\n"," <td>No</td>\n"," <td>NaN</td>\n"," <td>The Mystery of 
the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>Were they stolen from the village?</td>\n"," <td>Yes</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" text label \\\n","0 Did the thief believe in the gods? No \n","1 Did they steal the pumpkins to ensure a bounti... No \n","2 The villagers like pumpkins too. It doesn't matter \n","3 People in the village need to use pumpkins as ... No \n","4 Were they stolen from the village? Yes \n","\n"," answer title \\\n","0 NaN The Mystery of the Vanishing Pumpkins \n","1 NaN The Mystery of the Vanishing Pumpkins \n","2 NaN The Mystery of the Vanishing Pumpkins \n","3 NaN The Mystery of the Vanishing Pumpkins \n","4 NaN The Mystery of the Vanishing Pumpkins \n","\n"," puzzle \\\n","0 In the village of Zhen, there is a legend that... \n","1 In the village of Zhen, there is a legend that... \n","2 In the village of Zhen, there is a legend that... \n","3 In the village of Zhen, there is a legend that... \n","4 In the village of Zhen, there is a legend that... \n","\n"," truth \n","0 The truth turned out to be related to an old f... \n","1 The truth turned out to be related to an old f... \n","2 The truth turned out to be related to an old f... \n","3 The truth turned out to be related to an old f... \n","4 The truth turned out to be related to an old f... "]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["df.head()"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[],"source":["df_cn = pd.read_csv(\"datasets/mgtv/train.csv\")\n","df_cache = pd.read_csv(\"datasets/mgtv/unique_translations.csv\")"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"data":{"text/plain":["Index(['text', 'label', 'answer', 'title', 'puzzle', 'truth'], dtype='object')"]},"execution_count":16,"metadata":{},"output_type":"execute_result"}],"source":["df_cn.columns"]},{"cell_type":"code","execution_count":17,"metadata":{},"outputs":[{"data":{"text/plain":["0"]},"execution_count":17,"metadata":{},"output_type":"execute_result"}],"source":["count = 0\n","for col in [\"text\", \"title\", \"puzzle\", \"truth\"]:\n"," for c in df_cn[col].unique():\n"," if c not in df_cache[\"chinese\"].values:\n"," # print(c)\n"," loc = df_cn.loc[df_cn[col] == c, col]\n"," first_occurrence_index = loc.index[\n"," 0\n"," ] # Get the index of the first occurrence\n"," # print(f\"First occurrence at index: {first_occurrence_index}\")\n"," row_cn = df_cn.iloc[first_occurrence_index][col]\n"," row_en = df.iloc[first_occurrence_index][col]\n"," new_data = {\"chinese\": row_cn, \"english\": row_en}\n"," new_row_df = pd.DataFrame([new_data])\n"," df_cache = pd.concat(\n"," [df_cache, new_row_df],\n"," ignore_index=True,\n"," )\n"," count += 1\n","\n","count"]},{"cell_type":"code","execution_count":18,"metadata":{},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: 
right;\">\n"," <th></th>\n"," <th>chinese</th>\n"," <th>english</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":["Empty DataFrame\n","Columns: [chinese, english]\n","Index: []"]},"execution_count":18,"metadata":{},"output_type":"execute_result"}],"source":["import re\n","\n","# Function to check if an English translation contains Chinese characters\n","def contains_chinese(text):\n"," return bool(re.search(r\"[\\u4e00-\\u9fff]\", str(text)))\n","\n","\n","# Apply the function to the English column to find rows with partial Chinese text\n","partial_translations = df_cache[df_cache[\"english\"].apply(contains_chinese)]\n","\n","partial_translations.head()"]},{"cell_type":"code","execution_count":19,"metadata":{},"outputs":[{"data":{"text/plain":["(0, 2)"]},"execution_count":19,"metadata":{},"output_type":"execute_result"}],"source":["partial_translations.shape"]}],"metadata":{"accelerator":"GPU","application/vnd.databricks.v1+notebook":{"dashboards":[],"environmentMetadata":null,"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"07_MAC_+_Qwen2-7B-Instructi_Unsloth_train","widgets":{}},"colab":{"gpuType":"T4","provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"}},"nbformat":4,"nbformat_minor":0}
+
{"cells":[{"cell_type":"code","execution_count":1,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"0ea8b46b-839b-445b-8043-ccdf4e920ace","showTitle":false,"title":""},"id":"YLH80COBzi_F"},"outputs":[],"source":["%load_ext autoreload\n","%autoreload 2"]},{"cell_type":"code","execution_count":2,"metadata":{"id":"63B5exAuzq4M"},"outputs":[],"source":["from pathlib import Path\n","\n","try:\n"," from google.colab import drive\n"," drive.mount('/content/drive')\n"," workding_dir = \"/content/drive/MyDrive/logical-reasoning/\"\n","except ModuleNotFoundError:\n"," workding_dir = str(Path.cwd().parent)"]},{"cell_type":"code","execution_count":3,"metadata":{"executionInfo":{"elapsed":368,"status":"ok","timestamp":1719461634865,"user":{"displayName":"Donghao Huang","userId":"00463591218503521679"},"user_tz":-480},"id":"zFulf0bg0H-9","outputId":"debdd535-c828-40b9-efc0-8a180e5830dd"},"outputs":[{"name":"stdout","output_type":"stream","text":["workding dir: /Users/inflaton/Library/CloudStorage/[email protected]/My Drive/logical-reasoning\n"]}],"source":["import os\n","import sys\n","\n","os.chdir(workding_dir)\n","sys.path.append(workding_dir)\n","print(\"workding dir:\", workding_dir)"]},{"cell_type":"code","execution_count":4,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"9f67ec60-2f24-411c-84eb-0dd664b44775","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":589,"status":"ok","timestamp":1719462011879,"user":{"displayName":"Donghao Huang","userId":"00463591218503521679"},"user_tz":-480},"id":"DIUiweYYzi_I","outputId":"e16e9247-9077-4b0c-f8ea-17059f05a1c4"},"outputs":[{"name":"stdout","output_type":"stream","text":["loading env vars from: /Users/inflaton/Library/CloudStorage/[email protected]/My Drive/logical-reasoning/.env\n"]},{"data":{"text/plain":["True"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["from dotenv import find_dotenv, load_dotenv\n","\n","found_dotenv = find_dotenv(\".env\")\n","\n","if len(found_dotenv) == 0:\n"," found_dotenv = find_dotenv(\".env.example\")\n","print(f\"loading env vars from: {found_dotenv}\")\n","load_dotenv(found_dotenv, override=True)"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[{"data":{"text/plain":["(5, 5, 5)"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["import pandas as pd\n","\n","\n","df_dev = pd.read_csv(\"datasets/mgtv/dev.csv\")\n","len(df_dev[\"title\"].value_counts()), len(df_dev[\"puzzle\"].value_counts()), len(\n"," df_dev[\"truth\"].value_counts()\n",")"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading /Users/inflaton/Library/CloudStorage/[email protected]/My Drive/logical-reasoning/llm_toolkit/translation_utils.py\n"]},{"name":"stderr","output_type":"stream","text":["[nltk_data] Downloading package wordnet to\n","[nltk_data] /Users/inflaton/nltk_data...\n","[nltk_data] Package wordnet is already up-to-date!\n","[nltk_data] Downloading package punkt to /Users/inflaton/nltk_data...\n","[nltk_data] Package punkt is already up-to-date!\n","[nltk_data] Downloading package omw-1.4 to\n","[nltk_data] /Users/inflaton/nltk_data...\n","[nltk_data] Package omw-1.4 is already up-to-date!\n"]}],"source":["from llm_toolkit.translation_utils import translate\n","import pandas as pd\n","\n","\n","def translate_df(df, cache_path=None):\n"," if cache_path 
and os.path.exists(cache_path):\n"," cache_df = pd.read_csv(cache_path)\n"," else:\n"," cache_df = pd.DataFrame(columns=[\"chinese\", \"english\"])\n","\n"," cache_dict = {k: v for k, v in zip(cache_df[\"chinese\"], cache_df[\"english\"])}\n","\n"," df[\"text\"] = df[\"text\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"title\"] = df[\"title\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"label\"] = df[\"label\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"puzzle\"] = df[\"puzzle\"].apply(lambda x: translate(x, cache_dict))\n"," df[\"truth\"] = df[\"truth\"].apply(lambda x: translate(x, cache_dict))\n","\n"," if cache_path:\n"," for k in cache_df[\"chinese\"]:\n"," if k in cache_dict:\n"," del cache_dict[k]\n","\n"," if k in cache_dict:\n"," new_data = {\"chinese\": k, \"english\": cache_dict[k]}\n"," new_row_df = pd.DataFrame([new_data])\n"," df_cache = pd.concat(\n"," [df_cache, new_row_df],\n"," ignore_index=True,\n"," )\n","\n"," cache_df.to_csv(cache_path, index=False)\n","\n"," return df"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[],"source":["df_dev = translate_df(df_dev, \"datasets/mgtv/unique_translations.csv\")"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[],"source":["df_dev.to_csv(\"datasets/mgtv/dev_en.csv\", index=False)"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[],"source":["import pandas as pd\n","\n","df = pd.read_csv(\"datasets/mgtv/train.csv\")"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[],"source":["df = translate_df(df, \"datasets/mgtv/unique_translations.csv\")"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[],"source":["df.to_csv(\"datasets/mgtv/train_en.csv\", index=False)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"data":{"text/plain":["label\n","No 11783\n","Yes 6591\n","Unimportant 5076\n","Incorrect questioning 921\n","Correct answer 629\n","Name: count, dtype: int64"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["df[\"label\"].value_counts()"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>text</th>\n"," <th>label</th>\n"," <th>answer</th>\n"," <th>title</th>\n"," <th>puzzle</th>\n"," <th>truth</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>Did the thief believe in the gods?</td>\n"," <td>No</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>Did they steal the pumpkins to ensure a bounti...</td>\n"," <td>No</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>The villagers like pumpkins too.</td>\n"," <td>Unimportant</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend 
that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>People in the village need to use pumpkins as ...</td>\n"," <td>No</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>Were they stolen from the village?</td>\n"," <td>Yes</td>\n"," <td>NaN</td>\n"," <td>The Mystery of the Vanishing Pumpkins</td>\n"," <td>In the village of Zhen, there is a legend that...</td>\n"," <td>The truth turned out to be related to an old f...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" text label answer \\\n","0 Did the thief believe in the gods? No NaN \n","1 Did they steal the pumpkins to ensure a bounti... No NaN \n","2 The villagers like pumpkins too. Unimportant NaN \n","3 People in the village need to use pumpkins as ... No NaN \n","4 Were they stolen from the village? Yes NaN \n","\n"," title \\\n","0 The Mystery of the Vanishing Pumpkins \n","1 The Mystery of the Vanishing Pumpkins \n","2 The Mystery of the Vanishing Pumpkins \n","3 The Mystery of the Vanishing Pumpkins \n","4 The Mystery of the Vanishing Pumpkins \n","\n"," puzzle \\\n","0 In the village of Zhen, there is a legend that... \n","1 In the village of Zhen, there is a legend that... \n","2 In the village of Zhen, there is a legend that... \n","3 In the village of Zhen, there is a legend that... \n","4 In the village of Zhen, there is a legend that... \n","\n"," truth \n","0 The truth turned out to be related to an old f... \n","1 The truth turned out to be related to an old f... \n","2 The truth turned out to be related to an old f... \n","3 The truth turned out to be related to an old f... \n","4 The truth turned out to be related to an old f... 
"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["df.head()"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[],"source":["df_cn = pd.read_csv(\"datasets/mgtv/train.csv\")\n","df_cache = pd.read_csv(\"datasets/mgtv/unique_translations.csv\")"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"data":{"text/plain":["Index(['text', 'label', 'answer', 'title', 'puzzle', 'truth'], dtype='object')"]},"execution_count":16,"metadata":{},"output_type":"execute_result"}],"source":["df_cn.columns"]},{"cell_type":"code","execution_count":17,"metadata":{},"outputs":[{"data":{"text/plain":["0"]},"execution_count":17,"metadata":{},"output_type":"execute_result"}],"source":["count = 0\n","for col in [\"text\", \"title\", \"puzzle\", \"truth\"]:\n"," for c in df_cn[col].unique():\n"," if c not in df_cache[\"chinese\"].values:\n"," # print(c)\n"," loc = df_cn.loc[df_cn[col] == c, col]\n"," first_occurrence_index = loc.index[\n"," 0\n"," ] # Get the index of the first occurrence\n"," # print(f\"First occurrence at index: {first_occurrence_index}\")\n"," row_cn = df_cn.iloc[first_occurrence_index][col]\n"," row_en = df.iloc[first_occurrence_index][col]\n"," new_data = {\"chinese\": row_cn, \"english\": row_en}\n"," new_row_df = pd.DataFrame([new_data])\n"," df_cache = pd.concat(\n"," [df_cache, new_row_df],\n"," ignore_index=True,\n"," )\n"," count += 1\n","\n","count"]},{"cell_type":"code","execution_count":18,"metadata":{},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>chinese</th>\n"," <th>english</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":["Empty DataFrame\n","Columns: [chinese, english]\n","Index: []"]},"execution_count":18,"metadata":{},"output_type":"execute_result"}],"source":["import re\n","\n","# Function to check if an English translation contains Chinese characters\n","def contains_chinese(text):\n"," return bool(re.search(r\"[\\u4e00-\\u9fff]\", str(text)))\n","\n","\n","# Apply the function to the English column to find rows with partial Chinese text\n","partial_translations = df_cache[df_cache[\"english\"].apply(contains_chinese)]\n","\n","partial_translations.head()"]},{"cell_type":"code","execution_count":19,"metadata":{},"outputs":[{"data":{"text/plain":["(0, 2)"]},"execution_count":19,"metadata":{},"output_type":"execute_result"}],"source":["partial_translations.shape"]}],"metadata":{"accelerator":"GPU","application/vnd.databricks.v1+notebook":{"dashboards":[],"environmentMetadata":null,"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"07_MAC_+_Qwen2-7B-Instructi_Unsloth_train","widgets":{}},"colab":{"gpuType":"T4","provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"}},"nbformat":4,"nbformat_minor":0}
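The translate_df cell above drives this commit: it runs the text, title, label, puzzle and truth columns through llm_toolkit.translation_utils.translate and persists new Chinese-to-English pairs in unique_translations.csv. Below is a minimal sketch of that round-trip; it assumes translate(text, cache_dict) returns the English string and records any newly translated pair in cache_dict, and its cache-append logic is simplified compared with the notebook cell.

import os
import pandas as pd

from llm_toolkit.translation_utils import translate  # assumed signature: translate(text, cache_dict) -> str


def translate_df(df, cache_path=None):
    # Load previously translated pairs so repeated strings are not re-translated.
    if cache_path and os.path.exists(cache_path):
        cache_df = pd.read_csv(cache_path)
    else:
        cache_df = pd.DataFrame(columns=["chinese", "english"])

    cache_dict = dict(zip(cache_df["chinese"], cache_df["english"]))
    known = set(cache_df["chinese"])

    # Translate every free-text column in place.
    for col in ["text", "title", "label", "puzzle", "truth"]:
        df[col] = df[col].apply(lambda x: translate(x, cache_dict))

    if cache_path:
        # Append only the pairs that were added to the cache during this run.
        new_rows = [{"chinese": k, "english": v} for k, v in cache_dict.items() if k not in known]
        if new_rows:
            cache_df = pd.concat([cache_df, pd.DataFrame(new_rows)], ignore_index=True)
        cache_df.to_csv(cache_path, index=False)

    return df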
competition/03_EDA_en.ipynb
CHANGED
The diff for this file is too large to render.
datasets/mgtv/dev_en.csv
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:2e1acd1da529afa3ac720faa34b6aa4af9ac400c8541c4098311a35bb5ed3846
+size 2936318
datasets/mgtv/train_en.csv
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:0d1ebae37034fe568e24a0ea5b166f0a2e38489a890c9971980336684de7131c
+size 23556731
datasets/mgtv/unique_translations.csv
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:1e84d985ddb2974831e549de74def6a16bff41e6eed39b38d8d78529a6615289
+size 1619169
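All three CSVs above are Git LFS pointer files, so the diffs only swap the sha256 oid and byte size of the stored object; the data itself is fetched with git lfs pull. A small sketch for checking a pulled file against the pointer values shown above:

import hashlib
from pathlib import Path


def verify_lfs_file(path, expected_oid, expected_size):
    # Compare the sha256 digest and byte size of the working-tree file
    # with the values recorded in its Git LFS pointer.
    data = Path(path).read_bytes()
    return hashlib.sha256(data).hexdigest() == expected_oid and len(data) == expected_size


# Values taken from the dev_en.csv pointer above.
print(verify_lfs_file(
    "datasets/mgtv/dev_en.csv",
    "2e1acd1da529afa3ac720faa34b6aa4af9ac400c8541c4098311a35bb5ed3846",
    2936318,
))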
llm_toolkit/eval.py
CHANGED
@@ -2,8 +2,6 @@ import os
 import sys
 import torch
 from dotenv import find_dotenv, load_dotenv
-from llamafactory.chat import ChatModel
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

 found_dotenv = find_dotenv(".env")

@@ -16,6 +14,7 @@ path = os.path.dirname(found_dotenv)
 print(f"Adding {path} to sys.path")
 sys.path.append(path)

+from llm_toolkit.translation_engine import *
 from llm_toolkit.translation_utils import *

 model_name = os.getenv("MODEL_NAME")
@@ -26,48 +25,6 @@ results_path = os.getenv("RESULTS_PATH")

 print(model_name, adapter_name_or_path, load_in_4bit, data_path, results_path)

-
-def load_model(
-    model_name,
-    max_seq_length=2048,
-    dtype=None,
-    load_in_4bit=False,
-    adapter_name_or_path=None,
-):
-    print(f"loading model: {model_name}")
-
-    if adapter_name_or_path:
-        template = "llama3" if "llama-3" in model_name.lower() else "chatml"
-
-        args = dict(
-            model_name_or_path=model_name,
-            adapter_name_or_path=adapter_name_or_path,  # load the saved LoRA adapters
-            template=template,  # same to the one in training
-            finetuning_type="lora",  # same to the one in training
-            quantization_bit=4,  # load 4-bit quantized model
-        )
-        chat_model = ChatModel(args)
-        return chat_model.engine.model, chat_model.engine.tokenizer
-
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_use_double_quant=False,
-        bnb_4bit_compute_dtype=torch.bfloat16,
-    )
-
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        quantization_config=bnb_config,
-        # attn_implementation="flash_attention_2",
-        trust_remote_code=True,
-        device_map="auto",
-    )
-
-    return model, tokenizer
-
-
 gpu_stats = torch.cuda.get_device_properties(0)
 start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
 max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
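With the local load_model removed, eval.py relies on llm_toolkit.translation_engine for model loading. The new call site is not shown in this hunk, so the sketch below is a hypothetical illustration that assumes translation_engine exposes a load_model with the same parameters as the deleted function.

# Hypothetical call site mirroring the deleted function's signature;
# load_model is assumed to be provided by llm_toolkit.translation_engine.
from llm_toolkit.translation_engine import load_model

model, tokenizer = load_model(
    model_name,
    load_in_4bit=load_in_4bit,
    adapter_name_or_path=adapter_name_or_path,
)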
llm_toolkit/eval_logical_reasoning.py
CHANGED
@@ -49,6 +49,8 @@ if len(sys.argv) > 1:
     # create new dataset exluding those idx
     datasets["test"] = datasets["test"].select(range(num))

+print(datasets["test"].to_pandas().head(1))
+
 print("Evaluating model: " + model_name)
 predictions = eval_model(model, tokenizer, datasets["test"])

llm_toolkit/llm_utils.py
CHANGED
@@ -42,6 +42,11 @@ def load_model(
         torch_dtype=dtype,
         trust_remote_code=True,
         device_map="auto",
+    ) if load_in_4bit else AutoModelForCausalLM.from_pretrained(
+        model_name,
+        torch_dtype=dtype,
+        trust_remote_code=True,
+        device_map="auto",
     )

     return model, tokenizer
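The patch above turns the single AutoModelForCausalLM.from_pretrained call in load_model into a conditional expression. Written as a plain if/else it reads as below; the leading arguments of the 4-bit branch sit outside the hunk, so the quantization_config keyword is an assumption.

# Sketch of the patched expression as an explicit branch (names taken from the diff
# context; bnb_config is assumed to be a BitsAndBytesConfig built earlier in load_model).
if load_in_4bit:
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,  # assumed: not visible in this hunk
        torch_dtype=dtype,
        trust_remote_code=True,
        device_map="auto",
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=dtype,
        trust_remote_code=True,
        device_map="auto",
    )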
llm_toolkit/logical_reasoning_utils.py
CHANGED
@@ -58,21 +58,25 @@ def load_logical_reasoning_dataset(data_path, tokenizer=None):
     )

     if tokenizer:
-        reasoning_prompt = """
-
-1.
-2.
-3.
-4.
-
+        reasoning_prompt = """你是一个情景猜谜游戏的主持人。游戏规则如下:
+
+1. 参与者会得到一个谜面,谜面会描述一个简单又难以理解的事件。
+2. 主持人知道谜底,谜底是谜面的答案。
+3. 参与者可以询问任何封闭式问题来找寻事件的真相。
+4. 对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。各回答的判断标准如下:
+    - 若谜面和谜底能找到问题的答案,回答:是或者不是
+    - 若谜面和谜底不能直接或者间接推断出问题的答案,回答:不重要
+    - 若参与者提问不是一个封闭式问题或者问题难以理解,回答:问法错误
+    - 若参与者提问基本还原了谜底真相,回答:回答正确
+5. 回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。

 请严格按照这些规则回答参与者提出的问题。

-
+**谜面:** {}

-
+**谜底:** {}

-
+**参与者提出的问题:** {}
 """
     def formatting_prompts_func(examples):
         inputs = examples["text"]
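The refined reasoning_prompt leaves three {} placeholders for the puzzle (谜面), the truth (谜底) and the participant's question. The body of formatting_prompts_func is not part of this hunk; the sketch below is one plausible way to fill the template per training example, running inside load_logical_reasoning_dataset where reasoning_prompt and tokenizer are in scope, with the output key and EOS handling as assumptions.

def formatting_prompts_func(examples):
    # Column names follow the mgtv dataset: text (question), puzzle, truth, label.
    inputs = examples["text"]
    puzzles = examples["puzzle"]
    truths = examples["truth"]
    labels = examples["label"]
    texts = []
    for puzzle, truth, question, label in zip(puzzles, truths, inputs, labels):
        prompt = reasoning_prompt.format(puzzle, truth, question)
        # Append the gold label and the tokenizer EOS so the model learns to stop.
        texts.append(prompt + label + tokenizer.eos_token)
    return {"train_text": texts}  # assumed output key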