File size: 6,841 Bytes
a54c5b6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>input_text</th>\n",
" <th>target_text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>我要求的是法律上的澄清</td>\n",
" <td>我係要求……呢啲係好清楚嘅法律上嘅澄清呀</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>每晚由七點半,到十一點半</td>\n",
" <td>誒,由七點半就做到十一點半</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>梁頌恒議員,你是否要繼續發言</td>\n",
" <td>梁頌恆議員呢,係咪繼續係發言</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>可以怎樣稱呼我?我只知道整條街都稱我「大家姐」,因為我最大,年紀最大</td>\n",
" <td>可以點叫我呀?呢度成條街叫我大家姐,因為我最大,年紀最大吖嘛</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>至於他的答覆能否回應你剛才的提問,我並不能夠提出任何意見</td>\n",
" <td>噉呢,就對於佢能唔能夠達到你頭先提問嗰個嘅要求呢,我就唔能夠作出任何嘅意見</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35872</th>\n",
" <td>他曾在2006及2007年擔任暑期實習生</td>\n",
" <td>2006~2007學年寒暑假間亦試過將學校整大兼修容過</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35873</th>\n",
" <td>克里莫尼迪茲戰爭</td>\n",
" <td>克里米亞戰爭</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35874</th>\n",
" <td>產卵後親魚迴歸大海</td>\n",
" <td>海潮遇返失敗多年嘅生母</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35875</th>\n",
" <td>學校規模冠絕全馬。</td>\n",
" <td>學校嘅運動水平可謂全區之冠。</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35876</th>\n",
" <td>黃龍溪鎮也逐漸由繁忙的碼頭轉變為安靜的江邊場鎮。</td>\n",
" <td>而九龍寨城到海邊碼頭就慢慢變成市集。</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>35877 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" input_text \\\n",
"0 我要求的是法律上的澄清 \n",
"1 每晚由七點半,到十一點半 \n",
"2 梁頌恒議員,你是否要繼續發言 \n",
"3 可以怎樣稱呼我?我只知道整條街都稱我「大家姐」,因為我最大,年紀最大 \n",
"4 至於他的答覆能否回應你剛才的提問,我並不能夠提出任何意見 \n",
"... ... \n",
"35872 他曾在2006及2007年擔任暑期實習生 \n",
"35873 克里莫尼迪茲戰爭 \n",
"35874 產卵後親魚迴歸大海 \n",
"35875 學校規模冠絕全馬。 \n",
"35876 黃龍溪鎮也逐漸由繁忙的碼頭轉變為安靜的江邊場鎮。 \n",
"\n",
" target_text \n",
"0 我係要求……呢啲係好清楚嘅法律上嘅澄清呀 \n",
"1 誒,由七點半就做到十一點半 \n",
"2 梁頌恆議員呢,係咪繼續係發言 \n",
"3 可以點叫我呀?呢度成條街叫我大家姐,因為我最大,年紀最大吖嘛 \n",
"4 噉呢,就對於佢能唔能夠達到你頭先提問嗰個嘅要求呢,我就唔能夠作出任何嘅意見 \n",
"... ... \n",
"35872 2006~2007學年寒暑假間亦試過將學校整大兼修容過 \n",
"35873 克里米亞戰爭 \n",
"35874 海潮遇返失敗多年嘅生母 \n",
"35875 學校嘅運動水平可謂全區之冠。 \n",
"35876 而九龍寨城到海邊碼頭就慢慢變成市集。 \n",
"\n",
"[35877 rows x 2 columns]"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_pickle(\"yue_zh_combined36k.pkl\")\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"df = df.reset_index() # make sure indexes pair with number of rows\n",
"\n",
"with open(\"train/mined_bitext.can\", \"w+\") as can_file, open(\"train/mined_bitext.man\", \"w+\") as man_file:\n",
" for index, row in df.iterrows():\n",
" man_file.write(row['input_text'] + \"\\n\")\n",
" can_file.write(row['target_text'] + \"\\n\")\n",
" man_file.flush()\n",
" can_file.flush()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
|