{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": 4, "metadata": { "id": "nAJ2Ubu1-MUb", "outputId": "1ba4d045-c94d-44d8-c744-e121299e44b4", "colab": { "base_uri": "https://localhost:8080/" } }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "--2023-11-05 01:01:17-- https://github.com/LC1332/Needy-Haruhi/blob/main/data/Jines.xlsx\n", "Resolving github.com (github.com)... 192.30.255.113\n", "Connecting to github.com (github.com)|192.30.255.113|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 4583 (4.5K) [text/plain]\n", "Saving to: ‘Jines.xlsx’\n", "\n", "\rJines.xlsx 0%[ ] 0 --.-KB/s \rJines.xlsx 100%[===================>] 4.48K --.-KB/s in 0s \n", "\n", "2023-11-05 01:01:17 (50.6 MB/s) - ‘Jines.xlsx’ saved [4583/4583]\n", "\n" ] } ], "source": [ "# 下载文件。\n", "!wget https://github.com/LC1332/Needy-Haruhi/blob/main/data/Jines.xlsx\n" ] }, { "cell_type": "code", "source": [ "! pip install pandas" ], "metadata": { "id": "siZysoEBADOv", "outputId": "1550bb64-b8d5-44a9-e376-df2a86f000f3", "colab": { "base_uri": "https://localhost:8080/" } }, "execution_count": 9, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (1.5.3)\n", "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2023.3.post1)\n", "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas) (1.23.5)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)\n" ] } ] }, { "cell_type": "code", "source": [ "import os\n", "import re\n", "import shutil\n", "\n", "import pandas as pd\n", "\n", "Jines_file = r\"/content/Jines.xlsx\"\n", "\n", "#读取与查询\n", "Jines = pd.read_excel(Jines_file)\n", "# 查询有选项的内容\n", "pattern = r'(?<=\\().*?(?=\\))'\n", "\n", "# 匹配事件\n", "Title = Jines.loc[(Jines['ParentId (more info)'].str.contains(pattern, regex=True, na=False))]\n", "\n", "Attribute_temp = {\"Affection\": 0, \"Stress\": 0, \"Darkness\": 0}\n", "\n", "\n", "# 把文件转换为txt\n", "def format_output(row):\n", " # ID\n", " ParentId = f'{row[\"ParentId (more info)\"]}'\n", " Category_temp = f'{row[\"Category\"]}'\n", " Category = sanitize_filename(Category_temp)\n", " ID = f'{row[\"Id\"]}'\n", "\n", " # 匹配标题\n", " regex1 = r\"\\w+(?= \\()\"\n", " title = re.search(regex1, ParentId)\n", " title_str = title.group()\n", "\n", " # 事件\n", " event_list = []\n", "\n", " # 匹配提问\n", " match = re.search(r\"\\(First Part\\)\", ParentId)\n", " match2 = re.search(r\"\\(First Part; end\\)\", ParentId)\n", " match3 = re.search(r\"\\(Third Part\\)\", ParentId)\n", " match4 = re.search(r\"\\(Second Part\\)\", ParentId)\n", " match5 = re.search(r\"\\(Fourth Part\\)\", ParentId)\n", "\n", " # 数值\n", " aff = f\"Affection: {row['Affection']}\"\n", " str = f\"Stress: {row['Stress']}\"\n", " dar = f\"Darkness: {row['Darkness']}\"\n", "\n", " # 匹配选项以及回复\n", " choose_time = re.search(r\"\\d+\", ParentId)\n", " reply_ = re.search(r'(\\(.*Option[0-9]+;end\\))', ParentId)\n", " reply_2 = re.search(r'(\\(.*Option[0-9]\\))', ParentId)\n", "\n", " # 处理提问\n", " # if match or match2 or match4 or match5 or match3:\n", " if match or match2 or match3 or match4 or match5:\n", "\n", " Prefix = f'\\n## 对话\\n### Prefix Category_temp:{Category} ID:{ID}'\n", " Ame = f\"糖糖: {row['BodyCn']}\"\n", " with open(f'events/{title_str}.txt', 'a+', encoding='utf-8') as f:\n", " # 使用 join 方法将 Ame, Title_ame, Category 连接成一个字符串,并在每个字段之间添加一个制表符\n", " line = '\\n'.join([Prefix, Ame])\n", "\n", " line_bytes = line.encode('utf-8')\n", " # 将字节对象写入到文件中\n", " line_str = line_bytes.decode('utf-8')\n", " # 将字符串对象写入到文件中\n", " f.write(line_str)\n", "\n", " return \"\\n\".join([Prefix, Ame])\n", "\n", " # 处理选项\n", " elif row['Speaker/Action (in blue)'] == 'pi':\n", " # 跳过数值为空的回复\n", " try:\n", " key = f'\\n### Option-{choose_time.group()}'\n", " user = f\"User: {row['BodyCn']}\"\n", "\n", " if aff == 'Affection: nan':\n", " aff = ''\n", " if str == 'Stress: nan':\n", " str = ''\n", " if dar == 'Darkness: nan':\n", " dar = ''\n", " value = f\"Attribute Change: {aff} {str} {dar}\"\n", "\n", " if value == 'Attribute Change: ':\n", " value = ''\n", "\n", " with open(f'events/{title_str}.txt', 'a+', encoding='utf-8') as f:\n", " # 使用 join 方法将 Ame, Title_ame, Category 连接成一个字符串,并在每个字段之间添加一个制表符\n", " line = '\\n'.join([key, user, value])\n", "\n", " line_bytes = line.encode('utf-8')\n", " # 将字节对象写入到文件中\n", " line_str = line_bytes.decode('utf-8')\n", " # 将字符串对象写入到文件中\n", " f.write(line_str)\n", " return \"\\n\".join([key, user, value])\n", " except:\n", " pass\n", "\n", " # 处理选项回复\n", " elif reply_ or (reply_2 and row['Speaker/Action (in blue)'] == 'ame'):\n", " try:\n", " key = f'\\nReply:\\n糖糖:{row[\"BodyCn\"]}'\n", "\n", " if aff == 'Affection: nan':\n", " aff = ''\n", " if str == 'Stress: nan':\n", " str = ''\n", " if dar == 'Darkness: nan':\n", " dar = ''\n", " value = f\"Attribute Change: {aff} {str} {dar}\"\n", "\n", " if value == 'Attribute Change: ':\n", " value = 'Attribute Change: None'\n", "\n", " if key == '\\nReply:\\n糖糖:nan':\n", " with open(f'events/{title_str}.txt', 'a+', encoding='utf-8') as f:\n", " # 使用 join 方法将 Ame, Title_ame, Category 连接成一个字符串,并在每个字段之间添加一个制表符\n", " line = '\\n'.join([value])\n", " line_bytes = line.encode('utf-8')\n", " # 将字节对象写入到文件中\n", " line_str = line_bytes.decode('utf-8')\n", " # 将字符串对象写入到文件中\n", " f.write(line_str)\n", "\n", " return \"\\n\".join([value])\n", "\n", " with open(f'events/{title_str}.txt', 'a+', encoding='utf-8') as f:\n", " # 使用 join 方法将 Ame, Title_ame, Category 连接成一个字符串,并在每个字段之间添加一个制表符\n", " line = '\\n'.join([key, value])\n", "\n", " line_bytes = line.encode('utf-8')\n", " # 将字节对象写入到文件中\n", " line_str = line_bytes.decode('utf-8')\n", " # 将字符串对象写入到文件中\n", " f.write(line_str)\n", "\n", " return \"\\n\".join([key, value])\n", " except:\n", " pass\n" ], "metadata": { "id": "DgCgTdvC-eEe", "outputId": "34774b6e-3cdb-4cab-8773-c55703d66c3b", "colab": { "base_uri": "https://localhost:8080/", "height": 410 } }, "execution_count": 13, "outputs": [ { "output_type": "error", "ename": "ValueError", "evalue": "ignored", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;31m#读取与查询\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0mJines\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_excel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mJines_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;31m# 查询有选项的内容\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0mpattern\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34mr'(?<=\\().*?(?=\\))'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnew_arg_name\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnew_arg_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 212\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mF\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 329\u001b[0m \u001b[0mstacklevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfind_stack_level\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 330\u001b[0m )\n\u001b[0;32m--> 331\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 332\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[0;31m# error: \"Callable[[VarArg(Any), KwArg(Any)], Any]\" has no\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/excel/_base.py\u001b[0m in \u001b[0;36mread_excel\u001b[0;34m(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, thousands, decimal, comment, skipfooter, convert_float, mangle_dupe_cols, storage_options)\u001b[0m\n\u001b[1;32m 480\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mExcelFile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 481\u001b[0m \u001b[0mshould_close\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 482\u001b[0;31m \u001b[0mio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mExcelFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstorage_options\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstorage_options\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mengine\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 483\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 484\u001b[0m raise ValueError(\n", "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/excel/_base.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, path_or_buffer, engine, storage_options)\u001b[0m\n\u001b[1;32m 1654\u001b[0m )\n\u001b[1;32m 1655\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mext\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1656\u001b[0;31m raise ValueError(\n\u001b[0m\u001b[1;32m 1657\u001b[0m \u001b[0;34m\"Excel file format cannot be determined, you must specify \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1658\u001b[0m \u001b[0;34m\"an engine manually.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: Excel file format cannot be determined, you must specify an engine manually." ] } ] }, { "cell_type": "code", "source": [ "# txt文件转换为jsonl\n", "def parse_to_jsonl(file_path):\n", " with open(file_path, 'r', encoding='utf-8') as f:\n", " lines = iter(f.readlines())\n", " dialogs = []\n", " dialog = {}\n", " option = {}\n", " for line in lines:\n", " line = line.strip()\n", " if line.startswith(\"## 对话\") or line.startswith(\"## 对话组\"):\n", " if dialog and option:\n", " dialog[\"options\"].append(option)\n", " option = {}\n", " if dialog:\n", " dialogs.append(dialog)\n", " dialog = {\"prefix\": \"\", \"options\": []}\n", " elif line.startswith(\"### Prefix\") or line.startswith('**Prefix'):\n", " prefix = next(lines).strip()\n", " ids, categories = search_in_excel(prefix)\n", " # print(ids, categories)\n", " if ids and categories:\n", " dialog[\"id\"] = ids[0]\n", " dialog[\"category\"] = categories[0]\n", " dialog[\"prefix\"] = prefix\n", " elif line.startswith(\"### Option\") or line.startswith('**Option'):\n", " if option:\n", " dialog[\"options\"].append(option)\n", " option = {\"user\": \"\", \"reply\": \"\", \"attribute_change\": \"\"}\n", "\n", " elif line.startswith(\"User\") or line.startswith(\"User:\"):\n", " option[\"user\"] = line[5:].strip()\n", " elif line.startswith(\"Reply\") or line.startswith('**Reply:**'):\n", " option[\"reply\"] = next(lines).strip()\n", " elif line.startswith(\"Attribute Change\") or line.startswith('**Attribute Change:**'):\n", " option[\"attribute_change\"] = line[17:].strip()\n", "\n", " if option:\n", " dialog[\"options\"].append(option)\n", " if dialog:\n", " dialogs.append(dialog)\n", "\n", " with open('emoji_story_23.jsonl', 'a+', encoding=\"utf-8\") as outfile:\n", " for entry in dialogs:\n", " json.dump(entry, outfile, ensure_ascii=False)\n", " outfile.write('\\n')" ], "metadata": { "id": "sghUu4Or-uC4" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# 转换为jsonl\n", "for filename in os.listdir('events'):\n", " if filename.endswith(\".txt\"):\n", " try:\n", " parse_to_jsonl(f'events/{filename}')\n", " except:\n", " shutil.move(f'move/{filename}', f'error/{filename}')\n", " print(filename)" ], "metadata": { "id": "nBLuoQov_C_5" }, "execution_count": null, "outputs": [] } ] }