Spaces:

silk-road
/

ChatHaruhi-Needy

Runtime error

File size: 54,536 Bytes
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/LC1332/Needy-Haruhi/blob/main/notebook/%E4%BB%8E%E8%A1%A8%E6%A0%BC%E4%B8%AD%E8%A7%A3%E6%9E%90%E4%BA%8B%E4%BB%B6.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "id": "nAJ2Ubu1-MUb",
        "outputId": "d4b88502-60dc-49cf-efb3-4151a994e79c",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "--2023-11-05 02:16:18--  https://github.com/LC1332/Needy-Haruhi/raw/main/data/Jines.csv\n",
            "Resolving github.com (github.com)... 140.82.112.4\n",
            "Connecting to github.com (github.com)|140.82.112.4|:443... connected.\n",
            "HTTP request sent, awaiting response... 302 Found\n",
            "Location: https://raw.githubusercontent.com/LC1332/Needy-Haruhi/main/data/Jines.csv [following]\n",
            "--2023-11-05 02:16:18--  https://raw.githubusercontent.com/LC1332/Needy-Haruhi/main/data/Jines.csv\n",
            "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
            "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 249489 (244K) [text/plain]\n",
            "Saving to: ‘Jines.csv’\n",
            "\n",
            "Jines.csv           100%[===================>] 243.64K  --.-KB/s    in 0.03s   \n",
            "\n",
            "2023-11-05 02:16:19 (9.50 MB/s) - ‘Jines.csv’ saved [249489/249489]\n",
            "\n"
          ]
        }
      ],
      "source": [
        "# 下载文件。\n",
        "!wget https://github.com/LC1332/Needy-Haruhi/raw/main/data/Jines.csv"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "import re\n",
        "import shutil\n",
        "\n",
        "import pandas as pd\n",
        "\n",
        "Jines_file = r\"/content/Jines.csv\"  # change the path and extension to match your file\n",
        "\n",
        "# Read the CSV file. Only '\\n' is passed as the line terminator,\n",
        "# presumably so that other line-break characters inside cells do not split rows (TODO confirm).\n",
        "Jines = pd.read_csv(Jines_file, lineterminator='\\n')"
      ],
      "metadata": {
        "id": "1-kV6t_ARAGN"
      },
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "import re\n",
        "import shutil\n",
        "\n",
        "import pandas as pd\n",
        "\n",
        "Jines_file = r\"/content/Jines.csv\"  # Change the file path and extension as needed\n",
        "\n",
        "# Read the CSV file\n",
        "Jines = pd.read_csv(Jines_file, lineterminator='\\n')\n",
        "\n",
        "\n"
      ],
      "metadata": {
        "id": "SRQcHQgHRTDz"
      },
      "execution_count": 4,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "实现一个 Python 函数，输入是字符串，输出也是一个字符串：找到第一个 \"(\"，取括号之前的字符串并 strip 后输出。\n",
        "\n",
        "例子输入：`Day0_JINE (First Part)`\n",
        "\n",
        "例子输出：`Day0_JINE`"
      ],
      "metadata": {
        "id": "j0UF0Oh7R9ou"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def extract_string_before_parentheses(input_value):\n",
        "    \"\"\"Return the text in front of the first '(' of input_value, stripped of whitespace.\n",
        "\n",
        "    Null values (as judged by pd.isnull) give an empty string. Any other non-string\n",
        "    value is converted with str() after a warning is printed.\n",
        "    \"\"\"\n",
        "    if pd.isnull(input_value):\n",
        "        return \"\"\n",
        "\n",
        "    if not isinstance(input_value, str):\n",
        "        print(\"Warning: Input is not a string. Converting to string.\")\n",
        "        input_value = str(input_value)\n",
        "        print(input_value)\n",
        "\n",
        "    # partition() puts the whole string into `head` when there is no '('\n",
        "    head, _, _ = input_value.partition('(')\n",
        "    return head.strip()\n",
        "\n",
        "# Example usage with a string:\n",
        "input_example_str = \"Day0_JINE (First Part)\"\n",
        "print(extract_string_before_parentheses(input_example_str))  # Expected output: Day0_JINE\n",
        "\n",
        "# Example usage with a non-string:\n",
        "input_example_non_str = 12345\n",
        "print(extract_string_before_parentheses(input_example_non_str))  # Expected output: Warning + \"12345\"\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "mBWfbACoSPLr",
        "outputId": "6d9d7e52-a87e-47d2-c959-ca4fd722d5c4"
      },
      "execution_count": 15,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Day0_JINE\n",
            "Warning: Input is not a string. Converting to string.\n",
            "12345\n",
            "12345\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Group consecutive CSV rows that share the same (Category, parent name) into events.\n",
        "# Each event is a dict with the keys \"parent\", \"category\" and \"lines\" (the rows of the group).\n",
        "count = 0  # not used in this cell\n",
        "\n",
        "lines = []\n",
        "\n",
        "last_parent = \"\"\n",
        "last_category = \"\"\n",
        "\n",
        "all_events = []\n",
        "\n",
        "# Walk the rows in file order and close the current group whenever the key changes.\n",
        "for index, row in Jines.iterrows():\n",
        "\n",
        "    # Rows without a ParentId are skipped.\n",
        "    if pd.isnull(row['ParentId (more info)']):\n",
        "        continue\n",
        "\n",
        "    parent = extract_string_before_parentheses(row['ParentId (more info)'])\n",
        "    category = row['Category']\n",
        "\n",
        "    # Stop at the first row whose parent name starts with \"Ending\"; later rows are not grouped.\n",
        "    if parent.startswith(\"Ending\"):\n",
        "        break\n",
        "\n",
        "\n",
        "    # print(f\"Category: {category}, Parent: {parent}\")\n",
        "\n",
        "    if category == last_category and last_parent == parent:\n",
        "        lines.append(row)\n",
        "    else:\n",
        "        # Key changed: flush the previous group, then start a new one with this row.\n",
        "        # The very first flush normally emits the initial empty placeholder,\n",
        "        # which is dropped after the loop.\n",
        "        data = {\n",
        "            \"parent\": last_parent,\n",
        "            \"category\": last_category,\n",
        "            \"lines\": lines\n",
        "        }\n",
        "        all_events.append(data)\n",
        "\n",
        "        last_parent = parent\n",
        "        last_category = category\n",
        "        lines = [row]\n",
        "\n",
        "# Flush the group that was still open when the loop ended.\n",
        "if len(lines) > 0:\n",
        "    data = {\n",
        "        \"parent\": last_parent,\n",
        "        \"category\": last_category,\n",
        "        \"lines\": lines\n",
        "    }\n",
        "    all_events.append(data)\n",
        "\n",
        "# Drop the first entry (the placeholder flushed before any row had been collected).\n",
        "all_events = all_events[1:]"
      ],
      "metadata": {
        "id": "MNZSH1qRRcdF"
      },
      "execution_count": 28,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "print(len(all_events))"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "vCcRJXR2TiE7",
        "outputId": "82be5beb-2a83-4cff-c78a-66b4f3bfb6c0"
      },
      "execution_count": 29,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "196\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Print every event index with its parent name; a line break follows each index divisible by 3.\n",
        "for i, event in enumerate(all_events):\n",
        "    terminator = '\\n' if i % 3 == 0 else ' '\n",
        "    print(i, event['parent'], end=terminator)"
      ],
      "metadata": {
        "id": "Qe3Cb7kVTlqd"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Print the parent name of every event in which all lines are spoken by \"ame\".\n",
        "for event in all_events:\n",
        "    if all(line[\"Speaker/Action (in blue)\"] == \"ame\" for line in event['lines']):\n",
        "        print(event['parent'])"
      ],
      "metadata": {
        "id": "kpHE_EI_YeD5"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def check_only_ame(event):\n",
        "    \"\"\"Return True when no line of the event is spoken by \"pi\".\n",
        "\n",
        "    Despite the name, speakers other than \"ame\" and \"pi\" do not make this return False.\n",
        "    \"\"\"\n",
        "    return not any(\n",
        "        line[\"Speaker/Action (in blue)\"] == \"pi\"\n",
        "        for line in event['lines']\n",
        "    )\n",
        "\n",
        "def check_2nd(event):\n",
        "    \"\"\"Return True when the ParentId of at least one line contains \"2ndOption\".\"\"\"\n",
        "    return any(\n",
        "        \"2ndOption\" in line['ParentId (more info)']\n",
        "        for line in event['lines']\n",
        "    )"
      ],
      "metadata": {
        "id": "Ab_yJKGcZyy6"
      },
      "execution_count": 79,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def check_open(event):\n",
        "    \"\"\"Return True when some line of the event has the speaker \"Any Open-Text Answer\".\"\"\"\n",
        "    return any(\n",
        "        line[\"Speaker/Action (in blue)\"] == \"Any Open-Text Answer\"\n",
        "        for line in event['lines']\n",
        "    )"
      ],
      "metadata": {
        "id": "tbnALRMTxW13"
      },
      "execution_count": 80,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def transfer_dialogue(event):\n",
        "    \"\"\"Merge consecutive lines of the same speaker into dialogue turns.\n",
        "\n",
        "    :param event: Mapping whose 'lines' entry is a sequence of rows; each row is\n",
        "        indexed by \"Speaker/Action (in blue)\", \"Id\" and \"BodyCn\".\n",
        "    :return: List of dicts with the keys \"speaker\", \"text\" (the texts of the run\n",
        "        joined by single spaces) and \"title\" (the \"Id\" of the run's first line).\n",
        "        Every run is returned, including the last one.\n",
        "    \"\"\"\n",
        "    turns = []\n",
        "\n",
        "    for line in event['lines']:\n",
        "        speaker = line[\"Speaker/Action (in blue)\"]\n",
        "        text = line[\"BodyCn\"]\n",
        "        # Missing values (NaN/None) are treated as empty text instead of raising on concatenation.\n",
        "        text = \"\" if pd.isnull(text) else str(text)\n",
        "\n",
        "        if turns and turns[-1][\"speaker\"] == speaker:\n",
        "            # Same speaker keeps talking: extend the current turn.\n",
        "            if text:\n",
        "                turns[-1][\"parts\"].append(text)\n",
        "        else:\n",
        "            turns.append({\n",
        "                \"speaker\": speaker,\n",
        "                \"title\": line[\"Id\"],\n",
        "                \"parts\": [text] if text else []\n",
        "            })\n",
        "\n",
        "    return [\n",
        "        {\"speaker\": turn[\"speaker\"], \"text\": \" \".join(turn[\"parts\"]), \"title\": turn[\"title\"]}\n",
        "        for turn in turns\n",
        "    ]"
      ],
      "metadata": {
        "id": "KeCarNmmfur9"
      },
      "execution_count": 81,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "如果是一开始的ame的部分，直接作为prefix\n",
        "\n",
        "然后pi的时候要看相同的parent\n",
        "\n",
        "然后后面如果有不一样的就作为post\n"
      ],
      "metadata": {
        "id": "e0nGAhy4pw-j"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Parse each choice event into a prefix (ame's opening lines) and a list of options,\n",
        "# where every option is {\"text\": pi's choice, \"reply\": ame's answer to that choice}.\n",
        "count = 0  # events parsed without running into an unexpected line sequence\n",
        "for event in all_events:\n",
        "    # Skip events with no \"pi\" line, with a \"2ndOption\" ParentId,\n",
        "    # or with an \"Any Open-Text Answer\" speaker.\n",
        "    if check_only_ame(event):\n",
        "        continue\n",
        "\n",
        "    if check_2nd(event):\n",
        "        continue\n",
        "\n",
        "    if check_open(event):\n",
        "        continue\n",
        "\n",
        "    parent = event['parent']\n",
        "\n",
        "    # Per-event state machine:\n",
        "    #   count_prefix -> count_option_start -> collect_reply -> count_post,\n",
        "    #   then back to count_option_start for the next option, or on to collect_post.\n",
        "    # NOTE: no branch handles \"collect_post\" yet, so lines after the options are ignored.\n",
        "    state = \"count_prefix\"\n",
        "\n",
        "    prefix = \"\"\n",
        "    options = []\n",
        "\n",
        "    # Flip to True to trace the state machine line by line.\n",
        "    verbose = False\n",
        "\n",
        "    # Cleared when the lines after an option match neither expected continuation.\n",
        "    record_flag = True\n",
        "\n",
        "    for i, line in enumerate(event['lines']):\n",
        "\n",
        "        speaker = line[\"Speaker/Action (in blue)\"]\n",
        "        line_parent = line['ParentId (more info)']\n",
        "        content = line[\"BodyCn\"]\n",
        "\n",
        "        if verbose:\n",
        "            print( speaker ,\" \", line_parent, \" \", content, \" \", len(options))\n",
        "\n",
        "        # Look one line ahead. \";end\" is removed from the next ParentId, presumably so that\n",
        "        # the closing line of an option still compares equal to the option's own ParentId.\n",
        "        if i != len(event['lines']) - 1:\n",
        "            next_parent = event['lines'][i + 1]['ParentId (more info)']\n",
        "            next_parent = next_parent.replace(\";end\", \"\")\n",
        "            next_speaker = event['lines'][i + 1]['Speaker/Action (in blue)']\n",
        "        else:\n",
        "            next_parent = \"\"\n",
        "            next_speaker = \"\"\n",
        "\n",
        "        if state == \"count_prefix\":\n",
        "            if verbose:\n",
        "                print(\"state = \", state )\n",
        "\n",
        "            if speaker == \"ame\":\n",
        "                prefix += line[\"BodyCn\"] + \"\\n\"\n",
        "                continue\n",
        "            elif speaker == \"pi\":\n",
        "                state = \"count_option_start\"\n",
        "\n",
        "        if state == \"count_option_start\":\n",
        "            if verbose:\n",
        "                print(\"prefix = \")\n",
        "                print(prefix)\n",
        "                print(\"state = \", state )\n",
        "\n",
        "            if speaker == \"pi\":\n",
        "                option_text = line[\"BodyCn\"]\n",
        "                state = \"collect_reply\"\n",
        "                option_parent = line_parent\n",
        "                reply = \"\"\n",
        "\n",
        "                if next_speaker == \"pi\":\n",
        "                    # The next line is another option, so close this one right away.\n",
        "                    state = \"count_post\"\n",
        "                else:\n",
        "                    continue\n",
        "            else:\n",
        "                print(\"warning! not pi's reply in count_option_start, Event = \", parent)\n",
        "\n",
        "        if state == \"collect_reply\":\n",
        "            if verbose:\n",
        "                print(\"state = \", state )\n",
        "\n",
        "            if speaker == \"ame\":\n",
        "                # ame's answer belongs to the reply, not to the option text.\n",
        "                reply += line[\"BodyCn\"]\n",
        "            elif speaker == \"pi\":\n",
        "                # a \"pi\" line here means a new option started while a reply was expected\n",
        "                print(\"warning! not ame's reply in collect_reply, Event = \", parent)\n",
        "\n",
        "            # The option ends with the event, or when the next line has a different ParentId.\n",
        "            if i == len(event['lines']) - 1:\n",
        "                state = \"count_post\"\n",
        "            else:\n",
        "                if next_parent != option_parent:\n",
        "                    state = \"count_post\"\n",
        "\n",
        "        if state == \"count_post\":\n",
        "            if verbose:\n",
        "                print(\"state = \", state )\n",
        "            option_data = {\n",
        "                \"text\": option_text,\n",
        "                \"reply\": reply\n",
        "            }\n",
        "\n",
        "            options.append(option_data)\n",
        "\n",
        "            if i == len(event['lines']) - 1:\n",
        "                break\n",
        "\n",
        "            next_speaker = event['lines'][i + 1]['Speaker/Action (in blue)']\n",
        "\n",
        "            if \"Option\" in next_parent and next_speaker == \"pi\":\n",
        "                state = \"count_option_start\"\n",
        "            elif \"Option\" not in next_parent and next_speaker == \"ame\":\n",
        "                state = \"collect_post\"\n",
        "            else:\n",
        "                # Unexpected continuation: do not report this event.\n",
        "                record_flag = False\n",
        "\n",
        "\n",
        "            continue\n",
        "\n",
        "    if record_flag:\n",
        "        event_name = extract_string_before_parentheses(parent)\n",
        "        print(event_name, \" len_prefix = \" , len(prefix), \" #opt = \", len(options))\n",
        "        print(prefix)\n",
        "\n",
        "        count += 1\n",
        "\n",
        "\n",
        "print(count)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "t5pn9UfSZOWo",
        "outputId": "c7d257f9-0fa3-4800-aef3-642566d92139"
      },
      "execution_count": 111,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Event_UberEats  len_prefix =  25  #opt =  3\n",
            "我们点外卖吧我一步也不想动了可是又超想吃饭！！！\n",
            "\n",
            "Event_Sea  len_prefix =  27  #opt =  2\n",
            "我已经彻底疲倦了\n",
            "不如我们结束这一切 现在就去海边吧\n",
            "\n",
            "Event_Pudding  len_prefix =  26  #opt =  3\n",
            "我没打招呼就把冰箱里的布丁吃了 会被判死刑吗？？？\n",
            "\n",
            "Event_Hairstyle  len_prefix =  20  #opt =  4\n",
            "想换个发型了，阿P喜欢什么样子的糖糖？\n",
            "\n",
            "Event_Money  len_prefix =  15  #opt =  3\n",
            "我要出去玩！给我零花钱！！！\n",
            "\n",
            "Event_Seikei  len_prefix =  18  #opt =  3\n",
            "如果我要整容，你觉得整哪里比较好？\n",
            "\n",
            "Event_AmePiercerd  len_prefix =  70  #opt =  2\n",
            "嗳，你来帮我打耳洞嘛 让喜欢的人给自己打耳洞很棒不是吗 有一种被支配着的感觉 鸡皮疙瘩都要起来了\n",
            "我好怕我好怕我好怕\n",
            "我好怕！\n",
            "但是来吧！\n",
            "\n",
            "Event_Charahen  len_prefix =  14  #opt =  3\n",
            "哎，你喜欢什么样的糖糖啊？\n",
            "\n",
            "Event_AmeFuture  len_prefix =  18  #opt =  3\n",
            "哎，你会希望看到糖糖将来的样子吗？\n",
            "\n",
            "Event_Sumabura  len_prefix =  41  #opt =  2\n",
            "我也想被做进那个大乱斗游戏……\n",
            "哎，如果那个游戏里面有超天酱的话，阿P会用我吗？\n",
            "\n",
            "Event_Negativ  len_prefix =  37  #opt =  3\n",
            "光是活着就好累啊……\n",
            "现在无论是谁对我说什么，我肯定都会往负面方向去理解\n",
            "\n",
            "Event_DrugHolic  len_prefix =  46  #opt =  2\n",
            "啊～不行了 不行不行不行不行\n",
            "无论思考什么，满脑袋都只有一个“死”字\n",
            "阿P，我该怎么办啊？\n",
            "\n",
            "Event_Jisatumisui  len_prefix =  30  #opt =  3\n",
            "不行了 我现在就想立刻马上消失\n",
            "阿P 我们一起去买炭吧……\n",
            "\n",
            "Event_Flower  len_prefix =  12  #opt =  3\n",
            "阿P，看！我买了小发发\n",
            "\n",
            "Event_Advice  len_prefix =  24  #opt =  3\n",
            "我正在想下次搞什么企划呢～阿P帮帮我 出出主意\n",
            "\n",
            "Event_Cheerup  len_prefix =  32  #opt =  2\n",
            "我今后也会努力加油的，你要支持我哦 还有阿P你自己也要加油哦！\n",
            "\n",
            "Event_LoveJINE  len_prefix =  10  #opt =  3\n",
            "阿P 我最喜欢你了\n",
            "\n",
            "Event_Manicure  len_prefix =  19  #opt =  4\n",
            "哎，你觉得我下次美甲做什么颜色好呢？\n",
            "\n",
            "Event_Okusan  len_prefix =  54  #opt =  3\n",
            "说到笨蛋情侣，就不得不提那个段子了\n",
            "“欢迎回家，你要先吃饭？”“还是先洗澡？”“还是……先，吃，我，呢？”\n",
            "\n",
            "Event_Copyceleb  len_prefix =  33  #opt =  3\n",
            "机会这么难得，要不整点富婆快乐活吧\n",
            "说不定还能用作下次的企划哦！\n",
            "\n",
            "Event_Menherafriend  len_prefix =  133  #opt =  3\n",
            "有个女孩发私信找我谈人生，我该怎么办呐\n",
            "「超天酱你好，我是一名高中生。之前因为精神疾病而住院了一段时间，现在跟不上学习进度，班上还没决定好志愿的人也只剩我一个了。平时看着同学们为了各自的前程努力奋斗的样子，心里总是非常地焦虑。请你告诉我，我到底应该怎么办才好呢？」\n",
            "\n",
            "Event_Okiru_Afternoon  len_prefix =  40  #opt =  2\n",
            "醒过来一看太阳都下山了 笑死\n",
            "睡太久了浑身无力～～……我可以就酱紫睡一辈子吗？\n",
            "\n",
            "Event_Okiru_Night  len_prefix =  45  #opt =  1\n",
            "要命 一个回笼觉睡到了这个点\n",
            "浪费一整天啥都没干的罪恶感好难顶啊！\n",
            "你为什么不叫醒我啦！\n",
            "\n",
            "Event_Newthings  len_prefix =  15  #opt =  3\n",
            "今天有点想试试平时不会做的事\n",
            "\n",
            "Event_Watchword  len_prefix =  29  #opt =  3\n",
            "小天使请安！这个开场白也说厌了啊～\n",
            "帮我想个别的开场白！\n",
            "\n",
            "Day0_JINE  len_prefix =  180  #opt =  2\n",
            "啊～紧张死了……\n",
            "我们两个一起想出来的“超天酱”\n",
            "终于，降临在这个世界上了\n",
            "粉丝……涨了一千啊\n",
            "这样都得不到什么被捧的感觉\n",
            "毕竟现在才刚开始呢\n",
            "想满足我黑洞似的认可欲求\n",
            "最少也得有一百万个宅宅围着我转呀\n",
            "大概一个月的时间，胜负就能见分晓吧\n",
            "因为凭我的干劲也只能坚持那么久……\n",
            "所以接下来的这一个月，咱们要努力奋斗咯！！\n",
            "我和你的话，一定能够打造厉害的主播吧？\n",
            "\n",
            "Day1_JINE  len_prefix =  208  #opt =  1\n",
            "早啊！\n",
            "这是我们当上主播后的第一个早晨呢\n",
            "……然而时间已经到中午了\n",
            "早起无能～～～\n",
            "算了，就这样吧！\n",
            "距离百万粉丝的目标只剩区区999000人了\n",
            "现在先朝着一万粉进发吧！\n",
            "就让我们潇洒登顶吧♪\n",
            "你也知道，仅凭我一个人是什么都做不到的\n",
            "阿P你每天都要给我下很多很多的指示呀\n",
            "我相信你哦\n",
            "只要是你说的\n",
            "我什么都会听\n",
            "我相信阿P\n",
            "我一定乖乖听话\n",
            "所以你一定要把我打造成最棒的主播呀……\n",
            "不然的话\n",
            "不然的话，我可是会坏掉的\n",
            "\n",
            "Event_Wishlist  len_prefix =  30  #opt =  4\n",
            "我要搞自己的心愿单了\n",
            "然后本糖允许阿P来想要往里加什么东西\n",
            "\n",
            "Event_Song  len_prefix =  53  #opt =  1\n",
            "你快看私信！\n",
            "有位作曲家联系我，说要给我写角色歌诶！\n",
            "哎呀～终于也走到这一步了～\n",
            "宅宅们的耳朵要怀孕啦～\n",
            "\n",
            "Scenario_topstreamer_trakenjoikeike  len_prefix =  100  #opt =  1\n",
            "快看，快看啊阿P！\n",
            "锵锵～粉丝破百万的纪念金盾哦！\n",
            "像黄金骑士一样，金光闪闪！\n",
            "哈，哈，哈！\n",
            "全世界的阿宅们都彻底被我的颜值俘虏啦\n",
            "而阿P，你就是这个可爱过头的女孩子最最在乎的人！\n",
            "你要以此为豪哦！\n",
            "\n",
            "30\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "\n",
        "# Print the parent name of every event that has a \"2ndOption\" line.\n",
        "for event in all_events:\n",
        "    if check_2nd(event):\n",
        "        print(event['parent'])"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "myJ3x9xGVCir",
        "outputId": "96594bd6-0178-4048-fbf9-f9fe1e3a394f"
      },
      "execution_count": 33,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Event_NextDate\n",
            "Event_Yandeiru\n",
            "Event_Yutabon\n",
            "Event_MailInterview\n",
            "Event_Dialog\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Count the events that are not a single line and contain no \"2ndOption\" line.\n",
        "count = sum(\n",
        "    1\n",
        "    for event in all_events\n",
        "    if len(event['lines']) != 1 and not check_2nd(event)\n",
        ")\n",
        "\n",
        "print(count)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "6apNMqgXV5aH",
        "outputId": "8eb21455-999e-46b3-d575-05841afb5adb"
      },
      "execution_count": 35,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "176\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "\n",
        "\n",
        "\n",
        "# Regex matching the text between a '(' and the next ')'.\n",
        "pattern = r'(?<=\\().*?(?=\\))'\n",
        "\n",
        "# Keep the rows whose ParentId contains such a parenthesised part;\n",
        "# rows with a missing ParentId count as no match (na=False).\n",
        "Title = Jines.loc[(Jines['ParentId (more info)'].str.contains(pattern, regex=True, na=False))]\n",
        "\n",
        "Attribute_temp = {\"Affection\": 0, \"Stress\": 0, \"Darkness\": 0}\n"
      ],
      "metadata": {
        "id": "vmh37VqKCiw-"
      },
      "execution_count": 61,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Jines_file = r\"/content/Jines.csv\"  # 更改文件路径和扩展名\n",
        "\n",
        "使用utf-8读取这个文件后，为我split成多行"
      ],
      "metadata": {
        "id": "5ZS0MWuxJoIN"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def sanitize_filename(filename):\n",
        "    \"\"\"Return filename with each invalid character replaced by an underscore.\n",
        "\n",
        "    Invalid characters: < > : \" / | ? *, the backslash and the newline.\n",
        "    \"\"\"\n",
        "    replacements = str.maketrans({bad_char: '_' for bad_char in '<>:\"/\\\\|?*\\n'})\n",
        "    return filename.translate(replacements)"
      ],
      "metadata": {
        "id": "aOBxpvtlK8B5"
      },
      "execution_count": 68,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Recreate an empty output directory for the per-event text files.\n",
        "# Removal and creation use the same relative path, so the cell can be re-run\n",
        "# and does not depend on the working directory being /content.\n",
        "shutil.rmtree('events', ignore_errors=True)\n",
        "os.makedirs('events', exist_ok=True)\n"
      ],
      "metadata": {
        "id": "PLoXThkLLC7e"
      },
      "execution_count": 84,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def set_json_value( a, key, value, verbose = True):\n",
        "    \"\"\"Store value under key in a unless a different value is already stored there.\n",
        "\n",
        "    An existing, different value is kept as it is; with verbose=True a warning naming\n",
        "    the event (a[\"Name_while_read_csv\"]) and both values is printed.\n",
        "\n",
        "    :return: The same dict a.\n",
        "    \"\"\"\n",
        "    has_conflict = key in a and a[key] != value\n",
        "    if not has_conflict:\n",
        "        a[key] = value\n",
        "        return a\n",
        "\n",
        "    # The name is looked up even when verbose is False, as before.\n",
        "    event_name = a[\"Name_while_read_csv\"]\n",
        "    if verbose:\n",
        "        print(f\"Warning! Key {key} already exists in event {event_name}\")\n",
        "        print(f\"try overwrite {a[key]} to {value}\")\n",
        "    return a"
      ],
      "metadata": {
        "id": "AydtmWYmLm6z"
      },
      "execution_count": 111,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "请为我实现一个 Python 函数，输入和输出都是字符串：\n",
        "去掉字符串末尾最多 3 位连续的数字（如果有）。例如 `example123` → `example`。"
      ],
      "metadata": {
        "id": "yE5sAn59ODk4"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def remove_trailing_digits(s: str) -> str:\n",
        "    \"\"\"\n",
        "    Strip the run of digits at the end of a string, looking at no more than\n",
        "    the last three characters.\n",
        "\n",
        "    :param s: Input string that may end with digits.\n",
        "    :return: s without its trailing digits; at most three characters are removed.\n",
        "    \"\"\"\n",
        "    # Digits may only be removed from this index onwards (the last three characters).\n",
        "    earliest_cut = max(len(s) - 3, 0)\n",
        "\n",
        "    # Move the cut point left while the character just before it is a digit.\n",
        "    cut = len(s)\n",
        "    while cut > earliest_cut and s[cut - 1].isdigit():\n",
        "        cut -= 1\n",
        "\n",
        "    return s[:cut]\n",
        "\n",
        "\n",
        "# Example usage\n",
        "input_str = \"example123\"\n",
        "output_str = remove_trailing_digits(input_str)\n",
        "print(output_str)  # should print \"example\"\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "dAGwJKDoOOam",
        "outputId": "c217ad74-e14d-4c17-92d5-917c1b5ef1d5"
      },
      "execution_count": 112,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "example\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "event_name_to_data = {}\n",
        "\n",
        "# 把文件转换为txt\n",
        "def format_output(row):\n",
        "\n",
        "    # ID\n",
        "    ParentId = f'{row[\"ParentId (more info)\"]}'\n",
        "    Category_temp = f'{row[\"Category\"]}'\n",
        "    Category = sanitize_filename(Category_temp)\n",
        "    ID = f'{row[\"Id\"]}'\n",
        "\n",
        "    # 匹配标题\n",
        "    regex1 = r\"\\w+(?= \\()\"\n",
        "    title = re.search(regex1, ParentId)\n",
        "    title_str = title.group()\n",
        "\n",
        "    current_data = {}\n",
        "\n",
        "    # 通过title_str 索引出数据\n",
        "    if title_str in event_name_to_data:\n",
        "        current_data = event_name_to_data[title_str]\n",
        "    else:\n",
        "        current_data = {\"Name_while_read_csv\":title_str,\"options\":[] }\n",
        "        event_name_to_data[title_str] = current_data\n",
        "\n",
        "\n",
        "    # 事件\n",
        "    event_list = []\n",
        "\n",
        "    # 匹配提问\n",
        "    match = re.search(r\"\\(First Part\\)\", ParentId)\n",
        "    match2 = re.search(r\"\\(First Part; end\\)\", ParentId)\n",
        "    match3 = re.search(r\"\\(Third Part\\)\", ParentId)\n",
        "    match4 = re.search(r\"\\(Second Part\\)\", ParentId)\n",
        "    match5 = re.search(r\"\\(Fourth Part\\)\", ParentId)\n",
        "\n",
        "    # 数值\n",
        "    aff = f\"Affection: {row['Affection']}\"\n",
        "    str = f\"Stress: {row['Stress']}\"\n",
        "    dar = f\"Darkness: {row['Darkness']}\"\n",
        "\n",
        "    # 匹配选项以及回复\n",
        "    choose_time = re.search(r\"\\d+\", ParentId)\n",
        "    reply_ = re.search(r'(\\(.*Option[0-9]+;end\\))', ParentId)\n",
        "    reply_2 = re.search(r'(\\(.*Option[0-9]\\))', ParentId)\n",
        "\n",
        "    # 处理提问\n",
        "    # if match or match2 or match4 or match5 or match3:\n",
        "    if match or match2 or match3 or match4 or match5:\n",
        "\n",
        "        Prefix = f'\\n## 对话\\n### Prefix Category_temp:{Category} ID:{ID}'\n",
        "\n",
        "        current_data = set_json_value(current_data, \"category\", Category,False)\n",
        "\n",
        "        if ID and len(ID) > 0 and ID != \"nan\":\n",
        "            ID = remove_trailing_digits(ID)\n",
        "            current_data = set_json_value(current_data, \"id\", ID, False)\n",
        "\n",
        "\n",
        "\n",
        "        Ame = f\"糖糖: {row['BodyCn']}\"\n",
        "        with open(f'events/{title_str}.txt', 'a+', encoding='utf-8') as f:\n",
        "            # 使用 join 方法将 Ame, Title_ame, Category 连接成一个字符串，并在每个字段之间添加一个制表符\n",
        "            line = '\\n'.join([Prefix, Ame])\n",
        "\n",
        "            line_bytes = line.encode('utf-8')\n",
        "            # 将字节对象写入到文件中\n",
        "            line_str = line_bytes.decode('utf-8')\n",
        "            # 将字符串对象写入到文件中\n",
        "            f.write(line_str)\n",
        "\n",
        "        return \"\\n\".join([Prefix, Ame])\n",
        "\n",
        "    # 处理选项\n",
        "    elif row['Speaker/Action (in blue)'] == 'pi':\n",
        "        # 跳过数值为空的回复\n",
        "        try:\n",
        "            key = f'\\n### Option-{choose_time.group()}'\n",
        "            user = f\"User:　{row['BodyCn']}\"\n",
        "\n",
        "            if aff == 'Affection: nan':\n",
        "                aff = ''\n",
        "            if str == 'Stress: nan':\n",
        "                str = ''\n",
        "            if dar == 'Darkness: nan':\n",
        "                dar = ''\n",
        "            value = f\"Attribute Change: {aff} {str} {dar}\"\n",
        "\n",
        "            if value == 'Attribute Change:   ':\n",
        "                value = ''\n",
        "\n",
        "            with open(f'events/{title_str}.txt', 'a+', encoding='utf-8') as f:\n",
        "                # 使用 join 方法将 Ame, Title_ame, Category 连接成一个字符串，并在每个字段之间添加一个制表符\n",
        "                line = '\\n'.join([key, user, value])\n",
        "\n",
        "                line_bytes = line.encode('utf-8')\n",
        "                # 将字节对象写入到文件中\n",
        "                line_str = line_bytes.decode('utf-8')\n",
        "                # 将字符串对象写入到文件中\n",
        "                f.write(line_str)\n",
        "            return \"\\n\".join([key, user, value])\n",
        "        except:\n",
        "            pass\n",
        "\n",
        "    # 处理选项回复\n",
        "    elif reply_ or (reply_2 and row['Speaker/Action (in blue)'] == 'ame'):\n",
        "        try:\n",
        "            key = f'\\nReply：\\n糖糖：{row[\"BodyCn\"]}'\n",
        "\n",
        "            if aff == 'Affection: nan':\n",
        "                aff = ''\n",
        "            if str == 'Stress: nan':\n",
        "                str = ''\n",
        "            if dar == 'Darkness: nan':\n",
        "                dar = ''\n",
        "            value = f\"Attribute Change: {aff} {str} {dar}\"\n",
        "\n",
        "            if value == 'Attribute Change:   ':\n",
        "                value = 'Attribute Change: None'\n",
        "\n",
        "            if key == '\\nReply：\\n糖糖：nan':\n",
        "                with open(f'events/{title_str}.txt', 'a+', encoding='utf-8') as f:\n",
        "                    # 使用 join 方法将 Ame, Title_ame, Category 连接成一个字符串，并在每个字段之间添加一个制表符\n",
        "                    line = '\\n'.join([value])\n",
        "                    line_bytes = line.encode('utf-8')\n",
        "                    # 将字节对象写入到文件中\n",
        "                    line_str = line_bytes.decode('utf-8')\n",
        "                    # 将字符串对象写入到文件中\n",
        "                    f.write(line_str)\n",
        "\n",
        "                return \"\\n\".join([value])\n",
        "\n",
        "            with open(f'events/{title_str}.txt', 'a+', encoding='utf-8') as f:\n",
        "                # 使用 join 方法将 Ame, Title_ame, Category 连接成一个字符串，并在每个字段之间添加一个制表符\n",
        "                line = '\\n'.join([key, value])\n",
        "\n",
        "                line_bytes = line.encode('utf-8')\n",
        "                # 将字节对象写入到文件中\n",
        "                line_str = line_bytes.decode('utf-8')\n",
        "                # 将字符串对象写入到文件中\n",
        "                f.write(line_str)\n",
        "\n",
        "            return \"\\n\".join([key, value])\n",
        "        except:\n",
        "            pass\n"
      ],
      "metadata": {
        "id": "DgCgTdvC-eEe"
      },
      "execution_count": 116,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Convert to txt: run format_output over Title along axis=1.\n",
        "# Besides returning the formatted text, format_output appends it to\n",
        "# events/<title>.txt, so re-running this cell appends the same text again.\n",
        "re_Title = Title.apply(format_output, axis=1)\n",
        "# output_str = re_Title.str.cat(sep=\"\\n \\n\")  # separate each element with whitespace"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ZHdGWB-qF9v0",
        "outputId": "80271ca6-995a-416f-a9dc-f5b36e0a2938"
      },
      "execution_count": 115,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Warning! Key category already exists in event StHi_FollowHi\n",
            "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 250k+_and Stress Cap Raised to 120\n",
            "Warning! Key category already exists in event StHi_FollowHi\n",
            "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 250k+_and Stress Cap Raised to 120\n",
            "Warning! Key category already exists in event StHi_FollowHi\n",
            "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 250k+_and Stress Cap Raised to 120\n",
            "Warning! Key category already exists in event StHi_FollowHi\n",
            "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 250k+_and Stress Cap Raised to 120\n",
            "Warning! Key category already exists in event StHi_FollowHi\n",
            "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 500k+_and Stress Cap Raised to 120\n",
            "Warning! Key category already exists in event StHi_FollowHi\n",
            "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 500k+_and Stress Cap Raised to 120\n",
            "Warning! Key category already exists in event StHi_FollowHi\n",
            "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 500k+_and Stress Cap Raised to 120\n",
            "Warning! Key category already exists in event StHi_FollowHi\n",
            "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 1m+_and Stress Cap Raised to 120\n",
            "Warning! Key category already exists in event StHi_FollowHi\n",
            "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 1m+_and Stress Cap Raised to 120\n",
            "Warning! Key category already exists in event StHi_FollowHi\n",
            "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 1m+_and Stress Cap Raised to 120\n",
            "Warning! Key category already exists in event StHi_FollowHi\n",
            "try overwrite Random Noon Event_ Followers 100k+_and Stress Cap Raised to 120 to Random Noon Event_ Followers 1m+_and Stress Cap Raised to 120\n",
            "Warning! Key category already exists in event YamiHi_FollowHi\n",
            "try overwrite Random Noon Event_ Followers 250k+_and Darkness 61+ to Random Noon Event_ Followers 500k+_and Darkness 61+\n",
            "Warning! Key category already exists in event YamiHi_FollowHi\n",
            "try overwrite Random Noon Event_ Followers 250k+_and Darkness 61+ to Random Noon Event_ Followers 500k+_and Darkness 61+\n",
            "Warning! Key category already exists in event YamiHi_FollowHi\n",
            "try overwrite Random Noon Event_ Followers 250k+_and Darkness 61+ to Random Noon Event_ Followers 1m+_and Darkness 61+\n",
            "Warning! Key category already exists in event YamiHi_FollowHi\n",
            "try overwrite Random Noon Event_ Followers 250k+_and Darkness 61+ to Random Noon Event_ Followers 1m+_and Darkness 61+\n",
            "Warning! Key category already exists in event YamiHi_FollowHi\n",
            "try overwrite Random Noon Event_ Followers 250k+_and Darkness 61+ to Random Noon Event_ Followers 1m+_and Darkness 61+\n",
            "Warning! Key category already exists in event KenjoHi\n",
            "try overwrite Random Noon Event_ Darkness 0-19 to Random Noon Event_ Darkness 0-14\n",
            "Warning! Key category already exists in event KenjoHi\n",
            "try overwrite Random Noon Event_ Darkness 0-19 to Random Noon Event_ Darkness 0-9\n",
            "Warning! Key category already exists in event KenjoHi\n",
            "try overwrite Random Noon Event_ Darkness 0-19 to Random Noon Event_ Darkness 0-9\n",
            "Warning! Key category already exists in event KenjoHi\n",
            "try overwrite Random Noon Event_ Darkness 0-19 to Random Noon Event_ Darkness 0-4\n",
            "Warning! Key category already exists in event KenjoHi\n",
            "try overwrite Random Noon Event_ Darkness 0-19 to Random Noon Event_ Darkness 0-4\n",
            "Warning! Key category already exists in event YamiHi_SukiHi\n",
            "try overwrite Random Noon Event_ Affection _and Darkness at 41+ to Random Noon Event_ Affection _and Darkness at 61+\n",
            "Warning! Key category already exists in event YamiHi_SukiHi\n",
            "try overwrite Random Noon Event_ Affection _and Darkness at 41+ to Random Noon Event_ Affection _and Darkness at 61+\n",
            "Warning! Key category already exists in event YamiHi_SukiHi\n",
            "try overwrite Random Noon Event_ Affection _and Darkness at 41+ to Random Noon Event_ Affection _and Darkness at 81+\n",
            "Warning! Key category already exists in event Day0_JINE\n",
            "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n",
            "Warning! Key category already exists in event Day0_JINE\n",
            "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n",
            "Warning! Key category already exists in event Day0_JINE\n",
            "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n",
            "Warning! Key category already exists in event Day0_JINE\n",
            "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n",
            "Warning! Key category already exists in event Day0_JINE\n",
            "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n",
            "Warning! Key category already exists in event Day0_JINE\n",
            "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n",
            "Warning! Key category already exists in event Day0_JINE\n",
            "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n",
            "Warning! Key category already exists in event Day0_JINE\n",
            "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n",
            "Warning! Key category already exists in event Day0_JINE\n",
            "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n",
            "Warning! Key category already exists in event Day0_JINE\n",
            "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n",
            "Warning! Key category already exists in event Day0_JINE\n",
            "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n",
            "Warning! Key category already exists in event Day0_JINE\n",
            "try overwrite Day 1_ Logged In (Before Stream) to Day 1_ Logged In (After Stream)\n",
            "Warning! Key category already exists in event Ending_Normal\n",
            "try overwrite Ending_ Utopian Parody to Ending_ Utopian Parody _(With Trauma Event)\n",
            "Warning! Key category already exists in event Ending_Normal\n",
            "try overwrite Ending_ Utopian Parody to Ending_ Utopian Parody _(With Trauma Event)\n",
            "Warning! Key category already exists in event Ending_Normal\n",
            "try overwrite Ending_ Utopian Parody to Ending_ Utopian Parody _(With Trauma Event; answered correctly)\n",
            "Warning! Key category already exists in event Ending_Normal\n",
            "try overwrite Ending_ Utopian Parody to Ending_ Utopian Parody _(With Trauma Event; answered correctly)\n",
            "Warning! Key category already exists in event Ending_Normal\n",
            "try overwrite Ending_ Utopian Parody to Ending_ Utopian Parody _(With Trauma Event; answered correctly)\n",
            "Warning! Key category already exists in event Ending_Normal\n",
            "try overwrite Ending_ Utopian Parody to Ending_ Utopian Parody _(With Trauma Event; answered correctly)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Preview the first rows of the formatted output. head must be called;\n",
        "# printing re_Title.head only shows the bound method object.\n",
        "re_Title.head()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "VSuD3iv_GGjt",
        "outputId": "2f9a34cf-6302-4caf-b29b-bbfaeaa5bb09"
      },
      "execution_count": 30,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Empty DataFrame\n",
            "Columns: [{\"payload\":{\"allShortcutsEnabled\":false, fileTree:{\"data\":{\"items\":[{\"name\":\"Jines.csv\", path:\"data/Jines.csv\", contentType:\"file\"}, {\"name\":\"Jines.xlsx\", path:\"data/Jines.xlsx\", contentType:\"file\"}.1, {\"name\":\"emoji_story_23.jsonl\", path:\"data/emoji_story_23.jsonl\", contentType:\"file\"}.2, {\"name\":\"original_story_23.jsonl\", path:\"data/original_story_23.jsonl\", contentType:\"file\"}], totalCount:4}, :{\"items\":[{\"name\":\"data\", path:\"data\", contentType:\"directory\"}, {\"name\":\"notebook\", path:\"notebook\", contentType:\"directory\"}.1, {\"name\":\"src\", path:\"src\", contentType:\"directory\"}.2, {\"name\":\".DS_Store\", path:\".DS_Store\", contentType:\"file\"}.3, {\"name\":\".gitignore\", path:\".gitignore\", contentType:\"file\"}.4, {\"name\":\"LICENSE\", path:\"LICENSE\", contentType:\"file\"}.5, {\"name\":\"README.md\", path:\"README.md\", contentType:\"file\"}].1, totalCount:7}}, fileTreeProcessingTime:5.894192, foldersToFetch:[], reducedMotionEnabled:null, repo:{\"id\":713164097, defaultBranch:\"main\", name:\"Needy-Haruhi\", ownerLogin:\"LC1332\", currentUserCanPush:false, isFork:false, isEmpty:false, createdAt:\"2023-11-02T01:04:46.000Z\", ownerAvatar:\"https://avatars.githubusercontent.com/u/5266090?v=4\", public:true, private:false, isOrgOwned:false}, symbolsExpanded:false, treeExpanded:true, refInfo:{\"name\":\"main\", listCacheKey:\"v0:1699146562.0\", canEdit:false, refType:\"branch\", currentOid:\"7804db2cd6540b664df0e89e5d6c2ee0e62248ac\"}, path:\"data/Jines.csv\".1, currentUser:null, blob:{\"rawLines\":null, stylingDirectives:null, csv:[[\"Category\", Id, ParentId (more info), Speaker/Action (in blue), Stress, Affection, Darkness, BodyCn, null, null.1, null.2, null.3, null.4, null.5, null.6, null.7, null.8, null.9, null.10, null.11, null.12, null.13, null.14, null.15, null.16, null.17, null.18, null.19, null.20, null], [\"Random Dusk/Night Texts\", LineWeekDay003, weekday, ame, 
null.21, null.22, null.23, 早上起不来！好想死～～～, ...]\n",
            "Index: []\n",
            "\n",
            "[0 rows x 46399 columns]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Convert an event txt file to jsonl\n",
        "def parse_to_jsonl(file_path, output_path='emoji_story_23.jsonl'):\n",
        "    \"\"\"Parse one event text file and append its dialogs to a JSONL file.\n",
        "\n",
        "    The file is scanned line by line. A line starting with '## 对话' opens a\n",
        "    new dialog, '### Prefix' means the following line holds the dialog prefix,\n",
        "    '### Option' opens a new option, and 'User' / 'Reply' / 'Attribute Change'\n",
        "    lines fill in the current option ('Reply' takes its text from the\n",
        "    following line). The '**Prefix', '**Option', '**Reply:**' and\n",
        "    '**Attribute Change:**' spellings are accepted as well.\n",
        "\n",
        "    Args:\n",
        "        file_path: path of the UTF-8 text file to parse.\n",
        "        output_path: JSONL file that receives one JSON object per dialog.\n",
        "            Records are appended, so repeated calls accumulate.\n",
        "\n",
        "    Returns:\n",
        "        None. The result is written to output_path.\n",
        "    \"\"\"\n",
        "    bold_attr = '**Attribute Change:**'\n",
        "    plain_attr = 'Attribute Change:'\n",
        "\n",
        "    with open(file_path, 'r', encoding='utf-8') as f:\n",
        "        lines = iter(f.readlines())\n",
        "    dialogs = []\n",
        "    dialog = {}\n",
        "    option = {}\n",
        "    for line in lines:\n",
        "        line = line.strip()\n",
        "        if line.startswith(\"## 对话\"):\n",
        "            # Flush what has been collected so far before starting a new dialog.\n",
        "            if dialog and option:\n",
        "                dialog.setdefault(\"options\", []).append(option)\n",
        "                option = {}\n",
        "            if dialog:\n",
        "                dialogs.append(dialog)\n",
        "            dialog = {\"prefix\": \"\", \"options\": []}\n",
        "        elif line.startswith(\"### Prefix\") or line.startswith('**Prefix'):\n",
        "            # The prefix text sits on the next line. Default to '' so that a\n",
        "            # marker on the very last line does not leak StopIteration.\n",
        "            prefix = next(lines, \"\").strip()\n",
        "            ids, categories = search_in_excel(prefix)\n",
        "            if ids and categories:\n",
        "                dialog[\"id\"] = ids[0]\n",
        "                dialog[\"category\"] = categories[0]\n",
        "            dialog[\"prefix\"] = prefix\n",
        "        elif line.startswith(\"### Option\") or line.startswith('**Option'):\n",
        "            if option:\n",
        "                # setdefault: an option may show up before any dialog header,\n",
        "                # in which case dialog has no \"options\" list yet.\n",
        "                dialog.setdefault(\"options\", []).append(option)\n",
        "            option = {\"user\": \"\", \"reply\": \"\", \"attribute_change\": \"\"}\n",
        "        elif line.startswith(\"User\"):\n",
        "            # Drop the 5-character 'User:' label.\n",
        "            option[\"user\"] = line[5:].strip()\n",
        "        elif line.startswith(\"Reply\") or line.startswith('**Reply:**'):\n",
        "            # The reply text sits on the next line (same default as for Prefix).\n",
        "            option[\"reply\"] = next(lines, \"\").strip()\n",
        "        elif line.startswith(bold_attr):\n",
        "            # Strip the full bold marker; a fixed 17-character slice would\n",
        "            # leave 'ge:**' glued to the front of the value.\n",
        "            option[\"attribute_change\"] = line[len(bold_attr):].strip()\n",
        "        elif line.startswith(\"Attribute Change\"):\n",
        "            option[\"attribute_change\"] = line[len(plain_attr):].strip()\n",
        "\n",
        "    # Flush the trailing option and dialog.\n",
        "    if option:\n",
        "        dialog.setdefault(\"options\", []).append(option)\n",
        "    if dialog:\n",
        "        dialogs.append(dialog)\n",
        "\n",
        "    with open(output_path, 'a+', encoding=\"utf-8\") as outfile:\n",
        "        for entry in dialogs:\n",
        "            json.dump(entry, outfile, ensure_ascii=False)\n",
        "            outfile.write('\\n')"
      ],
      "metadata": {
        "id": "sghUu4Or-uC4"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Convert every txt file under events/ to jsonl\n",
        "for filename in os.listdir('events'):\n",
        "    if filename.endswith(\".txt\"):\n",
        "        try:\n",
        "            parse_to_jsonl(f'events/{filename}')\n",
        "        except Exception as exc:\n",
        "            # Quarantine files that fail to parse. The file being parsed lives\n",
        "            # in events/, so that is where it has to be moved from; make sure\n",
        "            # the target directory exists so the handler itself cannot fail.\n",
        "            os.makedirs('error', exist_ok=True)\n",
        "            shutil.move(f'events/{filename}', f'error/{filename}')\n",
        "            print(f'{filename}: {exc!r}')"
      ],
      "metadata": {
        "id": "nBLuoQov_C_5"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}