Upload 4 files
- data_prep.ipynb +1013 -0
- data_prep.pdf +0 -0
- training.ipynb +472 -0
- training.pdf +0 -0
data_prep.ipynb
ADDED
@@ -0,0 +1,1013 @@
## Data prep for retrieving beliefs for dialogs

**Goal:** Create a dataset to match dialogs with (possibly) relevant facts

**Method:**
- [x] Use stacked_samsum as training dataset
- [x] Prepare datasets
  - [x] remove unnecessary columns
  - [x] expand the stacked dataset
  - [x] truncate on the right to create dangling examples
  - [x] augment dialog using openai to make longer

### Constants

```python
model_name = "BAAI/bge-small-en-v1.5"
max_len = 512
next_concept_sep = "\n[NEXT_CONCEPT]\n"
training_input_file = "./data/train-soft.jsonl"
eval_input_file = "./data/eval.jsonl"
training_hn_file = "./data/train.jsonl"
eval_size = 12_500
seed = 42
query_prefix = "Represent this sentence for searching relevant passages: "
hf_repo_name = "julep-ai/dfe-stacked_samsum"
```

### Imports and utils

```python
from functools import partial
import os
import random
import time

from datasets import load_dataset, load_from_disk
from FlagEmbedding import FlagModel
from FlagEmbedding.baai_general_embedding.finetune.hn_mine import find_knn_neg
from huggingface_hub import HfApi
import jsonlines as jsonl
import langchain
from langchain.cache import SQLiteCache
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from math import ceil
from numpy import cumsum, dot
from numpy.linalg import norm
from tqdm.auto import tqdm
from transformers import AutoTokenizer
```

#### Tokenizer

```python
tokenizer = AutoTokenizer.from_pretrained(model_name)
```

#### LLM

```python
langchain.llm_cache = SQLiteCache(database_path=".langchain.db")
llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0.7)
```

```python
prompt_template = PromptTemplate.from_template(
"""\
You are a dialog writer. Given a dialog continue it for {n} more turns in the same style as the original speakers. You can be creative in coming up with the next turns as long as you make sure that the new dialog is consistent with the previous messages.

### Example Dialog

Ken: Hi, how are you?
Ang: Just peachy! You?
Ken: I'm okay...
Ang: Just okay? What's wrong?
Ken: Just stressed; work stuff, fighting with Brad, too much going on at mom's.
Ang: Hang in there, it will get better!
Ken: I know, but it's a lot.
Ang: Can I do anything to help?
Ken: You are! Listening to me vent! LOL!
Ang: Are you at least doing anything fun this weekend?
Ken: Show Saturday night, then seeing the grandkids on Sunday at the zoo.

### Continuation

Ang: Sounds great! That will cheer you up!
Ken: Gotta run, work calls. Love you!
Ang: Love you too! Have a fantastic day!
Ken: You too!

### Input Dialog

{input_dialog}

### Continuation
"""
)

def gen_continuation(input_dialog, n=4):
    wait = round(random.uniform(0.3, 1.2), 3)
    time.sleep(wait)

    prompt = prompt_template.format(n=n, input_dialog=input_dialog)
    continuation = llm(prompt).strip()

    return continuation
```

#### Dataset load

```python
# Get everything, we'll split it later
dataset = load_dataset(
    "stacked-summaries/stacked-samsum-1024",
)

# Remove unnecessary columns
dataset = dataset.remove_columns(['chapter_length', 'summary_length', 'is_stacked',])

# Remove empty/null dialogs
dataset = dataset.filter(
    lambda row: row["dialogue"]
)

# Convert windows-style line endings to unix-style
dataset = dataset.map(
    lambda row: dict(dialogue=row["dialogue"].replace("\r\n", '\n'))
)
```

#### Dataset prep

```python
def count_tokens(row):
    """Count tokens using the tokenizer"""

    dialogue = row["dialogue"]
    tokens = tokenizer.encode(dialogue, add_special_tokens=False)

    return dict(token_count=len(tokens))
```

```python
# Add token count to every row in dataset
dataset = dataset.map(count_tokens)
```

```python
def offset_left(
    dialogue: str,
    split_offset=0,
    splits=1,
    max_len=max_len,
):
    # Split dialog lines
    lines = dialogue.split("\n")

    # Count tokens per line
    toks_by_line = [
        len(tokenizer.encode(line, add_special_tokens=False))
        for line in lines
    ]

    # Cumulative sum of tokens per line
    cum_toks_by_line = cumsum(toks_by_line)

    # Total no. of tokens
    total_tokens = sum(toks_by_line)

    # Return as is if total tokens is less than max len of model
    if total_tokens <= max_len:
        return dialogue

    # Calculate step size
    step_size = ceil(total_tokens / (splits * 2))

    # Calculate left index
    left_index = 0
    for cum_toks in cum_toks_by_line:
        if cum_toks > (split_offset * step_size):
            break

        left_index += 1

    # Calculate right index
    right_index = 0
    for last_cum_toks in cum_toks_by_line[::-1]:
        if last_cum_toks < max_len:
            break

        right_index -= 1

    # Calc final section
    if right_index == 0:
        lines = lines[left_index:]
    else:
        lines = lines[left_index:right_index]

    return "\n".join(lines)
```

```python
def truncate_lines(dialog, num=3, min=5):
    """
    Split dialog into lines and then drop the last `num` lines,
    making sure there are at least `min` lines remaining.
    """

    lines = dialog.split("\n")

    # If too short, return as is
    if len(lines) - num < min:
        return dialog

    if num > 0:
        return "\n".join(lines[:-num])
    else:
        return "\n".join(lines[-num:])
```

```python
def expand_stacked(rows):
    """Expand stacked samsum dataset by splitting concepts in every summary per dialog"""

    # Get fields by batch
    dialogues = rows["dialogue"]
    summaries = rows["summary"]

    # Containers for final results
    is_augmented = []
    is_truncated = []
    final_dialogues = []
    final_summaries = []

    # Process every dialog and summary
    for dialogue, summary in tqdm(zip(dialogues, summaries)):
        # Split the summary by the NEXT_CONCEPT separator from the dataset
        ss = summary.split(next_concept_sep)

        # Split different conversations within the sample
        # offset on the left to try to match relevance
        dd = [
            offset_left(d, split_offset=1) for d in dialogue.split("\n\n")
        ]

        is_truncated += [False] * len(dd)
        is_augmented += [False] * len(dd)
        final_dialogues += dd
        final_summaries += ss

        # ---
        # Now truncate and add
        truncated = [truncate_lines(d) for d in dd]

        is_augmented += [False] * len(dd)
        is_truncated += [t != d for t, d in zip(truncated, dd)]
        final_dialogues += truncated
        final_summaries += ss

        # ---
        # Now augment and add

        augmented = [
            truncate_lines(d + gen_continuation(d), num=-4)
            for d in dd
        ]

        is_truncated += [False] * len(dd)
        is_augmented += [True] * len(dd)
        final_dialogues += augmented
        final_summaries += ss

    return dict(
        dialogue=final_dialogues,
        summary=final_summaries,
        is_truncated=is_truncated,
        token_count=[None]*len(final_summaries),
    )
```

```python
# Use batched mode to be able to expand the size of the dataset
dataset = dataset.map(expand_stacked, batch_size=10, batched=True, num_proc=75)
dataset = dataset.remove_columns(["token_count"])
```

Output:

    Parameter 'function'=<function expand_stacked at 0x7f0a3a68eef0> of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.
    Map (num_proc=75): 0%| | 0/29441 [00:00<?, ? examples/s]
    Map (num_proc=75): 0%| | 0/1633 [00:00<?, ? examples/s]
    Map (num_proc=75): 0%| | 0/1637 [00:00<?, ? examples/s]

```python
dataset.push_to_hub(hf_repo_name)
```

Output:

    Pushing dataset shards to the dataset hub: 0%| | 0/1 [00:00<?, ?it/s]
    Creating parquet from Arrow format: 0%| | 0/339 [00:00<?, ?ba/s]
    Deleting unused files from dataset repository: 0%| | 0/1 [00:00<?, ?it/s]
    Pushing dataset shards to the dataset hub: 0%| | 0/1 [00:00<?, ?it/s]
    Creating parquet from Arrow format: 0%| | 0/20 [00:00<?, ?ba/s]
    Deleting unused files from dataset repository: 0%| | 0/1 [00:00<?, ?it/s]
    Pushing dataset shards to the dataset hub: 0%| | 0/1 [00:00<?, ?it/s]
    Creating parquet from Arrow format: 0%| | 0/19 [00:00<?, ?ba/s]
    Deleting unused files from dataset repository: 0%| | 0/1 [00:00<?, ?it/s]
    Downloading metadata: 0%| | 0.00/752 [00:00<?, ?B/s]

### Prepare dataset for finetuning
[Docs](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/finetune)

Format:
```json
{"query": str, "pos": List[str], "neg": List[str]}
```

Keys:
- query: belief
- pos: list of matching conversations
- neg: list of random conversations from dataset

```python
dataset = load_dataset(hf_repo_name)
```

Output:

    Downloading readme: 0%| | 0.00/752 [00:00<?, ?B/s]
    Downloading data files: 0%| | 0/3 [00:00<?, ?it/s]
    Downloading data: 0%| | 0.00/81.5M [00:00<?, ?B/s]
    Downloading data: 0%| | 0.00/3.91M [00:00<?, ?B/s]
    Downloading data: 0%| | 0.00/3.84M [00:00<?, ?B/s]
    Extracting data files: 0%| | 0/3 [00:00<?, ?it/s]
    Generating train split: 0%| | 0/338127 [00:00<?, ? examples/s]
    Generating validation split: 0%| | 0/19131 [00:00<?, ? examples/s]
    Generating test split: 0%| | 0/18381 [00:00<?, ? examples/s]

```python
def pick_random(dataset, split="train", far_from=0):
    ds = dataset[split]
    ds_len = len(ds)
    mid = ds_len // 2
    which_half = far_from // mid

    start = (1 - which_half) * mid
    end = ds_len - which_half * mid
    idx = random.randrange(start, end)

    return ds[idx]
```

```python
with jsonl.open(training_input_file, mode='w') as writer:
    for i, row in enumerate(tqdm(dataset["train"], total=len(dataset["train"]))):
        query = row["summary"]
        pos = [row["dialogue"]]

        neg = [
            pick_random(dataset, split="train", far_from=i)["dialogue"]
            for _ in range(3)
        ]

        writer.write(dict(query=query, pos=pos, neg=neg))
```

Output:

    0%| | 0/338127 [00:00<?, ?it/s]

```python
with jsonl.open(eval_input_file, mode='w') as writer:
    for i, row in enumerate(tqdm(dataset["validation"], total=eval_size)):
        if i > eval_size:
            break

        query = row["summary"]
        pos = [row["dialogue"]]

        neg = [
            pick_random(dataset, split="validation", far_from=i)["dialogue"]
            for _ in range(3)
        ]

        writer.write(dict(query=query, pos=pos, neg=neg))
```

Output:

    0%| | 0/12500 [00:00<?, ?it/s]

### Mine hard negatives

```python
model = FlagModel(
    model_name,
    query_instruction_for_retrieval=query_prefix,
)
```

```python
find_knn_neg(
    model,
    input_file=training_input_file,
    candidate_pool=None,
    output_file=training_hn_file,
    sample_range=list(range(2, 200)),
    negative_number=10,
    use_gpu=True,
)
```

Output:

    inferencing embedding for corpus (number=37361)--------------
    Inference Embeddings: 100%|██████████| 146/146 [00:37<00:00, 3.87it/s]
    inferencing embedding for queries (number=338127)--------------
    Inference Embeddings: 100%|██████████| 1321/1321 [00:52<00:00, 25.34it/s]
    create index and search------------------
    Batches: 100%|██████████| 5284/5284 [00:07<00:00, 740.63it/s]

### Add processed files to hf dataset

```python
hf_api = HfApi()

for path in [
    training_input_file,
    eval_input_file,
    training_hn_file,
]:
    hf_api.upload_file(
        path_or_fileobj=path,
        path_in_repo=path.split('/')[-1],
        repo_id=hf_repo_name,
        repo_type="dataset",
    )
```

Output:

    train.jsonl: 0%| | 0.00/2.42G [00:00<?, ?B/s]

Notebook metadata: Python 3 (ipykernel), Python 3.10.6, nbformat 4.5.
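For reference, a minimal sketch of the record shape the notebook above writes to `train-soft.jsonl` / `train.jsonl`. The field names (`query`, `pos`, `neg`) come from the notebook; the file path default mirrors `training_hn_file`, while the checking helper itself is an illustrative assumption, not part of the uploaded files.

```python
# Illustrative only: read back the JSONL produced by the data-prep notebook and
# verify each record matches the {"query": str, "pos": [str], "neg": [str]} format
# expected by the FlagEmbedding finetuning script.
import jsonlines as jsonl

def check_records(path="./data/train.jsonl", limit=5):
    with jsonl.open(path) as reader:
        for i, record in enumerate(reader):
            assert isinstance(record["query"], str)                  # summary / belief text
            assert all(isinstance(d, str) for d in record["pos"])    # matching dialog(s)
            assert all(isinstance(d, str) for d in record["neg"])    # mined (hard) negatives
            if i + 1 >= limit:
                break
    print(f"checked {limit} records from {path}")

# check_records()  # run after the notebook has written ./data/train.jsonl
```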
data_prep.pdf
ADDED
Binary file (63.1 kB).
training.ipynb
ADDED
@@ -0,0 +1,472 @@
## Train Dialog-Fact Encoder

**Goal:** Train an embedding model to match dialogs with (possibly) relevant facts

### Constants

```python
model_name = "BAAI/bge-base-en-v1.5"
query_prefix = "Represent this sentence for searching relevant passages: "
max_len = 512
training_hn_file = "./data/train.jsonl"
eval_file = "./data/eval.jsonl"
batch_size = 1350
output_model_path = "./dfe-base-en"
hf_repo_name = "julep-ai/dfe-base-en"
```

### Imports

```python
import itertools as it

import graphviz
import jsonlines as jsonl
from lion_pytorch import Lion
from sentence_transformers import InputExample, SentenceTransformer, losses as ls, models as ml, util
from sentence_transformers.evaluation import SimilarityFunction, TripletEvaluator
import torch
from torch.utils.data import DataLoader, IterableDataset
from tqdm.auto import tqdm
```

### Dataset

```python
def hn_output(file):
    with jsonl.open(file) as reader:
        for entry in reader:
            query = entry["query"]
            pos = [dict(dialog=dialog) for dialog in entry["pos"]]
            neg = [dict(dialog=dialog) for dialog in entry["neg"]]

            for combined in it.product(
                [dict(fact=query)],
                pos,
                neg,
            ):
                yield InputExample(texts=list(combined))
```

```python
training_data = list(tqdm(hn_output(training_hn_file)))
eval_data = list(tqdm(hn_output(eval_file)))
```

Output:

    0it [00:00, ?it/s]
    0it [00:00, ?it/s]

```python
dataloader = DataLoader(training_data, shuffle=True, batch_size=batch_size)
eval_dataloader = DataLoader(eval_data, shuffle=True, batch_size=batch_size // 10)
```

### DFE Model Architecture

```python
# Base model
base_model = SentenceTransformer(model_name)
```

```python
# Freeze base transformer layers
for param in base_model.parameters():
    param.requires_grad = False
```

```python
device = torch.device("cuda:0")

# Note that we must also set _target_device, or any SentenceTransformer.fit() call will reset
# the body location
base_model._target_device = device
base_model = base_model.to(device)
```

```python
emb_dims = base_model._first_module().get_word_embedding_dimension()  # 768

def dense_projector(dims: int):
    proj_dims = dims * 2  # 1536

    return [
        ml.Dense(dims, proj_dims),       # 768 -> 1536
        ml.Dense(proj_dims, proj_dims),  # 1536 -> 1536
        ml.Dropout(0.1),
        ml.Dense(proj_dims, proj_dims),  # 1536 -> 1536
        ml.Dense(proj_dims, dims),       # 1536 -> 768
    ]

def asym_module(dims: int, keys: list[str], allow_empty_key: bool = False):
    return ml.Asym(
        {
            key: dense_projector(dims)
            for key in keys
        },
        allow_empty_key=allow_empty_key,
    )
```

```python
base_model._modules["2"] = asym_module(emb_dims, ["dialog", "fact"])
```

```python
base_model._modules
```

Output:

    OrderedDict([('0',
                  Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel ),
                 ('1',
                  Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})),
                 ('2',
                  Asym(
                    (dialog-0): Dense({'in_features': 768, 'out_features': 1536, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
                    (dialog-1): Dense({'in_features': 1536, 'out_features': 1536, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
                    (dialog-2): Dropout(
                      (dropout_layer): Dropout(p=0.1, inplace=False)
                    )
                    (dialog-3): Dense({'in_features': 1536, 'out_features': 1536, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
                    (dialog-4): Dense({'in_features': 1536, 'out_features': 768, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
                    (fact-0): Dense({'in_features': 768, 'out_features': 1536, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
                    (fact-1): Dense({'in_features': 1536, 'out_features': 1536, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
                    (fact-2): Dropout(
                      (dropout_layer): Dropout(p=0.1, inplace=False)
                    )
                    (fact-3): Dense({'in_features': 1536, 'out_features': 1536, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
                    (fact-4): Dense({'in_features': 1536, 'out_features': 768, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
                  ))])

### Prepare training loss and evaluator

```python
train_loss = ls.TripletLoss(model=base_model)
```

```python
triplet_evaluator = TripletEvaluator.from_input_examples(
    eval_data,  # Triplet is ({dialog: <some_dialog>}, {fact: <relevant_fact>}, [{fact: <negative_irrelevant_fact>}])
    batch_size=batch_size // 10,
    main_distance_function=SimilarityFunction.COSINE,
    show_progress_bar=True,
    write_csv=True,
)
```

### Train model

```python
base_model.fit(
    train_objectives=[(dataloader, train_loss)],
    evaluator=triplet_evaluator,
    checkpoint_save_steps=600,
    evaluation_steps=600,
    checkpoint_path=f"{output_model_path}/ckpts",
    scheduler="WarmupCosine",
    save_best_model=True,
    epochs=15,
    warmup_steps=200,
    optimizer_class=Lion,
    optimizer_params=dict(lr=1e-4, weight_decay=1e-2),
    use_amp=True,
    output_path=output_model_path,
    checkpoint_save_total_limit=4,
)
```

Output:

    Epoch: 0%| | 0/15 [00:00<?, ?it/s]
    Iteration: 0%| | 0/2505 [00:00<?, ?it/s]
    Batches: 0%| | 0/278 [00:00<?, ?it/s]
    Batches: 0%| | 0/278 [00:00<?, ?it/s]
    Batches: 0%| | 0/278 [00:00<?, ?it/s]

```python
base_model.push_to_hub(hf_repo_name)
```

```python
graphviz.set_jupyter_format('png')
```

```python
model_graph = draw_graph(base_model, input_size=(1, 512), device='meta')
model_graph.visual_graph
```

Notebook metadata: Python 3 (ipykernel), Python 3.10.6, nbformat 4.5.
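As a usage note, a minimal sketch of querying the encoder trained above. It assumes the Asym checkpoint saved to `./dfe-base-en` (or the `julep-ai/dfe-base-en` repo once pushed) and reuses the `dialog`/`fact` keys defined in the notebook; the example dialog and fact strings are invented for illustration.

```python
# Illustrative only: load the saved DFE model and score a dialog against candidate facts.
# Because module "2" is an Asym block, inputs are passed as {key: text} dicts so each
# side goes through its own projection head ("dialog" vs "fact").
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("./dfe-base-en")  # or "julep-ai/dfe-base-en" once pushed

dialog = "Ken: I'm stressed about work.\nAng: Hang in there, it will get better!"
facts = [
    "Ken is feeling stressed because of work.",  # hypothetical relevant fact
    "Ang is planning a trip to Paris.",          # hypothetical irrelevant fact
]

dialog_emb = model.encode([{"dialog": dialog}])
fact_embs = model.encode([{"fact": f} for f in facts])

# Higher cosine similarity should indicate a more relevant fact for this dialog.
print(util.cos_sim(dialog_emb, fact_embs))
```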
training.pdf
ADDED
Binary file (46.1 kB).