Spaces:

towardsai-tutors
/

ai-tutor-chatbot

Running

App Files Files Community

AlaFalaki committed on Jan 3, 2024

Commit

fcc14cf

1 Parent(s): 56b8408

Created using Colaboratory

Browse files

Files changed (1) hide show

notebooks/04-RAG_with_VectorStore.ipynb +55 -44

notebooks/04-RAG_with_VectorStore.ipynb CHANGED Viewed

@@ -4,7 +4,7 @@
   "metadata": {
     "colab": {
       "provenance": [],
-      "authorship_tag": "ABX9TyMKmMCxgNdqmZNkB0r6NNkp",
       "include_colab_link": true
     },
     "kernelspec": {
@@ -26,9 +26,18 @@
         "<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/04-RAG_with_VectorStore.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": 1,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -93,12 +102,13 @@
       "source": [
         "import os\n",
         "\n",
         "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
       ],
       "metadata": {
         "id": "riuXwpSPcvWC"
       },
-      "execution_count": 2,
       "outputs": []
     },
     {
@@ -113,7 +123,16 @@
     {
       "cell_type": "markdown",
       "source": [
-        "Read the dataset as a long string."
       ],
       "metadata": {
         "id": "4fQaa1LN1mXL"
@@ -130,27 +149,30 @@
       "execution_count": null,
       "outputs": []
     },
     {
       "cell_type": "code",
       "source": [
         "import csv\n",
         "\n",
         "text = \"\"\n",
         "with open(\"./mini-dataset.csv\", mode=\"r\", encoding=\"ISO-8859-1\") as file:\n",
         "  csv_reader = csv.reader(file)\n",
         "\n",
         "  for row in csv_reader:\n",
-        "    text += row[0]"
-      ],
-      "metadata": {
-        "id": "0Q9sxuW0g3Gd"
-      },
-      "execution_count": 3,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
         "len( text )"
       ],
       "metadata": {
@@ -160,7 +182,7 @@
         "id": "7CYwRT6R0o0I",
         "outputId": "6f0f05ae-c92f-45b2-bbc3-d12add118021"
       },
-      "execution_count": 4,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -188,18 +210,11 @@
       "source": [
         "chunk_size = 512\n",
         "chunks = []\n",
         "for i in range(0, len(text), chunk_size):\n",
-        "    chunks.append(text[i:i + chunk_size])"
-      ],
-      "metadata": {
-        "id": "IU7zLFi01pjD"
-      },
-      "execution_count": 5,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
         "len( chunks )"
       ],
       "metadata": {
@@ -209,7 +224,7 @@
         "id": "STACTMUR1z9N",
         "outputId": "8ce58d6b-a38d-48e3-8316-7435907488cf"
       },
-      "execution_count": 6,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -228,12 +243,13 @@
       "source": [
         "from llama_index import Document\n",
         "\n",
         "documents = [Document(text=t) for t in chunks]"
       ],
       "metadata": {
         "id": "CtdsIUQ81_hT"
       },
-      "execution_count": 7,
       "outputs": []
     },
     {
@@ -251,14 +267,14 @@
         "import chromadb\n",
         "\n",
         "# create client and a new collection\n",
-        "# chromadb.EphemeralClient to save in-memory.\n",
         "chroma_client = chromadb.PersistentClient(path=\"./mini-chunked-dataset\")\n",
         "chroma_collection = chroma_client.create_collection(\"mini-chunked-dataset\")"
       ],
       "metadata": {
         "id": "mXi56KTXk2sp"
       },
-      "execution_count": 8,
       "outputs": []
     },
     {
@@ -267,13 +283,14 @@
         "from llama_index.vector_stores import ChromaVectorStore\n",
         "from llama_index.storage.storage_context import StorageContext\n",
         "\n",
         "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
         "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
       ],
       "metadata": {
         "id": "jKXURvLtkuTS"
       },
-      "execution_count": 9,
       "outputs": []
     },
     {
@@ -281,6 +298,7 @@
       "source": [
         "from llama_index import VectorStoreIndex\n",
         "\n",
         "index = VectorStoreIndex.from_documents(\n",
         "    documents, storage_context=storage_context\n",
         ")"
@@ -288,7 +306,7 @@
       "metadata": {
         "id": "WsD52wtrlESi"
       },
-      "execution_count": 11,
       "outputs": []
     },
     {
@@ -303,12 +321,14 @@
     {
       "cell_type": "code",
       "source": [
         "query_engine = index.as_query_engine()"
       ],
       "metadata": {
         "id": "mzS13x1ZlZ5X"
       },
-      "execution_count": 12,
       "outputs": []
     },
     {
@@ -326,7 +346,7 @@
         "id": "AYsQ4uLN_Oxg",
         "outputId": "bf2181ad-27f6-40a2-b792-8a2714a60c29"
       },
-      "execution_count": 13,
       "outputs": [
         {
           "output_type": "stream",
@@ -336,15 +356,6 @@
           ]
         }
       ]
-    },
-    {
-      "cell_type": "code",
-      "source": [],
-      "metadata": {
-        "id": "hjYiWAocnalt"
-      },
-      "execution_count": null,
-      "outputs": []
     }
   ]
 }

   "metadata": {
     "colab": {
       "provenance": [],
+      "authorship_tag": "ABX9TyNQkVEh0x7hcM9U+6JSEkSG",
       "include_colab_link": true
     },
     "kernelspec": {
         "<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/04-RAG_with_VectorStore.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
       ]
     },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Install Packages and Setup Variables"
+      ],
+      "metadata": {
+        "id": "5BGJ3fxhOk2V"
+      }
+    },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
       "source": [
         "import os\n",
         "\n",
+        "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
         "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
       ],
       "metadata": {
         "id": "riuXwpSPcvWC"
       },
+      "execution_count": null,
       "outputs": []
     },
     {
     {
       "cell_type": "markdown",
       "source": [
+        "## Download"
+      ],
+      "metadata": {
+        "id": "_Tif8-JoRH68"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. Read the dataset as a long string."
       ],
       "metadata": {
         "id": "4fQaa1LN1mXL"
       "execution_count": null,
       "outputs": []
     },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Read File"
+      ],
+      "metadata": {
+        "id": "zk-4alIxROo8"
+      }
+    },
     {
       "cell_type": "code",
       "source": [
         "import csv\n",
         "\n",
         "text = \"\"\n",
+        "\n",
+        "# Read the CSV file and concatenate the first column of every row into one string.\n",
         "with open(\"./mini-dataset.csv\", mode=\"r\", encoding=\"ISO-8859-1\") as file:\n",
         "  csv_reader = csv.reader(file)\n",
         "\n",
         "  for row in csv_reader:\n",
+        "    text += row[0]\n",
+        "\n",
+        "# The number of characters in the dataset.\n",
         "len( text )"
       ],
       "metadata": {
         "id": "7CYwRT6R0o0I",
         "outputId": "6f0f05ae-c92f-45b2-bbc3-d12add118021"
       },
+      "execution_count": null,
       "outputs": [
         {
           "output_type": "execute_result",
       "source": [
         "chunk_size = 512\n",
         "chunks = []\n",
+        "\n",
+        "# Split the long text into smaller manageable chunks of 512 characters.\n",
         "for i in range(0, len(text), chunk_size):\n",
+        "    chunks.append(text[i:i + chunk_size])\n",
+        "\n",
         "len( chunks )"
       ],
       "metadata": {
         "id": "STACTMUR1z9N",
         "outputId": "8ce58d6b-a38d-48e3-8316-7435907488cf"
       },
+      "execution_count": null,
       "outputs": [
         {
           "output_type": "execute_result",
       "source": [
         "from llama_index import Document\n",
         "\n",
+        "# Convert the chunks to Document objects so the LlamaIndex framework can process them.\n",
         "documents = [Document(text=t) for t in chunks]"
       ],
       "metadata": {
         "id": "CtdsIUQ81_hT"
       },
+      "execution_count": null,
       "outputs": []
     },
     {
         "import chromadb\n",
         "\n",
         "# create client and a new collection\n",
+        "# chromadb.PersistentClient stores the data on disk at `path`; chromadb.EphemeralClient keeps it in memory only.\n",
         "chroma_client = chromadb.PersistentClient(path=\"./mini-chunked-dataset\")\n",
         "chroma_collection = chroma_client.create_collection(\"mini-chunked-dataset\")"
       ],
       "metadata": {
         "id": "mXi56KTXk2sp"
       },
+      "execution_count": null,
       "outputs": []
     },
     {
         "from llama_index.vector_stores import ChromaVectorStore\n",
         "from llama_index.storage.storage_context import StorageContext\n",
         "\n",
+        "# Define a storage context object using the created vector database.\n",
         "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
         "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
       ],
       "metadata": {
         "id": "jKXURvLtkuTS"
       },
+      "execution_count": null,
       "outputs": []
     },
     {
       "source": [
         "from llama_index import VectorStoreIndex\n",
         "\n",
+        "# Add the documents to the database and create Index / embeddings\n",
         "index = VectorStoreIndex.from_documents(\n",
         "    documents, storage_context=storage_context\n",
         ")"
       "metadata": {
         "id": "WsD52wtrlESi"
       },
+      "execution_count": null,
       "outputs": []
     },
     {
     {
       "cell_type": "code",
       "source": [
+        "# Define a query engine that is responsible for retrieving related pieces of text,\n",
+        "# and using an LLM to formulate the final answer.\n",
         "query_engine = index.as_query_engine()"
       ],
       "metadata": {
         "id": "mzS13x1ZlZ5X"
       },
+      "execution_count": null,
       "outputs": []
     },
     {
         "id": "AYsQ4uLN_Oxg",
         "outputId": "bf2181ad-27f6-40a2-b792-8a2714a60c29"
       },
+      "execution_count": null,
       "outputs": [
         {
           "output_type": "stream",
           ]
         }
       ]
     }
   ]
 }