Spaces:

towardsai-tutors
/

ai-tutor-chatbot

Running

App Files Files Community

Omar Solano committed on Feb 16, 2024

Commit

dda976b

1 Parent(s): 567c34a

update llama-index

Browse files

Files changed (1) hide show

notebooks/04-RAG_with_VectorStore.ipynb +319 -347

notebooks/04-RAG_with_VectorStore.ipynb CHANGED Viewed

@@ -1,361 +1,333 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
     "colab": {
-      "provenance": [],
-      "authorship_tag": "ABX9TyNQkVEh0x7hcM9U+6JSEkSG",
-      "include_colab_link": true
     },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    }
   },
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "view-in-github",
-        "colab_type": "text"
-      },
-      "source": [
-        "<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/04-RAG_with_VectorStore.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "# Install Packages and Setup Variables"
-      ],
-      "metadata": {
-        "id": "5BGJ3fxhOk2V"
-      }
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "QPJzr-I9XQ7l",
-        "outputId": "9949a0e5-8bf2-4ae7-9921-1f9dfbece9ae"
-      },
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m51.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m225.4/225.4 kB\u001b[0m \u001b[31m20.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.7/51.7 kB\u001b[0m \u001b[31m5.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m67.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m508.6/508.6 kB\u001b[0m \u001b[31m42.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m79.9/79.9 MB\u001b[0m \u001b[31m11.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.7/45.7 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.0/143.0 kB\u001b[0m \u001b[31m12.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.9/75.9 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m70.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m63.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.1/92.1 kB\u001b[0m \u001b[31m10.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.3/60.3 kB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m63.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m62.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.9/57.9 kB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m105.6/105.6 kB\u001b[0m \u001b[31m10.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.3/67.3 kB\u001b[0m \u001b[31m7.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[?25h  Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
-            "  Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
-            "  Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m698.9/698.9 kB\u001b[0m \u001b[31m45.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m69.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m72.6/72.6 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.0/67.0 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.9/76.9 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.8/143.8 kB\u001b[0m \u001b[31m14.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.8/50.8 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.4/341.4 kB\u001b[0m \u001b[31m23.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m58.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m69.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m130.2/130.2 kB\u001b[0m \u001b[31m14.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[?25h  Building wheel for pypika (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
-            "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
-            "tensorflow-probability 0.22.0 requires typing-extensions<4.6.0, but you have typing-extensions 4.9.0 which is incompatible.\u001b[0m\u001b[31m\n",
-            "\u001b[0m"
-          ]
-        }
-      ],
-      "source": [
-        "!pip install -q llama-index==0.9.21 openai==1.6.0 cohere==4.39 tiktoken==0.5.2 chromadb==0.4.21 kaleido==0.2.1 python-multipart==0.0.6"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "import os\n",
-        "\n",
-        "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
-        "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
-      ],
-      "metadata": {
-        "id": "riuXwpSPcvWC"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "# Load the Dataset (CSV)"
-      ],
-      "metadata": {
-        "id": "I9JbAzFcjkpn"
-      }
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "## Download"
-      ],
-      "metadata": {
-        "id": "_Tif8-JoRH68"
-      }
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. Read the dataset as a long string."
-      ],
-      "metadata": {
-        "id": "4fQaa1LN1mXL"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "!wget https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-dataset.csv"
-      ],
-      "metadata": {
-        "id": "-QTUkdfJjY4N"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
     {
-      "cell_type": "markdown",
-      "source": [
-        "## Read File"
-      ],
-      "metadata": {
-        "id": "zk-4alIxROo8"
-      }
     },
     {
-      "cell_type": "code",
-      "source": [
-        "import csv\n",
-        "\n",
-        "text = \"\"\n",
-        "\n",
-        "# Load the file as a JSON\n",
-        "with open(\"./mini-dataset.csv\", mode=\"r\", encoding=\"ISO-8859-1\") as file:\n",
-        "  csv_reader = csv.reader(file)\n",
-        "\n",
-        "  for row in csv_reader:\n",
-        "    text += row[0]\n",
-        "\n",
-        "# The number of characters in the dataset.\n",
-        "len( text )"
-      ],
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "7CYwRT6R0o0I",
-        "outputId": "6f0f05ae-c92f-45b2-bbc3-d12add118021"
-      },
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "execute_result",
-          "data": {
-            "text/plain": [
-              "23632"
-            ]
-          },
-          "metadata": {},
-          "execution_count": 4
-        }
       ]
     },
     {
-      "cell_type": "markdown",
-      "source": [
-        "# Chunking"
-      ],
-      "metadata": {
-        "id": "S17g2RYOjmf2"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "chunk_size = 512\n",
-        "chunks = []\n",
-        "\n",
-        "# Split the long text into smaller manageable chunks of 512 characters.\n",
-        "for i in range(0, len(text), chunk_size):\n",
-        "    chunks.append(text[i:i + chunk_size])\n",
-        "\n",
-        "len( chunks )"
-      ],
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "STACTMUR1z9N",
-        "outputId": "8ce58d6b-a38d-48e3-8316-7435907488cf"
-      },
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "execute_result",
-          "data": {
-            "text/plain": [
-              "47"
-            ]
-          },
-          "metadata": {},
-          "execution_count": 6
-        }
       ]
     },
     {
-      "cell_type": "code",
-      "source": [
-        "from llama_index import Document\n",
-        "\n",
-        "# Convert the chunks to Document objects so the LlamaIndex framework can process them.\n",
-        "documents = [Document(text=t) for t in chunks]"
-      ],
-      "metadata": {
-        "id": "CtdsIUQ81_hT"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "# Save on Chroma"
-      ],
-      "metadata": {
-        "id": "OWaT6rL7ksp8"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "import chromadb\n",
-        "\n",
-        "# create client and a new collection\n",
-        "# chromadb.EphemeralClient saves data in-memory.\n",
-        "chroma_client = chromadb.PersistentClient(path=\"./mini-chunked-dataset\")\n",
-        "chroma_collection = chroma_client.create_collection(\"mini-chunked-dataset\")"
-      ],
-      "metadata": {
-        "id": "mXi56KTXk2sp"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "from llama_index.vector_stores import ChromaVectorStore\n",
-        "from llama_index.storage.storage_context import StorageContext\n",
-        "\n",
-        "# Define a storage context object using the created vector database.\n",
-        "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
-        "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
-      ],
-      "metadata": {
-        "id": "jKXURvLtkuTS"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "from llama_index import VectorStoreIndex\n",
-        "\n",
-        "# Add the documents to the database and create Index / embeddings\n",
-        "index = VectorStoreIndex.from_documents(\n",
-        "    documents, storage_context=storage_context\n",
-        ")"
-      ],
-      "metadata": {
-        "id": "WsD52wtrlESi"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "# Query Dataset"
-      ],
-      "metadata": {
-        "id": "8JPD8yAinVSq"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# Define a query engine that is responsible for retrieving related pieces of text,\n",
-        "# and using a LLM to formulate the final answer.\n",
-        "query_engine = index.as_query_engine()"
-      ],
-      "metadata": {
-        "id": "mzS13x1ZlZ5X"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "response = query_engine.query(\n",
-        "    \"How many parameters LLaMA2 model has?\"\n",
-        ")\n",
-        "print(response)"
-      ],
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "AYsQ4uLN_Oxg",
-        "outputId": "bf2181ad-27f6-40a2-b792-8a2714a60c29"
-      },
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "The Llama-2 model has three different sizes: 7B, 13B, and 70B.\n"
-          ]
-        }
-      ]
     }
-  ]
-}

 {
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "view-in-github"
+   },
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/04-RAG_with_VectorStore.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "5BGJ3fxhOk2V"
+   },
+   "source": [
+    "# Install Packages and Setup Variables"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
     "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "QPJzr-I9XQ7l",
+    "outputId": "9949a0e5-8bf2-4ae7-9921-1f9dfbece9ae"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install -q llama-index==0.10.5 llama-index-vector-stores-chroma==0.1.1 openai==1.12.0 tiktoken==0.6.0 chromadb==0.4.22 kaleido==0.2.1 python-multipart==0.0.9"
+   ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "id": "riuXwpSPcvWC"
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
+    "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "I9JbAzFcjkpn"
+   },
+   "source": [
+    "# Load the Dataset (CSV)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "_Tif8-JoRH68"
+   },
+   "source": [
+    "## Download"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "4fQaa1LN1mXL"
+   },
+   "source": [
+    "The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. Read the dataset as a long string."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "id": "-QTUkdfJjY4N"
+   },
+   "outputs": [
     {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
+      "                                 Dload  Upload   Total   Spent    Left  Speed\n",
+      "100  169k  100  169k    0     0   602k      0 --:--:-- --:--:-- --:--:--  603k\n"
+     ]
+    }
+   ],
+   "source": [
+    "!curl -o ./mini-dataset.csv https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "zk-4alIxROo8"
+   },
+   "source": [
+    "## Read File"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "7CYwRT6R0o0I",
+    "outputId": "6f0f05ae-c92f-45b2-bbc3-d12add118021"
+   },
+   "outputs": [
     {
+     "data": {
+      "text/plain": [
+       "841"
       ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import csv\n",
+    "\n",
+    "text = \"\"\n",
+    "\n",
+    "# Read the CSV file row by row and concatenate the first column of each row.\n",
+    "with open(\"./mini-dataset.csv\", mode=\"r\", encoding=\"ISO-8859-1\") as file:\n",
+    "  csv_reader = csv.reader(file)\n",
+    "\n",
+    "  for row in csv_reader:\n",
+    "    text += row[0]\n",
+    "\n",
+    "# The number of characters in the dataset.\n",
+    "len( text )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "S17g2RYOjmf2"
+   },
+   "source": [
+    "# Chunking"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "STACTMUR1z9N",
+    "outputId": "8ce58d6b-a38d-48e3-8316-7435907488cf"
+   },
+   "outputs": [
     {
+     "data": {
+      "text/plain": [
+       "2"
       ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "chunk_size = 512\n",
+    "chunks = []\n",
+    "\n",
+    "# Split the long text into smaller manageable chunks of 512 characters.\n",
+    "for i in range(0, len(text), chunk_size):\n",
+    "    chunks.append(text[i:i + chunk_size])\n",
+    "\n",
+    "len( chunks )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "id": "CtdsIUQ81_hT"
+   },
+   "outputs": [],
+   "source": [
+    "from llama_index.core import Document\n",
+    "\n",
+    "# Convert the chunks to Document objects so the LlamaIndex framework can process them.\n",
+    "documents = [Document(text=t) for t in chunks]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "OWaT6rL7ksp8"
+   },
+   "source": [
+    "# Save on Chroma"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "id": "mXi56KTXk2sp"
+   },
+   "outputs": [],
+   "source": [
+    "import chromadb\n",
+    "\n",
+    "# create client and a new collection\n",
+    "# PersistentClient saves data on disk at the given path (chromadb.EphemeralClient keeps it in memory only).\n",
+    "chroma_client = chromadb.PersistentClient(path=\"./mini-chunked-dataset\")\n",
+    "chroma_collection = chroma_client.create_collection(\"mini-chunked-dataset\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "id": "jKXURvLtkuTS"
+   },
+   "outputs": [],
+   "source": [
+    "from llama_index.vector_stores.chroma import ChromaVectorStore\n",
+    "from llama_index.core import StorageContext\n",
+    "\n",
+    "# Define a storage context object using the created vector database.\n",
+    "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
+    "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "id": "WsD52wtrlESi"
+   },
+   "outputs": [],
+   "source": [
+    "from llama_index.core import VectorStoreIndex\n",
+    "\n",
+    "# Add the documents to the database and create Index / embeddings\n",
+    "index = VectorStoreIndex.from_documents(\n",
+    "    documents, storage_context=storage_context\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "8JPD8yAinVSq"
+   },
+   "source": [
+    "# Query Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "id": "mzS13x1ZlZ5X"
+   },
+   "outputs": [],
+   "source": [
+    "# Define a query engine that is responsible for retrieving related pieces of text,\n",
+    "# and using an LLM to formulate the final answer.\n",
+    "query_engine = index.as_query_engine()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "AYsQ4uLN_Oxg",
+    "outputId": "bf2181ad-27f6-40a2-b792-8a2714a60c29"
+   },
+   "outputs": [
     {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The LLaMA2 model has a certain number of parameters, but without any specific information provided in the context, it is not possible to determine the exact number of parameters.\n"
+     ]
     }
+   ],
+   "source": [
+    "response = query_engine.query(\n",
+    "    \"How many parameters LLaMA2 model has?\"\n",
+    ")\n",
+    "print(response)"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "authorship_tag": "ABX9TyNQkVEh0x7hcM9U+6JSEkSG",
+   "include_colab_link": true,
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}