{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "view-in-github"
   },
   "source": [
    "<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/04-RAG_with_VectorStore.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "5BGJ3fxhOk2V"
   },
   "source": [
    "# Install Packages and Setup Variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "QPJzr-I9XQ7l",
    "outputId": "9949a0e5-8bf2-4ae7-9921-1f9dfbece9ae"
   },
   "outputs": [],
   "source": [
    "!pip install -q llama-index==0.10.5 llama-index-vector-stores-chroma==0.1.1 openai==1.12.0 tiktoken==0.6.0 chromadb==0.4.22 kaleido==0.2.1 python-multipart==0.0.9"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "id": "riuXwpSPcvWC"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
    "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "I9JbAzFcjkpn"
   },
   "source": [
    "# Load the Dataset (CSV)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "_Tif8-JoRH68"
   },
   "source": [
    "## Download"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "4fQaa1LN1mXL"
   },
   "source": [
    "The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. Read the dataset as a long string."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "id": "-QTUkdfJjY4N"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
      "                                 Dload  Upload   Total   Spent    Left  Speed\n",
      "100  169k  100  169k    0     0   602k      0 --:--:-- --:--:-- --:--:--  603k\n"
     ]
    }
   ],
   "source": [
    "!curl -o ./mini-dataset.csv https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "zk-4alIxROo8"
   },
   "source": [
    "## Read File"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "7CYwRT6R0o0I",
    "outputId": "6f0f05ae-c92f-45b2-bbc3-d12add118021"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "841"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import csv\n",
    "\n",
    "text = \"\"\n",
    "\n",
    "# Load the file as a JSON\n",
    "with open(\"./mini-dataset.csv\", mode=\"r\", encoding=\"ISO-8859-1\") as file:\n",
    "  csv_reader = csv.reader(file)\n",
    "\n",
    "  for row in csv_reader:\n",
    "    text += row[0]\n",
    "\n",
    "# The number of characters in the dataset.\n",
    "len( text )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "S17g2RYOjmf2"
   },
   "source": [
    "# Chunking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "STACTMUR1z9N",
    "outputId": "8ce58d6b-a38d-48e3-8316-7435907488cf"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chunk_size = 512\n",
    "chunks = []\n",
    "\n",
    "# Split the long text into smaller manageable chunks of 512 characters.\n",
    "for i in range(0, len(text), chunk_size):\n",
    "    chunks.append(text[i:i + chunk_size])\n",
    "\n",
    "len( chunks )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "id": "CtdsIUQ81_hT"
   },
   "outputs": [],
   "source": [
    "from llama_index.core import Document\n",
    "\n",
    "# Convert the chunks to Document objects so the LlamaIndex framework can process them.\n",
    "documents = [Document(text=t) for t in chunks]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "OWaT6rL7ksp8"
   },
   "source": [
    "# Save on Chroma"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "id": "mXi56KTXk2sp"
   },
   "outputs": [],
   "source": [
    "import chromadb\n",
    "\n",
    "# create client and a new collection\n",
    "# chromadb.EphemeralClient saves data in-memory.\n",
    "chroma_client = chromadb.PersistentClient(path=\"./mini-chunked-dataset\")\n",
    "chroma_collection = chroma_client.create_collection(\"mini-chunked-dataset\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "id": "jKXURvLtkuTS"
   },
   "outputs": [],
   "source": [
    "from llama_index.vector_stores.chroma import ChromaVectorStore\n",
    "from llama_index.core import StorageContext\n",
    "\n",
    "# Define a storage context object using the created vector database.\n",
    "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "id": "WsD52wtrlESi"
   },
   "outputs": [],
   "source": [
    "from llama_index.core import VectorStoreIndex\n",
    "\n",
    "# Add the documents to the database and create Index / embeddings\n",
    "index = VectorStoreIndex.from_documents(\n",
    "    documents, storage_context=storage_context\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "8JPD8yAinVSq"
   },
   "source": [
    "# Query Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "id": "mzS13x1ZlZ5X"
   },
   "outputs": [],
   "source": [
    "# Define a query engine that is responsible for retrieving related pieces of text,\n",
    "# and using a LLM to formulate the final answer.\n",
    "query_engine = index.as_query_engine()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "AYsQ4uLN_Oxg",
    "outputId": "bf2181ad-27f6-40a2-b792-8a2714a60c29"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The LLaMA2 model has a certain number of parameters, but without any specific information provided in the context, it is not possible to determine the exact number of parameters.\n"
     ]
    }
   ],
   "source": [
    "response = query_engine.query(\n",
    "    \"How many parameters LLaMA2 model has?\"\n",
    ")\n",
    "print(response)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "authorship_tag": "ABX9TyNQkVEh0x7hcM9U+6JSEkSG",
   "include_colab_link": true,
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}