Spaces:

towardsai-tutors
/

ai-tutor-chatbot

Running

File size: 7,867 Bytes

4b2e474
dda976b
 
08b8fbf
 
dda976b
 
 
 
08b8fbf
dda976b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08b8fbf
dda976b
4b2e474
dda976b
4b2e474
dda976b
 
 
 
 
08b8fbf
dda976b
4b2e474
dda976b
 
08b8fbf
dda976b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08b8fbf
dda976b
 
 
 
fcc14cf
dda976b
 
 
 
 
08b8fbf
dda976b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08b8fbf
dda976b
 
 
fcc14cf
dda976b
 
 
 
4b2e474
dda976b
 
 
4b2e474
dda976b
08b8fbf
dda976b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08b8fbf
dda976b
 
 
4b2e474
dda976b
 
 
 
4b2e474
dda976b
 
 
4b2e474
dda976b
08b8fbf
dda976b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08b8fbf
dda976b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08b8fbf
dda976b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08b8fbf
dda976b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08b8fbf
dda976b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08b8fbf
dda976b
 
 
 
 
a393007
dda976b
 
a393007
 
 
dda976b
 
 
 
08b8fbf
dda976b
 
 
4b2e474
dda976b
 
 
 
4b2e474
dda976b
 
 
08b8fbf
dda976b
4b2e474
dda976b
 
 
 
 
 
 
08b8fbf
 
 
 
 
 
 
dda976b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a393007
dda976b

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab_type": "text",
    "id": "view-in-github"
   },
   "outputs": [],
   "source": [
    "<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/04-RAG_with_VectorStore.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "5BGJ3fxhOk2V"
   },
   "source": [
    "# Install Packages and Setup Variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "QPJzr-I9XQ7l",
    "outputId": "9949a0e5-8bf2-4ae7-9921-1f9dfbece9ae"
   },
   "outputs": [],
   "source": [
    "!pip install -q llama-index==0.10.5 openai==1.12.0 cohere==4.47 tiktoken==0.6.0 chromadb==0.4.22"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "id": "riuXwpSPcvWC"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
    "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "I9JbAzFcjkpn"
   },
   "source": [
    "# Load the Dataset (CSV)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "_Tif8-JoRH68"
   },
   "source": [
    "## Download"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "4fQaa1LN1mXL"
   },
   "source": [
    "The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. Read the dataset as a long string."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "id": "-QTUkdfJjY4N"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
      "                                 Dload  Upload   Total   Spent    Left  Speed\n",
      "100  169k  100  169k    0     0   743k      0 --:--:-- --:--:-- --:--:--  747k\n"
     ]
    }
   ],
   "source": [
    "!curl -o ./mini-dataset.csv https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "zk-4alIxROo8"
   },
   "source": [
    "## Read File"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "7CYwRT6R0o0I",
    "outputId": "6f0f05ae-c92f-45b2-bbc3-d12add118021"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "841"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import csv\n",
    "\n",
    "text = \"\"\n",
    "\n",
    "# Load the file as a JSON\n",
    "with open(\"./mini-dataset.csv\", mode=\"r\", encoding=\"ISO-8859-1\") as file:\n",
    "  csv_reader = csv.reader(file)\n",
    "\n",
    "  for row in csv_reader:\n",
    "    text += row[0]\n",
    "\n",
    "# The number of characters in the dataset.\n",
    "len( text )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "S17g2RYOjmf2"
   },
   "source": [
    "# Chunking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "STACTMUR1z9N",
    "outputId": "8ce58d6b-a38d-48e3-8316-7435907488cf"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chunk_size = 512\n",
    "chunks = []\n",
    "\n",
    "# Split the long text into smaller manageable chunks of 512 characters.\n",
    "for i in range(0, len(text), chunk_size):\n",
    "    chunks.append(text[i:i + chunk_size])\n",
    "\n",
    "len( chunks )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "id": "CtdsIUQ81_hT"
   },
   "outputs": [],
   "source": [
    "from llama_index.core import Document\n",
    "\n",
    "# Convert the chunks to Document objects so the LlamaIndex framework can process them.\n",
    "documents = [Document(text=t) for t in chunks]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "OWaT6rL7ksp8"
   },
   "source": [
    "# Save on Chroma"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "id": "mXi56KTXk2sp"
   },
   "outputs": [],
   "source": [
    "import chromadb\n",
    "\n",
    "# create client and a new collection\n",
    "# chromadb.EphemeralClient saves data in-memory.\n",
    "chroma_client = chromadb.PersistentClient(path=\"./mini-chunked-dataset\")\n",
    "chroma_collection = chroma_client.create_collection(\"mini-chunked-dataset\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "id": "jKXURvLtkuTS"
   },
   "outputs": [],
   "source": [
    "from llama_index.vector_stores.chroma import ChromaVectorStore\n",
    "from llama_index.core import StorageContext\n",
    "\n",
    "# Define a storage context object using the created vector database.\n",
    "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "id": "WsD52wtrlESi"
   },
   "outputs": [],
   "source": [
    "from llama_index.core import VectorStoreIndex\n",
    "\n",
    "# Add the documents to the database and create Index / embeddings\n",
    "index = VectorStoreIndex.from_documents(\n",
    "    documents, storage_context=storage_context\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "8JPD8yAinVSq"
   },
   "source": [
    "# Query Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "id": "mzS13x1ZlZ5X"
   },
   "outputs": [],
   "source": [
    "from llama_index.llms.openai import OpenAI\n",
    "# Define a query engine that is responsible for retrieving related pieces of text,\n",
    "# and using a LLM to formulate the final answer.\n",
    "\n",
    "llm = OpenAI(temperature=0, model=\"gpt-3.5-turbo-0125\", max_tokens=512)\n",
    "query_engine = index.as_query_engine(llm=llm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "AYsQ4uLN_Oxg",
    "outputId": "bf2181ad-27f6-40a2-b792-8a2714a60c29"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The LLaMA2 model has 7 billion parameters.\n"
     ]
    }
   ],
   "source": [
    "response = query_engine.query(\n",
    "    \"How many parameters LLaMA2 model has?\"\n",
    ")\n",
    "print(response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "authorship_tag": "ABX9TyNQkVEh0x7hcM9U+6JSEkSG",
   "include_colab_link": true,
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}