{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "5BGJ3fxhOk2V" }, "source": [ "# Install Packages and Setup Variables\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "id": "QPJzr-I9XQ7l" }, "outputs": [], "source": [ "# %pip (rather than !pip) installs into the environment of the running kernel.\n", "%pip install -q llama-index==0.10.57 llama-index-vector-stores-chroma llama-index-llms-gemini==0.1.11 langchain_google_genai google-generativeai==0.5.4 langchain==0.1.17 langchain-chroma langchain_openai==0.1.5 openai==1.37.0 chromadb" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "riuXwpSPcvWC" }, "outputs": [], "source": [ "import os\n", "from getpass import getpass\n", "\n", "# Set the following API keys in the Python environment. Will be used later.\n", "# A key that is already set is left untouched; a missing or empty one is requested\n", "# through a hidden prompt, so the secret does not have to be typed into the notebook source.\n", "for key_name in (\"OPENAI_API_KEY\", \"GOOGLE_API_KEY\"):\n", "    if not os.environ.get(key_name):\n", "        os.environ[key_name] = getpass(f\"Enter {key_name}: \")\n", "\n", "# Alternative for Google Colab: read the keys from the notebook's user data.\n", "# from google.colab import userdata\n", "# os.environ[\"OPENAI_API_KEY\"] = userdata.get('openai_api_key')\n", "# os.environ[\"GOOGLE_API_KEY\"] = userdata.get('Google_api_key')" ] }, { "cell_type": "markdown", "metadata": { "id": "I9JbAzFcjkpn" }, "source": [ "# Load the Dataset (CSV)\n" ] }, { "cell_type": "markdown", "metadata": { "id": "_Tif8-JoRH68" }, "source": [ "## Download\n" ] }, { "cell_type": "markdown", "metadata": { "id": "4fQaa1LN1mXL" }, "source": [ "The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. 
Read the dataset as a long string.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-QTUkdfJjY4N", "outputId": "34becd46-808a-42ee-e620-3e6b18f79e1d" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 169k 100 169k 0 0 609k 0 --:--:-- --:--:-- --:--:-- 612k\n" ] } ], "source": [ "!curl -o ./mini-dataset.csv https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv" ] }, { "cell_type": "markdown", "metadata": { "id": "zk-4alIxROo8" }, "source": [ "## Read File\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7CYwRT6R0o0I", "outputId": "394603bd-6d33-40aa-8e06-6ef802879234" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "171044\n" ] } ], "source": [ "import csv\n", "\n", "# Load the CSV file and concatenate the second column (index 1) of every data row.\n", "with open(\"./mini-dataset.csv\", mode=\"r\", encoding=\"utf-8\") as file:\n", "    csv_reader = csv.reader(file)\n", "    # Skip the first row (presumably the column headers).\n", "    next(csv_reader, None)\n", "    # str.join builds the string in one pass instead of copying it on every `+=`.\n", "    text = \"\".join(row[1] for row in csv_reader)\n", "\n", "# The number of characters in the dataset.\n", "print(len(text))" ] }, { "cell_type": "markdown", "metadata": { "id": "S17g2RYOjmf2" }, "source": [ "# Chunking\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "STACTMUR1z9N", "outputId": "d5360ce2-2c1e-459b-a3b3-e9899fe762b5" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "335\n" ] } ], "source": [ "chunk_size = 512\n", "\n", "# Split the long text into consecutive chunks of at most `chunk_size` characters;\n", "# only the last chunk can be shorter.\n", "chunks = [text[start : start + chunk_size] for start in range(0, len(text), chunk_size)]\n", "\n", 
"print(len(chunks))" ] }, { "cell_type": "markdown", "metadata": { "id": "9fOomeMGqu10" }, "source": [ "# Interface of Chroma with LlamaIndex\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "CtdsIUQ81_hT" }, "outputs": [], "source": [ "from llama_index.core import Document\n", "\n", "# Convert the chunks to Document objects so the LlamaIndex framework can process them.\n", "documents = [Document(text=chunk) for chunk in chunks]" ] }, { "cell_type": "markdown", "metadata": { "id": "OWaT6rL7ksp8" }, "source": [ "Save on Chroma\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "mXi56KTXk2sp" }, "outputs": [], "source": [ "import chromadb\n", "\n", "# Create a client that persists its data under `path`;\n", "# chromadb.EphemeralClient would keep the data in memory only.\n", "chroma_client = chromadb.PersistentClient(path=\"./mini-chunked-dataset\")\n", "\n", "# get_or_create_collection reuses a collection left on disk by a previous run, whereas\n", "# create_collection raises when the name already exists and breaks Restart & Run All.\n", "# NOTE(review): if a later cell inserts the documents again on every run, delete the\n", "# ./mini-chunked-dataset directory first to avoid storing duplicates.\n", "chroma_collection = chroma_client.get_or_create_collection(\"mini-chunked-dataset\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "jKXURvLtkuTS" }, "outputs": [], "source": [ "from llama_index.vector_stores.chroma import ChromaVectorStore\n", "from llama_index.core import StorageContext\n", "\n", "# Wrap the Chroma collection in a LlamaIndex vector store, then build a storage\n", "# context object that uses it as the vector database.\n", "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n", "storage_context = StorageContext.from_defaults(vector_store=vector_store)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 81, "referenced_widgets": [ "1a0185d42be8489c87874049e5d78424", "e91d7972a14b4444808d649d5db15b7a", "bd3eab2dfee94512b88d88fbd4cb3682", "3ba2fa0f0c8449c3a22ff3593d2b6629", "eb7ce1338f6a4c9a8fb9a92e8de78821", "d9daeb567dcc4387a7ba406a559aa422", "2a99f51fa5c24ce09f629d9d5322879d", "edfd151ed26646d8a6d293a3cf1ecce6", "032b979544624bd4a9f7eb91fa9dd2e8", "28d35749a3fa4c6f80f626221f95cde1", "3d78bf2951e04c21bd89c1d944412e5a", 
"a0991085ef794c8cbf509370b67df911", "780fa2ff5938403cb2c109c491368dce", "f833fd603aa543f59a7e93fe980935cc", "48e949bbaf7c4079bbe8fcf620066b8e", "78fd41df7f604f0290dc034c9f01a18c", "d2df0b3067574d46b951890b996a5751", "b51937856bf64803a85ea7eb30f73cfe", "74fe3cd416994417a2138d8a32beeed8", "80626b461cce49e7bcb6e3e931c5d81c", "23f0bdb422384eaf94a69d20502fdbe4", "dfa74d96a1174e7ead6554b5c984f526" ] }, "id": "WsD52wtrlESi", "outputId": "2d522a85-cdea-477b-a693-e26b404c8ed9" }, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "Parsing nodes: 0%| | 0/335 [00:00