{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "authorship_tag": "ABX9TyNMDozeQs4SEdbsiziEASx2", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "QPJzr-I9XQ7l", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "971cf4cb-ee33-477b-cc7d-d652b55b81f3" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m41.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m225.4/225.4 kB\u001b[0m \u001b[31m16.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.7/51.7 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m45.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m508.6/508.6 kB\u001b[0m \u001b[31m32.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m79.9/79.9 MB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.7/45.7 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.0/143.0 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.9/75.9 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m53.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m53.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.1/92.1 kB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.3/60.3 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m57.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m66.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.9/57.9 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m105.6/105.6 kB\u001b[0m \u001b[31m11.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.3/67.3 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m698.9/698.9 kB\u001b[0m \u001b[31m54.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m82.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m72.6/72.6 kB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m68.9/68.9 kB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.9/76.9 kB\u001b[0m \u001b[31m9.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.8/143.8 kB\u001b[0m \u001b[31m17.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.8/50.8 kB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.4/341.4 kB\u001b[0m \u001b[31m37.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m87.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m76.2 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m130.2/130.2 kB\u001b[0m \u001b[31m16.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m11.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Building wheel for pypika (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", "tensorflow-probability 0.22.0 requires typing-extensions<4.6.0, but you have typing-extensions 4.9.0 which is incompatible.\u001b[0m\u001b[31m\n", "\u001b[0m" ] } ], "source": [ "# %pip (not !pip) installs into the environment of the running kernel; versions stay pinned.\n", "%pip install -q llama-index==0.9.21 openai==1.6.0 cohere==4.39 tiktoken==0.5.2 chromadb==0.4.21 kaleido==0.2.1 python-multipart==0.0.6" ] }, { "cell_type": "code", "source": [ "import os\n", "from getpass import getpass\n", "\n", "# Never hardcode the key in the notebook: reuse the value already present in the\n", "# environment, otherwise prompt for it without echoing it into the cell output.\n", "if not os.environ.get(\"OPENAI_API_KEY\"):\n", "    os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key: \")" ], "metadata": { "id": "riuXwpSPcvWC" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Load the Dataset (CSV)" ], "metadata": { "id": "I9JbAzFcjkpn" } }, { "cell_type": "code", "source": [ "# -nc (no-clobber): skip the download when the file already exists, so a re-run\n", "# does not leave a stray mini-dataset.csv.1 next to the original.\n", "!wget -nc https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-dataset.csv" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wl_pbPvMlv1h", "outputId": "70f7f4be-7b80-431b-8570-f388eb21878f" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "--2023-12-26 19:25:41-- https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-dataset.csv\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 
185.199.111.133, 185.199.109.133, 185.199.108.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 23689 (23K) [text/plain]\n", "Saving to: ‘mini-dataset.csv’\n", "\n", "mini-dataset.csv 100%[===================>] 23.13K --.-KB/s in 0.007s \n", "\n", "2023-12-26 19:25:41 (3.10 MB/s) - ‘mini-dataset.csv’ saved [23689/23689]\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "from llama_index import download_loader\n", "\n", "# Fetch the CSV reader class from LlamaHub.\n", "SimpleCSVReader = download_loader(\"SimpleCSVReader\")\n", "\n", "# The file's curly quotes/apostrophes are Mac OS Roman bytes (0xD2, 0xD3, 0xD5).\n", "# Decoding them as ISO-8859-1 produces mojibake such as \"ChatGPTÕs\", which then\n", "# gets embedded and shown in the retrieved sources, so decode as mac_roman.\n", "loader = SimpleCSVReader(encoding=\"mac_roman\")\n", "documents = loader.load_data(file='./mini-dataset.csv')" ], "metadata": { "id": "0Q9sxuW0g3Gd" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Chunking" ], "metadata": { "id": "S17g2RYOjmf2" } }, { "cell_type": "code", "source": [ "from llama_index import ServiceContext\n", "from llama_index.embeddings.openai import OpenAIEmbedding\n", "\n", "# We use OpenAI's embedding model \"text-embedding-ada-002\"\n", "embed_model = OpenAIEmbedding()\n", "\n", "# Initialize the service context: 512-token chunks with a 64-token overlap,\n", "# embedded with the model created above.\n", "service_context = ServiceContext.from_defaults(\n", "    chunk_size=512, chunk_overlap=64, embed_model=embed_model\n", ")" ], "metadata": { "id": "YizvmXPejkJE" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "### Test chunking" ], "metadata": { "id": "ROMhNRvolTmI" } }, { "cell_type": "code", "source": [ "# Run the service context's node parser on its own to see how many chunks it yields.\n", "node_parser = service_context.node_parser\n", "\n", "nodes = node_parser.get_nodes_from_documents(documents)\n", "len(nodes)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Oe_ePZh7lVmQ", "outputId": "8f9a2250-2c8f-4f92-f6e6-037f3a18cdbb" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "13" ] }, "metadata": {}, "execution_count": 6 } ] }, { "cell_type": 
"markdown", "source": [ "# Save on Chroma" ], "metadata": { "id": "OWaT6rL7ksp8" } }, { "cell_type": "code", "source": [ "import chromadb\n", "\n", "# Create an on-disk client (use chromadb.EphemeralClient to keep everything in memory).\n", "chroma_client = chromadb.PersistentClient(path=\"./mini-dataset\")\n", "\n", "# get_or_create_collection keeps the cell re-runnable: create_collection raises\n", "# once the collection has been persisted by a previous run.\n", "chroma_collection = chroma_client.get_or_create_collection(\"mini-dataset\")" ], "metadata": { "id": "mXi56KTXk2sp" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from llama_index.vector_stores import ChromaVectorStore\n", "from llama_index.storage.storage_context import StorageContext\n", "\n", "# Wrap the Chroma collection so llama-index reads and writes its vectors there.\n", "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n", "storage_context = StorageContext.from_defaults(vector_store=vector_store)" ], "metadata": { "id": "jKXURvLtkuTS" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from llama_index import VectorStoreIndex\n", "\n", "# Embed and store the documents only while the persisted collection is empty;\n", "# on a re-run, reuse the vectors already on disk instead of inserting duplicates.\n", "if chroma_collection.count() == 0:\n", "    index = VectorStoreIndex.from_documents(\n", "        documents, storage_context=storage_context, service_context=service_context\n", "    )\n", "else:\n", "    index = VectorStoreIndex.from_vector_store(\n", "        vector_store, service_context=service_context\n", "    )" ], "metadata": { "id": "WsD52wtrlESi" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Query Dataset" ], "metadata": { "id": "8JPD8yAinVSq" } }, { "cell_type": "code", "source": [ "query_engine = index.as_query_engine()" ], "metadata": { "id": "mzS13x1ZlZ5X" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "response = query_engine.query(\"How many parameters LLaMA2 model has?\")" ], "metadata": { "id": "sb8f_wwPnZcG" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "print(f\"Answer: \\n\\t{response}\\n\\n\\nSources:\")\n", "\n", "# One entry per retrieved source node: 1-based rank, the node itself, its score.\n", "for rank, source in enumerate(response.source_nodes, start=1):\n", "    print(\">\", rank)\n", "    print(source.node)\n", "    print(source.score)\n", "    print(\"_-\" * 40)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "N3Ri8E5Dl4Ar", 
"outputId": "6de37908-c3b0-41c9-e74e-8ad03b53ae6c" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Answer: \n", "\tThe Llama 2 model is available in four different sizes: 7 billion, 13 billion, 34 billion, and 70 billion parameters.\n", "\n", "\n", "Sources:\n", "> 1\n", "Node ID: c8db296d-ad40-4f56-b67a-15d5d5807b36\n", "Text: Meta has once again pushed the boundaries of AI with the release\n", "of Llama 2, the highly anticipated successor to its groundbreaking\n", "Llama 1 language model. Boasting a range of cutting-edge features,\n", "Llama 2 has already disrupted the AI landscape and poses a real\n", "challenge to ChatGPTÕs dominance. In this article, we will dive into\n", "the exciting wo...\n", "0.7188979822197016\n", "_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-\n", "> 2\n", "Node ID: 2c1194e4-df31-474f-85a4-b19d16b4ece7\n", "Text: Source: Meta Llama 2 paper Finding the right balance between\n", "helpfulness and safety when optimizing a model poses significant\n", "challenges. While a highly helpful model may be capable of answering\n", "any question, including sensitive ones like ÒHow do I build a bomb?Ó,\n", "it also raises concerns about potential misuse. Thus, striking the\n", "perfect equilib...\n", "0.7130334174007259\n", "_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-\n" ] } ] }, { "cell_type": "code", "source": [ "print(response)" ], "metadata": { "id": "hjYiWAocnalt" }, "execution_count": null, "outputs": [] } ] }