{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyNMDozeQs4SEdbsiziEASx2",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"
"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "QPJzr-I9XQ7l",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "971cf4cb-ee33-477b-cc7d-d652b55b81f3"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m41.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m225.4/225.4 kB\u001b[0m \u001b[31m16.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.7/51.7 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m45.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m508.6/508.6 kB\u001b[0m \u001b[31m32.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m79.9/79.9 MB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.7/45.7 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.0/143.0 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.9/75.9 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m53.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m53.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.1/92.1 kB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.3/60.3 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m57.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m66.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.9/57.9 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m105.6/105.6 kB\u001b[0m \u001b[31m11.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.3/67.3 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m698.9/698.9 kB\u001b[0m \u001b[31m54.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m82.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m72.6/72.6 kB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m68.9/68.9 kB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.9/76.9 kB\u001b[0m \u001b[31m9.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.8/143.8 kB\u001b[0m \u001b[31m17.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.8/50.8 kB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.4/341.4 kB\u001b[0m \u001b[31m37.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m87.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m76.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m130.2/130.2 kB\u001b[0m \u001b[31m16.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m11.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Building wheel for pypika (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"tensorflow-probability 0.22.0 requires typing-extensions<4.6.0, but you have typing-extensions 4.9.0 which is incompatible.\u001b[0m\u001b[31m\n",
"\u001b[0m"
]
}
],
"source": [
"!pip install -q llama-index==0.9.21 openai==1.6.0 cohere==4.39 tiktoken==0.5.2 chromadb==0.4.21 kaleido==0.2.1 python-multipart==0.0.6"
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"\n",
"os.environ[\"OPENAI_API_KEY\"] = \"\""
],
"metadata": {
"id": "riuXwpSPcvWC"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Load the Dataset (CSV)"
],
"metadata": {
"id": "I9JbAzFcjkpn"
}
},
{
"cell_type": "code",
"source": [
"!wget https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-dataset.csv"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "wl_pbPvMlv1h",
"outputId": "70f7f4be-7b80-431b-8570-f388eb21878f"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"--2023-12-26 19:25:41-- https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-dataset.csv\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 23689 (23K) [text/plain]\n",
"Saving to: ‘mini-dataset.csv’\n",
"\n",
"mini-dataset.csv 100%[===================>] 23.13K --.-KB/s in 0.007s \n",
"\n",
"2023-12-26 19:25:41 (3.10 MB/s) - ‘mini-dataset.csv’ saved [23689/23689]\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from llama_index import download_loader\n",
"\n",
"SimpleCSVReader = download_loader(\"SimpleCSVReader\")\n",
"\n",
"loader = SimpleCSVReader(encoding=\"ISO-8859-1\")\n",
"documents = loader.load_data(file='./mini-dataset.csv')"
],
"metadata": {
"id": "0Q9sxuW0g3Gd"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Chunking"
],
"metadata": {
"id": "S17g2RYOjmf2"
}
},
{
"cell_type": "code",
"source": [
"from llama_index import ServiceContext\n",
"from llama_index.embeddings.openai import OpenAIEmbedding\n",
"\n",
"# We use OpenAI's embedding model \"text-embedding-ada-002\"\n",
"embed_model = OpenAIEmbedding()\n",
"\n",
"# initialize service context (set chunk size)\n",
"service_context = ServiceContext.from_defaults(chunk_size=512, chunk_overlap=64, embed_model=embed_model)"
],
"metadata": {
"id": "YizvmXPejkJE"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Test chunking"
],
"metadata": {
"id": "ROMhNRvolTmI"
}
},
{
"cell_type": "code",
"source": [
"node_parser = service_context.node_parser\n",
"\n",
"nodes = node_parser.get_nodes_from_documents(documents)\n",
"len( nodes )"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Oe_ePZh7lVmQ",
"outputId": "8f9a2250-2c8f-4f92-f6e6-037f3a18cdbb"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"13"
]
},
"metadata": {},
"execution_count": 6
}
]
},
{
"cell_type": "markdown",
"source": [
"# Save on Chroma"
],
"metadata": {
"id": "OWaT6rL7ksp8"
}
},
{
"cell_type": "code",
"source": [
"import chromadb\n",
"\n",
"# create client and a new collection\n",
"# chromadb.EphemeralClient to save in-memory.\n",
"chroma_client = chromadb.PersistentClient(path=\"./mini-dataset\")\n",
"chroma_collection = chroma_client.create_collection(\"mini-dataset\")"
],
"metadata": {
"id": "mXi56KTXk2sp"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from llama_index.vector_stores import ChromaVectorStore\n",
"from llama_index.storage.storage_context import StorageContext\n",
"\n",
"vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
"storage_context = StorageContext.from_defaults(vector_store=vector_store)"
],
"metadata": {
"id": "jKXURvLtkuTS"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from llama_index import VectorStoreIndex\n",
"\n",
"index = VectorStoreIndex.from_documents(\n",
" documents, storage_context=storage_context, service_context=service_context\n",
")"
],
"metadata": {
"id": "WsD52wtrlESi"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Query Dataset"
],
"metadata": {
"id": "8JPD8yAinVSq"
}
},
{
"cell_type": "code",
"source": [
"query_engine = index.as_query_engine()"
],
"metadata": {
"id": "mzS13x1ZlZ5X"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"response = query_engine.query(\n",
" \"How many parameters LLaMA2 model has?\"\n",
")\n"
],
"metadata": {
"id": "sb8f_wwPnZcG"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(f\"Answer: \\n\\t{response}\\n\\n\\nSources:\")\n",
"\n",
"for idx, source in enumerate( response.source_nodes ):\n",
" print(\">\", idx+1)\n",
" print(source.node)\n",
" print(source.score)\n",
" print(\"_-\"*40)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "N3Ri8E5Dl4Ar",
"outputId": "6de37908-c3b0-41c9-e74e-8ad03b53ae6c"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Answer: \n",
"\tThe Llama 2 model is available in four different sizes: 7 billion, 13 billion, 34 billion, and 70 billion parameters.\n",
"\n",
"\n",
"Sources:\n",
"> 1\n",
"Node ID: c8db296d-ad40-4f56-b67a-15d5d5807b36\n",
"Text: Meta has once again pushed the boundaries of AI with the release\n",
"of Llama 2, the highly anticipated successor to its groundbreaking\n",
"Llama 1 language model. Boasting a range of cutting-edge features,\n",
"Llama 2 has already disrupted the AI landscape and poses a real\n",
"challenge to ChatGPTÕs dominance. In this article, we will dive into\n",
"the exciting wo...\n",
"0.7188979822197016\n",
"_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-\n",
"> 2\n",
"Node ID: 2c1194e4-df31-474f-85a4-b19d16b4ece7\n",
"Text: Source: Meta Llama 2 paper Finding the right balance between\n",
"helpfulness and safety when optimizing a model poses significant\n",
"challenges. While a highly helpful model may be capable of answering\n",
"any question, including sensitive ones like ÒHow do I build a bomb?Ó,\n",
"it also raises concerns about potential misuse. Thus, striking the\n",
"perfect equilib...\n",
"0.7130334174007259\n",
"_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"print(response)"
],
"metadata": {
"id": "hjYiWAocnalt"
},
"execution_count": null,
"outputs": []
}
]
}