{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "view-in-github"
},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "v9bpz99INAc1"
},
"source": [
"# Install Packages and Setup Variables"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "BeuFJKlj9jKz",
"outputId": "4c3a9772-cb7d-4fc1-d0e4-64186861e3e5"
},
"outputs": [],
"source": [
"!pip install -q llama-index==0.10.5 openai==1.12.0 cohere==4.47 tiktoken==0.6.0"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "XuzgSNqcABpV"
},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
"os.environ[\"OPENAI_API_KEY\"] = \"\""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "f5eV5EnvNCMM"
},
"source": [
"# Load Dataset"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "q-7mRQ-mNJlm"
},
"source": [
"## Download"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "3PsdOdMUNmEi"
},
"source": [
"The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3ImRCP7pACaI",
"outputId": "9a63bdea-54f7-4923-ccbb-cab03b312774"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 25361 100 25361 0 0 195k 0 --:--:-- --:--:-- --:--:-- 196k\n"
]
}
],
"source": [
"!curl -o ./mini-dataset.json https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-dataset.json"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bZZLK_wyEc-L"
},
"source": [
"## Read File"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "miUqycqAEfr7",
"outputId": "10005d5f-15c0-4565-a58a-6cb7e466acb4"
},
"outputs": [
{
"data": {
"text/plain": [
"22"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import json\n",
"\n",
"# Load the file as a JSON\n",
"with open('./mini-dataset.json', 'r') as file:\n",
" data = json.load(file)\n",
"\n",
"# The number of chunks in the dataset.\n",
"len( data['chunks'] )"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"id": "Mq5WKj0QEfpk"
},
"outputs": [],
"source": [
"# Flatten the JSON variable to a list of texts.\n",
"texts = [item['text'] for item in data['chunks']]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "f86yksB9K571"
},
"source": [
"# Generate Embedding"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"id": "iXrr5-tnEfm9"
},
"outputs": [],
"source": [
"from llama_index.core import Document\n",
"\n",
"# Convert the texts to Document objects so the LlamaIndex framework can process them.\n",
"documents = [Document(text=t) for t in texts]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"id": "qQit27lBEfkV"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/louis/Documents/GitHub/ai-tutor-rag-system/.conda/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"Parsing nodes: 100%|██████████| 22/22 [00:00<00:00, 1552.92it/s]\n",
"Generating embeddings: 100%|██████████| 22/22 [00:00<00:00, 43.01it/s]\n"
]
}
],
"source": [
"from llama_index.core import VectorStoreIndex\n",
"\n",
"# Build index / generate embeddings using OpenAI.\n",
"index = VectorStoreIndex.from_documents(documents, show_progress=True)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"id": "xxB0A9ZYM-OD"
},
"outputs": [],
"source": [
"# Save the generated embeddings.\n",
"# index.storage_context.persist(persist_dir=\"indexes\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "3DoUxd8KK--Q"
},
"source": [
"# Query Dataset"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"id": "bUaNH97dEfh9"
},
"outputs": [],
"source": [
"from llama_index.llms.openai import OpenAI\n",
"# Define a query engine that is responsible for retrieving related pieces of text,\n",
"# and using a LLM to formulate the final answer.\n",
"llm = OpenAI(temperature=0, model=\"gpt-3.5-turbo-0125\", max_tokens=512)\n",
"query_engine = index.as_query_engine(llm=llm)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "tEgFx_aeFS5e",
"outputId": "9133bd0c-f0c5-4124-9c4b-ab6c4c32b07a"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The Llama 2 model comes in four different sizes: 7 billion, 13 billion, 34 billion, and 70 billion parameters.\n"
]
}
],
"source": [
"response = query_engine.query(\n",
" \"How many parameters LLaMA2 model has?\"\n",
")\n",
"print(response)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"authorship_tag": "ABX9TyMcuy0u2XnwzWnARu0WjaRq",
"include_colab_link": true,
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 0
}