Omar Solano committed on
Commit
dda976b
·
1 Parent(s): 567c34a

update llama-index

Browse files
Files changed (1) hide show
  1. notebooks/04-RAG_with_VectorStore.ipynb +319 -347
notebooks/04-RAG_with_VectorStore.ipynb CHANGED
@@ -1,361 +1,333 @@
1
  {
2
- "nbformat": 4,
3
- "nbformat_minor": 0,
4
- "metadata": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "colab": {
6
- "provenance": [],
7
- "authorship_tag": "ABX9TyNQkVEh0x7hcM9U+6JSEkSG",
8
- "include_colab_link": true
9
  },
10
- "kernelspec": {
11
- "name": "python3",
12
- "display_name": "Python 3"
13
- },
14
- "language_info": {
15
- "name": "python"
16
- }
17
  },
18
- "cells": [
19
- {
20
- "cell_type": "markdown",
21
- "metadata": {
22
- "id": "view-in-github",
23
- "colab_type": "text"
24
- },
25
- "source": [
26
- "<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/04-RAG_with_VectorStore.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
27
- ]
28
- },
29
- {
30
- "cell_type": "markdown",
31
- "source": [
32
- "# Install Packages and Setup Variables"
33
- ],
34
- "metadata": {
35
- "id": "5BGJ3fxhOk2V"
36
- }
37
- },
38
- {
39
- "cell_type": "code",
40
- "execution_count": null,
41
- "metadata": {
42
- "colab": {
43
- "base_uri": "https://localhost:8080/"
44
- },
45
- "id": "QPJzr-I9XQ7l",
46
- "outputId": "9949a0e5-8bf2-4ae7-9921-1f9dfbece9ae"
47
- },
48
- "outputs": [
49
- {
50
- "output_type": "stream",
51
- "name": "stdout",
52
- "text": [
53
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m51.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
54
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m225.4/225.4 kB\u001b[0m \u001b[31m20.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
55
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.7/51.7 kB\u001b[0m \u001b[31m5.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
56
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m67.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
57
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m508.6/508.6 kB\u001b[0m \u001b[31m42.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
58
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m79.9/79.9 MB\u001b[0m \u001b[31m11.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
59
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.7/45.7 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
60
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.0/143.0 kB\u001b[0m \u001b[31m12.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
61
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.9/75.9 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
62
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m70.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
63
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m63.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
64
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.1/92.1 kB\u001b[0m \u001b[31m10.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
65
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.3/60.3 kB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
66
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m63.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
67
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m62.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
68
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.9/57.9 kB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
69
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m105.6/105.6 kB\u001b[0m \u001b[31m10.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
70
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.3/67.3 kB\u001b[0m \u001b[31m7.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
71
- "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
72
- " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
73
- " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
74
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m698.9/698.9 kB\u001b[0m \u001b[31m45.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
75
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m69.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
76
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m72.6/72.6 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
77
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.0/67.0 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
78
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.9/76.9 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
79
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
80
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.8/143.8 kB\u001b[0m \u001b[31m14.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
81
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
82
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.8/50.8 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
83
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.4/341.4 kB\u001b[0m \u001b[31m23.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
84
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m58.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
85
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m69.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
86
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m130.2/130.2 kB\u001b[0m \u001b[31m14.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
87
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
88
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
89
- "\u001b[?25h Building wheel for pypika (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
90
- "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
91
- "tensorflow-probability 0.22.0 requires typing-extensions<4.6.0, but you have typing-extensions 4.9.0 which is incompatible.\u001b[0m\u001b[31m\n",
92
- "\u001b[0m"
93
- ]
94
- }
95
- ],
96
- "source": [
97
- "!pip install -q llama-index==0.9.21 openai==1.6.0 cohere==4.39 tiktoken==0.5.2 chromadb==0.4.21 kaleido==0.2.1 python-multipart==0.0.6"
98
- ]
99
- },
100
- {
101
- "cell_type": "code",
102
- "source": [
103
- "import os\n",
104
- "\n",
105
- "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
106
- "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
107
- ],
108
- "metadata": {
109
- "id": "riuXwpSPcvWC"
110
- },
111
- "execution_count": null,
112
- "outputs": []
113
- },
114
- {
115
- "cell_type": "markdown",
116
- "source": [
117
- "# Load the Dataset (CSV)"
118
- ],
119
- "metadata": {
120
- "id": "I9JbAzFcjkpn"
121
- }
122
- },
123
- {
124
- "cell_type": "markdown",
125
- "source": [
126
- "## Download"
127
- ],
128
- "metadata": {
129
- "id": "_Tif8-JoRH68"
130
- }
131
- },
132
- {
133
- "cell_type": "markdown",
134
- "source": [
135
- "The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. Read the dataset as a long string."
136
- ],
137
- "metadata": {
138
- "id": "4fQaa1LN1mXL"
139
- }
140
- },
141
- {
142
- "cell_type": "code",
143
- "source": [
144
- "!wget https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-dataset.csv"
145
- ],
146
- "metadata": {
147
- "id": "-QTUkdfJjY4N"
148
- },
149
- "execution_count": null,
150
- "outputs": []
151
- },
152
  {
153
- "cell_type": "markdown",
154
- "source": [
155
- "## Read File"
156
- ],
157
- "metadata": {
158
- "id": "zk-4alIxROo8"
159
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  },
 
 
 
 
161
  {
162
- "cell_type": "code",
163
- "source": [
164
- "import csv\n",
165
- "\n",
166
- "text = \"\"\n",
167
- "\n",
168
- "# Load the file as a JSON\n",
169
- "with open(\"./mini-dataset.csv\", mode=\"r\", encoding=\"ISO-8859-1\") as file:\n",
170
- " csv_reader = csv.reader(file)\n",
171
- "\n",
172
- " for row in csv_reader:\n",
173
- " text += row[0]\n",
174
- "\n",
175
- "# The number of characters in the dataset.\n",
176
- "len( text )"
177
- ],
178
- "metadata": {
179
- "colab": {
180
- "base_uri": "https://localhost:8080/"
181
- },
182
- "id": "7CYwRT6R0o0I",
183
- "outputId": "6f0f05ae-c92f-45b2-bbc3-d12add118021"
184
- },
185
- "execution_count": null,
186
- "outputs": [
187
- {
188
- "output_type": "execute_result",
189
- "data": {
190
- "text/plain": [
191
- "23632"
192
- ]
193
- },
194
- "metadata": {},
195
- "execution_count": 4
196
- }
197
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  },
 
 
 
 
199
  {
200
- "cell_type": "markdown",
201
- "source": [
202
- "# Chunking"
203
- ],
204
- "metadata": {
205
- "id": "S17g2RYOjmf2"
206
- }
207
- },
208
- {
209
- "cell_type": "code",
210
- "source": [
211
- "chunk_size = 512\n",
212
- "chunks = []\n",
213
- "\n",
214
- "# Split the long text into smaller manageable chunks of 512 characters.\n",
215
- "for i in range(0, len(text), chunk_size):\n",
216
- " chunks.append(text[i:i + chunk_size])\n",
217
- "\n",
218
- "len( chunks )"
219
- ],
220
- "metadata": {
221
- "colab": {
222
- "base_uri": "https://localhost:8080/"
223
- },
224
- "id": "STACTMUR1z9N",
225
- "outputId": "8ce58d6b-a38d-48e3-8316-7435907488cf"
226
- },
227
- "execution_count": null,
228
- "outputs": [
229
- {
230
- "output_type": "execute_result",
231
- "data": {
232
- "text/plain": [
233
- "47"
234
- ]
235
- },
236
- "metadata": {},
237
- "execution_count": 6
238
- }
239
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  },
 
 
 
 
241
  {
242
- "cell_type": "code",
243
- "source": [
244
- "from llama_index import Document\n",
245
- "\n",
246
- "# Convert the chunks to Document objects so the LlamaIndex framework can process them.\n",
247
- "documents = [Document(text=t) for t in chunks]"
248
- ],
249
- "metadata": {
250
- "id": "CtdsIUQ81_hT"
251
- },
252
- "execution_count": null,
253
- "outputs": []
254
- },
255
- {
256
- "cell_type": "markdown",
257
- "source": [
258
- "# Save on Chroma"
259
- ],
260
- "metadata": {
261
- "id": "OWaT6rL7ksp8"
262
- }
263
- },
264
- {
265
- "cell_type": "code",
266
- "source": [
267
- "import chromadb\n",
268
- "\n",
269
- "# create client and a new collection\n",
270
- "# chromadb.EphemeralClient saves data in-memory.\n",
271
- "chroma_client = chromadb.PersistentClient(path=\"./mini-chunked-dataset\")\n",
272
- "chroma_collection = chroma_client.create_collection(\"mini-chunked-dataset\")"
273
- ],
274
- "metadata": {
275
- "id": "mXi56KTXk2sp"
276
- },
277
- "execution_count": null,
278
- "outputs": []
279
- },
280
- {
281
- "cell_type": "code",
282
- "source": [
283
- "from llama_index.vector_stores import ChromaVectorStore\n",
284
- "from llama_index.storage.storage_context import StorageContext\n",
285
- "\n",
286
- "# Define a storage context object using the created vector database.\n",
287
- "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
288
- "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
289
- ],
290
- "metadata": {
291
- "id": "jKXURvLtkuTS"
292
- },
293
- "execution_count": null,
294
- "outputs": []
295
- },
296
- {
297
- "cell_type": "code",
298
- "source": [
299
- "from llama_index import VectorStoreIndex\n",
300
- "\n",
301
- "# Add the documents to the database and create Index / embeddings\n",
302
- "index = VectorStoreIndex.from_documents(\n",
303
- " documents, storage_context=storage_context\n",
304
- ")"
305
- ],
306
- "metadata": {
307
- "id": "WsD52wtrlESi"
308
- },
309
- "execution_count": null,
310
- "outputs": []
311
- },
312
- {
313
- "cell_type": "markdown",
314
- "source": [
315
- "# Query Dataset"
316
- ],
317
- "metadata": {
318
- "id": "8JPD8yAinVSq"
319
- }
320
- },
321
- {
322
- "cell_type": "code",
323
- "source": [
324
- "# Define a query engine that is responsible for retrieving related pieces of text,\n",
325
- "# and using a LLM to formulate the final answer.\n",
326
- "query_engine = index.as_query_engine()"
327
- ],
328
- "metadata": {
329
- "id": "mzS13x1ZlZ5X"
330
- },
331
- "execution_count": null,
332
- "outputs": []
333
- },
334
- {
335
- "cell_type": "code",
336
- "source": [
337
- "response = query_engine.query(\n",
338
- " \"How many parameters LLaMA2 model has?\"\n",
339
- ")\n",
340
- "print(response)"
341
- ],
342
- "metadata": {
343
- "colab": {
344
- "base_uri": "https://localhost:8080/"
345
- },
346
- "id": "AYsQ4uLN_Oxg",
347
- "outputId": "bf2181ad-27f6-40a2-b792-8a2714a60c29"
348
- },
349
- "execution_count": null,
350
- "outputs": [
351
- {
352
- "output_type": "stream",
353
- "name": "stdout",
354
- "text": [
355
- "The Llama-2 model has three different sizes: 7B, 13B, and 70B.\n"
356
- ]
357
- }
358
- ]
359
  }
360
- ]
361
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "colab_type": "text",
7
+ "id": "view-in-github"
8
+ },
9
+ "source": [
10
+ "<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/04-RAG_with_VectorStore.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "markdown",
15
+ "metadata": {
16
+ "id": "5BGJ3fxhOk2V"
17
+ },
18
+ "source": [
19
+ "# Install Packages and Setup Variables"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 1,
25
+ "metadata": {
26
  "colab": {
27
+ "base_uri": "https://localhost:8080/"
 
 
28
  },
29
+ "id": "QPJzr-I9XQ7l",
30
+ "outputId": "9949a0e5-8bf2-4ae7-9921-1f9dfbece9ae"
31
+ },
32
+ "outputs": [],
33
+ "source": [
34
+ "!pip install -q llama-index==0.10.5 llama-index-vector-stores-chroma==0.1.1 openai==1.12.0 tiktoken==0.6.0 chromadb==0.4.22 kaleido==0.2.1 python-multipart==0.0.9"
35
+ ]
36
  },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": 2,
40
+ "metadata": {
41
+ "id": "riuXwpSPcvWC"
42
+ },
43
+ "outputs": [],
44
+ "source": [
45
+ "import os\n",
46
+ "\n",
47
+ "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
48
+ "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "markdown",
53
+ "metadata": {
54
+ "id": "I9JbAzFcjkpn"
55
+ },
56
+ "source": [
57
+ "# Load the Dataset (CSV)"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "markdown",
62
+ "metadata": {
63
+ "id": "_Tif8-JoRH68"
64
+ },
65
+ "source": [
66
+ "## Download"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "markdown",
71
+ "metadata": {
72
+ "id": "4fQaa1LN1mXL"
73
+ },
74
+ "source": [
75
+ "The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. Read the dataset as a long string."
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 3,
81
+ "metadata": {
82
+ "id": "-QTUkdfJjY4N"
83
+ },
84
+ "outputs": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  {
86
+ "name": "stdout",
87
+ "output_type": "stream",
88
+ "text": [
89
+ " % Total % Received % Xferd Average Speed Time Time Time Current\n",
90
+ " Dload Upload Total Spent Left Speed\n",
91
+ "100 169k 100 169k 0 0 602k 0 --:--:-- --:--:-- --:--:-- 603k\n"
92
+ ]
93
+ }
94
+ ],
95
+ "source": [
96
+ "!curl -o ./mini-dataset.csv https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "markdown",
101
+ "metadata": {
102
+ "id": "zk-4alIxROo8"
103
+ },
104
+ "source": [
105
+ "## Read File"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": 4,
111
+ "metadata": {
112
+ "colab": {
113
+ "base_uri": "https://localhost:8080/"
114
  },
115
+ "id": "7CYwRT6R0o0I",
116
+ "outputId": "6f0f05ae-c92f-45b2-bbc3-d12add118021"
117
+ },
118
+ "outputs": [
119
  {
120
+ "data": {
121
+ "text/plain": [
122
+ "841"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  ]
124
+ },
125
+ "execution_count": 4,
126
+ "metadata": {},
127
+ "output_type": "execute_result"
128
+ }
129
+ ],
130
+ "source": [
131
+ "import csv\n",
132
+ "\n",
133
+ "text = \"\"\n",
134
+ "\n",
135
+ "# Read the CSV file row by row and concatenate the first column of each row\n",
136
+ "with open(\"./mini-dataset.csv\", mode=\"r\", encoding=\"ISO-8859-1\") as file:\n",
137
+ " csv_reader = csv.reader(file)\n",
138
+ "\n",
139
+ " for row in csv_reader:\n",
140
+ " text += row[0]\n",
141
+ "\n",
142
+ "# The number of characters in the dataset.\n",
143
+ "len( text )"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "markdown",
148
+ "metadata": {
149
+ "id": "S17g2RYOjmf2"
150
+ },
151
+ "source": [
152
+ "# Chunking"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": 5,
158
+ "metadata": {
159
+ "colab": {
160
+ "base_uri": "https://localhost:8080/"
161
  },
162
+ "id": "STACTMUR1z9N",
163
+ "outputId": "8ce58d6b-a38d-48e3-8316-7435907488cf"
164
+ },
165
+ "outputs": [
166
  {
167
+ "data": {
168
+ "text/plain": [
169
+ "2"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  ]
171
+ },
172
+ "execution_count": 5,
173
+ "metadata": {},
174
+ "output_type": "execute_result"
175
+ }
176
+ ],
177
+ "source": [
178
+ "chunk_size = 512\n",
179
+ "chunks = []\n",
180
+ "\n",
181
+ "# Split the long text into smaller manageable chunks of 512 characters.\n",
182
+ "for i in range(0, len(text), chunk_size):\n",
183
+ " chunks.append(text[i:i + chunk_size])\n",
184
+ "\n",
185
+ "len( chunks )"
186
+ ]
187
+ },
188
+ {
189
+ "cell_type": "code",
190
+ "execution_count": 6,
191
+ "metadata": {
192
+ "id": "CtdsIUQ81_hT"
193
+ },
194
+ "outputs": [],
195
+ "source": [
196
+ "from llama_index.core import Document\n",
197
+ "\n",
198
+ "# Convert the chunks to Document objects so the LlamaIndex framework can process them.\n",
199
+ "documents = [Document(text=t) for t in chunks]"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "markdown",
204
+ "metadata": {
205
+ "id": "OWaT6rL7ksp8"
206
+ },
207
+ "source": [
208
+ "# Save on Chroma"
209
+ ]
210
+ },
211
+ {
212
+ "cell_type": "code",
213
+ "execution_count": 7,
214
+ "metadata": {
215
+ "id": "mXi56KTXk2sp"
216
+ },
217
+ "outputs": [],
218
+ "source": [
219
+ "import chromadb\n",
220
+ "\n",
221
+ "# create client and a new collection\n",
222
+ "# chromadb.EphemeralClient saves data in-memory.\n",
223
+ "chroma_client = chromadb.PersistentClient(path=\"./mini-chunked-dataset\")\n",
224
+ "chroma_collection = chroma_client.create_collection(\"mini-chunked-dataset\")"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": 8,
230
+ "metadata": {
231
+ "id": "jKXURvLtkuTS"
232
+ },
233
+ "outputs": [],
234
+ "source": [
235
+ "from llama_index.vector_stores.chroma import ChromaVectorStore\n",
236
+ "from llama_index.core import StorageContext\n",
237
+ "\n",
238
+ "# Define a storage context object using the created vector database.\n",
239
+ "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
240
+ "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
241
+ ]
242
+ },
243
+ {
244
+ "cell_type": "code",
245
+ "execution_count": 9,
246
+ "metadata": {
247
+ "id": "WsD52wtrlESi"
248
+ },
249
+ "outputs": [],
250
+ "source": [
251
+ "from llama_index.core import VectorStoreIndex\n",
252
+ "\n",
253
+ "# Add the documents to the database and create Index / embeddings\n",
254
+ "index = VectorStoreIndex.from_documents(\n",
255
+ " documents, storage_context=storage_context\n",
256
+ ")"
257
+ ]
258
+ },
259
+ {
260
+ "cell_type": "markdown",
261
+ "metadata": {
262
+ "id": "8JPD8yAinVSq"
263
+ },
264
+ "source": [
265
+ "# Query Dataset"
266
+ ]
267
+ },
268
+ {
269
+ "cell_type": "code",
270
+ "execution_count": 10,
271
+ "metadata": {
272
+ "id": "mzS13x1ZlZ5X"
273
+ },
274
+ "outputs": [],
275
+ "source": [
276
+ "# Define a query engine that is responsible for retrieving related pieces of text,\n",
277
+ "# and using an LLM to formulate the final answer.\n",
278
+ "query_engine = index.as_query_engine()"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": 11,
284
+ "metadata": {
285
+ "colab": {
286
+ "base_uri": "https://localhost:8080/"
287
  },
288
+ "id": "AYsQ4uLN_Oxg",
289
+ "outputId": "bf2181ad-27f6-40a2-b792-8a2714a60c29"
290
+ },
291
+ "outputs": [
292
  {
293
+ "name": "stdout",
294
+ "output_type": "stream",
295
+ "text": [
296
+ "The LLaMA2 model has a certain number of parameters, but without any specific information provided in the context, it is not possible to determine the exact number of parameters.\n"
297
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
  }
299
+ ],
300
+ "source": [
301
+ "response = query_engine.query(\n",
302
+ " \"How many parameters LLaMA2 model has?\"\n",
303
+ ")\n",
304
+ "print(response)"
305
+ ]
306
+ }
307
+ ],
308
+ "metadata": {
309
+ "colab": {
310
+ "authorship_tag": "ABX9TyNQkVEh0x7hcM9U+6JSEkSG",
311
+ "include_colab_link": true,
312
+ "provenance": []
313
+ },
314
+ "kernelspec": {
315
+ "display_name": "Python 3",
316
+ "name": "python3"
317
+ },
318
+ "language_info": {
319
+ "codemirror_mode": {
320
+ "name": "ipython",
321
+ "version": 3
322
+ },
323
+ "file_extension": ".py",
324
+ "mimetype": "text/x-python",
325
+ "name": "python",
326
+ "nbconvert_exporter": "python",
327
+ "pygments_lexer": "ipython3",
328
+ "version": "3.11.7"
329
+ }
330
+ },
331
+ "nbformat": 4,
332
+ "nbformat_minor": 0
333
+ }