Created using Colaboratory
Browse files
notebooks/04-RAG_with_VectorStore.ipynb
CHANGED
@@ -4,7 +4,7 @@
|
|
4 |
"metadata": {
|
5 |
"colab": {
|
6 |
"provenance": [],
|
7 |
-
"authorship_tag": "
|
8 |
"include_colab_link": true
|
9 |
},
|
10 |
"kernelspec": {
|
@@ -26,9 +26,18 @@
|
|
26 |
"<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/04-RAG_with_VectorStore.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
27 |
]
|
28 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
{
|
30 |
"cell_type": "code",
|
31 |
-
"execution_count":
|
32 |
"metadata": {
|
33 |
"colab": {
|
34 |
"base_uri": "https://localhost:8080/"
|
@@ -93,12 +102,13 @@
|
|
93 |
"source": [
|
94 |
"import os\n",
|
95 |
"\n",
|
|
|
96 |
"os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
|
97 |
],
|
98 |
"metadata": {
|
99 |
"id": "riuXwpSPcvWC"
|
100 |
},
|
101 |
-
"execution_count":
|
102 |
"outputs": []
|
103 |
},
|
104 |
{
|
@@ -113,7 +123,16 @@
|
|
113 |
{
|
114 |
"cell_type": "markdown",
|
115 |
"source": [
|
116 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
],
|
118 |
"metadata": {
|
119 |
"id": "4fQaa1LN1mXL"
|
@@ -130,27 +149,30 @@
|
|
130 |
"execution_count": null,
|
131 |
"outputs": []
|
132 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
{
|
134 |
"cell_type": "code",
|
135 |
"source": [
|
136 |
"import csv\n",
|
137 |
"\n",
|
138 |
"text = \"\"\n",
|
|
|
|
|
139 |
"with open(\"./mini-dataset.csv\", mode=\"r\", encoding=\"ISO-8859-1\") as file:\n",
|
140 |
" csv_reader = csv.reader(file)\n",
|
141 |
"\n",
|
142 |
" for row in csv_reader:\n",
|
143 |
-
" text += row[0]"
|
144 |
-
|
145 |
-
|
146 |
-
"id": "0Q9sxuW0g3Gd"
|
147 |
-
},
|
148 |
-
"execution_count": 3,
|
149 |
-
"outputs": []
|
150 |
-
},
|
151 |
-
{
|
152 |
-
"cell_type": "code",
|
153 |
-
"source": [
|
154 |
"len( text )"
|
155 |
],
|
156 |
"metadata": {
|
@@ -160,7 +182,7 @@
|
|
160 |
"id": "7CYwRT6R0o0I",
|
161 |
"outputId": "6f0f05ae-c92f-45b2-bbc3-d12add118021"
|
162 |
},
|
163 |
-
"execution_count":
|
164 |
"outputs": [
|
165 |
{
|
166 |
"output_type": "execute_result",
|
@@ -188,18 +210,11 @@
|
|
188 |
"source": [
|
189 |
"chunk_size = 512\n",
|
190 |
"chunks = []\n",
|
|
|
|
|
191 |
"for i in range(0, len(text), chunk_size):\n",
|
192 |
-
" chunks.append(text[i:i + chunk_size])"
|
193 |
-
|
194 |
-
"metadata": {
|
195 |
-
"id": "IU7zLFi01pjD"
|
196 |
-
},
|
197 |
-
"execution_count": 5,
|
198 |
-
"outputs": []
|
199 |
-
},
|
200 |
-
{
|
201 |
-
"cell_type": "code",
|
202 |
-
"source": [
|
203 |
"len( chunks )"
|
204 |
],
|
205 |
"metadata": {
|
@@ -209,7 +224,7 @@
|
|
209 |
"id": "STACTMUR1z9N",
|
210 |
"outputId": "8ce58d6b-a38d-48e3-8316-7435907488cf"
|
211 |
},
|
212 |
-
"execution_count":
|
213 |
"outputs": [
|
214 |
{
|
215 |
"output_type": "execute_result",
|
@@ -228,12 +243,13 @@
|
|
228 |
"source": [
|
229 |
"from llama_index import Document\n",
|
230 |
"\n",
|
|
|
231 |
"documents = [Document(text=t) for t in chunks]"
|
232 |
],
|
233 |
"metadata": {
|
234 |
"id": "CtdsIUQ81_hT"
|
235 |
},
|
236 |
-
"execution_count":
|
237 |
"outputs": []
|
238 |
},
|
239 |
{
|
@@ -251,14 +267,14 @@
|
|
251 |
"import chromadb\n",
|
252 |
"\n",
|
253 |
"# create client and a new collection\n",
|
254 |
-
"# chromadb.EphemeralClient
|
255 |
"chroma_client = chromadb.PersistentClient(path=\"./mini-chunked-dataset\")\n",
|
256 |
"chroma_collection = chroma_client.create_collection(\"mini-chunked-dataset\")"
|
257 |
],
|
258 |
"metadata": {
|
259 |
"id": "mXi56KTXk2sp"
|
260 |
},
|
261 |
-
"execution_count":
|
262 |
"outputs": []
|
263 |
},
|
264 |
{
|
@@ -267,13 +283,14 @@
|
|
267 |
"from llama_index.vector_stores import ChromaVectorStore\n",
|
268 |
"from llama_index.storage.storage_context import StorageContext\n",
|
269 |
"\n",
|
|
|
270 |
"vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
|
271 |
"storage_context = StorageContext.from_defaults(vector_store=vector_store)"
|
272 |
],
|
273 |
"metadata": {
|
274 |
"id": "jKXURvLtkuTS"
|
275 |
},
|
276 |
-
"execution_count":
|
277 |
"outputs": []
|
278 |
},
|
279 |
{
|
@@ -281,6 +298,7 @@
|
|
281 |
"source": [
|
282 |
"from llama_index import VectorStoreIndex\n",
|
283 |
"\n",
|
|
|
284 |
"index = VectorStoreIndex.from_documents(\n",
|
285 |
" documents, storage_context=storage_context\n",
|
286 |
")"
|
@@ -288,7 +306,7 @@
|
|
288 |
"metadata": {
|
289 |
"id": "WsD52wtrlESi"
|
290 |
},
|
291 |
-
"execution_count":
|
292 |
"outputs": []
|
293 |
},
|
294 |
{
|
@@ -303,12 +321,14 @@
|
|
303 |
{
|
304 |
"cell_type": "code",
|
305 |
"source": [
|
|
|
|
|
306 |
"query_engine = index.as_query_engine()"
|
307 |
],
|
308 |
"metadata": {
|
309 |
"id": "mzS13x1ZlZ5X"
|
310 |
},
|
311 |
-
"execution_count":
|
312 |
"outputs": []
|
313 |
},
|
314 |
{
|
@@ -326,7 +346,7 @@
|
|
326 |
"id": "AYsQ4uLN_Oxg",
|
327 |
"outputId": "bf2181ad-27f6-40a2-b792-8a2714a60c29"
|
328 |
},
|
329 |
-
"execution_count":
|
330 |
"outputs": [
|
331 |
{
|
332 |
"output_type": "stream",
|
@@ -336,15 +356,6 @@
|
|
336 |
]
|
337 |
}
|
338 |
]
|
339 |
-
},
|
340 |
-
{
|
341 |
-
"cell_type": "code",
|
342 |
-
"source": [],
|
343 |
-
"metadata": {
|
344 |
-
"id": "hjYiWAocnalt"
|
345 |
-
},
|
346 |
-
"execution_count": null,
|
347 |
-
"outputs": []
|
348 |
}
|
349 |
]
|
350 |
}
|
|
|
4 |
"metadata": {
|
5 |
"colab": {
|
6 |
"provenance": [],
|
7 |
+
"authorship_tag": "ABX9TyNQkVEh0x7hcM9U+6JSEkSG",
|
8 |
"include_colab_link": true
|
9 |
},
|
10 |
"kernelspec": {
|
|
|
26 |
"<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/04-RAG_with_VectorStore.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
27 |
]
|
28 |
},
|
29 |
+
{
|
30 |
+
"cell_type": "markdown",
|
31 |
+
"source": [
|
32 |
+
"# Install Packages and Setup Variables"
|
33 |
+
],
|
34 |
+
"metadata": {
|
35 |
+
"id": "5BGJ3fxhOk2V"
|
36 |
+
}
|
37 |
+
},
|
38 |
{
|
39 |
"cell_type": "code",
|
40 |
+
"execution_count": null,
|
41 |
"metadata": {
|
42 |
"colab": {
|
43 |
"base_uri": "https://localhost:8080/"
|
|
|
102 |
"source": [
|
103 |
"import os\n",
|
104 |
"\n",
|
105 |
+
"# Set the \"OPENAI_API_KEY\" environment variable. It will be used by the OpenAI client later.\n",
|
106 |
"os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
|
107 |
],
|
108 |
"metadata": {
|
109 |
"id": "riuXwpSPcvWC"
|
110 |
},
|
111 |
+
"execution_count": null,
|
112 |
"outputs": []
|
113 |
},
|
114 |
{
|
|
|
123 |
{
|
124 |
"cell_type": "markdown",
|
125 |
"source": [
|
126 |
+
"## Download"
|
127 |
+
],
|
128 |
+
"metadata": {
|
129 |
+
"id": "_Tif8-JoRH68"
|
130 |
+
}
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"cell_type": "markdown",
|
134 |
+
"source": [
|
135 |
+
"The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. Read the dataset as a long string."
|
136 |
],
|
137 |
"metadata": {
|
138 |
"id": "4fQaa1LN1mXL"
|
|
|
149 |
"execution_count": null,
|
150 |
"outputs": []
|
151 |
},
|
152 |
+
{
|
153 |
+
"cell_type": "markdown",
|
154 |
+
"source": [
|
155 |
+
"## Read File"
|
156 |
+
],
|
157 |
+
"metadata": {
|
158 |
+
"id": "zk-4alIxROo8"
|
159 |
+
}
|
160 |
+
},
|
161 |
{
|
162 |
"cell_type": "code",
|
163 |
"source": [
|
164 |
"import csv\n",
|
165 |
"\n",
|
166 |
"text = \"\"\n",
|
167 |
+
"\n",
|
168 |
+
"# Load the CSV file and join the first column of every row into one long string.\n",
|
169 |
"with open(\"./mini-dataset.csv\", mode=\"r\", encoding=\"ISO-8859-1\") as file:\n",
|
170 |
" csv_reader = csv.reader(file)\n",
|
171 |
"\n",
|
172 |
" for row in csv_reader:\n",
|
173 |
+
" text += row[0]\n",
|
174 |
+
"\n",
|
175 |
+
"# The number of characters in the dataset.\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
"len( text )"
|
177 |
],
|
178 |
"metadata": {
|
|
|
182 |
"id": "7CYwRT6R0o0I",
|
183 |
"outputId": "6f0f05ae-c92f-45b2-bbc3-d12add118021"
|
184 |
},
|
185 |
+
"execution_count": null,
|
186 |
"outputs": [
|
187 |
{
|
188 |
"output_type": "execute_result",
|
|
|
210 |
"source": [
|
211 |
"chunk_size = 512\n",
|
212 |
"chunks = []\n",
|
213 |
+
"\n",
|
214 |
+
"# Split the long text into smaller manageable chunks of 512 characters.\n",
|
215 |
"for i in range(0, len(text), chunk_size):\n",
|
216 |
+
" chunks.append(text[i:i + chunk_size])\n",
|
217 |
+
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
"len( chunks )"
|
219 |
],
|
220 |
"metadata": {
|
|
|
224 |
"id": "STACTMUR1z9N",
|
225 |
"outputId": "8ce58d6b-a38d-48e3-8316-7435907488cf"
|
226 |
},
|
227 |
+
"execution_count": null,
|
228 |
"outputs": [
|
229 |
{
|
230 |
"output_type": "execute_result",
|
|
|
243 |
"source": [
|
244 |
"from llama_index import Document\n",
|
245 |
"\n",
|
246 |
+
"# Convert the chunks to Document objects so the LlamaIndex framework can process them.\n",
|
247 |
"documents = [Document(text=t) for t in chunks]"
|
248 |
],
|
249 |
"metadata": {
|
250 |
"id": "CtdsIUQ81_hT"
|
251 |
},
|
252 |
+
"execution_count": null,
|
253 |
"outputs": []
|
254 |
},
|
255 |
{
|
|
|
267 |
"import chromadb\n",
|
268 |
"\n",
|
269 |
"# Create the Chroma client and a new collection.\n",
|
270 |
+
"# chromadb.EphemeralClient keeps data in memory only; PersistentClient (used here) saves it to disk at the given path.\n",
|
271 |
"chroma_client = chromadb.PersistentClient(path=\"./mini-chunked-dataset\")\n",
|
272 |
"chroma_collection = chroma_client.create_collection(\"mini-chunked-dataset\")"
|
273 |
],
|
274 |
"metadata": {
|
275 |
"id": "mXi56KTXk2sp"
|
276 |
},
|
277 |
+
"execution_count": null,
|
278 |
"outputs": []
|
279 |
},
|
280 |
{
|
|
|
283 |
"from llama_index.vector_stores import ChromaVectorStore\n",
|
284 |
"from llama_index.storage.storage_context import StorageContext\n",
|
285 |
"\n",
|
286 |
+
"# Define a storage context object using the created vector database.\n",
|
287 |
"vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
|
288 |
"storage_context = StorageContext.from_defaults(vector_store=vector_store)"
|
289 |
],
|
290 |
"metadata": {
|
291 |
"id": "jKXURvLtkuTS"
|
292 |
},
|
293 |
+
"execution_count": null,
|
294 |
"outputs": []
|
295 |
},
|
296 |
{
|
|
|
298 |
"source": [
|
299 |
"from llama_index import VectorStoreIndex\n",
|
300 |
"\n",
|
301 |
+
"# Add the documents to the vector database, creating their embeddings and the index.\n",
|
302 |
"index = VectorStoreIndex.from_documents(\n",
|
303 |
" documents, storage_context=storage_context\n",
|
304 |
")"
|
|
|
306 |
"metadata": {
|
307 |
"id": "WsD52wtrlESi"
|
308 |
},
|
309 |
+
"execution_count": null,
|
310 |
"outputs": []
|
311 |
},
|
312 |
{
|
|
|
321 |
{
|
322 |
"cell_type": "code",
|
323 |
"source": [
|
324 |
+
"# Define a query engine that is responsible for retrieving related pieces of text,\n",
|
325 |
+
"# and using an LLM to formulate the final answer.\n",
|
326 |
"query_engine = index.as_query_engine()"
|
327 |
],
|
328 |
"metadata": {
|
329 |
"id": "mzS13x1ZlZ5X"
|
330 |
},
|
331 |
+
"execution_count": null,
|
332 |
"outputs": []
|
333 |
},
|
334 |
{
|
|
|
346 |
"id": "AYsQ4uLN_Oxg",
|
347 |
"outputId": "bf2181ad-27f6-40a2-b792-8a2714a60c29"
|
348 |
},
|
349 |
+
"execution_count": null,
|
350 |
"outputs": [
|
351 |
{
|
352 |
"output_type": "stream",
|
|
|
356 |
]
|
357 |
}
|
358 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
359 |
}
|
360 |
]
|
361 |
}
|