AlaFalaki committed on
Commit
fcc14cf
Β·
1 Parent(s): 56b8408

Created using Colaboratory

Browse files
notebooks/04-RAG_with_VectorStore.ipynb CHANGED
@@ -4,7 +4,7 @@
4
  "metadata": {
5
  "colab": {
6
  "provenance": [],
7
- "authorship_tag": "ABX9TyMKmMCxgNdqmZNkB0r6NNkp",
8
  "include_colab_link": true
9
  },
10
  "kernelspec": {
@@ -26,9 +26,18 @@
26
  "<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/04-RAG_with_VectorStore.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
27
  ]
28
  },
 
 
 
 
 
 
 
 
 
29
  {
30
  "cell_type": "code",
31
- "execution_count": 1,
32
  "metadata": {
33
  "colab": {
34
  "base_uri": "https://localhost:8080/"
@@ -93,12 +102,13 @@
93
  "source": [
94
  "import os\n",
95
  "\n",
 
96
  "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
97
  ],
98
  "metadata": {
99
  "id": "riuXwpSPcvWC"
100
  },
101
- "execution_count": 2,
102
  "outputs": []
103
  },
104
  {
@@ -113,7 +123,16 @@
113
  {
114
  "cell_type": "markdown",
115
  "source": [
116
- "Read the dataset as a long string."
 
 
 
 
 
 
 
 
 
117
  ],
118
  "metadata": {
119
  "id": "4fQaa1LN1mXL"
@@ -130,27 +149,30 @@
130
  "execution_count": null,
131
  "outputs": []
132
  },
 
 
 
 
 
 
 
 
 
133
  {
134
  "cell_type": "code",
135
  "source": [
136
  "import csv\n",
137
  "\n",
138
  "text = \"\"\n",
 
 
139
  "with open(\"./mini-dataset.csv\", mode=\"r\", encoding=\"ISO-8859-1\") as file:\n",
140
  " csv_reader = csv.reader(file)\n",
141
  "\n",
142
  " for row in csv_reader:\n",
143
- " text += row[0]"
144
- ],
145
- "metadata": {
146
- "id": "0Q9sxuW0g3Gd"
147
- },
148
- "execution_count": 3,
149
- "outputs": []
150
- },
151
- {
152
- "cell_type": "code",
153
- "source": [
154
  "len( text )"
155
  ],
156
  "metadata": {
@@ -160,7 +182,7 @@
160
  "id": "7CYwRT6R0o0I",
161
  "outputId": "6f0f05ae-c92f-45b2-bbc3-d12add118021"
162
  },
163
- "execution_count": 4,
164
  "outputs": [
165
  {
166
  "output_type": "execute_result",
@@ -188,18 +210,11 @@
188
  "source": [
189
  "chunk_size = 512\n",
190
  "chunks = []\n",
 
 
191
  "for i in range(0, len(text), chunk_size):\n",
192
- " chunks.append(text[i:i + chunk_size])"
193
- ],
194
- "metadata": {
195
- "id": "IU7zLFi01pjD"
196
- },
197
- "execution_count": 5,
198
- "outputs": []
199
- },
200
- {
201
- "cell_type": "code",
202
- "source": [
203
  "len( chunks )"
204
  ],
205
  "metadata": {
@@ -209,7 +224,7 @@
209
  "id": "STACTMUR1z9N",
210
  "outputId": "8ce58d6b-a38d-48e3-8316-7435907488cf"
211
  },
212
- "execution_count": 6,
213
  "outputs": [
214
  {
215
  "output_type": "execute_result",
@@ -228,12 +243,13 @@
228
  "source": [
229
  "from llama_index import Document\n",
230
  "\n",
 
231
  "documents = [Document(text=t) for t in chunks]"
232
  ],
233
  "metadata": {
234
  "id": "CtdsIUQ81_hT"
235
  },
236
- "execution_count": 7,
237
  "outputs": []
238
  },
239
  {
@@ -251,14 +267,14 @@
251
  "import chromadb\n",
252
  "\n",
253
  "# create client and a new collection\n",
254
- "# chromadb.EphemeralClient to save in-memory.\n",
255
  "chroma_client = chromadb.PersistentClient(path=\"./mini-chunked-dataset\")\n",
256
  "chroma_collection = chroma_client.create_collection(\"mini-chunked-dataset\")"
257
  ],
258
  "metadata": {
259
  "id": "mXi56KTXk2sp"
260
  },
261
- "execution_count": 8,
262
  "outputs": []
263
  },
264
  {
@@ -267,13 +283,14 @@
267
  "from llama_index.vector_stores import ChromaVectorStore\n",
268
  "from llama_index.storage.storage_context import StorageContext\n",
269
  "\n",
 
270
  "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
271
  "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
272
  ],
273
  "metadata": {
274
  "id": "jKXURvLtkuTS"
275
  },
276
- "execution_count": 9,
277
  "outputs": []
278
  },
279
  {
@@ -281,6 +298,7 @@
281
  "source": [
282
  "from llama_index import VectorStoreIndex\n",
283
  "\n",
 
284
  "index = VectorStoreIndex.from_documents(\n",
285
  " documents, storage_context=storage_context\n",
286
  ")"
@@ -288,7 +306,7 @@
288
  "metadata": {
289
  "id": "WsD52wtrlESi"
290
  },
291
- "execution_count": 11,
292
  "outputs": []
293
  },
294
  {
@@ -303,12 +321,14 @@
303
  {
304
  "cell_type": "code",
305
  "source": [
 
 
306
  "query_engine = index.as_query_engine()"
307
  ],
308
  "metadata": {
309
  "id": "mzS13x1ZlZ5X"
310
  },
311
- "execution_count": 12,
312
  "outputs": []
313
  },
314
  {
@@ -326,7 +346,7 @@
326
  "id": "AYsQ4uLN_Oxg",
327
  "outputId": "bf2181ad-27f6-40a2-b792-8a2714a60c29"
328
  },
329
- "execution_count": 13,
330
  "outputs": [
331
  {
332
  "output_type": "stream",
@@ -336,15 +356,6 @@
336
  ]
337
  }
338
  ]
339
- },
340
- {
341
- "cell_type": "code",
342
- "source": [],
343
- "metadata": {
344
- "id": "hjYiWAocnalt"
345
- },
346
- "execution_count": null,
347
- "outputs": []
348
  }
349
  ]
350
  }
 
4
  "metadata": {
5
  "colab": {
6
  "provenance": [],
7
+ "authorship_tag": "ABX9TyNQkVEh0x7hcM9U+6JSEkSG",
8
  "include_colab_link": true
9
  },
10
  "kernelspec": {
 
26
  "<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/04-RAG_with_VectorStore.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
27
  ]
28
  },
29
+ {
30
+ "cell_type": "markdown",
31
+ "source": [
32
+ "# Install Packages and Setup Variables"
33
+ ],
34
+ "metadata": {
35
+ "id": "5BGJ3fxhOk2V"
36
+ }
37
+ },
38
  {
39
  "cell_type": "code",
40
+ "execution_count": null,
41
  "metadata": {
42
  "colab": {
43
  "base_uri": "https://localhost:8080/"
 
102
  "source": [
103
  "import os\n",
104
  "\n",
105
+ "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
106
  "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
107
  ],
108
  "metadata": {
109
  "id": "riuXwpSPcvWC"
110
  },
111
+ "execution_count": null,
112
  "outputs": []
113
  },
114
  {
 
123
  {
124
  "cell_type": "markdown",
125
  "source": [
126
+ "## Download"
127
+ ],
128
+ "metadata": {
129
+ "id": "_Tif8-JoRH68"
130
+ }
131
+ },
132
+ {
133
+ "cell_type": "markdown",
134
+ "source": [
135
+ "The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. Read the dataset as a long string."
136
  ],
137
  "metadata": {
138
  "id": "4fQaa1LN1mXL"
 
149
  "execution_count": null,
150
  "outputs": []
151
  },
152
+ {
153
+ "cell_type": "markdown",
154
+ "source": [
155
+ "## Read File"
156
+ ],
157
+ "metadata": {
158
+ "id": "zk-4alIxROo8"
159
+ }
160
+ },
161
  {
162
  "cell_type": "code",
163
  "source": [
164
  "import csv\n",
165
  "\n",
166
  "text = \"\"\n",
167
+ "\n",
168
+ "# Load the file as a CSV\n",
169
  "with open(\"./mini-dataset.csv\", mode=\"r\", encoding=\"ISO-8859-1\") as file:\n",
170
  " csv_reader = csv.reader(file)\n",
171
  "\n",
172
  " for row in csv_reader:\n",
173
+ " text += row[0]\n",
174
+ "\n",
175
+ "# The number of characters in the dataset.\n",
 
 
 
 
 
 
 
 
176
  "len( text )"
177
  ],
178
  "metadata": {
 
182
  "id": "7CYwRT6R0o0I",
183
  "outputId": "6f0f05ae-c92f-45b2-bbc3-d12add118021"
184
  },
185
+ "execution_count": null,
186
  "outputs": [
187
  {
188
  "output_type": "execute_result",
 
210
  "source": [
211
  "chunk_size = 512\n",
212
  "chunks = []\n",
213
+ "\n",
214
+ "# Split the long text into smaller manageable chunks of 512 characters.\n",
215
  "for i in range(0, len(text), chunk_size):\n",
216
+ " chunks.append(text[i:i + chunk_size])\n",
217
+ "\n",
 
 
 
 
 
 
 
 
 
218
  "len( chunks )"
219
  ],
220
  "metadata": {
 
224
  "id": "STACTMUR1z9N",
225
  "outputId": "8ce58d6b-a38d-48e3-8316-7435907488cf"
226
  },
227
+ "execution_count": null,
228
  "outputs": [
229
  {
230
  "output_type": "execute_result",
 
243
  "source": [
244
  "from llama_index import Document\n",
245
  "\n",
246
+ "# Convert the chunks to Document objects so the LlamaIndex framework can process them.\n",
247
  "documents = [Document(text=t) for t in chunks]"
248
  ],
249
  "metadata": {
250
  "id": "CtdsIUQ81_hT"
251
  },
252
+ "execution_count": null,
253
  "outputs": []
254
  },
255
  {
 
267
  "import chromadb\n",
268
  "\n",
269
  "# create client and a new collection\n",
270
+ "# chromadb.EphemeralClient saves data in-memory.\n",
271
  "chroma_client = chromadb.PersistentClient(path=\"./mini-chunked-dataset\")\n",
272
  "chroma_collection = chroma_client.create_collection(\"mini-chunked-dataset\")"
273
  ],
274
  "metadata": {
275
  "id": "mXi56KTXk2sp"
276
  },
277
+ "execution_count": null,
278
  "outputs": []
279
  },
280
  {
 
283
  "from llama_index.vector_stores import ChromaVectorStore\n",
284
  "from llama_index.storage.storage_context import StorageContext\n",
285
  "\n",
286
+ "# Define a storage context object using the created vector database.\n",
287
  "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
288
  "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
289
  ],
290
  "metadata": {
291
  "id": "jKXURvLtkuTS"
292
  },
293
+ "execution_count": null,
294
  "outputs": []
295
  },
296
  {
 
298
  "source": [
299
  "from llama_index import VectorStoreIndex\n",
300
  "\n",
301
+ "# Add the documents to the database and create Index / embeddings\n",
302
  "index = VectorStoreIndex.from_documents(\n",
303
  " documents, storage_context=storage_context\n",
304
  ")"
 
306
  "metadata": {
307
  "id": "WsD52wtrlESi"
308
  },
309
+ "execution_count": null,
310
  "outputs": []
311
  },
312
  {
 
321
  {
322
  "cell_type": "code",
323
  "source": [
324
+ "# Define a query engine that is responsible for retrieving related pieces of text,\n",
325
+ "# and using an LLM to formulate the final answer.\n",
326
  "query_engine = index.as_query_engine()"
327
  ],
328
  "metadata": {
329
  "id": "mzS13x1ZlZ5X"
330
  },
331
+ "execution_count": null,
332
  "outputs": []
333
  },
334
  {
 
346
  "id": "AYsQ4uLN_Oxg",
347
  "outputId": "bf2181ad-27f6-40a2-b792-8a2714a60c29"
348
  },
349
+ "execution_count": null,
350
  "outputs": [
351
  {
352
  "output_type": "stream",
 
356
  ]
357
  }
358
  ]
 
 
 
 
 
 
 
 
 
359
  }
360
  ]
361
  }