AlaFalaki committed
Commit baae216 · 1 Parent(s): e59af6d

Created using Colab

Files changed (1)
  1. notebooks/Web_Search_API.ipynb +303 -18
notebooks/Web_Search_API.ipynb CHANGED
@@ -4,7 +4,7 @@
   "metadata": {
     "colab": {
       "provenance": [],
-      "authorship_tag": "ABX9TyM7DVBQbBv7iSjrA/U71HaV",
+      "authorship_tag": "ABX9TyNH2OsWaT8fcT3tgDhO3NQn",
       "include_colab_link": true
     },
     "kernelspec": {
 
@@ -28,26 +28,36 @@
   },
   {
     "cell_type": "code",
-    "execution_count": 2,
+    "execution_count": null,
     "metadata": {
       "colab": {
         "base_uri": "https://localhost:8080/"
       },
       "id": "JboB5VaCJUrb",
-      "outputId": "2433bc46-9d7f-476e-bfe9-0e4be5f4e51a"
+      "outputId": "b7221d06-8783-4586-f98a-72af45cae54f"
     },
     "outputs": [
       {
         "output_type": "stream",
         "name": "stdout",
         "text": [
-          "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.5/12.5 MB\u001b[0m \u001b[31m24.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-          "\u001b[?25h"
+          "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.1/211.1 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+          "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m81.3/81.3 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+          "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m97.6/97.6 kB\u001b[0m \u001b[31m10.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+          "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+          "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.4/7.4 MB\u001b[0m \u001b[31m24.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+          "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+          "  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+          "  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+          "  Building wheel for tinysegmenter (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+          "  Building wheel for feedfinder2 (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+          "  Building wheel for jieba3k (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+          "  Building wheel for sgmllib3k (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
         ]
       }
     ],
     "source": [
-      "!pip install -q llama-index==0.10.5 openai==1.12.0 tiktoken==0.6.0 llama-index-tools-google==0.1.3"
+      "!pip install -q llama-index==0.10.5 openai==1.12.0 tiktoken==0.6.0 llama-index-tools-google==0.1.3 newspaper3k==0.2.8"
     ]
   },
   {
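
The install line now pins newspaper3k==0.2.8 alongside the existing pins. Pins only help if they actually resolve; a small sanity check, purely illustrative and not part of the notebook, could look like this:

```python
# Illustrative only (not in the notebook): confirm the pinned versions resolved.
from importlib.metadata import version

pins = {
    "llama-index": "0.10.5",
    "openai": "1.12.0",
    "tiktoken": "0.6.0",
    "llama-index-tools-google": "0.1.3",
    "newspaper3k": "0.2.8",
}
for pkg, expected in pins.items():
    installed = version(pkg)
    assert installed == expected, f"{pkg}: expected {expected}, got {installed}"
```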
 
@@ -56,18 +66,29 @@
       "import os\n",
       "\n",
       "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
-      "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\""
+      "os.environ[\"OPENAI_API_KEY\"] = \"[OPENAI_API_KEY]\"\n",
+      "GOOGLE_SEARCH_KEY = \"[GOOGLE_SEARCH_KEY]\"\n",
+      "GOOGLE_SEARCH_ENGINE = \"[GOOGLE_SEARCH_ENGINE]\""
     ],
     "metadata": {
       "id": "1NKAn5scN_g9"
     },
-    "execution_count": 5,
+    "execution_count": null,
     "outputs": []
   },
   {
     "cell_type": "markdown",
     "source": [
-      "# Define Google Search Tool"
+      "# Using Agents/Tools"
+    ],
+    "metadata": {
+      "id": "ex1gQVHvITMI"
+    }
+  },
+  {
+    "cell_type": "markdown",
+    "source": [
+      "## Define Google Search Tool"
     ],
     "metadata": {
       "id": "0LMypoqUyuXq"
 
@@ -78,12 +99,12 @@
     "source": [
       "from llama_index.tools.google import GoogleSearchToolSpec\n",
       "\n",
-      "tool_spec = GoogleSearchToolSpec(key=\"[GOOGLE_API_KEY]\", engine=\"[GOOGLE_ENGINE_ID]\")"
+      "tool_spec = GoogleSearchToolSpec(key=GOOGLE_SEARCH_KEY, engine=GOOGLE_SEARCH_ENGINE)"
     ],
     "metadata": {
       "id": "4Q7sc69nJvWI"
     },
-    "execution_count": 54,
+    "execution_count": null,
     "outputs": []
   },
   {
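
This change also fixes a real bug: the old cell passed the literal strings "[GOOGLE_API_KEY]" and "[GOOGLE_ENGINE_ID]" as the credentials. To see what the spec contributes to an agent, tool specs expose to_tool_list(); a minimal sketch, assuming the llama-index 0.10.x BaseToolSpec API:

```python
# Minimal sketch: list the tools this spec provides to an agent.
for tool in tool_spec.to_tool_list():
    print(tool.metadata.name, "-", tool.metadata.description[:80])
```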
 
@@ -100,13 +121,13 @@
     "metadata": {
       "id": "VrbuIOaMeOIf"
     },
-    "execution_count": 69,
+    "execution_count": null,
     "outputs": []
   },
   {
     "cell_type": "markdown",
     "source": [
-      "# Create the Agent"
+      "## Create the Agent"
     ],
     "metadata": {
       "id": "T3ENpLyBy7UL"
 
@@ -122,7 +143,7 @@
     "metadata": {
       "id": "-_Ab47ppK8b2"
     },
-    "execution_count": 70,
+    "execution_count": null,
    "outputs": []
   },
   {
 
@@ -133,7 +154,7 @@
     "metadata": {
       "id": "YcUyz1-FlCQ8"
     },
-    "execution_count": 71,
+    "execution_count": null,
     "outputs": []
   },
   {
 
@@ -149,7 +170,7 @@
       "id": "w4wK5sY-lOOv",
       "outputId": "8090a106-6fac-4514-fdbd-c72a01b28169"
     },
-    "execution_count": 72,
+    "execution_count": null,
     "outputs": [
       {
         "output_type": "execute_result",
 
@@ -178,7 +199,7 @@
       "id": "TM_cvBA1nTJM",
       "outputId": "0bf3533a-c62d-4d0d-bd76-76c043477042"
     },
-    "execution_count": 73,
+    "execution_count": null,
     "outputs": [
       {
         "output_type": "execute_result",
 
@@ -193,11 +214,275 @@
       }
     ]
   },
+  {
+    "cell_type": "markdown",
+    "source": [
+      "# Using Tools w/ VectorStoreIndex"
+    ],
+    "metadata": {
+      "id": "who-NM4pIhPn"
+    }
+  },
+  {
+    "cell_type": "markdown",
+    "source": [
+      "A limitation of the current agent/tool in LlamaIndex is that it **relies solely on the page description from the retrieved pages** to answer questions. This approach will miss answers that are not visible in the page's description tag. To address this, a possible workaround is to fetch the page results, extract the page content using the newspaper3k library, and then create an index based on the downloaded content. Also, the previous method stacks all retrieved items from the search engine into a single document, making it **difficult to pinpoint the exact source** of the response. However, the following method will enable us to present the sources easily."
+    ],
+    "metadata": {
+      "id": "9g9cTM9GI-19"
+    }
+  },
+  {
+    "cell_type": "markdown",
+    "source": [
+      "## Define Google Search Tool"
+    ],
+    "metadata": {
+      "id": "31G_fxxJIsbC"
+    }
+  },
+  {
+    "cell_type": "code",
+    "source": [
+      "from llama_index.tools.google import GoogleSearchToolSpec\n",
+      "\n",
+      "tool_spec = GoogleSearchToolSpec(key=GOOGLE_SEARCH_KEY, engine=GOOGLE_SEARCH_ENGINE)"
+    ],
+    "metadata": {
+      "id": "lwRmj2odIHxt"
+    },
+    "execution_count": null,
+    "outputs": []
+  },
+  {
+    "cell_type": "code",
+    "source": [
+      "search_results = tool_spec.google_search(\"LLaMA2 model details\")"
+    ],
+    "metadata": {
+      "id": "UVIxdj04Bsf2"
+    },
+    "execution_count": null,
+    "outputs": []
+  },
+  {
+    "cell_type": "code",
+    "source": [
+      "import json\n",
+      "\n",
+      "search_results = json.loads( search_results[0].text )"
+    ],
+    "metadata": {
+      "id": "AlYDNfg2BsdQ"
+    },
+    "execution_count": null,
+    "outputs": []
+  },
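
google_search returns the raw Google Custom Search response as JSON text inside the first returned document, hence the json.loads above. The downstream loop only needs the link and title of each result; a quick peek at the structure (sketch):

```python
# Inspect the parsed response (sketch). Custom Search results live under
# "items"; each item carries "title", "link", and usually a "snippet".
for item in search_results["items"]:
    print(item["title"])
    print(item["link"])
    print(item.get("snippet", ""), "\n")
```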
+  {
+    "cell_type": "markdown",
+    "source": [
+      "## Read Each URL Contents"
+    ],
+    "metadata": {
+      "id": "pHALd3uhIxtQ"
+    }
+  },
+  {
+    "cell_type": "code",
+    "source": [
+      "import newspaper\n",
+      "pages_content = []\n",
+      "\n",
+      "for item in search_results['items']:\n",
+      "\n",
+      "  try:\n",
+      "    article = newspaper.Article( item['link'] )\n",
+      "    article.download()\n",
+      "    article.parse()\n",
+      "    if len(article.text) > 0:\n",
+      "      pages_content.append({ \"url\": item['link'], \"text\": article.text, \"title\": item['title'] })\n",
+      "  except:\n",
+      "    continue\n",
+      "\n",
+      "print(len(pages_content))"
+    ],
+    "metadata": {
+      "colab": {
+        "base_uri": "https://localhost:8080/"
+      },
+      "id": "jXz3JFduBsaq",
+      "outputId": "1b795423-26a6-4a61-a878-cca5e27dd5d1"
+    },
+    "execution_count": null,
+    "outputs": [
+      {
+        "output_type": "stream",
+        "name": "stdout",
+        "text": [
+          "8\n"
+        ]
+      }
+    ]
+  },
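
The committed loop silences every failure with a bare except:, which also swallows KeyboardInterrupt. A slightly more defensive variant of the same loop, sketched here, scopes the exception to newspaper's own error type and bounds each download (request_timeout and fetch_images are newspaper3k configuration options):

```python
# Defensive rewrite of the fetch loop (sketch; same result on success).
from newspaper import Article
from newspaper.article import ArticleException

pages_content = []
for item in search_results["items"]:
    try:
        article = Article(item["link"], request_timeout=10, fetch_images=False)
        article.download()
        article.parse()
    except ArticleException:
        continue  # skip pages that fail to download or parse
    if article.text:
        pages_content.append(
            {"url": item["link"], "text": article.text, "title": item["title"]}
        )

print(len(pages_content))
```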
+  {
+    "cell_type": "markdown",
+    "source": [
+      "## Create the Index"
+    ],
+    "metadata": {
+      "id": "iqxa_qRVI3G0"
+    }
+  },
+  {
+    "cell_type": "code",
+    "source": [
+      "from llama_index.core import Document\n",
+      "\n",
+      "# Convert the texts to Document objects so the LlamaIndex framework can process them.\n",
+      "documents = [Document(text=row[\"text\"], metadata={\"title\": row[\"title\"], \"url\": row[\"url\"]}) for row in pages_content]"
+    ],
+    "metadata": {
+      "id": "O4PkK8DuBsZT"
+    },
+    "execution_count": null,
+    "outputs": []
+  },
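
Each page becomes a single Document whose title/url metadata survives chunking, which is what makes the source listing at the end of the notebook possible. If the URL string shouldn't leak into the embedded text, LlamaIndex Documents can exclude metadata keys per consumer; a small sketch:

```python
# Optional (sketch): keep "url" out of the embedded/LLM-visible text while
# still carrying it on the retrieved nodes for citation.
for doc in documents:
    doc.excluded_embed_metadata_keys = ["url"]
    doc.excluded_llm_metadata_keys = ["url"]
```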
+  {
+    "cell_type": "code",
+    "source": [
+      "from llama_index.core import VectorStoreIndex\n",
+      "from llama_index.core.node_parser import SentenceSplitter\n",
+      "\n",
+      "# Build index / generate embeddings using OpenAI.\n",
+      "index = VectorStoreIndex.from_documents(\n",
+      "    documents,\n",
+      "    transformations=[SentenceSplitter(chunk_size=512, chunk_overlap=64)],\n",
+      ")"
+    ],
+    "metadata": {
+      "id": "2RtMBWpgBsWX"
+    },
+    "execution_count": null,
+    "outputs": []
+  },
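
from_documents embeds every chunk through the OpenAI API on each run. Persisting the index avoids re-embedding when the notebook restarts; a sketch using the default local storage backend:

```python
# Save and reload the index (sketch; default local storage backend).
from llama_index.core import StorageContext, load_index_from_storage

index.storage_context.persist(persist_dir="./search_index")

storage_context = StorageContext.from_defaults(persist_dir="./search_index")
index = load_index_from_storage(storage_context)
```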
+  {
+    "cell_type": "code",
+    "source": [
+      "# Define a query engine that is responsible for retrieving related pieces of text,\n",
+      "# and using a LLM to formulate the final answer.\n",
+      "query_engine = index.as_query_engine()"
+    ],
+    "metadata": {
+      "id": "xV_ibEZ_BsM4"
+    },
+    "execution_count": null,
+    "outputs": []
+  },
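
as_query_engine() with no arguments uses the defaults (top-2 chunks per query). When an answer is spread across several of the fetched pages, widening retrieval can help; a sketch of the common knobs:

```python
# Sketch: widen retrieval and control how chunks are packed into LLM calls.
query_engine = index.as_query_engine(
    similarity_top_k=4,       # retrieve four chunks instead of the default two
    response_mode="compact",  # pack retrieved chunks densely per LLM call
)
```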
+  {
+    "cell_type": "markdown",
+    "source": [
+      "## Query"
+    ],
+    "metadata": {
+      "id": "nziwu27MI6ih"
+    }
+  },
+  {
+    "cell_type": "code",
+    "source": [
+      "response = query_engine.query(\n",
+      "    \"How many parameters LLaMA2 model has?\"\n",
+      ")\n",
+      "print(response)"
+    ],
+    "metadata": {
+      "colab": {
+        "base_uri": "https://localhost:8080/"
+      },
+      "id": "5K1h2_t-HNPe",
+      "outputId": "58ce5d66-eddc-43fe-e7c8-d78bc0cb8c32"
+    },
+    "execution_count": null,
+    "outputs": [
+      {
+        "output_type": "stream",
+        "name": "stdout",
+        "text": [
+          "LLaMA2 model has sizes ranging from 7 to 70 billion parameters.\n"
+        ]
+      }
+    ]
+  },
+  {
+    "cell_type": "code",
+    "source": [
+      "response = query_engine.query(\n",
+      "    \"How many parameters LLaMA2 model has? list exact sizes.\"\n",
+      ")\n",
+      "print(response)"
+    ],
+    "metadata": {
+      "colab": {
+        "base_uri": "https://localhost:8080/"
+      },
+      "id": "Xea7ZeidH27i",
+      "outputId": "d455c379-9c91-4c9e-e9c1-6bd2deb7342e"
+    },
+    "execution_count": null,
+    "outputs": [
+      {
+        "output_type": "stream",
+        "name": "stdout",
+        "text": [
+          "The LLaMA2 model comes in several sizes with different numbers of parameters:\n",
+          "- LLaMA2 7B\n",
+          "- LLaMA2 13B\n",
+          "- LLaMA2 33B\n",
+          "- LLaMA2 65B\n"
+        ]
+      }
+    ]
+  },
+  {
+    "cell_type": "code",
+    "source": [
+      "# Show the retrieved nodes\n",
+      "for src in response.source_nodes:\n",
+      "    print(\"Title\\t\", src.metadata['title'])\n",
+      "    print(\"Source\\t\", src.metadata['url'])\n",
+      "    print(\"Score\\t\", src.score)\n",
+      "    print(\"-_\"*20)"
+    ],
+    "metadata": {
+      "colab": {
+        "base_uri": "https://localhost:8080/"
+      },
+      "id": "4QpGPD5nHORP",
+      "outputId": "8f9fc185-7745-4357-8471-25d34726cdd8"
+    },
+    "execution_count": null,
+    "outputs": [
+      {
+        "output_type": "stream",
+        "name": "stdout",
+        "text": [
+          "Title\t Introducing LLaMA: A foundational, 65-billion-parameter language ...\n",
+          "Source\t https://ai.meta.com/blog/large-language-model-llama-meta-ai/\n",
+          "Score\t 0.8124383491026671\n",
+          "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
+          "Title\t Llama 2 follow-up: too much RLHF, GPU sizing, technical details\n",
+          "Source\t https://www.interconnects.ai/p/llama-2-part-2\n",
+          "Score\t 0.8046542892214631\n",
+          "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n"
+        ]
+      }
+    ]
+  },
   {
     "cell_type": "code",
     "source": [],
     "metadata": {
-      "id": "SPUgKiKpygLn"
+      "id": "B5b4nZ-qHpdP"
     },
     "execution_count": null,
     "outputs": []