AlaFalaki committed on
Commit
bacb279
·
1 Parent(s): 859a77d

Created using Colaboratory

Browse files
notebooks/09-Better_Embedding_Model.ipynb CHANGED
@@ -4,7 +4,7 @@
4
  "metadata": {
5
  "colab": {
6
  "provenance": [],
7
- "authorship_tag": "ABX9TyOuxoYC0mOLg3qm/0hreMFy",
8
  "include_colab_link": true
9
  },
10
  "kernelspec": {
@@ -725,7 +725,7 @@
725
  },
726
  {
727
  "cell_type": "code",
728
- "execution_count": 1,
729
  "metadata": {
730
  "id": "QPJzr-I9XQ7l",
731
  "colab": {
@@ -787,13 +787,13 @@
787
  "import os\n",
788
  "\n",
789
  "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
790
- "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\"\n",
791
- "os.environ[\"CO_API_KEY\"] = \"<YOUR_COHERE_KEY>\""
792
  ],
793
  "metadata": {
794
  "id": "riuXwpSPcvWC"
795
  },
796
- "execution_count": 2,
797
  "outputs": []
798
  },
799
  {
@@ -806,7 +806,7 @@
806
  "metadata": {
807
  "id": "jIEeZzqLbz0J"
808
  },
809
- "execution_count": 3,
810
  "outputs": []
811
  },
812
  {
@@ -828,7 +828,7 @@
828
  "metadata": {
829
  "id": "9oGT6crooSSj"
830
  },
831
- "execution_count": 4,
832
  "outputs": []
833
  },
834
  {
@@ -853,7 +853,7 @@
853
  "metadata": {
854
  "id": "SQP87lHczHKc"
855
  },
856
- "execution_count": 5,
857
  "outputs": []
858
  },
859
  {
@@ -867,7 +867,7 @@
867
  "metadata": {
868
  "id": "zAaGcYMJzHAN"
869
  },
870
- "execution_count": 6,
871
  "outputs": []
872
  },
873
  {
@@ -909,7 +909,7 @@
909
  "id": "wl_pbPvMlv1h",
910
  "outputId": "bc9a0415-a1fb-4e89-a2b4-165420106b34"
911
  },
912
- "execution_count": 7,
913
  "outputs": [
914
  {
915
  "output_type": "stream",
@@ -964,7 +964,7 @@
964
  },
965
  "outputId": "a8361aa6-522d-4def-e49b-ed08d9c8e7d1"
966
  },
967
- "execution_count": 8,
968
  "outputs": [
969
  {
970
  "output_type": "execute_result",
@@ -998,7 +998,7 @@
998
  "metadata": {
999
  "id": "YizvmXPejkJE"
1000
  },
1001
- "execution_count": 9,
1002
  "outputs": []
1003
  },
1004
  {
@@ -1015,6 +1015,8 @@
1015
  "source": [
1016
  "from llama_index.text_splitter import TokenTextSplitter\n",
1017
  "\n",
 
 
1018
  "text_splitter = TokenTextSplitter(\n",
1019
  " separator=\" \", chunk_size=512, chunk_overlap=128\n",
1020
  ")"
@@ -1022,7 +1024,7 @@
1022
  "metadata": {
1023
  "id": "9z3t70DGWsjO"
1024
  },
1025
- "execution_count": 10,
1026
  "outputs": []
1027
  },
1028
  {
@@ -1049,6 +1051,8 @@
1049
  "from llama_index.embeddings.cohereai import CohereEmbedding\n",
1050
  "from llama_index.ingestion import IngestionPipeline\n",
1051
  "\n",
 
 
1052
  "pipeline = IngestionPipeline(\n",
1053
  " transformations=[\n",
1054
  " text_splitter,\n",
@@ -1060,6 +1064,7 @@
1060
  " vector_store=vector_store\n",
1061
  ")\n",
1062
  "\n",
 
1063
  "nodes = pipeline.run(documents=documents, show_progress=True);"
1064
  ],
1065
  "metadata": {
@@ -1094,7 +1099,7 @@
1094
  "id": "P9LDJ7o-Wsc-",
1095
  "outputId": "cd49bff2-b0da-4722-8baa-6a07f1023b39"
1096
  },
1097
- "execution_count": 11,
1098
  "outputs": [
1099
  {
1100
  "output_type": "display_data",
@@ -1167,7 +1172,7 @@
1167
  "id": "mPGa85hM2P3P",
1168
  "outputId": "9d7811ba-1e10-4098-b6eb-77a4e7d37457"
1169
  },
1170
- "execution_count": 12,
1171
  "outputs": [
1172
  {
1173
  "output_type": "execute_result",
@@ -1193,7 +1198,7 @@
1193
  "id": "jjnmscmq2cXK",
1194
  "outputId": "5f6fa176-4e09-4cc7-bd17-8236b061ad17"
1195
  },
1196
- "execution_count": 13,
1197
  "outputs": [
1198
  {
1199
  "output_type": "execute_result",
@@ -1210,6 +1215,7 @@
1210
  {
1211
  "cell_type": "code",
1212
  "source": [
 
1213
  "!zip -r vectorstore_cohere.zip mini-llama-articles"
1214
  ],
1215
  "metadata": {
@@ -1219,7 +1225,7 @@
1219
  "id": "hV9G0lSUJJSa",
1220
  "outputId": "453a4ea3-dfda-4da1-ac29-929834c83b40"
1221
  },
1222
- "execution_count": 33,
1223
  "outputs": [
1224
  {
1225
  "output_type": "stream",
@@ -1245,10 +1251,19 @@
1245
  "id": "OWaT6rL7ksp8"
1246
  }
1247
  },
 
 
 
 
 
 
 
 
 
1248
  {
1249
  "cell_type": "code",
1250
  "source": [
1251
- "!unzip vectorstore_cohere.zip"
1252
  ],
1253
  "metadata": {
1254
  "id": "EF-wobGAJRgL"
@@ -1267,7 +1282,7 @@
1267
  "metadata": {
1268
  "id": "mXi56KTXk2sp"
1269
  },
1270
- "execution_count": 14,
1271
  "outputs": []
1272
  },
1273
  {
@@ -1275,11 +1290,14 @@
1275
  "source": [
1276
  "from llama_index import ServiceContext\n",
1277
  "\n",
 
1278
  "embed_model = CohereEmbedding(\n",
1279
  " model_name=\"embed-english-v3.0\",\n",
1280
  " input_type=\"search_query\",\n",
1281
  ")\n",
1282
  "\n",
 
 
1283
  "service_context = ServiceContext.from_defaults(\n",
1284
  " llm=llm, embed_model=embed_model\n",
1285
  ")"
@@ -1287,21 +1305,21 @@
1287
  "metadata": {
1288
  "id": "9l0PaY230syE"
1289
  },
1290
- "execution_count": 37,
1291
  "outputs": []
1292
  },
1293
  {
1294
  "cell_type": "code",
1295
  "source": [
1296
- "# Create your index\n",
1297
  "from llama_index import VectorStoreIndex\n",
1298
  "\n",
 
1299
  "index = VectorStoreIndex.from_vector_store(vector_store, service_context=service_context)"
1300
  ],
1301
  "metadata": {
1302
  "id": "jKXURvLtkuTS"
1303
  },
1304
- "execution_count": 38,
1305
  "outputs": []
1306
  },
1307
  {
@@ -1325,7 +1343,7 @@
1325
  "metadata": {
1326
  "id": "b0gue7cyctt1"
1327
  },
1328
- "execution_count": 29,
1329
  "outputs": []
1330
  },
1331
  {
@@ -1341,7 +1359,7 @@
1341
  "id": "VKK3jMprctre",
1342
  "outputId": "cb85d598-d1bc-49e9-818f-c7bbde465864"
1343
  },
1344
- "execution_count": 30,
1345
  "outputs": [
1346
  {
1347
  "output_type": "execute_result",
@@ -1361,6 +1379,7 @@
1361
  {
1362
  "cell_type": "code",
1363
  "source": [
 
1364
  "for src in res.source_nodes:\n",
1365
  " print(\"Node ID\\t\", src.node_id)\n",
1366
  " print(\"Title\\t\", src.metadata['title'])\n",
@@ -1375,7 +1394,7 @@
1375
  "id": "465dH4yQc7Ct",
1376
  "outputId": "3d2b3ce2-7705-41bb-80e3-4fe6b390dcef"
1377
  },
1378
- "execution_count": 31,
1379
  "outputs": [
1380
  {
1381
  "output_type": "stream",
@@ -1410,6 +1429,9 @@
1410
  "from llama_index.evaluation import generate_question_context_pairs\n",
1411
  "from llama_index.llms import OpenAI\n",
1412
  "\n",
 
 
 
1413
  "llm = OpenAI(model=\"gpt-3.5-turbo\")\n",
1414
  "rag_eval_dataset = generate_question_context_pairs(\n",
1415
  " nodes,\n",
@@ -1417,7 +1439,7 @@
1417
  " num_questions_per_chunk=1\n",
1418
  ")\n",
1419
  "\n",
1420
- "# We can save the dataset as a json file for later use.\n",
1421
  "rag_eval_dataset.save_json(\"./rag_eval_dataset_cohere.json\")"
1422
  ],
1423
  "metadata": {
@@ -1427,7 +1449,7 @@
1427
  },
1428
  "outputId": "85b0765e-5a42-4f60-ccff-fc4bc688f65a"
1429
  },
1430
- "execution_count": 32,
1431
  "outputs": [
1432
  {
1433
  "output_type": "stream",
@@ -1438,20 +1460,29 @@
1438
  }
1439
  ]
1440
  },
 
 
 
 
 
 
 
 
 
1441
  {
1442
  "cell_type": "code",
1443
  "source": [
1444
- "from llama_index.finetuning.embeddings.common import (\n",
1445
- " EmbeddingQAFinetuneDataset,\n",
1446
- ")\n",
1447
- "rag_eval_dataset = EmbeddingQAFinetuneDataset.from_json(\n",
1448
- " \"./rag_eval_dataset_cohere.json\"\n",
1449
- ")"
1450
  ],
1451
  "metadata": {
1452
  "id": "3sA1K84U254o"
1453
  },
1454
- "execution_count": 34,
1455
  "outputs": []
1456
  },
1457
  {
@@ -1459,6 +1490,7 @@
1459
  "source": [
1460
  "import pandas as pd\n",
1461
  "\n",
 
1462
  "def display_results_retriever(name, eval_results):\n",
1463
  " \"\"\"Display results from evaluate.\"\"\"\n",
1464
  "\n",
@@ -1481,7 +1513,7 @@
1481
  "metadata": {
1482
  "id": "H7ubvcbk27vr"
1483
  },
1484
- "execution_count": 35,
1485
  "outputs": []
1486
  },
1487
  {
@@ -1505,7 +1537,7 @@
1505
  "id": "uNLxDxoc2-Ac",
1506
  "outputId": "8a2df94d-99b5-4aa4-a31e-b6c94256d1bb"
1507
  },
1508
- "execution_count": 39,
1509
  "outputs": [
1510
  {
1511
  "output_type": "stream",
@@ -1568,7 +1600,7 @@
1568
  "id": "3ukkWC9R2_0J",
1569
  "outputId": "d177c25d-a163-4b71-97f4-2af468737bbb"
1570
  },
1571
- "execution_count": 40,
1572
  "outputs": [
1573
  {
1574
  "output_type": "stream",
 
4
  "metadata": {
5
  "colab": {
6
  "provenance": [],
7
+ "authorship_tag": "ABX9TyPOc0CzdBgKoadyg4eV8JWo",
8
  "include_colab_link": true
9
  },
10
  "kernelspec": {
 
725
  },
726
  {
727
  "cell_type": "code",
728
+ "execution_count": null,
729
  "metadata": {
730
  "id": "QPJzr-I9XQ7l",
731
  "colab": {
 
787
  "import os\n",
788
  "\n",
789
  "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
790
+ "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR_OPENAI_KEY>\"  # SECURITY: never commit a real API key — the leaked key must be rotated\n",
791
+ "os.environ[\"CO_API_KEY\"] = \"<YOUR_COHERE_KEY>\"  # SECURITY: never commit a real API key — the leaked key must be rotated"
792
  ],
793
  "metadata": {
794
  "id": "riuXwpSPcvWC"
795
  },
796
+ "execution_count": null,
797
  "outputs": []
798
  },
799
  {
 
806
  "metadata": {
807
  "id": "jIEeZzqLbz0J"
808
  },
809
+ "execution_count": null,
810
  "outputs": []
811
  },
812
  {
 
828
  "metadata": {
829
  "id": "9oGT6crooSSj"
830
  },
831
+ "execution_count": null,
832
  "outputs": []
833
  },
834
  {
 
853
  "metadata": {
854
  "id": "SQP87lHczHKc"
855
  },
856
+ "execution_count": null,
857
  "outputs": []
858
  },
859
  {
 
867
  "metadata": {
868
  "id": "zAaGcYMJzHAN"
869
  },
870
+ "execution_count": null,
871
  "outputs": []
872
  },
873
  {
 
909
  "id": "wl_pbPvMlv1h",
910
  "outputId": "bc9a0415-a1fb-4e89-a2b4-165420106b34"
911
  },
912
+ "execution_count": null,
913
  "outputs": [
914
  {
915
  "output_type": "stream",
 
964
  },
965
  "outputId": "a8361aa6-522d-4def-e49b-ed08d9c8e7d1"
966
  },
967
+ "execution_count": null,
968
  "outputs": [
969
  {
970
  "output_type": "execute_result",
 
998
  "metadata": {
999
  "id": "YizvmXPejkJE"
1000
  },
1001
+ "execution_count": null,
1002
  "outputs": []
1003
  },
1004
  {
 
1015
  "source": [
1016
  "from llama_index.text_splitter import TokenTextSplitter\n",
1017
  "\n",
1018
+ "# Define the splitter object that splits the text into segments of 512 tokens,\n",
1019
+ "# with a 128 overlap between the segments.\n",
1020
  "text_splitter = TokenTextSplitter(\n",
1021
  " separator=\" \", chunk_size=512, chunk_overlap=128\n",
1022
  ")"
 
1024
  "metadata": {
1025
  "id": "9z3t70DGWsjO"
1026
  },
1027
+ "execution_count": null,
1028
  "outputs": []
1029
  },
1030
  {
 
1051
  "from llama_index.embeddings.cohereai import CohereEmbedding\n",
1052
  "from llama_index.ingestion import IngestionPipeline\n",
1053
  "\n",
1054
+ "# Create the pipeline to apply the transformation on each chunk,\n",
1055
+ "# and store the transformed text in the chroma vector store.\n",
1056
  "pipeline = IngestionPipeline(\n",
1057
  " transformations=[\n",
1058
  " text_splitter,\n",
 
1064
  " vector_store=vector_store\n",
1065
  ")\n",
1066
  "\n",
1067
+ "# Run the transformation pipeline.\n",
1068
  "nodes = pipeline.run(documents=documents, show_progress=True);"
1069
  ],
1070
  "metadata": {
 
1099
  "id": "P9LDJ7o-Wsc-",
1100
  "outputId": "cd49bff2-b0da-4722-8baa-6a07f1023b39"
1101
  },
1102
+ "execution_count": null,
1103
  "outputs": [
1104
  {
1105
  "output_type": "display_data",
 
1172
  "id": "mPGa85hM2P3P",
1173
  "outputId": "9d7811ba-1e10-4098-b6eb-77a4e7d37457"
1174
  },
1175
+ "execution_count": null,
1176
  "outputs": [
1177
  {
1178
  "output_type": "execute_result",
 
1198
  "id": "jjnmscmq2cXK",
1199
  "outputId": "5f6fa176-4e09-4cc7-bd17-8236b061ad17"
1200
  },
1201
+ "execution_count": null,
1202
  "outputs": [
1203
  {
1204
  "output_type": "execute_result",
 
1215
  {
1216
  "cell_type": "code",
1217
  "source": [
1218
+ "# Compress the vector store directory to a zip file to be able to download and use later.\n",
1219
  "!zip -r vectorstore_cohere.zip mini-llama-articles"
1220
  ],
1221
  "metadata": {
 
1225
  "id": "hV9G0lSUJJSa",
1226
  "outputId": "453a4ea3-dfda-4da1-ac29-929834c83b40"
1227
  },
1228
+ "execution_count": null,
1229
  "outputs": [
1230
  {
1231
  "output_type": "stream",
 
1251
  "id": "OWaT6rL7ksp8"
1252
  }
1253
  },
1254
+ {
1255
+ "cell_type": "markdown",
1256
+ "source": [
1257
+ "If you have already uploaded the zip file for the vector store checkpoint, please uncomment the code in the following cell block to extract its contents. After doing so, you will be able to load the dataset from local storage."
1258
+ ],
1259
+ "metadata": {
1260
+ "id": "B4w8xP2Ggrvf"
1261
+ }
1262
+ },
1263
  {
1264
  "cell_type": "code",
1265
  "source": [
1266
+ "# !unzip vectorstore_cohere.zip"
1267
  ],
1268
  "metadata": {
1269
  "id": "EF-wobGAJRgL"
 
1282
  "metadata": {
1283
  "id": "mXi56KTXk2sp"
1284
  },
1285
+ "execution_count": null,
1286
  "outputs": []
1287
  },
1288
  {
 
1290
  "source": [
1291
  "from llama_index import ServiceContext\n",
1292
  "\n",
1293
+ "# Define the Cohere Embedding Model\n",
1294
  "embed_model = CohereEmbedding(\n",
1295
  " model_name=\"embed-english-v3.0\",\n",
1296
  " input_type=\"search_query\",\n",
1297
  ")\n",
1298
  "\n",
1299
+ "# Define the ServiceCotext object to tie the LLM for generating final answer,\n",
1300
+ "# and the embedding model to help with retrieving related nodes.\n",
1301
  "service_context = ServiceContext.from_defaults(\n",
1302
  " llm=llm, embed_model=embed_model\n",
1303
  ")"
 
1305
  "metadata": {
1306
  "id": "9l0PaY230syE"
1307
  },
1308
+ "execution_count": null,
1309
  "outputs": []
1310
  },
1311
  {
1312
  "cell_type": "code",
1313
  "source": [
 
1314
  "from llama_index import VectorStoreIndex\n",
1315
  "\n",
1316
+ "# Create the index based on the vector store.\n",
1317
  "index = VectorStoreIndex.from_vector_store(vector_store, service_context=service_context)"
1318
  ],
1319
  "metadata": {
1320
  "id": "jKXURvLtkuTS"
1321
  },
1322
+ "execution_count": null,
1323
  "outputs": []
1324
  },
1325
  {
 
1343
  "metadata": {
1344
  "id": "b0gue7cyctt1"
1345
  },
1346
+ "execution_count": null,
1347
  "outputs": []
1348
  },
1349
  {
 
1359
  "id": "VKK3jMprctre",
1360
  "outputId": "cb85d598-d1bc-49e9-818f-c7bbde465864"
1361
  },
1362
+ "execution_count": null,
1363
  "outputs": [
1364
  {
1365
  "output_type": "execute_result",
 
1379
  {
1380
  "cell_type": "code",
1381
  "source": [
1382
+ "# Show the retrieved nodes\n",
1383
  "for src in res.source_nodes:\n",
1384
  " print(\"Node ID\\t\", src.node_id)\n",
1385
  " print(\"Title\\t\", src.metadata['title'])\n",
 
1394
  "id": "465dH4yQc7Ct",
1395
  "outputId": "3d2b3ce2-7705-41bb-80e3-4fe6b390dcef"
1396
  },
1397
+ "execution_count": null,
1398
  "outputs": [
1399
  {
1400
  "output_type": "stream",
 
1429
  "from llama_index.evaluation import generate_question_context_pairs\n",
1430
  "from llama_index.llms import OpenAI\n",
1431
  "\n",
1432
+ "# Create questions for each segment. These questions will be used to\n",
1433
+ "# assess whether the retriever can accurately identify and return the\n",
1434
+ "# corresponding segment when queried.\n",
1435
  "llm = OpenAI(model=\"gpt-3.5-turbo\")\n",
1436
  "rag_eval_dataset = generate_question_context_pairs(\n",
1437
  " nodes,\n",
 
1439
  " num_questions_per_chunk=1\n",
1440
  ")\n",
1441
  "\n",
1442
+ "# We can save the evaluation dataset as a json file for later use.\n",
1443
  "rag_eval_dataset.save_json(\"./rag_eval_dataset_cohere.json\")"
1444
  ],
1445
  "metadata": {
 
1449
  },
1450
  "outputId": "85b0765e-5a42-4f60-ccff-fc4bc688f65a"
1451
  },
1452
+ "execution_count": null,
1453
  "outputs": [
1454
  {
1455
  "output_type": "stream",
 
1460
  }
1461
  ]
1462
  },
1463
+ {
1464
+ "cell_type": "markdown",
1465
+ "source": [
1466
+ "If you have uploaded the generated question JSON file, please uncomment the code in the next cell block. This will avoid the need to generate the questions manually, saving you time and effort."
1467
+ ],
1468
+ "metadata": {
1469
+ "id": "998nNEGYhKhu"
1470
+ }
1471
+ },
1472
  {
1473
  "cell_type": "code",
1474
  "source": [
1475
+ "# from llama_index.finetuning.embeddings.common import (\n",
1476
+ "# EmbeddingQAFinetuneDataset,\n",
1477
+ "# )\n",
1478
+ "# rag_eval_dataset = EmbeddingQAFinetuneDataset.from_json(\n",
1479
+ "# \"./rag_eval_dataset_cohere.json\"\n",
1480
+ "# )"
1481
  ],
1482
  "metadata": {
1483
  "id": "3sA1K84U254o"
1484
  },
1485
+ "execution_count": null,
1486
  "outputs": []
1487
  },
1488
  {
 
1490
  "source": [
1491
  "import pandas as pd\n",
1492
  "\n",
1493
+ "# A simple function to show the evaluation result.\n",
1494
  "def display_results_retriever(name, eval_results):\n",
1495
  " \"\"\"Display results from evaluate.\"\"\"\n",
1496
  "\n",
 
1513
  "metadata": {
1514
  "id": "H7ubvcbk27vr"
1515
  },
1516
+ "execution_count": null,
1517
  "outputs": []
1518
  },
1519
  {
 
1537
  "id": "uNLxDxoc2-Ac",
1538
  "outputId": "8a2df94d-99b5-4aa4-a31e-b6c94256d1bb"
1539
  },
1540
+ "execution_count": null,
1541
  "outputs": [
1542
  {
1543
  "output_type": "stream",
 
1600
  "id": "3ukkWC9R2_0J",
1601
  "outputId": "d177c25d-a163-4b71-97f4-2af468737bbb"
1602
  },
1603
+ "execution_count": null,
1604
  "outputs": [
1605
  {
1606
  "output_type": "stream",