{"cells":[{"cell_type":"markdown","metadata":{"id":"ZyP3dXRfcXLa"},"source":["# Ingest PDF benefit documents into Chroma vector DB\n","- install and imports\n"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"X2R9TjVzNV_E","outputId":"cf55e3b8-8515-40a5-d55b-0ff89649d0de","executionInfo":{"status":"ok","timestamp":1677873495731,"user_tz":300,"elapsed":22783,"user":{"displayName":"Greg Hayworth","userId":"07798746719312628238"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n","Collecting sentence-transformers\n"," Downloading sentence-transformers-2.2.2.tar.gz (85 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.0/86.0 KB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n","Collecting transformers<5.0.0,>=4.6.0\n"," Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.3/6.3 MB\u001b[0m \u001b[31m25.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: tqdm in /usr/local/lib/python3.8/dist-packages (from sentence-transformers) (4.64.1)\n","Requirement already satisfied: torch>=1.6.0 in /usr/local/lib/python3.8/dist-packages (from sentence-transformers) (1.13.1+cu116)\n","Requirement already satisfied: torchvision in /usr/local/lib/python3.8/dist-packages (from sentence-transformers) (0.14.1+cu116)\n","Requirement already satisfied: numpy in /usr/local/lib/python3.8/dist-packages (from sentence-transformers) (1.22.4)\n","Requirement already satisfied: scikit-learn in /usr/local/lib/python3.8/dist-packages (from sentence-transformers) (1.2.1)\n","Requirement already satisfied: scipy in /usr/local/lib/python3.8/dist-packages (from 
sentence-transformers) (1.10.1)\n","Requirement already satisfied: nltk in /usr/local/lib/python3.8/dist-packages (from sentence-transformers) (3.7)\n","Collecting sentencepiece\n"," Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m13.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting huggingface-hub>=0.4.0\n"," Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m190.3/190.3 KB\u001b[0m \u001b[31m13.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.8/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (6.0)\n","Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.8/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (4.5.0)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.8/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (3.9.0)\n","Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.8/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (23.0)\n","Requirement already satisfied: requests in /usr/local/lib/python3.8/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (2.25.1)\n","Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.8/dist-packages (from transformers<5.0.0,>=4.6.0->sentence-transformers) (2022.6.2)\n","Collecting tokenizers!=0.11.3,<0.14,>=0.11.1\n"," Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.6/7.6 MB\u001b[0m \u001b[31m62.3 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: click in /usr/local/lib/python3.8/dist-packages (from nltk->sentence-transformers) (8.1.3)\n","Requirement already satisfied: joblib in /usr/local/lib/python3.8/dist-packages (from nltk->sentence-transformers) (1.2.0)\n","Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from scikit-learn->sentence-transformers) (3.1.0)\n","Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.8/dist-packages (from torchvision->sentence-transformers) (8.4.0)\n","Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.8/dist-packages (from requests->huggingface-hub>=0.4.0->sentence-transformers) (2.10)\n","Requirement already satisfied: chardet<5,>=3.0.2 in /usr/local/lib/python3.8/dist-packages (from requests->huggingface-hub>=0.4.0->sentence-transformers) (4.0.0)\n","Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests->huggingface-hub>=0.4.0->sentence-transformers) (1.26.14)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.8/dist-packages (from requests->huggingface-hub>=0.4.0->sentence-transformers) (2022.12.7)\n","Building wheels for collected packages: sentence-transformers\n"," Building wheel for sentence-transformers (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n"," Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=7c9ce558631df7bba2b3e6f84f849a7c5cab880e343a5be696c25e28ff7dbe81\n"," Stored in directory: /root/.cache/pip/wheels/5e/6f/8c/d88aec621f3f542d26fac0342bef5e693335d125f4e54aeffe\n","Successfully built sentence-transformers\n","Installing collected packages: tokenizers, sentencepiece, huggingface-hub, transformers, sentence-transformers\n","Successfully installed huggingface-hub-0.12.1 sentence-transformers-2.2.2 sentencepiece-0.1.97 tokenizers-0.13.2 transformers-4.26.1\n"]}],"source":["!pip install -U sentence-transformers "]},{"cell_type":"code","source":["#rank_bm25"],"metadata":{"id":"s808GJXunbB0"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["!pip install langchain"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"T6_80h9LpY0Z","executionInfo":{"status":"ok","timestamp":1677873517108,"user_tz":300,"elapsed":21385,"user":{"displayName":"Greg Hayworth","userId":"07798746719312628238"}},"outputId":"ab8f5f89-8f43-4355-9f82-a0a8ea288f8e"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n","Collecting langchain\n"," Downloading langchain-0.0.100-py3-none-any.whl (343 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m343.1/343.1 KB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: pydantic<2,>=1 in /usr/local/lib/python3.8/dist-packages (from langchain) (1.10.5)\n","Requirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.8/dist-packages (from langchain) (1.22.4)\n","Collecting dataclasses-json<0.6.0,>=0.5.7\n"," Downloading dataclasses_json-0.5.7-py3-none-any.whl (25 kB)\n","Collecting aleph-alpha-client<3.0.0,>=2.15.0\n"," Downloading 
aleph_alpha_client-2.16.1-py3-none-any.whl (38 kB)\n","Collecting deeplake<4.0.0,>=3.2.9\n"," Downloading deeplake-3.2.13.tar.gz (439 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m439.5/439.5 KB\u001b[0m \u001b[31m23.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n","Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.8/dist-packages (from langchain) (2.25.1)\n","Requirement already satisfied: PyYAML<7,>=6 in /usr/local/lib/python3.8/dist-packages (from langchain) (6.0)\n","Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /usr/local/lib/python3.8/dist-packages (from langchain) (8.2.2)\n","Requirement already satisfied: SQLAlchemy<2,>=1 in /usr/local/lib/python3.8/dist-packages (from langchain) (1.4.46)\n","Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.8/dist-packages (from langchain) (3.8.4)\n","Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.8.2)\n","Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.8/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.1)\n","Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.8/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (4.0.2)\n","Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (22.2.0)\n","Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.8/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.4)\n","Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (3.0.1)\n","Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.8/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) 
(1.3.3)\n","Requirement already satisfied: urllib3>=1.26 in /usr/local/lib/python3.8/dist-packages (from aleph-alpha-client<3.0.0,>=2.15.0->langchain) (1.26.14)\n","Collecting aiohttp-retry>=2.8.3\n"," Downloading aiohttp_retry-2.8.3-py3-none-any.whl (9.8 kB)\n","Collecting requests<3,>=2\n"," Downloading requests-2.28.2-py3-none-any.whl (62 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.8/62.8 KB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: tokenizers>=0.13.2 in /usr/local/lib/python3.8/dist-packages (from aleph-alpha-client<3.0.0,>=2.15.0->langchain) (0.13.2)\n","Collecting aiodns>=3.0.0\n"," Downloading aiodns-3.0.0-py3-none-any.whl (5.0 kB)\n","Collecting marshmallow-enum<2.0.0,>=1.5.1\n"," Downloading marshmallow_enum-1.5.1-py2.py3-none-any.whl (4.2 kB)\n","Collecting typing-inspect>=0.4.0\n"," Downloading typing_inspect-0.8.0-py3-none-any.whl (8.7 kB)\n","Requirement already satisfied: marshmallow<4.0.0,>=3.3.0 in /usr/local/lib/python3.8/dist-packages (from dataclasses-json<0.6.0,>=0.5.7->langchain) (3.19.0)\n","Requirement already satisfied: pillow in /usr/local/lib/python3.8/dist-packages (from deeplake<4.0.0,>=3.2.9->langchain) (8.4.0)\n","Collecting boto3\n"," Downloading boto3-1.26.83-py3-none-any.whl (134 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.7/134.7 KB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: click in /usr/local/lib/python3.8/dist-packages (from deeplake<4.0.0,>=3.2.9->langchain) (8.1.3)\n","Collecting pathos\n"," Downloading pathos-0.3.0-py3-none-any.whl (79 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m79.8/79.8 KB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting humbug>=0.2.6\n"," Downloading humbug-0.2.8-py3-none-any.whl (13 
kB)\n","Requirement already satisfied: tqdm in /usr/local/lib/python3.8/dist-packages (from deeplake<4.0.0,>=3.2.9->langchain) (4.64.1)\n","Collecting numcodecs\n"," Downloading numcodecs-0.11.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.7/6.7 MB\u001b[0m \u001b[31m31.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting pyjwt\n"," Downloading PyJWT-2.6.0-py3-none-any.whl (20 kB)\n","Collecting hub>=2.8.7\n"," Downloading hub-3.0.1-py3-none-any.whl (1.4 kB)\n","Requirement already satisfied: typing-extensions>=4.2.0 in /usr/local/lib/python3.8/dist-packages (from pydantic<2,>=1->langchain) (4.5.0)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.8/dist-packages (from requests<3,>=2->langchain) (2.10)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.8/dist-packages (from requests<3,>=2->langchain) (2022.12.7)\n","Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.8/dist-packages (from SQLAlchemy<2,>=1->langchain) (2.0.2)\n","Collecting pycares>=4.0.0\n"," Downloading pycares-4.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (288 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m289.0/289.0 KB\u001b[0m \u001b[31m22.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: packaging>=17.0 in /usr/local/lib/python3.8/dist-packages (from marshmallow<4.0.0,>=3.3.0->dataclasses-json<0.6.0,>=0.5.7->langchain) (23.0)\n","Collecting mypy-extensions>=0.3.0\n"," Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)\n","Collecting s3transfer<0.7.0,>=0.6.0\n"," Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m79.6/79.6 KB\u001b[0m \u001b[31m10.1 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting jmespath<2.0.0,>=0.7.1\n"," Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)\n","Collecting botocore<1.30.0,>=1.29.83\n"," Downloading botocore-1.29.83-py3-none-any.whl (10.5 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.5/10.5 MB\u001b[0m \u001b[31m64.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: entrypoints in /usr/local/lib/python3.8/dist-packages (from numcodecs->deeplake<4.0.0,>=3.2.9->langchain) (0.4)\n","Collecting multiprocess>=0.70.14\n"," Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m132.0/132.0 KB\u001b[0m \u001b[31m16.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting ppft>=1.7.6.6\n"," Downloading ppft-1.7.6.6-py3-none-any.whl (52 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m52.8/52.8 KB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting pox>=0.3.2\n"," Downloading pox-0.3.2-py3-none-any.whl (29 kB)\n","Collecting dill>=0.3.6\n"," Downloading dill-0.3.6-py3-none-any.whl (110 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m110.5/110.5 KB\u001b[0m \u001b[31m13.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.8/dist-packages (from botocore<1.30.0,>=1.29.83->boto3->deeplake<4.0.0,>=3.2.9->langchain) (2.8.2)\n","Requirement already satisfied: cffi>=1.5.0 in /usr/local/lib/python3.8/dist-packages (from pycares>=4.0.0->aiodns>=3.0.0->aleph-alpha-client<3.0.0,>=2.15.0->langchain) (1.15.1)\n","Requirement already satisfied: pycparser in /usr/local/lib/python3.8/dist-packages (from cffi>=1.5.0->pycares>=4.0.0->aiodns>=3.0.0->aleph-alpha-client<3.0.0,>=2.15.0->langchain) 
(2.21)\n","Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.8/dist-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.30.0,>=1.29.83->boto3->deeplake<4.0.0,>=3.2.9->langchain) (1.15.0)\n","Building wheels for collected packages: deeplake\n"," Building wheel for deeplake (setup.py) ... \u001b[?25l\u001b[?25hdone\n"," Created wheel for deeplake: filename=deeplake-3.2.13-py3-none-any.whl size=534840 sha256=800c8d01d9af5274c0d514185587bcb6dd406c2157936b1ff47f8323d18edce0\n"," Stored in directory: /root/.cache/pip/wheels/51/ba/d1/7b15d6a38922c6935ccd2e77142e1dfbee0324c8f1ba706352\n","Successfully built deeplake\n","Installing collected packages: requests, pyjwt, ppft, pox, numcodecs, mypy-extensions, jmespath, dill, typing-inspect, pycares, multiprocess, marshmallow-enum, humbug, botocore, s3transfer, pathos, dataclasses-json, aiohttp-retry, aiodns, boto3, aleph-alpha-client, hub, deeplake, langchain\n"," Attempting uninstall: requests\n"," Found existing installation: requests 2.25.1\n"," Uninstalling requests-2.25.1:\n"," Successfully uninstalled requests-2.25.1\n","Successfully installed aiodns-3.0.0 aiohttp-retry-2.8.3 aleph-alpha-client-2.16.1 boto3-1.26.83 botocore-1.29.83 dataclasses-json-0.5.7 deeplake-3.2.13 dill-0.3.6 hub-3.0.1 humbug-0.2.8 jmespath-1.0.1 langchain-0.0.100 marshmallow-enum-1.5.1 multiprocess-0.70.14 mypy-extensions-1.0.0 numcodecs-0.11.0 pathos-0.3.0 pox-0.3.2 ppft-1.7.6.6 pycares-4.3.0 pyjwt-2.6.0 requests-2.28.2 s3transfer-0.6.0 typing-inspect-0.8.0\n"]}]},{"cell_type":"code","source":["!pip install PyPDF2"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"v-SW53x91M0u","executionInfo":{"status":"ok","timestamp":1677873526478,"user_tz":300,"elapsed":9395,"user":{"displayName":"Greg Hayworth","userId":"07798746719312628238"}},"outputId":"d7fff801-d141-4c6c-8ea4-712002ba5a93"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Looking in indexes: https://pypi.org/simple, 
https://us-python.pkg.dev/colab-wheels/public/simple/\n","Collecting PyPDF2\n"," Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m232.6/232.6 KB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: typing_extensions>=3.10.0.0 in /usr/local/lib/python3.8/dist-packages (from PyPDF2) (4.5.0)\n","Installing collected packages: PyPDF2\n","Successfully installed PyPDF2-3.0.1\n"]}]},{"cell_type":"code","source":["!pip install -U chromadb"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"WfKglAO7l98x","executionInfo":{"status":"ok","timestamp":1677873582564,"user_tz":300,"elapsed":56120,"user":{"displayName":"Greg Hayworth","userId":"07798746719312628238"}},"outputId":"faee7817-bf06-4832-ac11-45d17d0a48b6"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n","Collecting chromadb\n"," Downloading chromadb-0.3.10-py3-none-any.whl (40 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.8/40.8 KB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting uvicorn[standard]>=0.18.3\n"," Downloading uvicorn-0.20.0-py3-none-any.whl (56 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.9/56.9 KB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting clickhouse-connect>=0.5.7\n"," Downloading clickhouse_connect-0.5.14-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (916 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m916.4/916.4 KB\u001b[0m \u001b[31m22.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting hnswlib>=0.7\n"," Downloading hnswlib-0.7.0.tar.gz (33 kB)\n"," 
Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n"," Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n"," Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n","Requirement already satisfied: pandas>=1.3 in /usr/local/lib/python3.8/dist-packages (from chromadb) (1.3.5)\n","Requirement already satisfied: sentence-transformers>=2.2.2 in /usr/local/lib/python3.8/dist-packages (from chromadb) (2.2.2)\n","Requirement already satisfied: pydantic>=1.9 in /usr/local/lib/python3.8/dist-packages (from chromadb) (1.10.5)\n","Collecting fastapi>=0.85.1\n"," Downloading fastapi-0.92.0-py3-none-any.whl (56 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.2/56.2 KB\u001b[0m \u001b[31m8.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: requests>=2.28 in /usr/local/lib/python3.8/dist-packages (from chromadb) (2.28.2)\n","Collecting duckdb>=0.5.1\n"," Downloading duckdb-0.7.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.2 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.2/15.2 MB\u001b[0m \u001b[31m67.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: numpy>=1.21.6 in /usr/local/lib/python3.8/dist-packages (from chromadb) (1.22.4)\n","Requirement already satisfied: pytz in /usr/local/lib/python3.8/dist-packages (from clickhouse-connect>=0.5.7->chromadb) (2022.7.1)\n","Requirement already satisfied: certifi in /usr/local/lib/python3.8/dist-packages (from clickhouse-connect>=0.5.7->chromadb) (2022.12.7)\n","Collecting zstandard\n"," Downloading zstandard-0.20.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.6 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.6/2.6 MB\u001b[0m \u001b[31m72.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: urllib3>=1.26 in 
/usr/local/lib/python3.8/dist-packages (from clickhouse-connect>=0.5.7->chromadb) (1.26.14)\n","Collecting lz4\n"," Downloading lz4-4.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m64.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting starlette<0.26.0,>=0.25.0\n"," Downloading starlette-0.25.0-py3-none-any.whl (66 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m66.4/66.4 KB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas>=1.3->chromadb) (2.8.2)\n","Requirement already satisfied: typing-extensions>=4.2.0 in /usr/local/lib/python3.8/dist-packages (from pydantic>=1.9->chromadb) (4.5.0)\n","Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.8/dist-packages (from requests>=2.28->chromadb) (3.0.1)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.8/dist-packages (from requests>=2.28->chromadb) (2.10)\n","Requirement already satisfied: nltk in /usr/local/lib/python3.8/dist-packages (from sentence-transformers>=2.2.2->chromadb) (3.7)\n","Requirement already satisfied: torchvision in /usr/local/lib/python3.8/dist-packages (from sentence-transformers>=2.2.2->chromadb) (0.14.1+cu116)\n","Requirement already satisfied: huggingface-hub>=0.4.0 in /usr/local/lib/python3.8/dist-packages (from sentence-transformers>=2.2.2->chromadb) (0.12.1)\n","Requirement already satisfied: tqdm in /usr/local/lib/python3.8/dist-packages (from sentence-transformers>=2.2.2->chromadb) (4.64.1)\n","Requirement already satisfied: scipy in /usr/local/lib/python3.8/dist-packages (from sentence-transformers>=2.2.2->chromadb) (1.10.1)\n","Requirement already satisfied: scikit-learn in 
/usr/local/lib/python3.8/dist-packages (from sentence-transformers>=2.2.2->chromadb) (1.2.1)\n","Requirement already satisfied: torch>=1.6.0 in /usr/local/lib/python3.8/dist-packages (from sentence-transformers>=2.2.2->chromadb) (1.13.1+cu116)\n","Requirement already satisfied: sentencepiece in /usr/local/lib/python3.8/dist-packages (from sentence-transformers>=2.2.2->chromadb) (0.1.97)\n","Requirement already satisfied: transformers<5.0.0,>=4.6.0 in /usr/local/lib/python3.8/dist-packages (from sentence-transformers>=2.2.2->chromadb) (4.26.1)\n","Requirement already satisfied: click>=7.0 in /usr/local/lib/python3.8/dist-packages (from uvicorn[standard]>=0.18.3->chromadb) (8.1.3)\n","Collecting h11>=0.8\n"," Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 KB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting httptools>=0.5.0\n"," Downloading httptools-0.5.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (427 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m427.8/427.8 KB\u001b[0m \u001b[31m41.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.8/dist-packages (from uvicorn[standard]>=0.18.3->chromadb) (6.0)\n","Collecting python-dotenv>=0.13\n"," Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)\n","Collecting watchfiles>=0.13\n"," Downloading watchfiles-0.18.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m64.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting websockets>=10.4\n"," Downloading websockets-10.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 
kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m107.0/107.0 KB\u001b[0m \u001b[31m13.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting uvloop!=0.15.0,!=0.15.1,>=0.14.0\n"," Downloading uvloop-0.17.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.6/4.6 MB\u001b[0m \u001b[31m71.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.8/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers>=2.2.2->chromadb) (3.9.0)\n","Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.8/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers>=2.2.2->chromadb) (23.0)\n","Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.8/dist-packages (from python-dateutil>=2.7.3->pandas>=1.3->chromadb) (1.15.0)\n","Collecting anyio<5,>=3.4.0\n"," Downloading anyio-3.6.2-py3-none-any.whl (80 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m80.6/80.6 KB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.8/dist-packages (from transformers<5.0.0,>=4.6.0->sentence-transformers>=2.2.2->chromadb) (2022.6.2)\n","Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.8/dist-packages (from transformers<5.0.0,>=4.6.0->sentence-transformers>=2.2.2->chromadb) (0.13.2)\n","Requirement already satisfied: joblib in /usr/local/lib/python3.8/dist-packages (from nltk->sentence-transformers>=2.2.2->chromadb) (1.2.0)\n","Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from scikit-learn->sentence-transformers>=2.2.2->chromadb) (3.1.0)\n","Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in 
/usr/local/lib/python3.8/dist-packages (from torchvision->sentence-transformers>=2.2.2->chromadb) (8.4.0)\n","Collecting sniffio>=1.1\n"," Downloading sniffio-1.3.0-py3-none-any.whl (10 kB)\n","Building wheels for collected packages: hnswlib\n"," Building wheel for hnswlib (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n"," Created wheel for hnswlib: filename=hnswlib-0.7.0-cp38-cp38-linux_x86_64.whl size=2122759 sha256=1501023d2f03d777d631722e5eef13159162b3c7c0bfed8ed09c6f7f8af90fdf\n"," Stored in directory: /root/.cache/pip/wheels/93/0d/13/bbdc55499ef621f8f722fad91050fbb1380709f0c62fa7719a\n","Successfully built hnswlib\n","Installing collected packages: duckdb, zstandard, websockets, uvloop, sniffio, python-dotenv, lz4, httptools, hnswlib, h11, uvicorn, clickhouse-connect, anyio, watchfiles, starlette, fastapi, chromadb\n","Successfully installed anyio-3.6.2 chromadb-0.3.10 clickhouse-connect-0.5.14 duckdb-0.7.1 fastapi-0.92.0 h11-0.14.0 hnswlib-0.7.0 httptools-0.5.0 lz4-4.3.2 python-dotenv-1.0.0 sniffio-1.3.0 starlette-0.25.0 uvicorn-0.20.0 uvloop-0.17.0 watchfiles-0.18.1 websockets-10.4 zstandard-0.20.0\n"]}]},{"cell_type":"code","source":["import pickle\n","import json\n","from sentence_transformers import SentenceTransformer, CrossEncoder, util\n","import gzip\n","import os\n","from torch import load as torch_load\n","from torch import cuda as torch_cuda\n","\n","if not torch_cuda.is_available():\n"," print(\"Warning: No GPU found. 
Please add GPU to your notebook\")"],"metadata":{"id":"FalS1pl3lZiU"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter"],"metadata":{"id":"YozNQMm_pUvq"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import io\n","from urllib import request\n","from urllib.request import Request, urlopen\n","from PyPDF2 import PdfReader, PdfFileReader\n","import glob"],"metadata":{"id":"_mvxWttK0zKt"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import chromadb\n","from chromadb.config import Settings"],"metadata":{"id":"dgU6DPNxsXMn"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import pandas as pd"],"metadata":{"id":"lPS18ZtssWz3"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#We use the Bi-Encoder to encode all passages, so that we can use it with semantic search\n","bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')\n","bi_encoder.max_seq_length = 256 #Truncate long passages to 256 tokens\n","\n","# #The bi-encoder will retrieve 100 documents. 
We use a cross-encoder, to re-rank the results list to improve the quality\n","# cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')"],"metadata":{"id":"QUPkHJdWm93P","executionInfo":{"status":"ok","timestamp":1677873607526,"user_tz":300,"elapsed":8663,"user":{"displayName":"Greg Hayworth","userId":"07798746719312628238"}},"colab":{"base_uri":"https://localhost:8080/","height":465,"referenced_widgets":["10eb9779438748bfb8ad99dcd9ce3d3a","09b438d106b1414e9b892f02cd777bab","d39682b0728149cfb6d4a12ca73133f1","490f1762de9d41eba6d640a2f58fefa6","3032bd2dc53e431482b99b85bf122b10","5d96897bfefb41979247783949026359","c702d88201f84031bc45a2baadf53b27","fd4e91968c0541e88bf59b63bb68912c","44e2130f4a9f4a0e86a0606f8f922075","90d19743b19145ee815480859a216cb6","41a1f45cf0054671bcc31ee911347864","5dfb8b5bf1624ee2a29b1c1ab8fff987","507b908348fe468aa1bf31d81bfc473a","ca6ef33cff074a51b45ab60672bd302d","ec2f49b717e34dc6987adde9258f2213","b1b51a3cb930453da1ab3c355e0587cc","5d1eab2b17b04984b4a295eac6601cfb","6d43636ec55e4d73a9c9922f55db464f","22564c97abb54b3f87328b81422d838c","791f5a810a07451c8d81895076e4913a","f63d663704c64c6897c2c3a1c5aafb11","654ab570ee934876811d46bec6a88ba9","2550746a5374498784a78b1dc1f968ba","f40be0b257144d9ba77130f771bf5189","24bae4e1885f434a8a43cfe2e359053d","977925667a464a968e7862e68e626e56","0a0abca0196c4b5f8a2ea97fad5059cd","2a77ad4716c44729a816c7d770b847ba","333f147e2afa42bd869a76e3db90698a","5ef6d9b6670b40d99e9ed70ed0a34b9a","26f0315fd93340bf99b13b3760d0e81e","cc9e51f21c7e4d2581396632633bbae7","18c5b6f048a44cd18265bbbe0ef280b3","c5fbfd770bd84ec6be4ea0fb0298a2eb","d088c981dab4402986bd3e3b8c83f7d2","c7dc80b4f95d4a57a1f9d1e915c4ec7c","43e41846315547b7ae51c8d4e5f23270","403fbe6be4804e27902492b6cda7bbd2","3cbf7c743c6e486ca9d87afff293313d","89d2292e7e824749a9205cf984fb1a40","2a7603e012e74e328089821a3381c538","5d1fac9ce54a4c188aa05a9412d39c17","6c845c91912743ad9d2da8e98177e7d9","9b651be15b7f408586e59d0fb443264a","a8f2efdf2e0247a2a7c1e60d54712006","7
5c2be0b65024b3491f4a9a9137947bb","d5216bd8fba84d60928acc027cc23368","2ad43accae3744338867ef30ef2da5db","48806dcb683b4372a69fe965a57f4c8f","e35f7f5fb1094a8183c21b4496579550","462dca889cc3447da922c26d540dffbf","c44f5da8f80542cf96f147fc38a2e95e","ac2c36664f8441b9b7d1585cd1c77a60","a8ce8e853292477e80269422c56058e0","2763dd360df143068cd92e49c2c31307","5ec485900b3040d78656e7d318d16acc","fe50813bf66c455cbf6a899f1f3eb955","db27f2420ef84869a10ee2b0b6e92040","da1b07dbc73d4c14a6e7c80087f51586","bc671c5262ee49dc994d53423e106523","f74ba3ad8a854f40a4ea9af657f0aee3","1bfc3db4c678481caff282cf93fb5a3d","d959529e209a4b5787012ec878eafdab","5e887f9eb05b4ea0947420037ff96760","0207ee19e740471682fafc4a10097259","fc2887847f614b2f9111eac778e9e42f","b6e3b382f045468fa4f012136a552ad5","f9bedd5d61544580b42f05da5f8270a7","1be5017a9d5f489cbde4d883f9686ec8","afa5c620e1f24483a8ab7b87bfeaf298","a07432d2d8df405ea2e8032d97a7bbe0","264a9bd6629d408c82f758482e02b729","656f995855d14d68ad43258f30d599d9","54d4d77f26a6421496c2d977cd08a57a","dc922365b67a474aa1a9181eb5f89be8","0ac3c9c3839b4023a1feeca2af21cdc2","ae01a57f919f4462ba1343e00c4fd8a7","c72d53de40084fb2a17dce7529e1ad75","e1f1d9a7a72c40c9be76fef87961a783","25d5ea8732b64617bcde0e87aa021793","dca89a8063c44d69be2f0095f42565b1","6779b27c12524da4bc8a0934501fa203","7600418561aa4ac98c42ce60ef439f39","4831dd60fe71451a80b7e8e4d61a16d4","3386e4758b7e457694894028a31b7d57","052d4181304f4033becef9c28944cd6e","621b9f695d084fc1a424bac6ca85c533","ff8315840f034d02b3c1a4e980f83c0c","ab209c56576a41e499fb5e0aa591f1a0","ef4d767d86964715857f191e9c75f3db","2f7543120e2f4dfea62bdb352e33fc6f","a114607aa2d34e50a6fc7dfc6f61eaf3","72cb3a8eed854194b43d430b1878597d","b4408d19207b43d78738bbd6dfeac9b3","e6df7d0584ef414fbf918c206350950e","208e6a30d28945dcba5dcfd1f215e134","7cfe9cf9a2e042d487aeb4f1c5a89f42","d3a07519e4c5492aad31a9160133aefb","b1291f22210e4a7cb357e5b980042265","ca54b099876e4aff87beebf053ae13c6","9a1903824de04f22a3fa31363f5664e6","58ea627715884245b4b3173745b7d733","05a13f
4145b64bf88bd8f4db3a6caf15","69aee706ec63419aa624d35f1cf1c496","8b109d650e924b8a867fd1ff02d98cd0","4c0bc9e688f64cc2937f095ff5a5b40f","b4ed2740c74b436b9fc65c4edc5133ae","b1d680314d904327aac3a40cd2bcbb2a","d9d59afae6b14420adf65e929ba1510d","4ea017522d454cde80529f9ee614eb59","5402dd52e4c645fcbceafb3bdf560f1e","4c8947aca44e465d97aab54217d0f6b1","d2c971beabe74931939e524a39e54c62","00f236f8eee94e6db95cf51659e28ead","a1eef1837d6441c4a02ad0287fb48fb9","ab3d8e32fa46496d9533e97f4ed195c1","7dfa374ccdb44c43becd41e29ea372cd","4f1e417e57064827bcf2b070e00159b8","45d591ac44404454926ddfa5e43c3d8a","11cb840ebf84479d8011d9c89cc5350c","f62623f11e8e4d68a2b51dfd62c33a2c","f7323bf4d33f4631b569d52b4154d3a3","b0b49c6a4e95400f9d68f97e6077d098","b7875c951bb743dcbdb351d6c5848f89","be102a07521b4313b7e895c9d890b7cb","7ba845c4d61f4b01b289c73ba897cf0f","fe0181fcc41548ccb475b8791a1edf31","875c703211944c09815a9b65d1ef99a0","bb26108792a94fdaa1ff67b26bb60828","598b1b4a72024919b4a72fc035ae12f8","d3c4d32075724f71adc6efdcd2838c59","68e7587f539248d8945109c080284901","8b68c790ef1c4293aaf462884f23b16d","5ef45280b900471d974c13f18f2c8314","ea14a9959d8e4e788fba58e0db8016f5","8a35e88d87ea48018a463db3a3010003","0922ce12e4e0480d8e0c61ad2bb5b433","c3b99174eacd45278679e690604fcc05","008205b1dc1d40b5975582df9215f993","aa3f038e874549daa2183f20fcad1974","eaa9138cfeae43d0a261facaa824ebae","44264555f3b84f62a39d372ad63f7d6c","d0161c593c014f3cb6e188288c889687","29a5864e3a3c4553a7c9da7f4a52a886","6c18994dedd646329d7dfd170a94d9d8","d97c0a708a4248b48970b851a995620f","1f719c24095142fa884ffd1b8eb237aa","642fc575c2f6403393c54efcddb4ff8d","602965ae6e5b416eb8cf26cc84985bd6","dc9749bc4b05454eb76c949238085791","08424bd0309041588cfe55e349983fb7","7fa0a408d9d34ab295a810ebd8e844fc","6e5f5c98e4db47fe8b14b4bba8010d9c","0cef510d28e94d4f9fbc6a37788a15f9"]},"outputId":"97b512ce-6b80-44b1-90c7-bc4979771cdd"},"execution_count":null,"outputs":[{"output_type":"display_data","data":{"text/plain":["Downloading (…)5fedf/.gitattributes: 0%| | 
0.00/737 [00:00\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmyembeddings\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbi_encoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmypassages\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconvert_to_tensor\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshow_progress_bar\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mmyembeddings_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmyembeddings\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtolist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;31mNameError\u001b[0m: name 'bi_encoder' is not defined"]}]},{"cell_type":"code","source":[],"metadata":{"id":"ETDTU0YskOkd"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Configure Chroma with the duckdb+parquet backend, persisted under ./q_sales/data/mychromadb/\n","chroma_settings = Settings(\n"," chroma_db_impl=\"duckdb+parquet\",\n"," persist_directory=\"./q_sales/data/mychromadb/\", # Optional, defaults to .chromadb/ in the current directory\n",")\n","chroma_client = chromadb.Client(settings=chroma_settings)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ltMoTm1vkOM6","executionInfo":{"status":"ok","timestamp":1677874078364,"user_tz":300,"elapsed":946,"user":{"displayName":"Greg Hayworth","userId":"07798746719312628238"}},"outputId":"d5c73556-a1c0-4069-9aa5-7158b80e559a"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["DEBUG:Chroma:Logger created\n"]},{"output_type":"stream","name":"stdout","text":["Running Chroma using direct local API.\n","No existing DB found in ./q_sales/data/mychromadb/, skipping load\n","No existing DB found in ./q_sales/data/mychromadb/, skipping load\n"]}]},{"cell_type":"code","source":["!ls
'./q_sales/data/mychromadb/'"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"W8c2EQlc2Oc0","executionInfo":{"status":"ok","timestamp":1677874179770,"user_tz":300,"elapsed":249,"user":{"displayName":"Greg Hayworth","userId":"07798746719312628238"}},"outputId":"a45c1543-a0ab-4b58-f597-abdb18f4fe99"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["index\n"]}]},{"cell_type":"code","source":["#chroma_client.delete_collection(name=\"benefit_collection\")"],"metadata":{"id":"UkP3SvYN2OWE"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["collection = chroma_client.create_collection(name=\"benefit_collection\", embedding_function=bi_encoder)"],"metadata":{"id":"N33r1sDe2OQD"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["mx_id = (collection.count() - 1)\n","mx_id"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"eF0pekWTtWD_","executionInfo":{"status":"ok","timestamp":1677607757193,"user_tz":300,"elapsed":25,"user":{"displayName":"Greg Hayworth","userId":"07798746719312628238"}},"outputId":"abb8d4c9-1538-4423-9610-19258283a677"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["-1"]},"metadata":{},"execution_count":22}]},{"cell_type":"code","source":["# Continue numbering after the current max doc id (mx_id is -1 for an empty collection, so the\n","# first run still yields \"0\", \"1\", ...); this keeps later ingestion runs from reusing existing ids.\n","mydocids = [str(mx_id + 1 + i) for i in range(len(mypassages))]\n","len(mydocids)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"GEHaf0itjr_m","executionInfo":{"status":"ok","timestamp":1677874164843,"user_tz":300,"elapsed":121,"user":{"displayName":"Greg Hayworth","userId":"07798746719312628238"}},"outputId":"e24ab1a0-bec3-466d-c147-de2bd626d300"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["1390"]},"metadata":{},"execution_count":26}]},{"cell_type":"code","source":["collection.add(\n"," embeddings=
myembeddings_list,\n"," documents= mypassages,\n"," metadatas=mymetadatas,\n"," ids=mydocids\n",")"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Rs6OBCvG2OKH","executionInfo":{"status":"ok","timestamp":1677874172793,"user_tz":300,"elapsed":977,"user":{"displayName":"Greg Hayworth","userId":"07798746719312628238"}},"outputId":"beed37ce-c0dc-4e27-b07f-7029d1e85efe"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["DEBUG:Chroma:Index not found\n","DEBUG:Chroma:Index saved to {self._save_folder}/index.bin\n","DEBUG:Chroma:Index saved to {self._save_folder}/index.bin\n"]}]},{"cell_type":"code","source":["# spot-check the last id that was just added, whatever the corpus size or id offset\n","collection.get(ids=[mydocids[-1]],\n"," include=[\"documents\",\"metadatas\"])"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"BkzqWk6wtje3","executionInfo":{"status":"ok","timestamp":1677874196587,"user_tz":300,"elapsed":123,"user":{"displayName":"Greg Hayworth","userId":"07798746719312628238"}},"outputId":"443b3ed9-673c-4db9-f06d-d05a52c37ffb"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["{'ids': ['1389'],\n"," 'embeddings': None,\n"," 'documents': ['Humana Community (HMO) \\nH1036236000 ENG \\nJefferson County \\nHumana.com \\nGNHH4HIEN_23_C Summary of Benefits H1036236000SB23'],\n"," 'metadatas': [{'source': 'H1036236000SB23.pdf',\n"," 'page': 28,\n"," 'url': 'https://www.humana-medicare.com/BenefitSummary/2023PDFs/H1036236000SB23.pdf#page=28'}]}"]},"metadata":{},"execution_count":29}]},{"cell_type":"code","source":["# important to do this if you want to save the data for re-use\n","chroma_client.persist()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"-XpFU35Zssa2","executionInfo":{"status":"ok","timestamp":1677874206175,"user_tz":300,"elapsed":446,"user":{"displayName":"Greg
Hayworth","userId":"07798746719312628238"}},"outputId":"bb123aa5-0a1b-4df1-e7bf-07584aee91fc"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Persisting DB to disk, putting it in the save folder ./q_sales/data/mychromadb/\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":30}]},{"cell_type":"code","source":["!ls './q_sales/data/mychromadb/'"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"O8w01y9vGMLt","executionInfo":{"status":"ok","timestamp":1677874221452,"user_tz":300,"elapsed":301,"user":{"displayName":"Greg Hayworth","userId":"07798746719312628238"}},"outputId":"96db664c-3ea1-4a46-93c6-0f1b693620d5"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["chroma-collections.parquet chroma-embeddings.parquet index\n"]}]},{"cell_type":"markdown","source":["# Use existing collection"],"metadata":{"id":"sg331hx1tZbR"}},{"cell_type":"code","source":["# Point a client at the same duckdb+parquet store under ./q_sales/data/mychromadb/\n","chroma_settings = Settings(\n"," chroma_db_impl=\"duckdb+parquet\",\n"," persist_directory=\"./q_sales/data/mychromadb/\", # Optional, defaults to .chromadb/ in the current directory\n",")\n","chroma_client = chromadb.Client(settings=chroma_settings)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"0Rs_e9kevzsN","executionInfo":{"status":"ok","timestamp":1677599923305,"user_tz":300,"elapsed":1739,"user":{"displayName":"Greg Hayworth","userId":"07798746719312628238"}},"outputId":"685616c1-5c30-4c53-a60e-0589e00c6715"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["DEBUG:Chroma:Logger created\n"]},{"output_type":"stream","name":"stdout","text":["Running Chroma using direct local API.\n","loaded in 1382 embeddings\n","loaded in 1 collections\n"]}]},{"cell_type":"code","source":["collection = chroma_client.get_collection(name=\"benefit_collection\",
embedding_function=bi_encoder)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":465,"referenced_widgets":["337e49dc298644829d5df46209ea7e1b","efc77cc8e81044a0832ab8d441ba6348","6a5d3d6375954dcd82c762e244cfa5e6","3aca9144c9d44a5d90f77aec0d979ed8","f130241689ca420baa51dd1993946ca3","77daf0f055c345f598255a732c8f9e97","a2f3c5ed55944009a7402d11a5999b0a","93f0597b552f4073bb985e6f367729e7","a8e7197fe1a24b71a67e438b9af456f2","d105f2864c2748a180a5fc1a043e9095","1c491013f21f46c5ab258ddd57d858a0","cd655c21860842fc80934fc27713f222","7012c11e2c0d4f238b77e0126a3b20ce","973c0abde7d04e5887e7e89e27e9fe6d","dca55d0e4b1846b1a66ba1b7885272f0","8e62b39063cd48a6928be1573011ec55","8f96edb4b0034b6b826ab067bafbfd09","21bfd9668522449ba941fcc255c213d9","9b3464d752b84840aa84e20318b73b1e","dcf53258bcca45ad9118dbf16270f911","593168b0357c498fa7f1a270c3d53220","198b5ace460d4693b8432bb7f085d981","335360a065f24452ab0432963359db35","9ce0f3f290a043f6a05a440948f3a34a","84896981856d4a1a926fc3cf43de480f","08cc343446f94cef8555849cf4f46c2d","8a752eeb6b5442ec9caa6ad79657508b","8c886d187c0349a3a2c3883fbacd8ae6","ebe1fc7907f14f5e837b0194673a3408","8fae49e49726418a9e0a05086bf04893","a425601dd6b34c67a3cb1f576b07a242","7d5baead98484d8eba8bed2acc58b237","0bc4d6c8422d4d90843062ba9e143a06","a0b68fae0a2a46bfa60bb25b2fb7ced6","9be0958002cd40488147853009d5dd82","3c9d03c641ca49cbab5bc80af8a9b852","f0bfd4c550574a8098c22c72dbf976be","ecf2c0077c634a4991af9ef120e4835c","4a67e685885946d5beafb30ab28325b9","92c597fcbb98414ea128322709c9c796","309dcab2f95a413699403b7f0cd85b5b","b81accd383dc4bbf9b8f48e1ddc66b6d","c8fa4a4723a64109adc15af2f002a7fb","f4f510af77174661a586c4cbefd5497b","05d43c556875468ba42170ac5a77d911","eb7057b57f2140c5b1f412504d5deecc","e29d6ec071114e4cb6bb079d022d8f85","f877973143d7486c85338fcc523e090b","9087abe238164d51bb436b9c2d610154","c47e8c11654d40a694a7673d8984c1ac","23a22d89c9584fb59dcdd13cc7bc4db5","70152cd563024d92879ce41888bb32d7","ff7ece545814400694703c55672ad251","199b0374d6584497b8
4ccd4fb38c0e27","c9c1b7017ef543b3a45cb393bf5830ea","05db165fa7384a3aabba6ab7b8a1d4a0","74347e8110214ece9100d5d2491d520c","475f3b7c661847ccb2e16b89a7d1b095","fc931b2bfaad4887b6b1294a26a393e3","d5a0395128894415af0da0e062ea48c8","2ebf0ba338474c738add00022dcd628e","cc9507f149554043ba2b7f6368752025","2737d00efc1d4f30828c4d488acadee2","25bcfa5638d9438da4fa638dc00df643","3a66e1650dd5462b9a1654c0457d8d8e","26dc39823e974c9da68431712ce8a0ae","08b65e81e10842a8931c3fa949a2eff5","1e4fde32aaf04fea9aeea14ba4309b26","346b814a070b4baf8b08cdc9adec2eb7","44b0ff336d27498f94920c26ecdf93ba","6aff551799304bb5913b7dbef55355ae","01f065296f8249999b6c40718e11c4e1","86f647a1803a4f0381a55148e6df0e7e","df40d2b873bc46b7aaf5501bd8e1cdd7","9bfa46d458404a5d956b9068bb160e08","ca31de1ea8964ddbbb961921c4c020f7","fcab0947c39845fc90551456d1d7fcf0","fbc125c3ef61483bbf05b1cceae0e0fe","4f2c1ffface644898783a4d491cffde8","ea919ccdfba54935b9d04abf7879ca59","15d679b04d404c63a600d9a34265f1f6","1ef62c85f4d24ecda06e0156444950f4","3a2c8d4ad3c54371863f9f90dc87e132","d8014b7e3e074b2f9dccc190c0f8a553","b85bc062c33f412a83b76c5d3bc929e7","bb337481c8af4a81a6b7faf7c7f62b60","a08c4962e5bd4d67afd40ee556be2245","c5f32f9282a54299902d438ffa37d8e0","4e2a37645cb142beb98cd24f0581e4a4","79e02e766dd94e159e174e599d2e66dd","efa79ca411b24078ba35bd03580924f9","79a322fee53344d285020d9c1ba559f3","0165b6b9d46c47c39b58a9f9c68b6890","7b647e450b894b16b5a8b8bd2ba7e3a0","aa0ead1eeaba4aeda8b8743414d114d9","74a1978025954f639acac28dd63cd97a","9ddc09a65dfd43918aad6e5f55d4d975","6273b92c189346fd8fbc72c141c16654","ea3ec9c283b74c4498b0a9f3dc2a8c56","a116a02342d14e91a279dc6115ddacb5","e1f351f2144d494aa9f1dafbf1dfb3ca","dd216fcfe2ce47f081a3b4bc8845719c","c4b56adecd4c44daae71a43a0612411f","f9fd73e7efe84839ba1f5fbe8218b76c","aa00393f0a6d42b6978851468b8218e7","2f78598708ff444e83baece3d9707865","dba078e3d2f04797aeaa62be3ef43858","a8f21b0ad01541a3a881ba8584ac17ff","e8af010f77ea4c87ad1001f7176c5d6a","e96bdb2304334d7dbb2cba17348b4bf6","810898b6e8c34e75b4e9e73
dfd47626d","f08c3a0d02844d3e9fbcd1824bf47649","87db2fd8a7a64155b0a6dd8b629f2e16","20e38571695e4c849dfccccd70dba668","7f28eec8e4fd41d780b3731845d337b7","c692fea24bda4266891175fd6557eaac","8b5ffa580ac24c659b58339d99606ab5","9f0994e5eaa24b87a66384a6c458b9fc","ff7d58bdc6b54b6f9ce72b4fcb6e1672","e242437557814610886b8b23b3ba59f6","a8f0d49c86db4bde9aed2431c13923c3","fd953f76d0f04556b5851db0226092fe","4e7f956bb6364dceaa9eaa6d8d4e9923","2411efc7f97143069f1119924abd9710","d358f6564d434e619a011317adb52d57","751fa8ffdfd14027b931a2599e41a747","51c7591098874016a924e22ac413c92c","ef08bdf0c3184fbb89059be05d0873f3","75240dcc922147a692d374047ea46aa0","97d0c5032293454abf905d545754bfed","aa41473022554791ae9ffdaf96bb8b1e","467e3bdba5ec4d34b041984ac131f838","f6716e1424ae4bf5a4b255fa6320bc78","90fc896c9822481380d8b61686a760e2","781a01cb2570428397ff1b5b5a29ce8d","6ac61a3a264f43c5a194f4210d35a774","cbc2a7006b31470c9685c7695d797d35","6be33ca85c764b2396ed550d7b487deb","ac1571ec17fa4c5c9985cb51322e6cb5","c7b594254a394e6a885339648f64c537","ae0d55e8e071437ca4289689b3bae16e","04e2d855780644d9ade77d86553f67c3","8111548c29bb4daaafef1afa61e0f633","95d5b6a8a22d43399b09c8c30004b5e3","d78bf4f56bbb4838bac348552ad08243","77cb3259d1614dde92bb43b207d3ff70","086baa2fc9d34f9484867dc16b43fdd0","8787ade4cc9e4644a1ceed3a318c5785","46844e545b07419cae4a75f2e04632d5","e9c7e1fed3dc4920826acbf7b8ca4bec","a3899dc9b216434c8637f4d518391b75","085517d904d74a4db480ce1577706aca","43456079f4284eb783b15dd2f75b0cad","9430cde7e675440eb862dfe330da5f8f"]},"id":"ipvck4yltY4X","executionInfo":{"status":"ok","timestamp":1677599937136,"user_tz":300,"elapsed":5628,"user":{"displayName":"Greg Hayworth","userId":"07798746719312628238"}},"outputId":"6c51ce30-333c-4ed3-bedb-6b63b74b27fd"},"execution_count":null,"outputs":[{"output_type":"display_data","data":{"text/plain":["Downloading (…)e9125/.gitattributes: 0%| | 0.00/1.18k [00:00\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
idsembeddingsdocumentsdistancessourcepageurl
040[0.03424154967069626, 0.039619386196136475, -0...•Obesity screening and counseling \\n•Prostate ...0.947326H1036236000SB23.pdf7https://www.humana-medicare.com/BenefitSummary...
038[0.03241836652159691, 0.02051578275859356, 0.0...•Alcohol misuse counseling \\n•Bone mass measur...0.970621H1036236000SB23.pdf7https://www.humana-medicare.com/BenefitSummary...
039[-0.012141721323132515, 0.006035255268216133, ...•Colorectal cancer screenings (colonoscopy, fe...1.155411H1036236000SB23.pdf7https://www.humana-medicare.com/BenefitSummary...
041[0.00424062879756093, 0.11815327405929565, 0.0...sign of tobacco-related disease) \\n•Vaccines, ...1.279324H1036236000SB23.pdf7https://www.humana-medicare.com/BenefitSummary...
042[-0.07712288945913315, 0.12926293909549713, 0....•Routine physical exam \\n•Medicare diabetes pr...1.358703H1036236000SB23.pdf7https://www.humana-medicare.com/BenefitSummary...
0154[0.005615254398435354, 0.07474204152822495, 0....and should not be construed as medical advice....1.402395H1036236000SB23.pdf17https://www.humana-medicare.com/BenefitSummary...
028[-0.006602243520319462, 0.06320885568857193, 0...contact your PCP or refer to the Evidence of C...1.415057H1036236000SB23.pdf6https://www.humana-medicare.com/BenefitSummary...
078[-0.008955741301178932, 0.06341414153575897, 0...contact your PCP or refer to the Evidence of C...1.417454H1036236000SB23.pdf10https://www.humana-medicare.com/BenefitSummary...
062[-0.00851522572338581, 0.07434328645467758, 0....contact your PCP or refer to the Evidence of C...1.422721H1036236000SB23.pdf9https://www.humana-medicare.com/BenefitSummary...
048[-0.00867101177573204, 0.06424014270305634, 0....contact your PCP or refer to the Evidence of C...1.423319H1036236000SB23.pdf8https://www.humana-medicare.com/BenefitSummary...
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n"," "]},"metadata":{},"execution_count":23}]},{"cell_type":"markdown","source":["# summary of benefits text\n","- example of single text file"],"metadata":{"id":"kdwdCgnyrXWl"}},{"cell_type":"code","source":["# load the sample summary-of-benefits text; decode explicitly as UTF-8 rather than relying on the platform default\n","with open('/content/gdrive/My Drive/Colab Notebooks/ama_wiki/data/summary-of-benefits-paragraphs.txt', encoding='utf-8') as f:\n"," policy_doc = f.read()"],"metadata":{"id":"UX_b1q91ocxi"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["policy_doc[:90]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":35},"id":"U_0DHJi5o8D7","executionInfo":{"status":"ok","timestamp":1675366776544,"user_tz":300,"elapsed":5,"user":{"displayName":"Greg Hayworth","userId":"07798746719312628238"}},"outputId":"de929e2d-9a53-48a5-db58-376936a95b46"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["'These are the summary of benefits for the plan named Humana Community HMO H1036-236. This '"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"string"}},"metadata":{},"execution_count":8}]},{"cell_type":"code","source":["text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0, separator='\\n' )\n","passages = text_splitter.split_text(policy_doc)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"YREpKYUUo7tA","executionInfo":{"status":"ok","timestamp":1675367316231,"user_tz":300,"elapsed":321,"user":{"displayName":"Greg Hayworth","userId":"07798746719312628238"}},"outputId":"7e1c4b1e-d7fb-43da-8694-b53eefd772ef"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["WARNING:root:Created a chunk of size 547, which is longer than the specified 500\n","WARNING:root:Created a chunk of size 550, which is longer than the specified 500\n","WARNING:root:Created a chunk of size 511, which is longer than the specified
500\n"]}]},{"cell_type":"code","source":["len(passages)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"QFv2pth1qPvJ","executionInfo":{"status":"ok","timestamp":1675367323585,"user_tz":300,"elapsed":332,"user":{"displayName":"Greg Hayworth","userId":"07798746719312628238"}},"outputId":"ff02e7d1-1b0f-423d-e421-12637a50aaa9"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["60"]},"metadata":{},"execution_count":35}]},{"cell_type":"code","source":["passages[:5]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"4BtOIR-5o7lM","executionInfo":{"status":"ok","timestamp":1675367323934,"user_tz":300,"elapsed":6,"user":{"displayName":"Greg Hayworth","userId":"07798746719312628238"}},"outputId":"94b47403-09d9-4c76-9b37-c76e72e2d694"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["['These are the summary of benefits for the plan named Humana Community HMO H1036-236. This plan is available in the county of Jefferson in Kentucky. This plan applies to the year 2023.\\n\\nThe Pre-Enrollment Checklist includes Understanding the Benefits.\\nBefore making an enrollment decision, it is important that you fully understand our benefits and rules. If you have any questions, you can call and speak to a customer service representative at 1-800-833-2364 (TTY: 711).',\n"," 'Understanding the Benefits. The Evidence of Coverage (EOC) provides a complete list of all coverage and services. It is important to review plan coverage, costs and benefits before you enroll. Visit Humana.com/medicare or call 1-800-833-2364 (TTY: 711) to view a copy of the Evidence of Coverage EOC.\\nReview the provider directory (or ask your doctor) to make sure the doctors you see now are in the network. If they are not listed, it means you will likely have to select a new doctor.',\n"," 'Review the pharmacy directory to make sure the pharmacy you use for any prescription medicines is in the network. 
If the pharmacy is not listed, you will likely have to select a new pharmacy for your prescriptions.\\nReview the formulary to make sure your drugs are covered. \\n\\nHere are important Rules. You must continue to pay your Medicare Part B premium. This premium is normally taken out of your Social Security check each month.',\n"," 'Benefits, premiums and/or copayments/co-insurance may change on January 1, 2024.\\nExcept in emergency or urgent situations, we do not cover services by out-of-network providers. Out-of-network providers are doctors who are not listed in the provider directory.\\n\\nTo find out more about the Humana Community HMO plan, including the health and drug services it covers in this easy-to-use guide.',\n"," 'To be eligible to join the Humana Community HMO plan, you must be entitled to Medicare Part A, be enrolled in Medicare Part B and live in our service area.\\nIf you are a member of this plan, call toll-free 1-800-457-4708. If you are not a member of this plan, call toll free 1-800-833-2364. From October 1 to March 31, call 7 days a week from 8am to 8pm. From April 1st to September 30th, you can call from Monday to Friday from 8am to 8pm. Our website is https://humana.com/medicare .']"]},"metadata":{},"execution_count":36}]},{"cell_type":"code","source":["# We encode all passages into our vector space. 
This takes about 5 minutes (depends on your GPU speed)\n","corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":49,"referenced_widgets":["d503a70e1f704f239a75a3b1d7f69c8f","48fe6a6945684e07b2095905cb41661f","c544a1ac56314779a4ff7b1ad3a595ba","5e4c6af0c57a44b484e38ae1fae6ba5c","97892889c978496f90220b9c03a9d012","e232649fa0944e73a2b4916c81588874","861d0f219f094696bddf87292d84e202","373a6133b9854de8923f96c8baf9fd42","f2f8ee1d5c654c05836f445d8695df0c","279cec801f3e41ee9be6cb742db10d72","c9ecab42c6de4b80a7e43cbec7a0b6b4"]},"id":"H2gFBwee6qoJ","executionInfo":{"status":"ok","timestamp":1675367331557,"user_tz":300,"elapsed":561,"user":{"displayName":"Greg Hayworth","userId":"07798746719312628238"}},"outputId":"d067e906-533c-4c3c-a04f-1e58d8002890"},"execution_count":null,"outputs":[{"output_type":"display_data","data":{"text/plain":["Batches: 0%| | 0/2 [00:00