{"cells":[{"cell_type":"markdown","metadata":{},"source":["# Indexing\n","Using [Haystack](https://github.com/deepset-ai/haystack), the following steps are performed:\n","- load and preprocess documents downloaded from Wikipedia\n","- create document store and write documents\n","- initialize retriever and generate document embeddings"]},{"cell_type":"code","execution_count":null,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","trusted":true},"outputs":[],"source":["# %pip (unlike !pip) installs into the environment of the running kernel;\n","# the requirement is quoted so the shell never treats [faiss-gpu] as a glob pattern.\n","%pip install \"farm-haystack[faiss-gpu]==1.7.0\""]},{"cell_type":"markdown","metadata":{},"source":["## Load documents"]},{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:23:23.692554Z","iopub.status.busy":"2022-08-21T08:23:23.692208Z","iopub.status.idle":"2022-08-21T08:23:23.700721Z","shell.execute_reply":"2022-08-21T08:23:23.698130Z","shell.execute_reply.started":"2022-08-21T08:23:23.692512Z"},"trusted":true},"outputs":[],"source":["import glob, json"]},{"cell_type":"code","execution_count":3,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:23:23.707774Z","iopub.status.busy":"2022-08-21T08:23:23.704107Z","iopub.status.idle":"2022-08-21T08:23:25.026910Z","shell.execute_reply":"2022-08-21T08:23:25.025990Z","shell.execute_reply.started":"2022-08-21T08:23:23.705010Z"},"trusted":true},"outputs":[],"source":["# Parse every JSON file in the crawl directory and collect the results in `docs` (one entry per file).\n","docs=[]\n","\n","# sorted(): glob order depends on the file system; sorting keeps the order of `docs` stable across runs.\n","for json_file in sorted(glob.glob('../input/crawl-rock/rock_wiki/*.json')):\n","    # Explicit encoding so reading non-ASCII text does not depend on the platform default.\n","    with open(json_file, 'r', encoding='utf-8') as fin:\n","        doc=json.load(fin)\n","\n","    
docs.append(doc)\n"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:23:25.030530Z","iopub.status.busy":"2022-08-21T08:23:25.029931Z","iopub.status.idle":"2022-08-21T08:23:25.039324Z","shell.execute_reply":"2022-08-21T08:23:25.037960Z","shell.execute_reply.started":"2022-08-21T08:23:25.030491Z"},"trusted":true},"outputs":[{"data":{"text/plain":["453"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["len(docs)"]},{"cell_type":"markdown","metadata":{},"source":["## Preprocess documents"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:23:25.050479Z","iopub.status.busy":"2022-08-21T08:23:25.050099Z","iopub.status.idle":"2022-08-21T08:23:42.089083Z","shell.execute_reply":"2022-08-21T08:23:42.087929Z","shell.execute_reply.started":"2022-08-21T08:23:25.050446Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"108e8c46426f44e7be98a8ae930d81ce","version_major":2,"version_minor":0},"text/plain":["Preprocessing: 0%| | 0/453 [00:00,\n"," ,\n"," ,\n"," ,\n"," ,\n"," ,\n"," ,\n"," ,\n"," ,\n"," ]"]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["preprocessed_docs[:10]"]},{"cell_type":"markdown","metadata":{},"source":["## Create document store ([FAISS](https://github.com/facebookresearch/faiss)) and write documents"]},{"cell_type":"code","execution_count":9,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:23:42.119585Z","iopub.status.busy":"2022-08-21T08:23:42.118544Z","iopub.status.idle":"2022-08-21T08:23:42.124669Z","shell.execute_reply":"2022-08-21T08:23:42.123597Z","shell.execute_reply.started":"2022-08-21T08:23:42.119551Z"},"trusted":true},"outputs":[],"source":["from haystack.document_stores import FAISSDocumentStore\n","from haystack.nodes import 
EmbeddingRetriever"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:23:42.129562Z","iopub.status.busy":"2022-08-21T08:23:42.128772Z","iopub.status.idle":"2022-08-21T08:23:42.259879Z","shell.execute_reply":"2022-08-21T08:23:42.258950Z","shell.execute_reply.started":"2022-08-21T08:23:42.129518Z"},"trusted":true},"outputs":[],"source":["# the document store settings are those compatible with Embedding Retriever\n","document_store = FAISSDocumentStore(\n"," similarity=\"dot_product\",\n"," embedding_dim=768)"]},{"cell_type":"code","execution_count":46,"metadata":{"execution":{"iopub.execute_input":"2022-08-21T08:43:25.952230Z","iopub.status.busy":"2022-08-21T08:43:25.951856Z","iopub.status.idle":"2022-08-21T08:46:12.506842Z","shell.execute_reply":"2022-08-21T08:46:12.505845Z","shell.execute_reply.started":"2022-08-21T08:43:25.952198Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"dbd72ecf0d36401ba26826f7d9a42540","version_major":2,"version_minor":0},"text/plain":["Writing Documents: 0%| | 0/50024 [00:00