{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro-overview",
   "metadata": {},
   "source": [
    "# Parent Document Retriever\n",
    "\n",
    "Index hotel offering and review text files with LangChain's `ParentDocumentRetriever`, first with whole documents as parents and then with `chunk_size=2000` parent chunks, and expose a `RetrievalQA` chain through a small Gradio UI.\n",
    "\n",
    "Stored outputs were cleared: they came from an out-of-order run and embedded raw review data. Use **Restart Kernel -> Run All** to regenerate them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "69b6dd68",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard library\n",
    "import json\n",
    "import os\n",
    "from getpass import getpass\n",
    "from pprint import pprint\n",
    "\n",
    "# Third-party\n",
    "import gradio as gr\n",
    "import pandas as pd\n",
    "from datasets import load_dataset\n",
    "from langchain.chains import RetrievalQA\n",
    "from langchain.document_loaders import (\n",
    "    DirectoryLoader,\n",
    "    JSONLoader,\n",
    "    TextLoader,\n",
    "    UnstructuredFileLoader,\n",
    ")\n",
    "from langchain.embeddings import OpenAIEmbeddings\n",
    "from langchain.llms import OpenAI\n",
    "from langchain.retrievers import ParentDocumentRetriever\n",
    "from langchain.storage import InMemoryStore\n",
    "from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter\n",
    "from langchain.vectorstores import Chroma"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-config",
   "metadata": {},
   "source": [
    "## Configuration\n",
    "\n",
    "The OpenAI API key is taken from the `OPENAI_API_KEY` environment variable, with an interactive prompt as a fallback. Never paste the key into a cell: a key that was ever saved in a notebook file must be treated as compromised and revoked."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6106eb87",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prompt only when the variable is missing, so the secret is never written\n",
    "# into the notebook file.\n",
    "if not os.environ.get(\"OPENAI_API_KEY\"):\n",
    "    os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key: \")\n",
    "\n",
    "# Text files to index.\n",
    "DATA_FILES = [\n",
    "    \"offering_0.001.txt\",\n",
    "    # \"proper_review.txt\",  # was already disabled in the original loader list\n",
    "    \"output_proper_review_chunk_1.txt\",\n",
    "]\n",
    "# UTF-8 decodes pure-ASCII files identically and does not fail on non-ASCII text.\n",
    "FILE_ENCODING = \"utf-8\"\n",
    "\n",
    "# chunk_size values for the child chunks (embedded) and the parent chunks (returned).\n",
    "CHILD_CHUNK_SIZE = 400\n",
    "PARENT_CHUNK_SIZE = 2000\n",
    "\n",
    "# Query string used for the retrieval spot checks below.\n",
    "SEARCH_QUERY = \"Hotel_Times_Square\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-load",
   "metadata": {},
   "source": [
    "## Load documents\n",
    "\n",
    "Each file is loaded with a `TextLoader`. In an earlier run the first document was 1604 characters long."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82bab809",
   "metadata": {},
   "outputs": [],
   "source": [
    "loaders = [TextLoader(path, encoding=FILE_ENCODING) for path in DATA_FILES]\n",
    "\n",
    "docs = []\n",
    "for loader in loaders:\n",
    "    docs.extend(loader.load())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ddeb88e",
   "metadata": {},
   "outputs": [],
   "source": [
    "docs[0].page_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d176f732",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(docs[0].page_content)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-full-docs",
   "metadata": {},
   "source": [
    "## Retriever 1: whole documents as parents\n",
    "\n",
    "No `parent_splitter` is passed here, only a `child_splitter`. An earlier run listed 2 keys in the docstore, matching the 2 loaded files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1eaddb14",
   "metadata": {},
   "outputs": [],
   "source": [
    "# This text splitter is used to create the child documents\n",
    "child_splitter = RecursiveCharacterTextSplitter(chunk_size=CHILD_CHUNK_SIZE)\n",
    "\n",
    "# The vectorstore to use to index the child chunks\n",
    "full_doc_vectorstore = Chroma(\n",
    "    collection_name=\"documents\",\n",
    "    embedding_function=OpenAIEmbeddings(),\n",
    ")\n",
    "\n",
    "# Dedicated docstore for this retriever, so the inspection cell below keeps\n",
    "# looking at the same store after the second retriever has been built.\n",
    "full_doc_store = InMemoryStore()\n",
    "\n",
    "full_doc_retriever = ParentDocumentRetriever(\n",
    "    vectorstore=full_doc_vectorstore,\n",
    "    docstore=full_doc_store,\n",
    "    child_splitter=child_splitter,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eda754d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "full_doc_retriever.add_documents(docs, ids=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a8a0e79",
   "metadata": {},
   "outputs": [],
   "source": [
    "list(full_doc_store.yield_keys())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-split-parents",
   "metadata": {},
   "source": [
    "## Retriever 2: split parent chunks\n",
    "\n",
    "A `parent_splitter` is added on top of the same `child_splitter`. An earlier run produced 9 keys in the docstore; for the query `Hotel_Times_Square` the similarity search returned 4 child chunks and the retriever returned 2 documents."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "50bc42f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "parent_splitter = RecursiveCharacterTextSplitter(chunk_size=PARENT_CHUNK_SIZE)\n",
    "\n",
    "# `child_splitter` from the first retriever is reused; it has the same chunk size.\n",
    "vectorstore = Chroma(\n",
    "    collection_name=\"split_parents\",\n",
    "    embedding_function=OpenAIEmbeddings(),\n",
    ")\n",
    "\n",
    "store = InMemoryStore()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e8759de",
   "metadata": {},
   "outputs": [],
   "source": [
    "larger_chunks_retriever = ParentDocumentRetriever(\n",
    "    vectorstore=vectorstore,\n",
    "    docstore=store,\n",
    "    child_splitter=child_splitter,\n",
    "    parent_splitter=parent_splitter,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "48adee3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "larger_chunks_retriever.add_documents(docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "264fe212",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(list(store.yield_keys()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f7494db8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Query the vector store directly: this returns the small child chunks.\n",
    "sub_docs = vectorstore.similarity_search(SEARCH_QUERY)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09c8576d",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(sub_docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b563a4d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "pprint(sub_docs[0].page_content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db71de1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Query through the retriever: this returns documents from the docstore.\n",
    "retrieved_docs = larger_chunks_retriever.get_relevant_documents(SEARCH_QUERY)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a36da8be",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(retrieved_docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85ee2872",
   "metadata": {},
   "outputs": [],
   "source": [
    "pprint(retrieved_docs[1].page_content)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-qa",
   "metadata": {},
   "source": [
    "## Question answering\n",
    "\n",
    "A `RetrievalQA` chain of type `stuff` is built on top of `larger_chunks_retriever`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d5c455b",
   "metadata": {},
   "outputs": [],
   "source": [
    "qa = RetrievalQA.from_chain_type(\n",
    "    llm=OpenAI(),\n",
    "    chain_type=\"stuff\",\n",
    "    retriever=larger_chunks_retriever,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ccfb7151",
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"What amenities and services stood out to the reviewer during their stay at the hotel?\"\n",
    "qa.run(query)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-ui",
   "metadata": {},
   "source": [
    "## Gradio UI\n",
    "\n",
    "Serves the chain behind a single text box."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf0d84f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `qa.run` returns only the answer string. Passing the chain object itself would\n",
    "# return a dict of inputs and outputs, which the text output would show verbatim.\n",
    "iface = gr.Interface(\n",
    "    fn=qa.run,\n",
    "    inputs=\"text\",\n",
    "    outputs=\"text\",\n",
    "    title=\"Parent Document Retriever\",\n",
    ")\n",
    "\n",
    "iface.launch()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}