File size: 8,476 Bytes
27bbfe3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c6d21d
27bbfe3
 
6c6d21d
27bbfe3
6c6d21d
27bbfe3
6c6d21d
27bbfe3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c6d21d
 
 
 
 
 
27bbfe3
 
6c6d21d
 
 
 
27bbfe3
6c6d21d
 
 
 
 
27bbfe3
 
 
6c6d21d
 
27bbfe3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# -*- coding: utf-8 -*-
# Install all the required libraries

# !pip install -U -q pdfplumber tiktoken openai chromaDB sentence-transformers

# Import all the required Libraries

import pdfplumber
from pathlib import Path
import pandas as pd
from operator import itemgetter
import json
import tiktoken
# import openai
import chromadb

# openai.api_key = open("api_key.txt", "r").read().strip()

def initialize_conversation():
    """
    Build the initial conversation/system prompt for the insurance Q&A assistant.

    Returns:
        list[str]: A single-element list containing the system prompt text
        (few-shot examples plus answering guidelines). Callers append user
        turns to this list / adapt it to their chat API's message format.
    """
    # NOTE: plain string (not an f-string) — there are no placeholders to fill.
    conversation = [
            """
            You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
            The document name is 'Group Life Insurance Policy' and it contains information about 3 different insurance policies 'Member Life Insurance', 'Member Accidental Death and Dismemberment Insurance' and 'Dependent Life Insurance'.
            Your task is to extract and present relevant information from the policy documents to answer the user’s query. The document excerpts are provided in the dataframe, with the actual policy text in the 'documents' column and metadata (page numbers) in the 'metadata' column.

            <EXAMPLE>
            INPUT: "What are the premium rates for different types of insurance under this policy?"

            OUTPUT:
            The premium rate(s) for each Member insured for Life Insurance will be:

            Premium Rates:
            1. Member Life Insurance: $0.210 for each $1,000 of insurance in force.
            2. Member Accidental Death and Dismemberment Insurance: $0.025 for each $1,000 of Member Life Insurance in force.
            3. Dependent Life Insurance: $1.46 for each Member insured for Dependent Life Insurance.

            Multiple Policy Discount: The Policyholder may be eligible for a multiple policy discount if they have at least two other eligible group insurance policies underwritten by The Principal.

            Citations: Policy Name: Group Life Insurance Policy, Page Number: 20.
            </EXAMPLE>

            <EXAMPLE>
            INPUT: "What are the Contributions from Members?"

            OUTPUT:
            Members are not required to contribute a part of the premium for their Member insurance under this Group Policy.
            Members are required to contribute a part of the premium for their Dependent's insurance under this Group Policy.

            Citations: Policy Name: Group Life Insurance Policy, Page Number: 20.
            </EXAMPLE>

            Guidelines:
            1. Extract information that directly answers the user's query from the document excerpts.
            2. Provide the final response as a well-formatted and easily readable text along with the citation.
            3. Provide your complete response using the relevant parts in the documents.
            4. The generated response should answer the query directly addressing the user and avoiding additional information.
            5. If the provided excerpts do not fully answer the query, provide partial information and suggest which sections of the policy document the user should review for further details.
            6. If no relevant information is found in the provided excerpts, respond with 'No relevant information found in the provided excerpts.'

            # Start with a short welcome message with smiley only in the beginning of the chat session and not in every response.
            """
    ]

    # Alternative message formats, depending on the chat API in use:
    # conversation = [{"role": "user", "parts": system_message}]
    # conversation = [{"role": "system", "content": system_message}]

    return conversation

# Import the SentenceTransformer Embedding Function into chroma
from chromadb.utils import embedding_functions
# Alternative embedding models that were tried; kept for reference:
# embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="paraphrase-mpnet-base-v2")
# embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="multi-qa-MiniLM-L6-cos-v1")
# Embedding function used to embed both documents and queries for this collection.
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

# Call PersistentClient() so the collections including cache can be stored in a permanent storage
# (default on-disk location; survives process restarts, unlike the in-memory client).
client = chromadb.PersistentClient()

"""
We will also implement a data/collection cache to improve the performance of the overall search system."""

# Set up the embedding function

def generate_embeddings(embedding_function):
    """
    Fetch the existing 'RAG_on_Insurance' Chroma collection.

    The supplied embedding function is attached to the collection handle so
    that queries are embedded with the same model as the stored documents.
    Note this retrieves a pre-built collection; it does not create one.
    """
    return client.get_collection(
        name='RAG_on_Insurance',
        embedding_function=embedding_function,
    )

# Module-level handle to the insurance collection, used by the search layer below.
insurance_collection = generate_embeddings(embedding_function)

"""##<font color = yellow> Search Layer

### Semantic Search with Cache

We will perform a semantic search of a query in the collections embeddings to get several top semantically similar results based on the *distance* parameter.
"""

# test query
# query = "What are the premium rates for different types of insurance under this policy?"
# query = "what are the benefits payable for different types of insurance under this policy?"
# query = "What are the Contributions from Members??"

"""#### Document retreival"""

# Implementing Cache in Semantic Search

def retreive_results(query):
    """
    Run a semantic search of `query` against the insurance collection and
    return the top matches.

    Parameters:
        query (str): The user's natural-language question.

    Returns:
        pandas.DataFrame: One row per hit with columns 'Metadatas',
        'Documents', 'Distances' and 'IDs', ordered by ascending distance
        as returned by Chroma.

    NOTE(review): the original version contained partially-implemented
    query-cache bookkeeping (a distance threshold and Keys/Values lists) that
    was never read and assumed exactly 10 hits; it has been removed as dead
    code. The function name's spelling is kept for caller compatibility.
    """
    # Query the collection against the user query and return the top 10 results.
    results = insurance_collection.query(
        query_texts=query,
        n_results=10,
    )

    # Chroma returns one nested list per query text; we issue a single query,
    # so index [0] flattens each field to a plain list of hits.
    result_dict = {
        'Metadatas': results['metadatas'][0],
        'Documents': results['documents'][0],
        'Distances': results['distances'][0],
        'IDs': results['ids'][0],
    }
    return pd.DataFrame.from_dict(result_dict)

# results_df = retreive_results(query, insurance_collection, cache_collection)
# results_df.head(5)

"""#### Re-Ranking with a Cross Encoder

We will perform Re-ranking of the search results using cross-encoder to move more relevant chunks at the top.
"""

# Import the CrossEncoder library from sentence_transformers
from sentence_transformers import CrossEncoder, util
# Initialise the cross encoder model
# Alternative re-ranking models that were tried; kept for reference:
# cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')
# cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
# Model used to score (query, document) pairs during re-ranking.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

#function to re-rank results using cross-encoder
def rerank_with_cross_encoder(query, results_df, top_k=3):

    # Input (query, response) pairs for each of the top 10 responses received from the semantic search to the cross encoder
    # Generate the cross_encoder scores for these pairs

    cross_inputs = [[query, response] for response in results_df['Documents']]
    cross_rerank_scores = cross_encoder.predict(cross_inputs)
    # print(cross_rerank_scores)

    # Store the rerank_scores in results_df
    results_df['Reranked_scores'] = cross_rerank_scores
    # print(results_df)

    # Return the top_kresults from semantic search
    top_semantic = results_df.sort_values(by='Distances')
    # print(top_semantic[:top_k])

    # Return the top_k results after reranking
    top_ranks_df = results_df.sort_values(by='Reranked_scores', ascending=False)
    # print(top_ranks[:top_k])

    top_docs = top_ranks_df[["Documents", "Metadatas"]][:top_k]
    # top_ranks = top_ranks[:][:top_k]
    print(top_docs)

    return top_docs #, top_ranks_df

# top_docs = rerank_with_cross_encoder(results_df)
# top_docs