# -*- coding: utf-8 -*-
# Install all the required libraries
# !pip install -U -q pdfplumber tiktoken openai chromadb sentence-transformers
# Import all the required Libraries
import pdfplumber
from pathlib import Path
import pandas as pd
from operator import itemgetter
import json
import tiktoken
# import openai
import chromadb
# openai.api_key = open("api_key.txt", "r").read().strip()
def initialize_conversation():
"""
Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
"""
conversation = [
f"""
You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
The document name is 'Group Life Insurance Policy' and it contains information about 3 different insurance policies: 'Member Life Insurance', 'Member Accidental Death and Dismemberment Insurance' and 'Dependent Life Insurance'.
Your task is to extract and present relevant information from the policy documents to answer the user’s query. The document excerpts are provided in the dataframe, with the actual policy text in the 'documents' column and metadata (page numbers) in the 'metadata' column.
<EXAMPLE>
INPUT: "What are the premium rates for different types of insurance under this policy?"
OUTPUT:
The premium rate(s) for each Member insured for Life Insurance will be:
Premium Rates:
1. Member Life Insurance: $0.210 for each $1,000 of insurance in force.
2. Member Accidental Death and Dismemberment Insurance: $0.025 for each $1,000 of Member Life Insurance in force.
3. Dependent Life Insurance: $1.46 for each Member insured for Dependent Life Insurance.
Multiple Policy Discount: The Policyholder may be eligible for a multiple policy discount if they have at least two other eligible group insurance policies underwritten by The Principal.
Citations: Policy Name: Group Life Insurance Policy, Page Number: 20.
</EXAMPLE>
<EXAMPLE>
INPUT: "What are the Contributions from Members?"
OUTPUT:
Members are not required to contribute a part of the premium for their Member insurance under this Group Policy.
Members are required to contribute a part of the premium for their Dependent's insurance under this Group Policy.
Citations: Policy Name: Group Life Insurance Policy, Page Number: 20.
</EXAMPLE>
Guidelines:
1. Extract information that directly answers the user's query from the document excerpts.
2. Provide the final response as well-formatted, easily readable text along with the citation.
3. Compose your complete response from the relevant parts of the documents.
4. Answer the query directly, addressing the user and avoiding unrelated additional information.
5. If the provided excerpts do not fully answer the query, provide the partial information available and suggest which sections of the policy document the user should review for further details.
6. If no relevant information is found in the provided excerpts, respond with 'No relevant information found in the provided excerpts.'
Start with a short welcome message with a smiley only at the beginning of the chat session, not in every response.
"""
]
# conversation = [{"role": "user", "parts": system_message}]
# conversation = [{"role": "system", "content": system_message}]
return conversation
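# A minimal sketch (not called anywhere in this script) of how the conversation
# list above could be sent to a chat model. The client setup and the
# "gpt-3.5-turbo" model name are assumptions based on the commented-out
# openai lines above, not confirmed by this file:
def get_chat_completion(conversation, user_query):
    from openai import OpenAI  # assumes openai>=1.0; reads OPENAI_API_KEY from the environment
    oai_client = OpenAI()
    messages = [{"role": "system", "content": conversation[0]},
                {"role": "user", "content": user_query}]
    response = oai_client.chat.completions.create(
        model="gpt-3.5-turbo",  # assumed model, per the GPT-3.5 reference in the docstring
        messages=messages
    )
    return response.choices[0].message.content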
# Import the SentenceTransformer Embedding Function into chroma
from chromadb.utils import embedding_functions
# embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="paraphrase-mpnet-base-v2")
# embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="multi-qa-MiniLM-L6-cos-v1")
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
# Use PersistentClient() so that the collections, including the cache, are stored in persistent storage
client = chromadb.PersistentClient()
"""
We will also implement a data/collection cache to improve the performance of the overall search system."""
# Set up the collection
def generate_embeddings(embedding_function):
    # Retrieve the existing collection from Chroma, passing the embedding_function
    # so that queries are embedded with the same model used to embed the documents
    insurance_collection = client.get_collection(name='RAG_on_Insurance', embedding_function=embedding_function)
    return insurance_collection
insurance_collection = generate_embeddings(embedding_function)
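# Note: get_collection() assumes the 'RAG_on_Insurance' collection was already
# created and populated in an earlier ingestion step (e.g. PDF chunks extracted
# with pdfplumber). A hedged sketch of that step, with placeholder documents,
# ids, and an assumed metadata key name; uncomment only for illustration:
# insurance_collection = client.get_or_create_collection(
#     name='RAG_on_Insurance', embedding_function=embedding_function)
# insurance_collection.add(
#     documents=["<policy text chunk>"],   # actual chunk text from the PDF
#     ids=["0"],                           # unique id per chunk
#     metadatas=[{"Page_No.": 20}]         # page-number metadata (key name assumed)
# )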
"""##<font color = yellow> Search Layer
### Semantic Search with Cache
We will perform a semantic search of a query in the collections embeddings to get several top semantically similar results based on the *distance* parameter.
"""
# test query
# query = "What are the premium rates for different types of insurance under this policy?"
# query = "what are the benefits payable for different types of insurance under this policy?"
# query = "What are the Contributions from Members??"
"""#### Document retreival"""
# Semantic search against the main collection (the query cache described
# above is sketched separately below)
def retrieve_results(query):
    # Query the collection with the user query and return the top 10 results
    results = insurance_collection.query(
        query_texts=query,
        n_results=10
    )
    # Flatten the top-10 results into a dataframe for the re-ranking step
    result_dict = {
        'Metadatas': results['metadatas'][0],
        'Documents': results['documents'][0],
        'Distances': results['distances'][0],
        'IDs': results['ids'][0]
    }
    results_df = pd.DataFrame.from_dict(result_dict)
    return results_df
# results_df = retrieve_results(query)
# results_df.head(5)
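# The query cache described in the Search Layer intro is not wired up in
# retrieve_results() above. A minimal sketch of the idea, assuming a separate
# Chroma collection memoizes previous queries (the collection name and the
# 0.2 distance threshold are assumptions, not part of the original pipeline):
cache_collection = client.get_or_create_collection(name='Insurance_Cache', embedding_function=embedding_function)

def retrieve_results_with_cache(query, threshold=0.2):
    # Look the query up in the cache first
    cache_hit = cache_collection.query(query_texts=query, n_results=1)
    if cache_hit['distances'][0] and cache_hit['distances'][0][0] <= threshold:
        # A semantically close query was answered before: rebuild the results
        # dataframe from the JSON stored alongside the cached query
        cached = json.loads(cache_hit['metadatas'][0][0]['results'])
        return pd.DataFrame.from_dict(cached)
    # Cache miss: query the main collection, then store the query and its
    # results in the cache for future lookups
    results_df = retrieve_results(query)
    cache_collection.add(
        documents=[query],
        ids=[str(cache_collection.count())],
        metadatas=[{'results': results_df.to_json()}]
    )
    return results_df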
"""#### Re-Ranking with a Cross Encoder
We will perform Re-ranking of the search results using cross-encoder to move more relevant chunks at the top.
"""
# Import the CrossEncoder library from sentence_transformers
from sentence_transformers import CrossEncoder, util
# Initialise the cross encoder model
# cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')
# cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
# Function to re-rank results using the cross-encoder
def rerank_with_cross_encoder(query, results_df, top_k=3):
    # Pair the query with each of the top 10 responses received from the
    # semantic search, then generate a cross-encoder score for each pair
    cross_inputs = [[query, response] for response in results_df['Documents']]
    cross_rerank_scores = cross_encoder.predict(cross_inputs)
    # Store the rerank scores in results_df
    results_df['Reranked_scores'] = cross_rerank_scores
    # Sort by the cross-encoder scores and return the top_k results
    top_ranks_df = results_df.sort_values(by='Reranked_scores', ascending=False)
    top_docs = top_ranks_df[["Documents", "Metadatas"]][:top_k]
    print(top_docs)
    return top_docs
# top_docs = rerank_with_cross_encoder(query, results_df)
# top_docs
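# A minimal end-to-end sketch of the retrieval pipeline defined above; the
# query is one of the test queries listed earlier. Uncomment to run:
# query = "What are the premium rates for different types of insurance under this policy?"
# results_df = retrieve_results(query)
# top_docs = rerank_with_cross_encoder(query, results_df)
# conversation = initialize_conversation()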