prithvirajpawar committed on
Commit
6c6d21d
·
1 Parent(s): dd8e7ac

removed and added many files

app/.DS_Store β†’ .DS_Store RENAMED
Binary files a/app/.DS_Store and b/.DS_Store differ
 
Principal-Sample-Life-Insurance-Policy.pdf DELETED
Binary file (223 kB)
 
app/__init__.py DELETED
File without changes
app/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (162 Bytes)
 
app/__pycache__/main.cpython-311.pyc DELETED
Binary file (4.77 kB)
 
helpmate_ai.py CHANGED
@@ -69,165 +69,6 @@ def initialize_conversation():
69
 
70
  return conversation
71
 
72
- """#### Read, Process, and Chunk the PDF File
73
-
74
- We will use the **pdfplumber** library to read and process the PDF files.
75
- """
76
-
77
- # Define the path of the PDF
78
- pdf_path = 'Principal-Sample-Life-Insurance-Policy.pdf'
79
-
80
- """Reading PDF file and exploring it for delimeters to decide chunking stategy
81
-
82
-
83
- """
84
-
85
- # Open the PDF file
86
- # with pdfplumber.open(pdf_path) as pdf:
87
-
88
- # # Get one of the pages from the PDF and examine it
89
- # single_page = pdf.pages[0]
90
-
91
- # # Extract text from the first page
92
- # text = single_page.extract_text()
93
-
94
- # # Print the extracted text
95
-
96
- # visible_text = text.replace("\n", "<NEWLINE>\n").replace("\t", "[TAB]").replace(" ", "[SPACE]")
97
- # print(visible_text)
98
- # print(text)
99
-
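For reference, a runnable version of the commented-out exploration above (a minimal sketch, assuming the sample PDF is still available locally):

```python
import pdfplumber

pdf_path = 'Principal-Sample-Life-Insurance-Policy.pdf'

# Open the PDF and inspect the first page to look for usable delimiters
with pdfplumber.open(pdf_path) as pdf:
    single_page = pdf.pages[0]
    text = single_page.extract_text()

    # Make whitespace visible to help decide on a chunking strategy
    visible_text = text.replace("\n", "<NEWLINE>\n").replace("\t", "[TAB]").replace(" ", "[SPACE]")
    print(visible_text)
```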
100
- """*Looking at the the file we will go fixed-size chunking strategy either page or certain token size. We will experiment with various token-size for optimal output.*
101
-
102
- #### Function to perform Page-Based Chunking
103
- """
104
-
105
- # Function to extract text page-wise from a PDF file.
106
- def extract_pages_from_pdf(pdf_path):
107
- # p = 0
108
- page_cunks = []
109
-
110
- # with pdfplumber.open(pdf_path) as pdf:
111
- pdf = pdfplumber.open(pdf_path)
112
- for page_no, page in enumerate(pdf.pages):
113
- # page_no = f"Page {p+1}"
114
- text = page.extract_text()
115
-
116
- page_cunks.append([page_no + 1, text])
117
- # p +=1
118
-
119
- return page_cunks
120
- page_cunks = extract_pages_from_pdf(pdf_path)
121
-
122
- # for page_chunk in page_cunks[0:5]:
123
- # print(page_chunk)
124
-
125
- """#### Functions to perform fixed size chunking using token-size
126
-
127
- We will be using the OpenAI 'gpt-3.5-turbo' model to generate answers, so we choose a chunk size that keeps the prompt within the model's token limit of 4096 (input and output combined).
128
- """
129
-
130
- # Load the tokenizer
131
- tokenizer = tiktoken.get_encoding("cl100k_base")
132
- # Define the token limit for each chunk
133
- TOKEN_SIZE = 512 # Adjust for optimal output
134
-
135
- def chunk_text_by_token_size(text, TOKEN_SIZE):
136
- # Tokenize the text
137
- tokens = tokenizer.encode(text)
138
-
139
- # Chunk the tokens into fixed-size chunks
140
- chunks = [tokens[i:i + TOKEN_SIZE] for i in range(0, len(tokens), TOKEN_SIZE)]
141
-
142
- # Convert the chunks back into text
143
- text_chunks = [tokenizer.decode(chunk) for chunk in chunks]
144
-
145
- return text_chunks
146
-
147
- def fixed_size_chunking_of_pdf(pdf_path):
148
- # Extract text from a PDF
149
- with pdfplumber.open(pdf_path) as pdf:
150
- # Initialize a list to store chunks
151
- all_chunks = []
152
-
153
- # Iterate over all the pages
154
- for page_no, page in enumerate(pdf.pages):
155
-
156
- # Extract text from the page
157
- text = page.extract_text()
158
-
159
- # Chunk the text based on token limit
160
- page_chunks = chunk_text_by_token_size(text, TOKEN_SIZE)
161
-
162
- for text_chunk in page_chunks:
163
- all_chunks.append([f"{page_no + 1}", text_chunk])
164
-
165
- return all_chunks
166
-
167
- # Append the chunks to the list
168
- all_chunks = fixed_size_chunking_of_pdf(pdf_path)
169
-
170
- # Example: Print the first chunk
171
- # for chunk in all_chunks[0:5]:
172
- # print(chunk)
173
-
174
- """We will store the chunks in a dataframe for further processng.
175
-
176
- Chunks shorter than about 10 words are likely empty pages or contain very few words, so they will be dropped.
177
-
178
- Depending on the chunking strategy, the relevant functions are called.
179
- """
180
-
181
- # functions for storing chunks in data frame for further processing
182
- def store_docs_to_df(chunks):
183
- # Initialize a list to store chunks
184
- data = []
185
- # Convert the extracted list to a DF, and add a column to store document names
186
- extracted_text_df = pd.DataFrame(chunks, columns=['Page No.', 'Text'])
187
- # Append the extracted text and Page number to the list
188
- data.append(extracted_text_df)
189
-
190
- # Concatenate all the DFs in the list 'data' together
191
- insurance_pdf_data = pd.concat(data, ignore_index=True)
192
- # insurance_pdfs_data.head(20)
193
-
194
- # Let's also check the length of all the texts, as there might be some empty pages or pages with very few words that we can drop
195
-
196
- insurance_pdf_data['Text_Length'] = insurance_pdf_data['Text'].apply(lambda x: len(x.split(' ')))
197
- insurance_pdf_data['Text_Length']
198
-
199
- # Retain only the rows with a text length of at least 10
200
-
201
- insurance_pdf_data = insurance_pdf_data.loc[insurance_pdf_data['Text_Length'] >= 10]
202
- # insurance_pdfs_data
203
-
204
- # Store the metadata for each page in a separate column
205
- # insurance_pdfs_data['Metadata'] = insurance_pdfs_data.apply(lambda x: {'Page No.': x['Page No.'], 'Chunk No': x['Chunk No']}, axis=1)
206
- insurance_pdf_data['Metadata'] = insurance_pdf_data.apply(lambda x: {'Page No.': x['Page No.']}, axis=1)
207
- # insurance_pdfs_data
208
-
209
- return insurance_pdf_data
210
-
211
- chunks_df = store_docs_to_df(page_cunks) # page based chunking
212
- # chunks_df = store_docs_to_df(all_chunks) # chunking based on size=token-size
213
-
214
- # chunks_df.tail(5)
215
-
216
- """## Generate and Store Embeddings
217
-
218
- In this section, we will embed the chunks and store them in a ChromaDB collection.
219
- """
220
-
221
- # Define the path where chroma collections will be stored
222
- chroma_data_path = '/content/drive/MyDrive/HelpMate_AI_Codes/ChromaDB_Data'
223
-
224
- # Import the OpenAI Embedding Function into chroma
225
- # from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
226
- # embedding_function = OpenAIEmbeddingFunction(
227
- # api_key=openai.api_key,
228
- # model_name="text-embedding-ada-002"
229
- # )
230
-
231
  # Import the SentenceTransformer Embedding Function into chroma
232
  from chromadb.utils import embedding_functions
233
  # embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="paraphrase-mpnet-base-v2")
@@ -238,64 +79,18 @@ embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(mo
238
  client = chromadb.PersistentClient()
239
 
240
  """
241
-
242
  We will also implement a data/collection cache to improve the performance of the overall search system."""
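The cache works by first checking whether a semantically similar query has already been answered; only on a miss does it query the main collection and then record the new query. A minimal sketch of that pattern, as implemented by the code this commit removes below (the threshold value and metadata layout are simplified assumptions):

```python
def cached_query(query, insurance_collection, cache_collection, threshold=0.2):
    # Look for a semantically similar previous query in the cache first
    cache_hit = cache_collection.query(query_texts=query, n_results=1)

    if cache_hit['distances'][0] and cache_hit['distances'][0][0] <= threshold:
        # Close enough to an earlier query: reuse the stored results
        return cache_hit['metadatas'][0][0]

    # Cache miss: query the main collection
    results = insurance_collection.query(query_texts=query, n_results=10)

    # Remember this query so similar questions can be served from the cache later
    cache_collection.add(
        documents=[query],
        ids=[query],
        metadatas=[{'documents0': str(results['documents'][0][0])}],
    )
    return results
```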
243
 
244
  # Set up the embedding function
245
 
246
- def generate_embeddings(chunks_df, embedding_function):
247
-
248
- all_collections = client.list_collections()
249
- collection_exists = any(col.name == 'RAG_on_Insurance' for col in all_collections)
250
-
251
- if collection_exists:
252
- client.delete_collection(name='RAG_on_Insurance')
253
 
254
  # Initialise a collection in chroma and pass the embedding_function to it so that it uses embedding model to embed the documents
255
- insurance_collection = client.get_or_create_collection(name='RAG_on_Insurance', embedding_function=embedding_function)
256
-
257
- # Convert the page text and metadata from your dataframe to lists to be able to pass it to chroma
258
- documents_list = chunks_df["Text"].tolist()
259
- metadata_list = chunks_df['Metadata'].tolist()
260
-
261
- # Add the documents and metadata to the collection along with generic integer IDs. You can also feed the metadata information as IDs by combining the policy name and page no.
262
-
263
- insurance_collection.add(
264
- documents= documents_list,
265
- ids = [str(i) for i in range(0, len(documents_list))],
266
- metadatas = metadata_list
267
- )
268
-
269
- collection_exists = any(col.name == 'Insurance_Cache' for col in all_collections)
270
-
271
- if collection_exists:
272
- client.delete_collection(name='Insurance_Cache')
273
-
274
- cache_collection = client.get_or_create_collection(name='Insurance_Cache', embedding_function=embedding_function)
275
-
276
- # print(client.list_collections())
277
-
278
- # print(cache_collection.peek())
279
-
280
- # cache_results = cache_collection.query(
281
- # query_texts=query,
282
- # n_results=1
283
- # )
284
-
285
- # print(cache_results)
286
-
287
- return insurance_collection, cache_collection
288
 
289
- insurance_collection, cache_collection = generate_embeddings(chunks_df, embedding_function)
290
- # insurance_collection.peek(5)
291
 
292
- # Let's take a look at the first few entries in the collection
293
- # sample = insurance_collection.peek(5)
294
- # sample
295
- # print(insurance_collection.get(
296
- # ids = ['4','5','6'],
297
- # include = ['documents', 'metadatas']
298
- # ))
299
 
300
  """##<font color = yellow> Search Layer
301
 
@@ -324,77 +119,29 @@ def retreive_results(query):
324
 
325
  results_df = pd.DataFrame()
326
 
327
- # Search the cache collection first
328
- # Query the collection against the user query and return the top 20 results
329
-
330
- cache_results = cache_collection.query(
331
- query_texts=query,
332
- n_results=1
333
  )
334
 
335
- # print(cache_results)
336
- # print(f"cache_results top distance: {cache_results['distances'][0][0]}")
 
 
337
 
338
- # If the distance is greater than the threshold, then return the results from the main collection.
339
- if cache_results['distances'][0] == [] or cache_results['distances'][0][0] > threshold:
340
- # Query the collection against the user query and return the top 10 results
341
- results = insurance_collection.query(
342
- query_texts=query,
343
- n_results=10
344
- )
345
-
346
- # Store the query in cache_collection as document w.r.t to ChromaDB so that it can be embedded and searched against later
347
- # Store retrieved text, ids, distances and metadatas in cache_collection as metadatas, so that they can be fetched easily if a query indeed matches to a query in cache
348
- Keys = []
349
- Values = []
350
-
351
- for key, val in results.items():
352
- if val is None:
353
- continue
354
- if key in ['ids', 'metadatas', 'documents', 'distances']:
355
- for i in range(10):
356
  Keys.append(str(key)+str(i))
357
  Values.append(str(val[0][i]))
358
- # print(key, i)
359
-
360
- cache_collection.add(
361
- documents= [query],
362
- ids = [query], # Or if you want to assign integers as IDs 0,1,2,.., then you can use "len(cache_results['documents'])" as will return the no. of queries currently in the cache and assign the next digit to the new query."
363
- metadatas = dict(zip(Keys, Values))
364
- )
365
-
366
- # print("Not found in cache. Found in main collection.")
367
-
368
- result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], "IDs":results["ids"][0]}
369
- results_df = pd.DataFrame.from_dict(result_dict)
370
-
371
- # If the distance is, however, less than the threshold, you can return the results from cache
372
-
373
- elif cache_results['distances'][0][0] <= threshold:
374
- cache_result_dict = cache_results['metadatas'][0][0]
375
-
376
- # Loop through each inner list and then through the dictionary
377
- for key, value in cache_result_dict.items():
378
- if 'ids' in key:
379
- ids.append(value)
380
- elif 'documents' in key:
381
- documents.append(value)
382
- elif 'distances' in key:
383
- distances.append(value)
384
- elif 'metadatas' in key:
385
- metadatas.append(value)
386
 
387
- print("Found in cache!")
388
-
389
- # Create a DataFrame
390
- results_df = pd.DataFrame({
391
- 'IDs': ids,
392
- 'Documents': documents,
393
- 'Distances': distances,
394
- 'Metadatas': metadatas
395
- })
396
-
397
- # print(results_df)
398
 
399
  return results_df
400
 
@@ -444,414 +191,6 @@ def rerank_with_cross_encoder(query, results_df, top_k=3):
444
  # top_docs = rerank_with_cross_encoder(results_df)
445
  # top_docs
446
 
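The re-ranking step scores each (query, document) pair with a cross-encoder and keeps the highest-scoring documents. A minimal sketch of that idea (not the exact `rerank_with_cross_encoder` implementation referenced above):

```python
from sentence_transformers import CrossEncoder

cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')

def rerank(query, documents, top_k=3):
    # Score each (query, document) pair; higher scores mean more relevant
    scores = cross_encoder.predict([[query, doc] for doc in documents])
    ranked = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)
    return [doc for doc, _ in ranked[:top_k]]
```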
447
- """##<font color = yellow> Generative Layer
448
-
449
- ### Retrieval Augmented Generation(RAG)
450
-
451
- We will now use OpenAI *gpt-3.5-turbo*, together with the user query and a prompt containing the top-ranked docs, to generate a direct answer to the query along with citations.
452
- """
453
-
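In essence, the commented-out variants below all build a prompt from the top-ranked chunks and send it to the chat model. A minimal sketch of that step (assuming `openai.api_key` has been configured elsewhere):

```python
import openai

def answer_with_gpt(query, top_docs):
    messages = [
        {"role": "system",
         "content": "You are a helpful insurance-domain assistant. Answer only from the "
                    "provided policy excerpts and cite the page numbers you used."},
        {"role": "user",
         "content": f"Query: {query}\n\nRelevant excerpts:\n{top_docs}"},
    ]
    response = openai.chat.completions.create(model="gpt-3.5-turbo", messages=messages)
    return response.choices[0].message.content
```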
454
- # # Define the function to generate the response. Provide a comprehensive prompt that passes the user query and the top 3 results to the model
455
-
456
- # def create_prompt(query, top_docs):
457
- # """
458
- # Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
459
- # """
460
- # prompt = [
461
- # {"role": "system", "content": "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
462
- # {"role": "user", "content": f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
463
- # You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents in the dataframe '{top_docs}'.
464
- # These search results are essentially one paragraph of an insurance document that may be relevant to the user query.
465
-
466
- # The column 'documents' inside this dataframe contains the actual text from the policy document and the column 'metadata' contains the source page.
467
-
468
- # The policy document describes about 3 different policies 'Member Life Insurance', 'Member Accidental Death and Dismemberment Insurance' and 'Dependent Life Insurance'
469
-
470
- # Use the documents in '{top_docs}' to answer the query '{query}'.
471
-
472
- # Follow the guidelines below when performing the task:
473
- # 1. Try to provide relevant/accurate numbers if available.
474
- # 2. You don’t have to necessarily use all the information in the dataframe. Only choose information that is relevant.
475
- # 3. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
476
- # 4. You are a customer facing assistant, so do not provide any information on internal workings, just answer the query directly.
477
- # 5. If you think that the query is not relevant to the document, reply that the query is irrelevant.
478
- # 6. Provide the final response as a well-formatted and easily readable text along with the citation.
479
- # 7. Provide your complete response using the relevant parts in the documents.
480
- # 8. The generated response should answer the query directly addressing the user and avoiding additional information.
481
- # 9. Provide the final response as a well-formatted and easily readable text.
482
-
483
- # """},
484
- # ]
485
-
486
- # return prompt
487
-
488
- # # Define the function to generate the response. Provide a comprehensive prompt that passes the user query and the top 3 results to the model
489
-
490
- # def create_prompt(query, top_docs):
491
- # """
492
- # Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
493
- # """
494
- # prompt = [
495
- # {"role": "system", "content": "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
496
- # {"role": "user", "content": f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
497
- # You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents in the dataframe '{top_docs}'. These search results are essentially one paragraph of an insurance document that may be relevant to the user query.
498
-
499
- # The column 'documents' inside this dataframe contains the actual text from the policy document and the column 'metadata' contains the source page.
500
-
501
- # The policy document describes about 3 different policies 'Member Life Insurance', 'Member Accidental Death and Dismemberment Insurance' and 'Dependent Life Insurance'
502
-
503
- # Use the documents in '{top_docs}' to answer the query '{query}'.
504
-
505
- # Follow the guidelines below when performing the task.
506
- # 1. Try to provide relevant/accurate numbers if available.
507
- # 2. You don’t have to necessarily use all the information in the dataframe. Only choose information that is relevant.
508
- # 4. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
509
- # 5. You are a customer facing assistant, so do not provide any information on internal workings, just answer the query directly.
510
- # 6. If you think that the query is not relevant to the document, reply that the query is irrelevant.
511
- # 7. Provide the final response as a well-formatted and easily readable text along with the citation.
512
- # 8. Provide your complete response using the relevant parts in the documents.
513
-
514
- # The generated response should answer the query directly addressing the user and avoiding additional information. Provide the final response as a well-formatted and easily readable text.
515
- # **Example 1:**
516
- # **Query**: "What are the benefits of the whole life insurance policy?"
517
- # **Search Results**: Dataframe contains an excerpt from a whole life insurance policy document: "The policy provides lifelong coverage, a guaranteed death benefit, and a cash value component that grows over time."
518
- # **Response**: "The whole life insurance policy offers lifelong coverage with a guaranteed death benefit. Additionally, it accumulates cash value over time, which can be accessed or borrowed against by the policyholder."
519
- # **Citations**: Policy Name: Lifetime Protection Plan, Page: 7
520
-
521
- # **Example 2:**
522
- # **Query**: "What is the death benefit for a final expense life insurance policy?"
523
- # **Search Results**: Dataframe contains a document with the following excerpt: "The final expense policy provides a death benefit of up to $10,000, intended to cover funeral costs and other end-of-life expenses."
524
- # **Response**: "The final expense life insurance policy provides a death benefit of up to $10,000, which is typically used to cover funeral costs and other end-of-life expenses."
525
- # **Citations**: Policy Name: Final Expense Protection, Page: 3
526
-
527
- # """},
528
- # ]
529
-
530
- # return prompt
531
-
532
- # # Define the function to generate the response. Provide a comprehensive prompt that passes the user query and the top 3 results to the model
533
-
534
- # def create_prompt(query, top_docs):
535
-
536
- # """
537
- # Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
538
- # """
539
- # prompt = [
540
- # {
541
- # "role": "system",
542
- # "content": "You are a helpful assistant that extracts relevant information from insurance policy documents to answer user queries accurately and concisely."
543
- # },
544
- # {
545
- # "role": "user",
546
- # "content": f"""
547
- # You are given a user query and a set of relevant insurance policy document excerpts retrieved by a Retrieval-Augmented Generation (RAG) system.
548
-
549
- # Your task is to extract and present relevant information from the policy documents to answer the user’s query. The document excerpts are provided in the dataframe '{top_docs}', with the actual policy text in the 'documents' column and metadata (page numbers) in the 'metadata' column.
550
-
551
- # The document name is 'Group Life Insurance Policy' and it contains information about 3 different insurance policies 'Member Life Insurance', 'Member Accidental Death and Dismemberment Insurance' and 'Dependent Life Insurance'.
552
-
553
- # Guidelines:
554
- # 1. Extract information that directly answers the user's query from the document excerpts.
555
- # 2. Organize the response using clear headings, bullet points, or tables where applicable.
556
- # 3. Cite the relevant policy name(s) and page number(s) using the metadata from the dataframe.
557
- # 4. If the provided excerpts do not fully answer the query, provide all available information and suggest which sections of the policy document the user should review for further details.
558
- # 5. If no relevant information is found in the provided excerpts, respond with 'No relevant information found in the provided excerpts.'
559
-
560
- # ### Example Query:
561
- # **User Query**: "What are the premium rates for different types of insurance under this policy?"
562
-
563
- # **Extracted Information**:
564
- # **Article 2 - Premium Rates**:
565
- # 1. **Member Life Insurance**: $0.210 for each $1,000 of insurance in force.
566
- # 2. **Member Accidental Death and Dismemberment Insurance**: $0.025 for each $1,000 of Member Life Insurance in force.
567
- # 3. **Dependent Life Insurance**: $1.46 for each Member insured for Dependent Life Insurance.
568
-
569
- # **Multiple Policy Discount**: The Policyholder may be eligible for a multiple policy discount if they have at least two other eligible group insurance policies underwritten by The Principal.
570
-
571
- # **Citations**: Policy Name: Group Life Insurance Policy, Page Number: 12.
572
-
573
- # ### Your Task:
574
- # The user query is: '{query}'
575
- # """
576
- # }
577
- # ]
578
- # return prompt
579
-
580
- # # function to create prompt having the top ranked docs and query.
581
-
582
- # def create_prompt(query, top_docs):
583
-
584
- # """
585
- # Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
586
- # """
587
- # prompt = [
588
- # {
589
- # "role": "system",
590
- # "content": "You are a helpful assistant that extracts relevant information from insurance policy documents to answer user queries accurately and concisely."
591
- # },
592
- # {
593
- # "role": "user",
594
- # "content": f"""
595
- # You are given a user query and a set of relevant insurance policy document excerpts retrieved by a Retrieval-Augmented Generation (RAG) system.
596
-
597
- # Your task is to extract and present relevant information from the policy documents to answer the user’s query. The document excerpts are provided in the dataframe '{top_docs}', with the actual policy text in the 'documents' column and metadata (page numbers) in the 'metadata' column.
598
-
599
- # The document name is 'Group Life Insurance Policy' and it contains information about 3 different insurance policies 'Member Life Insurance', 'Member Accidental Death and Dismemberment Insurance' and 'Dependent Life Insurance'.
600
-
601
- # Guidelines:
602
- # 1. Extract information that directly answers the user's query from the document excerpts.
603
- # 2. Organize the response using clear headings, bullet points, or tables where applicable.
604
- # 3. If the text includes tables with relevant information, reformat them into a clear, readable structure.
605
- # 4. Cite the relevant policy name(s) and page number(s) using the metadata from the dataframe.
606
- # 5. If the provided excerpts do not fully answer the query, provide partial information and suggest which sections of the policy document the user should review for further details.
607
- # 6. If no relevant information is found in the provided excerpts, respond with 'No relevant information found in the provided excerpts.'
608
-
609
- # ### Example Query:
610
- # **User Query**: "What are the premium rates for different types of insurance under this policy?"
611
-
612
- # **Premium Rates**:
613
- # 1. **Member Life Insurance**: $0.210 for each $1,000 of insurance in force.
614
- # 2. **Member Accidental Death and Dismemberment Insurance**: $0.025 for each $1,000 of Member Life Insurance in force.
615
- # 3. **Dependent Life Insurance**: $1.46 for each Member insured for Dependent Life Insurance.
616
-
617
- # **Multiple Policy Discount**: The Policyholder may be eligible for a multiple policy discount if they have at least two other eligible group insurance policies underwritten by The Principal.
618
-
619
- # **Citations**: Policy Name: Group Life Insurance Policy, Page Number: 12.
620
-
621
- # ### Your Task:
622
- # The user query is: '{query}'
623
- # """
624
- # }
625
- # ]
626
- # return prompt
627
-
628
- # prompt = create_prompt(query, top_docs)
629
-
630
- # # function to generate the response.
631
-
632
- # def generate_response(query, top_docs):
633
- # """
634
- # Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
635
- # """
636
- # messages = [
637
- # {"role": "system", "content": "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
638
- # {"role": "user", "content": f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
639
- # You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents in the dataframe '{top_docs}'. These search results are essentially one page of an insurance document that may be relevant to the user query.
640
-
641
- # The column 'documents' inside this dataframe contains the actual text from the policy document and the column 'metadata' contains the policy name and source page. The text inside the document may also contain tables in the format of a list of lists where each of the nested lists indicates a row.
642
-
643
- # Use the documents in '{top_docs}' to answer the query '{query}'. Frame an informative answer and also, use the dataframe to return the relevant policy names and page numbers as citations.
644
-
645
- # Follow the guidelines below when performing the task.
646
- # 1. Try to provide relevant/accurate numbers if available.
647
- # 2. You don’t have to necessarily use all the information in the dataframe. Only choose information that is relevant.
648
- # 3. If the document text has tables with relevant information, please reformat the table and return the final information in a tabular in format.
649
- # 3. Use the Metadatas columns in the dataframe to retrieve and cite the policy name(s) and page numbers(s) as citation.
650
- # 4. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
651
- # 5. You are a customer facing assistant, so do not provide any information on internal workings, just answer the query directly.
652
-
653
- # The generated response should answer the query directly addressing the user and avoiding additional information. If you think that the query is not relevant to the document, reply that the query is irrelevant. Provide the final response as a well-formatted and easily readable text along with the citation. Provide your complete response first with all information, and then provide the citations.
654
- # """},
655
- # ]
656
-
657
- # response = openai.chat.completions.create(
658
- # model="gpt-3.5-turbo",
659
- # messages=messages
660
- # )
661
-
662
- # return response.choices[0].message.content.split('\n')
663
-
664
- # response = generate_response(query, top_docs)
665
- # print(query + '\n')
666
- # print("\n".join(response))
667
-
668
- # function to generate the response.
669
-
670
- def generate_response(query, top_docs):
671
- """
672
- Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
673
- """
674
- messages = f"""
675
- Remember your system message and that you are a helpful assistant that extracts relevant information from insurance policy documents to answer user queries accurately and concisely.
676
- Your task is to extract and present relevant information from the policy documents to answer the user’s query.
677
- The document excerpts are provided in the dataframe '{top_docs}', with the actual policy text in the 'documents' column and metadata (page numbers) in the 'metadata' column.
678
- The user input is: '{query}'
679
- """
680
-
681
- # response = openai.chat.completions.create (
682
- # model="gpt
683
- ### Your Task:-3.5-turbo",
684
- # messages=messages
685
- # )
686
- conversation = [{"role": "user", "parts": messages}]
687
-
688
- return conversation #response.choices[0].message.content.split('\n')
689
-
690
- # response = generate_response(query, top_docs)
691
- # print(query + '\n')
692
- # print("\n".join(response))
693
-
694
- """## <font color = yellow> Query Search
695
-
696
- ### <font color = yellow> Query #1
697
- """
698
-
699
- # query1 = "what happens if failed to Pay Premium?"
700
-
701
- # results_df = retreive_results(query1, insurance_collection, cache_collection)
702
- # top_docs = rerank_with_cross_encoder(results_df)
703
- # top_docs
704
-
705
- # #generate response
706
- # response = generate_response(query1, top_docs)
707
-
708
- # print("\n".join(response))
709
-
710
- # """### <font color = yellow> Query #2"""
711
-
712
- # query2 = "what are the eligibility requirements for different types of insurance under this policy?"
713
-
714
- # results_df = retreive_results(query2, insurance_collection, cache_collection)
715
- # top_docs = rerank_with_cross_encoder(results_df)
716
- # top_docs
717
-
718
- # #generate response
719
- # response = generate_response(query2, top_docs)
720
- # print("\n".join(response))
721
-
722
- # """### <font color = yellow> Query #3"""
723
-
724
- # query3 = "What are the Termination Rights of the Policyholder?"
725
-
726
- # results_df = retreive_results(query3, insurance_collection, cache_collection)
727
- # top_docs = rerank_with_cross_encoder(results_df)
728
- # top_docs
729
-
730
- # #generate response
731
- # response = generate_response(query3, top_docs)
732
- # print("\n".join(response))
733
-
734
- # def run_pipeline(chunk_strategy,
735
- # embedding_function,
736
- # chroma_data_path,
737
- # query,
738
- # cross_encoder,
739
- # top_k,
740
- # rag_model,
741
- # prompt_style="default"):
742
-
743
- # # Embedding layer
744
- # # Preprocess documents
745
-
746
- # # Extract text
747
- # # Split into chunks
748
- # if chunk_strategy == "page":
749
- # docs = extract_pages_from_pdf(pdf_path)
750
- # elif chunk_strategy == "fixed_size":
751
- # docs = fixed_size_chunking_of_pdf(pdf_path)
752
-
753
- # docs_df = store_docs_to_df(docs)
754
-
755
- # # Generate embeddings and store in chromadb collection and cache
756
- # insurance_collection, cache_collection = generate_embeddings(docs_df, embedding_function)
757
-
758
- # # Retrieve documents relevant to query from collections and store in cache
759
- # results_df = retreive_results(query, insurance_collection, cache_collection)
760
-
761
- # # Re-rank with Cross Encoder
762
- # top_re_ranks, top_df = rerank_with_cross_encoder(results_df, top_k)
763
-
764
- # # Create prompt
765
- # prompt = create_prompt(query, top_re_ranks)
766
-
767
- # # Generate response
768
- # response = generate_response(prompt, rag_model)
769
-
770
- # return top_df, response
771
-
772
- # # select chunking strategy
773
-
774
- # # chunk_strategy = "page"
775
- # chunk_strategy = "fixed_size"
776
- # # Load the tokenizer
777
- # tokenizer = tiktoken.get_encoding("cl100k_base")
778
- # # Define the token limit for each chunk
779
- # TOKEN_SIZE = 500 # Adjust this based on your needs
780
-
781
- # # Import the OpenAI Embedding Function into chroma
782
- # from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
783
-
784
- # # select the model and initialise the embedding function
785
- # # model = "text-embedding-ada-002"
786
- # # embedding_function = OpenAIEmbeddingFunction(api_key=openai.api_key, model_name=model)
787
-
788
- # from chromadb.utils import embedding_functions
789
- # embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="paraphrase-mpnet-base-v2")
790
- # # embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="multi-qa-MiniLM-L6-cos-v1")
791
- # # embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
792
-
793
- # # Import the CrossEncoder library from sentence_transformers
794
- # from sentence_transformers import CrossEncoder, util
795
- # # Initialise the cross encoder model
796
- # cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')
797
- # # cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
798
- # # cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
799
-
800
- # # test query
801
- # # query = "what are the eligibility requirements?"
802
- # # query = "what are the eligibility requirements for different types of insurance under this policy?"
803
-
804
- # # query = "what are the benefits payable?"
805
- # # query = "what are the benefits payable for different types of insurance under this policy?"
806
- # # query = "What are the benefits payable of Member Accidental Death and Dismemberment Insurance?"
807
- # # query = "What are the benefits of Member Life Insurance?"
808
-
809
- # # query = "How much is the premium amount?"
810
- # # query = "How much is the premium amount for different types of insurance under this policy?"
811
-
812
- # # query = "How much is the premium rate?"
813
- # # query = "What are the premium rates for different types of insurance under this policy?"
814
- # # query = "What are the premium rates?"
815
-
816
- # # print(query)
817
-
818
- # # how much top query results to consider for generating response
819
- # top_k = 5
820
-
821
- # # select RAG model
822
- # rag_model = "gpt-3.5-turbo"
823
-
824
- # top_df, response = run_pipeline(chunk_strategy,
825
- # embedding_function,
826
- # chroma_data_path,
827
- # query,
828
- # cross_encoder,
829
- # top_k,
830
- # rag_model)
831
- # # results_df = run_pipeline(chunk_strategy,
832
- # # embedding_function,
833
- # # chroma_data_path,
834
- # # query,
835
- # # cross_encoder,
836
- # # top_k,
837
- # # rag_model)
838
-
839
- # # top_re_ranks = run_pipeline(chunk_strategy,
840
- # # embedding_function,
841
- # # chroma_data_path,
842
- # # query,
843
- # # cross_encoder,
844
- # # top_k,
845
- # # rag_model)
846
-
847
- # print("\n".join(response))
848
- # # print(prompt)
849
- # # top_re_ranks
850
- # # docs_df.head(100)
851
- # # top_semantic_search
852
- # top_df
853
- # # results_df
854
-
855
 
856
 
857
 
 
69
 
70
  return conversation
71
 
72
  # Import the SentenceTransformer Embedding Function into chroma
73
  from chromadb.utils import embedding_functions
74
  # embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="paraphrase-mpnet-base-v2")
 
79
  client = chromadb.PersistentClient()
80
 
81
  """
 
82
  We will also implement a data/collection cache to improve the performance of the overall search system."""
83
 
84
  # Set up the embedding function
85
 
86
+ def generate_embeddings(embedding_function):
87
 
88
  # Initialise a collection in chroma and pass the embedding_function to it so that it uses embedding model to embed the documents
89
+ insurance_collection = client.get_collection(name='RAG_on_Insurance', embedding_function=embedding_function)
90
 
91
+ return insurance_collection
 
92
 
93
+ insurance_collection = generate_embeddings(embedding_function)
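Note that `client.get_collection` raises if the persisted 'RAG_on_Insurance' collection has not been built yet; a small defensive variant (a sketch, not part of this commit) would fall back to creating it:

```python
def get_insurance_collection(client, embedding_function):
    try:
        return client.get_collection(name='RAG_on_Insurance',
                                     embedding_function=embedding_function)
    except Exception:
        # Fresh deployment: the persisted collection may not exist yet
        return client.get_or_create_collection(name='RAG_on_Insurance',
                                               embedding_function=embedding_function)
```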
94
 
95
  """##<font color = yellow> Search Layer
96
 
 
119
 
120
  results_df = pd.DataFrame()
121
 
122
+ # If the distance is greater than the threshold, then return the results from the main collection.
123
+
124
+ # Query the collection against the user query and return the top 10 results
125
+ results = insurance_collection.query(
126
+ query_texts=query,
127
+ n_results=10
128
  )
129
 
130
+ # Store the query in cache_collection as document w.r.t to ChromaDB so that it can be embedded and searched against later
131
+ # Store retrieved text, ids, distances and metadatas in cache_collection as metadatas, so that they can be fetched easily if a query indeed matches to a query in cache
132
+ Keys = []
133
+ Values = []
134
 
135
+ for key, val in results.items():
136
+ if val is None:
137
+ continue
138
+ if key in ['ids', 'metadatas', 'documents', 'distances']:
139
+ for i in range(10):
140
  Keys.append(str(key)+str(i))
141
  Values.append(str(val[0][i]))
142
 
143
+ result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], "IDs":results["ids"][0]}
144
+ results_df = pd.DataFrame.from_dict(result_dict)
145
 
146
  return results_df
147
 
 
191
  # top_docs = rerank_with_cross_encoder(results_df)
192
  # top_docs
193
 
194
 
195
 
196
 
app/main.py β†’ main.py RENAMED
@@ -6,9 +6,14 @@ from fastapi.staticfiles import StaticFiles
6
  from helpmate_ai import initialize_conversation, retreive_results, rerank_with_cross_encoder, generate_response
7
  import re
8
  import google.generativeai as genai
9
 
10
  # Configure Gemini API
11
- gemini_api_key = open("gemini_api_key.txt", "r").read().strip()
12
  genai.configure(api_key=gemini_api_key)
13
 
14
  # Initialize FastAPI app
 
6
  from helpmate_ai import initialize_conversation, retreive_results, rerank_with_cross_encoder, generate_response
7
  import re
8
  import google.generativeai as genai
9
+ import os
10
+ from dotenv import load_dotenv
11
 
12
  # Configure Gemini API
13
+ # gemini_api_key = open("gemini_api_key.txt", "r").read().strip()
14
+ load_dotenv()
15
+
16
+ gemini_api_key = os.getenv("GEMINI_API_KEY")
17
  genai.configure(api_key=gemini_api_key)
18
 
19
  # Initialize FastAPI app
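The new configuration reads the key from the environment instead of a tracked text file. A minimal sketch of the expected setup (the `.env` file itself stays out of version control; the variable name matches the `os.getenv` call above):

```python
# .env (not committed):
# GEMINI_API_KEY=your-key-here

import os
from dotenv import load_dotenv   # provided by the python-dotenv package
import google.generativeai as genai

load_dotenv()  # load key=value pairs from .env into the environment

gemini_api_key = os.getenv("GEMINI_API_KEY")
if not gemini_api_key:
    raise RuntimeError("GEMINI_API_KEY is not set; add it to .env or the environment")

genai.configure(api_key=gemini_api_key)
```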
requirements.txt CHANGED
@@ -7,6 +7,6 @@ fastapi
7
  uvicorn
8
  jinja2
9
  python-multipart
10
- pdfplumber
11
  sentence_transformers
12
- tiktoken
7
  uvicorn
8
  jinja2
9
  python-multipart
 
10
  sentence_transformers
11
+ os
12
+ dotenv
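A note on the two lines added above: `os` is part of the Python standard library, so nothing needs to be installed for it, and the `dotenv` module used in main.py is distributed on PyPI as `python-dotenv`, which is the name a working requirements.txt would list. The imports these entries are meant to back are:

```python
import os                        # standard library: no requirements.txt entry needed
from dotenv import load_dotenv   # install via the 'python-dotenv' package
```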