prithvirajpawar committed on
Commit
6c6d21d
·
1 Parent(s): dd8e7ac

removed and added many files

app/.DS_Store β†’ .DS_Store RENAMED
Binary files a/app/.DS_Store and b/.DS_Store differ
 
Principal-Sample-Life-Insurance-Policy.pdf DELETED
Binary file (223 kB)
 
app/__init__.py DELETED
File without changes
app/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (162 Bytes)
 
app/__pycache__/main.cpython-311.pyc DELETED
Binary file (4.77 kB)
 
helpmate_ai.py CHANGED
@@ -69,165 +69,6 @@ def initialize_conversation():
69
 
70
  return conversation
71
 
72
- """#### Read, Process, and Chunk the PDF File
73
-
74
- We will use the **pdfplumber** library to read and process the PDF files.
75
- """
76
-
77
- # Define the path of the PDF
78
- pdf_path = 'Principal-Sample-Life-Insurance-Policy.pdf'
79
-
80
- """Reading PDF file and exploring it for delimeters to decide chunking stategy
81
-
82
-
83
- """
84
-
85
- # Open the PDF file
86
- # with pdfplumber.open(pdf_path) as pdf:
87
-
88
- # # Get one of the pages from the PDF and examine it
89
- # single_page = pdf.pages[0]
90
-
91
- # # Extract text from the first page
92
- # text = single_page.extract_text()
93
-
94
- # # Print the extracted text
95
-
96
- # visible_text = text.replace("\n", "<NEWLINE>\n").replace("\t", "[TAB]").replace(" ", "[SPACE]")
97
- # print(visible_text)
98
- # print(text)
99
-
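For reference, a runnable version of the commented-out exploration above (a minimal sketch, assuming the sample PDF is still available locally):

```python
import pdfplumber

pdf_path = 'Principal-Sample-Life-Insurance-Policy.pdf'

# Open the PDF and inspect the first page to look for usable delimiters
with pdfplumber.open(pdf_path) as pdf:
    single_page = pdf.pages[0]
    text = single_page.extract_text()

    # Make whitespace visible to help decide on a chunking strategy
    visible_text = text.replace("\n", "<NEWLINE>\n").replace("\t", "[TAB]").replace(" ", "[SPACE]")
    print(visible_text)
```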
100
- """*Looking at the the file we will go fixed-size chunking strategy either page or certain token size. We will experiment with various token-size for optimal output.*
101
-
102
- #### Function to perform Page-Based Chunking
103
- """
104
-
105
- # Function to extract text page-wise from a PDF file.
106
- def extract_pages_from_pdf(pdf_path):
107
- # p = 0
108
- page_cunks = []
109
-
110
- # with pdfplumber.open(pdf_path) as pdf:
111
- pdf = pdfplumber.open(pdf_path)
112
- for page_no, page in enumerate(pdf.pages):
113
- # page_no = f"Page {p+1}"
114
- text = page.extract_text()
115
-
116
- page_cunks.append([page_no + 1, text])
117
- # p +=1
118
-
119
- return page_cunks
120
- page_cunks = extract_pages_from_pdf(pdf_path)
121
-
122
- # for page_chunk in page_cunks[0:5]:
123
- # print(page_chunk)
124
-
125
- """#### Functions to perform fixed size chunking using token-size
126
-
127
- We will be using the OpenAI 'gpt-3.5-turbo' model to generate answers, so we choose a chunk size that keeps the prompt within the model's token limit of 4096 (input and output combined).
128
- """
129
-
130
- # Load the tokenizer
131
- tokenizer = tiktoken.get_encoding("cl100k_base")
132
- # Define the token limit for each chunk
133
- TOKEN_SIZE = 512 # Adjust for optimal output
134
-
135
- def chunk_text_by_token_size(text, TOKEN_SIZE):
136
- # Tokenize the text
137
- tokens = tokenizer.encode(text)
138
-
139
- # Chunk the tokens into fixed-size chunks
140
- chunks = [tokens[i:i + TOKEN_SIZE] for i in range(0, len(tokens), TOKEN_SIZE)]
141
-
142
- # Convert the chunks back into text
143
- text_chunks = [tokenizer.decode(chunk) for chunk in chunks]
144
-
145
- return text_chunks
146
-
147
- def fixed_size_chunking_of_pdf(pdf_path):
148
- # Extract text from a PDF
149
- with pdfplumber.open(pdf_path) as pdf:
150
- # Initialize a list to store chunks
151
- all_chunks = []
152
-
153
- # Iterate over all the pages
154
- for page_no, page in enumerate(pdf.pages):
155
-
156
- # Extract text from the page
157
- text = page.extract_text()
158
-
159
- # Chunk the text based on token limit
160
- page_chunks = chunk_text_by_token_size(text, TOKEN_SIZE)
161
-
162
- for text_chunk in page_chunks:
163
- all_chunks.append([f"{page_no + 1}", text_chunk])
164
-
165
- return all_chunks
166
-
167
- # Append the chunks to the list
168
- all_chunks = fixed_size_chunking_of_pdf(pdf_path)
169
-
170
- # Example: Print the first chunk
171
- # for chunk in all_chunks[0:5]:
172
- # print(chunk)
173
-
174
- """We will store the chunks in a dataframe for further processng.
175
-
176
- Chunks shorter than about 10 words are likely empty pages or contain very few words, so they will be dropped.
177
-
178
- Depending on the chunking strategy, the relevant functions are called.
179
- """
180
-
181
- # functions for storing chunks in data frame for further processing
182
- def store_docs_to_df(chunks):
183
- # Initialize a list to store chunks
184
- data = []
185
- # Convert the extracted list to a DF, and add a column to store document names
186
- extracted_text_df = pd.DataFrame(chunks, columns=['Page No.', 'Text'])
187
- # Append the extracted text and Page number to the list
188
- data.append(extracted_text_df)
189
-
190
- # Concatenate all the DFs in the list 'data' together
191
- insurance_pdf_data = pd.concat(data, ignore_index=True)
192
- # insurance_pdfs_data.head(20)
193
-
194
- # Let's also check the length of all the texts, as there might be some empty pages or pages with very few words that we can drop
195
-
196
- insurance_pdf_data['Text_Length'] = insurance_pdf_data['Text'].apply(lambda x: len(x.split(' ')))
197
- insurance_pdf_data['Text_Length']
198
-
199
- # Retain only the rows with a text length of at least 10
200
-
201
- insurance_pdf_data = insurance_pdf_data.loc[insurance_pdf_data['Text_Length'] >= 10]
202
- # insurance_pdfs_data
203
-
204
- # Store the metadata for each page in a separate column
205
- # insurance_pdfs_data['Metadata'] = insurance_pdfs_data.apply(lambda x: {'Page No.': x['Page No.'], 'Chunk No': x['Chunk No']}, axis=1)
206
- insurance_pdf_data['Metadata'] = insurance_pdf_data.apply(lambda x: {'Page No.': x['Page No.']}, axis=1)
207
- # insurance_pdfs_data
208
-
209
- return insurance_pdf_data
210
-
211
- chunks_df = store_docs_to_df(page_cunks) # page based chunking
212
- # chunks_df = store_docs_to_df(all_chunks) # chunking based on size=token-size
213
-
214
- # chunks_df.tail(5)
215
-
216
- """## Generate and Store Embeddings
217
-
218
- In this section, we will embed the chunks and store them in a ChromaDB collection.
219
- """
220
-
221
- # Define the path where chroma collections will be stored
222
- chroma_data_path = '/content/drive/MyDrive/HelpMate_AI_Codes/ChromaDB_Data'
223
-
224
- # Import the OpenAI Embedding Function into chroma
225
- # from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
226
- # embedding_function = OpenAIEmbeddingFunction(
227
- # api_key=openai.api_key,
228
- # model_name="text-embedding-ada-002"
229
- # )
230
-
231
  # Import the SentenceTransformer Embedding Function into chroma
232
  from chromadb.utils import embedding_functions
233
  # embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="paraphrase-mpnet-base-v2")
@@ -238,64 +79,18 @@ embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(mo
238
  client = chromadb.PersistentClient()
239
 
240
  """
241
-
242
  We will also implement a data/collection cache to improve the performance of the overall search system."""
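The cache works by first checking whether a semantically similar query has already been answered; only on a miss does it query the main collection and then record the new query. A minimal sketch of that pattern, as implemented by the code this commit removes below (the threshold value and metadata layout are simplified assumptions):

```python
def cached_query(query, insurance_collection, cache_collection, threshold=0.2):
    # Look for a semantically similar previous query in the cache first
    cache_hit = cache_collection.query(query_texts=query, n_results=1)

    if cache_hit['distances'][0] and cache_hit['distances'][0][0] <= threshold:
        # Close enough to an earlier query: reuse the stored results
        return cache_hit['metadatas'][0][0]

    # Cache miss: query the main collection
    results = insurance_collection.query(query_texts=query, n_results=10)

    # Remember this query so similar questions can be served from the cache later
    cache_collection.add(
        documents=[query],
        ids=[query],
        metadatas=[{'documents0': str(results['documents'][0][0])}],
    )
    return results
```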
243
 
244
  # Set up the embedding function
245
 
246
- def generate_embeddings(chunks_df, embedding_function):
247
-
248
- all_collections = client.list_collections()
249
- collection_exists = any(col.name == 'RAG_on_Insurance' for col in all_collections)
250
-
251
- if collection_exists:
252
- client.delete_collection(name='RAG_on_Insurance')
253
 
254
  # Initialise a collection in chroma and pass the embedding_function to it so that it uses embedding model to embed the documents
255
- insurance_collection = client.get_or_create_collection(name='RAG_on_Insurance', embedding_function=embedding_function)
256
-
257
- # Convert the page text and metadata from your dataframe to lists to be able to pass it to chroma
258
- documents_list = chunks_df["Text"].tolist()
259
- metadata_list = chunks_df['Metadata'].tolist()
260
-
261
- # Add the documents and metadata to the collection along with generic integer IDs. You can also feed the metadata information as IDs by combining the policy name and page no.
262
-
263
- insurance_collection.add(
264
- documents= documents_list,
265
- ids = [str(i) for i in range(0, len(documents_list))],
266
- metadatas = metadata_list
267
- )
268
-
269
- collection_exists = any(col.name == 'Insurance_Cache' for col in all_collections)
270
-
271
- if collection_exists:
272
- client.delete_collection(name='Insurance_Cache')
273
-
274
- cache_collection = client.get_or_create_collection(name='Insurance_Cache', embedding_function=embedding_function)
275
-
276
- # print(client.list_collections())
277
-
278
- # print(cache_collection.peek())
279
-
280
- # cache_results = cache_collection.query(
281
- # query_texts=query,
282
- # n_results=1
283
- # )
284
-
285
- # print(cache_results)
286
-
287
- return insurance_collection, cache_collection
288
 
289
- insurance_collection, cache_collection = generate_embeddings(chunks_df, embedding_function)
290
- # insurance_collection.peek(5)
291
 
292
- # Let's take a look at the first few entries in the collection
293
- # sample = insurance_collection.peek(5)
294
- # sample
295
- # print(insurance_collection.get(
296
- # ids = ['4','5','6'],
297
- # include = ['documents', 'metadatas']
298
- # ))
299
 
300
  """##<font color = yellow> Search Layer
301
 
@@ -324,77 +119,29 @@ def retreive_results(query):
324
 
325
  results_df = pd.DataFrame()
326
 
327
- # Search the cache collection first
328
- # Query the collection against the user query and return the top 20 results
329
-
330
- cache_results = cache_collection.query(
331
- query_texts=query,
332
- n_results=1
333
  )
334
 
335
- # print(cache_results)
336
- # print(f"cache_results top distance: {cache_results['distances'][0][0]}")
 
 
337
 
338
- # If the distance is greater than the threshold, then return the results from the main collection.
339
- if cache_results['distances'][0] == [] or cache_results['distances'][0][0] > threshold:
340
- # Query the collection against the user query and return the top 10 results
341
- results = insurance_collection.query(
342
- query_texts=query,
343
- n_results=10
344
- )
345
-
346
- # Store the query in cache_collection as document w.r.t to ChromaDB so that it can be embedded and searched against later
347
- # Store retrieved text, ids, distances and metadatas in cache_collection as metadatas, so that they can be fetched easily if a query indeed matches to a query in cache
348
- Keys = []
349
- Values = []
350
-
351
- for key, val in results.items():
352
- if val is None:
353
- continue
354
- if key in ['ids', 'metadatas', 'documents', 'distances']:
355
- for i in range(10):
356
  Keys.append(str(key)+str(i))
357
  Values.append(str(val[0][i]))
358
- # print(key, i)
359
-
360
- cache_collection.add(
361
- documents= [query],
362
- ids = [query], # Or if you want to assign integers as IDs 0,1,2,.., then you can use "len(cache_results['documents'])" as will return the no. of queries currently in the cache and assign the next digit to the new query."
363
- metadatas = dict(zip(Keys, Values))
364
- )
365
-
366
- # print("Not found in cache. Found in main collection.")
367
-
368
- result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], "IDs":results["ids"][0]}
369
- results_df = pd.DataFrame.from_dict(result_dict)
370
-
371
- # If the distance is, however, less than the threshold, you can return the results from cache
372
-
373
- elif cache_results['distances'][0][0] <= threshold:
374
- cache_result_dict = cache_results['metadatas'][0][0]
375
-
376
- # Loop through each inner list and then through the dictionary
377
- for key, value in cache_result_dict.items():
378
- if 'ids' in key:
379
- ids.append(value)
380
- elif 'documents' in key:
381
- documents.append(value)
382
- elif 'distances' in key:
383
- distances.append(value)
384
- elif 'metadatas' in key:
385
- metadatas.append(value)
386
 
387
- print("Found in cache!")
388
-
389
- # Create a DataFrame
390
- results_df = pd.DataFrame({
391
- 'IDs': ids,
392
- 'Documents': documents,
393
- 'Distances': distances,
394
- 'Metadatas': metadatas
395
- })
396
-
397
- # print(results_df)
398
 
399
  return results_df
400
 
@@ -444,414 +191,6 @@ def rerank_with_cross_encoder(query, results_df, top_k=3):
444
  # top_docs = rerank_with_cross_encoder(results_df)
445
  # top_docs
446
 
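The re-ranking step scores each (query, document) pair with a cross-encoder and keeps the highest-scoring documents. A minimal sketch of that idea (not the exact `rerank_with_cross_encoder` implementation referenced above):

```python
from sentence_transformers import CrossEncoder

cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')

def rerank(query, documents, top_k=3):
    # Score each (query, document) pair; higher scores mean more relevant
    scores = cross_encoder.predict([[query, doc] for doc in documents])
    ranked = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)
    return [doc for doc, _ in ranked[:top_k]]
```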
447
- """##<font color = yellow> Generative Layer
448
-
449
- ### Retrieval Augmented Generation(RAG)
450
-
451
- We will now use OpenAI *gpt-3.5-turbo*, together with the user query and a prompt containing the top-ranked docs, to generate a direct answer to the query along with citations.
452
- """
453
-
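In essence, the commented-out variants below all build a prompt from the top-ranked chunks and send it to the chat model. A minimal sketch of that step (assuming `openai.api_key` has been configured elsewhere):

```python
import openai

def answer_with_gpt(query, top_docs):
    messages = [
        {"role": "system",
         "content": "You are a helpful insurance-domain assistant. Answer only from the "
                    "provided policy excerpts and cite the page numbers you used."},
        {"role": "user",
         "content": f"Query: {query}\n\nRelevant excerpts:\n{top_docs}"},
    ]
    response = openai.chat.completions.create(model="gpt-3.5-turbo", messages=messages)
    return response.choices[0].message.content
```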
454
- # # Define the function to generate the response. Provide a comprehensive prompt that passes the user query and the top 3 results to the model
455
-
456
- # def create_prompt(query, top_docs):
457
- # """
458
- # Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
459
- # """
460
- # prompt = [
461
- # {"role": "system", "content": "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
462
- # {"role": "user", "content": f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
463
- # You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents in the dataframe '{top_docs}'.
464
- # These search results are essentially one paragraph of an insurance document that may be relevant to the user query.
465
-
466
- # The column 'documents' inside this dataframe contains the actual text from the policy document and the column 'metadata' contains the source page.
467
-
468
- # The policy document describes about 3 different policies 'Member Life Insurance', 'Member Accidental Death and Dismemberment Insurance' and 'Dependent Life Insurance'
469
-
470
- # Use the documents in '{top_docs}' to answer the query '{query}'.
471
-
472
- # Follow the guidelines below when performing the task:
473
- # 1. Try to provide relevant/accurate numbers if available.
474
- # 2. You don’t have to necessarily use all the information in the dataframe. Only choose information that is relevant.
475
- # 3. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
476
- # 4. You are a customer facing assistant, so do not provide any information on internal workings, just answer the query directly.
477
- # 5. If you think that the query is not relevant to the document, reply that the query is irrelevant.
478
- # 6. Provide the final response as a well-formatted and easily readable text along with the citation.
479
- # 7. Provide your complete response using the relevant parts in the documents.
480
- # 8. The generated response should answer the query directly addressing the user and avoiding additional information.
481
- # 9. Provide the final response as a well-formatted and easily readable text.
482
-
483
- # """},
484
- # ]
485
-
486
- # return prompt
487
-
488
- # # Define the function to generate the response. Provide a comprehensive prompt that passes the user query and the top 3 results to the model
489
-
490
- # def create_prompt(query, top_docs):
491
- # """
492
- # Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
493
- # """
494
- # prompt = [
495
- # {"role": "system", "content": "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
496
- # {"role": "user", "content": f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
497
- # You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents in the dataframe '{top_docs}'. These search results are essentially one paragraph of an insurance document that may be relevant to the user query.
498
-
499
- # The column 'documents' inside this dataframe contains the actual text from the policy document and the column 'metadata' contains the source page.
500
-
501
- # The policy document describes about 3 different policies 'Member Life Insurance', 'Member Accidental Death and Dismemberment Insurance' and 'Dependent Life Insurance'
502
-
503
- # Use the documents in '{top_docs}' to answer the query '{query}'.
504
-
505
- # Follow the guidelines below when performing the task.
506
- # 1. Try to provide relevant/accurate numbers if available.
507
- # 2. You don’t have to necessarily use all the information in the dataframe. Only choose information that is relevant.
508
- # 4. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
509
- # 5. You are a customer facing assistant, so do not provide any information on internal workings, just answer the query directly.
510
- # 6. If you think that the query is not relevant to the document, reply that the query is irrelevant.
511
- # 7. Provide the final response as a well-formatted and easily readable text along with the citation.
512
- # 8. Provide your complete response using the relevant parts in the documents.
513
-
514
- # The generated response should answer the query directly addressing the user and avoiding additional information. Provide the final response as a well-formatted and easily readable text.
515
- # **Example 1:**
516
- # **Query**: "What are the benefits of the whole life insurance policy?"
517
- # **Search Results**: Dataframe contains an excerpt from a whole life insurance policy document: "The policy provides lifelong coverage, a guaranteed death benefit, and a cash value component that grows over time."
518
- # **Response**: "The whole life insurance policy offers lifelong coverage with a guaranteed death benefit. Additionally, it accumulates cash value over time, which can be accessed or borrowed against by the policyholder."
519
- # **Citations**: Policy Name: Lifetime Protection Plan, Page: 7
520
-
521
- # **Example 2:**
522
- # **Query**: "What is the death benefit for a final expense life insurance policy?"
523
- # **Search Results**: Dataframe contains a document with the following excerpt: "The final expense policy provides a death benefit of up to $10,000, intended to cover funeral costs and other end-of-life expenses."
524
- # **Response**: "The final expense life insurance policy provides a death benefit of up to $10,000, which is typically used to cover funeral costs and other end-of-life expenses."
525
- # **Citations**: Policy Name: Final Expense Protection, Page: 3
526
-
527
- # """},
528
- # ]
529
-
530
- # return prompt
531
-
532
- # # Define the function to generate the response. Provide a comprehensive prompt that passes the user query and the top 3 results to the model
533
-
534
- # def create_prompt(query, top_docs):
535
-
536
- # """
537
- # Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
538
- # """
539
- # prompt = [
540
- # {
541
- # "role": "system",
542
- # "content": "You are a helpful assistant that extracts relevant information from insurance policy documents to answer user queries accurately and concisely."
543
- # },
544
- # {
545
- # "role": "user",
546
- # "content": f"""
547
- # You are given a user query and a set of relevant insurance policy document excerpts retrieved by a Retrieval-Augmented Generation (RAG) system.
548
-
549
- # Your task is to extract and present relevant information from the policy documents to answer the user’s query. The document excerpts are provided in the dataframe '{top_docs}', with the actual policy text in the 'documents' column and metadata (page numbers) in the 'metadata' column.
550
-
551
- # The document name is 'Group Life Insurance Policy' and it contains information about 3 different insurance policies 'Member Life Insurance', 'Member Accidental Death and Dismemberment Insurance' and 'Dependent Life Insurance'.
552
-
553
- # Guidelines:
554
- # 1. Extract information that directly answers the user's query from the document excerpts.
555
- # 2. Organize the response using clear headings, bullet points, or tables where applicable.
556
- # 3. Cite the relevant policy name(s) and page number(s) using the metadata from the dataframe.
557
- # 4. If the provided excerpts do not fully answer the query, provide all available information and suggest which sections of the policy document the user should review for further details.
558
- # 5. If no relevant information is found in the provided excerpts, respond with 'No relevant information found in the provided excerpts.'
559
-
560
- # ### Example Query:
561
- # **User Query**: "What are the premium rates for different types of insurance under this policy?"
562
-
563
- # **Extracted Information**:
564
- # **Article 2 - Premium Rates**:
565
- # 1. **Member Life Insurance**: $0.210 for each $1,000 of insurance in force.
566
- # 2. **Member Accidental Death and Dismemberment Insurance**: $0.025 for each $1,000 of Member Life Insurance in force.
567
- # 3. **Dependent Life Insurance**: $1.46 for each Member insured for Dependent Life Insurance.
568
-
569
- # **Multiple Policy Discount**: The Policyholder may be eligible for a multiple policy discount if they have at least two other eligible group insurance policies underwritten by The Principal.
570
-
571
- # **Citations**: Policy Name: Group Life Insurance Policy, Page Number: 12.
572
-
573
- # ### Your Task:
574
- # The user query is: '{query}'
575
- # """
576
- # }
577
- # ]
578
- # return prompt
579
-
580
- # # function to create prompt having the top ranked docs and query.
581
-
582
- # def create_prompt(query, top_docs):
583
-
584
- # """
585
- # Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
586
- # """
587
- # prompt = [
588
- # {
589
- # "role": "system",
590
- # "content": "You are a helpful assistant that extracts relevant information from insurance policy documents to answer user queries accurately and concisely."
591
- # },
592
- # {
593
- # "role": "user",
594
- # "content": f"""
595
- # You are given a user query and a set of relevant insurance policy document excerpts retrieved by a Retrieval-Augmented Generation (RAG) system.
596
-
597
- # Your task is to extract and present relevant information from the policy documents to answer the user’s query. The document excerpts are provided in the dataframe '{top_docs}', with the actual policy text in the 'documents' column and metadata (page numbers) in the 'metadata' column.
598
-
599
- # The document name is 'Group Life Insurance Policy' and it contains information about 3 different insurance policies 'Member Life Insurance', 'Member Accidental Death and Dismemberment Insurance' and 'Dependent Life Insurance'.
600
-
601
- # Guidelines:
602
- # 1. Extract information that directly answers the user's query from the document excerpts.
603
- # 2. Organize the response using clear headings, bullet points, or tables where applicable.
604
- # 3. If the text includes tables with relevant information, reformat them into a clear, readable structure.
605
- # 4. Cite the relevant policy name(s) and page number(s) using the metadata from the dataframe.
606
- # 5. If the provided excerpts do not fully answer the query, provide partial information and suggest which sections of the policy document the user should review for further details.
607
- # 6. If no relevant information is found in the provided excerpts, respond with 'No relevant information found in the provided excerpts.'
608
-
609
- # ### Example Query:
610
- # **User Query**: "What are the premium rates for different types of insurance under this policy?"
611
-
612
- # **Premium Rates**:
613
- # 1. **Member Life Insurance**: $0.210 for each $1,000 of insurance in force.
614
- # 2. **Member Accidental Death and Dismemberment Insurance**: $0.025 for each $1,000 of Member Life Insurance in force.
615
- # 3. **Dependent Life Insurance**: $1.46 for each Member insured for Dependent Life Insurance.
616
-
617
- # **Multiple Policy Discount**: The Policyholder may be eligible for a multiple policy discount if they have at least two other eligible group insurance policies underwritten by The Principal.
618
-
619
- # **Citations**: Policy Name: Group Life Insurance Policy, Page Number: 12.
620
-
621
- # ### Your Task:
622
- # The user query is: '{query}'
623
- # """
624
- # }
625
- # ]
626
- # return prompt
627
-
628
- # prompt = create_prompt(query, top_docs)
629
-
630
- # # function to generate the response.
631
-
632
- # def generate_response(query, top_docs):
633
- # """
634
- # Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
635
- # """
636
- # messages = [
637
- # {"role": "system", "content": "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
638
- # {"role": "user", "content": f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
639
- # You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents in the dataframe '{top_docs}'. These search results are essentially one page of an insurance document that may be relevant to the user query.
640
-
641
- # The column 'documents' inside this dataframe contains the actual text from the policy document and the column 'metadata' contains the policy name and source page. The text inside the document may also contain tables in the format of a list of lists where each of the nested lists indicates a row.
642
-
643
- # Use the documents in '{top_docs}' to answer the query '{query}'. Frame an informative answer and also, use the dataframe to return the relevant policy names and page numbers as citations.
644
-
645
- # Follow the guidelines below when performing the task.
646
- # 1. Try to provide relevant/accurate numbers if available.
647
- # 2. You don’t have to necessarily use all the information in the dataframe. Only choose information that is relevant.
648
- # 3. If the document text has tables with relevant information, please reformat the table and return the final information in a tabular in format.
649
- # 3. Use the Metadatas columns in the dataframe to retrieve and cite the policy name(s) and page numbers(s) as citation.
650
- # 4. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
651
- # 5. You are a customer facing assistant, so do not provide any information on internal workings, just answer the query directly.
652
-
653
- # The generated response should answer the query directly addressing the user and avoiding additional information. If you think that the query is not relevant to the document, reply that the query is irrelevant. Provide the final response as a well-formatted and easily readable text along with the citation. Provide your complete response first with all information, and then provide the citations.
654
- # """},
655
- # ]
656
-
657
- # response = openai.chat.completions.create(
658
- # model="gpt-3.5-turbo",
659
- # messages=messages
660
- # )
661
-
662
- # return response.choices[0].message.content.split('\n')
663
-
664
- # response = generate_response(query, top_docs)
665
- # print(query + '\n')
666
- # print("\n".join(response))
667
-
668
- # function to generate the response.
669
-
670
- def generate_response(query, top_docs):
671
- """
672
- Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
673
- """
674
- messages = f"""
675
- Remember your system message and that you are a helpful assistant that extracts relevant information from insurance policy documents to answer user queries accurately and concisely.
676
- Your task is to extract and present relevant information from the policy documents to answer the user’s query.
677
- The document excerpts are provided in the dataframe '{top_docs}', with the actual policy text in the 'documents' column and metadata (page numbers) in the 'metadata' column.
678
- The user input is: '{query}'
679
- """
680
-
681
- # response = openai.chat.completions.create (
682
- # model="gpt
683
- ### Your Task:-3.5-turbo",
684
- # messages=messages
685
- # )
686
- conversation = [{"role": "user", "parts": messages}]
687
-
688
- return conversation #response.choices[0].message.content.split('\n')
689
-
690
- # response = generate_response(query, top_docs)
691
- # print(query + '\n')
692
- # print("\n".join(response))
693
-
694
- """## <font color = yellow> Query Search
695
-
696
- ### <font color = yellow> Query #1
697
- """
698
-
699
- # query1 = "what happens if failed to Pay Premium?"
700
-
701
- # results_df = retreive_results(query1, insurance_collection, cache_collection)
702
- # top_docs = rerank_with_cross_encoder(results_df)
703
- # top_docs
704
-
705
- # #generate response
706
- # response = generate_response(query1, top_docs)
707
-
708
- # print("\n".join(response))
709
-
710
- # """### <font color = yellow> Query #2"""
711
-
712
- # query2 = "what are the eligibility requirements for different types of insurance under this policy?"
713
-
714
- # results_df = retreive_results(query2, insurance_collection, cache_collection)
715
- # top_docs = rerank_with_cross_encoder(results_df)
716
- # top_docs
717
-
718
- # #generate response
719
- # response = generate_response(query2, top_docs)
720
- # print("\n".join(response))
721
-
722
- # """### <font color = yellow> Query #3"""
723
-
724
- # query3 = "What are the Termination Rights of the Policyholder?"
725
-
726
- # results_df = retreive_results(query3, insurance_collection, cache_collection)
727
- # top_docs = rerank_with_cross_encoder(results_df)
728
- # top_docs
729
-
730
- # #generate response
731
- # response = generate_response(query3, top_docs)
732
- # print("\n".join(response))
733
-
734
- # def run_pipeline(chunk_strategy,
735
- # embedding_function,
736
- # chroma_data_path,
737
- # query,
738
- # cross_encoder,
739
- # top_k,
740
- # rag_model,
741
- # prompt_style="default"):
742
-
743
- # # Embedding layer
744
- # # Preprocess documents
745
-
746
- # # Extract text
747
- # # Split into chunks
748
- # if chunk_strategy == "page":
749
- # docs = extract_pages_from_pdf(pdf_path)
750
- # elif chunk_strategy == "fixed_size":
751
- # docs = fixed_size_chunking_of_pdf(pdf_path)
752
-
753
- # docs_df = store_docs_to_df(docs)
754
-
755
- # # Generate embeddings and store in chromadb collection and cache
756
- # insurance_collection, cache_collection = generate_embeddings(docs_df, embedding_function)
757
-
758
- # # Retrieve documents relevant to query from collections and store in cache
759
- # results_df = retreive_results(query, insurance_collection, cache_collection)
760
-
761
- # # Re-rank with Cross Encoder
762
- # top_re_ranks, top_df = rerank_with_cross_encoder(results_df, top_k)
763
-
764
- # # Create prompt
765
- # prompt = create_prompt(query, top_re_ranks)
766
-
767
- # # Generate response
768
- # response = generate_response(prompt, rag_model)
769
-
770
- # return top_df, response
771
-
772
- # # select chunking strategy
773
-
774
- # # chunk_strategy = "page"
775
- # chunk_strategy = "fixed_size"
776
- # # Load the tokenizer
777
- # tokenizer = tiktoken.get_encoding("cl100k_base")
778
- # # Define the token limit for each chunk
779
- # TOKEN_SIZE = 500 # Adjust this based on your needs
780
-
781
- # # Import the OpenAI Embedding Function into chroma
782
- # from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
783
-
784
- # # select the model and initialise the embedding function
785
- # # model = "text-embedding-ada-002"
786
- # # embedding_function = OpenAIEmbeddingFunction(api_key=openai.api_key, model_name=model)
787
-
788
- # from chromadb.utils import embedding_functions
789
- # embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="paraphrase-mpnet-base-v2")
790
- # # embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="multi-qa-MiniLM-L6-cos-v1")
791
- # # embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
792
-
793
- # # Import the CrossEncoder library from sentence_transformers
794
- # from sentence_transformers import CrossEncoder, util
795
- # # Initialise the cross encoder model
796
- # cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')
797
- # # cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
798
- # # cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
799
-
800
- # # test query
801
- # # query = "what are the eligibility requirements?"
802
- # # query = "what are the eligibility requirements for different types of insurance under this policy?"
803
-
804
- # # query = "what are the benefits payable?"
805
- # # query = "what are the benefits payable for different types of insurance under this policy?"
806
- # # query = "What are the benefits payable of Member Accidental Death and Dismemberment Insurance?"
807
- # # query = "What are the benefits of Member Life Insurance?"
808
-
809
- # # query = "How much is the premium amount?"
810
- # # query = "How much is the premium amount for different types of insurance under this policy?"
811
-
812
- # # query = "How much is the premium rate?"
813
- # # query = "What are the premium rates for different types of insurance under this policy?"
814
- # # query = "What are the premium rates?"
815
-
816
- # # print(query)
817
-
818
- # # how much top query results to consider for generating response
819
- # top_k = 5
820
-
821
- # # select RAG model
822
- # rag_model = "gpt-3.5-turbo"
823
-
824
- # top_df, response = run_pipeline(chunk_strategy,
825
- # embedding_function,
826
- # chroma_data_path,
827
- # query,
828
- # cross_encoder,
829
- # top_k,
830
- # rag_model)
831
- # # results_df = run_pipeline(chunk_strategy,
832
- # # embedding_function,
833
- # # chroma_data_path,
834
- # # query,
835
- # # cross_encoder,
836
- # # top_k,
837
- # # rag_model)
838
-
839
- # # top_re_ranks = run_pipeline(chunk_strategy,
840
- # # embedding_function,
841
- # # chroma_data_path,
842
- # # query,
843
- # # cross_encoder,
844
- # # top_k,
845
- # # rag_model)
846
-
847
- # print("\n".join(response))
848
- # # print(prompt)
849
- # # top_re_ranks
850
- # # docs_df.head(100)
851
- # # top_semantic_search
852
- # top_df
853
- # # results_df
854
-
855
 
856
 
857
 
 
69
 
70
  return conversation
71
 
72
  # Import the SentenceTransformer Embedding Function into chroma
73
  from chromadb.utils import embedding_functions
74
  # embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="paraphrase-mpnet-base-v2")
 
79
  client = chromadb.PersistentClient()
80
 
81
  """
 
82
  We will also implement a data/collection cache to improve the performance of the overall search system."""
83
 
84
  # Set up the embedding function
85
 
86
+ def generate_embeddings(embedding_function):
87
 
88
  # Initialise a collection in chroma and pass the embedding_function to it so that it uses embedding model to embed the documents
89
+ insurance_collection = client.get_collection(name='RAG_on_Insurance', embedding_function=embedding_function)
90
 
91
+ return insurance_collection
 
92
 
93
+ insurance_collection = generate_embeddings(embedding_function)
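Note that `client.get_collection` raises if the persisted 'RAG_on_Insurance' collection has not been built yet; a small defensive variant (a sketch, not part of this commit) would fall back to creating it:

```python
def get_insurance_collection(client, embedding_function):
    try:
        return client.get_collection(name='RAG_on_Insurance',
                                     embedding_function=embedding_function)
    except Exception:
        # Fresh deployment: the persisted collection may not exist yet
        return client.get_or_create_collection(name='RAG_on_Insurance',
                                               embedding_function=embedding_function)
```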
94
 
95
  """##<font color = yellow> Search Layer
96
 
 
119
 
120
  results_df = pd.DataFrame()
121
 
122
+ # If the distance is greater than the threshold, then return the results from the main collection.
123
+
124
+ # Query the collection against the user query and return the top 10 results
125
+ results = insurance_collection.query(
126
+ query_texts=query,
127
+ n_results=10
128
  )
129
 
130
+ # Store the query in cache_collection as document w.r.t to ChromaDB so that it can be embedded and searched against later
131
+ # Store retrieved text, ids, distances and metadatas in cache_collection as metadatas, so that they can be fetched easily if a query indeed matches to a query in cache
132
+ Keys = []
133
+ Values = []
134
 
135
+ for key, val in results.items():
136
+ if val is None:
137
+ continue
138
+ if key in ['ids', 'metadatas', 'documents', 'distances']:
139
+ for i in range(10):
140
  Keys.append(str(key)+str(i))
141
  Values.append(str(val[0][i]))
142
 
143
+ result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], "IDs":results["ids"][0]}
144
+ results_df = pd.DataFrame.from_dict(result_dict)
145
 
146
  return results_df
147
 
 
191
  # top_docs = rerank_with_cross_encoder(results_df)
192
  # top_docs
193
 
194
 
195
 
196
 
app/main.py β†’ main.py RENAMED
@@ -6,9 +6,14 @@ from fastapi.staticfiles import StaticFiles
6
  from helpmate_ai import initialize_conversation, retreive_results, rerank_with_cross_encoder, generate_response
7
  import re
8
  import google.generativeai as genai
9
 
10
  # Configure Gemini API
11
- gemini_api_key = open("gemini_api_key.txt", "r").read().strip()
12
  genai.configure(api_key=gemini_api_key)
13
 
14
  # Initialize FastAPI app
 
6
  from helpmate_ai import initialize_conversation, retreive_results, rerank_with_cross_encoder, generate_response
7
  import re
8
  import google.generativeai as genai
9
+ import os
10
+ from dotenv import load_dotenv
11
 
12
  # Configure Gemini API
13
+ # gemini_api_key = open("gemini_api_key.txt", "r").read().strip()
14
+ load_dotenv()
15
+
16
+ gemini_api_key = os.getenv("GEMINI_API_KEY")
17
  genai.configure(api_key=gemini_api_key)
18
 
19
  # Initialize FastAPI app
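The new configuration reads the key from the environment instead of a tracked text file. A minimal sketch of the expected setup (the `.env` file itself stays out of version control; the variable name matches the `os.getenv` call above):

```python
# .env (not committed):
# GEMINI_API_KEY=your-key-here

import os
from dotenv import load_dotenv   # provided by the python-dotenv package
import google.generativeai as genai

load_dotenv()  # load key=value pairs from .env into the environment

gemini_api_key = os.getenv("GEMINI_API_KEY")
if not gemini_api_key:
    raise RuntimeError("GEMINI_API_KEY is not set; add it to .env or the environment")

genai.configure(api_key=gemini_api_key)
```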
requirements.txt CHANGED
@@ -7,6 +7,6 @@ fastapi
7
  uvicorn
8
  jinja2
9
  python-multipart
10
- pdfplumber
11
  sentence_transformers
12
- tiktoken
7
  uvicorn
8
  jinja2
9
  python-multipart
 
10
  sentence_transformers
11
+ os
12
+ dotenv
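A note on the two lines added above: `os` is part of the Python standard library, so nothing needs to be installed for it, and the `dotenv` module used in main.py is distributed on PyPI as `python-dotenv`, which is the name a working requirements.txt would list. The imports these entries are meant to back are:

```python
import os                        # standard library: no requirements.txt entry needed
from dotenv import load_dotenv   # install via the 'python-dotenv' package
```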