prithvirajpawar committed
Commit 27bbfe3 · Parent(s): 4a43b26

Deploy FastAPI app

Files changed (47)
  1. Dockerfile +17 -0
  2. app/.DS_Store +0 -0
  3. app/__init__.py +0 -0
  4. app/__pycache__/__init__.cpython-311.pyc +0 -0
  5. app/__pycache__/main.cpython-311.pyc +0 -0
  6. app/main.py +90 -0
  7. chroma/.DS_Store +0 -0
  8. chroma/198995c5-3e47-440b-98d8-b095e7b992c3/data_level0.bin +3 -0
  9. chroma/198995c5-3e47-440b-98d8-b095e7b992c3/header.bin +3 -0
  10. chroma/198995c5-3e47-440b-98d8-b095e7b992c3/length.bin +3 -0
  11. chroma/198995c5-3e47-440b-98d8-b095e7b992c3/link_lists.bin +0 -0
  12. chroma/chroma.sqlite3 +3 -0
  13. chroma/ed2d8aa0-0cf3-4a31-b498-d3e7dcdc5566/data_level0.bin +3 -0
  14. chroma/ed2d8aa0-0cf3-4a31-b498-d3e7dcdc5566/header.bin +3 -0
  15. chroma/ed2d8aa0-0cf3-4a31-b498-d3e7dcdc5566/length.bin +3 -0
  16. chroma/ed2d8aa0-0cf3-4a31-b498-d3e7dcdc5566/link_lists.bin +0 -0
  17. chroma/f190ba25-fd88-4232-a7dc-2390c45704d4/data_level0.bin +3 -0
  18. chroma/f190ba25-fd88-4232-a7dc-2390c45704d4/header.bin +3 -0
  19. chroma/f190ba25-fd88-4232-a7dc-2390c45704d4/length.bin +3 -0
  20. chroma/f190ba25-fd88-4232-a7dc-2390c45704d4/link_lists.bin +0 -0
  21. chroma/f28c5b78-1366-45d0-b9f9-10f0ba9054ed/data_level0.bin +3 -0
  22. chroma/f28c5b78-1366-45d0-b9f9-10f0ba9054ed/header.bin +3 -0
  23. chroma/f28c5b78-1366-45d0-b9f9-10f0ba9054ed/length.bin +3 -0
  24. chroma/f28c5b78-1366-45d0-b9f9-10f0ba9054ed/link_lists.bin +0 -0
  25. chroma/fc83983d-80db-49a4-a6fb-42834cc83377/data_level0.bin +3 -0
  26. chroma/fc83983d-80db-49a4-a6fb-42834cc83377/header.bin +3 -0
  27. chroma/fc83983d-80db-49a4-a6fb-42834cc83377/length.bin +3 -0
  28. chroma/fc83983d-80db-49a4-a6fb-42834cc83377/link_lists.bin +0 -0
  29. chroma/fd3b321e-3bab-4583-88b0-405d918cd938/data_level0.bin +3 -0
  30. chroma/fd3b321e-3bab-4583-88b0-405d918cd938/header.bin +3 -0
  31. chroma/fd3b321e-3bab-4583-88b0-405d918cd938/length.bin +3 -0
  32. chroma/fd3b321e-3bab-4583-88b0-405d918cd938/link_lists.bin +0 -0
  33. chroma/fd9978b6-a866-4bec-9cea-1215ebeb34e9/data_level0.bin +3 -0
  34. chroma/fd9978b6-a866-4bec-9cea-1215ebeb34e9/header.bin +3 -0
  35. chroma/fd9978b6-a866-4bec-9cea-1215ebeb34e9/length.bin +3 -0
  36. chroma/fd9978b6-a866-4bec-9cea-1215ebeb34e9/link_lists.bin +0 -0
  37. chroma/ff952c54-2ce3-4bd1-b8d0-3430b4745e3e/data_level0.bin +3 -0
  38. chroma/ff952c54-2ce3-4bd1-b8d0-3430b4745e3e/header.bin +3 -0
  39. chroma/ff952c54-2ce3-4bd1-b8d0-3430b4745e3e/length.bin +3 -0
  40. chroma/ff952c54-2ce3-4bd1-b8d0-3430b4745e3e/link_lists.bin +0 -0
  41. helpmate_ai.py +860 -0
  42. requirements.txt +12 -0
  43. static/css/styles.css +100 -0
  44. static/send-icon.png +0 -0
  45. templates/index_bye.html +21 -0
  46. templates/index_hello.html +21 -0
  47. templates/index_invite.html +51 -0
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ # Use Python base image
+ FROM python:3.9-slim
+
+ # Set the working directory
+ WORKDIR /app
+
+ # Copy project files to the container
+ COPY . /app
+
+ # Install dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Expose the default FastAPI port
+ EXPOSE 8000
+
+ # Command to run FastAPI with Uvicorn
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
app/.DS_Store ADDED
Binary file (6.15 kB)

app/__init__.py ADDED
File without changes
app/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (162 Bytes)

app/__pycache__/main.cpython-311.pyc ADDED
Binary file (4.77 kB)
app/main.py ADDED
@@ -0,0 +1,90 @@
+ from fastapi import FastAPI, Request, Form
+ from fastapi.responses import RedirectResponse
+ from fastapi.templating import Jinja2Templates
+ from fastapi.staticfiles import StaticFiles
+ # from fastapi.middleware.cors import CORSMiddleware
+ from helpmate_ai import initialize_conversation, retreive_results, rerank_with_cross_encoder, generate_response
+ import re
+ import google.generativeai as genai
+
+ # Configure the Gemini API (read the key from file and close it properly)
+ with open("gemini_api_key.txt", "r") as key_file:
+     gemini_api_key = key_file.read().strip()
+ genai.configure(api_key=gemini_api_key)
+
+ # Initialize FastAPI app
+ app = FastAPI()
+
+ # Set up templates
+ templates = Jinja2Templates(directory="templates")
+
+ # Serve static files (if needed)
+ app.mount("/static", StaticFiles(directory="static"), name="static")
+
+ # Enable CORS middleware if needed
+ # app.add_middleware(
+ #     CORSMiddleware,
+ #     allow_origins=["*"],  # Adjust origins as per requirements
+ #     allow_credentials=True,
+ #     allow_methods=["*"],
+ #     allow_headers=["*"],
+ # )
+
+ def format_rag_response(response_text):
+     # Break lines for HTML rendering
+     formatted_text = response_text.replace("\n", "<br>")
+     # Wrap **bold** spans in <strong> tags, then strip the asterisks
+     formatted_text = re.sub(r'(\*\*.*?\*\*)', r'<strong>\1</strong>', formatted_text).replace("**", "")
+     # Put numbered list items on their own line and bold the numbers
+     formatted_text = re.sub(r'(\d+\.\s)', r'<br><strong>\1</strong>', formatted_text)
+     # Turn "- " bullets into HTML bullets
+     formatted_text = re.sub(r'(\-\s)', r'<br>&bull; ', formatted_text)
+     # Italicize citation lines
+     formatted_text = re.sub(r'(Citations?:\s)', r'<br><em>\1</em>', formatted_text)
+     # Rough conversion of markdown table pipes into table cells
+     formatted_text = re.sub(r'\|\s*', r'</td><td>', formatted_text)
+     formatted_text = re.sub(r'\n\|\s*', r'<tr><td>', formatted_text)
+     return formatted_text
+
+ conversation_bot = []
+ conversation = initialize_conversation()
+
+ # Initialize the Gemini model with the RAG system instruction
+ model = genai.GenerativeModel("gemini-1.5-flash", system_instruction=conversation)
+
+ def get_gemini_completions(conversation):
+     response = model.generate_content(conversation)
+     return response.text
+
+ introduction = get_gemini_completions(conversation)
+ conversation_bot.append({'bot': introduction})
+ top_3_laptops = None
+
+ @app.get("/")
+ async def default_func(request: Request):
+     global conversation_bot
+     return templates.TemplateResponse("index_invite.html", {"request": request, "name_xyz": conversation_bot})
+
+ @app.post("/end_conv")
+ async def end_conv():
+     global conversation_bot, conversation, top_3_laptops
+     conversation_bot = []
+     conversation = initialize_conversation()
+     introduction = get_gemini_completions(conversation)
+     conversation_bot.append({'bot': introduction})
+     top_3_laptops = None
+     return RedirectResponse(url="/", status_code=303)
+
+ @app.post("/invite")
+ async def invite(user_input_message: str = Form(...)):
+     global conversation_bot, conversation, top_3_laptops
+     user_input = user_input_message
+     conversation_bot.append({'user': user_input})
+
+     # Retrieve, re-rank, and build the RAG prompt for the user query
+     results_df = retreive_results(user_input)
+     top_docs = rerank_with_cross_encoder(user_input, results_df)
+
+     # Generate response
+     messages = generate_response(user_input, top_docs)
+     response_assistant = get_gemini_completions(messages)
+
+     conversation_bot.append({'bot': format_rag_response(response_assistant)})
+     return RedirectResponse(url="/", status_code=303)
+
+ # Run the application (uvicorn.run has no `debug` flag; use the CLI's --reload for development)
+ if __name__ == '__main__':
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000)
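To sanity-check the deployed routes, a minimal smoke test from Python could look like the sketch below. This is not part of the commit; it assumes the server is already running on localhost:8000 and that `requests` is installed separately (it is not in requirements.txt):

```python
# Minimal smoke test for the endpoints defined in app/main.py.
import requests

BASE = "http://localhost:8000"

# GET / renders the chat page (index_invite.html)
print(requests.get(f"{BASE}/").status_code)  # expect 200

# POST /invite submits a user message as form data; on success the
# handler redirects back to "/" with a 303.
resp = requests.post(
    f"{BASE}/invite",
    data={"user_input_message": "What are the premium rates?"},
    allow_redirects=False,
)
print(resp.status_code)  # expect 303
```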
chroma/.DS_Store ADDED
Binary file (10.2 kB)

chroma/198995c5-3e47-440b-98d8-b095e7b992c3/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d3c9fd302f000d7790aa403c2d0d8fec363fe46f30b07d53020b6e33b22435a9
+ size 1676000
chroma/198995c5-3e47-440b-98d8-b095e7b992c3/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
+ size 100
chroma/198995c5-3e47-440b-98d8-b095e7b992c3/length.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6b8084d9f7913e043f1ebcd06e456f5d571fd8394d9da876cf420ecd973a671f
+ size 4000
chroma/198995c5-3e47-440b-98d8-b095e7b992c3/link_lists.bin ADDED
File without changes
chroma/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2390eaa4b256b7af85f6e852ef50dca2cbfb8dcc78202b4482808f64783a0685
+ size 159731712
chroma/ed2d8aa0-0cf3-4a31-b498-d3e7dcdc5566/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8205b87f647f23b8f4460452ac5cfbedfa817774ba1c318aa8109f1724f86ce7
+ size 1676000
chroma/ed2d8aa0-0cf3-4a31-b498-d3e7dcdc5566/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
+ size 100
chroma/ed2d8aa0-0cf3-4a31-b498-d3e7dcdc5566/length.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:06b76cba3959bfe87474b2c0d7355085c5190842788d6f0b9b22671facf8d341
+ size 4000
chroma/ed2d8aa0-0cf3-4a31-b498-d3e7dcdc5566/link_lists.bin ADDED
File without changes
chroma/f190ba25-fd88-4232-a7dc-2390c45704d4/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d3c9fd302f000d7790aa403c2d0d8fec363fe46f30b07d53020b6e33b22435a9
+ size 1676000
chroma/f190ba25-fd88-4232-a7dc-2390c45704d4/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
+ size 100
chroma/f190ba25-fd88-4232-a7dc-2390c45704d4/length.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:35e2b719ded2660106ef0a6e7cc5cbd492704d7cbb332cc98386c90d4e84aa30
+ size 4000
chroma/f190ba25-fd88-4232-a7dc-2390c45704d4/link_lists.bin ADDED
File without changes
chroma/f28c5b78-1366-45d0-b9f9-10f0ba9054ed/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d3c9fd302f000d7790aa403c2d0d8fec363fe46f30b07d53020b6e33b22435a9
+ size 1676000
chroma/f28c5b78-1366-45d0-b9f9-10f0ba9054ed/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
+ size 100
chroma/f28c5b78-1366-45d0-b9f9-10f0ba9054ed/length.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:86be761d2a303904d59def5f8ccf5e6c4574cdbeb568a0675301dea1e52a137b
+ size 4000
chroma/f28c5b78-1366-45d0-b9f9-10f0ba9054ed/link_lists.bin ADDED
File without changes
chroma/fc83983d-80db-49a4-a6fb-42834cc83377/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:485cdcf2eecbe50ac5822a13a36ebd375438c46d68e6d9bdb699433cf23ef595
+ size 1676000
chroma/fc83983d-80db-49a4-a6fb-42834cc83377/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
+ size 100
chroma/fc83983d-80db-49a4-a6fb-42834cc83377/length.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fc19b1997119425765295aeab72d76faa6927d4f83985d328c26f20468d6cc76
+ size 4000
chroma/fc83983d-80db-49a4-a6fb-42834cc83377/link_lists.bin ADDED
File without changes
chroma/fd3b321e-3bab-4583-88b0-405d918cd938/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b93d412c384df38f07adeaf81c6ef7036283fa5c5db4fb32d467152d4b2acf08
+ size 1676000
chroma/fd3b321e-3bab-4583-88b0-405d918cd938/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
+ size 100
chroma/fd3b321e-3bab-4583-88b0-405d918cd938/length.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7604482081864f9ca53d019e460d4ba59e7573e18fd3a38e7236864b4fc1cb78
+ size 4000
chroma/fd3b321e-3bab-4583-88b0-405d918cd938/link_lists.bin ADDED
File without changes
chroma/fd9978b6-a866-4bec-9cea-1215ebeb34e9/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2d01e91f079437520222f51034df61cc054d6dd33b15bc61063cea20dde157d7
+ size 1676000
chroma/fd9978b6-a866-4bec-9cea-1215ebeb34e9/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
+ size 100
chroma/fd9978b6-a866-4bec-9cea-1215ebeb34e9/length.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5bc1120cd20faa1fa96577121a592a28d2c01a086f7af19dc13ea8c9a82e718b
+ size 4000
chroma/fd9978b6-a866-4bec-9cea-1215ebeb34e9/link_lists.bin ADDED
File without changes
chroma/ff952c54-2ce3-4bd1-b8d0-3430b4745e3e/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:49ff7e332456970ddc9126a8c0129209ad0b62c469ebc99af8291a976a7300c0
+ size 1676000
chroma/ff952c54-2ce3-4bd1-b8d0-3430b4745e3e/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
+ size 100
chroma/ff952c54-2ce3-4bd1-b8d0-3430b4745e3e/length.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:988459be7b8489fed34996257c4cd17982e5d48dacc56656f165d21655ac98b3
+ size 4000
chroma/ff952c54-2ce3-4bd1-b8d0-3430b4745e3e/link_lists.bin ADDED
File without changes
helpmate_ai.py ADDED
@@ -0,0 +1,860 @@
+ # -*- coding: utf-8 -*-
+ # Install all the required libraries
+
+ # !pip install -U -q pdfplumber tiktoken openai chromaDB sentence-transformers
+
+ # Import all the required libraries
+
+ import pdfplumber
+ from pathlib import Path
+ import pandas as pd
+ from operator import itemgetter
+ import json
+ import tiktoken
+ # import openai
+ import chromadb
+
+ # openai.api_key = open("api_key.txt", "r").read().strip()
+
+ def initialize_conversation():
+     """
+     Build the system instruction for the model: persona, few-shot examples, and response guidelines.
+     """
+     conversation = [
+         f"""
+         You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
+         The document name is 'Group Life Insurance Policy' and it contains information about 3 different insurance policies: 'Member Life Insurance', 'Member Accidental Death and Dismemberment Insurance' and 'Dependent Life Insurance'.
+         Your task is to extract and present relevant information from the policy documents to answer the user's query. The document excerpts are provided in the dataframe, with the actual policy text in the 'documents' column and metadata (page numbers) in the 'metadata' column.
+
+         <EXAMPLE>
+         INPUT: "What are the premium rates for different types of insurance under this policy?"
+
+         OUTPUT:
+         The premium rate(s) for each Member insured for Life Insurance will be:
+
+         Premium Rates:
+         1. Member Life Insurance: $0.210 for each $1,000 of insurance in force.
+         2. Member Accidental Death and Dismemberment Insurance: $0.025 for each $1,000 of Member Life Insurance in force.
+         3. Dependent Life Insurance: $1.46 for each Member insured for Dependent Life Insurance.
+
+         Multiple Policy Discount: The Policyholder may be eligible for a multiple policy discount if they have at least two other eligible group insurance policies underwritten by The Principal.
+
+         Citations: Policy Name: Group Life Insurance Policy, Page Number: 20.
+         </EXAMPLE>
+
+         <EXAMPLE>
+         INPUT: "What are the Contributions from Members?"
+
+         OUTPUT:
+         Members are not required to contribute a part of the premium for their Member insurance under this Group Policy.
+         Members are required to contribute a part of the premium for their Dependent's insurance under this Group Policy.
+
+         Citations: Policy Name: Group Life Insurance Policy, Page Number: 20.
+         </EXAMPLE>
+
+         Guidelines:
+         1. Extract information that directly answers the user's query from the document excerpts.
+         2. Provide the final response as well-formatted, easily readable text along with the citation.
+         3. Provide your complete response using the relevant parts in the documents.
+         4. The generated response should answer the query directly, addressing the user and avoiding additional information.
+         5. If the provided excerpts do not fully answer the query, provide partial information and suggest which sections of the policy document the user should review for further details.
+         6. If no relevant information is found in the provided excerpts, respond with 'No relevant information found in the provided excerpts.'
+
+         Start with a short welcome message with a smiley only at the beginning of the chat session, not in every response.
+         """
+     ]
+
+     # conversation = [{"role": "user", "parts": system_message}]
+     # conversation = [{"role": "system", "content": system_message}]
+
+     return conversation
+
+ """#### Read, Process, and Chunk the PDF File
+
+ We will be using the **pdfplumber** library to read and process the PDF files.
+ """
+
+ # Define the path of the PDF
+ pdf_path = 'Principal-Sample-Life-Insurance-Policy.pdf'
+
+ """Reading the PDF file and exploring it for delimiters to decide on a chunking strategy."""
+
+ # Open the PDF file and examine a page
+ # with pdfplumber.open(pdf_path) as pdf:
+ #     # Get one of the pages from the PDF and examine it
+ #     single_page = pdf.pages[0]
+
+ #     # Extract text from the first page
+ #     text = single_page.extract_text()
+
+ #     # Print the extracted text with whitespace made visible
+ #     visible_text = text.replace("\n", "<NEWLINE>\n").replace("\t", "[TAB]").replace(" ", "[SPACE]")
+ #     print(visible_text)
+ #     print(text)
+
+ """*Looking at the file, we will go with a fixed-size chunking strategy, either per page or per a certain token size. We will experiment with various token sizes for optimal output.*
+
+ #### Function to perform Page-Based Chunking
+ """
+
+ # Function to extract text page-wise from a PDF file.
+ def extract_pages_from_pdf(pdf_path):
+     page_chunks = []
+
+     with pdfplumber.open(pdf_path) as pdf:
+         for page_no, page in enumerate(pdf.pages):
+             text = page.extract_text()
+             page_chunks.append([page_no + 1, text])
+
+     return page_chunks
+
+ page_chunks = extract_pages_from_pdf(pdf_path)
+
+ # for page_chunk in page_chunks[0:5]:
+ #     print(page_chunk)
+
+ """#### Functions to perform fixed-size chunking using token size
+
+ We will be using the OpenAI 'gpt-3.5-turbo' model for generating answers, so we choose the chunk size such that it does not exceed the model's 4096-token limit (input and output combined).
+ """
+
+ # Load the tokenizer
+ tokenizer = tiktoken.get_encoding("cl100k_base")
+ # Define the token limit for each chunk
+ TOKEN_SIZE = 512  # Adjust for optimal output
+
+ def chunk_text_by_token_size(text, TOKEN_SIZE):
+     # Tokenize the text
+     tokens = tokenizer.encode(text)
+
+     # Chunk the tokens into fixed-size chunks
+     chunks = [tokens[i:i + TOKEN_SIZE] for i in range(0, len(tokens), TOKEN_SIZE)]
+
+     # Convert the chunks back into text
+     text_chunks = [tokenizer.decode(chunk) for chunk in chunks]
+
+     return text_chunks
+
+ def fixed_size_chunking_of_pdf(pdf_path):
+     # Extract text from the PDF
+     with pdfplumber.open(pdf_path) as pdf:
+         # Initialize a list to store chunks
+         all_chunks = []
+
+         # Iterate over all the pages
+         for page_no, page in enumerate(pdf.pages):
+             # Extract text from the page
+             text = page.extract_text()
+
+             # Chunk the text based on the token limit
+             page_chunks = chunk_text_by_token_size(text, TOKEN_SIZE)
+
+             for text_chunk in page_chunks:
+                 all_chunks.append([f"{page_no + 1}", text_chunk])
+
+     return all_chunks
+
+ all_chunks = fixed_size_chunking_of_pdf(pdf_path)
+
+ # Example: Print the first few chunks
+ # for chunk in all_chunks[0:5]:
+ #     print(chunk)
+
+ """We will store the chunks in a dataframe for further processing.
+
+ Chunks shorter than 10 words are likely empty pages or pages with very few words, so they will be dropped.
+
+ Depending on the chunking strategy, the relevant functions are called.
+ """
+
+ # Function for storing chunks in a dataframe for further processing
+ def store_docs_to_df(chunks):
+     # Initialize a list to store chunks
+     data = []
+     # Convert the extracted list to a DF
+     extracted_text_df = pd.DataFrame(chunks, columns=['Page No.', 'Text'])
+     # Append the extracted text and page numbers to the list
+     data.append(extracted_text_df)
+
+     # Concatenate all the DFs in the list 'data' together
+     insurance_pdf_data = pd.concat(data, ignore_index=True)
+
+     # Check the length of all the texts, as there might be some empty pages or pages with very few words that we can drop
+     insurance_pdf_data['Text_Length'] = insurance_pdf_data['Text'].apply(lambda x: len(x.split(' ')))
+
+     # Retain only the rows with a text length of at least 10
+     insurance_pdf_data = insurance_pdf_data.loc[insurance_pdf_data['Text_Length'] >= 10]
+
+     # Store the metadata for each page in a separate column
+     # insurance_pdf_data['Metadata'] = insurance_pdf_data.apply(lambda x: {'Page No.': x['Page No.'], 'Chunk No': x['Chunk No']}, axis=1)
+     insurance_pdf_data['Metadata'] = insurance_pdf_data.apply(lambda x: {'Page No.': x['Page No.']}, axis=1)
+
+     return insurance_pdf_data
+
+ chunks_df = store_docs_to_df(page_chunks)  # page-based chunking
+ # chunks_df = store_docs_to_df(all_chunks)  # chunking based on token size
+
+ # chunks_df.tail(5)
+
+ """## Generate and Store Embeddings
+
+ In this section, we will embed the chunks and store them in a ChromaDB collection.
+ """
+
+ # Define the path where chroma collections will be stored
+ chroma_data_path = '/content/drive/MyDrive/HelpMate_AI_Codes/ChromaDB_Data'
+
+ # Import the OpenAI Embedding Function into chroma
+ # from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
+ # embedding_function = OpenAIEmbeddingFunction(
+ #     api_key=openai.api_key,
+ #     model_name="text-embedding-ada-002"
+ # )
+
+ # Import the SentenceTransformer Embedding Function into chroma
+ from chromadb.utils import embedding_functions
+ # embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="paraphrase-mpnet-base-v2")
+ # embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="multi-qa-MiniLM-L6-cos-v1")
+ embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
+
+ # Use PersistentClient() so the collections, including the cache, are kept in permanent storage
+ client = chromadb.PersistentClient()
+
+ """We will also implement a data/collection cache to improve the performance of the overall search system."""
+
+ def generate_embeddings(chunks_df, embedding_function):
+     # Drop any stale main collection before rebuilding it
+     all_collections = client.list_collections()
+     collection_exists = any(col.name == 'RAG_on_Insurance' for col in all_collections)
+     if collection_exists:
+         client.delete_collection(name='RAG_on_Insurance')
+
+     # Initialise a collection in chroma and pass the embedding_function to it so that it uses the embedding model to embed the documents
+     insurance_collection = client.get_or_create_collection(name='RAG_on_Insurance', embedding_function=embedding_function)
+
+     # Convert the page text and metadata from the dataframe to lists so they can be passed to chroma
+     documents_list = chunks_df["Text"].tolist()
+     metadata_list = chunks_df['Metadata'].tolist()
+
+     # Add the documents and metadata to the collection along with generic integer IDs. You can also feed the metadata information as IDs by combining the policy name and page no.
+     insurance_collection.add(
+         documents=documents_list,
+         ids=[str(i) for i in range(0, len(documents_list))],
+         metadatas=metadata_list
+     )
+
+     # Drop and recreate the query cache collection as well
+     collection_exists = any(col.name == 'Insurance_Cache' for col in all_collections)
+     if collection_exists:
+         client.delete_collection(name='Insurance_Cache')
+
+     cache_collection = client.get_or_create_collection(name='Insurance_Cache', embedding_function=embedding_function)
+
+     return insurance_collection, cache_collection
+
+ insurance_collection, cache_collection = generate_embeddings(chunks_df, embedding_function)
+
+ # Take a look at the first few entries in the collection
+ # sample = insurance_collection.peek(5)
+ # print(insurance_collection.get(ids=['4', '5', '6'], include=['documents', 'metadatas']))
+
+ """## <font color = yellow> Search Layer
+
+ ### Semantic Search with Cache
+
+ We will perform a semantic search of a query against the collection's embeddings and get the top semantically similar results, based on the *distance* parameter.
+ """
+
+ # Test queries
+ # query = "What are the premium rates for different types of insurance under this policy?"
+ # query = "what are the benefits payable for different types of insurance under this policy?"
+ # query = "What are the Contributions from Members?"
+
+ """#### Document retrieval"""
+
+ # Implementing a cache in semantic search
+ def retreive_results(query):
+     # Set a threshold for the cache search
+     threshold = 0.2
+
+     ids = []
+     documents = []
+     distances = []
+     metadatas = []
+
+     results_df = pd.DataFrame()
+
+     # Search the cache collection first and return the single closest match
+     cache_results = cache_collection.query(
+         query_texts=query,
+         n_results=1
+     )
+
+     # If the distance is greater than the threshold, return the results from the main collection.
+     if cache_results['distances'][0] == [] or cache_results['distances'][0][0] > threshold:
+         # Query the collection against the user query and return the top 10 results
+         results = insurance_collection.query(
+             query_texts=query,
+             n_results=10
+         )
+
+         # Store the query in cache_collection as a document (w.r.t. ChromaDB) so that it can be embedded and searched against later
+         # Store the retrieved text, ids, distances and metadatas in cache_collection as metadatas, so that they can be fetched easily if a query indeed matches a query in the cache
+         Keys = []
+         Values = []
+
+         for key, val in results.items():
+             if val is None:
+                 continue
+             if key in ['ids', 'metadatas', 'documents', 'distances']:
+                 for i in range(10):
+                     Keys.append(str(key) + str(i))
+                     Values.append(str(val[0][i]))
+
+         cache_collection.add(
+             documents=[query],
+             ids=[query],  # Or, to assign integers 0,1,2,... as IDs, use len(cache_results['documents']), which returns the number of queries currently in the cache, and assign the next digit to the new query
+             metadatas=dict(zip(Keys, Values))
+         )
+
+         # print("Not found in cache. Found in main collection.")
+
+         result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], 'IDs': results['ids'][0]}
+         results_df = pd.DataFrame.from_dict(result_dict)
+
+     # If the distance is less than or equal to the threshold, return the results from the cache
+     elif cache_results['distances'][0][0] <= threshold:
+         cache_result_dict = cache_results['metadatas'][0][0]
+
+         # Loop through the cached metadata dictionary and rebuild the result lists
+         for key, value in cache_result_dict.items():
+             if 'ids' in key:
+                 ids.append(value)
+             elif 'documents' in key:
+                 documents.append(value)
+             elif 'distances' in key:
+                 distances.append(value)
+             elif 'metadatas' in key:
+                 metadatas.append(value)
+
+         print("Found in cache!")
+
+         # Create a DataFrame
+         results_df = pd.DataFrame({
+             'IDs': ids,
+             'Documents': documents,
+             'Distances': distances,
+             'Metadatas': metadatas
+         })
+
+     return results_df
+
+ # results_df = retreive_results(query)
+ # results_df.head(5)
+
+ """#### Re-Ranking with a Cross Encoder
+
+ We will re-rank the search results using a cross-encoder to move the more relevant chunks to the top.
+ """
+
+ # Import the CrossEncoder library from sentence_transformers
+ from sentence_transformers import CrossEncoder, util
+
+ # Initialise the cross encoder model
+ # cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')
+ # cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
+ cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
+
+ # Function to re-rank results using the cross-encoder
+ def rerank_with_cross_encoder(query, results_df, top_k=3):
+     # Pair the query with each of the top responses received from the semantic search,
+     # then generate cross-encoder scores for these pairs
+     cross_inputs = [[query, response] for response in results_df['Documents']]
+     cross_rerank_scores = cross_encoder.predict(cross_inputs)
+
+     # Store the rerank scores in results_df
+     results_df['Reranked_scores'] = cross_rerank_scores
+
+     # The top_k results from semantic search (for comparison)
+     top_semantic = results_df.sort_values(by='Distances')
+
+     # The top_k results after reranking
+     top_ranks_df = results_df.sort_values(by='Reranked_scores', ascending=False)
+
+     top_docs = top_ranks_df[["Documents", "Metadatas"]][:top_k]
+     print(top_docs)
+
+     return top_docs  # , top_ranks_df
+
+ # top_docs = rerank_with_cross_encoder(query, results_df)
+ # top_docs
+
+ """## <font color = yellow> Generative Layer
+
+ ### Retrieval Augmented Generation (RAG)
+
+ We will now pass the user query and a prompt containing the top-ranked docs to the LLM (Gemini's *gemini-1.5-flash* in the deployed app; the commented-out iterations below targeted OpenAI's *gpt-3.5-turbo*) to generate a direct answer to the query along with citations.
+ """
+
+ # Earlier prompt iterations, kept for reference.
+
+ # def create_prompt(query, top_docs):
+ #     """
+ #     Build a GPT-3.5 ChatCompletion prompt from the user query and retrieved information.
+ #     """
+ #     prompt = [
+ #         {"role": "system", "content": "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
+ #         {"role": "user", "content": f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
+ #         You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents in the dataframe '{top_docs}'.
+ #         These search results are essentially one paragraph of an insurance document that may be relevant to the user query.
+
+ #         The column 'documents' inside this dataframe contains the actual text from the policy document and the column 'metadata' contains the source page.
+
+ #         The policy document describes 3 different policies: 'Member Life Insurance', 'Member Accidental Death and Dismemberment Insurance' and 'Dependent Life Insurance'.
+
+ #         Use the documents in '{top_docs}' to answer the query '{query}'.
+
+ #         Follow the guidelines below when performing the task:
+ #         1. Try to provide relevant/accurate numbers if available.
+ #         2. You don't have to necessarily use all the information in the dataframe. Only choose information that is relevant.
+ #         3. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
+ #         4. You are a customer facing assistant, so do not provide any information on internal workings, just answer the query directly.
+ #         5. If you think that the query is not relevant to the document, reply that the query is irrelevant.
+ #         6. Provide the final response as well-formatted and easily readable text along with the citation.
+ #         7. Provide your complete response using the relevant parts in the documents.
+ #         8. The generated response should answer the query directly, addressing the user and avoiding additional information.
+ #         """},
+ #     ]
+ #     return prompt
+
+ # def create_prompt(query, top_docs):
+ #     """
+ #     Build a GPT-3.5 ChatCompletion prompt from the user query and retrieved information (adds few-shot examples).
+ #     """
+ #     prompt = [
+ #         {"role": "system", "content": "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
+ #         {"role": "user", "content": f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
+ #         You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents in the dataframe '{top_docs}'. These search results are essentially one paragraph of an insurance document that may be relevant to the user query.
+
+ #         The column 'documents' inside this dataframe contains the actual text from the policy document and the column 'metadata' contains the source page.
+
+ #         The policy document describes 3 different policies: 'Member Life Insurance', 'Member Accidental Death and Dismemberment Insurance' and 'Dependent Life Insurance'.
+
+ #         Use the documents in '{top_docs}' to answer the query '{query}'.
+
+ #         Follow the guidelines below when performing the task.
+ #         1. Try to provide relevant/accurate numbers if available.
+ #         2. You don't have to necessarily use all the information in the dataframe. Only choose information that is relevant.
+ #         3. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
+ #         4. You are a customer facing assistant, so do not provide any information on internal workings, just answer the query directly.
+ #         5. If you think that the query is not relevant to the document, reply that the query is irrelevant.
+ #         6. Provide the final response as well-formatted and easily readable text along with the citation.
+ #         7. Provide your complete response using the relevant parts in the documents.
+
+ #         The generated response should answer the query directly, addressing the user and avoiding additional information. Provide the final response as well-formatted and easily readable text.
+ #         **Example 1:**
+ #         **Query**: "What are the benefits of the whole life insurance policy?"
+ #         **Search Results**: Dataframe contains an excerpt from a whole life insurance policy document: "The policy provides lifelong coverage, a guaranteed death benefit, and a cash value component that grows over time."
+ #         **Response**: "The whole life insurance policy offers lifelong coverage with a guaranteed death benefit. Additionally, it accumulates cash value over time, which can be accessed or borrowed against by the policyholder."
+ #         **Citations**: Policy Name: Lifetime Protection Plan, Page: 7
+
+ #         **Example 2:**
+ #         **Query**: "What is the death benefit for a final expense life insurance policy?"
+ #         **Search Results**: Dataframe contains a document with the following excerpt: "The final expense policy provides a death benefit of up to $10,000, intended to cover funeral costs and other end-of-life expenses."
+ #         **Response**: "The final expense life insurance policy provides a death benefit of up to $10,000, which is typically used to cover funeral costs and other end-of-life expenses."
+ #         **Citations**: Policy Name: Final Expense Protection, Page: 3
+ #         """},
+ #     ]
+ #     return prompt
+
+ # def create_prompt(query, top_docs):
+ #     """
+ #     Build a GPT-3.5 ChatCompletion prompt from the user query and retrieved information (extraction-focused wording).
+ #     """
+ #     prompt = [
+ #         {
+ #             "role": "system",
+ #             "content": "You are a helpful assistant that extracts relevant information from insurance policy documents to answer user queries accurately and concisely."
+ #         },
+ #         {
+ #             "role": "user",
+ #             "content": f"""
+ #             You are given a user query and a set of relevant insurance policy document excerpts retrieved by a Retrieval-Augmented Generation (RAG) system.
+
+ #             Your task is to extract and present relevant information from the policy documents to answer the user's query. The document excerpts are provided in the dataframe '{top_docs}', with the actual policy text in the 'documents' column and metadata (page numbers) in the 'metadata' column.
+
+ #             The document name is 'Group Life Insurance Policy' and it contains information about 3 different insurance policies: 'Member Life Insurance', 'Member Accidental Death and Dismemberment Insurance' and 'Dependent Life Insurance'.
+
+ #             Guidelines:
+ #             1. Extract information that directly answers the user's query from the document excerpts.
+ #             2. Organize the response using clear headings, bullet points, or tables where applicable.
+ #             3. Cite the relevant policy name(s) and page number(s) using the metadata from the dataframe.
+ #             4. If the provided excerpts do not fully answer the query, provide all available information and suggest which sections of the policy document the user should review for further details.
+ #             5. If no relevant information is found in the provided excerpts, respond with 'No relevant information found in the provided excerpts.'
+
+ #             ### Example Query:
+ #             **User Query**: "What are the premium rates for different types of insurance under this policy?"
+
+ #             **Extracted Information**:
+ #             **Article 2 - Premium Rates**:
+ #             1. **Member Life Insurance**: $0.210 for each $1,000 of insurance in force.
+ #             2. **Member Accidental Death and Dismemberment Insurance**: $0.025 for each $1,000 of Member Life Insurance in force.
+ #             3. **Dependent Life Insurance**: $1.46 for each Member insured for Dependent Life Insurance.
+
+ #             **Multiple Policy Discount**: The Policyholder may be eligible for a multiple policy discount if they have at least two other eligible group insurance policies underwritten by The Principal.
+
+ #             **Citations**: Policy Name: Group Life Insurance Policy, Page Number: 12.
+
+ #             ### Your Task:
+ #             The user query is: '{query}'
+ #             """
+ #         }
+ #     ]
+ #     return prompt
+
+ # def create_prompt(query, top_docs):
+ #     """
+ #     Build a GPT-3.5 ChatCompletion prompt from the user query and retrieved information (adds table-reformatting guidance).
+ #     """
+ #     prompt = [
+ #         {
+ #             "role": "system",
+ #             "content": "You are a helpful assistant that extracts relevant information from insurance policy documents to answer user queries accurately and concisely."
+ #         },
+ #         {
+ #             "role": "user",
+ #             "content": f"""
+ #             You are given a user query and a set of relevant insurance policy document excerpts retrieved by a Retrieval-Augmented Generation (RAG) system.
+
+ #             Your task is to extract and present relevant information from the policy documents to answer the user's query. The document excerpts are provided in the dataframe '{top_docs}', with the actual policy text in the 'documents' column and metadata (page numbers) in the 'metadata' column.
+
+ #             The document name is 'Group Life Insurance Policy' and it contains information about 3 different insurance policies: 'Member Life Insurance', 'Member Accidental Death and Dismemberment Insurance' and 'Dependent Life Insurance'.
+
+ #             Guidelines:
+ #             1. Extract information that directly answers the user's query from the document excerpts.
+ #             2. Organize the response using clear headings, bullet points, or tables where applicable.
+ #             3. If the text includes tables with relevant information, reformat them into a clear, readable structure.
+ #             4. Cite the relevant policy name(s) and page number(s) using the metadata from the dataframe.
+ #             5. If the provided excerpts do not fully answer the query, provide partial information and suggest which sections of the policy document the user should review for further details.
+ #             6. If no relevant information is found in the provided excerpts, respond with 'No relevant information found in the provided excerpts.'
+
+ #             ### Example Query:
+ #             **User Query**: "What are the premium rates for different types of insurance under this policy?"
+
+ #             **Premium Rates**:
+ #             1. **Member Life Insurance**: $0.210 for each $1,000 of insurance in force.
+ #             2. **Member Accidental Death and Dismemberment Insurance**: $0.025 for each $1,000 of Member Life Insurance in force.
+ #             3. **Dependent Life Insurance**: $1.46 for each Member insured for Dependent Life Insurance.
+
+ #             **Multiple Policy Discount**: The Policyholder may be eligible for a multiple policy discount if they have at least two other eligible group insurance policies underwritten by The Principal.
+
+ #             **Citations**: Policy Name: Group Life Insurance Policy, Page Number: 12.
+
+ #             ### Your Task:
+ #             The user query is: '{query}'
+ #             """
+ #         }
+ #     ]
+ #     return prompt
+
+ # prompt = create_prompt(query, top_docs)
+
+ # Earlier OpenAI version of the response generator, kept for reference.
+ # def generate_response(query, top_docs):
+ #     """
+ #     Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
+ #     """
+ #     messages = [
+ #         {"role": "system", "content": "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
+ #         {"role": "user", "content": f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
+ #         You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents in the dataframe '{top_docs}'. These search results are essentially one page of an insurance document that may be relevant to the user query.
+
+ #         The column 'documents' inside this dataframe contains the actual text from the policy document and the column 'metadata' contains the policy name and source page. The text inside the document may also contain tables in the format of a list of lists where each of the nested lists indicates a row.
+
+ #         Use the documents in '{top_docs}' to answer the query '{query}'. Frame an informative answer and also use the dataframe to return the relevant policy names and page numbers as citations.
+
+ #         Follow the guidelines below when performing the task.
+ #         1. Try to provide relevant/accurate numbers if available.
+ #         2. You don't have to necessarily use all the information in the dataframe. Only choose information that is relevant.
+ #         3. If the document text has tables with relevant information, please reformat the table and return the final information in a tabular format.
+ #         4. Use the Metadatas column in the dataframe to retrieve and cite the policy name(s) and page number(s) as citations.
+ #         5. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
+ #         6. You are a customer facing assistant, so do not provide any information on internal workings, just answer the query directly.
+
+ #         The generated response should answer the query directly, addressing the user and avoiding additional information. If you think that the query is not relevant to the document, reply that the query is irrelevant. Provide the final response as well-formatted and easily readable text along with the citation. Provide your complete response first with all information, and then provide the citations.
+ #         """},
+ #     ]
+
+ #     response = openai.chat.completions.create(
+ #         model="gpt-3.5-turbo",
+ #         messages=messages
+ #     )
+
+ #     return response.choices[0].message.content.split('\n')
+
+ # response = generate_response(query, top_docs)
+ # print(query + '\n')
+ # print("\n".join(response))
+
+ # Function to build the final RAG message for the Gemini model.
+ def generate_response(query, top_docs):
+     """
+     Build the RAG message for the model from the user query and the retrieved information.
+     """
+     messages = f"""
+     Remember your system message and that you are a helpful assistant that extracts relevant information from insurance policy documents to answer user queries accurately and concisely.
+     Your task is to extract and present relevant information from the policy documents to answer the user's query.
+     The document excerpts are provided in the dataframe '{top_docs}', with the actual policy text in the 'documents' column and metadata (page numbers) in the 'metadata' column.
+     The user input is: '{query}'
+     """
+
+     # Earlier OpenAI call, kept for reference:
+     # response = openai.chat.completions.create(
+     #     model="gpt-3.5-turbo",
+     #     messages=messages
+     # )
+     # return response.choices[0].message.content.split('\n')
+
+     conversation = [{"role": "user", "parts": messages}]
+     return conversation
+
+ # response = generate_response(query, top_docs)
+ # print(query + '\n')
+ # print("\n".join(response))
+
+ """## <font color = yellow> Query Search
+
+ ### <font color = yellow> Query #1
+ """
+
+ # query1 = "what happens if failed to Pay Premium?"
+
+ # results_df = retreive_results(query1)
+ # top_docs = rerank_with_cross_encoder(query1, results_df)
+ # top_docs
+
+ # # Generate response
+ # response = generate_response(query1, top_docs)
+ # print("\n".join(response))
+
+ # """### <font color = yellow> Query #2"""
+
+ # query2 = "what are the eligibility requirements for different types of insurance under this policy?"
+
+ # results_df = retreive_results(query2)
+ # top_docs = rerank_with_cross_encoder(query2, results_df)
+ # top_docs
+
+ # # Generate response
+ # response = generate_response(query2, top_docs)
+ # print("\n".join(response))
+
+ # """### <font color = yellow> Query #3"""
+
+ # query3 = "What are the Termination Rights of the Policyholder?"
+
+ # results_df = retreive_results(query3)
+ # top_docs = rerank_with_cross_encoder(query3, results_df)
+ # top_docs
+
+ # # Generate response
+ # response = generate_response(query3, top_docs)
+ # print("\n".join(response))
+
+ # Experimental end-to-end pipeline runner, kept for reference.
+ # def run_pipeline(chunk_strategy,
+ #                  embedding_function,
+ #                  chroma_data_path,
+ #                  query,
+ #                  cross_encoder,
+ #                  top_k,
+ #                  rag_model,
+ #                  prompt_style="default"):
+
+ #     # Embedding layer: preprocess the documents, extract text, and split it into chunks
+ #     if chunk_strategy == "page":
+ #         docs = extract_pages_from_pdf(pdf_path)
+ #     elif chunk_strategy == "fixed_size":
+ #         docs = fixed_size_chunking_of_pdf(pdf_path)
+
+ #     docs_df = store_docs_to_df(docs)
+
+ #     # Generate embeddings and store them in the chromadb collection and cache
+ #     insurance_collection, cache_collection = generate_embeddings(docs_df, embedding_function)
+
+ #     # Retrieve documents relevant to the query from the collections and store them in the cache
+ #     results_df = retreive_results(query)
+
+ #     # Re-rank with the cross encoder
+ #     top_docs = rerank_with_cross_encoder(query, results_df, top_k)
+
+ #     # Create the prompt
+ #     prompt = create_prompt(query, top_docs)
+
+ #     # Generate the response
+ #     response = generate_response(prompt, rag_model)
+
+ #     return top_docs, response
+
+ # # Select the chunking strategy
+ # # chunk_strategy = "page"
+ # chunk_strategy = "fixed_size"
+
+ # # Load the tokenizer
+ # tokenizer = tiktoken.get_encoding("cl100k_base")
+ # # Define the token limit for each chunk
+ # TOKEN_SIZE = 500  # Adjust this based on your needs
+
+ # # Import the OpenAI Embedding Function into chroma
+ # from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
+
+ # # Select the model and initialise the embedding function
+ # # model = "text-embedding-ada-002"
+ # # embedding_function = OpenAIEmbeddingFunction(api_key=openai.api_key, model_name=model)
+
+ # from chromadb.utils import embedding_functions
+ # embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="paraphrase-mpnet-base-v2")
+ # # embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="multi-qa-MiniLM-L6-cos-v1")
+ # # embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
+
+ # # Import the CrossEncoder library from sentence_transformers
+ # from sentence_transformers import CrossEncoder, util
+ # # Initialise the cross encoder model
+ # cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')
+ # # cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
+ # # cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
+
+ # # Test queries
+ # # query = "what are the eligibility requirements?"
+ # # query = "what are the eligibility requirements for different types of insurance under this policy?"
+ # # query = "what are the benefits payable?"
+ # # query = "what are the benefits payable for different types of insurance under this policy?"
+ # # query = "What are the benefits payable of Member Accidental Death and Dismemberment Insurance?"
+ # # query = "What are the benefits of Member Life Insurance?"
+ # # query = "How much is the premium amount?"
+ # # query = "How much is the premium amount for different types of insurance under this policy?"
+ # # query = "How much is the premium rate?"
+ # # query = "What are the premium rates for different types of insurance under this policy?"
+ # # query = "What are the premium rates?"
+
+ # # How many top query results to consider for generating the response
+ # top_k = 5
+
+ # # Select the RAG model
+ # rag_model = "gpt-3.5-turbo"
+
+ # top_docs, response = run_pipeline(chunk_strategy,
+ #                                   embedding_function,
+ #                                   chroma_data_path,
+ #                                   query,
+ #                                   cross_encoder,
+ #                                   top_k,
+ #                                   rag_model)
+
+ # print("\n".join(response))
+ # top_docs
+
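For reference, the retrieval, re-ranking, and generation functions above are wired together by app/main.py. A minimal sketch of the same flow, using the sample query from the prompt examples (illustrative only; the collections are built at import time, since `generate_embeddings` runs at module level):

```python
# Sketch of the end-to-end RAG flow using the functions defined above.
from helpmate_ai import retreive_results, rerank_with_cross_encoder, generate_response

query = "What are the premium rates for different types of insurance under this policy?"

results_df = retreive_results(query)                              # semantic search with cache
top_docs = rerank_with_cross_encoder(query, results_df, top_k=3)  # cross-encoder re-ranking
conversation = generate_response(query, top_docs)                 # RAG prompt for the model
```

The returned `conversation` is a Gemini-style message list; app/main.py passes it to `get_gemini_completions` to obtain the final answer.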
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ chromadb==0.5.7
+ google.generativeai
+ python-dotenv
+ pandas
+ numpy
+ fastapi
+ uvicorn
+ jinja2
+ python-multipart
+ pdfplumber
+ sentence_transformers
+ tiktoken
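For a local run without Docker, the same dependencies can be installed directly; a minimal sketch, assuming Python 3.9+ (matching the Dockerfile base image) and the repository root as the working directory:

```sh
pip install -r requirements.txt
uvicorn app.main:app --host 0.0.0.0 --port 8000
```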
static/css/styles.css ADDED
@@ -0,0 +1,100 @@
+ button {
+   /* These styles apply to all buttons */
+   color: white; /* Color of the button text */
+   background-color: rgb(165, 152, 111); /* Background color of the button */
+   border: 2px solid black; /* Border around the button: 2px wide, solid, black */
+   padding: 10px; /* Space between the button text and the edge of the button */
+   width: 200px; /* The button is 200px wide */
+   margin-top: 20px; /* Space between the top of the button and the element above it */
+   border-radius: 10px; /* Rounds the corners of the button */
+   cursor: pointer; /* The cursor turns into a hand when it's over the button */
+   font-family: 'Courier New', Courier, monospace; /* Falls back to 'Courier', then to any monospace font */
+ }
+
+ .outercontainer {
+   /* These styles apply to the div with class "outercontainer" */
+   max-width: 800px; /* If the screen is narrower than 800px, the container shrinks to fit it */
+   margin: 0 auto; /* Centers the container: 'auto' makes the left and right margins equal */
+   padding: 20px; /* Space between the content of the container and its borders */
+   border: 5px solid #ccc; /* Border around the container: 5px wide, solid, light grey */
+   border-radius: 50px; /* Rounds the corners of the border */
+ }
+
+ .conversationcontainer {
+   /* These styles apply to the div with class "conversationcontainer" */
+   border: 5px solid #ccc; /* Border around the conversation: 5px wide, solid, light grey */
+   padding: 10px; /* Space between the content of the conversation and its borders */
+   height: 500px; /* The conversation box is always 500px tall */
+   background-color: #f3f3f3; /* Background color of the conversation */
+   overflow-y: scroll; /* If the conversation outgrows its container, it becomes scrollable */
+   border-radius: 50px; /* Rounds the corners of the border */
+ }
+
+ #titlestyle {
+   /* These styles apply to the heading with id "titlestyle" */
+   color: black; /* Color of the heading text */
+   background-color: rgb(227, 224, 212); /* Background color of the heading */
+   border: 2px solid black; /* Border around the heading: 2px wide, solid, black */
+   padding: 10px; /* Space between the content of the heading and its borders */
+   margin-top: 20px; /* Space between the top of the heading and the element above it */
+   border-radius: 10px; /* Rounds the corners of the border */
+   width: fit-content; /* The heading is just wide enough to fit its content */
+ }
+
+ #inputtextbox {
+   /* These styles apply to the text input with id "inputtextbox" */
+   width: 85%; /* The input is 85% as wide as its container */
+   border: 5px solid #ccc; /* Border around the input: 5px wide, solid, light grey */
+   padding: 10px; /* Space between the text and the edge of the input field */
+   background-color: #f3f3f3; /* Background color of the input field */
+   border-radius: 10px; /* Rounds the corners of the input field */
+ }
+
+ #submitbutton {
+   /* These styles apply to the submit button */
+   background: url(/static/send-icon.png) no-repeat center center; /* Centered, non-repeating background image */
+   background-size: cover; /* The image covers the entire background of the button */
+   border: none; /* The button has no border */
+   width: 40px; /* The button is 40px wide */
+   height: 40px; /* The button is 40px tall */
+   cursor: pointer; /* The cursor turns into a hand when it's over the button */
+ }
+
+ .user {
+   /* These styles apply to divs with class "user", i.e. user messages */
+   color: white; /* Color of the user's text */
+   background-color: rgb(0, 195, 255); /* Background color of the user's messages */
+   padding: 8px; /* Space between the user's text and the edge of its container */
+   border-radius: 10px; /* Rounds the corners of the user's messages */
+   float: right; /* Floats the user's messages to the right */
+   clear: both; /* Positions each message below any floated elements before it */
+   max-width: 80%; /* Maximum width of the user's messages */
+   margin: 5px;
+ }
+
+ .bot {
+   /* Same as above, but for the bot's messages */
+   color: white;
+   background-color: rgb(0, 128, 90);
+   padding: 8px;
+   border-radius: 10px;
+   float: left;
+   clear: both;
+   max-width: 80%;
+   margin: 5px;
+ }
+
+ .center {
+   /* These styles apply to elements with class "center" */
+   display: flex; /* The element is a flex container */
+   justify-content: center; /* Items inside are centered horizontally */
+ }
+
+ form {
+   /* These styles apply to all forms */
+   margin: 10px; /* Space around the outside of each form */
+ }
+
+ body {
+   font-family: 'Courier New', Courier, monospace; /* Falls back to 'Courier', then to any monospace font */
+   margin: 50px; /* Space around the outside of the <body> */
+ }
static/send-icon.png ADDED
templates/index_bye.html ADDED
@@ -0,0 +1,21 @@
+ <html>
+ <head>
+   <title>
+     Invite App
+   </title>
+ </head>
+ <body>
+   <div>
+     <h1>
+       Invite App
+     </h1>
+   </div>
+   <div>
+     <h2>
+       Bye, you are not invited to the event, {{ name_xyz }}
+     </h2>
+   </div>
+ </body>
+ </html>
templates/index_hello.html ADDED
@@ -0,0 +1,21 @@
+ <html>
+ <head>
+   <title>
+     Invite App
+   </title>
+ </head>
+ <body>
+   <div>
+     <h1>
+       Invite App
+     </h1>
+   </div>
+   <div>
+     <h2>
+       Hello, you are invited to the event, {{ name_xyz }}
+     </h2>
+   </div>
+ </body>
+ </html>
templates/index_invite.html ADDED
@@ -0,0 +1,51 @@
+ <html>
+ <head>
+   <title>
+     Insurance Policy AI Assistant
+   </title>
+   <link rel="stylesheet" type="text/css" href="/static/css/styles.css">
+ </head>
+ <body>
+   <div class="outercontainer">
+     <div class="center">
+       <h1 id="titlestyle">
+         Insurance Policy AI Assistant
+       </h1>
+     </div>
+     <div class="conversationcontainer" id="chatcontainer">
+       {% for entry in name_xyz %}
+       <div class="{% if entry.bot %}bot{% else %}user{% endif %}">
+         {% if entry.bot %}
+           {{ entry.bot|safe }}
+         {% else %}
+           {{ entry.user }}
+         {% endif %}
+       </div>
+       {% endfor %}
+     </div>
+
+     <form action="/invite" method="POST" class="center">
+       <input type="text" name="user_input_message" id="inputtextbox">
+       <input type="submit" value=" " id="submitbutton">
+     </form>
+
+     <form action="/end_conv" method="POST" class="center">
+       <button type="submit">END CONVERSATION</button>
+     </form>
+   </div>
+
+   <script>
+     // The page reloads after each message is posted, so scrolling the chat
+     // container to the bottom on load keeps the newest message in view
+     function scrollToBottom() {
+       var chatContainer = document.getElementById('chatcontainer');
+       chatContainer.scrollTop = chatContainer.scrollHeight;
+     }
+
+     window.onload = scrollToBottom;
+   </script>
+ </body>
+ </html>
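The template expects a name_xyz list of {"user": ...} / {"bot": ...} entries and posts the text box to /invite. The actual handler lives in app/main.py (not shown here); below is a minimal sketch of a compatible route, with hypothetical names apart from "/invite", "user_input_message" and "name_xyz", which the template itself fixes:

    # hedged sketch of a route compatible with index_invite.html,
    # not the actual app/main.py
    from fastapi import FastAPI, Form, Request
    from fastapi.staticfiles import StaticFiles
    from fastapi.templating import Jinja2Templates

    app = FastAPI()
    app.mount("/static", StaticFiles(directory="static"), name="static")
    templates = Jinja2Templates(directory="templates")

    conversation = []  # hypothetical in-memory chat history

    @app.get("/")
    def home(request: Request):
        return templates.TemplateResponse(
            "index_invite.html", {"request": request, "name_xyz": conversation})

    @app.post("/invite")
    def invite(request: Request, user_input_message: str = Form(...)):
        conversation.append({"user": user_input_message})
        # placeholder reply; the real app would call the RAG pipeline here
        conversation.append({"bot": "..."})
        return templates.TemplateResponse(
            "index_invite.html", {"request": request, "name_xyz": conversation})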