prithvirajpawar committed · Commit 27bbfe3 · Parent(s): 4a43b26

Deploy FastAPI app
- Dockerfile +17 -0
- app/.DS_Store +0 -0
- app/__init__.py +0 -0
- app/__pycache__/__init__.cpython-311.pyc +0 -0
- app/__pycache__/main.cpython-311.pyc +0 -0
- app/main.py +90 -0
- chroma/.DS_Store +0 -0
- chroma/198995c5-3e47-440b-98d8-b095e7b992c3/data_level0.bin +3 -0
- chroma/198995c5-3e47-440b-98d8-b095e7b992c3/header.bin +3 -0
- chroma/198995c5-3e47-440b-98d8-b095e7b992c3/length.bin +3 -0
- chroma/198995c5-3e47-440b-98d8-b095e7b992c3/link_lists.bin +0 -0
- chroma/chroma.sqlite3 +3 -0
- chroma/ed2d8aa0-0cf3-4a31-b498-d3e7dcdc5566/data_level0.bin +3 -0
- chroma/ed2d8aa0-0cf3-4a31-b498-d3e7dcdc5566/header.bin +3 -0
- chroma/ed2d8aa0-0cf3-4a31-b498-d3e7dcdc5566/length.bin +3 -0
- chroma/ed2d8aa0-0cf3-4a31-b498-d3e7dcdc5566/link_lists.bin +0 -0
- chroma/f190ba25-fd88-4232-a7dc-2390c45704d4/data_level0.bin +3 -0
- chroma/f190ba25-fd88-4232-a7dc-2390c45704d4/header.bin +3 -0
- chroma/f190ba25-fd88-4232-a7dc-2390c45704d4/length.bin +3 -0
- chroma/f190ba25-fd88-4232-a7dc-2390c45704d4/link_lists.bin +0 -0
- chroma/f28c5b78-1366-45d0-b9f9-10f0ba9054ed/data_level0.bin +3 -0
- chroma/f28c5b78-1366-45d0-b9f9-10f0ba9054ed/header.bin +3 -0
- chroma/f28c5b78-1366-45d0-b9f9-10f0ba9054ed/length.bin +3 -0
- chroma/f28c5b78-1366-45d0-b9f9-10f0ba9054ed/link_lists.bin +0 -0
- chroma/fc83983d-80db-49a4-a6fb-42834cc83377/data_level0.bin +3 -0
- chroma/fc83983d-80db-49a4-a6fb-42834cc83377/header.bin +3 -0
- chroma/fc83983d-80db-49a4-a6fb-42834cc83377/length.bin +3 -0
- chroma/fc83983d-80db-49a4-a6fb-42834cc83377/link_lists.bin +0 -0
- chroma/fd3b321e-3bab-4583-88b0-405d918cd938/data_level0.bin +3 -0
- chroma/fd3b321e-3bab-4583-88b0-405d918cd938/header.bin +3 -0
- chroma/fd3b321e-3bab-4583-88b0-405d918cd938/length.bin +3 -0
- chroma/fd3b321e-3bab-4583-88b0-405d918cd938/link_lists.bin +0 -0
- chroma/fd9978b6-a866-4bec-9cea-1215ebeb34e9/data_level0.bin +3 -0
- chroma/fd9978b6-a866-4bec-9cea-1215ebeb34e9/header.bin +3 -0
- chroma/fd9978b6-a866-4bec-9cea-1215ebeb34e9/length.bin +3 -0
- chroma/fd9978b6-a866-4bec-9cea-1215ebeb34e9/link_lists.bin +0 -0
- chroma/ff952c54-2ce3-4bd1-b8d0-3430b4745e3e/data_level0.bin +3 -0
- chroma/ff952c54-2ce3-4bd1-b8d0-3430b4745e3e/header.bin +3 -0
- chroma/ff952c54-2ce3-4bd1-b8d0-3430b4745e3e/length.bin +3 -0
- chroma/ff952c54-2ce3-4bd1-b8d0-3430b4745e3e/link_lists.bin +0 -0
- helpmate_ai.py +860 -0
- requirements.txt +12 -0
- static/css/styles.css +100 -0
- static/send-icon.png +0 -0
- templates/index_bye.html +21 -0
- templates/index_hello.html +21 -0
- templates/index_invite.html +51 -0
Dockerfile
ADDED
@@ -0,0 +1,17 @@
# Use Python base image
FROM python:3.9-slim

# Set the working directory
WORKDIR /app

# Copy project files to the container
COPY . /app

# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Expose the default FastAPI port
EXPOSE 8000

# Command to run FastAPI with Uvicorn
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
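For reference, the image can be built and run locally along these lines (the helpmate-app tag is illustrative; the build context must include requirements.txt and the gemini_api_key.txt the app reads at startup):

docker build -t helpmate-app .
docker run -p 8000:8000 helpmate-app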
app/.DS_Store
ADDED
Binary file (6.15 kB)

app/__init__.py
ADDED
File without changes

app/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (162 Bytes)

app/__pycache__/main.cpython-311.pyc
ADDED
Binary file (4.77 kB)
app/main.py
ADDED
@@ -0,0 +1,90 @@
from fastapi import FastAPI, Request, Form
from fastapi.responses import RedirectResponse
from fastapi.templating import Jinja2Templates
from fastapi.staticfiles import StaticFiles
# from fastapi.middleware.cors import CORSMiddleware
from helpmate_ai import initialize_conversation, retrieve_results, rerank_with_cross_encoder, generate_response
import re
import google.generativeai as genai

# Configure the Gemini API
gemini_api_key = open("gemini_api_key.txt", "r").read().strip()
genai.configure(api_key=gemini_api_key)

# Initialize the FastAPI app
app = FastAPI()

# Set up templates
templates = Jinja2Templates(directory="templates")

# Serve static files (if needed)
app.mount("/static", StaticFiles(directory="static"), name="static")

# Enable CORS middleware if needed
# app.add_middleware(
#     CORSMiddleware,
#     allow_origins=["*"],  # Adjust origins as per requirements
#     allow_credentials=True,
#     allow_methods=["*"],
#     allow_headers=["*"],
# )

def format_rag_response(response_text):
    # Convert the model's markdown-ish reply into simple HTML for the template
    formatted_text = response_text.replace("\n", "<br>")
    formatted_text = re.sub(r'(\*\*.*?\*\*)', r'<strong>\1</strong>', formatted_text).replace("**", "")
    formatted_text = re.sub(r'(\d+\.\s)', r'<br><strong>\1</strong>', formatted_text)
    formatted_text = re.sub(r'(\-\s)', r'<br>• ', formatted_text)
    formatted_text = re.sub(r'(Citations?:\s)', r'<br><em>\1</em>', formatted_text)
    formatted_text = re.sub(r'\|\s*', r'</td><td>', formatted_text)
    formatted_text = re.sub(r'\n\|\s*', r'<tr><td>', formatted_text)
    return formatted_text

conversation_bot = []
conversation = initialize_conversation()

# Initialize the Gemini model
model = genai.GenerativeModel("gemini-1.5-flash", system_instruction=conversation)

def get_gemini_completions(conversation):
    response = model.generate_content(conversation)
    return response.text

introduction = get_gemini_completions(conversation)
conversation_bot.append({'bot': introduction})
top_3_laptops = None

@app.get("/")
async def default_func(request: Request):
    global conversation_bot
    return templates.TemplateResponse("index_invite.html", {"request": request, "name_xyz": conversation_bot})

@app.post("/end_conv")
async def end_conv():
    global conversation_bot, conversation, top_3_laptops
    conversation_bot = []
    conversation = initialize_conversation()
    introduction = get_gemini_completions(conversation)
    conversation_bot.append({'bot': introduction})
    top_3_laptops = None
    return RedirectResponse(url="/", status_code=303)

@app.post("/invite")
async def invite(user_input_message: str = Form(...)):
    global conversation_bot, conversation, top_3_laptops
    user_input = user_input_message
    conversation_bot.append({'user': user_input})

    results_df = retrieve_results(user_input)
    top_docs = rerank_with_cross_encoder(user_input, results_df)

    # Generate the response
    messages = generate_response(user_input, top_docs)
    response_assistant = get_gemini_completions(messages)

    conversation_bot.append({'bot': format_rag_response(response_assistant)})
    return RedirectResponse(url="/", status_code=303)

# Run the application
if __name__ == '__main__':
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
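A quick way to smoke-test the running app (a sketch; it assumes the server is reachable on localhost:8000 and that the requests package is installed):

import requests

base = "http://localhost:8000"

# The chat page
print(requests.get(f"{base}/").status_code)

# Ask a question; the app appends the bot reply and 303-redirects back to "/"
r = requests.post(
    f"{base}/invite",
    data={"user_input_message": "What are the premium rates under this policy?"},
)
print(r.status_code, r.url)  # requests follows the redirect by default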
chroma/.DS_Store
ADDED
Binary file (10.2 kB)

chroma/198995c5-3e47-440b-98d8-b095e7b992c3/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d3c9fd302f000d7790aa403c2d0d8fec363fe46f30b07d53020b6e33b22435a9
size 1676000

chroma/198995c5-3e47-440b-98d8-b095e7b992c3/header.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
size 100

chroma/198995c5-3e47-440b-98d8-b095e7b992c3/length.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6b8084d9f7913e043f1ebcd06e456f5d571fd8394d9da876cf420ecd973a671f
size 4000

chroma/198995c5-3e47-440b-98d8-b095e7b992c3/link_lists.bin
ADDED
File without changes

chroma/chroma.sqlite3
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2390eaa4b256b7af85f6e852ef50dca2cbfb8dcc78202b4482808f64783a0685
size 159731712

chroma/ed2d8aa0-0cf3-4a31-b498-d3e7dcdc5566/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8205b87f647f23b8f4460452ac5cfbedfa817774ba1c318aa8109f1724f86ce7
size 1676000

chroma/ed2d8aa0-0cf3-4a31-b498-d3e7dcdc5566/header.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
size 100

chroma/ed2d8aa0-0cf3-4a31-b498-d3e7dcdc5566/length.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:06b76cba3959bfe87474b2c0d7355085c5190842788d6f0b9b22671facf8d341
size 4000

chroma/ed2d8aa0-0cf3-4a31-b498-d3e7dcdc5566/link_lists.bin
ADDED
File without changes

chroma/f190ba25-fd88-4232-a7dc-2390c45704d4/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d3c9fd302f000d7790aa403c2d0d8fec363fe46f30b07d53020b6e33b22435a9
size 1676000

chroma/f190ba25-fd88-4232-a7dc-2390c45704d4/header.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
size 100

chroma/f190ba25-fd88-4232-a7dc-2390c45704d4/length.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:35e2b719ded2660106ef0a6e7cc5cbd492704d7cbb332cc98386c90d4e84aa30
size 4000

chroma/f190ba25-fd88-4232-a7dc-2390c45704d4/link_lists.bin
ADDED
File without changes

chroma/f28c5b78-1366-45d0-b9f9-10f0ba9054ed/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d3c9fd302f000d7790aa403c2d0d8fec363fe46f30b07d53020b6e33b22435a9
size 1676000

chroma/f28c5b78-1366-45d0-b9f9-10f0ba9054ed/header.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
size 100

chroma/f28c5b78-1366-45d0-b9f9-10f0ba9054ed/length.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:86be761d2a303904d59def5f8ccf5e6c4574cdbeb568a0675301dea1e52a137b
size 4000

chroma/f28c5b78-1366-45d0-b9f9-10f0ba9054ed/link_lists.bin
ADDED
File without changes

chroma/fc83983d-80db-49a4-a6fb-42834cc83377/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:485cdcf2eecbe50ac5822a13a36ebd375438c46d68e6d9bdb699433cf23ef595
size 1676000

chroma/fc83983d-80db-49a4-a6fb-42834cc83377/header.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
size 100

chroma/fc83983d-80db-49a4-a6fb-42834cc83377/length.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fc19b1997119425765295aeab72d76faa6927d4f83985d328c26f20468d6cc76
size 4000

chroma/fc83983d-80db-49a4-a6fb-42834cc83377/link_lists.bin
ADDED
File without changes

chroma/fd3b321e-3bab-4583-88b0-405d918cd938/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b93d412c384df38f07adeaf81c6ef7036283fa5c5db4fb32d467152d4b2acf08
size 1676000

chroma/fd3b321e-3bab-4583-88b0-405d918cd938/header.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
size 100

chroma/fd3b321e-3bab-4583-88b0-405d918cd938/length.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7604482081864f9ca53d019e460d4ba59e7573e18fd3a38e7236864b4fc1cb78
size 4000

chroma/fd3b321e-3bab-4583-88b0-405d918cd938/link_lists.bin
ADDED
File without changes

chroma/fd9978b6-a866-4bec-9cea-1215ebeb34e9/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2d01e91f079437520222f51034df61cc054d6dd33b15bc61063cea20dde157d7
size 1676000

chroma/fd9978b6-a866-4bec-9cea-1215ebeb34e9/header.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
size 100

chroma/fd9978b6-a866-4bec-9cea-1215ebeb34e9/length.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5bc1120cd20faa1fa96577121a592a28d2c01a086f7af19dc13ea8c9a82e718b
size 4000

chroma/fd9978b6-a866-4bec-9cea-1215ebeb34e9/link_lists.bin
ADDED
File without changes

chroma/ff952c54-2ce3-4bd1-b8d0-3430b4745e3e/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:49ff7e332456970ddc9126a8c0129209ad0b62c469ebc99af8291a976a7300c0
size 1676000

chroma/ff952c54-2ce3-4bd1-b8d0-3430b4745e3e/header.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
size 100

chroma/ff952c54-2ce3-4bd1-b8d0-3430b4745e3e/length.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:988459be7b8489fed34996257c4cd17982e5d48dacc56656f165d21655ac98b3
size 4000

chroma/ff952c54-2ce3-4bd1-b8d0-3430b4745e3e/link_lists.bin
ADDED
File without changes
helpmate_ai.py
ADDED
@@ -0,0 +1,860 @@
# -*- coding: utf-8 -*-
# Install all the required libraries

# !pip install -U -q pdfplumber tiktoken openai chromadb sentence-transformers

# Import all the required libraries

import pdfplumber
from pathlib import Path
import pandas as pd
from operator import itemgetter
import json
import tiktoken
# import openai
import chromadb

# openai.api_key = open("api_key.txt", "r").read().strip()

def initialize_conversation():
    """
    Build the system prompt that initializes the assistant.
    """
    conversation = [
        f"""
        You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
        The document name is 'Group Life Insurance Policy' and it contains information about 3 different insurance policies: 'Member Life Insurance', 'Member Accidental Death and Dismemberment Insurance' and 'Dependent Life Insurance'.
        Your task is to extract and present relevant information from the policy documents to answer the user's query. The document excerpts are provided in the dataframe, with the actual policy text in the 'documents' column and metadata (page numbers) in the 'metadata' column.

        <EXAMPLE>
        INPUT: "What are the premium rates for different types of insurance under this policy?"

        OUTPUT:
        The premium rate(s) for each Member insured for Life Insurance will be:

        Premium Rates:
        1. Member Life Insurance: $0.210 for each $1,000 of insurance in force.
        2. Member Accidental Death and Dismemberment Insurance: $0.025 for each $1,000 of Member Life Insurance in force.
        3. Dependent Life Insurance: $1.46 for each Member insured for Dependent Life Insurance.

        Multiple Policy Discount: The Policyholder may be eligible for a multiple policy discount if they have at least two other eligible group insurance policies underwritten by The Principal.

        Citations: Policy Name: Group Life Insurance Policy, Page Number: 20.
        </EXAMPLE>

        <EXAMPLE>
        INPUT: "What are the Contributions from Members?"

        OUTPUT:
        Members are not required to contribute a part of the premium for their Member insurance under this Group Policy.
        Members are required to contribute a part of the premium for their Dependent's insurance under this Group Policy.

        Citations: Policy Name: Group Life Insurance Policy, Page Number: 20.
        </EXAMPLE>

        Guidelines:
        1. Extract information that directly answers the user's query from the document excerpts.
        2. Provide the final response as well-formatted, easily readable text along with the citation.
        3. Provide your complete response using the relevant parts in the documents.
        4. The generated response should answer the query directly, addressing the user and avoiding additional information.
        5. If the provided excerpts do not fully answer the query, provide partial information and suggest which sections of the policy document the user should review for further details.
        6. If no relevant information is found in the provided excerpts, respond with 'No relevant information found in the provided excerpts.'

        Start with a short welcome message with a smiley, only at the beginning of the chat session and not in every response.
        """
    ]

    # conversation = [{"role": "user", "parts": system_message}]
    # conversation = [{"role": "system", "content": system_message}]

    return conversation
"""#### Read, Process, and Chunk the PDF File

We will be using the **pdfplumber** library to read and process the PDF files.
"""

# Define the path of the PDF
pdf_path = 'Principal-Sample-Life-Insurance-Policy.pdf'

"""Reading the PDF file and exploring it for delimiters to decide on a chunking strategy."""

# Open the PDF file
# with pdfplumber.open(pdf_path) as pdf:

#     # Get one of the pages from the PDF and examine it
#     single_page = pdf.pages[0]

#     # Extract text from the first page
#     text = single_page.extract_text()

#     # Print the extracted text, making whitespace visible
#     visible_text = text.replace("\n", "<NEWLINE>\n").replace("\t", "[TAB]").replace(" ", "[SPACE]")
#     print(visible_text)
#     print(text)

"""*Looking at the file, we will go with a fixed-size chunking strategy: either per page or per a certain token size. We will experiment with various token sizes for optimal output.*

#### Function to perform Page-Based Chunking
"""

# Function to extract text page-wise from a PDF file.
def extract_pages_from_pdf(pdf_path):
    page_chunks = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_no, page in enumerate(pdf.pages):
            text = page.extract_text()
            page_chunks.append([page_no + 1, text])

    return page_chunks

page_chunks = extract_pages_from_pdf(pdf_path)

# for page_chunk in page_chunks[0:5]:
#     print(page_chunk)
"""#### Functions to perform fixed-size chunking using token size

We will be using the OpenAI 'gpt-3.5-turbo' model for generating answers, so we choose the chunk size such that it does not exceed the model's token limit of 4096 (input and output combined).
"""

# Load the tokenizer
tokenizer = tiktoken.get_encoding("cl100k_base")
# Define the token limit for each chunk
TOKEN_SIZE = 512  # Adjust for optimal output

def chunk_text_by_token_size(text, TOKEN_SIZE):
    # Tokenize the text
    tokens = tokenizer.encode(text)

    # Chunk the tokens into fixed-size chunks
    chunks = [tokens[i:i + TOKEN_SIZE] for i in range(0, len(tokens), TOKEN_SIZE)]

    # Convert the chunks back into text
    text_chunks = [tokenizer.decode(chunk) for chunk in chunks]

    return text_chunks

def fixed_size_chunking_of_pdf(pdf_path):
    # Extract text from a PDF
    with pdfplumber.open(pdf_path) as pdf:
        # Initialize a list to store chunks
        all_chunks = []

        # Iterate over all the pages
        for page_no, page in enumerate(pdf.pages):

            # Extract text from the page
            text = page.extract_text()

            # Chunk the text based on the token limit
            page_chunks = chunk_text_by_token_size(text, TOKEN_SIZE)

            for text_chunk in page_chunks:
                all_chunks.append([f"{page_no + 1}", text_chunk])

    return all_chunks

# Build the chunk list
all_chunks = fixed_size_chunking_of_pdf(pdf_path)

# Example: Print the first chunks
# for chunk in all_chunks[0:5]:
#     print(chunk)
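# Illustrative sanity check (editor's sketch; the sample string is made up):
# re-encoding each decoded chunk should give roughly TOKEN_SIZE tokens per chunk.
# sample_text = "This policy provides lifelong coverage. " * 200
# for c in chunk_text_by_token_size(sample_text, TOKEN_SIZE):
#     print(len(tokenizer.encode(c)))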
"""We will store the chunks in a dataframe for further processing.

Chunks shorter than 10 words are likely empty pages or pages with very few words, so they will be dropped.

Depending on the chunking strategy, the relevant function is called.
"""

# Function for storing chunks in a dataframe for further processing
def store_docs_to_df(chunks):
    # Initialize a list to store chunks
    data = []
    # Convert the extracted list to a DF
    extracted_text_df = pd.DataFrame(chunks, columns=['Page No.', 'Text'])
    # Append the extracted text and page number to the list
    data.append(extracted_text_df)

    # Concatenate all the DFs in the list 'data' together
    insurance_pdf_data = pd.concat(data, ignore_index=True)
    # insurance_pdf_data.head(20)

    # Check the length of all the texts, as there might be empty pages or pages with very few words that we can drop
    insurance_pdf_data['Text_Length'] = insurance_pdf_data['Text'].apply(lambda x: len(x.split(' ')))

    # Retain only the rows with a text length of at least 10
    insurance_pdf_data = insurance_pdf_data.loc[insurance_pdf_data['Text_Length'] >= 10]

    # Store the metadata for each page in a separate column
    # insurance_pdf_data['Metadata'] = insurance_pdf_data.apply(lambda x: {'Page No.': x['Page No.'], 'Chunk No': x['Chunk No']}, axis=1)
    insurance_pdf_data['Metadata'] = insurance_pdf_data.apply(lambda x: {'Page No.': x['Page No.']}, axis=1)

    return insurance_pdf_data

chunks_df = store_docs_to_df(page_chunks)   # page-based chunking
# chunks_df = store_docs_to_df(all_chunks)  # chunking based on token size

# chunks_df.tail(5)
"""## Generate and Store Embeddings

In this section, we will embed the chunks and store them in a ChromaDB collection.
"""

# Define the path where chroma collections will be stored
chroma_data_path = '/content/drive/MyDrive/HelpMate_AI_Codes/ChromaDB_Data'

# Import the OpenAI embedding function into chroma
# from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
# embedding_function = OpenAIEmbeddingFunction(
#     api_key=openai.api_key,
#     model_name="text-embedding-ada-002"
# )

# Import the SentenceTransformer embedding function into chroma
from chromadb.utils import embedding_functions
# embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="paraphrase-mpnet-base-v2")
# embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="multi-qa-MiniLM-L6-cos-v1")
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

# Call PersistentClient() so the collections, including the cache, can be stored in permanent storage
client = chromadb.PersistentClient()

"""We will also implement a data/collection cache to improve the performance of the overall search system."""

# Set up the collections

def generate_embeddings(chunks_df, embedding_function):

    all_collections = client.list_collections()
    collection_exists = any(col.name == 'RAG_on_Insurance' for col in all_collections)

    if collection_exists:
        client.delete_collection(name='RAG_on_Insurance')

    # Initialize a collection in chroma and pass the embedding_function to it so that it uses the embedding model to embed the documents
    insurance_collection = client.get_or_create_collection(name='RAG_on_Insurance', embedding_function=embedding_function)

    # Convert the page text and metadata from the dataframe to lists to be able to pass them to chroma
    documents_list = chunks_df["Text"].tolist()
    metadata_list = chunks_df['Metadata'].tolist()

    # Add the documents and metadata to the collection along with generic integer IDs. You can also feed the metadata information as IDs by combining the policy name and page no.
    insurance_collection.add(
        documents=documents_list,
        ids=[str(i) for i in range(0, len(documents_list))],
        metadatas=metadata_list
    )

    collection_exists = any(col.name == 'Insurance_Cache' for col in all_collections)

    if collection_exists:
        client.delete_collection(name='Insurance_Cache')

    cache_collection = client.get_or_create_collection(name='Insurance_Cache', embedding_function=embedding_function)

    # print(client.list_collections())
    # print(cache_collection.peek())

    # cache_results = cache_collection.query(
    #     query_texts=query,
    #     n_results=1
    # )
    # print(cache_results)

    return insurance_collection, cache_collection

insurance_collection, cache_collection = generate_embeddings(chunks_df, embedding_function)
# insurance_collection.peek(5)

# Let's take a look at the first few entries in the collection
# sample = insurance_collection.peek(5)
# sample
# print(insurance_collection.get(
#     ids=['4', '5', '6'],
#     include=['documents', 'metadatas']
# ))
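# Illustrative sketch (editor's addition): once the collection is built, it can
# be queried directly to eyeball the nearest chunks and their page metadata.
# hits = insurance_collection.query(query_texts=["premium rates"], n_results=3)
# print(hits["documents"][0])
# print(hits["metadatas"][0])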
"""## <font color = yellow> Search Layer

### Semantic Search with Cache

We will perform a semantic search of a query against the collection embeddings to get the top semantically similar results based on the *distance* parameter.
"""

# Test queries
# query = "What are the premium rates for different types of insurance under this policy?"
# query = "what are the benefits payable for different types of insurance under this policy?"
# query = "What are the Contributions from Members?"

"""#### Document retrieval"""

# Implementing a cache in semantic search

def retrieve_results(query):
    # Set a threshold for the cache search
    threshold = 0.2

    ids = []
    documents = []
    distances = []
    metadatas = []

    results_df = pd.DataFrame()

    # Search the cache collection first:
    # query the cache against the user query and return the top result
    cache_results = cache_collection.query(
        query_texts=query,
        n_results=1
    )

    # print(cache_results)
    # print(f"cache_results top distance: {cache_results['distances'][0][0]}")

    # If the distance is greater than the threshold, return the results from the main collection.
    if cache_results['distances'][0] == [] or cache_results['distances'][0][0] > threshold:
        # Query the collection against the user query and return the top 10 results
        results = insurance_collection.query(
            query_texts=query,
            n_results=10
        )

        # Store the query in cache_collection as a document (in ChromaDB terms) so that it can be embedded and searched against later.
        # Store the retrieved text, ids, distances and metadatas in cache_collection as metadatas, so that they can be fetched easily if a query indeed matches a query in the cache.
        Keys = []
        Values = []

        for key, val in results.items():
            if val is None:
                continue
            if key in ['ids', 'metadatas', 'documents', 'distances']:
                for i in range(len(val[0])):
                    Keys.append(str(key) + str(i))
                    Values.append(str(val[0][i]))
                    # print(key, i)

        cache_collection.add(
            documents=[query],
            ids=[query],  # Or, to assign integers as IDs 0,1,2,..., use len(cache_results['documents']), which returns the number of queries currently in the cache, and assign the next digit to the new query.
            metadatas=dict(zip(Keys, Values))
        )

        # print("Not found in cache. Found in main collection.")

        result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], "IDs": results["ids"][0]}
        results_df = pd.DataFrame.from_dict(result_dict)

    # If the distance is, however, less than or equal to the threshold, return the results from the cache
    elif cache_results['distances'][0][0] <= threshold:
        cache_result_dict = cache_results['metadatas'][0][0]

        # Loop through the cached metadata dictionary and rebuild the result lists
        for key, value in cache_result_dict.items():
            if 'ids' in key:
                ids.append(value)
            elif 'documents' in key:
                documents.append(value)
            elif 'distances' in key:
                distances.append(value)
            elif 'metadatas' in key:
                metadatas.append(value)

        print("Found in cache!")

        # Create a DataFrame
        results_df = pd.DataFrame({
            'IDs': ids,
            'Documents': documents,
            'Distances': distances,
            'Metadatas': metadatas
        })

    # print(results_df)

    return results_df

# results_df = retrieve_results(query)
# results_df.head(5)
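# Illustrative sketch (editor's addition): a repeated query should be served
# from Insurance_Cache, since its distance to the cached query embedding is
# ~0, well under the threshold.
# df_first = retrieve_results("What are the premium rates?")   # hits the main collection
# df_second = retrieve_results("What are the premium rates?")  # prints "Found in cache!"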
"""#### Re-Ranking with a Cross Encoder

We will re-rank the search results using a cross encoder to move the more relevant chunks to the top.
"""

# Import the CrossEncoder library from sentence_transformers
from sentence_transformers import CrossEncoder, util
# Initialize the cross encoder model
# cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')
# cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Function to re-rank results using the cross encoder
def rerank_with_cross_encoder(query, results_df, top_k=3):

    # Pair the query with each of the top 10 responses received from the semantic search,
    # and generate the cross_encoder scores for these pairs
    cross_inputs = [[query, response] for response in results_df['Documents']]
    cross_rerank_scores = cross_encoder.predict(cross_inputs)
    # print(cross_rerank_scores)

    # Store the rerank scores in results_df
    results_df['Reranked_scores'] = cross_rerank_scores
    # print(results_df)

    # The top_k results from semantic search (by distance)
    top_semantic = results_df.sort_values(by='Distances')
    # print(top_semantic[:top_k])

    # The top_k results after reranking
    top_ranks_df = results_df.sort_values(by='Reranked_scores', ascending=False)
    # print(top_ranks_df[:top_k])

    top_docs = top_ranks_df[["Documents", "Metadatas"]][:top_k]
    # top_ranks = top_ranks[:][:top_k]
    print(top_docs)

    return top_docs  # , top_ranks_df

# top_docs = rerank_with_cross_encoder(query, results_df)
# top_docs
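# Illustrative sketch (editor's addition): the cross encoder scores
# (query, passage) pairs directly; a relevant passage should score higher
# than an unrelated one.
# pairs = [["What are the premium rates?", "The premium rate is $0.210 for each $1,000 of insurance."],
#          ["What are the premium rates?", "Dependents may be insured from the date of birth."]]
# print(cross_encoder.predict(pairs))  # two floats; the first should be larger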
"""## <font color = yellow> Generative Layer

### Retrieval Augmented Generation (RAG)

We will now pass the user query and a prompt carrying the top-ranked docs to the LLM (Gemini here; earlier iterations used OpenAI *gpt-3.5-turbo*) to generate a direct answer to the query along with citations.
"""

# # Define the function to generate the response. Provide a comprehensive prompt that passes the user query and the top 3 results to the model

# def create_prompt(query, top_docs):
#     """
#     Build the GPT-3.5 ChatCompletion prompt from the user query and the retrieved information.
#     """
#     prompt = [
#         {"role": "system", "content": "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
#         {"role": "user", "content": f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
#         You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents in the dataframe '{top_docs}'.
#         These search results are essentially one paragraph of an insurance document that may be relevant to the user query.

#         The column 'documents' inside this dataframe contains the actual text from the policy document and the column 'metadata' contains the source page.

#         The policy document describes 3 different policies: 'Member Life Insurance', 'Member Accidental Death and Dismemberment Insurance' and 'Dependent Life Insurance'.

#         Use the documents in '{top_docs}' to answer the query '{query}'.

#         Follow the guidelines below when performing the task:
#         1. Try to provide relevant/accurate numbers if available.
#         2. You don't have to necessarily use all the information in the dataframe. Only choose information that is relevant.
#         3. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
#         4. You are a customer-facing assistant, so do not provide any information on internal workings, just answer the query directly.
#         5. If you think that the query is not relevant to the document, reply that the query is irrelevant.
#         6. Provide the final response as well-formatted, easily readable text along with the citation.
#         7. Provide your complete response using the relevant parts in the documents.
#         8. The generated response should answer the query directly, addressing the user and avoiding additional information.
#         """},
#     ]

#     return prompt

# # Define the function to generate the response. Provide a comprehensive prompt that passes the user query and the top 3 results to the model

# def create_prompt(query, top_docs):
#     """
#     Build the GPT-3.5 ChatCompletion prompt from the user query and the retrieved information.
#     """
#     prompt = [
#         {"role": "system", "content": "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
#         {"role": "user", "content": f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
#         You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents in the dataframe '{top_docs}'. These search results are essentially one paragraph of an insurance document that may be relevant to the user query.

#         The column 'documents' inside this dataframe contains the actual text from the policy document and the column 'metadata' contains the source page.

#         The policy document describes 3 different policies: 'Member Life Insurance', 'Member Accidental Death and Dismemberment Insurance' and 'Dependent Life Insurance'.

#         Use the documents in '{top_docs}' to answer the query '{query}'.

#         Follow the guidelines below when performing the task.
#         1. Try to provide relevant/accurate numbers if available.
#         2. You don't have to necessarily use all the information in the dataframe. Only choose information that is relevant.
#         3. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
#         4. You are a customer-facing assistant, so do not provide any information on internal workings, just answer the query directly.
#         5. If you think that the query is not relevant to the document, reply that the query is irrelevant.
#         6. Provide the final response as well-formatted, easily readable text along with the citation.
#         7. Provide your complete response using the relevant parts in the documents.

#         The generated response should answer the query directly, addressing the user and avoiding additional information. Provide the final response as well-formatted, easily readable text.
#         **Example 1:**
#         **Query**: "What are the benefits of the whole life insurance policy?"
#         **Search Results**: Dataframe contains an excerpt from a whole life insurance policy document: "The policy provides lifelong coverage, a guaranteed death benefit, and a cash value component that grows over time."
#         **Response**: "The whole life insurance policy offers lifelong coverage with a guaranteed death benefit. Additionally, it accumulates cash value over time, which can be accessed or borrowed against by the policyholder."
#         **Citations**: Policy Name: Lifetime Protection Plan, Page: 7

#         **Example 2:**
#         **Query**: "What is the death benefit for a final expense life insurance policy?"
#         **Search Results**: Dataframe contains a document with the following excerpt: "The final expense policy provides a death benefit of up to $10,000, intended to cover funeral costs and other end-of-life expenses."
#         **Response**: "The final expense life insurance policy provides a death benefit of up to $10,000, which is typically used to cover funeral costs and other end-of-life expenses."
#         **Citations**: Policy Name: Final Expense Protection, Page: 3
#         """},
#     ]

#     return prompt

# # Define the function to generate the response. Provide a comprehensive prompt that passes the user query and the top 3 results to the model

# def create_prompt(query, top_docs):
#     """
#     Build the GPT-3.5 ChatCompletion prompt from the user query and the retrieved information.
#     """
#     prompt = [
#         {
#             "role": "system",
#             "content": "You are a helpful assistant that extracts relevant information from insurance policy documents to answer user queries accurately and concisely."
#         },
#         {
#             "role": "user",
#             "content": f"""
#             You are given a user query and a set of relevant insurance policy document excerpts retrieved by a Retrieval-Augmented Generation (RAG) system.

#             Your task is to extract and present relevant information from the policy documents to answer the user's query. The document excerpts are provided in the dataframe '{top_docs}', with the actual policy text in the 'documents' column and metadata (page numbers) in the 'metadata' column.

#             The document name is 'Group Life Insurance Policy' and it contains information about 3 different insurance policies: 'Member Life Insurance', 'Member Accidental Death and Dismemberment Insurance' and 'Dependent Life Insurance'.

#             Guidelines:
#             1. Extract information that directly answers the user's query from the document excerpts.
#             2. Organize the response using clear headings, bullet points, or tables where applicable.
#             3. Cite the relevant policy name(s) and page number(s) using the metadata from the dataframe.
#             4. If the provided excerpts do not fully answer the query, provide all available information and suggest which sections of the policy document the user should review for further details.
#             5. If no relevant information is found in the provided excerpts, respond with 'No relevant information found in the provided excerpts.'

#             ### Example Query:
#             **User Query**: "What are the premium rates for different types of insurance under this policy?"

#             **Extracted Information**:
#             **Article 2 - Premium Rates**:
#             1. **Member Life Insurance**: $0.210 for each $1,000 of insurance in force.
#             2. **Member Accidental Death and Dismemberment Insurance**: $0.025 for each $1,000 of Member Life Insurance in force.
#             3. **Dependent Life Insurance**: $1.46 for each Member insured for Dependent Life Insurance.

#             **Multiple Policy Discount**: The Policyholder may be eligible for a multiple policy discount if they have at least two other eligible group insurance policies underwritten by The Principal.

#             **Citations**: Policy Name: Group Life Insurance Policy, Page Number: 12.

#             ### Your Task:
#             The user query is: '{query}'
#             """
#         }
#     ]
#     return prompt

# # Function to create a prompt carrying the top-ranked docs and the query.

# def create_prompt(query, top_docs):
#     """
#     Build the GPT-3.5 ChatCompletion prompt from the user query and the retrieved information.
#     """
#     prompt = [
#         {
#             "role": "system",
#             "content": "You are a helpful assistant that extracts relevant information from insurance policy documents to answer user queries accurately and concisely."
#         },
#         {
#             "role": "user",
#             "content": f"""
#             You are given a user query and a set of relevant insurance policy document excerpts retrieved by a Retrieval-Augmented Generation (RAG) system.

#             Your task is to extract and present relevant information from the policy documents to answer the user's query. The document excerpts are provided in the dataframe '{top_docs}', with the actual policy text in the 'documents' column and metadata (page numbers) in the 'metadata' column.

#             The document name is 'Group Life Insurance Policy' and it contains information about 3 different insurance policies: 'Member Life Insurance', 'Member Accidental Death and Dismemberment Insurance' and 'Dependent Life Insurance'.

#             Guidelines:
#             1. Extract information that directly answers the user's query from the document excerpts.
#             2. Organize the response using clear headings, bullet points, or tables where applicable.
#             3. If the text includes tables with relevant information, reformat them into a clear, readable structure.
#             4. Cite the relevant policy name(s) and page number(s) using the metadata from the dataframe.
#             5. If the provided excerpts do not fully answer the query, provide partial information and suggest which sections of the policy document the user should review for further details.
#             6. If no relevant information is found in the provided excerpts, respond with 'No relevant information found in the provided excerpts.'

#             ### Example Query:
#             **User Query**: "What are the premium rates for different types of insurance under this policy?"

#             **Premium Rates**:
#             1. **Member Life Insurance**: $0.210 for each $1,000 of insurance in force.
#             2. **Member Accidental Death and Dismemberment Insurance**: $0.025 for each $1,000 of Member Life Insurance in force.
#             3. **Dependent Life Insurance**: $1.46 for each Member insured for Dependent Life Insurance.

#             **Multiple Policy Discount**: The Policyholder may be eligible for a multiple policy discount if they have at least two other eligible group insurance policies underwritten by The Principal.

#             **Citations**: Policy Name: Group Life Insurance Policy, Page Number: 12.

#             ### Your Task:
#             The user query is: '{query}'
#             """
#         }
#     ]
#     return prompt

# prompt = create_prompt(query, top_docs)

# # Function to generate the response.

# def generate_response(query, top_docs):
#     """
#     Generate a response using GPT-3.5's ChatCompletion based on the user query and the retrieved information.
#     """
#     messages = [
#         {"role": "system", "content": "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
#         {"role": "user", "content": f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
#         You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents in the dataframe '{top_docs}'. These search results are essentially one page of an insurance document that may be relevant to the user query.

#         The column 'documents' inside this dataframe contains the actual text from the policy document and the column 'metadata' contains the policy name and source page. The text inside the document may also contain tables in the format of a list of lists where each of the nested lists indicates a row.

#         Use the documents in '{top_docs}' to answer the query '{query}'. Frame an informative answer, and also use the dataframe to return the relevant policy names and page numbers as citations.

#         Follow the guidelines below when performing the task.
#         1. Try to provide relevant/accurate numbers if available.
#         2. You don't have to necessarily use all the information in the dataframe. Only choose information that is relevant.
#         3. If the document text has tables with relevant information, please reformat the table and return the final information in a tabular format.
#         4. Use the Metadatas column in the dataframe to retrieve and cite the policy name(s) and page number(s) as citations.
#         5. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
#         6. You are a customer-facing assistant, so do not provide any information on internal workings, just answer the query directly.

#         The generated response should answer the query directly, addressing the user and avoiding additional information. If you think that the query is not relevant to the document, reply that the query is irrelevant. Provide the final response as well-formatted, easily readable text along with the citation. Provide your complete response first with all information, and then provide the citations.
#         """},
#     ]

#     response = openai.chat.completions.create(
#         model="gpt-3.5-turbo",
#         messages=messages
#     )

#     return response.choices[0].message.content.split('\n')

# response = generate_response(query, top_docs)
# print(query + '\n')
# print("\n".join(response))

# Function to generate the response.

def generate_response(query, top_docs):
    """
    Build the RAG prompt from the user query and the retrieved documents, formatted as a Gemini conversation.
    """
    messages = f"""
    Remember your system message and that you are a helpful assistant that extracts relevant information from insurance policy documents to answer user queries accurately and concisely.
    Your task is to extract and present relevant information from the policy documents to answer the user's query.
    The document excerpts are provided in the dataframe '{top_docs}', with the actual policy text in the 'documents' column and metadata (page numbers) in the 'metadata' column.
    The user input is: '{query}'
    """

    # response = openai.chat.completions.create(
    #     model="gpt-3.5-turbo",
    #     messages=messages
    # )
    conversation = [{"role": "user", "parts": messages}]

    return conversation  # response.choices[0].message.content.split('\n')

# response = generate_response(query, top_docs)
# print(query + '\n')
# print("\n".join(response))
694 |
+
"""## <font color = yellow> Query Search
|
695 |
+
|
696 |
+
### <font color = yellow> Query #1
|
697 |
+
"""
|
698 |
+
|
699 |
+
# query1 = "what happens if failed to Pay Premium?"
|
700 |
+
|
701 |
+
# results_df = retreive_results(query1, insurance_collection, cache_collection)
|
702 |
+
# top_docs = rerank_with_cross_encoder(results_df)
|
703 |
+
# top_docs
|
704 |
+
|
705 |
+
# #generate response
|
706 |
+
# response = generate_response(query1, top_docs)
|
707 |
+
|
708 |
+
# print("\n".join(response))
|
709 |
+
|
710 |
+
# """### <font color = yellow> Query #2"""
|
711 |
+
|
712 |
+
# query2 = "what are the eligibility requirements for different types of insurance under this policy?"
|
713 |
+
|
714 |
+
# results_df = retreive_results(query2, insurance_collection, cache_collection)
|
715 |
+
# top_docs = rerank_with_cross_encoder(results_df)
|
716 |
+
# top_docs
|
717 |
+
|
718 |
+
# #generate response
|
719 |
+
# response = generate_response(query2, top_docs)
|
720 |
+
# print("\n".join(response))
|
721 |
+
|
722 |
+
# """### <font color = yellow> Query #3"""
|
723 |
+
|
724 |
+
# query3 = "What are the Termination Rights of the Policyholder?"
|
725 |
+
|
726 |
+
# results_df = retreive_results(query3, insurance_collection, cache_collection)
|
727 |
+
# top_docs = rerank_with_cross_encoder(results_df)
|
728 |
+
# top_docs
|
729 |
+
|
730 |
+
# #generate response
|
731 |
+
# response = generate_response(query3, top_docs)
|
732 |
+
# print("\n".join(response))
|
733 |
+
|
734 |
+
+# def run_pipeline(chunk_strategy,
+#                  embedding_function,
+#                  chroma_data_path,
+#                  query,
+#                  cross_encoder,
+#                  top_k,
+#                  rag_model,
+#                  prompt_style="default"):
+
+#     # Embedding layer
+#     # Preprocess documents
+
+#     # Extract text
+#     # Split into chunks
+#     if chunk_strategy == "page":
+#         docs = extract_pages_from_pdf(pdf_path)
+#     elif chunk_strategy == "fixed_size":
+#         docs = fixed_size_chunking_of_pdf(pdf_path)
+
+#     docs_df = store_docs_to_df(docs)
+
+#     # Generate embeddings and store in chromadb collection and cache
+#     insurance_collection, cache_collection = generate_embeddings(docs_df, embedding_function)
+
+#     # Retrieve documents relevant to query from collections and store in cache
+#     results_df = retreive_results(query, insurance_collection, cache_collection)
+
+#     # Re-rank with Cross Encoder
+#     top_re_ranks, top_df = rerank_with_cross_encoder(results_df, top_k)
+
+#     # Create prompt
+#     prompt = create_prompt(query, top_re_ranks)
+
+#     # Generate response
+#     response = generate_response(prompt, rag_model)
+
+#     return top_df, response
+
+# # select chunking strategy
+
+# # chunk_strategy = "page"
+# chunk_strategy = "fixed_size"
+# # Load the tokenizer
+# tokenizer = tiktoken.get_encoding("cl100k_base")
+# # Define the token limit for each chunk
+# TOKEN_SIZE = 500  # Adjust this based on your needs
+
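The fixed_size strategy relies on fixed_size_chunking_of_pdf(), defined earlier in this file. As a rough sketch of what token-budgeted chunking with this tokenizer looks like (the chunk_text helper and the plain-text input are illustrative assumptions, not the commit's own implementation):

    # Sketch only: slice a text's token stream into ~TOKEN_SIZE-token chunks.
    import tiktoken

    tokenizer = tiktoken.get_encoding("cl100k_base")
    TOKEN_SIZE = 500

    def chunk_text(text):
        # Encode once, then decode fixed-size windows of tokens back to text.
        tokens = tokenizer.encode(text)
        return [tokenizer.decode(tokens[i:i + TOKEN_SIZE])
                for i in range(0, len(tokens), TOKEN_SIZE)]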
+# # Import the OpenAI Embedding Function into chroma
+# from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
+
+# # select the model and initialise the embedding function
+# # model = "text-embedding-ada-002"
+# # embedding_function = OpenAIEmbeddingFunction(api_key=openai.api_key, model_name=model)
+
+# from chromadb.utils import embedding_functions
+# embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="paraphrase-mpnet-base-v2")
+# # embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="multi-qa-MiniLM-L6-cos-v1")
+# # embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
+
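Attaching the chosen embedding function to a Chroma collection follows the standard chromadb pattern; a minimal sketch, where the collection name and sample document are illustrative and the persistent path mirrors the chroma/ directory shipped in this commit (the commit's own wiring lives in generate_embeddings(), earlier in the file):

    # Sketch only: a SentenceTransformer embedding function wired into a Chroma collection.
    import chromadb
    from chromadb.utils import embedding_functions

    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="paraphrase-mpnet-base-v2"
    )
    client = chromadb.PersistentClient(path="chroma")
    collection = client.get_or_create_collection(
        name="insurance_docs",  # illustrative name
        embedding_function=embedding_function,
    )
    collection.add(documents=["sample policy text chunk"], ids=["doc-0"])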
+# # Import the CrossEncoder library from sentence_transformers
+# from sentence_transformers import CrossEncoder, util
+# # Initialise the cross encoder model
+# cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')
+# # cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
+# # cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
+
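The cross encoder scores (query, document) pairs so the bi-encoder's candidates can be re-ordered by relevance. A minimal sketch of that re-ranking step; the rerank helper below is illustrative, while the commit's own version is rerank_with_cross_encoder(), defined earlier in the file:

    # Sketch only: score (query, doc) pairs and keep the highest-scoring top_k docs.
    from sentence_transformers import CrossEncoder

    cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')

    def rerank(query, docs, top_k=5):
        scores = cross_encoder.predict([[query, doc] for doc in docs])
        ranked = sorted(zip(docs, scores), key=lambda pair: pair[1], reverse=True)
        return [doc for doc, _ in ranked[:top_k]]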
+# # test query
+# # query = "what are the eligibility requirements?"
+# # query = "what are the eligibility requirements for different types of insurance under this policy?"
+
+# # query = "what are the benefits payable?"
+# # query = "what are the benefits payable for different types of insurance under this policy?"
+# # query = "What are the benefits payable of Member Accidental Death and Dismemberment Insurance?"
+# # query = "What are the benefits of Member Life Insurance?"
+
+# # query = "How much is the premium amount?"
+# # query = "How much is the premium amount for different types of insurance under this policy?"
+
+# # query = "How much is the premium rate?"
+# # query = "What are the premium rates for different types of insurance under this policy?"
+# # query = "What are the premium rates?"
+
+# # print(query)
+
+# # how many top query results to consider for generating the response
+# top_k = 5
+
+# # select RAG model
+# rag_model = "gpt-3.5-turbo"
+
+# top_df, response = run_pipeline(chunk_strategy,
+#                                 embedding_function,
+#                                 chroma_data_path,
+#                                 query,
+#                                 cross_encoder,
+#                                 top_k,
+#                                 rag_model)
+# # results_df = run_pipeline(chunk_strategy,
+# #                           embedding_function,
+# #                           chroma_data_path,
+# #                           query,
+# #                           cross_encoder,
+# #                           top_k,
+# #                           rag_model)
+
+# # top_re_ranks = run_pipeline(chunk_strategy,
+# #                             embedding_function,
+# #                             chroma_data_path,
+# #                             query,
+# #                             cross_encoder,
+# #                             top_k,
+# #                             rag_model)
+
+# print("\n".join(response))
+# # print(prompt)
+# # top_re_ranks
+# # docs_df.head(100)
+# # top_semantic_search
+# top_df
+# # results_df
requirements.txt
ADDED
@@ -0,0 +1,12 @@
+chromadb==0.5.7
+google.generativeai
+python-dotenv
+pandas
+numpy
+fastapi
+uvicorn
+jinja2
+python-multipart
+pdfplumber
+sentence_transformers
+tiktoken
static/css/styles.css
ADDED
@@ -0,0 +1,100 @@
+button {
+    /* These styles apply to all buttons */
+    color: white; /* Color of the button text */
+    background-color: rgb(165, 152, 111); /* Background color of the button */
+    border: 2px solid black; /* Border around the button. It's 2px wide, solid, and black */
+    padding: 10px; /* Space between the button text and the edge of the button */
+    width: 200px; /* The button is 200px wide */
+    margin-top: 20px; /* Space between the top of the button and the element above it */
+    border-radius: 10px; /* Rounds the corners of the button */
+    cursor: pointer; /* The cursor turns into a hand when it's over the button */
+    font-family: 'Courier New', Courier, monospace; /* Font of the text. If 'Courier New' isn't available, the browser tries 'Courier', then any monospace font */
+}
+
+.outercontainer {
+    /* These styles apply to the div with class "outercontainer" */
+    max-width: 800px; /* Maximum width of the container. On screens narrower than 800px, the container shrinks to fit */
+    margin: 0 auto; /* Centers the container. 'auto' makes the left and right margins equal */
+    padding: 20px; /* Space between the content of the container and its borders */
+    border: 5px solid #ccc; /* Border around the container. It's 5px wide, solid, and light grey (#ccc) */
+    border-radius: 50px; /* Rounds the corners of the border */
+}
+
+.conversationcontainer {
+    /* These styles apply to the div with class "conversationcontainer" */
+    border: 5px solid #ccc; /* Border around the conversation. It's 5px wide, solid, and light grey (#ccc) */
+    padding: 10px; /* Space between the content of the conversation and its borders */
+    height: 500px; /* The conversation box is always 500px tall */
+    background-color: #f3f3f3; /* Background color of the conversation */
+    overflow-y: scroll; /* If the conversation gets too tall for its container, it becomes scrollable */
+    border-radius: 50px; /* Rounds the corners of the border */
+}
+
+#titlestyle {
+    color: black; /* Color of the heading text */
+    background-color: rgb(227, 224, 212); /* Background color of the heading */
+    border: 2px solid black; /* Border around the heading. It's 2px wide, solid, and black */
+    padding: 10px; /* Space between the content of the heading and its borders */
+    margin-top: 20px; /* Space between the top of the heading and the element above it */
+    border-radius: 10px; /* Rounds the corners of the border */
+    width: fit-content; /* The width of the heading is just enough to fit its content */
+}
+
+#inputtextbox {
+    /* These styles apply to the text input field */
+    width: 85%; /* The text input is 85% as wide as its container */
+    border: 5px solid #ccc; /* Border around the text input. It's 5px wide, solid, and light grey (#ccc) */
+    padding: 10px; /* Space between the text in the input and the edge of the input field */
+    background-color: #f3f3f3f3; /* Background color of the text input field */
+    border-radius: 10px; /* Rounds the corners of the text input field */
+}
+
+#submitbutton {
+    /* These styles apply to the submit button */
+    background: url(/static/send-icon.png) no-repeat center center; /* Background image of the button, centered and not repeated */
+    background-size: cover; /* The image covers the entire background of the button */
+    border: none; /* The button has no border */
+    width: 40px; /* The button is 40px wide */
+    height: 40px; /* The button is 40px tall */
+    cursor: pointer; /* The cursor turns into a hand when it's over the button */
+}
+
+.user {
+    /* These styles apply to divs with class "user", i.e., user messages */
+    color: white; /* Color of the user's text */
+    background-color: rgb(0, 195, 255); /* Background color of the user's messages */
+    padding: 8px; /* Space between the user's text and the edge of its container */
+    border-radius: 10px; /* Rounds the corners of the user's messages */
+    float: right; /* Floats the user's messages to the right */
+    clear: both; /* Positions the user's messages below any floated elements before them */
+    max-width: 80%; /* Maximum width of the user's messages */
+    margin: 5px;
+}
+
+.bot {
+    /* Same as above, but for the bot's messages */
+    color: white;
+    background-color: rgb(0, 128, 90);
+    padding: 8px;
+    border-radius: 10px;
+    float: left;
+    clear: both;
+    max-width: 80%;
+    margin: 5px;
+}
+
+.center {
+    /* These styles apply to elements with class "center" */
+    display: flex; /* The center element is a flex container */
+    justify-content: center; /* Items inside the flex container are centered horizontally */
+}
+
+form {
+    /* These styles apply to all forms */
+    margin: 10px; /* Space around the outside of each form */
+}
+
+body {
+    font-family: 'Courier New', Courier, monospace; /* Font of the text. If 'Courier New' isn't available, the browser tries 'Courier', then any monospace font */
+    margin: 50px; /* Space around the outside of the <body> */
+}
static/send-icon.png
ADDED
templates/index_bye.html
ADDED
@@ -0,0 +1,21 @@
+<html>
+<head>
+    <title>
+        Invite App
+    </title>
+</head>
+<body>
+    <div>
+        <h1>
+            Invite App
+        </h1>
+    </div>
+    <div>
+        <h2>
+            Bye, you are not invited to the event, {{ name_xyz }}
+
+        </h2>
+    </div>
+
+</body>
+</html>
templates/index_hello.html
ADDED
@@ -0,0 +1,21 @@
+<html>
+<head>
+    <title>
+        Invite App
+    </title>
+</head>
+<body>
+    <div>
+        <h1>
+            Invite App
+        </h1>
+    </div>
+    <div>
+        <h2>
+            Hello, you are invited to the event, {{ name_xyz }}
+
+        </h2>
+    </div>
+
+</body>
+</html>
templates/index_invite.html
ADDED
@@ -0,0 +1,51 @@
+<html>
+<head>
+    <title>
+        Insurance Policy AI Assistant
+    </title>
+    <link rel="stylesheet" type="text/css" href="/static/css/styles.css">
+</head>
+<body>
+    <div class="outercontainer">
+        <div class="center">
+            <h1 id="titlestyle">
+                Insurance Policy AI Assistant
+            </h1>
+        </div>
+        <div class="conversationcontainer" id="chatcontainer">
+            {% for entry in name_xyz %}
+            <div class="{% if entry.bot %}bot{% else %}user{% endif %}">
+                {% if entry.bot %}
+                    {{ entry.bot|safe }}
+                {% else %}
+                    {{ entry.user }}
+                {% endif %}
+            </div>
+            {% endfor %}
+        </div>
+
+        <form action="/invite" method="POST" class="center">
+            <input type="text" name="user_input_message" id="inputtextbox">
+            <input type="submit" value=" " id="submitbutton">
+        </form>
+
+        <form action="/end_conv" method="POST" class="center">
+            <button type="submit">END CONVERSATION</button>
+        </form>
+    </div>
+
+    <script>
+        // Scroll to the bottom of the chat container on page load and after a new message is sent
+        function scrollToBottom() {
+            var chatContainer = document.getElementById('chatcontainer');
+            chatContainer.scrollTop = chatContainer.scrollHeight;
+        }
+
+        window.onload = scrollToBottom;
+
+    </script>
+
+
+
+</body>
+</html>
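For context on how these templates are driven: the chat form posts user_input_message to /invite, and the page renders a name_xyz list of {user: ...} / {bot: ...} entries. A minimal sketch of a matching FastAPI handler; the handler body and the history structure are inferred from the template, not copied from app/main.py:

    # Sketch only: serving index_invite.html; inferred from the template, not from app/main.py.
    from fastapi import FastAPI, Form, Request
    from fastapi.templating import Jinja2Templates

    app = FastAPI()
    templates = Jinja2Templates(directory="templates")
    conversation_history = []  # entries shaped like {"user": ...} or {"bot": ...}

    @app.post("/invite")
    async def invite(request: Request, user_input_message: str = Form(...)):
        conversation_history.append({"user": user_input_message})
        conversation_history.append({"bot": "..."})  # assistant reply would be appended here
        return templates.TemplateResponse(
            "index_invite.html",
            {"request": request, "name_xyz": conversation_history},
        )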