danicafisher committed
Commit 8983152
1 Parent(s): 96c1443

Cleans up files

Files changed (2)
  1. app.py +56 -92
  2. requirements.txt +1 -12
app.py CHANGED
@@ -1,60 +1,14 @@
-import os
 from typing import List
-from PyPDF2 import PdfReader
-
-from langchain.embeddings.openai import OpenAIEmbeddings
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.vectorstores import Chroma
-from langchain.chat_models import ChatOpenAI
-from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
-from langchain.docstore.document import Document
-from langchain.schema import StrOutputParser
-from langchain.chains import (
-    ConversationalRetrievalChain,
-    LLMChain
+from aimakerspace.text_utils import CharacterTextSplitter, PDFFileLoader
+from aimakerspace.openai_utils.prompts import (
+    UserRolePrompt,
+    SystemRolePrompt
 )
-
+from aimakerspace.vectordatabase import VectorDatabase
+from aimakerspace.openai_utils.chatmodel import ChatOpenAI
 import chainlit as cl
-
-
-class PDFFileLoader:
-    def __init__(self, path: str):
-        self.documents = []
-        self.path = path
-
-    def load(self):
-        if os.path.isdir(self.path):
-            self.load_directory()
-        elif os.path.isfile(self.path) and self.path.endswith(".pdf"):
-            self.load_file()
-        else:
-            raise ValueError(
-                "Provided path is neither a valid directory nor a .pdf file."
-            )
-
-    def load_file(self):
-        with open(self.path, "rb") as file:
-            pdf_reader = PdfReader(file)
-            text = ""
-            for page in pdf_reader.pages:
-                text += page.extract_text()
-            self.documents.append(text)
-
-    def load_directory(self):
-        for root, _, files in os.walk(self.path):
-            for file in files:
-                if file.endswith(".pdf"):
-                    file_path = os.path.join(root, file)
-                    with open(file_path, "rb") as f:
-                        pdf_reader = PdfReader(f)
-                        text = ""
-                        for page in pdf_reader.pages:
-                            text += page.extract_text()
-                        self.documents.append(text)
-
-    def load_documents(self):
-        self.load()
-        return self.documents
+import nest_asyncio
+nest_asyncio.apply()
 
 
 pdf_loader_NIST = PDFFileLoader("data/NIST.AI.600-1.pdf")
@@ -62,25 +16,19 @@ pdf_loader_Blueprint = PDFFileLoader("data/Blueprint-for-an-AI-Bill-of-Rights.pd
 documents_NIST = pdf_loader_NIST.load_documents()
 documents_Blueprint = pdf_loader_Blueprint.load_documents()
 
+text_splitter = CharacterTextSplitter()
+split_documents_NIST = text_splitter.split_texts(documents_NIST)
+split_documents_Blueprint = text_splitter.split_texts(documents_Blueprint)
 
-text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
-split_documents_NIST = text_splitter.split_text(documents_NIST)
-split_documents_Blueprint = text_splitter.split_text(documents_Blueprint)
-documents = split_documents_NIST + split_documents_Blueprint
 
-embeddings = OpenAIEmbeddings()
-# Create a metadata for each chunk
-metadatas = [{"source": f"{i}-pl"} for i in range(len(documents))]
-
-# Set up prompts
 RAG_PROMPT_TEMPLATE = """ \
 Use the provided context to answer the user's query.
-
 You may not answer the user's query unless there is specific context in the following text.
-
 If you do not know the answer, or cannot answer, please respond with "I don't know".
 """
 
+rag_prompt = SystemRolePrompt(RAG_PROMPT_TEMPLATE)
+
 USER_PROMPT_TEMPLATE = """ \
 Context:
 {context}
@@ -88,42 +36,58 @@ User Query:
 {user_query}
 """
 
-rag_prompt = SystemMessagePromptTemplate.from_template(RAG_PROMPT_TEMPLATE)
-user_prompt = HumanMessagePromptTemplate.from_template(USER_PROMPT_TEMPLATE)
-chat_prompt = ChatPromptTemplate.from_messages([rag_prompt, user_prompt])
+user_prompt = UserRolePrompt(USER_PROMPT_TEMPLATE)
 
-@cl.on_chat_start
-async def start_chat():
-    # settings = {
-    #     "model": "gpt-4o-mini",
-    #     "temperature": 0,
-    #     "max_tokens": 500,
-    #     "top_p": 1,
-    #     "frequency_penalty": 0,
-    #     "presence_penalty": 0,
-    # }
+class RetrievalAugmentedQAPipeline:
+    def __init__(self, llm: ChatOpenAI, vector_db_retriever: VectorDatabase) -> None:
+        self.llm = llm
+        self.vector_db_retriever = vector_db_retriever
 
-    # cl.user_session.set("settings", settings)
+    async def arun_pipeline(self, user_query: str):
+        context_list = self.vector_db_retriever.search_by_text(user_query, k=4)
 
+        context_prompt = ""
+        for context in context_list:
+            context_prompt += context[0] + "\n"
+
+        formatted_system_prompt = rag_prompt.create_message()
+
+        formatted_user_prompt = user_prompt.create_message(user_query=user_query, context=context_prompt)
+
+        async def generate_response():
+            async for chunk in self.llm.astream([formatted_system_prompt, formatted_user_prompt]):
+                yield chunk
+
+        return {"response": generate_response(), "context": context_list}
 
-    # Vector Database
-    docsearch = await cl.make_async(Chroma.from_texts)(
-        documents, embeddings, metadatas=metadatas
-    )
 
-    # Create a chain that uses the Chroma vector store
-    chain = ConversationalRetrievalChain.from_llm(
-        ChatOpenAI(model_name="gpt-4o-mini", streaming=True),
-        prompt=chat_prompt,
-        output_parser=StrOutputParser(),
-        retriever=docsearch.as_retriever()
+# ------------------------------------------------------------
+
+
+@cl.on_chat_start
+async def start_chat():
+    settings = {
+        "model": "gpt-4o-mini"
+    }
+    cl.user_session.set("settings", settings)
+
+    # Create a vector store
+    vector_db = VectorDatabase()
+    vector_db = await vector_db.abuild_from_list(split_documents_NIST)
+    vector_db = await vector_db.abuild_from_list(split_documents_Blueprint)
+
+    chat_openai = ChatOpenAI()
+
+    # Create a chain
+    retrieval_augmented_qa_pipeline = RetrievalAugmentedQAPipeline(
+        vector_db_retriever=vector_db,
+        llm=chat_openai
     )
 
-    # chain = LLMChain(llm= ChatOpenAI(model_name="gpt-4o-mini", streaming=True), prompt=chat_prompt, output_parser=StrOutputParser())
+    cl.user_session.set("chain", retrieval_augmented_qa_pipeline)
 
-    cl.user_session.set("chain", chain)
 
-@cl.on_message # marks a function that should be run each time the chatbot receives a message from a user
+@cl.on_message
 async def main(message):
    chain = cl.user_session.get("chain")
 
 
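Note: the app.py hunk ends partway through `main`. A minimal sketch of how the rest of the handler could consume the pipeline, assuming Chainlit 0.7's `cl.Message` streaming API (`stream_token`, `send`) and the {"response", "context"} dict returned by `arun_pipeline` above; the actual handler body simply falls outside the hunk:

@cl.on_message
async def main(message: cl.Message):
    chain = cl.user_session.get("chain")

    # arun_pipeline returns {"response": <async generator>, "context": [...]}.
    result = await chain.arun_pipeline(message.content)

    # Stream each token into a single Chainlit message as it arrives.
    msg = cl.Message(content="")
    async for token in result["response"]:
        await msg.stream_token(token)
    await msg.send()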
requirements.txt CHANGED
@@ -1,15 +1,4 @@
 numpy
 chainlit==0.7.700
 openai
-PyPDF2
-pymupdf
-# langchain
-# langchain-core
-langchain-community
-langchain-text-splitters
-# langchain-openai
-# qdrant-client
-# langchain-qdrant
-langchain
-chromadb
-tiktoken
+PyPDF2