viboognesh committed on
Commit
a8a91b0
1 Parent(s): e12b281

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +118 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from io import BytesIO
2
+ import streamlit as st
3
+ import requests
4
+ import os
5
+ from PyPDF2 import PdfReader
6
+ from langchain.text_splitter import CharacterTextSplitter
7
+ from langchain_openai import OpenAIEmbeddings
8
+ from langchain_community.vectorstores import Chroma
9
+ from langchain_openai import ChatOpenAI
10
+ from langchain.memory import ConversationBufferMemory
11
+ from langchain.chains import ConversationalRetrievalChain
12
+
13
def getpdfdoc():
    """Return the raw bytes of the "48 Laws of Power" PDF.

    The file is read from ``48lawsofpower.pdf`` in the current working
    directory when it exists. Otherwise it is downloaded, stored at that
    path so later runs can reuse it, and the downloaded bytes are returned.

    Returns:
        bytes: The content of the PDF file.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the local file cannot be read or written.
    """
    filename = '48lawsofpower.pdf'
    url = 'https://pgcag.files.wordpress.com/2010/01/48lawsofpower.pdf'

    with st.spinner("Loading PDF..."):
        if os.path.exists(filename):
            with open(filename, 'rb') as f:
                return f.read()

        # A timeout keeps the app from hanging forever on a stalled server.
        response = requests.get(url, timeout=60)
        # Fail loudly on 4xx/5xx instead of caching an error page as the PDF.
        response.raise_for_status()
        pdf_doc = response.content

        # Write to a side file and rename it into place, so an interrupted
        # write never leaves a truncated file that later runs would trust.
        partial_name = filename + '.part'
        with open(partial_name, 'wb') as f:
            f.write(pdf_doc)
        os.replace(partial_name, filename)

        # The bytes are already in memory; no need to re-read the file.
        return pdf_doc
28
+
29
+
30
def extract_text_from_pdf(pdf_file_obj):
    """Concatenate the extracted text of every page of a PDF.

    Args:
        pdf_file_obj: The PDF content as a bytes-like object; it is wrapped
            in a ``BytesIO`` before being handed to ``PdfReader``.

    Returns:
        The text of all pages, joined in page order with no separator.
    """
    with st.spinner("Extracting text from PDF..."):
        reader = PdfReader(BytesIO(pdf_file_obj))
        return "".join(page.extract_text() for page in reader.pages)
38
+
39
def get_text_chunks(text):
    """Split *text* into chunks with a ``CharacterTextSplitter``.

    The splitter is configured with a newline separator, a chunk size of
    1000 and an overlap of 200, with lengths measured by ``len``.

    Args:
        text: The full text to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for *text*.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    with st.spinner("Splitting text into chunks..."):
        splitter = CharacterTextSplitter(**splitter_options)
        return splitter.split_text(text)
49
+
50
+
51
def get_vectorstore(text_chunks):
    """Build a Chroma vector store from the given text chunks.

    Each chunk is tagged with a ``{"source": "<index>-pl"}`` metadata entry,
    embedded with ``OpenAIEmbeddings`` and stored with ``./chroma_db`` as the
    persist directory.

    Args:
        text_chunks: The chunk strings to index.

    Returns:
        The store created by ``Chroma.from_texts``.
    """
    with st.spinner("Creating vectorstore..."):
        # One metadata dict per chunk, keyed by the chunk's position.
        chunk_sources = [
            {"source": f"{index}-pl"} for index, _ in enumerate(text_chunks)
        ]
        return Chroma.from_texts(
            texts=text_chunks,
            embedding=OpenAIEmbeddings(),
            persist_directory="./chroma_db",
            metadatas=chunk_sources,
        )
57
+
58
def get_conversation_chain(vectorstore):
    """Create a conversational retrieval chain over *vectorstore*.

    Combines a default ``ChatOpenAI`` model, the store's retriever and a
    ``ConversationBufferMemory`` kept under the ``chat_history`` key.

    Args:
        vectorstore: A store exposing ``as_retriever()``.

    Returns:
        The chain built by ``ConversationalRetrievalChain.from_llm``.
    """
    with st.spinner("Loading LLM..."):
        chat_model = ChatOpenAI()
        history = ConversationBufferMemory(
            memory_key='chat_history',
            return_messages=True,
        )
        return ConversationalRetrievalChain.from_llm(
            llm=chat_model,
            retriever=vectorstore.as_retriever(),
            memory=history,
        )
70
+
71
+
72
def handle_userinput(user_question):
    """Send a question to the stored conversation chain and render the chat.

    The chain's ``chat_history`` result is saved in the session state, then
    every message is written out with a speaker prefix.

    Args:
        user_question: The question text entered by the user.
    """
    result = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = result['chat_history']

    # Messages at even positions are labelled as the user's, odd ones as the AI's.
    for position, message in enumerate(st.session_state.chat_history):
        speaker = "AI: " if position % 2 else "User: "
        st.markdown(speaker + message.content)
81
+
82
+
83
def main():
    """Run the Streamlit question-answering page.

    Initialises the session state, obtains a conversation chain (from the
    persisted ``./chroma_db`` directory when it is non-empty, otherwise by
    loading, splitting and indexing the PDF), then shows the chat input and
    handles a submitted question.
    """
    for key in ("conversation", "chat_history"):
        if key not in st.session_state:
            st.session_state[key] = None

    # Reuse the vector store from an earlier run when one exists on disk.
    if (
        st.session_state.conversation is None
        and os.path.isdir("./chroma_db")
        and os.listdir("./chroma_db")
    ):
        with st.spinner("Loading vector store..."):
            store = Chroma(
                persist_directory="./chroma_db",
                embedding_function=OpenAIEmbeddings(),
            )
            st.session_state.conversation = get_conversation_chain(store)

    # Nothing persisted: build the store from the PDF.
    if st.session_state.conversation is None:
        chunks = get_text_chunks(extract_text_from_pdf(getpdfdoc()))
        st.session_state.conversation = get_conversation_chain(
            get_vectorstore(chunks)
        )

    if st.session_state.conversation is None:
        return

    st.header("Ask questions from 48 Laws of Power:books:")
    question = st.chat_input("Ask a question about your documents:")
    if question:
        handle_userinput(question)
116
+
117
# Start the app only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ langchain
2
+ langchain-community
3
+ langchain-openai
4
+ pypdf2
5
+ chromadb
6
+ streamlit