mikepastor11 committed on
Commit
7bb34c1
·
verified ·
1 Parent(s): 00e4319

Initial load

Browse files
Files changed (2) hide show
  1. PennwickFileAnalyzer.py +168 -0
  2. htmlTemplates.py +44 -0
PennwickFileAnalyzer.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ##############################################################
2
+ # PDF Chat
3
+ #
4
+ # Mike Pastor February 2024
5
+
6
+
7
+ import streamlit as st
8
+ from dotenv import load_dotenv
9
+
10
+ from PyPDF2 import PdfReader
11
+ from langchain.text_splitter import CharacterTextSplitter
12
+
13
+ from InstructorEmbedding import INSTRUCTOR
14
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
15
+ from langchain.vectorstores import FAISS
16
+ from langchain.chat_models import ChatOpenAI
17
+ from langchain.memory import ConversationBufferMemory
18
+ from langchain.chains import ConversationalRetrievalChain
19
+ from htmlTemplates import css, bot_template, user_template
20
+ from langchain.llms import HuggingFaceHub
21
+
22
def get_pdf_text(pdf_docs):
    """Return the text of every page of every PDF in *pdf_docs*, concatenated.

    Args:
        pdf_docs: Iterable of PDF sources, each passed as-is to ``PdfReader``.
            ``None`` or an empty iterable yields an empty string.

    Returns:
        A single string with the page texts joined in document/page order,
        with no separator inserted between pages.
    """
    page_texts = []
    for pdf in pdf_docs or ():
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # Defensive: substitute "" for a missing/None result so the join
            # below never sees a non-string value.
            page_texts.append(page.extract_text() or "")
    # One join instead of repeated ``+=`` avoids quadratic string building.
    return "".join(page_texts)
29
+
30
+ # Chunk size and overlap must not exceed the model's capacity!
31
+ #
32
def get_text_chunks(text, chunk_size=800, chunk_overlap=200):
    """Split *text* into overlapping chunks on newline boundaries.

    Args:
        text: The full text to split.
        chunk_size: Maximum chunk length, measured with ``len``. Defaults to
            800, the value this function previously hard-coded.
        chunk_overlap: Number of characters shared between neighbouring
            chunks. Defaults to 200, the previously hard-coded value.

    Returns:
        The list of chunks produced by ``CharacterTextSplitter.split_text``.
    """
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(text)
41
+
42
+
43
def get_vectorstore(text_chunks):
    """Embed *text_chunks* and index them in a FAISS vector store.

    Embeddings come from ``HuggingFaceInstructEmbeddings`` configured with
    the "hkunlp/instructor-xl" model.

    Args:
        text_chunks: Non-empty collection of text chunks to index.

    Returns:
        The store returned by ``FAISS.from_texts``.

    Raises:
        ValueError: If *text_chunks* is empty, since there is nothing to
            index and the failure would otherwise surface much later.
    """
    # Fail fast, before the embedding model is constructed.
    if not text_chunks:
        raise ValueError("text_chunks is empty; there is no text to index")

    # Requires:
    #   pip install InstructorEmbedding
    #   pip install sentence-transformers==2.2.2
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    print('have Embeddings: ')

    # FAISS is used here; Chroma and other vector databases are alternatives.
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    print('FAISS succeeds: ')

    return vectorstore
66
+
67
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain on top of *vectorstore*.

    The chain combines a ``HuggingFaceHub`` LLM ("google/flan-t5-xxl"), the
    retriever obtained from ``vectorstore.as_retriever()``, and a
    ``ConversationBufferMemory`` stored under the 'chat_history' key.

    Args:
        vectorstore: Object exposing ``as_retriever()``.

    Returns:
        The chain created by ``ConversationalRetrievalChain.from_llm``.
    """
    generation_settings = {"temperature": 0.5, "max_length": 512}
    hub_llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs=generation_settings,
    )
    history = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    doc_retriever = vectorstore.as_retriever()
    return ConversationalRetrievalChain.from_llm(
        llm=hub_llm,
        retriever=doc_retriever,
        memory=history,
    )
81
+
82
def handle_userinput(user_question):
    """Send *user_question* to the conversation chain and render the chat.

    The chain's 'chat_history' is stored in ``st.session_state.chat_history``
    and every message is rendered with the user/bot HTML templates;
    even-indexed messages use the user template, odd-indexed the bot one.

    Args:
        user_question: The question text entered by the user.
    """
    # Function-scope import keeps this block self-contained.
    import html

    conversation = st.session_state.conversation
    if conversation is None:
        # No documents processed yet: calling None would raise TypeError.
        st.warning("Please upload and process your documents before asking a question.")
        return

    response = conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for i, message in enumerate(st.session_state.chat_history):
        template = user_template if i % 2 == 0 else bot_template
        # The templates are rendered as raw HTML, so escape the message text
        # (user input / model output) to keep it from injecting markup.
        safe_content = html.escape(message.content)
        st.write(template.replace("{{MSG}}", safe_content), unsafe_allow_html=True)
99
+
100
+
101
+
102
+
103
def _process_uploaded_pdfs(pdf_docs):
    """Vectorize *pdf_docs* and store a conversation chain in session state."""
    from datetime import datetime

    # Track the overall time for file processing into vectors.
    global_now = datetime.now()
    global_current_time = global_now.strftime("%H:%M:%S")
    st.write("Vectorizing Files - Current Time =", global_current_time)

    raw_text = get_pdf_text(pdf_docs)
    text_chunks = get_text_chunks(raw_text)
    if not text_chunks:
        # Nothing to index (e.g. no extractable text); building a vector
        # store from zero chunks would fail.
        st.warning("No extractable text was found in the uploaded files.")
        return

    vectorstore = get_vectorstore(text_chunks)
    st.session_state.conversation = get_conversation_chain(vectorstore)

    global_later = datetime.now()
    st.write("Files Vectorized - Total EXECUTION Time =",
             (global_later - global_now), global_later)


def main():
    """Run the Streamlit PDF-chat page.

    Loads environment variables, configures the page, initialises the
    ``conversation`` and ``chat_history`` session entries, answers the
    question typed by the user, and offers a sidebar where PDFs can be
    uploaded and processed into a conversation chain.
    """
    load_dotenv()
    st.set_page_config(page_title="MLP Chat with multiple PDFs",
                       page_icon=":books:")

    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Mike's PDF Chat :books:")

    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)

        # Upon button press
        if st.button("Process these files"):
            if not pdf_docs:
                # Pressing the button with no uploads used to crash later on.
                st.warning("Please upload at least one PDF before processing.")
            else:
                with st.spinner("Processing..."):
                    _process_uploaded_pdfs(pdf_docs)
163
+
164
+
165
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
167
+
168
+
htmlTemplates.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Stylesheet for the chat bubbles produced by bot_template / user_template.
# The <style> element is explicitly closed so that content rendered after it
# is not absorbed into the stylesheet.
css = '''
<style>
.chat-message {
    padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
}
.chat-message.user {
    background-color: #2b313e
}
.chat-message.bot {
    background-color: #475063
}
.chat-message .avatar {
    width: 20%;
}
.chat-message .avatar img {
    max-width: 78px;
    max-height: 78px;
    border-radius: 50%;
    object-fit: cover;
}
.chat-message .message {
    width: 80%;
    padding: 0 1.5rem;
    color: #fff;
}
</style>
'''
27
+
28
# HTML for one bot message; callers substitute the {{MSG}} placeholder.
# The avatar image carries alt text so the markup stays meaningful when the
# image cannot be shown.
bot_template = '''
<div class="chat-message bot">
    <div class="avatar">
        <img src="https://free-images.com/sm/9cb8/sunset_sundown_da_nang.jpg" alt="Bot avatar" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''
36
+
37
# HTML for one user message; callers substitute the {{MSG}} placeholder.
# The avatar image carries alt text so the markup stays meaningful when the
# image cannot be shown.
user_template = '''
<div class="chat-message user">
    <div class="avatar">
        <img src="https://free-images.com/sm/176d/squirrel_tail_bushy_tail.jpg" alt="User avatar" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''