owl123 committed on
Commit
5621d9a
1 Parent(s): c8beef1
Files changed (5)
  1. app.py +122 -0
  2. icon_assistant.png +0 -0
  3. icon_user.png +0 -0
  4. requirements.txt +8 -0
  5. run.sh +4 -0
app.py ADDED
@@ -0,0 +1,122 @@
+ import streamlit as st
+ import langchain
+ from langchain.document_loaders import OnlinePDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.vectorstores import Pinecone
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ import pinecone
+
+ st.sidebar.markdown(" # Welcome to Ztudy ")
+
+ # ------------------------ PDF ------------------------
+ # Hard-coded PDFs (TODO: make this dynamic from Google Drive)
+ pdf_dict = {}
+ pdf_dict["Field Guide to Data Science"] = "https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf"
+ pdf_dict["2023 GPT-4 Technical Report"] = "https://cdn.openai.com/papers/gpt-4.pdf"
+ pdf_dict["Administering Data Centers"] = "https://drive.google.com/file/d/1r3bqHq-ZszXnX6UJLOaeoEEa1plUYXZu"
+ pdf_dict["First Aid Reference Guide (Google)"] = "https://drive.google.com/file/d/1fzN2wa_uJ8INUYim88eCymSvJdyDT2fz/"
+ pdf_dict["First Aid Reference Guide (Public)"] = "https://www.sja.ca/sites/default/files/2021-05/First%20aid%20reference%20guide_V4.1_Public.pdf"
+ pdf_dict["Astronomy 2106"] = "https://drive.google.com/file/d/1XXmjMLENP90-eXEqOaTxQ8O56ZwExsVT"
+ pdf_dict["Astronomy 2106 (New)"] = "https://drive.google.com/file/d/1w1S-TY2PzeJ9mjPVb1yLwcYh5EI44oP7"
+ pdf_dict["Learning Deep Learning: Chapter 1"] = "https://drive.google.com/file/d/1o7feaKFzXd5-95GffZyynAwY_fzGafhr/view?usp=sharing"
+
+ # -------------------- Globals ------------------------
+ texts = None
+ pinecone_index = "group-1"
+
+ if 'exchanges' not in st.session_state:
+     st.session_state.exchanges = []
+
+ # -------------------- Functions -----------------------
+ def console_log(msg):
+     st.sidebar.write(msg)
+
+ def init_pinecone():
+     pinecone.init(
+         api_key=st.secrets["PINECONE_API_KEY"], # find at app.pinecone.io
+         environment=st.secrets["PINECONE_API_ENV"] # next to api key in console
+     )
+     return
+
+ def load_vector_database():
+     embeddings = OpenAIEmbeddings(openai_api_key=st.secrets["OPENAI_API_KEY"])
+     init_pinecone()
+     print(f"Number of vectors: {len(texts)} to be upserted to Index: {pinecone_index}")
+     Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=pinecone_index)
+
+ def load_pdf(url):
+     console_log(f"Loading {url}")
+     loader = OnlinePDFLoader(url)
+     data = loader.load()
+     console_log(f'You have {len(data)} document(s) in your data')
+     console_log(f'There are {len(data[0].page_content)} characters in your document')
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+     global texts
+     texts = text_splitter.split_documents(data)
+     console_log(f'After splitting, you have {len(texts)} documents')
+     load_vector_database()
+
+ def chat(query):
+
+     from langchain.llms import OpenAI
+     from langchain.chains.question_answering import load_qa_chain
+
+     llm = OpenAI(temperature=0, openai_api_key=st.secrets["OPENAI_API_KEY"])
+     chain = load_qa_chain(llm, chain_type="stuff")
+
+     embeddings = OpenAIEmbeddings(openai_api_key=st.secrets["OPENAI_API_KEY"])
+     init_pinecone()
+     vector_store = Pinecone.from_existing_index(pinecone_index, embeddings)
+     docs = vector_store.similarity_search(query, include_metadata=True)
+
+     # Comment/Uncomment to hide/show trace of documents
+     with st.expander("See documents for embedding"):
+         for i in range(len(docs)):
+             st.write(docs[i])
+
+     return chain.run(input_documents=docs, question=query)
+
+ def format_exchanges(exchanges):
+     for i in range(len(exchanges)):
+         if exchanges[i]["role"] == "user":
+             icon, text, blank = st.columns([1,8,1])
+         elif exchanges[i]["role"] == "assistant":
+             blank, text, icon = st.columns([1,8,1])
+         else:
+             st.markdown("*" + exchanges[i]["role"] + ":* " + exchanges[i]["content"])
+             continue
+
+         with icon:
+             st.image("icon_" + exchanges[i]["role"] + ".png", width=50)
+         with text:
+             st.markdown(exchanges[i]["content"])
+         st.markdown("""---""")
+
+ def format_prompt(exchanges):
+     # Include the last 6 exchanges plus the current prompt
+     prompt = ""
+     for i in range( max(len(exchanges)-7,0), len(exchanges)):
+         prompt += "[Q]" if (exchanges[i]["role"] == "user") else "[A]"
+         prompt += ": " + exchanges[i]["content"] + "\n"
+     with st.expander("See prompt sent to LLM"):
+         st.write(prompt)
+     return prompt
+
+ # ------------------------ Load PDF ------------------------
+ with st.sidebar:
+     option = st.selectbox("Select a PDF", list(pdf_dict.keys()), key="pdf", on_change=None)
+     st.markdown(f"*Selected*: {option}")
+     st.button('Click to start loading PDF', key="load_pdf", on_click=load_pdf, args=[pdf_dict[option]])
+
+ # ------------------------ Chatbot ------------------------
+ st.text_input("Prompt", placeholder="Ask me anything", key="prompt")
+
+ if st.session_state.prompt:
+     st.session_state.exchanges.append({"role": "user", "content": st.session_state.prompt})
+     try:
+         response = chat(format_prompt(st.session_state.exchanges))
+     except Exception as e:
+         st.error(e)
+         st.stop()
+     st.session_state.exchanges.append({"role": "assistant", "content": response})
+     format_exchanges(st.session_state.exchanges)
icon_assistant.png ADDED
icon_user.png ADDED
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ openai
+ langchain
+ streamlit
+ unstructured
+ unstructured[local-inference]
+ pinecone-client
+ tiktoken
+ nltk
run.sh ADDED
@@ -0,0 +1,4 @@
+ # To run the app, run the following command from the project root directory:
+ # sh run.sh
+ # For Windows, a .bat file is needed to run the app
+ $PWD/.venv/bin/streamlit run app.py
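
run.sh assumes a POSIX shell and a virtual environment in .venv; as its comment notes, Windows would need a separate batch script. A minimal run.bat sketch under the same assumptions (hypothetical, not part of this commit; it relies on pip placing the streamlit console script in .venv\Scripts on Windows):

@echo off
REM Hypothetical Windows counterpart to run.sh: run from the project root directory.
REM Assumes the virtual environment was created in .venv (e.g. python -m venv .venv).
%CD%\.venv\Scripts\streamlit.exe run app.py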