DylanASHillier committed on
Commit
ab283a4
1 Parent(s): 63d295a

adds streamlit

Browse files
Files changed (1) hide show
  1. streamlit.py +470 -0
streamlit.py ADDED
@@ -0,0 +1,470 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ # streamlit_app.py
4
+
5
+ import streamlit as st
6
+
7
+ st.set_page_config(
8
+ page_title="Glyphic Case Study Question Answering",
9
+ page_icon="favicon.ico",
10
+ layout="centered",
11
+ )
12
+
13
def check_password():
    """Gate the app behind a shared password.

    Renders a password field until the submitted value matches
    ``st.secrets["password"]``. The outcome of the last attempt is kept in
    ``st.session_state["password_correct"]``.

    Returns:
        bool: True once the correct password has been entered in this
        session, False otherwise (the password field has then been rendered,
        plus an error message after a failed attempt).
    """
    import hmac  # local import: only needed by the comparison below

    def password_entered():
        """on_change callback: compare the submitted password to the secret."""
        entered = str(st.session_state["password"]).encode("utf-8")
        expected = str(st.secrets["password"]).encode("utf-8")
        # Constant-time comparison so timing does not leak how much of the
        # password matched; bytes are used because compare_digest rejects
        # non-ASCII str arguments.
        if hmac.compare_digest(entered, expected):
            st.session_state["password_correct"] = True
            del st.session_state["password"]  # don't store password
        else:
            st.session_state["password_correct"] = False

    if st.session_state.get("password_correct", False):
        # Password correct.
        return True

    # First run or previous attempt failed: show the input again.
    st.text_input(
        "Password", type="password", on_change=password_entered, key="password"
    )
    if "password_correct" in st.session_state:
        # The key only exists after at least one attempt, so this was a miss.
        st.error("😕 Password incorrect")
    return False
40
+ # """CaseStudyQA
41
+
42
+ # Automatically generated by Colaboratory.
43
+
44
+ # Original file is located at
45
+ # https://colab.research.google.com/drive/1j93Wywxt8UHwUpQwutRRnW1qKRUKj853
46
+
47
+ # ## Setup
48
+ # """
49
+ import dotenv
50
+ dotenv.load_dotenv()
51
+ import os
52
+ ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
53
+ OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
54
+
55
+ # Commented out IPython magic to ensure Python compatibility.
56
+ # %pip install anthropic langchain backoff tiktoken
57
+
58
+ # """## Maverick Code"""
59
+
60
+ import enum
61
+ import asyncio
62
+ import anthropic.api as anthropic_api
63
+ import math
64
+ import langchain.schema as llm_schema
65
+
66
class Roles(enum.Enum):
    """Speaker roles a chat message can be attributed to."""

    # Message written by the end user.
    HUMAN = "human"
    # Message produced by the model.
    AI = "ai"
    # Instruction injected by the application.
    SYSTEM = "system"
71
+
72
+
73
+
74
+
75
def _map_role(role: Roles, content: str):
    """Wrap `content` in the langchain message class matching `role`.

    HUMAN, AI and SYSTEM map to HumanMessage, AIMessage and SystemMessage.
    Any other role falls back to a generic ChatMessage labelled with
    `role.value`.
    """
    message_types = (
        (Roles.HUMAN, llm_schema.HumanMessage),
        (Roles.AI, llm_schema.AIMessage),
        (Roles.SYSTEM, llm_schema.SystemMessage),
    )
    for known_role, message_cls in message_types:
        if role == known_role:
            return message_cls(content=content)
    return llm_schema.ChatMessage(content=content, role=role.value)
85
+
86
+
87
+
88
# --- Retry policy for Anthropic calls -------------------------------------
# Exception types that trigger a retry in the backoff decorator on
# chat_query_anthropic.
ANTHROPIC_ERRORS_FOR_BACKOFF = (
    asyncio.TimeoutError,
    anthropic_api.ApiException,
)
# Passed to backoff.expo as base / factor / max_value, and to
# backoff.on_exception as max_time.
ANTHROPIC_BACKOFF_BASE = math.sqrt(2)
ANTHROPIC_BACKOFF_FACTOR = 10
ANTHROPIC_BACKOFF_MAX_VALUE = 60
ANTHROPIC_BACKOFF_MAX_TIME = 120

# --- Request settings -----------------------------------------------------
# Timeout handed to asyncio.wait_for around a single generation call.
# NOTE(review): this is larger than ANTHROPIC_BACKOFF_MAX_TIME, so a call
# that times out has presumably already used up the retry budget - confirm.
ANTHROPIC_TIMEOUT = 300
# Sampling temperature, model name and max_tokens_to_sample for ChatAnthropic.
ANTHROPIC_TEMPERATURE = 0.1
ANTHROPIC_MODEL = "claude-v1-100k"
ANTHROPIC_MAX_NEW_TOKENS = 1000
100
+
101
+ import langchain.chat_models as langchain_chat_models
102
+ import backoff
103
+
104
class ChatModel:
    """Lazily created, process-wide holder for the chat model.

    Attributes:
        _chat_model: the cached ChatAnthropic instance, or None until
            `instance()` is first called.

    Methods:
        instance: return the cached model, building it on first use.
    """
    _chat_model = None

    @staticmethod
    def instance():
        """Return the shared ChatAnthropic client, creating it if needed."""
        cached = ChatModel._chat_model
        if cached is not None:
            return cached
        cached = langchain_chat_models.ChatAnthropic(
            anthropic_api_key=ANTHROPIC_API_KEY,
            temperature=ANTHROPIC_TEMPERATURE,
            model=ANTHROPIC_MODEL,
            max_tokens_to_sample=ANTHROPIC_MAX_NEW_TOKENS)
        ChatModel._chat_model = cached
        return cached
124
+
125
+ # anthropic_semaphore = asyncio.Semaphore(5)
126
+
127
@backoff.on_exception(backoff.expo,
                      exception=ANTHROPIC_ERRORS_FOR_BACKOFF,
                      base=ANTHROPIC_BACKOFF_BASE,
                      factor=ANTHROPIC_BACKOFF_FACTOR,
                      max_value=ANTHROPIC_BACKOFF_MAX_VALUE,
                      max_time=ANTHROPIC_BACKOFF_MAX_TIME)
async def chat_query_anthropic(messages: list[tuple[Roles, str]]) -> str:
    """Send a conversation to Anthropic through langchain and return the reply.

    Args:
        messages: (role, text) pairs in conversation order.

    Returns:
        The text of the first generation for the (single) prompt.

    The call is bounded by ANTHROPIC_TIMEOUT and retried by the backoff
    decorator on the exceptions in ANTHROPIC_ERRORS_FOR_BACKOFF.
    """
    prompt = [_map_role(entry[0], entry[1]) for entry in messages]
    model = ChatModel.instance()
    # async with anthropic_semaphore:
    pending = model.agenerate(messages=[prompt])
    result = await asyncio.wait_for(pending, timeout=ANTHROPIC_TIMEOUT)
    first_generation = result.generations[0][0]
    return first_generation.text
142
+
143
+ import langchain.embeddings.base as base_embeddings
144
+ import langchain.vectorstores.base as base_vc
145
+ import numpy as np
146
+ from langchain.docstore.document import Document
147
+
148
+
149
class NumpyVectorDB(base_vc.VectorStore):
    """Basic in-memory vector db implemented with numpy.

    Row ``i`` of ``_embedding_matrix``, ``_content[i]`` and ``_attr[key][i]``
    (for every key in ``_keys``) all describe the same document. A metadata
    key that a document does not carry is stored as None for that document.
    """

    def __init__(self, embeddings: base_embeddings.Embeddings,
                 embedding_dim: int) -> None:
        """Create an empty store.

        Args:
            embeddings: embedder used for both documents and queries.
            embedding_dim: width of the vectors the embedder returns.
        """
        self._embedder = embeddings
        self._embedding_matrix: np.ndarray = np.zeros((0, embedding_dim))
        self._keys: set[str] = set()
        self._attr: dict[str, list] = {}
        self._size: int = 0
        self._content: list[str] = []

    def add_texts(self,
                  texts: list[str],
                  metadatas: list[dict] | None = None) -> None:
        """Embed `texts` and append them, with their metadata, to the store.

        Args:
            texts: documents to add.
            metadatas: one metadata dict per text; None means "no metadata".

        Raises:
            ValueError: if `metadatas` or the returned embeddings do not have
                one entry per text. The store is left untouched in that case.
        """
        texts = list(texts)
        if not texts:
            return
        if metadatas is None:
            # Previously enumerate(None) raised a TypeError here.
            metadatas = [{} for _ in texts]
        elif len(metadatas) != len(texts):
            raise ValueError(
                f"Got {len(texts)} texts but {len(metadatas)} metadatas.")

        new_embeddings = self._embedder.embed_documents(texts)
        if len(new_embeddings) != len(texts):
            raise ValueError(
                f"Got {len(texts)} texts but {len(new_embeddings)} embeddings.")
        # Build the new matrix before touching any state: it is the only step
        # below that can fail, so no rollback is needed afterwards.
        try:
            updated_matrix = np.concatenate(
                [self._embedding_matrix, new_embeddings])
        except (TypeError, ValueError):
            print("Error adding texts to vector db.")
            raise

        for text, item_metadata in zip(texts, metadatas):
            item_metadata = item_metadata or {}
            for key in item_metadata:
                if key not in self._keys:
                    # Back-fill the new column for documents already stored.
                    self._keys.add(key)
                    self._attr[key] = [None] * self._size
            for key in self._keys:
                self._attr[key].append(item_metadata.get(key))
            self._content.append(text)
            self._size += 1
        self._embedding_matrix = updated_matrix

    def _match_mask(self, criteria: dict) -> np.ndarray:
        """Boolean mask of the documents whose metadata equals every criterion.

        Keys must already be known to the store. The comparison is done in
        plain Python so it does not depend on numpy's handling of string or
        mixed-type columns.
        """
        mask = np.ones(self._size, dtype=bool)
        for key, wanted in criteria.items():
            mask &= np.array([value == wanted for value in self._attr[key]],
                             dtype=bool)
        return mask

    def in_db(self, _filter: dict[str, str]) -> bool:
        """Checks if a document matching the filter is in the database"""
        for key in _filter:
            if key not in self._keys:
                print("Key not in database.")
                return False
        if not _filter or self._size == 0:
            # Nothing to compare against: an empty filter or an empty store.
            print("No one_hots found.")
            return False
        return bool(self._match_mask(_filter).any())

    def similarity_search(
        self,
        query: str,
        k: int = 10,
        # filter is a reserved keyword, but is required
        # due to langchain's interface
        # pylint: disable=redefined-builtin
        filter: dict | None = None,
        # pylint: enable=redefined-builtin
    ) -> list[Document]:
        """Return the `k` stored documents closest to `query`.

        Args:
            query: text to embed and compare against the stored documents.
            k: Number of Documents to return. Defaults to 10.
            filter: Attribute filter by metadata example {'key': 'value'}.
                Only documents matching every entry are considered.
                Defaults to None (no filtering).

        Returns:
            Up to `k` Documents ordered by increasing Euclidean distance to
            the query embedding. Empty if the store is empty, `k` is not
            positive, or nothing matches the filter.
        """
        if self._size == 0 or k <= 0:
            return []
        query_embedding = np.asarray(self._embedder.embed_query(query))
        distances = np.linalg.norm(self._embedding_matrix - query_embedding,
                                   axis=1,
                                   ord=2)
        candidates = np.arange(self._size)
        if filter:
            if any(key not in self._keys for key in filter):
                # No document carries this attribute, so none can match.
                return []
            # Drop non-matching rows outright. Scaling their distance by a
            # 0/1 mask would instead rank them as the closest results.
            candidates = candidates[self._match_mask(filter)]
        ranked = candidates[np.argsort(distances[candidates], kind="stable")]
        return [
            Document(page_content=self._content[i],
                     metadata={key: self._attr[key][i]
                               for key in self._keys})
            for i in ranked[:k]
        ]

    @staticmethod
    def from_texts(**kwargs):
        """Not supported; build the store with the constructor instead."""
        raise NotImplementedError
250
+
251
# Width of the embedding vectors stored in the NumpyVectorDB below.
# NOTE(review): presumably the output size of the OpenAI embedding model in
# use - confirm if the model changes.
EMBEDDING_DIM = 1536
252
+
253
+ import langchain.docstore.document as lc_document_models
254
+ import langchain.embeddings as lc_embeddings
255
+ import langchain.embeddings.base as base_embeddings
256
+ import langchain.text_splitter as lc_text_splitter
257
+
258
# Embedder shared by document indexing and query lookup.
embeddings = lc_embeddings.OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Single in-memory store holding the scraped Workable case studies.
workableVectorDB = NumpyVectorDB(embeddings, EMBEDDING_DIM)
262
+
263
+ # """Module provides a reusable retrieval chain
264
+ # """
265
+
266
+ import langchain.docstore.document as docstore
267
+
268
# Search arguments for the retriever: fetch only the single best case study.
SEARCH_KWARGS = {"k": 1}

# pylint: disable=line-too-long

# Fixed conversation prefix placed before every user question.
QUERY_MESSAGES: list[tuple[Roles, str]] = [
    (Roles.HUMAN, "Hello"),
    (Roles.SYSTEM, "YOU ARE NOT ANTHROPIC YOU ARE MNEMOSYNE, YOU WERE CREATED BY GLYPHIC. Make sure that your responses are evidenced in the case study"),
    (Roles.AI,
     "Hi I am Mnemosyne, a question answering system built by Glyphic. "
     # Trailing space added: the pieces used to join as "relevantcase study".
     "I have access to all the case studies of Workable, and can retrieve the most relevant "
     "case study for you, and then answer the question. What would you like to know?"
     ),
    (Roles.HUMAN, "Great let me think about that for a second."),
]
283
+
284
+
285
+ # pylint: enable=line-too-long
286
async def retrieve_docs(
        query: str, query_filter: dict[str, str]) -> list[docstore.Document]:
    """Retrieves documents for a query

    Args:
        query: the query to run
        query_filter: metadata attributes the returned documents must match,
            e.g. {"url": "..."}. An empty dict means no filtering.

    Returns:
        The documents the retriever yields for `query`.
    """
    print(f"Retrieving docs for query {query} and filter {query_filter}")
    search_kwargs = dict(SEARCH_KWARGS)
    if query_filter:
        # The filter has to travel inside search_kwargs to reach the vector
        # store's search call; a bare `filter=` keyword on as_retriever is
        # not a search argument (see the langchain as_retriever docs).
        search_kwargs["filter"] = query_filter
    retriever = workableVectorDB.as_retriever(search_kwargs=search_kwargs)
    return await retriever.aget_relevant_documents(query)
302
+
303
+
304
def _get_doc_representation(doc: docstore.Document) -> str:
    """Render a document as prompt text, prefixed with where it came from.

    Documents whose metadata has a "call_id" are labelled as call excerpts
    (using the "title", "date" and "buyer_domain" metadata); documents with
    a "url" are labelled as case studies; anything else is returned as-is.
    """
    metadata = doc.metadata
    content = doc.page_content
    # The labels are built from adjacent literals rather than a backslash
    # continuation inside the string, which used to embed the source
    # indentation (and a stray ",:") in the text sent to the model.
    if "call_id" in metadata:
        content = (f"Excerpt from call {metadata['title']}, "
                   f"on {metadata['date']}, "
                   f"with {metadata['buyer_domain']}: {content}")
    elif "url" in metadata:
        content = f"Case study from url {metadata['url']}: {content}"

    return content
315
+
316
+
317
async def _combine_docs(docs: list[docstore.Document]) -> str:
    """Render every document and join the results with blank lines."""
    return "\n\n".join(map(_get_doc_representation, docs))
321
+
322
+
323
async def answer_question(question: str, docs: str):
    """Answers a question given the text of the retrieved documents.

    Args:
        question: the user's question.
        docs: the retrieved documents, already combined into one string.

    Returns:
        The model's reply text.
    """
    messages = [
        *QUERY_MESSAGES,
        (Roles.HUMAN, question),
        (Roles.SYSTEM, f"Here are the documents I found:\n\n{docs}\n\n"),
        (Roles.SYSTEM,
         f"Now reply to the question: {question}.\n"
         "Answer concisely and directly, "
         # Trailing space added: the sentences used to run together as
         # "answer.The user".
         "but acknowledge if you don't know the answer. "
         "The user will be unable to ask follow up questions."),
    ]
    return await chat_query_anthropic(messages)
335
+
336
+
337
async def run_query(query: str,
                    query_filter: dict[str, str]) -> tuple[str, str]:
    """Runs a query on the retrieval chain

    Args:
        query: the query to run
        query_filter: metadata attributes the retrieved documents must
            match; an empty dict means no filtering.

    Returns:
        (answer, source_url): the model's answer and the "url" metadata of
        the top retrieved document, or "" when nothing was retrieved or the
        document has no url.
    """
    # print() does not apply %-formatting, and the old call printed the
    # builtin `filter` instead of `query_filter`.
    print(f"Running query {query} for filter {query_filter}")
    docs = await retrieve_docs(query, query_filter)
    for i, doc in enumerate(docs):
        print(f"Retrieved doc no.{i}\n{doc.page_content}")
    docs_str = await _combine_docs(docs)
    answer = await answer_question(query, docs_str)
    # Guard against an empty result set instead of raising IndexError.
    source_url = docs[0].metadata.get("url", "") if docs else ""
    return answer, source_url
355
+
356
+ # """## Scraping"""
357
+
358
+
359
# Hand-picked sample of Workable article URLs.
# NOTE(review): not referenced by any other code visible in this file.
workable_urls = [
    "https://resources.workable.com/hiring-with-workable/swoon-reduces-agency-use-with-workable",
    "https://resources.workable.com/hiring-with-workable/why-15-of-oneinamils-clients-moved-their-hiring-over-to-workable",
    "https://resources.workable.com/backstage/workable-named-top-rated-ats-by-trustradius-for-2019",
]
364
+
365
+ import requests
366
+ from bs4 import BeautifulSoup
367
+ import pprint
368
+ import numpy as np
369
+
370
# Browser-like User-Agent sent with every scraping request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}

# Listing pages of Workable's customer stories.
PAGES = [
    "https://resources.workable.com/tag/customer-stories/",
    "https://resources.workable.com/tag/customer-stories/page/2/",
    "https://resources.workable.com/tag/customer-stories/page/3/",
]

# Collect the case-study links from the listing pages, in first-seen order.
# A link is kept once even if it occurs several times, so each case study
# is scraped and indexed only once.
# NOTE(review): this runs at module import, i.e. presumably on every
# Streamlit rerun - consider caching.
workable_customers = []
_seen_customer_urls = set()
for page in PAGES:
    # A timeout keeps an unresponsive server from hanging app start-up.
    r = requests.get(page, headers=headers, timeout=30)
    soup = BeautifulSoup(r.content, 'html.parser')
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if (href.startswith("https://resources.workable.com/hiring-with-workable/")
                and href not in _seen_customer_urls):
            _seen_customer_urls.add(href)
            workable_customers.append(href)
387
+
388
+ # workable_customers
389
+
390
def get_paragraphs_workable(url, timeout=30):
    """Download `url` and return the text of its <p> elements.

    Args:
        url: page to fetch.
        timeout: seconds to wait for the server before requests raises,
            so a stalled connection cannot hang the app.

    Returns:
        list[str]: stripped paragraph texts, in document order. Paragraphs
        whose raw text is 3 characters or shorter are skipped.
    """
    r = requests.get(url=url, headers=headers, timeout=timeout)

    soup = BeautifulSoup(r.content, 'html.parser')

    # The length check is applied to the raw text, before stripping. The
    # caller slices the result by position, so the number of kept paragraphs
    # must not change.
    raw_texts = (tag.get_text() for tag in soup.find_all("p"))
    return [text.strip() for text in raw_texts if len(text) > 3]
403
+
404
def clean_text(text):
    """Apply one pass of whitespace normalisation to `text`.

    Halves runs of newlines, tabs and spaces (each doubled character becomes
    a single one) and turns carriage returns into spaces. A single pass does
    not fully collapse longer runs; call it repeatedly until the text stops
    changing to do that.
    """
    text = text.replace("\n\n", "\n")
    text = text.replace("\t\t", "\t")
    text = text.replace("\r", " ")
    # Collapse doubled spaces, mirroring the newline and tab rules above.
    # As written before, this replaced a single space with a single space,
    # which did nothing.
    text = text.replace("  ", " ")
    return text
410
+
411
def loop(input):
    """Run clean_text over `input` repeatedly until it stops changing.

    Returns the fully cleaned text.
    """
    while True:
        cleaned = clean_text(input)
        if cleaned == input:
            # Fixed point reached: another pass would change nothing.
            return cleaned
        input = cleaned
417
+
418
# (url, cleaned text) for every scraped customer story.
workable_case_studies = []
# for customer in customers:
# TODO(fix)
for customer in workable_customers:
    paragraphs = get_paragraphs_workable(customer)
    # First few paragraphs are boiler plate; the last four are dropped too.
    body_paragraphs = paragraphs[4:][:-4]
    cleaned_body = loop('<join>'.join(body_paragraphs))
    workable_case_studies.append((customer, cleaned_body))
    # TODO Some additional filtering is still needed especially towards the end. We should probably discard things that are not in the main body.
# workable_case_studies


# """## App logic"""
# Index every case study, remembering its source url as metadata.
for (url, case_study) in workable_case_studies:
    workableVectorDB.add_texts([case_study], [{"url": url}])
431
+
432
+
433
def get_answer(question):
    """Answer `question` from the indexed case studies.

    Returns:
        (answer, source_link): the answer text and an HTML anchor pointing
        at the url reported by run_query as the source.
    """
    import html  # local import: only needed for escaping below

    answer, source_url = asyncio.run(run_query(question, query_filter={}))
    # The url comes from scraped page content, so escape it before placing
    # it inside an HTML attribute and element.
    safe_url = html.escape(str(source_url), quote=True)
    return answer, f"<a href='{safe_url}'>{safe_url}</a>"
436
+
437
# Intro text rendered (as markdown) under the page title.
DESCRIPTION = """This tool is a demo for allowing you to ask questions over your case studies.

The case studies are from [Workable](https://resources.workable.com/tag/customer-stories/), a recruiting software company.
When you ask a question, the tool will search for the most relevant case study to the question and then use that to answer you."""


# Main page: only rendered once the shared password has been accepted.
if check_password():
    st.title("Glyphic Case Study Question Answering")
    st.markdown(DESCRIPTION, unsafe_allow_html=True)

    question = st.text_input("Enter your question")

    if st.button("Get Answer"):
        if not question.strip():
            # Don't spend a retrieval + model call on an empty question.
            st.warning("Please enter a question first.")
        else:
            answer, source = get_answer(question)
            st.subheader("Answer:")
            st.write(answer)
            st.subheader("Source:")
            # `source` is an HTML anchor; st.write would show the raw tag.
            st.markdown(source, unsafe_allow_html=True)

# Sidebar password box.
# NOTE(review): assumed to sit outside the `if check_password()` block (the
# original indentation is not recoverable here) - confirm. It does not gate
# any content; USERNAME is read but unused.
st.sidebar.title("Access Control")
USERNAME = os.environ.get("DEMO_USER")
PASSWORD = os.environ.get("DEMO_PASSWORD")
password_input = st.sidebar.text_input("Password", type="password")

# Only report a result once something has been typed, so the sidebar does
# not open with "Authentication failed!" before any attempt.
if password_input:
    if PASSWORD is not None and password_input == PASSWORD:
        st.sidebar.success("Authentication successful!")
    else:
        st.sidebar.error("Authentication failed!")

st.sidebar.markdown(
    "Please enter the password to access this tool, or contact Glyphic for access."
)