DylanASHillier committed
Commit b64d788
1 Parent(s): 67f7d43

current changes

.vscode/settings.json ADDED
@@ -0,0 +1,3 @@
+{
+    "python.analysis.typeCheckingMode": "off"
+}
__pycache__/app.cpython-310.pyc ADDED
Binary file (13.4 kB)
 
requirements.txt CHANGED
@@ -4,4 +4,5 @@ beautifulsoup4
 anthropic
 backoff
 tiktoken
-python-dotenv
+python-dotenv
+gradio
streamlit.py CHANGED
@@ -1,8 +1,6 @@
 import streamlit as st
-
-# streamlit_app.py
-
-import streamlit as st
+import asyncio
+import gradio_client
 
 st.set_page_config(
     page_title="Glyphic Case Study Question Answering",
@@ -27,7 +25,7 @@ def check_password():
             "Password", type="password", on_change=password_entered, key="password"
         )
         return False
-    elif not st.session_state["password_correct"]:
+    elif not st.session_state.get("password_correct"):
         # Password not correct, show input + error.
         st.text_input(
             "Password", type="password", on_change=password_entered, key="password"
@@ -35,415 +33,20 @@ def check_password():
         st.error("😕 Password incorrect")
         return False
     else:
-        # Password correct.
         return True
-# """CaseStudyQA
-
-# Automatically generated by Colaboratory.
-
-# Original file is located at
-# https://colab.research.google.com/drive/1j93Wywxt8UHwUpQwutRRnW1qKRUKj853
-
-# ## Setup
-# """
-import dotenv
-dotenv.load_dotenv()
-import os
-# ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
-# OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
-ANTHROPIC_API_KEY = st.secrets.api_keys["ANTHROPIC_API_KEY"]
-OPENAI_API_KEY = st.secrets.api_keys["OPENAI_API_KEY"]
-
-# Commented out IPython magic to ensure Python compatibility.
-# %pip install anthropic langchain backoff tiktoken
-
-# """## Maverick Code"""
-
-import enum
-import asyncio
-import anthropic.api as anthropic_api
-import math
-import langchain.schema as llm_schema
-
-class Roles(enum.Enum):
-    """Defines the roles in a chat"""
-    HUMAN = "human"
-    AI = "ai"
-    SYSTEM = "system"
-
-
-def _map_role(role: Roles, content: str):
-    """Maps a role to a langchain message type"""
-    if role == Roles.HUMAN:
-        return llm_schema.HumanMessage(content=content)
-    elif role == Roles.AI:
-        return llm_schema.AIMessage(content=content)
-    elif role == Roles.SYSTEM:
-        return llm_schema.SystemMessage(content=content)
-    else:
-        return llm_schema.ChatMessage(content=content, role=role.value)
-
-
-ANTHROPIC_ERRORS_FOR_BACKOFF = (
-    asyncio.TimeoutError,
-    anthropic_api.ApiException,
-)
-ANTHROPIC_BACKOFF_BASE = math.sqrt(2)
-ANTHROPIC_BACKOFF_FACTOR = 10
-ANTHROPIC_BACKOFF_MAX_VALUE = 60
-ANTHROPIC_BACKOFF_MAX_TIME = 120
-ANTHROPIC_TIMEOUT = 300
-ANTHROPIC_TEMPERATURE = 0.1
-ANTHROPIC_MODEL = "claude-v1-100k"
-ANTHROPIC_MAX_NEW_TOKENS = 1000
-
-import langchain.chat_models as langchain_chat_models
-import backoff
-
-class ChatModel:
-    """A singleton class for the chat model
-
-    Attributes:
-        _chat_model: the chat model instance
-
-    Methods:
-        instance: returns the chat model instance
-    """
-    _chat_model = None
-
-    @staticmethod
-    def instance():
-        if ChatModel._chat_model is None:
-            ChatModel._chat_model = langchain_chat_models.ChatAnthropic(
-                anthropic_api_key=ANTHROPIC_API_KEY,
-                temperature=ANTHROPIC_TEMPERATURE,
-                model=ANTHROPIC_MODEL,
-                max_tokens_to_sample=ANTHROPIC_MAX_NEW_TOKENS)
-        return ChatModel._chat_model
-
-# anthropic_semaphore = asyncio.Semaphore(5)
-@backoff.on_exception(backoff.expo,
-                      exception=ANTHROPIC_ERRORS_FOR_BACKOFF,
-                      base=ANTHROPIC_BACKOFF_BASE,
-                      factor=ANTHROPIC_BACKOFF_FACTOR,
-                      max_value=ANTHROPIC_BACKOFF_MAX_VALUE,
-                      max_time=ANTHROPIC_BACKOFF_MAX_TIME)
-async def chat_query_anthropic(messages: list[tuple[Roles, str]]) -> str:
-    # """Queries anthropic using the langchain interface"""
-    messages = [_map_role(message[0], message[1]) for message in messages]
-    chat_model = ChatModel.instance()
-    # async with anthropic_semaphore:
-    response = await asyncio.wait_for(
-        chat_model.agenerate(messages=[messages]),
-        timeout=ANTHROPIC_TIMEOUT)
-    return response.generations[0][0].text
-
-import langchain.embeddings.base as base_embeddings
-import langchain.vectorstores.base as base_vc
-import numpy as np
-from langchain.docstore.document import Document
-
-
-class NumpyVectorDB(base_vc.VectorStore):
-    """Basic vector db implemented using numpy etc."""
-
-    def __init__(self, embeddings: base_embeddings.Embeddings,
-                 embedding_dim: int) -> None:
-        self._embedder = embeddings
-        self._embedding_matrix: np.ndarray = np.zeros((0, embedding_dim))
-        self._keys: set[str] = set()
-        self._attr: dict[str, list] = {}
-        self._size: int = 0
-        self._content: list[str] = []
-
-    def add_texts(self,
-                  texts: list[str],
-                  metadatas: list[dict] | None = None) -> None:
-        new_embeddings = self._embedder.embed_documents(texts)
-        new_size = self._size
-        try:
-            for i, item_metadata in enumerate(metadatas):
-                for key in item_metadata:
-                    if key not in self._keys:
-                        self._keys.add(key)
-                        self._attr[key] = [None] * new_size
-                    self._attr[key] = self._attr[key] + [item_metadata[key]]
-                for key in self._keys:
-                    if key not in item_metadata:
-                        self._attr[key] = self._attr[key] + [None]
-                self._content.append(texts[i])
-                new_size += 1
-            self._embedding_matrix = np.concatenate(
-                [self._embedding_matrix, new_embeddings])
-            self._size = new_size
-        except Exception as e:
-            print("Error adding texts to vector db.")
-            for key in self._keys:
-                self._attr[key] = self._attr[key][:self._size]
-            self._content = self._content[:self._size]
-            self._embedding_matrix = self._embedding_matrix[:self._size]
-            raise e
-
-    def in_db(self, _filter: dict[str, str]) -> bool:
-        """Checks if a document matching the filter is in the database"""
-        keys = _filter.keys()
-        for key in keys:
-            if key not in self._keys:
-                print("Key not in database.")
-                return False
-        one_hots = np.array([
-            np.equal(self._attr[key], _filter[key])
-            if key in self._keys else False for key in keys
-        ])
-        # multiply one_hots together
-        if one_hots.size == 0:
-            print("No one_hots found.")
-            return False
-        one_hot = np.prod(one_hots, axis=0)
-        # check if any of the one_hots are 1
-        return np.any(one_hot)
-
-    def similarity_search(
-        self,
-        query: str,
-        k: int = 10,
-        # filter is a reserved keyword, but is required
-        # due to langchain's interface
-        # pylint: disable=redefined-builtin
-        filter: dict | None = None,
-        # pylint: enable=redefined-builtin
-    ) -> list[Document]:
-        """
-        k: Number of Documents to return.
-            Defaults to 4.
-        filter_: Attribute filter by metadata example {'key': 'value'}.
-            Defaults to None.
-        """
-        query_embedding = self._embedder.embed_query(query)
-        distances = np.linalg.norm(self._embedding_matrix - query_embedding,
-                                   axis=1,
-                                   ord=2)
-        # normalize
-        distances -= np.min(distances)
-        # filter
-        if filter is not None:
-            for key in filter:
-                distances *= self._attr[key] == filter[key]
-        # top k indices
-        if k >= len(distances):
-            sorted_indices = np.arange(len(distances))
-        else:
-            sorted_indices = np.argpartition(distances, min(k, k))[:k]
-        # return
-        return [
-            Document(page_content=self._content[i],
-                     metadata={key: self._attr[key][i]
-                               for key in self._keys})
-            for i in sorted_indices[:k]
-        ]
-
-    @staticmethod
-    def from_texts(**kwargs):
-        raise NotImplementedError
-
-EMBEDDING_DIM = 1536
-
-import langchain.docstore.document as lc_document_models
-import langchain.embeddings as lc_embeddings
-import langchain.embeddings.base as base_embeddings
-import langchain.text_splitter as lc_text_splitter
-
-embeddings = lc_embeddings.OpenAIEmbeddings(
-    openai_api_key=OPENAI_API_KEY)
-
-@st.cache_resource()
-def get_workable_vector_db() -> base_vc.VectorStore:
-    return NumpyVectorDB(embeddings, EMBEDDING_DIM)
-
-workableVectorDB = get_workable_vector_db()
-# """Module provides a reusable retrieval chain
-# """
-
-import langchain.docstore.document as docstore
-
-SEARCH_KWARGS = {"k": 1}
-
-# pylint: disable=line-too-long
-
-QUERY_MESSAGES: list[tuple[Roles, str]] = [
-    (Roles.HUMAN, "Hello"),
-    (Roles.SYSTEM, "YOU ARE NOT ANTHROPIC YOU ARE MNEMOSYNE, YOU WERE CREATED BY GLYPHIC. Make sure that your responses are evidenced in the case study"),
-    (Roles.AI,
-     "Hi I am Mnemosyne, a question answering system built by Glyphic. " +
-     "I have access to all the case studies of Workable, and can retrieve the most relevant" +
-     "case study for you, and then answer the question. What would you like to know?"),
-    (Roles.HUMAN, "Great let me think about that for a second.")
-]
-from dataclasses import dataclass
-
-@dataclass
-class HashableDoc():
-    page_content: str
-    metadata: dict[str, str]
-
-# pylint: enable=line-too-long
-async def retrieve_docs(
-        query: str, query_filter: dict[str, str]) -> list[HashableDoc]:
-    # """Retrieves documents for a query
-
-    # Args:
-    #     query: the query to run
-    #     query_filter: the filter to run the query with,
-    #         see https://docs.activeloop.ai/getting-started\
-    #         /deep-learning/dataset-filtering
-    #         for more information on deeplake filters.
-    #         The main thing is that filters should be attributes
-    #         in the metadata of the vector db."""
-    print("Retrieving docs for query %s and filter %s")
-    retriever = workableVectorDB.as_retriever(
-        search_kwargs=SEARCH_KWARGS, filter=query_filter)
-    docs = await retriever.aget_relevant_documents(query)
-    return [HashableDoc(page_content=doc.page_content, metadata=doc.metadata) for doc in docs]
-
-@st.cache_data
-def _get_doc_representation(doc: HashableDoc) -> str:
-    metadata = doc.metadata
-    content = doc.page_content
-    if "call_id" in metadata:
-        content = f"Excerpt from call {metadata['title']},\
-            on {metadata['date']}, with {metadata['buyer_domain']}: {content}"
-    elif "url" in metadata:
-        content = f"Case study from url {metadata['url']},\
-            : {content}"
-
-    return content
-
-async def _combine_docs(docs: list[HashableDoc]) -> str:
-    # """Combines a list of documents into a single string"""
-    doc_representations = [_get_doc_representation(doc) for doc in docs]
-    return "\n\n".join(doc_representations)
-
-async def answer_question(question: str, docs: str):
-    # """Answers a question given a query and a list of documents"""
-    messages = QUERY_MESSAGES.copy()
-    messages += [(Roles.HUMAN, question),
-                 (Roles.SYSTEM,
-                  f"Here are the documents I found:\n\n{docs}\n\n"),
-                 (Roles.SYSTEM,
-                  f"Now reply to the question: {question}.\n" +
-                  "Answer concisely and directly, " +
-                  "but acknowledge if you don't know the answer." +
-                  "The user will be unable to ask follow up questions.")]
-    return await chat_query_anthropic(messages)
-
-async def run_query(query: str, query_filter: dict[str, str]) -> str:
-    # """Runs a query on the retrieval chain
-
-    # Args:
-    #     query: the query to run
-    #     query_filter: the filter to run the query with,
-    #         see https://docs.activeloop.ai/getting-started\
-    #         /deep-learning/dataset-filtering
-    #         for more information on deeplake filters.
-    #         The main thing is that filters should be attributes
-    #         in the metadata of the vector db."""
-    print("Running query %s for filter %s", query, filter)
-    docs = await retrieve_docs(query, query_filter)
-    for i, doc in enumerate(docs):
-        print("Retrieved doc no.%d\n%s", i, doc.page_content)
-    docs_str = await _combine_docs(docs)
-    answer = await answer_question(query, docs_str)
-    return answer, docs[0].metadata["url"]
-
-# """## Scraping"""
-
-
-workable_urls = [
-    "https://resources.workable.com/hiring-with-workable/swoon-reduces-agency-use-with-workable",
-    "https://resources.workable.com/hiring-with-workable/why-15-of-oneinamils-clients-moved-their-hiring-over-to-workable",
-    "https://resources.workable.com/backstage/workable-named-top-rated-ats-by-trustradius-for-2019"
-]
-
-import requests
-from bs4 import BeautifulSoup
-import pprint
-import numpy as np
-
-headers = {
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
-}
-
-PAGES = [
-    "https://resources.workable.com/tag/customer-stories/",
-    "https://resources.workable.com/tag/customer-stories/page/2/",
-    "https://resources.workable.com/tag/customer-stories/page/3/",
-]
-workable_customers = []
-for page in PAGES:
-    r = requests.get(page, headers=headers)
-    soup = BeautifulSoup(r.content, 'html.parser')
-    for link in soup.find_all("a", href=True):
-        href = link["href"]
-        if href.startswith("https://resources.workable.com/hiring-with-workable/"):
-            workable_customers.append(href)
-
-# workable_customers
-@st.cache_data
-def get_paragraphs_workable(url):
-    r = requests.get(url=url, headers=headers)
-
-    soup = BeautifulSoup(r.content, 'html.parser')
-
-    target_p = []
-
-    # traverse paragraphs from soup to get stuff from target and add to arr
-    for data in soup.find_all("p"):
-        text = data.get_text()
-        if len(text) > 3:
-            target_p.append(text.strip())
-    return target_p
-
-def clean_text(text):
-    text = text.replace("\n\n", "\n")
-    text = text.replace("\t\t", "\t")
-    text = text.replace("\r", " ")
-    text = text.replace("  ", " ")
-    return text
-
-def loop(input):
-    prev = ""
-    while prev != input:
-        prev = input
-        input = clean_text(input)
-    return input
-
-@st.cache_data
-def get_case_studies():
-    workable_case_studies = []
-    # for customer in customers:
-    # TODO(fix)
-    for customer in workable_customers:
-        url = customer
-        workable_case_studies.append((url, loop('<join>'.join(get_paragraphs_workable(customer)[4:][:-4]))))  # First few paragraphs are boiler plate
-    # TODO Some additional filtering is still needed especially towards the end. We should probably discard things that are not in the main body.
-    # workable_case_studies
-    return workable_case_studies
-
-workable_case_studies = get_case_studies()
 
 
-# """## App logic"""
-for (url, case_study) in workable_case_studies:
-    workableVectorDB.add_texts([case_study], [{"url": url}])
 
-@st.cache_data
-def get_answer(question):
-    response = asyncio.run(run_query(question, query_filter={}))
+async def run_query(question: str):
+    client = gradio_client.Client("https://glyphicai-casestudyqa.hf.space/")
+    answer = client.submit(question,
+                           api_name="/predict")
+    answer = answer.result()
+    print(answer)
+    return answer["answer"], answer["source"]
+
+# @st.cache_data
+def get_answer(question: str):
+    response = asyncio.run(run_query(question))
     return response[0], f"{response[1]}"
 
 DESCRIPTION = """This tool is a demo for allowing you to ask questions over your case studies.
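After this commit, streamlit.py no longer answers questions locally: get_answer forwards each question to the hosted Gradio app and unpacks the returned answer/source pair. A minimal sketch of that round trip, using the same Space URL and /predict endpoint that appear in the diff (the example question is made up, and the answer/source dict shape is assumed to match what the Space returns):

import gradio_client

# Connect to the hosted Space (URL taken from the diff above).
client = gradio_client.Client("https://glyphicai-casestudyqa.hf.space/")

# submit() returns a Job; result() blocks until the Space responds.
job = client.submit("How did Swoon reduce agency use?", api_name="/predict")
result = job.result()

# streamlit.py expects a dict with "answer" and "source" keys.
print(result["answer"])
print(result["source"])

For a one-shot call, client.predict(question, api_name="/predict") is the blocking equivalent of submit(...).result().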