DylanASHillier committed on
Commit
570a493
1 Parent(s): 7eaf320

updated with app

Files changed (2)
  1. app.py +430 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,430 @@
# -*- coding: utf-8 -*-
"""CaseStudyQA

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/drive/1j93Wywxt8UHwUpQwutRRnW1qKRUKj853

## Setup
"""
import os

ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Commented out IPython magic to ensure Python compatibility.
# %pip install anthropic langchain backoff tiktoken

"""## Maverick Code"""

import enum
import asyncio
import anthropic.api as anthropic_api
import math
import langchain.schema as llm_schema


class Roles(enum.Enum):
    """Defines the roles in a chat"""
    HUMAN = "human"
    AI = "ai"
    SYSTEM = "system"


def _map_role(role: Roles, content: str):
    """Maps a role to a langchain message type"""
    if role == Roles.HUMAN:
        return llm_schema.HumanMessage(content=content)
    elif role == Roles.AI:
        return llm_schema.AIMessage(content=content)
    elif role == Roles.SYSTEM:
        return llm_schema.SystemMessage(content=content)
    else:
        return llm_schema.ChatMessage(content=content, role=role.value)


ANTHROPIC_ERRORS_FOR_BACKOFF = (
    asyncio.TimeoutError,
    anthropic_api.ApiException,
)
ANTHROPIC_BACKOFF_BASE = math.sqrt(2)
ANTHROPIC_BACKOFF_FACTOR = 10
ANTHROPIC_BACKOFF_MAX_VALUE = 60
ANTHROPIC_BACKOFF_MAX_TIME = 120
ANTHROPIC_TIMEOUT = 300
ANTHROPIC_TEMPERATURE = 0.1
ANTHROPIC_MODEL = "claude-v1-100k"
ANTHROPIC_MAX_NEW_TOKENS = 1000

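# Note on the retry schedule: backoff.expo waits roughly
# FACTOR * BASE ** n seconds before the n-th retry, so with the constants
# above the delays run ~10, 14.1, 20, 28.3, 40, 56.6 seconds, are capped at
# ANTHROPIC_BACKOFF_MAX_VALUE (60 s), and retrying stops once
# ANTHROPIC_BACKOFF_MAX_TIME (120 s) has elapsed in total.
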
import langchain.chat_models as langchain_chat_models
import backoff


class ChatModel:
    """A singleton class for the chat model

    Attributes:
        _chat_model: the chat model instance

    Methods:
        instance: returns the chat model instance
    """
    _chat_model = None

    @staticmethod
    def instance():
        if ChatModel._chat_model is None:
            ChatModel._chat_model = langchain_chat_models.ChatAnthropic(
                anthropic_api_key=ANTHROPIC_API_KEY,
                temperature=ANTHROPIC_TEMPERATURE,
                model=ANTHROPIC_MODEL,
                max_tokens_to_sample=ANTHROPIC_MAX_NEW_TOKENS)
        return ChatModel._chat_model


anthropic_semaphore = asyncio.Semaphore(5)


@backoff.on_exception(backoff.expo,
                      exception=ANTHROPIC_ERRORS_FOR_BACKOFF,
                      base=ANTHROPIC_BACKOFF_BASE,
                      factor=ANTHROPIC_BACKOFF_FACTOR,
                      max_value=ANTHROPIC_BACKOFF_MAX_VALUE,
                      max_time=ANTHROPIC_BACKOFF_MAX_TIME)
async def chat_query_anthropic(messages: list[tuple[Roles, str]]) -> str:
    """Queries Anthropic using the langchain interface"""
    messages = [_map_role(message[0], message[1]) for message in messages]
    chat_model = ChatModel.instance()
    async with anthropic_semaphore:
        response = await asyncio.wait_for(
            chat_model.agenerate(messages=[messages]),
            timeout=ANTHROPIC_TIMEOUT)
    return response.generations[0][0].text

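# Illustrative usage sketch (placeholder question, not part of the app's
# flow): chat_query_anthropic is a coroutine, so a one-off call from
# synchronous code would look like:
#
#   reply = asyncio.run(chat_query_anthropic(
#       [(Roles.HUMAN, "Summarize the most recent call in one sentence.")]))
#   print(reply)
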
import langchain.embeddings.base as base_embeddings
import langchain.vectorstores.base as base_vc
import numpy as np
from langchain.docstore.document import Document


class NumpyVectorDB(base_vc.VectorStore):
    """Basic vector db implemented using numpy etc."""

    def __init__(self, embeddings: base_embeddings.Embeddings,
                 embedding_dim: int) -> None:
        self._embedder = embeddings
        self._embedding_matrix: np.ndarray = np.zeros((0, embedding_dim))
        self._keys: set[str] = set()
        self._attr: dict[str, list] = {}
        self._size: int = 0
        self._content: list[str] = []

    def add_texts(self,
                  texts: list[str],
                  metadatas: list[dict] | None = None) -> None:
        # default to empty metadata for each text
        if metadatas is None:
            metadatas = [{} for _ in texts]
        new_embeddings = self._embedder.embed_documents(texts)
        new_size = self._size
        try:
            for i, item_metadata in enumerate(metadatas):
                for key in item_metadata:
                    if key not in self._keys:
                        self._keys.add(key)
                        self._attr[key] = [None] * new_size
                    self._attr[key] = self._attr[key] + [item_metadata[key]]
                for key in self._keys:
                    if key not in item_metadata:
                        self._attr[key] = self._attr[key] + [None]
                self._content.append(texts[i])
                new_size += 1
            self._embedding_matrix = np.concatenate(
                [self._embedding_matrix, new_embeddings])
            self._size = new_size
        except Exception as e:
            # roll back partial updates so the store stays consistent
            print("Error adding texts to vector db.")
            for key in self._keys:
                self._attr[key] = self._attr[key][:self._size]
            self._content = self._content[:self._size]
            self._embedding_matrix = self._embedding_matrix[:self._size]
            raise e

    def in_db(self, _filter: dict[str, str]) -> bool:
        """Checks if a document matching the filter is in the database"""
        keys = _filter.keys()
        for key in keys:
            if key not in self._keys:
                print("Key not in database.")
                return False
        one_hots = np.array([
            np.equal(self._attr[key], _filter[key])
            if key in self._keys else False for key in keys
        ])
        # multiply one_hots together
        if one_hots.size == 0:
            print("No one_hots found.")
            return False
        one_hot = np.prod(one_hots, axis=0)
        # check if any of the one_hots are 1
        return np.any(one_hot)

    def similarity_search(
        self,
        query: str,
        k: int = 10,
        # filter is a reserved keyword, but is required
        # due to langchain's interface
        # pylint: disable=redefined-builtin
        filter: dict | None = None,
        # pylint: enable=redefined-builtin
    ) -> list[Document]:
        """
        k: Number of Documents to return.
            Defaults to 10.
        filter: Attribute filter by metadata, example {'key': 'value'}.
            Defaults to None.
        """
        query_embedding = self._embedder.embed_query(query)
        distances = np.linalg.norm(self._embedding_matrix - query_embedding,
                                   axis=1,
                                   ord=2)
        # normalize
        distances -= np.min(distances)
        # filter: non-matching rows get infinite distance so they rank last
        if filter is not None:
            for key in filter:
                matches = np.equal(self._attr[key], filter[key])
                distances = np.where(matches, distances, np.inf)
        # top k indices
        if k >= len(distances):
            sorted_indices = np.arange(len(distances))
        else:
            sorted_indices = np.argpartition(distances, k)[:k]
        return [
            Document(page_content=self._content[i],
                     metadata={key: self._attr[key][i]
                               for key in self._keys})
            for i in sorted_indices[:k]
        ]

    @staticmethod
    def from_texts(**kwargs):
        raise NotImplementedError

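# Illustrative usage sketch (made-up text, metadata, and embedder `emb`):
#
#   db = NumpyVectorDB(emb, 1536)
#   db.add_texts(["Acme halved time-to-hire with Workable."],
#                [{"url": "https://example.com/acme"}])
#   hits = db.similarity_search("time to hire", k=1,
#                               filter={"url": "https://example.com/acme"})
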
EMBEDDING_DIM = 1536

import langchain.docstore.document as lc_document_models
import langchain.embeddings as lc_embeddings
import langchain.embeddings.base as base_embeddings
import langchain.text_splitter as lc_text_splitter

embeddings = lc_embeddings.OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY)

workableVectorDB = NumpyVectorDB(embeddings, EMBEDDING_DIM)

"""Module provides a reusable retrieval chain
"""

import langchain.docstore.document as docstore

SEARCH_KWARGS = {"k": 1}

# pylint: disable=line-too-long

QUERY_MESSAGES: list[tuple[Roles, str]] = [
    (Roles.HUMAN, "Hello"),
    (Roles.SYSTEM, ""),
    (Roles.AI,
     "Hi I am Mnemosyne, a question answering system built by Glyphic. " +
     "I have access to a series of sales calls from a deal and can retrieve the most relevant " +
     "parts of the call for you, and then answer the question. What would you like to know?"
     ),
    (Roles.HUMAN, "Great let me think about that for a second.")
]


# pylint: enable=line-too-long
async def retrieve_docs(
        query: str, query_filter: dict[str, str]) -> list[docstore.Document]:
    """Retrieves documents for a query

    Args:
        query: the query to run
        query_filter: the filter to run the query with,
            see https://docs.activeloop.ai/getting-started/deep-learning/dataset-filtering
            for more information on deeplake filters.
            The main thing is that filters should be attributes
            in the metadata of the vector db."""
    print(f"Retrieving docs for query {query} and filter {query_filter}")
    retriever = workableVectorDB.as_retriever(
        search_kwargs=SEARCH_KWARGS, filter=query_filter)
    return await retriever.aget_relevant_documents(query)


def _get_doc_representation(doc: docstore.Document) -> str:
    metadata = doc.metadata
    content = doc.page_content
    if "call_id" in metadata:
        content = (f"Excerpt from call {metadata['title']}, "
                   f"on {metadata['date']}, with {metadata['buyer_domain']}: "
                   f"{content}")
    elif "url" in metadata:
        content = f"Case study from url {metadata['url']}: {content}"
    return content


async def _combine_docs(docs: list[docstore.Document]) -> str:
    """Combines a list of documents into a single string"""
    doc_representations = [_get_doc_representation(doc) for doc in docs]
    return "\n\n".join(doc_representations)


async def answer_question(question: str, docs: str):
    """Answers a question given a query and a list of documents"""
    messages = QUERY_MESSAGES.copy()
    messages += [(Roles.HUMAN, question),
                 (Roles.SYSTEM,
                  f"Here are the documents I found:\n\n{docs}\n\n"),
                 (Roles.SYSTEM,
                  f"Now reply to the question: {question}.\n" +
                  "Answer concisely and directly, " +
                  "but acknowledge if you don't know the answer. " +
                  "Some information may be from earlier parts of the deal " +
                  "and may be irrelevant to the question. " +
                  "The user will be unable to ask follow up questions.")]
    return await chat_query_anthropic(messages)


async def run_query(query: str,
                    query_filter: dict[str, str]) -> tuple[str, str]:
    """Runs a query on the retrieval chain

    Returns the answer and the source url of the top document.

    Args:
        query: the query to run
        query_filter: the filter to run the query with,
            see https://docs.activeloop.ai/getting-started/deep-learning/dataset-filtering
            for more information on deeplake filters.
            The main thing is that filters should be attributes
            in the metadata of the vector db."""
    print(f"Running query {query} for filter {query_filter}")
    docs = await retrieve_docs(query, query_filter)
    for i, doc in enumerate(docs):
        print(f"Retrieved doc no.{i}\n{doc.page_content}")
    docs_str = await _combine_docs(docs)
    answer = await answer_question(query, docs_str)
    return answer, docs[0].metadata["url"]

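# Illustrative end-to-end sketch (placeholder question): run_query returns
# (answer, source_url), so from synchronous code:
#
#   answer, source = asyncio.run(
#       run_query("Why do customers switch to Workable?", query_filter={}))
#   print(answer, "\nSource:", source)
#
# An empty query_filter searches every indexed case study.
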
"""## Scraping"""

workable_urls = [
    "https://resources.workable.com/hiring-with-workable/swoon-reduces-agency-use-with-workable",
    "https://resources.workable.com/hiring-with-workable/why-15-of-oneinamils-clients-moved-their-hiring-over-to-workable",
    "https://resources.workable.com/backstage/workable-named-top-rated-ats-by-trustradius-for-2019"
]

import requests
from bs4 import BeautifulSoup
import pprint
import numpy as np

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}

PAGES = [
    "https://resources.workable.com/tag/customer-stories/",
    "https://resources.workable.com/tag/customer-stories/page/2/",
    "https://resources.workable.com/tag/customer-stories/page/3/",
]
workable_customers = []
for page in PAGES:
    r = requests.get(page, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if href.startswith("https://resources.workable.com/hiring-with-workable/"):
            workable_customers.append(href)

workable_customers


def get_paragraphs_workable(url):
    r = requests.get(url=url, headers=headers)

    soup = BeautifulSoup(r.content, 'html.parser')

    target_p = []

    # traverse paragraphs from soup to get text from the target page
    for data in soup.find_all("p"):
        text = data.get_text()
        if len(text) > 3:
            target_p.append(text.strip())
    return target_p


def clean_text(text):
    text = text.replace("\n\n", "\n")
    text = text.replace("\t\t", "\t")
    text = text.replace("\r", " ")
    text = text.replace("  ", " ")
    return text


def loop(input):
    """Applies clean_text until the text stops changing (a fixpoint)."""
    prev = ""
    while prev != input:
        prev = input
        input = clean_text(input)
    return input

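# Worked example: loop drives clean_text to a fixpoint, so runs of blank
# lines collapse completely, e.g. loop("a\n\n\n\nb") == "a\nb", whereas a
# single clean_text pass would only get to "a\n\nb".
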
workable_case_studies = []
# for customer in customers:
# TODO(fix)
for customer in workable_customers:
    url = customer
    # The first and last few paragraphs are boilerplate, so drop them.
    paragraphs = get_paragraphs_workable(customer)[4:][:-4]
    workable_case_studies.append((url, loop('<join>'.join(paragraphs))))
# TODO: Some additional filtering is still needed, especially towards the
# end. We should probably discard things that are not in the main body.
workable_case_studies


"""## App logic"""

for (url, case_study) in workable_case_studies:
    workableVectorDB.add_texts([case_study], [{"url": url}])

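# Note: the {"url": url} metadata stored above is what query_filter matches
# against, so retrieval can be pinned to a single case study, e.g.:
#
#   docs = asyncio.run(retrieve_docs(
#       "How did Swoon reduce agency use?", {"url": workable_urls[0]}))
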
import gradio as gr
import requests
import asyncio

API_KEY = os.environ.get("API_KEY")


def authenticate(api_key):
    # Check if the provided API key matches the expected key
    return api_key == API_KEY


def get_answer(question, api_key):
    # Authenticate the API key
    if not authenticate(api_key):
        return "Invalid API key. Access denied."

    # Run the retrieval chain over all indexed case studies
    response = asyncio.run(run_query(question, query_filter={}))

    return f"{response[0]} \n\nSource: {response[1]}"


# Create a Gradio interface
iface = gr.Interface(
    fn=get_answer,
    inputs=["text", gr.inputs.Textbox(label="API Key")],
    outputs="text",
    title="Question Answering App",
    description="Enter a question and API key to get an answer.",
    theme="default",
    layout="vertical"
)

# Launch the Gradio interface
iface.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
openai
langchain
beautifulsoup4
anthropic
backoff
tiktoken