andrewgleave committed on
Commit
9643fb1
1 Parent(s): 95656f5
Files changed (6) hide show
  1. .gitignore +1 -0
  2. app.py +149 -0
  3. chain.py +140 -0
  4. requirements.txt +81 -0
  5. store/tok_doc_idx.json +0 -0
  6. store/tok_docs.json +0 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ env
app.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+
5
+ from langchain.docstore.document import Document
6
+ from langchain.docstore.in_memory import InMemoryDocstore
7
+ from langchain.embeddings import OpenAIEmbeddings
8
+ from langchain.vectorstores.faiss import FAISS
9
+
10
+ import gradio as gr
11
+
12
+ from chain import get_chain
13
+
14
+ STORE_DIR = "store"
15
+ YOUTUBE_EMBED_TEMPLATE = """
16
+ <iframe width="354" height="200" src="{source}" title="YouTube video player" frameborder="0"
17
+ allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen>
18
+ </iframe>"""
19
+
20
+
21
def load_store():
    """Rehydrate the persisted FAISS vector store from STORE_DIR.

    Expects exactly one ``*.faiss`` index file plus two JSON sidecars named
    ``{name}_doc_idx.json`` (position -> docstore id) and ``{name}_docs.json``
    (serialized Document fields).

    Returns:
        A FAISS vectorstore wired to an OpenAI query-embedding function.

    Raises:
        ValueError: if no ``*.faiss`` file is present in STORE_DIR.
    """

    def _int_keys(mapping):
        # JSON object keys arrive as strings; the index mapping needs ints.
        return {int(key): value for key, value in mapping.items()}

    def _load_faiss_index(p):
        # Imported lazily so module import does not require faiss.
        import faiss

        return faiss.read_index(str(p))

    faiss_files = list(Path(STORE_DIR).glob("*.faiss"))
    if not faiss_files:
        raise ValueError("No index found in path")

    faiss_path = faiss_files[0]
    store_name = faiss_path.name.split(".")[0]

    with open(os.path.join(STORE_DIR, f"{store_name}_doc_idx.json"), "r") as fh:
        idx_to_docstore_id = json.load(fh, object_hook=_int_keys)

    with open(os.path.join(STORE_DIR, f"{store_name}_docs.json"), "r") as fh:
        raw_docs = json.load(fh)

    # NOTE(review): assumes raw_docs preserves insertion order matching the
    # index positions — holds for JSON objects loaded into Python dicts.
    docstore = InMemoryDocstore(
        {
            idx_to_docstore_id[position]: Document(**fields)
            for position, fields in enumerate(raw_docs.values())
        }
    )
    return FAISS(
        embedding_function=OpenAIEmbeddings().embed_query,
        index=_load_faiss_index(faiss_path),
        docstore=docstore,
        index_to_docstore_id=idx_to_docstore_id,
    )
52
+
53
+
54
def set_openai_api_key(api_key, agent):
    """Build the QA chain for the session using the supplied OpenAI key.

    The key is placed in the process environment only for the duration of
    chain construction, then scrubbed.

    Args:
        api_key: the user's OpenAI API key; falsy values are ignored.
        agent: the current agent state (unused; present to match the
            Gradio ``change`` handler signature).

    Returns:
        The constructed QA chain, or None when no key was provided.
    """
    if not api_key:
        return None
    os.environ["OPENAI_API_KEY"] = api_key
    try:
        vstore = load_store()
        qa_chain = get_chain(vstore)
    finally:
        # Bug fix: scrub the key even if load_store/get_chain raises, so a
        # failed attempt never leaves the user's key in the environment.
        os.environ["OPENAI_API_KEY"] = ""
    return qa_chain
61
+
62
+
63
+ def _to_embed(link):
64
+ return link.replace("watch?v=", "embed/").replace("&t=", "?start=")
65
+
66
+
67
def chat(inp, history, agent):
    """Handle one chat turn: answer the question and render source videos.

    Args:
        inp: the user's question.
        history: list of (human, assistant) tuples, or None on first turn.
        agent: the QA chain, or None if no API key has been provided yet.

    Returns:
        (chatbot history, state history, sources HTML) — three values to
        match the three Gradio outputs this handler is wired to.
    """
    history = history or []
    if agent is None:
        history.append((inp, "Please paste your OpenAI key to use"))
        # Bug fix: this handler feeds three outputs (chatbot, state, sources),
        # so the early return must also yield a sources value; the original
        # returned only two and would fail to unpack.
        return history, history, ""
    output = agent({"question": inp, "chat_history": history})
    answer = output["answer"]
    history.append((inp, answer))
    # Build an iframe for every YouTube source the chain cited.
    source_iframes = []
    for source in output["sources"]:
        if "youtube.com" in source:
            source_iframes.append(
                YOUTUBE_EMBED_TEMPLATE.format(source=_to_embed(source))
            )
    source_html = f"""<div style='min-height:200px;display:flex;align-items:center;justify-content:space-around;'>
{''.join(source_iframes)}
</div>"""
    return history, history, source_html
85
+
86
+
87
# --- Gradio UI: layout and event wiring ------------------------------------
block = gr.Blocks(css=".gradio-container {background-color: lightgray}")
with block:
    gr.Markdown("<h3><center>ToKBot🤖 - Ask ToKCast Questions</center></h3>")
    # Users bring their own key; it is exchanged for a chain in agent_state.
    openai_api_key_textbox = gr.Textbox(
        placeholder="Paste your OpenAI API key (sk-...)",
        show_label=False,
        lines=1,
        type="password",
    )

    chatbot = gr.Chatbot()
    gr.Markdown("<h3>Excerpts</h3>")
    # Placeholder shown until a question yields YouTube excerpts.
    sources = gr.HTML(
        """<div style="min-height:200px;display:flex;align-items:center;justify-content:center;">
<h3 style="text-align:center;color:#555;font-size:2rem;">No videos</h3>
</div>"""
    )
    with gr.Row():
        message = gr.Textbox(
            label="What's your question?",
            placeholder="Type your question here...",
            lines=1,
        )
        submit = gr.Button(value="Send", variant="secondary").style(full_width=False)

    gr.Examples(
        examples=[
            "What is a beginning of infinity?",
            "How do memes differ from genes in how they replicate?",
            "What is the nature of knowledge and how does it grow?",
        ],
        inputs=message,
    )

    # Text fix: original read "...podcast provides relevant video excerpts".
    gr.HTML(
        """A GPT-3/LangChain bot that answers questions about the TokCast podcast and provides relevant video excerpts"""
    )

    gr.HTML(
        "<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
    )

    # state holds chat history; agent_state holds the per-session QA chain.
    state = gr.State()
    agent_state = gr.State()

    # Both the button and Enter-in-textbox trigger the same chat handler.
    submit.click(
        chat,
        inputs=[message, state, agent_state],
        outputs=[chatbot, state, sources],
    )
    message.submit(
        chat,
        inputs=[message, state, agent_state],
        outputs=[chatbot, state, sources],
    )

    # Rebuild the QA chain whenever the API key changes.
    openai_api_key_textbox.change(
        set_openai_api_key,
        inputs=[openai_api_key_textbox, agent_state],
        outputs=[agent_state],
    )

block.launch(debug=True)
chain.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Tuple
2
+
3
+ from langchain import OpenAI, PromptTemplate
4
+ from langchain.chains import LLMChain
5
+ from langchain.chains.base import Chain
6
+ from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
7
+ from langchain.chains.question_answering import load_qa_chain
8
+ from langchain.prompts import FewShotPromptTemplate
9
+
10
+ # from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
11
+ from langchain.vectorstores import FAISS
12
+ from pydantic import BaseModel
13
+
14
+
15
class CustomChain(Chain, BaseModel):
    """Conversational QA chain over the ToKCast transcript vector store.

    Condenses (question + chat history) into a standalone question, retrieves
    the top-k similar transcript chunks, and answers with a stuffed-documents
    QA chain whose output may carry a trailing "SOURCES:" section.
    """

    # FAISS store of transcript chunks used for similarity search.
    vstore: FAISS
    # Chain that stuffs retrieved documents into the QA prompt.
    chain: BaseCombineDocumentsChain
    # Chain that rewrites follow-ups into standalone questions.
    key_word_extractor: Chain

    @property
    def input_keys(self) -> List[str]:
        return ["question"]

    @property
    def output_keys(self) -> List[str]:
        return ["answer", "sources"]

    def _call(self, inputs: Dict[str, str]) -> Dict[str, str]:
        question = inputs["question"]
        chat_history_str = _get_chat_history(inputs["chat_history"])
        if chat_history_str:
            # Rewrite the follow-up so retrieval sees a standalone question.
            new_question = self.key_word_extractor.run(
                question=question, chat_history=chat_history_str
            )
        else:
            new_question = question
        docs = self.vstore.similarity_search(new_question, k=3)
        new_inputs = inputs.copy()
        new_inputs["question"] = new_question
        new_inputs["chat_history"] = chat_history_str
        answer, _ = self.chain.combine_docs(docs, **new_inputs)
        # Bug fixes vs. original:
        # - default sources to a list (was "" — inconsistent type for callers
        #   that iterate it as a list of URLs);
        # - use partition() instead of a two-way split() unpack, which raised
        #   ValueError when "SOURCES:" appeared more than once in the answer;
        # - strip whitespace/newlines from each parsed source URL.
        sources: List[str] = []
        if "SOURCES:" in answer:
            answer, _, sources_str = answer.partition("SOURCES:")
            sources = [s.strip() for s in sources_str.split(",") if s.strip()]
        answer = answer.strip()
        return {"answer": answer, "sources": sources}
50
+
51
+
52
def get_chain(vectorstore: FAISS) -> Chain:
    """Build the CustomChain: a question-condensing LLM chain plus a
    stuffed-documents QA chain over the given FAISS vector store.

    Args:
        vectorstore: FAISS store of transcript chunks to retrieve from.

    Returns:
        A CustomChain ready to answer {"question", "chat_history"} inputs.
    """
    # Template for one few-shot example in the question-condensing prompt.
    _eg_template = """## Example:

Chat History:
{chat_history}
Follow Up question: {question}
Standalone question: {answer}"""
    _eg_prompt = PromptTemplate(
        template=_eg_template,
        input_variables=["chat_history", "question", "answer"],
    )

    # Instruction prefix and the live-input suffix around the examples.
    _prefix = """Given the following Chat History and a Follow Up Question, rephrase the Follow Up Question to be a new Standalone Question that takes the Chat History and context in to consideration. You should assume that the question is related to the TokCast podcast."""
    _suffix = """## Example:

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
    # Kept for reference: dynamic example selection was tried and disabled.
    # example_selector = SemanticSimilarityExampleSelector(
    #     vectorstore=vectorstore,
    #     k=4,
    # )

    # Hard-coded few-shot examples teaching the condenser to resolve
    # pronouns ("that", "here") against the chat history.
    examples = [
        {
            "question": "What is the TokCast podcast?",
            "chat_history": [],
            "answer": "TokCast is a podcast about the philosophy of David Deutsch.",
        },
        {
            "question": "Who is that?",
            "chat_history": "Human: What is the TokCast podcast?\nAssistant: TokCast is a podcast about the philosophy of David Deutsch.",
            "answer": "Who is David Deutsch?",
        },
        {
            "question": "What is the worldview presented here?",
            "chat_history": "Human: What is the TokCast podcast?\nAssistant: TokCast is a podcast about the philosophy of David Deutsch.\nHuman: Who is that?\nAssistant: David Deutsch is a philosopher, physicist, and author. He is the author of The Beginning of Infinity, Fabric of Reality, and one of the pioneers of the field of quantum computing.",
            "answer": "What is David Deutsch's worldview?",
        },
    ]
    prompt = FewShotPromptTemplate(
        prefix=_prefix,
        suffix=_suffix,
        # example_selector=example_selector,
        examples=examples,
        example_prompt=_eg_prompt,
        input_variables=["question", "chat_history"],
    )
    # temperature=0 for deterministic question rewriting.
    llm = OpenAI(temperature=0, model_name="text-davinci-003")
    key_word_extractor = LLMChain(llm=llm, prompt=prompt, verbose=True)

    # How each retrieved document is rendered into the QA context.
    EXAMPLE_PROMPT = PromptTemplate(
        template="CONTENT:\n{page_content}\n----------\nSOURCE:\n{source}\n",
        input_variables=["page_content", "source"],
    )
    # Answering prompt: demands a verbatim "SOURCES:" section that
    # CustomChain._call later parses out of the completion.
    template = """You are an AI assistant for the TokCast Podcast. You're trained on all the transcripts of the podcast.
Given a QUESTION and a series one or more CONTENT and SOURCE sections from a long document provide a conversational answer as "ANSWER" and a "SOURCES" output which lists verbatim the SOURCEs used in generating the response.
You should only use SOURCEs that are explicitly listed as a SOURCE in the context.
ALWAYS include the "SOURCES" as part of the response. If you don't have any sources, just say "SOURCES:"
If you don't know the answer, just say "I'm not sure. Check out Brett's Channel" Don't try to make up an answer.
QUESTION: {question}
=========
{context}
=========
ANSWER:"""
    PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])
    # max_tokens=-1 lets the completion run to the model's context limit.
    doc_chain = load_qa_chain(
        OpenAI(temperature=0, model_name="text-davinci-003", max_tokens=-1),
        chain_type="stuff",
        prompt=PROMPT,
        document_prompt=EXAMPLE_PROMPT,
        verbose=True,
    )
    return CustomChain(
        chain=doc_chain,
        vstore=vectorstore,
        key_word_extractor=key_word_extractor,
        verbose=True,
    )
+ )
132
+
133
+
134
+ def _get_chat_history(chat_history: List[Tuple[str, str]]):
135
+ buffer = ""
136
+ for human_s, ai_s in chat_history:
137
+ human = "Human: " + human_s
138
+ ai = "Assistant: " + ai_s
139
+ buffer += "\n" + "\n".join([human, ai])
140
+ return buffer
requirements.txt ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==22.1.0
2
+ aiohttp==3.8.3
3
+ aiosignal==1.3.1
4
+ altair==4.2.0
5
+ anyio==3.6.2
6
+ async-timeout==4.0.2
7
+ attrs==22.2.0
8
+ autopep8==2.0.1
9
+ black==22.12.0
10
+ blobfile==2.0.1
11
+ certifi==2022.12.7
12
+ charset-normalizer==2.1.1
13
+ click==8.1.3
14
+ contourpy==1.0.7
15
+ cycler==0.11.0
16
+ entrypoints==0.4
17
+ faiss-cpu==1.7.3
18
+ fastapi==0.89.1
19
+ ffmpy==0.3.0
20
+ filelock==3.9.0
21
+ flake8==6.0.0
22
+ fonttools==4.38.0
23
+ frozenlist==1.3.3
24
+ fsspec==2023.1.0
25
+ gradio==3.16.2
26
+ greenlet==2.0.1
27
+ h11==0.14.0
28
+ httpcore==0.16.3
29
+ httpx==0.23.3
30
+ idna==3.4
31
+ Jinja2==3.1.2
32
+ jsonschema==4.17.3
33
+ kiwisolver==1.4.4
34
+ langchain==0.0.65
35
+ linkify-it-py==1.0.3
36
+ lxml==4.9.2
37
+ markdown-it-py==2.1.0
38
+ MarkupSafe==2.1.2
39
+ matplotlib==3.6.3
40
+ mccabe==0.7.0
41
+ mdit-py-plugins==0.3.3
42
+ mdurl==0.1.2
43
+ multidict==6.0.4
44
+ mypy-extensions==0.4.3
45
+ numpy==1.24.1
46
+ openai==0.26.1
47
+ orjson==3.8.5
48
+ packaging==23.0
49
+ pandas==1.5.3
50
+ pathspec==0.10.3
51
+ Pillow==9.4.0
52
+ platformdirs==2.6.2
53
+ pycodestyle==2.10.0
54
+ pycryptodome==3.16.0
55
+ pycryptodomex==3.16.0
56
+ pydantic==1.10.4
57
+ pydub==0.25.1
58
+ pyflakes==3.0.1
59
+ pyparsing==3.0.9
60
+ pyrsistent==0.19.3
61
+ python-dateutil==2.8.2
62
+ python-multipart==0.0.5
63
+ pytz==2022.7.1
64
+ PyYAML==6.0
65
+ regex==2022.10.31
66
+ requests==2.28.2
67
+ rfc3986==1.5.0
68
+ six==1.16.0
69
+ sniffio==1.3.0
70
+ SQLAlchemy==1.4.46
71
+ starlette==0.22.0
72
+ tiktoken==0.1.2
73
+ tomli==2.0.1
74
+ toolz==0.12.0
75
+ tqdm==4.64.1
76
+ typing_extensions==4.4.0
77
+ uc-micro-py==1.0.1
78
+ urllib3==1.26.14
79
+ uvicorn==0.20.0
80
+ websockets==10.4
81
+ yarl==1.8.2
store/tok_doc_idx.json ADDED
The diff for this file is too large to render. See raw diff
 
store/tok_docs.json ADDED
The diff for this file is too large to render. See raw diff