andrewgleave committed on
Commit
9643fb1
1 Parent(s): 95656f5
Files changed (6) hide show
  1. .gitignore +1 -0
  2. app.py +149 -0
  3. chain.py +140 -0
  4. requirements.txt +81 -0
  5. store/tok_doc_idx.json +0 -0
  6. store/tok_docs.json +0 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ env
app.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+
5
+ from langchain.docstore.document import Document
6
+ from langchain.docstore.in_memory import InMemoryDocstore
7
+ from langchain.embeddings import OpenAIEmbeddings
8
+ from langchain.vectorstores.faiss import FAISS
9
+
10
+ import gradio as gr
11
+
12
+ from chain import get_chain
13
+
14
+ STORE_DIR = "store"
15
+ YOUTUBE_EMBED_TEMPLATE = """
16
+ <iframe width="354" height="200" src="{source}" title="YouTube video player" frameborder="0"
17
+ allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen>
18
+ </iframe>"""
19
+
20
+
21
def load_store():
    """Rehydrate the persisted FAISS vector store from STORE_DIR.

    Expects exactly one ``*.faiss`` index file plus two JSON sidecars named
    ``{name}_doc_idx.json`` (position -> docstore id) and ``{name}_docs.json``
    (serialized Document fields).

    Returns:
        A FAISS vectorstore wired to an OpenAI query-embedding function.

    Raises:
        ValueError: if no ``*.faiss`` file is present in STORE_DIR.
    """

    def _int_keys(mapping):
        # JSON object keys arrive as strings; the index mapping needs ints.
        return {int(key): value for key, value in mapping.items()}

    def _load_faiss_index(p):
        # Imported lazily so module import does not require faiss.
        import faiss

        return faiss.read_index(str(p))

    faiss_files = list(Path(STORE_DIR).glob("*.faiss"))
    if not faiss_files:
        raise ValueError("No index found in path")

    faiss_path = faiss_files[0]
    store_name = faiss_path.name.split(".")[0]

    with open(os.path.join(STORE_DIR, f"{store_name}_doc_idx.json"), "r") as fh:
        idx_to_docstore_id = json.load(fh, object_hook=_int_keys)

    with open(os.path.join(STORE_DIR, f"{store_name}_docs.json"), "r") as fh:
        raw_docs = json.load(fh)

    # NOTE(review): assumes raw_docs preserves insertion order matching the
    # index positions — holds for JSON objects loaded into Python dicts.
    docstore = InMemoryDocstore(
        {
            idx_to_docstore_id[position]: Document(**fields)
            for position, fields in enumerate(raw_docs.values())
        }
    )
    return FAISS(
        embedding_function=OpenAIEmbeddings().embed_query,
        index=_load_faiss_index(faiss_path),
        docstore=docstore,
        index_to_docstore_id=idx_to_docstore_id,
    )
52
+
53
+
54
def set_openai_api_key(api_key, agent):
    """Build the QA chain for the session using the supplied OpenAI key.

    The key is placed in the process environment only for the duration of
    chain construction, then scrubbed.

    Args:
        api_key: the user's OpenAI API key; falsy values are ignored.
        agent: the current agent state (unused; present to match the
            Gradio ``change`` handler signature).

    Returns:
        The constructed QA chain, or None when no key was provided.
    """
    if not api_key:
        return None
    os.environ["OPENAI_API_KEY"] = api_key
    try:
        vstore = load_store()
        qa_chain = get_chain(vstore)
    finally:
        # Bug fix: scrub the key even if load_store/get_chain raises, so a
        # failed attempt never leaves the user's key in the environment.
        os.environ["OPENAI_API_KEY"] = ""
    return qa_chain
61
+
62
+
63
+ def _to_embed(link):
64
+ return link.replace("watch?v=", "embed/").replace("&t=", "?start=")
65
+
66
+
67
def chat(inp, history, agent):
    """Handle one chat turn: answer the question and render source videos.

    Args:
        inp: the user's question.
        history: list of (human, assistant) tuples, or None on first turn.
        agent: the QA chain, or None if no API key has been provided yet.

    Returns:
        (chatbot history, state history, sources HTML) — three values to
        match the three Gradio outputs this handler is wired to.
    """
    history = history or []
    if agent is None:
        history.append((inp, "Please paste your OpenAI key to use"))
        # Bug fix: this handler feeds three outputs (chatbot, state, sources),
        # so the early return must also yield a sources value; the original
        # returned only two and would fail to unpack.
        return history, history, ""
    output = agent({"question": inp, "chat_history": history})
    answer = output["answer"]
    history.append((inp, answer))
    # Build an iframe for every YouTube source the chain cited.
    source_iframes = []
    for source in output["sources"]:
        if "youtube.com" in source:
            source_iframes.append(
                YOUTUBE_EMBED_TEMPLATE.format(source=_to_embed(source))
            )
    source_html = f"""<div style='min-height:200px;display:flex;align-items:center;justify-content:space-around;'>
{''.join(source_iframes)}
</div>"""
    return history, history, source_html
85
+
86
+
87
# --- Gradio UI: layout and event wiring ------------------------------------
block = gr.Blocks(css=".gradio-container {background-color: lightgray}")
with block:
    gr.Markdown("<h3><center>ToKBot🤖 - Ask ToKCast Questions</center></h3>")
    # Users bring their own key; it is exchanged for a chain in agent_state.
    openai_api_key_textbox = gr.Textbox(
        placeholder="Paste your OpenAI API key (sk-...)",
        show_label=False,
        lines=1,
        type="password",
    )

    chatbot = gr.Chatbot()
    gr.Markdown("<h3>Excerpts</h3>")
    # Placeholder shown until a question yields YouTube excerpts.
    sources = gr.HTML(
        """<div style="min-height:200px;display:flex;align-items:center;justify-content:center;">
<h3 style="text-align:center;color:#555;font-size:2rem;">No videos</h3>
</div>"""
    )
    with gr.Row():
        message = gr.Textbox(
            label="What's your question?",
            placeholder="Type your question here...",
            lines=1,
        )
        submit = gr.Button(value="Send", variant="secondary").style(full_width=False)

    gr.Examples(
        examples=[
            "What is a beginning of infinity?",
            "How do memes differ from genes in how they replicate?",
            "What is the nature of knowledge and how does it grow?",
        ],
        inputs=message,
    )

    # Text fix: original read "...podcast provides relevant video excerpts".
    gr.HTML(
        """A GPT-3/LangChain bot that answers questions about the TokCast podcast and provides relevant video excerpts"""
    )

    gr.HTML(
        "<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
    )

    # state holds chat history; agent_state holds the per-session QA chain.
    state = gr.State()
    agent_state = gr.State()

    # Both the button and Enter-in-textbox trigger the same chat handler.
    submit.click(
        chat,
        inputs=[message, state, agent_state],
        outputs=[chatbot, state, sources],
    )
    message.submit(
        chat,
        inputs=[message, state, agent_state],
        outputs=[chatbot, state, sources],
    )

    # Rebuild the QA chain whenever the API key changes.
    openai_api_key_textbox.change(
        set_openai_api_key,
        inputs=[openai_api_key_textbox, agent_state],
        outputs=[agent_state],
    )

block.launch(debug=True)
chain.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Tuple
2
+
3
+ from langchain import OpenAI, PromptTemplate
4
+ from langchain.chains import LLMChain
5
+ from langchain.chains.base import Chain
6
+ from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
7
+ from langchain.chains.question_answering import load_qa_chain
8
+ from langchain.prompts import FewShotPromptTemplate
9
+
10
+ # from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
11
+ from langchain.vectorstores import FAISS
12
+ from pydantic import BaseModel
13
+
14
+
15
class CustomChain(Chain, BaseModel):
    """Conversational QA chain over the ToKCast transcript vector store.

    Condenses (question + chat history) into a standalone question, retrieves
    the top-k similar transcript chunks, and answers with a stuffed-documents
    QA chain whose output may carry a trailing "SOURCES:" section.
    """

    # FAISS store of transcript chunks used for similarity search.
    vstore: FAISS
    # Chain that stuffs retrieved documents into the QA prompt.
    chain: BaseCombineDocumentsChain
    # Chain that rewrites follow-ups into standalone questions.
    key_word_extractor: Chain

    @property
    def input_keys(self) -> List[str]:
        return ["question"]

    @property
    def output_keys(self) -> List[str]:
        return ["answer", "sources"]

    def _call(self, inputs: Dict[str, str]) -> Dict[str, str]:
        question = inputs["question"]
        chat_history_str = _get_chat_history(inputs["chat_history"])
        if chat_history_str:
            # Rewrite the follow-up so retrieval sees a standalone question.
            new_question = self.key_word_extractor.run(
                question=question, chat_history=chat_history_str
            )
        else:
            new_question = question
        docs = self.vstore.similarity_search(new_question, k=3)
        new_inputs = inputs.copy()
        new_inputs["question"] = new_question
        new_inputs["chat_history"] = chat_history_str
        answer, _ = self.chain.combine_docs(docs, **new_inputs)
        # Bug fixes vs. original:
        # - default sources to a list (was "" — inconsistent type for callers
        #   that iterate it as a list of URLs);
        # - use partition() instead of a two-way split() unpack, which raised
        #   ValueError when "SOURCES:" appeared more than once in the answer;
        # - strip whitespace/newlines from each parsed source URL.
        sources: List[str] = []
        if "SOURCES:" in answer:
            answer, _, sources_str = answer.partition("SOURCES:")
            sources = [s.strip() for s in sources_str.split(",") if s.strip()]
        answer = answer.strip()
        return {"answer": answer, "sources": sources}
50
+
51
+
52
def get_chain(vectorstore: FAISS) -> Chain:
    """Build the CustomChain: a question-condensing LLM chain plus a
    stuffed-documents QA chain over the given FAISS vector store.

    Args:
        vectorstore: FAISS store of transcript chunks to retrieve from.

    Returns:
        A CustomChain ready to answer {"question", "chat_history"} inputs.
    """
    # Template for one few-shot example in the question-condensing prompt.
    _eg_template = """## Example:

Chat History:
{chat_history}
Follow Up question: {question}
Standalone question: {answer}"""
    _eg_prompt = PromptTemplate(
        template=_eg_template,
        input_variables=["chat_history", "question", "answer"],
    )

    # Instruction prefix and the live-input suffix around the examples.
    _prefix = """Given the following Chat History and a Follow Up Question, rephrase the Follow Up Question to be a new Standalone Question that takes the Chat History and context in to consideration. You should assume that the question is related to the TokCast podcast."""
    _suffix = """## Example:

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
    # Kept for reference: dynamic example selection was tried and disabled.
    # example_selector = SemanticSimilarityExampleSelector(
    #     vectorstore=vectorstore,
    #     k=4,
    # )

    # Hard-coded few-shot examples teaching the condenser to resolve
    # pronouns ("that", "here") against the chat history.
    examples = [
        {
            "question": "What is the TokCast podcast?",
            "chat_history": [],
            "answer": "TokCast is a podcast about the philosophy of David Deutsch.",
        },
        {
            "question": "Who is that?",
            "chat_history": "Human: What is the TokCast podcast?\nAssistant: TokCast is a podcast about the philosophy of David Deutsch.",
            "answer": "Who is David Deutsch?",
        },
        {
            "question": "What is the worldview presented here?",
            "chat_history": "Human: What is the TokCast podcast?\nAssistant: TokCast is a podcast about the philosophy of David Deutsch.\nHuman: Who is that?\nAssistant: David Deutsch is a philosopher, physicist, and author. He is the author of The Beginning of Infinity, Fabric of Reality, and one of the pioneers of the field of quantum computing.",
            "answer": "What is David Deutsch's worldview?",
        },
    ]
    prompt = FewShotPromptTemplate(
        prefix=_prefix,
        suffix=_suffix,
        # example_selector=example_selector,
        examples=examples,
        example_prompt=_eg_prompt,
        input_variables=["question", "chat_history"],
    )
    # temperature=0 for deterministic question rewriting.
    llm = OpenAI(temperature=0, model_name="text-davinci-003")
    key_word_extractor = LLMChain(llm=llm, prompt=prompt, verbose=True)

    # How each retrieved document is rendered into the QA context.
    EXAMPLE_PROMPT = PromptTemplate(
        template="CONTENT:\n{page_content}\n----------\nSOURCE:\n{source}\n",
        input_variables=["page_content", "source"],
    )
    # Answering prompt: demands a verbatim "SOURCES:" section that
    # CustomChain._call later parses out of the completion.
    template = """You are an AI assistant for the TokCast Podcast. You're trained on all the transcripts of the podcast.
Given a QUESTION and a series one or more CONTENT and SOURCE sections from a long document provide a conversational answer as "ANSWER" and a "SOURCES" output which lists verbatim the SOURCEs used in generating the response.
You should only use SOURCEs that are explicitly listed as a SOURCE in the context.
ALWAYS include the "SOURCES" as part of the response. If you don't have any sources, just say "SOURCES:"
If you don't know the answer, just say "I'm not sure. Check out Brett's Channel" Don't try to make up an answer.
QUESTION: {question}
=========
{context}
=========
ANSWER:"""
    PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])
    # max_tokens=-1 lets the completion run to the model's context limit.
    doc_chain = load_qa_chain(
        OpenAI(temperature=0, model_name="text-davinci-003", max_tokens=-1),
        chain_type="stuff",
        prompt=PROMPT,
        document_prompt=EXAMPLE_PROMPT,
        verbose=True,
    )
    return CustomChain(
        chain=doc_chain,
        vstore=vectorstore,
        key_word_extractor=key_word_extractor,
        verbose=True,
    )
+ )
132
+
133
+
134
+ def _get_chat_history(chat_history: List[Tuple[str, str]]):
135
+ buffer = ""
136
+ for human_s, ai_s in chat_history:
137
+ human = "Human: " + human_s
138
+ ai = "Assistant: " + ai_s
139
+ buffer += "\n" + "\n".join([human, ai])
140
+ return buffer
requirements.txt ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==22.1.0
2
+ aiohttp==3.8.3
3
+ aiosignal==1.3.1
4
+ altair==4.2.0
5
+ anyio==3.6.2
6
+ async-timeout==4.0.2
7
+ attrs==22.2.0
8
+ autopep8==2.0.1
9
+ black==22.12.0
10
+ blobfile==2.0.1
11
+ certifi==2022.12.7
12
+ charset-normalizer==2.1.1
13
+ click==8.1.3
14
+ contourpy==1.0.7
15
+ cycler==0.11.0
16
+ entrypoints==0.4
17
+ faiss-cpu==1.7.3
18
+ fastapi==0.89.1
19
+ ffmpy==0.3.0
20
+ filelock==3.9.0
21
+ flake8==6.0.0
22
+ fonttools==4.38.0
23
+ frozenlist==1.3.3
24
+ fsspec==2023.1.0
25
+ gradio==3.16.2
26
+ greenlet==2.0.1
27
+ h11==0.14.0
28
+ httpcore==0.16.3
29
+ httpx==0.23.3
30
+ idna==3.4
31
+ Jinja2==3.1.2
32
+ jsonschema==4.17.3
33
+ kiwisolver==1.4.4
34
+ langchain==0.0.65
35
+ linkify-it-py==1.0.3
36
+ lxml==4.9.2
37
+ markdown-it-py==2.1.0
38
+ MarkupSafe==2.1.2
39
+ matplotlib==3.6.3
40
+ mccabe==0.7.0
41
+ mdit-py-plugins==0.3.3
42
+ mdurl==0.1.2
43
+ multidict==6.0.4
44
+ mypy-extensions==0.4.3
45
+ numpy==1.24.1
46
+ openai==0.26.1
47
+ orjson==3.8.5
48
+ packaging==23.0
49
+ pandas==1.5.3
50
+ pathspec==0.10.3
51
+ Pillow==9.4.0
52
+ platformdirs==2.6.2
53
+ pycodestyle==2.10.0
54
+ pycryptodome==3.16.0
55
+ pycryptodomex==3.16.0
56
+ pydantic==1.10.4
57
+ pydub==0.25.1
58
+ pyflakes==3.0.1
59
+ pyparsing==3.0.9
60
+ pyrsistent==0.19.3
61
+ python-dateutil==2.8.2
62
+ python-multipart==0.0.5
63
+ pytz==2022.7.1
64
+ PyYAML==6.0
65
+ regex==2022.10.31
66
+ requests==2.28.2
67
+ rfc3986==1.5.0
68
+ six==1.16.0
69
+ sniffio==1.3.0
70
+ SQLAlchemy==1.4.46
71
+ starlette==0.22.0
72
+ tiktoken==0.1.2
73
+ tomli==2.0.1
74
+ toolz==0.12.0
75
+ tqdm==4.64.1
76
+ typing_extensions==4.4.0
77
+ uc-micro-py==1.0.1
78
+ urllib3==1.26.14
79
+ uvicorn==0.20.0
80
+ websockets==10.4
81
+ yarl==1.8.2
store/tok_doc_idx.json ADDED
The diff for this file is too large to render. See raw diff
 
store/tok_docs.json ADDED
The diff for this file is too large to render. See raw diff