import os import re from ast import literal_eval import wandb import gradio as gr import pandas as pd from langchain.callbacks import get_openai_callback from langchain.chains import RetrievalQA from langchain.chat_models import ChatOpenAI from langchain.embeddings.openai import OpenAIEmbeddings from langchain.prompts import PromptTemplate from langchain.vectorstores import Chroma from src.config import config # download and read data api = wandb.Api() artifact_df = api.artifact(config.summarized_que_data_artifact) artifact_df.download(config.root_data_dir) artifact_embeddings = api.artifact(config.transcript_embeddings_artifact) chromadb_dir = artifact_embeddings.download(config.root_data_dir / "chromadb") df_path = config.root_data_dir / "summarized_que_podcasts.csv" df = pd.read_csv(df_path) def embed_video(title: str): video_url = df[df["title"] == title]["url"].values[0] match = re.search(r"v=([-\w]+)", video_url) video_id = match.group(1) # embed video # video_embed = f"" video_embed = f"" return video_embed def get_podcast_info(title: str): # get questions questions = df[df["title"] == title]["questions"].values[0] questions = literal_eval(questions) que_str = "" for que in questions: que_str += f"👉 {que}\n" # get summary summary = df[df["title"] == title]["summary"].values[0] return summary, que_str def get_answer(podcast: str, question: str): index = df[df["title"] == podcast].index[0] db_dir = os.path.join(chromadb_dir, str(index)) embeddings = OpenAIEmbeddings() db = Chroma(persist_directory=db_dir, embedding_function=embeddings) prompt_template = """Use the following pieces of context to answer the question. If you don't know the answer, just say that you don't know, don't try to make up an answer. Don't add your opinions or interpretations. Ensure that you complete the answer. If the question is not relevant to the context, just say that it is not relevant. CONTEXT: {context} QUESTION: {question} ANSWER:""" prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"]) retriever = db.as_retriever() retriever.search_kwargs["k"] = 2 qa = RetrievalQA.from_chain_type( llm=ChatOpenAI(temperature=0), chain_type="stuff", retriever=retriever, chain_type_kwargs={"prompt": prompt}, return_source_documents=True, ) with get_openai_callback() as cb: result = qa({"query": question}) print(cb) answer = result["result"] return answer with gr.Blocks() as demo: gr.Markdown("