import gradio as gr
from datetime import date, timedelta
from langchain.document_loaders import ArxivLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
# from langchain.llms import FakeListLLM
from langchain.chains import LLMChain, StuffDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.schema import Document

LOAD_MAX_DOCS = 100
# CHUNK_SIZE = 1000
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE)

embeddings = HuggingFaceEmbeddings()

# Formats each retrieved article before it is stuffed into the main prompt.
document_prompt = PromptTemplate(
    template="Title: {Title}\nContent: {page_content}",
    input_variables=["page_content", "Title"],
)
prompt = PromptTemplate(
    template="""Write a personalised newsletter for a researcher on the most recent exciting developments in their field. The researcher describes their work as follows: "{context}". Base the newsletter on the articles below. Extract the most exciting points and combine them into an exhilarating newsletter. Use Markdown format.\n# ARTICLES\n\n"{text}"\n\nNEWSLETTER:\n# Your AI curated newsletter\n""",
    input_variables=["context", "text"],
)

# llm = FakeListLLM(responses=list(map(str, range(100))))  # offline stand-in for testing

# HuggingFaceHub authenticates via the HUGGINGFACEHUB_API_TOKEN environment variable.
REPO_ID = "HuggingFaceH4/starchat-beta"
llm = HuggingFaceHub(
    repo_id=REPO_ID,
    model_kwargs={
        "max_new_tokens": 300,
        "do_sample": True,
        "temperature": 0.8,
        "top_p": 0.9,
    },
)

llm_chain = LLMChain(llm=llm, prompt=prompt, verbose=True)

stuff_chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_variable_name="text",
    document_prompt=document_prompt,
    verbose=True,
)


def process_document(doc: Document):
    # Index the abstract ("Summary") for retrieval; keep the full text in the metadata.
    metadata = doc.metadata
    metadata["Body"] = doc.page_content
    return Document(page_content=doc.metadata["Summary"], metadata=metadata)


def get_data(lookback_days: float, user_query: str):
    print("User query:", user_query)
    max_date = date.today()
    min_date = max_date - timedelta(days=lookback_days)
    query = f"cat:hep-th AND submittedDate:[{min_date.strftime('%Y%m%d')} TO {max_date.strftime('%Y%m%d')}]"
    loader = ArxivLoader(query=query, load_max_docs=LOAD_MAX_DOCS)
    docs = loader.load()
    docs = [process_document(doc) for doc in docs]
    db = Chroma.from_documents(docs, embeddings)
    retriever = db.as_retriever()
    relevant_docs = retriever.get_relevant_documents(user_query)
    print(relevant_docs[0].metadata)
    articles = ""
    for doc in relevant_docs:
        articles += f"**Title: {doc.metadata['Title']}**\n\nAbstract: {doc.metadata['Summary']}\n\n"
    output = stuff_chain({"input_documents": relevant_docs, "context": user_query})
    # starchat-beta keeps generating past its end-of-turn token, so truncate there.
    output_text = output["output_text"].split("<|end|>")[0]
    print("LLM output:", output_text)
    return f"# Your AI curated newsletter\n{output_text}\n\n\n\n## Used articles:\n\n{articles}"


# demo = gr.Interface(
#     fn=get_data,
#     inputs=[lookback_days, input_text],
#     outputs=gr.Markdown(),
#     title="Arxiv AI Curated Newsletter",
#     description="Describe your field of research in a few words to get a newsletter-style summary of today's Arxiv articles.",
# )

with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Arxiv AI Curated Newsletter

        Get a newsletter-style summary of today's Arxiv articles personalised to your field of research.
        """
    )
    lookback_days = gr.Number(2, label="Articles from this many days in the past will be searched through.", minimum=1, maximum=7)
    input_text = gr.Textbox(placeholder="Describe your field of research in a few words")
    gr.Examples(
        [["Supersymmetric Conformal Field Theory"], ["Black hole information paradox"]],
        input_text,
    )
    output = gr.Markdown()
    input_text.change(fn=get_data, inputs=[lookback_days, input_text], outputs=output)

demo.queue().launch()