"""Gradio app that turns recent arXiv abstracts into an LLM-written newsletter.

Flow for one request (see ``get_data``):

1. Compute a submission-date window (``get_date_range``).
2. Fetch abstracts for one arXiv category in that window (``get_documents``).
3. Embed them into a throw-away Chroma collection and retrieve the ones most
   similar to the user's research description.
4. Feed the retrieved abstracts to a hosted LLM via a "stuff" chain and
   return the generated newsletter as Markdown.
"""
import uuid
from datetime import datetime, timedelta, timezone
from typing import List, Tuple

import arxiv
import gradio as gr
from langchain.chains import LLMChain, StuffDocumentsChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.vectorstores import Chroma

# Upper bound on the number of abstracts fetched from arXiv per request.
MAX_RESULTS = 100
# Timestamp layout used inside the arXiv `submittedDate:[... TO ...]` filter.
FORMAT = '%Y%m%d%H%M%S'

# The date window is anchored at 18:00 UTC (the arXiv submission cutoff),
# this many days before today.
# NOTE(review): the two-day offset is kept from the original code; its
# rationale is not documented in this file -- confirm.
CUTOFF_HOUR_UTC = 18
CUTOFF_OFFSET_DAYS = 2

embeddings = HuggingFaceEmbeddings()

# How each retrieved article is rendered before being stuffed into `prompt`.
document_prompt = PromptTemplate(
    template="Title: {title}\nContent: {page_content}",
    input_variables=["page_content", "title"],
)

# `context` receives the user's field description, `text` the rendered articles.
prompt = PromptTemplate(
    template="""Write an engaging newsletter on the most recent exciting developments in the following field:"{context}". Base the newsletter on the articles below. Extract the most exciting points and combine them into an excillerating newsletter. Use emojis to catch attention and use the Markdown format.\n\n#ARTICLES\n"{text}"\n\nNEWSLETTER:\n# AI curated newsletter\n""",
    input_variables=["context", "text"],
)

REPO_ID = "HuggingFaceH4/starchat-beta"
llm = HuggingFaceHub(
    repo_id=REPO_ID,
    model_kwargs={
        "max_new_tokens": 1024,
        "do_sample": True,
        "temperature": 0.8,
        "top_p": 0.9,
    },
)

llm_chain = LLMChain(llm=llm, prompt=prompt, verbose=True)
stuff_chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_variable_name="text",
    document_prompt=document_prompt,
    verbose=True,
)


def get_date_range(lookback_days: float) -> Tuple[datetime, datetime]:
    """Return the ``(min_date, max_date)`` submission window, in UTC.

    ``max_date`` is the current UTC time. ``min_date`` is 18:00 UTC
    ``CUTOFF_OFFSET_DAYS`` days ago, moved back by a further
    ``lookback_days`` days.

    Args:
        lookback_days: Number of days (may be fractional) to extend the
            window into the past beyond the anchor.

    Returns:
        A pair of naive ``datetime`` objects expressed in UTC.
    """
    # One clock reading for both bounds, so they share the same time zone.
    # tzinfo is stripped so callers keep receiving naive datetimes.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)

    # Use timedelta arithmetic rather than `day - 2` in the constructor:
    # the latter raises ValueError on the 1st and 2nd of each month.
    cutoff_today = now_utc.replace(
        hour=CUTOFF_HOUR_UTC, minute=0, second=0, microsecond=0
    )
    anchor = cutoff_today - timedelta(days=CUTOFF_OFFSET_DAYS)

    min_date = anchor - timedelta(days=lookback_days)
    return min_date, now_utc


def get_documents(category: str, min_date: datetime, max_date: datetime) -> List[Document]:
    """Fetch abstracts of one arXiv category submitted within a date window.

    Args:
        category: arXiv category identifier, e.g. ``"hep-th"``.
        min_date: Lower bound of the submission window.
        max_date: Upper bound of the submission window.

    Returns:
        Up to ``MAX_RESULTS`` documents whose ``page_content`` is the abstract
        and whose metadata holds ``authors``, ``categories``, ``id`` and
        ``title``. Empty if the search yields nothing.
    """
    # We use the arxiv package instead of Langchain's ArxivLoader,
    # because the latter automatically loads pdfs which results in poor performance.
    window = f"[{min_date.strftime(FORMAT)} TO {max_date.strftime(FORMAT)}]"
    # The category comes from a free-text box; drop stray surrounding spaces.
    query = f"cat:{category.strip()} AND submittedDate:{window}"
    search = arxiv.Search(
        query=query,
        max_results=MAX_RESULTS,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )
    return [
        Document(
            page_content=paper.summary,
            metadata={
                "authors": ", ".join(map(str, paper.authors)),
                "categories": ", ".join(map(str, paper.categories)),
                "id": paper.get_short_id(),
                "title": paper.title,
            },
        )
        for paper in search.results()
    ]


def get_data(category: str, lookback_days: float, user_query: str) -> str:
    """Build the Markdown newsletter for one UI request.

    Args:
        category: arXiv category to search.
        lookback_days: How far back the search window extends
            (see ``get_date_range``).
        user_query: Free-text description of the user's research field; used
            both for similarity retrieval and as the prompt's ``context``.

    Returns:
        Markdown containing the generated newsletter followed by the list of
        articles it was based on, or a plain hint message when the arXiv
        search returned no documents.
    """
    print("User query:", user_query)
    min_date, max_date = get_date_range(lookback_days)
    docs = get_documents(category, min_date, max_date)
    if not docs:
        return "Found no documents. Check if the category is correct or consider increasing the value for 'Articles from this many days in the past will be searched through.'."

    # Index into a collection unique to this request and drop it afterwards,
    # so that articles indexed by earlier requests cannot show up in this
    # request's retrieval and the store does not grow without bound.
    db = Chroma.from_documents(
        docs,
        embeddings,
        collection_name=f"arxiv-{uuid.uuid4().hex}",
    )
    try:
        relevant_docs = db.as_retriever().get_relevant_documents(user_query)
    finally:
        db.delete_collection()

    articles = "".join(
        f"**Title: {doc.metadata['title']}**\n\nAuthors: {doc.metadata['authors']}\n\nAbstract: {doc.page_content}\n\nID: {doc.metadata['id']}\n\n"
        for doc in relevant_docs
    )

    output = stuff_chain({"input_documents": relevant_docs, "context": user_query})
    # Keep only the text before the first "<|end|>" marker in the completion.
    output_text = output["output_text"].split("<|end|>")[0]
    print("LLM output:", output_text)
    return f"# Your AI curated newsletter\n{output_text}\n\n## This newsletter was AI generated by filtering {len(docs)} articles down to the following relevant articles:\n\n{articles}"


# NOTE(review): the nesting of the layout blocks below was reconstructed from
# a whitespace-collapsed source; confirm it matches the intended layout.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Arxiv AI Curated Newsletter

        Get a newsletter-style summary of today's Arxiv articles personalised to your field of research.
        """
    )

    with gr.Row():
        # Left column: search parameters, research description and submit.
        with gr.Column():
            with gr.Accordion("Parameters", open=False):
                lookback_days = gr.Number(
                    2,
                    label="Articles from this many days in the past will be searched through.",
                    minimum=1,
                    maximum=7,
                )
                category = gr.Textbox(
                    value="hep-th",
                    label="Which category to search through. See https://arxiv.org/category_taxonomy for possible values.",
                )

            with gr.Box():
                gr.Markdown("Describe your field of research in a few words or sentences.")
                input_text = gr.Textbox(
                    placeholder="The relationship between Euclidean solutions to supergravity and black hole microstates.",
                    container=False,
                    show_label=False,
                )
                gr.Examples(
                    [["Supersymmetric Conformal Field Theory"], ["Black hole information paradox"]],
                    input_text,
                )

            button = gr.Button(value="Submit")

        # Right column: rendered newsletter.
        with gr.Column():
            with gr.Box():
                output = gr.Markdown("Press 'submit' to see your results.")

    button.click(
        fn=get_data,
        inputs=[category, lookback_days, input_text],
        outputs=output,
    )

demo.queue().launch()