Spaces:

vincentmin
/

ArxivNewsLetter

Sleeping

File size: 5,256 Bytes

8c26ae0
5927a64
d3e5a67
ee043ca
bb7257e
a326270
 
 
820771b
bb7257e
ee21478
7f82885
5927a64
42c6b22
ee21478
 
a326270
a5664be
 
a326270
 
6e36ec1
a326270
 
 
 
 
 
acc1121
a326270
 
 
 
 
 
 
 
 
 
 
 
 
 
5927a64
 
 
d3e5a67
 
74c23c5
d3e5a67
5927a64
 
d3e5a67
 
 
 
 
 
7f82885
d3e5a67
 
 
 
 
076a043
 
d3e5a67
 
 
 
b95ae99
d3e5a67
6ab47bd
0268ea7
5927a64
 
d3e5a67
4760881
6ab47bd
3baeead
bb7257e
3baeead
a326270
3baeead
d32e198
a326270
45e331f
0268ea7
d3e5a67
bb7257e
d3db9a3
 
 
 
 
 
 
 
 
6b1fe0f
d4f0e90
6b1fe0f
99471f1
1d3a54f
6ab47bd
1d3a54f
e210228
74c23c5
1f1a29f
e210228
74c23c5
e210228
 
1d3a54f
1f1a29f
93f881f
6b1fe0f
1f1a29f
 
6b1fe0f
1f1a29f
d3db9a3

import gradio as gr
from datetime import datetime, timedelta
import arxiv
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.chains import LLMChain, StuffDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.schema import Document


# Upper bound on the number of entries requested from arXiv per search
# (passed as ``max_results`` to ``arxiv.Search`` in ``get_documents``).
MAX_RESULTS = 100
# ``strftime`` pattern used to render the ``submittedDate`` bounds of the
# arXiv query: year, month, day, hour, minute, second with no separators.
FORMAT = '%Y%m%d%H%M%S'

# Embedding model used to index article abstracts in the Chroma vector store;
# constructed with the library's default settings.
embeddings = HuggingFaceEmbeddings()    

# How a single retrieved article is rendered before being placed in the
# newsletter prompt: its title followed by its abstract (the page content).
document_prompt = PromptTemplate(
    template="Title: {title}\nContent: {page_content}",
    input_variables=["page_content", "title"],
)

# Instruction prompt for the newsletter. ``{context}`` receives the user's
# description of their research field and ``{text}`` receives the rendered
# articles. The template ends with the newsletter heading so the model
# continues writing from there.
prompt = PromptTemplate(
    template=(
        'Write an engaging newsletter on the most recent exciting developments '
        'in the following field:"{context}". Base the newsletter on the articles '
        'below. Extract the most exciting points and combine them into an '
        'exhilarating newsletter. Use emojis to catch attention and use the '
        'Markdown format.\n\n#ARTICLES\n"{text}"\n\nNEWSLETTER:\n'
        '# AI curated newsletter\n'
    ),
    input_variables=["context", "text"],
)

# Hugging Face Hub repository id of the model used to write the newsletter.
REPO_ID = "HuggingFaceH4/starchat-beta"
# Hosted LLM client. ``model_kwargs`` are the generation settings forwarded
# with each request.
# NOTE(review): presumably requires a Hugging Face Hub API token in the
# environment — confirm against the deployment configuration.
llm = HuggingFaceHub(
    repo_id=REPO_ID,
    model_kwargs={
        "max_new_tokens": 1024,
        "do_sample": True,
        "temperature": 0.8,
        "top_p": 0.9
    }
)

# Chain that fills ``prompt`` and sends it to ``llm``.
llm_chain = LLMChain(llm=llm, prompt=prompt, verbose=True)
# Chain that renders each input document with ``document_prompt`` and passes
# the combined result to ``llm_chain`` as the prompt's "text" variable.
stuff_chain = StuffDocumentsChain(
  llm_chain=llm_chain,
  document_variable_name="text",
  document_prompt=document_prompt,
  verbose=True,
)

def get_date_range(lookback_days: float, now: "datetime | None" = None):
    """Return the ``(min_date, max_date)`` submission window for the arXiv search.

    The window ends at the current UTC time. It starts ``lookback_days`` days
    before 18:00 UTC two days ago (18:00 UTC being the cutoff time for Arxiv
    submissions).

    Args:
        lookback_days: How many days (fractions allowed) to extend the window
            back from the reference cutoff.
        now: Reference "current time" as a naive UTC ``datetime``. Defaults to
            ``datetime.utcnow()``; supplying it makes the result deterministic.

    Returns:
        A ``(min_date, max_date)`` tuple of datetimes, both expressed in UTC.
    """
    if now is None:
        now = datetime.utcnow()

    # Today at 18:00 UTC, the cutoff time for Arxiv submissions.
    cutoff_today = now.replace(hour=18, minute=0, second=0, microsecond=0)
    # Step back two days with timedelta arithmetic. Subtracting from the
    # day-of-month field instead would produce day 0 or -1 (a ValueError) on
    # the first two days of every month.
    reference_cutoff = cutoff_today - timedelta(days=2)

    min_date = reference_cutoff - timedelta(days=lookback_days)
    # Both bounds derive from the same UTC clock reading so the window is not
    # skewed by the server's local timezone.
    max_date = now
    return min_date, max_date

def get_documents(category: str, min_date: datetime, max_date: datetime):
    """Fetch arXiv entries of ``category`` submitted between the two dates.

    Args:
        category: arXiv category identifier used in the ``cat:`` filter.
        min_date: Lower bound of the ``submittedDate`` range.
        max_date: Upper bound of the ``submittedDate`` range.

    Returns:
        A list of ``Document`` objects, one per search result. The abstract is
        the page content; authors, categories, short id and title are stored
        as metadata.
    """
    # The arxiv package is queried directly rather than through Langchain's
    # ArxivLoader: the loader automatically loads pdfs, which performs poorly.
    date_window = f"[{min_date.strftime(FORMAT)} TO {max_date.strftime(FORMAT)}]"
    search = arxiv.Search(
        query=f"cat:{category} AND submittedDate:{date_window}",
        max_results=MAX_RESULTS,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )

    documents = []
    for entry in search.results():
        metadata = {
            "authors": ", ".join(str(author) for author in entry.authors),
            "categories": ", ".join(str(cat) for cat in entry.categories),
            "id": entry.get_short_id(),
            "title": entry.title,
        }
        documents.append(Document(page_content=entry.summary, metadata=metadata))
    return documents

def _format_articles(docs) -> str:
    """Render documents as Markdown entries (title, authors, abstract, ID)."""
    return "".join(
        f"**Title: {doc.metadata['title']}**\n\n"
        f"Authors: {doc.metadata['authors']}\n\n"
        f"Abstract: {doc.page_content}\n\n"
        f"ID: {doc.metadata['id']}\n\n"
        for doc in docs
    )


def get_data(category: str, lookback_days: float, user_query: str):
    """Build the Markdown newsletter shown in the UI.

    Fetches recent articles for ``category``, indexes them in a Chroma vector
    store, retrieves those most relevant to ``user_query``, and asks the LLM
    chain to write a newsletter from them.

    Args:
        category: arXiv category to search through.
        lookback_days: How many days back the search window extends.
        user_query: The user's description of their field of research. It is
            used both for retrieval and as the prompt's "context" value.

    Returns:
        A Markdown string with the generated newsletter followed by the list
        of articles it was based on, or an explanatory message when the search
        returned no documents.
    """
    print("User query:", user_query)

    min_date, max_date = get_date_range(lookback_days)
    docs = get_documents(category, min_date, max_date)
    if not docs:
        return "Found no documents. Check if the category is correct or consider increasing the value for 'Articles from this many days in the past will be searched through.'."

    db = Chroma.from_documents(docs, embeddings)
    try:
        relevant_docs = db.as_retriever().get_relevant_documents(user_query)
    finally:
        # Drop the collection once retrieval is done so documents indexed for
        # this request cannot show up in a later one.
        # NOTE(review): whether the default in-memory collection is shared
        # between calls depends on the installed chromadb version — confirm.
        db.delete_collection()

    articles = _format_articles(relevant_docs)
    output = stuff_chain({"input_documents": relevant_docs, "context": user_query})
    # Keep only the text before the first "<|end|>" marker in the model output.
    output_text = output["output_text"].split("<|end|>")[0]
    print("LLM output:", output_text)
    return f"# Your AI curated newsletter\n{output_text}\n\n## This newsletter was AI generated by filtering {len(docs)} articles down to the following relevant articles:\n\n{articles}"

# Gradio UI: the left column collects the search parameters and the research
# description, the right column shows the generated newsletter.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Arxiv AI Curated Newsletter

        
        Get a newsletter-style summary of today's Arxiv articles personalised to your field of research.
        """
    )
    with gr.Row():
        with gr.Column():
            # Advanced setting, collapsed by default.
            with gr.Accordion("Parameters", open=False):
                lookback_days = gr.Number(2, label="Articles from this many days in the past will be searched through.", minimum=1, maximum=7)

            category = gr.Textbox(value="hep-th", label="Which category to search through. See https://arxiv.org/category_taxonomy for possible values.")

            with gr.Box():
                gr.Markdown("Describe your field of research in a few words or sentences.")
                input_text = gr.Textbox(placeholder="The relationship between Euclidean solutions to supergravity and black hole microstates.", container=False, show_label=False)
                # Clickable example queries that fill the text box.
                gr.Examples(
                    [["Supersymmetric Conformal Field Theory"], ["Black hole information paradox"]],
                    input_text,
                )

            button = gr.Button(value="Submit")

        with gr.Column():
            with gr.Box():
                output = gr.Markdown("Press 'submit' to see your results.")

    # The input order must match the parameter order of ``get_data``.
    button.click(fn=get_data, inputs=[category, lookback_days, input_text], outputs=output)

# Launch after the Blocks context has closed, and only when this file is run
# as a script, so importing the module does not start a server.
if __name__ == "__main__":
    demo.queue().launch()