import gradio as gr
from datetime import datetime, timedelta
import arxiv
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.chains import LLMChain, StuffDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.schema import Document
# Maximum number of arXiv results to fetch per search
MAX_RESULTS = 100
# Timestamp format expected by the arXiv API's submittedDate filter
FORMAT = '%Y%m%d%H%M%S'
# Sentence embeddings used to index the article abstracts in Chroma
embeddings = HuggingFaceEmbeddings()
# How each retrieved article is rendered before being stuffed into the prompt
document_prompt = PromptTemplate(
    template="Title: {title}\nContent: {page_content}",
    input_variables=["page_content", "title"],
)
prompt = PromptTemplate(
    template="""Write an engaging newsletter on the most recent exciting developments in the following field: "{context}". Base the newsletter on the articles below. Extract the most exciting points and combine them into an exhilarating newsletter. Use emojis to catch attention and use the Markdown format.\n\n#ARTICLES\n"{text}"\n\nNEWSLETTER:\n# AI curated newsletter\n""",
    input_variables=["context", "text"],
)
REPO_ID = "HuggingFaceH4/starchat-beta"
llm = HuggingFaceHub(
    repo_id=REPO_ID,
    model_kwargs={
        "max_new_tokens": 1024,
        "do_sample": True,
        "temperature": 0.8,
        "top_p": 0.9,
    },
)
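# Note: langchain's HuggingFaceHub wrapper authenticates against the Hugging Face
# Inference API via the HUGGINGFACEHUB_API_TOKEN environment variable (or an
# explicit huggingfacehub_api_token argument), so one of the two must be set.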
llm_chain = LLMChain(llm=llm, prompt=prompt, verbose=True)
stuff_chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_variable_name="text",
    document_prompt=document_prompt,
    verbose=True,
)
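# StuffDocumentsChain renders each retrieved document with document_prompt and
# joins the results into the single {text} placeholder of `prompt` above, i.e.
# "Title: <article title>\nContent: <abstract>" once per article.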
def get_date_range(lookback_days: float):
    # Work entirely in UTC, since the arXiv API expects GMT timestamps.
    now_utc = datetime.utcnow()
    max_date = now_utc
    # arXiv announces new submissions at 18:00 UTC. Anchor the window at 18:00 UTC
    # two days ago; subtracting a timedelta (rather than decrementing .day) stays
    # valid on the first days of a month.
    today_1800_utc = datetime(now_utc.year, now_utc.month, now_utc.day, 18, 0, 0) - timedelta(days=2)
    min_date = today_1800_utc - timedelta(days=lookback_days)
    return min_date, max_date
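# Example: with lookback_days=2 and a current time of 2023-07-01 12:00 UTC,
# min_date is 2023-06-27 18:00 UTC and max_date is 2023-07-01 12:00 UTC, which
# get_documents() below serializes as 20230627180000 and 20230701120000.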
def get_documents(category: str, min_date: datetime, max_date: datetime):
    # We use the arxiv package instead of Langchain's ArxivLoader,
    # because the latter automatically loads PDFs, which results in poor performance.
    query = f"cat:{category} AND submittedDate:[{min_date.strftime(FORMAT)} TO {max_date.strftime(FORMAT)}]"
    search = arxiv.Search(
        query=query,
        max_results=MAX_RESULTS,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )
    # Wrap each abstract in a langchain Document, keeping the key metadata
    docs = [Document(
        page_content=doc.summary,
        metadata={
            "authors": ", ".join(map(str, doc.authors)),
            "categories": ", ".join(map(str, doc.categories)),
            "id": doc.get_short_id(),
            "title": doc.title,
        },
    ) for doc in search.results()]
    return docs
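# Example (hypothetical dates): get_documents("hep-th", min_date, max_date)
# issues the query
#   cat:hep-th AND submittedDate:[20230627180000 TO 20230701120000]
# and returns one Document per matching abstract, newest first.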
def get_data(category: str, lookback_days: float, user_query: str):
    print("User query:", user_query)
    min_date, max_date = get_date_range(lookback_days)
    docs = get_documents(category, min_date, max_date)
    if len(docs) == 0:
        return "Found no documents. Check that the category is correct, or increase the value of 'Articles from this many days in the past will be searched through.'."
    # Index the abstracts in an in-memory Chroma store and retrieve the ones
    # most similar to the user's description (the top 4 by default).
    db = Chroma.from_documents(docs, embeddings)
    retriever = db.as_retriever()
    relevant_docs = retriever.get_relevant_documents(user_query)
    articles = ""
    for doc in relevant_docs:
        articles += f"**Title: {doc.metadata['title']}**\n\nAuthors: {doc.metadata['authors']}\n\nAbstract: {doc.page_content}\n\nID: {doc.metadata['id']}\n\n"
    output = stuff_chain({"input_documents": relevant_docs, "context": user_query})
    # starchat-beta ends its turn with the special <|end|> token; discard
    # anything generated after it.
    output_text = output["output_text"].split("<|end|>")[0]
    print("LLM output:", output_text)
    return f"# Your AI curated newsletter\n{output_text}\n\n## This newsletter was AI generated by filtering {len(docs)} articles down to the following relevant articles:\n\n{articles}"
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Arxiv AI Curated Newsletter
        Get a newsletter-style summary of today's Arxiv articles, personalised to your field of research.
        """
    )
    with gr.Row():
        with gr.Column():
            with gr.Accordion("Parameters", open=False):
                lookback_days = gr.Number(2, label="Articles from this many days in the past will be searched through.", minimum=1, maximum=7)
                category = gr.Textbox(value="hep-th", label="Which category to search through. See https://arxiv.org/category_taxonomy for possible values.")
            with gr.Box():
                gr.Markdown("Describe your field of research in a few words or sentences.")
                input_text = gr.Textbox(placeholder="The relationship between Euclidean solutions to supergravity and black hole microstates.", container=False, show_label=False)
                gr.Examples(
                    [["Supersymmetric Conformal Field Theory"], ["Black hole information paradox"]],
                    input_text,
                )
                button = gr.Button(value="Submit")
        with gr.Column():
            with gr.Box():
                output = gr.Markdown("Press 'Submit' to see your results.")
    button.click(fn=get_data, inputs=[category, lookback_days, input_text], outputs=output)
demo.queue().launch()