# Hugging Face Spaces status: Sleeping
import gradio as gr | |
from datetime import date, timedelta | |
from langchain.document_loaders import ArxivLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.vectorstores import Chroma | |
from langchain.embeddings import HuggingFaceEmbeddings | |
from langchain.llms import HuggingFaceHub | |
from langchain.chains import LLMChain, StuffDocumentsChain | |
from langchain.prompts import PromptTemplate | |
from langchain.schema import Document | |
# Upper bound on the number of arXiv results loaded per request
# (passed to ArxivLoader as load_max_docs).
LOAD_MAX_DOCS = 100

# Embedding model used to index the articles; constructed with the
# library's default settings.
embeddings = HuggingFaceEmbeddings()

# How a single article is rendered before being inserted into the
# newsletter prompt. "Title" is read from the document's metadata.
document_prompt = PromptTemplate(
    template="Title: {Title}\nContent: {page_content}",
    input_variables=["page_content", "Title"],
)

# Newsletter prompt. {context} receives the researcher's own description of
# their work; {text} receives the rendered articles (see
# document_variable_name below).
prompt = PromptTemplate(
    template="""Write a personalised newsletter for a researcher on the most recent exciting developments in his field. The researcher describes his work as follows:"{context}". Base the newsletter on the articles below. Extract the most exciting points and combine them into an exhilarating newsletter. Use Markdown format\n#ARTICLES\n\n"{text}"\n\nNEWSLETTER:\n# Your AI curated newsletter\n""",
    input_variables=["context", "text"],
)

# Hosted model that writes the newsletter.
# NOTE(review): presumably requires a Hugging Face Hub API token in the
# environment -- confirm against the deployment configuration.
REPO_ID = "HuggingFaceH4/starchat-beta"
llm = HuggingFaceHub(
    repo_id=REPO_ID,
    model_kwargs={
        "max_new_tokens": 300,
        "do_sample": True,
        "temperature": 0.8,
        "top_p": 0.9,
    },
)

# Chain that fills `prompt` and calls the model.
llm_chain = LLMChain(llm=llm, prompt=prompt, verbose=True)

# Chain that renders every input document with `document_prompt`, places the
# result in the prompt's {text} variable and runs `llm_chain`.
stuff_chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_variable_name="text",
    document_prompt=document_prompt,
    verbose=True,
)
def process_document(doc: Document) -> Document:
    """Return a new Document whose page content is the article summary.

    The value stored under the ``"Summary"`` metadata key becomes the page
    content of the returned document, and the original page content is kept
    under the ``"Body"`` metadata key.

    The metadata dict of *doc* is copied rather than modified, so the input
    document is left untouched and does not share state with the result.

    Raises:
        KeyError: if ``doc.metadata`` has no ``"Summary"`` entry.
    """
    # Build a fresh dict instead of writing "Body" into the caller's metadata.
    metadata = {**doc.metadata, "Body": doc.page_content}
    return Document(page_content=metadata["Summary"], metadata=metadata)
def get_data(lookback_days: float, user_query: str) -> str:
    """Build a Markdown newsletter from recent ``hep-th`` arXiv submissions.

    Loads up to ``LOAD_MAX_DOCS`` articles submitted between
    ``today - lookback_days`` and today, indexes them with ``embeddings``,
    retrieves the ones most relevant to *user_query*, and asks
    ``stuff_chain`` to write a newsletter from them.

    Args:
        lookback_days: How many days back from today the search window starts.
        user_query: The researcher's description of their field; used both
            for retrieval and as the ``context`` of the newsletter prompt.

    Returns:
        Markdown text: the newsletter followed by the titles and abstracts of
        the articles it was based on, or a short notice when no articles
        were available.
    """
    print("User query:", user_query)

    max_date = date.today()
    min_date = max_date - timedelta(days=lookback_days)
    start = min_date.strftime("%Y%m%d")
    end = max_date.strftime("%Y%m%d")
    # NOTE(review): the arXiv API manual documents submittedDate bounds with
    # a time component (YYYYMMDDTTTT) -- confirm date-only bounds select the
    # intended window.
    query = f"cat:hep-th AND submittedDate:[{start} TO {end}]"

    loader = ArxivLoader(query=query, load_max_docs=LOAD_MAX_DOCS)
    docs = [process_document(doc) for doc in loader.load()]
    if not docs:
        # Nothing to index; avoid handing an empty list to the vector store.
        return (
            "# Your AI curated newsletter\n"
            "No articles were found for the selected time window."
        )

    # NOTE(review): no collection name is given, so every request uses the
    # vector store's default collection -- verify that articles indexed by
    # earlier requests cannot leak into later ones.
    db = Chroma.from_documents(docs, embeddings)
    relevant_docs = db.as_retriever().get_relevant_documents(user_query)
    if not relevant_docs:
        return (
            "# Your AI curated newsletter\n"
            "No articles matching your description were found."
        )
    print(relevant_docs[0].metadata)

    articles = "".join(
        f"**Title: {doc.metadata['Title']}**\n\nAbstract: {doc.metadata['Summary']}\n\n"
        for doc in relevant_docs
    )

    output = stuff_chain({"input_documents": relevant_docs, "context": user_query})
    # Keep only the text before the first "<|end|>" marker.
    output_text = output["output_text"].split("<|end|>")[0]
    print("LLM output:", output_text)

    # Return the cleaned text; the raw output may still carry the end marker
    # and whatever the model produced after it.
    return f"# Your AI curated newsletter\n{output_text}\n\n\n\n## Used articles:\n\n{articles}"
# Web UI: a description field, a collapsible parameter section, a submit
# button, and a Markdown area that shows the generated newsletter.
with gr.Blocks() as demo:
    # Page header.
    gr.Markdown(
        "\n"
        "# Arxiv AI Curated Newsletter\n"
        "Get a newsletter-style summary of today's Arxiv articles personalised to your field of research.\n"
    )

    # Optional settings, collapsed by default.
    with gr.Accordion(label="Parameters", open=False):
        lookback_days = gr.Number(
            value=2,
            label="Articles from this many days in the past will be searched through.",
            minimum=1,
            maximum=7,
        )

    # Free-text description of the user's research field, with two
    # clickable example inputs.
    input_text = gr.Textbox(
        placeholder="Describe your field of research in a few words",
    )
    gr.Examples(
        examples=[
            ["Supersymmetric Conformal Field Theory"],
            ["Black hole information paradox"],
        ],
        inputs=input_text,
    )

    # The newsletter returned by get_data is rendered here.
    output = gr.Markdown()

    btn = gr.Button(value="Submit")
    btn.click(
        fn=get_data,
        inputs=[lookback_days, input_text],
        outputs=output,
    )

# Enable request queuing and start the app.
demo.queue().launch()