# ArxivNewsLetter / app.py
# Page residue from the Hugging Face file viewer: user vincentmin,
# commit a326270 ("Update app.py"), file size 2.77 kB.
import gradio as gr
from datetime import date, timedelta
from langchain.document_loaders import ArxivLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
# from langchain.llms import FakeListLLM
from langchain.chains import LLMChain, StuffDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.schema import Document
# Upper bound on the number of documents requested from the arXiv loader.
LOAD_MAX_DOCS = 100
# Submission-date window for the search: from two days ago up to today,
# each formatted as YYYYMMDD.
# NOTE(review): these are evaluated once, when the module is imported, so a
# long-running process keeps querying the window computed at start-up.
# NOTE(review): the arXiv API manual describes submittedDate bounds with a
# time component (YYYYMMDDTTTT) -- confirm that date-only bounds are accepted.
min_date = (date.today() - timedelta(days=2)).strftime('%Y%m%d')
max_date = date.today().strftime('%Y%m%d')
# Restrict the search to the hep-th category inside the date window.
query = f"cat:hep-th AND submittedDate:[{min_date} TO {max_date}]"
# Module-level loader; get_data() calls loader.load() on every request.
loader = ArxivLoader(query=query, load_max_docs=LOAD_MAX_DOCS)
# Text splitting is currently disabled (kept for reference).
# CHUNK_SIZE = 1000
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE)
# Embedding model (library defaults) used when indexing documents in get_data().
embeddings = HuggingFaceEmbeddings()
# Template used to render each retrieved document before it is placed into
# the newsletter prompt: the article title followed by the document content.
document_prompt = PromptTemplate(
    input_variables=["page_content", "Title"],
    template="Title: {Title}\nContent: {page_content}",
)

# Newsletter prompt. ``context`` receives the researcher's description of
# their work and ``text`` receives the rendered articles.
prompt = PromptTemplate(
    input_variables=["context", "text"],
    template="""Write a personalised newsletter for a researcher. The researcher describes his work as follows:"{context}". Base the newsletter on the following articles:\n\n"{text}"\n\nNEWSLETTER:""",
)
# llm = FakeListLLM(responses=list(map(str, range(100))))
# Hugging Face Hub repository of the model that writes the newsletter.
REPO_ID = "HuggingFaceH4/starchat-beta"

# Generation settings forwarded to the model: sampled decoding, capped at
# 1024 newly generated tokens.
_generation_kwargs = {
    "max_new_tokens": 1024,
    "do_sample": True,
    "temperature": 0.8,
    "top_p": 0.9,
}

llm = HuggingFaceHub(model_kwargs=_generation_kwargs, repo_id=REPO_ID)
# Chain that fills the newsletter prompt and sends it to the LLM.
llm_chain = LLMChain(prompt=prompt, llm=llm, verbose=True)

# Chain that renders every input document with ``document_prompt`` and
# passes the result to ``llm_chain`` as the ``text`` prompt variable.
stuff_chain = StuffDocumentsChain(
    document_prompt=document_prompt,
    document_variable_name="text",
    llm_chain=llm_chain,
    verbose=True,
)
def process_document(doc: Document):
    """Return a new document whose content is the article summary.

    The input's ``page_content`` is preserved under the ``"Body"`` metadata
    key, and the returned document's ``page_content`` is taken from the
    ``"Summary"`` metadata entry.

    Args:
        doc: Document whose metadata contains a ``"Summary"`` entry.

    Returns:
        A new ``Document`` with the summary as content and the input's
        metadata extended with ``"Body"``.

    Raises:
        KeyError: If ``doc.metadata`` has no ``"Summary"`` key.
    """
    # Build a fresh dict rather than writing into doc.metadata, so the input
    # document is left untouched and does not share state with the result.
    metadata = {**doc.metadata, "Body": doc.page_content}
    return Document(page_content=metadata["Summary"], metadata=metadata)
def get_data(user_query: str):
    """Generate a newsletter from the loaded arXiv articles.

    Loads documents through the module-level ``loader``, converts each one
    with ``process_document``, indexes them with ``Chroma.from_documents``,
    retrieves the documents relevant to ``user_query`` and runs
    ``stuff_chain`` on them.

    Args:
        user_query: Text used both as the retrieval query and as the
            ``context`` variable of the newsletter prompt.

    Returns:
        A Markdown string: the generated newsletter followed by the title
        and abstract of every article that was passed to the chain, or a
        short notice when there are no articles to work with.
    """
    docs = [process_document(doc) for doc in loader.load()]
    if not docs:
        # Avoid handing an empty document list to the vector store.
        return "No articles were found for the current search window."

    db = Chroma.from_documents(docs, embeddings)
    relevant_docs = db.as_retriever().get_relevant_documents(user_query)
    if not relevant_docs:
        # Indexing relevant_docs[0] below would raise IndexError otherwise.
        return "No relevant articles were found for this query."

    # Debug aid: show the metadata of the top-ranked article in the logs.
    print(relevant_docs[0].metadata)

    articles = "".join(
        f"**Title: {doc.metadata['Title']}**\n\nAbstract: {doc.metadata['Summary']}\n\n"
        for doc in relevant_docs
    )
    output = stuff_chain({"input_documents": relevant_docs, "context": user_query})
    # Pull the text out first: reusing double quotes inside a double-quoted
    # f-string is a syntax error before Python 3.12.
    newsletter = output["output_text"]
    # List the formatted articles, not the raw chain output dict.
    return f"{newsletter}\n\n\n\nUsed articles:\n\n{articles}"
# Gradio UI: one text input whose value is passed to get_data, with the
# returned string rendered as Markdown.
demo = gr.Interface(
    title="Document Filter",
    description="Enter a query to filter the list of documents.",
    fn=get_data,
    inputs="text",
    outputs=gr.Markdown(),
)

# Enable request queuing, then start the app.
demo.queue().launch()