ArxivNewsLetter / app.py
vincentmin's picture
Create app.py
bb7257e
raw
history blame
1.25 kB
from datetime import date, timedelta
from langchain.document_loaders import ArxivLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
def get_data(
    user_query: str, load_max_docs: int = 5, chunk_size: int = 1000
) -> str:
    """Return text from recent hep-th arXiv papers relevant to a query.

    Loads up to ``load_max_docs`` documents from the ``hep-th`` arXiv
    category whose submission date lies between two days ago and today
    (local date), splits them into chunks, indexes the chunks in a FAISS
    vector store built with default ``HuggingFaceEmbeddings`` and returns
    the chunks the retriever selects for ``user_query``.

    Args:
        user_query: Free-text query used to retrieve chunks from the index.
        load_max_docs: Maximum number of documents requested from arXiv.
        chunk_size: Chunk size handed to the recursive text splitter.

    Returns:
        The ``page_content`` of the retrieved chunks joined by blank
        lines, or an empty string when nothing was loaded or retrieved.
    """
    # Take one snapshot of "today" so both bounds come from the same day.
    today = date.today()
    # Format each bound exactly once: the values are plain strings after
    # this point, so they must not have .strftime() called on them again.
    min_date = (today - timedelta(days=2)).strftime('%Y%m%d')
    max_date = today.strftime('%Y%m%d')
    query = f"cat:hep-th AND submittedDate:[{min_date} TO {max_date}]"

    loader = ArxivLoader(query=query, load_max_docs=load_max_docs)
    documents = loader.load()
    if not documents:
        # No papers in the date window: nothing to index or search.
        return ""

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)
    texts = text_splitter.split_documents(documents)
    if not texts:
        # Documents without any text yield no chunks to embed.
        return ""

    embeddings = HuggingFaceEmbeddings()
    db = FAISS.from_documents(texts, embeddings)
    retriever = db.as_retriever()
    docs = retriever.get_relevant_documents(user_query)

    if docs:
        # Debug aid: show the metadata of the top-ranked chunk.
        print(docs[0].metadata)
    return "\n\n".join(d.page_content for d in docs)
# `gr` is used below but none of the file's visible imports binds it;
# import Gradio here so the script does not stop with a NameError.
import gradio as gr

# Text-in / text-out UI around get_data. Only the query is exposed as an
# input, so load_max_docs and chunk_size keep their default values.
demo = gr.Interface(
    fn=get_data,
    inputs="text",
    outputs="text",
    title="Document Filter",
    description="Enter a query to filter the list of documents.",
)
# Turn on request queuing, then start the app.
demo.queue().launch()