from datetime import date, timedelta

import gradio as gr
from langchain.document_loaders import ArxivLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

# Date stamp layout used inside the arXiv ``submittedDate`` range filter.
# NOTE(review): the arXiv API manual describes a minute-resolution stamp
# (YYYYMMDDTTTT); the day-only form is kept as originally written -- confirm
# it returns the expected window.
_ARXIV_DATE_FORMAT = "%Y%m%d"

# Number of days before today included in the submission window.
_LOOKBACK_DAYS = 2

# Returned instead of raising when there is nothing to search or show.
_NO_RESULTS_MESSAGE = "No matching documents found."


def get_data(user_query: str, load_max_docs: int = 5, chunk_size: int = 1000) -> str:
    """Return text chunks from recent hep-th arXiv papers relevant to a query.

    Loads papers from the ``hep-th`` category submitted within the last
    ``_LOOKBACK_DAYS`` days, splits them into chunks, indexes the chunks in
    an in-memory FAISS store and returns the chunks the retriever considers
    relevant to ``user_query``.

    Args:
        user_query: Free-text query used to retrieve relevant chunks.
        load_max_docs: Maximum number of arXiv documents to load.
        chunk_size: Chunk size passed to the text splitter.

    Returns:
        The page contents of the retrieved chunks joined by blank lines, or
        a short message when no documents or chunks are available.
    """
    # Read the clock once so both bounds refer to the same day even when the
    # call happens around midnight.
    today = date.today()
    min_date = (today - timedelta(days=_LOOKBACK_DAYS)).strftime(_ARXIV_DATE_FORMAT)
    max_date = today.strftime(_ARXIV_DATE_FORMAT)

    # The bounds are already formatted strings; calling ``strftime`` on them
    # again (as the code previously did) raises AttributeError.
    query = f"cat:hep-th AND submittedDate:[{min_date} TO {max_date}]"

    loader = ArxivLoader(query=query, load_max_docs=load_max_docs)
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)
    texts = text_splitter.split_documents(documents)
    if not texts:
        # Nothing was submitted in the window (or nothing could be split);
        # skip building a vector index from an empty list.
        return _NO_RESULTS_MESSAGE

    embeddings = HuggingFaceEmbeddings()
    db = FAISS.from_documents(texts, embeddings)
    retriever = db.as_retriever()

    docs = retriever.get_relevant_documents(user_query)
    if not docs:
        return _NO_RESULTS_MESSAGE

    # Debug aid: show the metadata of the top-ranked chunk on stdout.
    print(docs[0].metadata)
    return "\n\n".join(d.page_content for d in docs)


demo = gr.Interface(
    fn=get_data,
    inputs="text",
    outputs="text",
    title="Document Filter",
    description="Enter a query to filter the list of documents.",
)

demo.queue().launch()