# Spaces: Sleeping  (stray page-header text; not part of the program)
from datetime import date, timedelta

import gradio as gr
from langchain.document_loaders import ArxivLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
def get_data(user_query: str, load_max_docs: int = 5, chunk_size: int = 1000) -> str:
    """Return passages from recent hep-th arXiv papers relevant to *user_query*.

    Loads arXiv documents in category ``hep-th`` whose submission date falls
    in the last two days, splits them into chunks, indexes the chunks in an
    in-memory FAISS store, and returns the retriever's matches for
    ``user_query`` joined by blank lines.

    Args:
        user_query: Free-text query used to rank the indexed chunks.
        load_max_docs: Maximum number of arXiv documents to load.
        chunk_size: Chunk size handed to the text splitter.

    Returns:
        The page content of the retrieved chunks separated by blank lines,
        or a short notice when no document or chunk is available.
    """
    # Read the clock once so both bounds are derived from the same day.
    today = date.today()
    min_date = (today - timedelta(days=2)).strftime("%Y%m%d")
    max_date = today.strftime("%Y%m%d")
    # The bounds are already formatted strings; the original called
    # .strftime() on them a second time, which raises AttributeError.
    # NOTE(review): the arXiv API manual documents minute-resolution bounds
    # (YYYYMMDDHHMM) for submittedDate -- confirm day-only bounds are accepted.
    query = f"cat:hep-th AND submittedDate:[{min_date} TO {max_date}]"

    documents = ArxivLoader(query=query, load_max_docs=load_max_docs).load()
    if not documents:
        # Nothing was submitted in the window (or nothing could be loaded);
        # there is nothing to index.
        return "No matching documents found."

    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)
    texts = splitter.split_documents(documents)
    if not texts:
        # Building a vector index from zero chunks is expected to fail.
        return "No matching documents found."

    db = FAISS.from_documents(texts, HuggingFaceEmbeddings())
    docs = db.as_retriever().get_relevant_documents(user_query)
    if not docs:
        return "No matching documents found."

    # Debug aid kept from the original: show where the best match came from.
    print(docs[0].metadata)
    return "\n\n".join(d.page_content for d in docs)
# Gradio UI: a single text input whose value is passed to get_data as
# `user_query`; the returned string is shown in a text output.
# Requires `import gradio as gr`.
demo = gr.Interface(
    fn=get_data,
    inputs="text",
    outputs="text",
    title="Document Filter",
    description="Enter a query to filter the list of documents.",
)
# Enable request queuing, then start the app.
demo.queue().launch()