vincentmin committed
Commit d3e5a67 · 1 Parent(s): 5927a64

Update app.py

Files changed (1)
  1. app.py +28 -16
app.py CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
 from datetime import datetime, timedelta
-from langchain.document_loaders import ArxivLoader
+import arxiv
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
@@ -41,29 +41,41 @@ stuff_chain = StuffDocumentsChain(
     verbose=True,
 )
 
-def process_document(doc: Document):
-    metadata = doc.metadata
-    metadata["Body"] = doc.page_content
-    return Document(page_content=doc.metadata["Summary"], metadata=metadata)
-
 def get_date_range(lookback_days: float):
     max_date = datetime.today()
     # Get the current date and time in UTC
-    now_utc = datetime.datetime.utcnow()
-    # Create a new datetime object for today at 18:00 UTC
-    today_1800_utc = datetime.datetime(now_utc.year, now_utc.month, now_utc.day, 18, 0, 0)
-    min_date = (today_1800_utc - timedelta(days=lookback_days+1))
-    # min_date = (max_date - timedelta(days=lookback_days))
+    now_utc = datetime.utcnow()
+    # Create a datetime for yesterday at 18:00 UTC, which is the cutoff time for arXiv submissions
+    yesterday_1800_utc = datetime(now_utc.year, now_utc.month, now_utc.day, 18, 0, 0) - timedelta(days=1)
+    min_date = yesterday_1800_utc - timedelta(days=lookback_days)
     return min_date, max_date
 
+def get_documents(category: str, min_date: datetime, max_date: datetime):
+    # We use the arxiv package instead of LangChain's ArxivLoader,
+    # because the latter automatically loads PDFs, which results in poor performance.
+    query = f"cat:{category} AND submittedDate:[{min_date.strftime(FORMAT)} TO {max_date.strftime(FORMAT)}]"
+    search = arxiv.Search(
+        query=query,
+        max_results=100,
+        sort_by=arxiv.SortCriterion.SubmittedDate
+    )
+    docs = [Document(
+        page_content=doc.summary,
+        metadata={
+            "authors": doc.authors,
+            "categories": doc.categories,
+            "id": doc.get_short_id(),
+            "title": doc.title,
+        }
+    ) for doc in search.results()]
+    return docs
+
 def get_data(category: str, lookback_days: float, user_query: str):
     print("User query:", user_query)
 
     min_date, max_date = get_date_range(lookback_days)
     print(min_date, max_date)
-    query = f"cat:{category} AND submittedDate:[{min_date.strftime(FORMAT)} TO {max_date.strftime(FORMAT)}]"
-    loader = ArxivLoader(query=query, load_max_docs=LOAD_MAX_DOCS)
-    docs = [process_document(doc) for doc in loader.load()]
+    docs = get_documents(category, min_date, max_date)
     if len(docs) == 0:
         return "Found no documents. Check if the category is correct or consider increasing the value for 'Articles from this many days in the past will be searched through.'."
     db = Chroma.from_documents(docs, embeddings)
@@ -72,11 +84,11 @@ def get_data(category: str, lookback_days: float, user_query: str):
     print(relevant_docs[0].metadata)
     articles = ""
     for doc in relevant_docs:
-        articles += f"**Title: {doc.metadata['Title']}**\n\nAuthors: {doc.metadata['Authors']}\n\nAbstract: {doc.metadata['Summary']}\n\n"
+        articles += f"**Title: {doc.metadata['title']}**\n\nAuthors: {doc.metadata['authors']}\n\nAbstract: {doc.page_content}\n\nID: {doc.metadata['id']}\n\n"
     output = stuff_chain({"input_documents": relevant_docs, "context": user_query})
     output_text = output["output_text"].split("<|end|>")[0]
     print("LLM output:", output_text)
-    return f"# Your AI curated newsletter\n{output_text}\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n## This newsletter was AI generated by filtering {len(docs)} articles down to the following relevant articles:\n\n{articles}"
+    return f"# Your AI curated newsletter\n{output_text}\n\n## This newsletter was AI generated by filtering {len(docs)} articles down to the following relevant articles:\n\n{articles}"
 
 with gr.Blocks() as demo:
     gr.Markdown(
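
For readers unfamiliar with the arXiv API, here is a minimal standalone sketch of the retrieval path this commit switches to. The category `cs.LG`, the 2-day lookback, and the `FORMAT` value are illustrative assumptions; the real app defines `FORMAT` outside the hunks shown above.

```python
import arxiv
from datetime import datetime, timedelta

# Assumption: arXiv's submittedDate filter expects GMT timestamps shaped like YYYYMMDDHHMM.
FORMAT = "%Y%m%d%H%M"

now_utc = datetime.utcnow()
# Anchor the window to yesterday's 18:00 UTC announcement cutoff, as get_date_range does.
cutoff = datetime(now_utc.year, now_utc.month, now_utc.day, 18, 0, 0) - timedelta(days=1)
min_date = cutoff - timedelta(days=2)

query = f"cat:cs.LG AND submittedDate:[{min_date.strftime(FORMAT)} TO {now_utc.strftime(FORMAT)}]"
search = arxiv.Search(query=query, max_results=5, sort_by=arxiv.SortCriterion.SubmittedDate)
for result in search.results():
    # Only metadata and the abstract come back; no PDF is downloaded,
    # which is the performance win over ArxivLoader.
    print(result.get_short_id(), result.title)
```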
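And a hedged sketch of the downstream filtering step that `get_data` performs on those results, under the same assumptions. The query string and `k=4` are illustrative, and the real app additionally runs `relevant_docs` through `stuff_chain` to draft the newsletter text.

```python
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Wrap each abstract as a LangChain Document, mirroring get_documents.
# Keeping metadata values as plain strings sidesteps Chroma's metadata type checks.
docs = [
    Document(
        page_content=result.summary,
        metadata={"id": result.get_short_id(), "title": result.title},
    )
    for result in search.results()
]

embeddings = HuggingFaceEmbeddings()  # assumption: the app configures `embeddings` elsewhere
db = Chroma.from_documents(docs, embeddings)
relevant_docs = db.similarity_search("reinforcement learning from human feedback", k=4)
for doc in relevant_docs:
    print(doc.metadata["title"])
```

Embedding only abstracts keeps the vector store small and each request fast, which is consistent with the commit's stated reason for dropping ArxivLoader.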