vincentmin committed
Commit d3e5a67 · 1 Parent(s): 5927a64

Update app.py

Files changed (1)
  1. app.py +28 -16
app.py CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
 from datetime import datetime, timedelta
-from langchain.document_loaders import ArxivLoader
+import arxiv
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
@@ -41,29 +41,41 @@ stuff_chain = StuffDocumentsChain(
     verbose=True,
 )
 
-def process_document(doc: Document):
-    metadata = doc.metadata
-    metadata["Body"] = doc.page_content
-    return Document(page_content=doc.metadata["Summary"], metadata=metadata)
-
 def get_date_range(lookback_days: float):
     max_date = datetime.today()
     # Get the current date and time in UTC
-    now_utc = datetime.datetime.utcnow()
-    # Create a new datetime object for today at 18:00 UTC
-    today_1800_utc = datetime.datetime(now_utc.year, now_utc.month, now_utc.day, 18, 0, 0)
-    min_date = (today_1800_utc - timedelta(days=lookback_days+1))
-    # min_date = (max_date - timedelta(days=lookback_days))
+    now_utc = datetime.utcnow()
+    # Create a datetime for yesterday at 18:00 UTC, which is the cutoff time for arXiv submissions
+    yesterday_1800_utc = datetime(now_utc.year, now_utc.month, now_utc.day, 18, 0, 0) - timedelta(days=1)
+    min_date = yesterday_1800_utc - timedelta(days=lookback_days)
     return min_date, max_date
 
+def get_documents(category: str, min_date: datetime, max_date: datetime):
+    # We use the arxiv package instead of LangChain's ArxivLoader,
+    # because the latter automatically loads PDFs, which results in poor performance.
+    query = f"cat:{category} AND submittedDate:[{min_date.strftime(FORMAT)} TO {max_date.strftime(FORMAT)}]"
+    search = arxiv.Search(
+        query=query,
+        max_results=100,
+        sort_by=arxiv.SortCriterion.SubmittedDate
+    )
+    docs = [Document(
+        page_content=doc.summary,
+        metadata={
+            "authors": doc.authors,
+            "categories": doc.categories,
+            "id": doc.get_short_id(),
+            "title": doc.title,
+        }
+    ) for doc in search.results()]
+    return docs
+
 def get_data(category: str, lookback_days: float, user_query: str):
     print("User query:", user_query)
 
     min_date, max_date = get_date_range(lookback_days)
     print(min_date, max_date)
-    query = f"cat:{category} AND submittedDate:[{min_date.strftime(FORMAT)} TO {max_date.strftime(FORMAT)}]"
-    loader = ArxivLoader(query=query, load_max_docs=LOAD_MAX_DOCS)
-    docs = [process_document(doc) for doc in loader.load()]
+    docs = get_documents(category, min_date, max_date)
     if len(docs) == 0:
         return "Found no documents. Check if the category is correct or consider increasing the value for 'Articles from this many days in the past will be searched through.'."
     db = Chroma.from_documents(docs, embeddings)
@@ -72,11 +84,11 @@ def get_data(category: str, lookback_days: float, user_query: str):
     print(relevant_docs[0].metadata)
     articles = ""
     for doc in relevant_docs:
-        articles += f"**Title: {doc.metadata['Title']}**\n\nAuthors: {doc.metadata['Authors']}\n\nAbstract: {doc.metadata['Summary']}\n\n"
+        articles += f"**Title: {doc.metadata['title']}**\n\nAuthors: {doc.metadata['authors']}\n\nAbstract: {doc.page_content}\n\nID: {doc.metadata['id']}\n\n"
     output = stuff_chain({"input_documents": relevant_docs, "context": user_query})
     output_text = output["output_text"].split("<|end|>")[0]
     print("LLM output:", output_text)
-    return f"# Your AI curated newsletter\n{output_text}\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n## This newsletter was AI generated by filtering {len(docs)} articles down to the following relevant articles:\n\n{articles}"
+    return f"# Your AI curated newsletter\n{output_text}\n\n## This newsletter was AI generated by filtering {len(docs)} articles down to the following relevant articles:\n\n{articles}"
 
 with gr.Blocks() as demo:
     gr.Markdown(
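
For readers unfamiliar with the arXiv API, here is a minimal standalone sketch of the retrieval path this commit switches to. The category `cs.LG`, the 2-day lookback, and the `FORMAT` value are illustrative assumptions; the real app defines `FORMAT` outside the hunks shown above.

```python
import arxiv
from datetime import datetime, timedelta

# Assumption: arXiv's submittedDate filter expects GMT timestamps shaped like YYYYMMDDHHMM.
FORMAT = "%Y%m%d%H%M"

now_utc = datetime.utcnow()
# Anchor the window to yesterday's 18:00 UTC announcement cutoff, as get_date_range does.
cutoff = datetime(now_utc.year, now_utc.month, now_utc.day, 18, 0, 0) - timedelta(days=1)
min_date = cutoff - timedelta(days=2)

query = f"cat:cs.LG AND submittedDate:[{min_date.strftime(FORMAT)} TO {now_utc.strftime(FORMAT)}]"
search = arxiv.Search(query=query, max_results=5, sort_by=arxiv.SortCriterion.SubmittedDate)
for result in search.results():
    # Only metadata and the abstract come back; no PDF is downloaded,
    # which is the performance win over ArxivLoader.
    print(result.get_short_id(), result.title)
```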
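And a hedged sketch of the downstream filtering step that `get_data` performs on those results, under the same assumptions. The query string and `k=4` are illustrative, and the real app additionally runs `relevant_docs` through `stuff_chain` to draft the newsletter text.

```python
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Wrap each abstract as a LangChain Document, mirroring get_documents.
# Keeping metadata values as plain strings sidesteps Chroma's metadata type checks.
docs = [
    Document(
        page_content=result.summary,
        metadata={"id": result.get_short_id(), "title": result.title},
    )
    for result in search.results()
]

embeddings = HuggingFaceEmbeddings()  # assumption: the app configures `embeddings` elsewhere
db = Chroma.from_documents(docs, embeddings)
relevant_docs = db.similarity_search("reinforcement learning from human feedback", k=4)
for doc in relevant_docs:
    print(doc.metadata["title"])
```

Embedding only abstracts keeps the vector store small and each request fast, which is consistent with the commit's stated reason for dropping ArxivLoader.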