Commit d3e5a67 · vincentmin committed · 1 parent: 5927a64

Update app.py
app.py CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
 from datetime import datetime, timedelta
-
+import arxiv
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
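The only change in this hunk is the new `import arxiv` on line 3: the thin arXiv API client that the later hunks build on. For orientation, a minimal sketch of that package's query interface (the category and result limit below are illustrative, not values from this commit):

```python
import arxiv

# Search recent submissions in a category; query syntax follows the arXiv API.
search = arxiv.Search(
    query="cat:cs.LG",
    max_results=5,
    sort_by=arxiv.SortCriterion.SubmittedDate,
)
for result in search.results():
    # Metadata is available without downloading any PDFs.
    print(result.get_short_id(), result.title)
```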
@@ -41,29 +41,40 @@ stuff_chain = StuffDocumentsChain(
     verbose=True,
 )
 
-def process_document(doc: Document):
-    metadata = doc.metadata
-    metadata["Body"] = doc.page_content
-    return Document(page_content=doc.metadata["Summary"], metadata=metadata)
-
 def get_date_range(lookback_days: float):
     max_date = datetime.today()
     # Get the current date and time in UTC
-    now_utc = datetime.
-    # Create a new datetime object for today at 18:00 UTC
-    today_1800_utc = datetime
-    min_date =
-    # min_date = (max_date - timedelta(days=lookback_days))
+    now_utc = datetime.utcnow()
+    # Create a new datetime object for today at 18:00 UTC, which is the cutoff time for Arxiv submissions
+    today_1800_utc = datetime(now_utc.year, now_utc.month, now_utc.day - 1, 18, 0, 0)
+    min_date = today_1800_utc - timedelta(days=lookback_days)
     return min_date, max_date
 
+def get_documents(category: str, min_date: datetime, max_date: datetime):
+    # We use the arxiv package instead of Langchain's ArxivLoader,
+    # because the latter automatically loads pdfs which results in poor performance.
+    query = f"cat:{category} AND submittedDate:[{min_date.strftime(FORMAT)} TO {max_date.strftime(FORMAT)}]"
+    search = arxiv.Search(
+        query=query,
+        max_results=100,
+        sort_by=arxiv.SortCriterion.SubmittedDate
+    )
+    docs = [Document(
+        page_content=doc.summary,
+        metadata={
+            "authors": doc.authors,
+            "categories": doc.categories,
+            "id": doc.get_short_id(),
+            "title": doc.title,
+        }
+    ) for doc in search.results()]
+
 def get_data(category: str, lookback_days: float, user_query: str):
     print("User query:", user_query)
 
     min_date, max_date = get_date_range(lookback_days)
     print(min_date, max_date)
-
-    loader = ArxivLoader(query=query, load_max_docs=LOAD_MAX_DOCS)
-    docs = [process_document(doc) for doc in loader.load()]
+    docs = get_documents(category, min_date, max_date)
     if len(docs) == 0:
         return "Found no documents. Check if the category is correct or consider increasing the value for 'Articles from this many days in the past will be searched through.'."
     db = Chroma.from_documents(docs, embeddings)
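Two details of the new `get_date_range`/`get_documents` code deserve a closer look. As rendered, the hunk shows no `return docs` at the end of `get_documents`, so `get_data` would bind `None`; a return is clearly intended. Separately, `datetime(now_utc.year, now_utc.month, now_utc.day - 1, 18, 0, 0)` raises `ValueError` on the first day of any month, and the `FORMAT` constant passed to `strftime` is defined outside the hunks shown. Below is a sketch with those gaps filled, assuming `FORMAT = "%Y%m%d%H%M"` (the timestamp shape the arXiv API expects in `submittedDate` ranges) and flattening the metadata lists to strings, since Chroma accepts only scalar metadata values; none of these three adjustments are part of the commit itself.

```python
import arxiv
from datetime import datetime, timedelta
from langchain.schema import Document

FORMAT = "%Y%m%d%H%M"  # assumption: app.py defines this elsewhere

def get_date_range(lookback_days: float):
    max_date = datetime.today()
    now_utc = datetime.utcnow()
    # 18:00 UTC is the arXiv submission cutoff. Subtracting a timedelta
    # avoids the day-of-month underflow that `now_utc.day - 1` hits on the 1st.
    yesterday_1800_utc = datetime(
        now_utc.year, now_utc.month, now_utc.day, 18, 0, 0
    ) - timedelta(days=1)
    min_date = yesterday_1800_utc - timedelta(days=lookback_days)
    return min_date, max_date

def get_documents(category: str, min_date: datetime, max_date: datetime):
    # The arxiv package is used instead of Langchain's ArxivLoader, which
    # downloads full PDFs and is therefore much slower.
    query = (
        f"cat:{category} AND submittedDate:"
        f"[{min_date.strftime(FORMAT)} TO {max_date.strftime(FORMAT)}]"
    )
    search = arxiv.Search(
        query=query,
        max_results=100,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )
    docs = [
        Document(
            page_content=result.summary,
            metadata={
                # Chroma metadata must be scalar, so lists are joined here.
                "authors": ", ".join(str(a) for a in result.authors),
                "categories": ", ".join(result.categories),
                "id": result.get_short_id(),
                "title": result.title,
            },
        )
        for result in search.results()
    ]
    return docs  # the hunk above does not show this return, but get_data needs it
```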
@@ -72,11 +83,11 @@ def get_data(category: str, lookback_days: float, user_query: str):
     print(relevant_docs[0].metadata)
     articles = ""
     for doc in relevant_docs:
-        articles += f"**Title: {doc.metadata['
+        articles += f"**Title: {doc.metadata['title']}**\n\nAuthors: {doc.metadata['authors']}\n\nAbstract: {doc.page_content}\n\nID: {doc.id}\n\n"
     output = stuff_chain({"input_documents": relevant_docs, "context": user_query})
     output_text = output["output_text"].split("<|end|>")[0]
     print("LLM output:", output_text)
-    return f"# Your AI curated newsletter\n{output_text}\n\n
+    return f"# Your AI curated newsletter\n{output_text}\n\n## This newsletter was AI generated by filtering {len(docs)} articles down to the following relevant articles:\n\n{articles}"
 
 with gr.Blocks() as demo:
     gr.Markdown(
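One inconsistency in the new formatting loop: `get_documents` stores the arXiv identifier under `metadata["id"]`, yet the f-string reads `doc.id`, which is not an attribute of Langchain's `Document` in this era of the library and would raise `AttributeError`. A sketch of the loop with consistent metadata access (a hypothetical fix, not what the commit ships):

```python
articles = ""
for doc in relevant_docs:
    articles += (
        f"**Title: {doc.metadata['title']}**\n\n"
        f"Authors: {doc.metadata['authors']}\n\n"
        f"Abstract: {doc.page_content}\n\n"
        f"ID: {doc.metadata['id']}\n\n"  # doc.id is not defined on Document
    )
```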
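Finally, the step that produces `relevant_docs` sits between the second and third hunks and is not part of this diff. A plausible reconstruction under the assumption of a plain similarity search (the value of `k` and the method choice are guesses, not taken from app.py):

```python
# Embed the article summaries and retrieve those closest to the user's query.
db = Chroma.from_documents(docs, embeddings)
relevant_docs = db.similarity_search(user_query, k=4)  # k is an assumption

# The selected abstracts are then stuffed into a single LLM prompt.
output = stuff_chain({"input_documents": relevant_docs, "context": user_query})
```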