File size: 3,986 Bytes
8c26ae0
bb7257e
 
 
ee043ca
bb7257e
a326270
 
 
 
820771b
bb7257e
ee21478
a326270
 
 
 
ee21478
 
a326270
 
 
 
 
d3db9a3
a326270
 
 
 
 
 
 
45e331f
a326270
 
 
 
 
 
 
 
 
 
 
 
 
 
3baeead
 
 
 
 
d3db9a3
0268ea7
d3db9a3
 
 
 
3baeead
 
 
bb7257e
3baeead
 
a326270
3baeead
a326270
 
45e331f
0268ea7
 
bb7257e
d3db9a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import gradio as gr
from datetime import date, timedelta
from langchain.document_loaders import ArxivLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
# from langchain.llms import FakeListLLM
from langchain.chains import LLMChain, StuffDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.schema import Document


# Maximum number of Arxiv documents to fetch per search query.
LOAD_MAX_DOCS = 100
# CHUNK_SIZE = 1000
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE)

# Default HuggingFace sentence-embedding model; used to index article
# abstracts in the Chroma vector store built in get_data().
embeddings = HuggingFaceEmbeddings()    

# Renders each retrieved article before it is stuffed into the main prompt;
# {Title} comes from the Arxiv metadata, {page_content} is the abstract.
document_prompt = PromptTemplate(
  template="Title: {Title}\nContent: {page_content}",
  input_variables=["page_content", "Title"],
)
# Main newsletter prompt: {context} is the user's free-text field description,
# {text} is the concatenation of the formatted articles above.
# Fix: "excillerating" was misspelled; corrected to "exhilarating".
prompt = PromptTemplate(
  template="""Write a personalised newsletter for a researcher on the most recent exciting developments in his field. The researcher describes his work as follows:"{context}". Base the newsletter on the articles below. Extract the most exciting points and combine them into an exhilarating newsletter. Use Markdown format\n#ARTICLES\n\n"{text}"\n\nNEWSLETTER:\n# Your AI curated newsletter\n""",
  input_variables=["context", "text"])

# llm = FakeListLLM(responses=list(map(str, range(100))))
REPO_ID = "HuggingFaceH4/starchat-beta"
# Remote inference via the HuggingFace Hub (requires HUGGINGFACEHUB_API_TOKEN
# in the environment — TODO confirm deployment sets it).
llm = HuggingFaceHub(
    repo_id=REPO_ID,
    model_kwargs={
        "max_new_tokens": 300,
        "do_sample": True,
        "temperature": 0.8,
        "top_p": 0.9
    }
)

# "Stuff" strategy: every retrieved document is formatted with
# document_prompt and concatenated into the single {text} slot of `prompt`.
llm_chain = LLMChain(llm=llm, prompt=prompt, verbose=True)
stuff_chain = StuffDocumentsChain(
  llm_chain=llm_chain,
  document_variable_name="text",
  document_prompt=document_prompt,
  verbose=True,
)

def process_document(doc: Document):
    """Re-key an Arxiv document so retrieval runs over its abstract.

    The full article text is stashed under the ``Body`` metadata key
    (the input document's metadata dict is updated in place), and a new
    Document is returned whose page content is the ``Summary`` metadata
    field — i.e. embeddings are computed over abstracts, not full texts.
    """
    doc.metadata["Body"] = doc.page_content
    return Document(page_content=doc.metadata["Summary"], metadata=doc.metadata)

def get_data(lookback_days: float, user_query: str):
    """Build an AI-curated Markdown newsletter from recent Arxiv articles.

    Args:
        lookback_days: How many days back to search for hep-th submissions
            (previously ignored — the range was hard-coded to 3 days).
        user_query: Free-text description of the user's research field;
            used both for retrieval and as the {context} prompt variable.

    Returns:
        A Markdown string: the LLM-generated newsletter followed by the
        list of articles that were fed to it.
    """
    print("User query:", user_query)
    max_date = date.today()
    # Keep both endpoints as `date` objects and format them only inside the
    # query string. (The old code stored min_date as a string and then
    # called .strftime() on it again, which raised AttributeError.)
    min_date = max_date - timedelta(days=lookback_days)
    query = (
        f"cat:hep-th AND submittedDate:"
        f"[{min_date.strftime('%Y%m%d')} TO {max_date.strftime('%Y%m%d')}]"
    )
    loader = ArxivLoader(query=query, load_max_docs=LOAD_MAX_DOCS)
    docs = [process_document(doc) for doc in loader.load()]
    db = Chroma.from_documents(docs, embeddings)
    retriever = db.as_retriever()
    relevant_docs = retriever.get_relevant_documents(user_query)
    # Guard: an empty result set would otherwise crash on relevant_docs[0].
    if not relevant_docs:
        return "# Your AI curated newsletter\n\nNo matching articles were found."
    print(relevant_docs[0].metadata)
    articles = "".join(
        f"**Title: {doc.metadata['Title']}**\n\nAbstract: {doc.metadata['Summary']}\n\n"
        for doc in relevant_docs
    )
    output = stuff_chain({"input_documents": relevant_docs, "context": user_query})
    # starchat terminates its answer with an <|end|> token; keep only the
    # text before it. (The old code computed this but returned the raw,
    # untrimmed output instead.)
    output_text = output["output_text"].split("<|end|>")[0]
    print("LLM output:", output_text)
    return (
        f"# Your AI curated newsletter\n{output_text}"
        f"\n\n\n\n## Used articles:\n\n{articles}"
    )

# demo = gr.Interface(
#     fn=get_data,
#     inputs=[lookback_days, input_text]
#     outputs=gr.Markdown(),
#     title="Arxiv AI Curated Newsletter",
#     description="Describe your field of research in a few words to get a newsletter-style summary of today's Arxiv articles.",
# )

# Gradio UI: a number input for the lookback window, a textbox for the
# user's field description, and a Markdown pane for the generated newsletter.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Arxiv AI Curated Newsletter

        
        Get a newsletter-style summary of today's Arxiv articles personalised to your field of research.
        """
    )
    lookback_days = gr.Number(2, label="Articles from this many days in the past will be searched through.", minimum=1, maximum=7)
    input_text = gr.Textbox(placeholder="Describe your field of research in a few words")
    gr.Examples(
        [["Supersymmetric Conformal Field Theory"], ["Black hole information paradox"]],
        input_text,
    )
    output = gr.Markdown()

    # Regenerate the newsletter whenever the query text changes.
    input_text.change(fn=get_data, inputs=[lookback_days,input_text], outputs=output)

    # Queueing serialises requests so slow LLM calls don't time out.
    demo.queue().launch()