Switching to Gradio for better memory handling.

Streamlit doesn't handle heavy traffic as well, so we're keeping Gradio's queue for now. The original Streamlit version remains available at https://huggingface.co/spaces/kiyer/pathfinder_v3
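As a note on the queue: Gradio 4.x enables request queuing by default, and it can be tuned before launch. A minimal sketch of what that tuning could look like — not part of this commit; the concurrency numbers are illustrative assumptions, and create_interface() is the builder defined in app_gradio.py below:

    # hypothetical queue tuning for the Space; values are assumptions,
    # not taken from this commit
    demo = create_interface()
    demo.queue(default_concurrency_limit=4,  # concurrent workers per event
               max_size=20)                  # cap on requests waiting in the queue
    demo.launch()

Bounding the queue this way trades some waiting time for a predictable memory ceiling, which is the failure mode the commit message describes under heavy traffic.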
- README.md +3 -3
- app_gradio.py +550 -0
- prompts.py +63 -0
README.md
CHANGED
@@ -3,9 +3,9 @@ title: Pathfinder
 emoji: 🔎
 colorFrom: yellow
 colorTo: blue
-sdk: streamlit
-sdk_version:
-app_file:
+sdk: gradio
+sdk_version: 4.40.0
+app_file: app_gradio.py
 pinned: true
 license: mit
 ---
app_gradio.py
ADDED
@@ -0,0 +1,550 @@
import gradio as gr
import numpy as np
from abc import ABC, abstractmethod
from typing import List, Dict, Any, Tuple
from collections import defaultdict
import pandas as pd
from datetime import datetime, date
from datasets import load_dataset, load_from_disk
from collections import Counter

import yaml, json, requests, sys, os, time
import urllib.parse
import concurrent.futures

from langchain import hub
from langchain_openai import ChatOpenAI as openai_llm
from langchain_openai import OpenAIEmbeddings
from langchain_core.runnables import RunnableConfig, RunnablePassthrough, RunnableParallel
from langchain_core.prompts import PromptTemplate
from langchain_community.callbacks import StreamlitCallbackHandler
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import TextLoader
from langchain.agents import create_react_agent, Tool, AgentExecutor
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain.callbacks import FileCallbackHandler
from langchain.callbacks.manager import CallbackManager
from langchain.schema import Document

import instructor
from pydantic import BaseModel, Field
from typing import List, Literal

from nltk.corpus import stopwords
import nltk
from openai import OpenAI
# import anthropic
import cohere
import faiss
import matplotlib.pyplot as plt
import spacy
from string import punctuation
import pytextrank
from prompts import *

openai_key = os.environ['openai_key']
cohere_key = os.environ['cohere_key']

def load_nlp():
    # spaCy pipeline with TextRank for keyword extraction
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("textrank")
    try:
        stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
        stopwords.words('english')
    return nlp

gen_llm = openai_llm(temperature=0, model_name='gpt-4o-mini', openai_api_key=openai_key)
consensus_client = instructor.patch(OpenAI(api_key=openai_key))
embed_client = OpenAI(api_key=openai_key)
embed_model = "text-embedding-3-small"
embeddings = OpenAIEmbeddings(model=embed_model, api_key=openai_key)
nlp = load_nlp()


def get_keywords(text, nlp=nlp):
    # keep proper nouns, adjectives, and nouns that are not stopwords or punctuation
    result = []
    pos_tag = ['PROPN', 'ADJ', 'NOUN']
    doc = nlp(text.lower())
    for token in doc:
        if token.text in nlp.Defaults.stop_words or token.text in punctuation:
            continue
        if token.pos_ in pos_tag:
            result.append(token.text)
    return result

def load_arxiv_corpus():
    arxiv_corpus = load_from_disk('data/')
    arxiv_corpus.load_faiss_index('embed', 'data/astrophindex.faiss')
    print('loading arxiv corpus from disk')
    return arxiv_corpus

class RetrievalSystem():

    def __init__(self):

        self.dataset = arxiv_corpus
        self.client = OpenAI(api_key=openai_key)
        self.embed_model = "text-embedding-3-small"
        self.generation_client = openai_llm(temperature=0, model_name='gpt-4o-mini', openai_api_key=openai_key)
        self.hyde_client = openai_llm(temperature=0.5, model_name='gpt-4o-mini', openai_api_key=openai_key)
        self.cohere_client = cohere.Client(cohere_key)

    def make_embedding(self, text):
        str_embed = self.client.embeddings.create(input=[text], model=self.embed_model).data[0].embedding
        return str_embed

    def embed_batch(self, texts: List[str]) -> List[np.ndarray]:
        embeddings = self.client.embeddings.create(input=texts, model=self.embed_model).data
        return [np.array(embedding.embedding, dtype=np.float32) for embedding in embeddings]

    def get_query_embedding(self, query):
        return self.make_embedding(query)

    def calc_faiss(self, query_embedding, top_k=100):
        # xq = query_embedding.reshape(-1,1).T.astype('float32')
        # D, I = self.index.search(xq, top_k)
        # return I[0], D[0]
        tmp = self.dataset.search('embed', query_embedding, k=top_k)
        return [tmp.indices, tmp.scores, self.dataset[tmp.indices]]

    def rank_and_filter(self, query, query_embedding, top_k=10, top_k_internal=1000, return_scores=False):

        self.weight_keywords = 'Keywords' in self.toggles
        self.weight_date = 'Time' in self.toggles
        self.weight_citation = 'Citations' in self.toggles

        topk_indices, similarities, small_corpus = self.calc_faiss(np.array(query_embedding), top_k=top_k_internal)
        similarities = 1 / similarities  # converting from a distance (less is better) to a similarity (more is better)

        if self.weight_keywords:
            query_kws = get_keywords(query)
            input_kws = self.query_input_keywords
            query_kws = query_kws + input_kws
            self.query_kws = query_kws
            sub_kws = [small_corpus['keywords'][i] for i in range(top_k_internal)]
            kw_weight = np.zeros((len(topk_indices),)) + 0.1

            for k in query_kws:
                for i in range(len(topk_indices)):
                    for j in range(len(sub_kws[i])):
                        if k.lower() in sub_kws[i][j].lower():
                            kw_weight[i] = kw_weight[i] + 0.1
                            # print(i, k, sub_kws[i][j])

            # kw_weight = kw_weight**0.36 / np.amax(kw_weight**0.36)
            kw_weight = kw_weight / np.amax(kw_weight)
        else:
            kw_weight = np.ones((len(topk_indices),))

        if self.weight_date:
            sub_dates = [small_corpus['date'][i] for i in range(top_k_internal)]
            date = datetime.now().date()
            date_diff = np.array([((date - i).days / 365.) for i in sub_dates])
            # age_weight = (1 + np.exp(date_diff/2.1))**(-1) + 0.5
            age_weight = (1 + np.exp(date_diff / 0.7))**(-1)
            age_weight = age_weight / np.amax(age_weight)
        else:
            age_weight = np.ones((len(topk_indices),))

        if self.weight_citation:
            # st.write('weighting by citations')
            sub_cites = np.array([small_corpus['cites'][i] for i in range(top_k_internal)])
            temp = sub_cites.copy()
            temp[sub_cites > 300] = 300.
            cite_weight = (1 + np.exp((300 - temp) / 42.0))**(-1.)
            cite_weight = cite_weight / np.amax(cite_weight)
        else:
            cite_weight = np.ones((len(topk_indices),))

        similarities = similarities * (kw_weight) * (age_weight) * (cite_weight)

        filtered_results = [[topk_indices[i], similarities[i]] for i in range(len(similarities))]
        top_results = sorted(filtered_results, key=lambda x: x[1], reverse=True)[:top_k]

        top_scores = [doc[1] for doc in top_results]
        top_indices = [doc[0] for doc in top_results]
        small_df = self.dataset[top_indices]

        if return_scores:
            return {doc[0]: doc[1] for doc in top_results}, small_df

        # Only keep the document IDs
        top_results = [doc[0] for doc in top_results]
        return top_results, small_df

    def generate_doc(self, query: str):
        prompt = """You are an expert astronomer. Given a scientific query, generate the abstract of an expert-level research paper
        that answers the question. Stick to a maximum length of {} tokens and return just the text of the abstract and conclusion.
        Do not include labels for any section. Use research-specific jargon.""".format(self.max_doclen)

        messages = [("system", prompt,), ("human", query),]
        return self.hyde_client.invoke(messages).content

    def generate_docs(self, query: str):
        docs = []
        for i in range(self.generate_n):
            docs.append(self.generate_doc(query))
        return docs

    def embed_docs(self, docs: List[str]):
        return self.embed_batch(docs)

    def retrieve(self, query, top_k, return_scores=False,
                 embed_query=True, max_doclen=250,
                 generate_n=1, temperature=0.5,
                 rerank_top_k=250):

        if max_doclen * generate_n > 8191:
            raise ValueError("Too many tokens. Please reduce max_doclen or generate_n.")

        query_embedding = self.get_query_embedding(query)

        if self.hyde:
            self.max_doclen = max_doclen
            self.generate_n = generate_n
            self.hyde_client.temperature = temperature
            self.embed_query = embed_query
            docs = self.generate_docs(query)
            # st.expander('Abstract generated with hyde', expanded=False).write(docs)
            doc_embeddings = self.embed_docs(docs)
            if self.embed_query:
                query_emb = self.embed_docs([query])[0]
                doc_embeddings.append(query_emb)
            query_embedding = np.mean(np.array(doc_embeddings), axis=0)

        if self.rerank:
            top_results, small_df = self.rank_and_filter(query,
                                                         query_embedding,
                                                         rerank_top_k,
                                                         return_scores=False)
            # try:
            docs_for_rerank = [small_df['abstract'][i] for i in range(rerank_top_k)]
            if len(docs_for_rerank) == 0:
                return []
            reranked_results = self.cohere_client.rerank(
                query=query,
                documents=docs_for_rerank,
                model='rerank-english-v3.0',
                top_n=top_k
            )
            final_results = []
            for result in reranked_results.results:
                doc_id = top_results[result.index]
                doc_text = docs_for_rerank[result.index]
                score = float(result.relevance_score)
                final_results.append([doc_id, "", score])
            final_indices = [doc[0] for doc in final_results]
            if return_scores:
                return {result[0]: result[2] for result in final_results}, self.dataset[final_indices]
            return [doc[0] for doc in final_results], self.dataset[final_indices]
            # except:
            #     print('heavy load, please wait 10s and try again.')
        else:
            top_results, small_df = self.rank_and_filter(query,
                                                         query_embedding,
                                                         top_k,
                                                         return_scores=return_scores)

        return top_results, small_df

    def return_formatted_df(self, top_results, small_df):

        df = pd.DataFrame(small_df)
        df = df.drop(columns=['umap_x', 'umap_y', 'cite_bibcodes', 'ref_bibcodes'])
        links = ['[' + i + '](https://ui.adsabs.harvard.edu/abs/' + i + '/abstract)' for i in small_df['bibcode']]

        # st.write(top_results[0:10])
        scores = [top_results[i] for i in top_results]
        indices = [i for i in top_results]
        df.insert(1, 'ADS Link', links, True)
        df.insert(2, 'Relevance', scores, True)
        df.insert(3, 'indices', indices, True)
        df = df[['ADS Link', 'Relevance', 'date', 'cites', 'title', 'authors', 'abstract', 'keywords', 'ads_id', 'indices', 'embed']]
        df.index += 1
        return df

arxiv_corpus = load_arxiv_corpus()
ec = RetrievalSystem()
print('loaded retrieval system')

def Library(papers_df):
    op_docs = ''
    for i in range(len(papers_df)):
        op_docs = op_docs + 'Paper %.0f:' % (i + 1) + papers_df['title'][i + 1] + '\n' + papers_df['abstract'][i + 1] + '\n\n'

    return op_docs

def run_rag_qa(query, papers_df, question_type):

    loaders = []

    documents = []

    for i, row in papers_df.iterrows():
        # papers_df.index starts at 1 (see return_formatted_df), so i is already the paper number
        content = f"Paper {i}: {row['title']}\n{row['abstract']}\n\n"
        metadata = {"source": row['ads_id']}
        doc = Document(page_content=content, metadata=metadata)
        documents.append(doc)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=50, add_start_index=True)
    splits = text_splitter.split_documents(documents)
    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings, collection_name='retdoc4')
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

    if question_type == 'Bibliometric':
        template = bibliometric_prompt
    elif question_type == 'Single-paper':
        template = single_paper_prompt
    elif question_type == 'Broad but nuanced':
        template = deep_knowledge_prompt
    else:
        template = regular_prompt
    prompt = PromptTemplate.from_template(template)

    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    rag_chain_from_docs = (
        RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
        | prompt
        | gen_llm
        | StrOutputParser()
    )

    rag_chain_with_source = RunnableParallel(
        {"context": retriever, "question": RunnablePassthrough()}
    ).assign(answer=rag_chain_from_docs)
    rag_answer = rag_chain_with_source.invoke(query)
    vectorstore.delete_collection()

    # except:
    #     st.subheader('heavy load! please wait 10 seconds and try again.')

    return rag_answer

def guess_question_type(query: str):

    gen_client = openai_llm(temperature=0, model_name='gpt-4o-mini', openai_api_key=openai_key)
    messages = [("system", question_categorization_prompt,), ("human", query),]
    return gen_client.invoke(messages).content

class OverallConsensusEvaluation(BaseModel):
    rewritten_statement: str = Field(
        ...,
        description="The query rewritten as a statement if it was initially a question"
    )
    consensus: Literal[
        "Strong Agreement Between Abstracts and Query",
        "Moderate Agreement Between Abstracts and Query",
        "Weak Agreement Between Abstracts and Query",
        "No Clear Agreement/Disagreement Between Abstracts and Query",
        "Weak Disagreement Between Abstracts and Query",
        "Moderate Disagreement Between Abstracts and Query",
        "Strong Disagreement Between Abstracts and Query"
    ] = Field(
        ...,
        description="The overall level of consensus between the rewritten statement and the abstracts"
    )
    explanation: str = Field(
        ...,
        description="A detailed explanation of the consensus evaluation (maximum six sentences)"
    )
    relevance_score: float = Field(
        ...,
        description="A score from 0 to 1 indicating how relevant the abstracts are to the query overall",
        ge=0,
        le=1
    )

def evaluate_overall_consensus(query: str, abstracts: List[str]) -> OverallConsensusEvaluation:
    prompt = f"""
    Query: {query}
    You will be provided with {len(abstracts)} scientific abstracts. Your task is to do the following:
    1. If the provided query is a question, rewrite it as a statement. This statement does not have to be true. Output this as 'Rewritten Statement:'.
    2. Evaluate the overall consensus between the rewritten statement and the abstracts using one of the following levels:
    - Strong Agreement Between Abstracts and Query
    - Moderate Agreement Between Abstracts and Query
    - Weak Agreement Between Abstracts and Query
    - No Clear Agreement/Disagreement Between Abstracts and Query
    - Weak Disagreement Between Abstracts and Query
    - Moderate Disagreement Between Abstracts and Query
    - Strong Disagreement Between Abstracts and Query
    Output this as 'Consensus:'
    3. Provide a detailed explanation of your consensus evaluation in maximum six sentences. Output this as 'Explanation:'
    4. Assign a relevance score as a float between 0 to 1, where:
    - 1.0: Perfect match in content and quality
    - 0.8-0.9: Excellent, with minor differences
    - 0.6-0.7: Good, captures main points but misses some details
    - 0.4-0.5: Fair, partially relevant but significant gaps
    - 0.2-0.3: Poor, major inaccuracies or omissions
    - 0.0-0.1: Completely irrelevant or incorrect
    Output this as 'Relevance Score:'
    Here are the abstracts:
    {' '.join([f"Abstract {i+1}: {abstract}" for i, abstract in enumerate(abstracts)])}
    Provide your evaluation in the structured format described above.
    """

    response = consensus_client.chat.completions.create(
        model="gpt-4o-mini",  # used to be "gpt-4"
        response_model=OverallConsensusEvaluation,
        messages=[
            {"role": "system", "content": """You are an assistant with expertise in astrophysics for question-answering tasks.
            Evaluate the overall consensus of the retrieved scientific abstracts in relation to a given query.
            If you don't know the answer, just say that you don't know.
            Use six sentences maximum and keep the answer concise."""},
            {"role": "user", "content": prompt}
        ],
        temperature=0
    )

    return response

def calc_outlier_flag(papers_df, top_k, cutoff_adjust=0.1):
    # flag retrieved papers that sit far from the retrieval centroid in embedding space
    cut_dist = np.load('pfdr_arxiv_cutoff_distances.npy') - cutoff_adjust
    pts = np.array(papers_df['embed'].tolist())
    centroid = np.mean(pts, 0)
    dists = np.sqrt(np.sum((pts - centroid)**2, 1))
    outlier_flag = (dists > cut_dist[top_k - 1])

    return outlier_flag

def make_embedding_plot(papers_df, top_k, consensus_answer, arxiv_corpus=arxiv_corpus):

    plt_indices = np.array(papers_df['indices'].tolist())

    xax = np.array(arxiv_corpus['umap_x'])
    yax = np.array(arxiv_corpus['umap_y'])

    outlier_flag = calc_outlier_flag(papers_df, top_k, cutoff_adjust=0.25)
    alphas = np.ones((len(plt_indices),)) * 0.9
    alphas[outlier_flag] = 0.5

    fig = plt.figure(figsize=(9 * 1.8, 12 * 1.8))
    plt.scatter(xax, yax, s=1, alpha=0.01, c='k')

    clkws = np.load('kw_tags.npz')
    all_x, all_y, all_topics, repeat_flag = clkws['all_x'], clkws['all_y'], clkws['all_topics'], clkws['repeat_flag']
    for i in range(len(all_topics)):
        if repeat_flag[i] == False:
            plt.text(all_x[i], all_y[i], all_topics[i], fontsize=9, ha="center", va="center",
                     bbox=dict(facecolor='white', edgecolor='black', boxstyle='round,pad=0.3', alpha=0.81))
    plt.scatter(xax[plt_indices], yax[plt_indices], s=300 * alphas**2, alpha=alphas, c='w', zorder=1000)
    plt.scatter(xax[plt_indices], yax[plt_indices], s=100 * alphas**2, alpha=alphas, c='dodgerblue', zorder=1001)
    # plt.scatter(xax[plt_indices][outlier_flag], yax[plt_indices][outlier_flag], s=100, alpha=1., c='firebrick')
    plt.axis([0, 20, -4.2, 18])
    plt.axis('off')
    return fig

def run_pathfinder(query, top_k, extra_keywords, toggles, prompt_type, rag_type, ec=ec, progress=gr.Progress()):

    yield None, None, None, None, None

    search_text_list = ['rooting around in the paper pile...', 'looking for clarity...', 'scanning the event horizon...', 'peering into the abyss...', 'potatoes power this ongoing search...']
    gen_text_list = ['making the LLM talk to the papers...', 'invoking arcane rituals...', 'gone to library, please wait...', 'is there really an answer to this...']

    input_keywords = [kw.strip() for kw in extra_keywords.split(',')] if extra_keywords else []
    query_keywords = get_keywords(query)
    ec.query_input_keywords = input_keywords + query_keywords
    ec.toggles = toggles
    if rag_type == "Semantic Search":
        ec.hyde = False
        ec.rerank = False
    elif rag_type == "Semantic + HyDE":
        ec.hyde = True
        ec.rerank = False
    elif rag_type == "Semantic + HyDE + CoHERE":
        ec.hyde = True
        ec.rerank = True

    progress(0.2, desc=search_text_list[np.random.choice(len(search_text_list))])
    rs, small_df = ec.retrieve(query, top_k=top_k, return_scores=True)
    formatted_df = ec.return_formatted_df(rs, small_df)
    yield formatted_df, None, None, None, None

    progress(0.4, desc=gen_text_list[np.random.choice(len(gen_text_list))])
    rag_answer = run_rag_qa(query, formatted_df, prompt_type)
    yield formatted_df, rag_answer['answer'], None, None, None

    progress(0.6, desc="Generating consensus")
    consensus_answer = evaluate_overall_consensus(query, [formatted_df['abstract'][i + 1] for i in range(len(formatted_df))])
    consensus = '## Consensus \n' + consensus_answer.consensus + '\n\n' + consensus_answer.explanation + '\n\n > Relevance of retrieved papers to answer: %.1f' % consensus_answer.relevance_score
    yield formatted_df, rag_answer['answer'], consensus, None, None

    progress(0.8, desc="Analyzing question type")
    question_type_gen = guess_question_type(query)
    if '<categorization>' in question_type_gen:
        question_type_gen = question_type_gen.split('<categorization>')[1]
    if '</categorization>' in question_type_gen:
        question_type_gen = question_type_gen.split('</categorization>')[0]
    question_type_gen = question_type_gen.replace('\n', ' \n')
    qn_type = question_type_gen
    yield formatted_df, rag_answer['answer'], consensus, qn_type, None

    progress(1.0, desc="Visualizing embeddings")
    fig = make_embedding_plot(formatted_df, top_k, consensus_answer)

    yield formatted_df, rag_answer['answer'], consensus, qn_type, fig

def create_interface():
    custom_css = """
    #custom-slider-* {
        background-color: #ffffff;
    }
    """

    with gr.Blocks(css=custom_css) as demo:

        with gr.Tabs():
            # with gr.Tab("What is Pathfinder?"):
            #     gr.Markdown(pathfinder_text)
            with gr.Tab("pathfinder"):
                with gr.Accordion("What is Pathfinder? / How do I use it?", open=False):
                    gr.Markdown(pathfinder_text)

                with gr.Row():
                    query = gr.Textbox(label="Ask me anything")
                with gr.Row():
                    with gr.Column(scale=1, min_width=300):
                        top_k = gr.Slider(1, 30, step=1, value=10, label="top-k", info="Number of papers to retrieve")
                        keywords = gr.Textbox(label="Optional Keywords (comma-separated)", value="")
                        toggles = gr.CheckboxGroup(["Keywords", "Time", "Citations"], label="Weight by", info="weighting retrieved papers", value=['Keywords'])
                        prompt_type = gr.Radio(choices=["Single-paper", "Multi-paper", "Bibliometric", "Broad but nuanced"], label="Prompt Specialization", value='Multi-paper')
                        rag_type = gr.Radio(choices=["Semantic Search", "Semantic + HyDE", "Semantic + HyDE + CoHERE"], label="RAG Method", value='Semantic + HyDE + CoHERE')
                    with gr.Column(scale=2, min_width=300):
                        img1 = gr.Image("local_files/pathfinder_logo.png")
                        btn = gr.Button("Run pfdr!")
                        # search_results_state = gr.State([])
                        ret_papers = gr.Dataframe(label='top-k retrieved papers', datatype='markdown')
                        search_results_state = gr.Markdown(label='Generated Answer')
                        qntype = gr.Markdown(label='Question type suggestion')
                        conc = gr.Markdown(label='Consensus')
                        plot = gr.Plot(label='top-k in embedding space')

                inputs = [query, top_k, keywords, toggles, prompt_type, rag_type]
                outputs = [ret_papers, search_results_state, qntype, conc, plot]
                btn.click(fn=run_pathfinder, inputs=inputs, outputs=outputs)

    return demo


if __name__ == "__main__":

    pathfinder = create_interface()
    pathfinder.launch()
prompts.py
CHANGED
@@ -142,3 +142,66 @@ Present your final answer in the following format:
 Category: [Selected category]
 Explanation: [Your explanation for the categorization]
 </categorization>"""
+
+
+pathfinder_text = """# Welcome to Pathfinder
+
+## Discover the Universe Through AI-Powered Astronomy ReSearch
+
+### What is Pathfinder?
+
+Pathfinder (https://pfdr.app) harnesses the power of modern large language models (LLMs) in combination with papers on the [arXiv](https://arxiv.org/) and [ADS](https://ui.adsabs.harvard.edu/) to navigate the vast expanse of astronomy literature.
+Our tool empowers researchers, students, and astronomy enthusiasts to get started on their journeys to find answers to complex research questions quickly and efficiently.
+
+To use the old Streamlit Pathfinder (with the ReAct agent), you can use the [pfdr streamlit mirror](https://huggingface.co/spaces/kiyer/pathfinder_v3/).
+
+This is not meant to be a replacement for existing tools like [ADS](https://ui.adsabs.harvard.edu/), [arxivsorter](https://www.arxivsorter.org/), semantic search, or Google Scholar, but rather a supplement to find papers that might otherwise be missed during a literature survey. It is trained on astro-ph papers up to July 2024.
+
+### How to Use Pathfinder
+
+You can use Pathfinder to find papers of interest with natural-language questions, and generate basic answers to questions using the retrieved papers. Try asking it questions like
+
+- What is the value of the Hubble Constant?
+- Are there open source radiative transfer codes for planetary atmospheres?
+- Can I predict a galaxy spectrum from an image cutout? Please reply in Hindi.
+- How would galaxy evolution differ in a universe with no dark matter?
+
+**👈 Use the sidebar to tweak the search parameters to get better results**. Changing the number of retrieved papers (**top-k**), weighting by keywords, time, or citations, or changing the prompt type might help better refine the paper search and synthesized answers for your specific question.
+
+1. **Enter Your Query**: Type your astronomy question in the search bar & hit `run pathfinder`.
+2. **Review Results**: Pathfinder will analyze relevant literature and present you with a concise answer.
+3. **Explore Further**: Click on provided links to delve deeper into the source material on ADS.
+4. **Refine Your Search**: Use our advanced filters to narrow down results by date, author, or topic.
+5. **Download results:** You can download the results of your query as a json file.
+
+### Why Use Pathfinder?
+
+- **Time-Saving**: Get started finding answers that would otherwise take hours of manual research.
+- **Comprehensive**: Access information from papers across a large database of astronomy literature.
+- **User-Friendly**: Intuitive interface designed for researchers at all levels.
+- **Constantly Updated**: Our database is regularly refreshed with the latest publications.
+
+### Learn More
+
+- Read our paper on [arXiv](https://arxiv.org/abs/2408.01556) to understand the technology behind Pathfinder.
+- Discover how Pathfinder was developed in collaboration with [UniverseTBD](https://www.universetbd.org), on its mission to democratise science for everyone, and [JSALT](https://www.clsp.jhu.edu/2024-jelinek-summer-workshop-on-speech-and-language-technology/).
+
+---
+
+### Copyright and Terms of Use
+
+© 2024 Pathfinder. All rights reserved.
+
+Pathfinder is provided "as is" without warranty of any kind. By using this service, you agree to our [Terms of Service] and [Privacy Policy].
+
+### Contact Us
+
+Have questions or feedback? We'd love to hear from you!
+- Email: [email protected]
+- Twitter: [@universe_tbd](https://twitter.com/universe_tbd)
+- Huggingface: [https://huggingface.co/spaces/kiyer/pathfinder/](https://huggingface.co/spaces/kiyer/pathfinder/)
+
+---
+
+*Empowering astronomical discoveries, one query at a time.*
+"""