Added output format option
- messages.py +91 -0
- search_agent.py +17 -93
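This commit extracts the prompt construction for the two LLM calls into a new messages.py module and adds an --output option that selects between plain-text and Markdown rendering of the final answer.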
messages.py
ADDED
@@ -0,0 +1,91 @@
+import json
+from langchain.schema import SystemMessage, HumanMessage
+
+def get_optimized_search_messages(query):
+    messages = [
+        SystemMessage(
+            content="""
+                You are a search query optimization specialist.
+                Rewrite the user's question using only the most important keywords. Remove extra words.
+                Tips:
+                    Identify the key concepts in the question
+                    Remove style words such as "in the style of", "engaging", "short", "long"
+                    Remove filler words like "how to", "what is", "I want to"
+                    Remove length instructions (example: essay, article, letter, blog, post, blogpost, etc)
+                    Keep it short, around 3-7 words total
+                    Put the most important keywords first
+                    Remove formatting instructions
+                    Remove style instructions (example: in the style of, engaging, short, long)
+                    Remove length instructions (example: essay, article, letter, etc)
+                Example:
+                    Question: How do I bake chocolate chip cookies from scratch?
+                    Search query: chocolate chip cookies recipe from scratch
+                Example:
+                    Question: I would like you to show me a timeline of Marie Curie's life. Show results as a markdown table
+                    Search query: Marie Curie timeline
+                Example:
+                    Question: I would like you to write a long article on NATO vs Russia. Use known geopolitical frameworks.
+                    Search query: geopolitics nato russia
+                Example:
+                    Question: Write an engaging LinkedIn post about Andrew Ng
+                    Search query: Andrew Ng
+                Example:
+                    Question: Write a short article about the solar system in the style of Carl Sagan
+                    Search query: solar system
+                Example:
+                    Question: Should I use Kubernetes? Answer in the style of Gilfoyle from the TV show Silicon Valley
+                    Search query: Kubernetes decision
+                Example:
+                    Question: biography of Napoleon. Include a table with the major events.
+                    Search query: napoleon biography events
+            """
+        ),
+        HumanMessage(
+            content=f"""
+                Question: {query}
+                Search query:
+            """
+        ),
+    ]
+    return messages
+
+def get_query_with_sources_messages(query, relevant_docs):
+    messages = [
+        SystemMessage(
+            content="""
+                You are an expert research assistant.
+                You are provided with a Context in JSON format and a Question.
+
+                Use RAG to answer the Question, providing references and links to the Context material you retrieve and use in your answer:
+                When generating your answer, follow these steps:
+                - Retrieve the most relevant context material from your knowledge base to help answer the question
+                - Cite the references you use by including the title, author, publication, and a link to each source
+                - Synthesize the retrieved information into a clear, informative answer to the question
+                - Format your answer in Markdown, using heading levels 2-3 as needed
+                - Include a "References" section at the end with the full citations and a link for each source you used
+
+
+                Example of Context JSON entry:
+                {
+                    "page_content": "This provides access to material related to ...",
+                    "metadata": {
+                        "title": "Introduction - Marie Curie: Topics in Chronicling America",
+                        "link": "https://guides.loc.gov/chronicling-america-marie-curie"
+                    }
+                }
+
+            """
+        ),
+        HumanMessage(
+            content=f"""
+                Context information is below.
+                Context:
+                ---------------------
+                {json.dumps(relevant_docs, indent=2, ensure_ascii=False)}
+                ---------------------
+                Question: {query}
+                Answer:
+            """
+        ),
+    ]
+    return messages
search_agent.py
CHANGED
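The inline prompt blocks are replaced by imports from messages.py, argument parsing is consolidated at the top of the entry point, and the final answer is printed according to the new --output flag.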
@@ -6,6 +6,7 @@ Usage:
     [--provider=provider]
     [--temperature=temp]
     [--max_pages=num]
+    [--output=text]
     SEARCH_QUERY
     search_agent.py --version
 
@@ -16,6 +17,7 @@ Options:
 -t temp --temperature=temp       Set the temperature of the LLM [default: 0.0]
 -p provider --provider=provider  Use a specific LLM (choices: bedrock,openai,groq) [default: openai]
 -m num --max_pages=num           Max number of pages to retrieve [default: 10]
+-o text --output=text            Output format (choices: text, markdown) [default: markdown]
 
 """
 
@@ -63,52 +65,8 @@ def get_chat_llm(provider, temperature=0.0):
     return chat_llm
 
 def optimize_search_query(query):
-    messages = [
-        SystemMessage(
-            content="""
-                You are a search query optimization specialist.
-                Rewrite the user's question using only the most important keywords. Remove extra words.
-                Tips:
-                    Identify the key concepts in the question
-                    Remove filler words like "how to", "what is", "I want to"
-                    Remove style words such as "in the style of", "engaging", "short", "long"
-                    Remove length instructions (example: essay, article, letter, blog, post, blogpost, etc)
-                    Keep it short, around 3-7 words total
-                    Put the most important keywords first
-                    Remove formatting instructions
-                    Remove style instructions (example: in the style of, engaging, short, long)
-                    Remove length instructions (example: essay, article, letter, etc)
-                Example:
-                    Question: How do I bake chocolate chip cookies from scratch?
-                    Search query: chocolate chip cookies recipe from scratch
-                Example:
-                    Question: I would like you to show me a timeline of Marie Curie's life. Show results as a markdown table
-                    Search query: Marie Curie timeline
-                Example:
-                    Question: I would like you to write a long article on NATO vs Russia. Use known geopolitical frameworks.
-                    Search query: geopolitics nato russia
-                Example:
-                    Question: Write an engaging LinkedIn post about Andrew Ng
-                    Search query: Andrew Ng
-                Example:
-                    Question: Write a short article about the solar system in the style of Carl Sagan
-                    Search query: solar system
-                Example:
-                    Question: Should I use Kubernetes? Answer in the style of Gilfoyle from the TV show Silicon Valley
-                    Search query: Kubernetes decision
-                Example:
-                    Question: biography of Napoleon. Include a table with the major events.
-                    Search query: napoleon biography events
-            """
-        ),
-        HumanMessage(
-            content=f"""
-                Question: {query}
-                Search query:
-            """
-        ),
-    ]
-
+    from messages import get_optimized_search_messages
+    messages = get_optimized_search_messages(query)
     response = chat.invoke(messages, config={"callbacks": callbacks})
     return response.content
 
@@ -238,45 +196,8 @@ def process_and_vectorize_content(
 
 
 def answer_query_with_sources(query, relevant_docs):
-    messages = [
-        SystemMessage(
-            content="""
-                You are an expert research assistant.
-                You are provided with a Context in JSON format and a Question.
-
-                Use RAG to answer the Question, providing references and links to the Context material you retrieve and use in your answer:
-                When generating your answer, follow these steps:
-                - Retrieve the most relevant context material from your knowledge base to help answer the question
-                - Cite the references you use by including the title, author, publication, and a link to each source
-                - Synthesize the retrieved information into a clear, informative answer to the question
-                - Format your answer in Markdown, using heading levels 2-3 as needed
-                - Include a "References" section at the end with the full citations and a link for each source you used
-
-
-                Example of Context JSON entry:
-                {
-                    "page_content": "This provides access to material related to ...",
-                    "metadata": {
-                        "title": "Introduction - Marie Curie: Topics in Chronicling America",
-                        "link": "https://guides.loc.gov/chronicling-america-marie-curie"
-                    }
-                }
-
-            """
-        ),
-        HumanMessage(
-            content=f"""
-                Context information is below.
-                Context:
-                ---------------------
-                {json.dumps(relevant_docs, indent=2, ensure_ascii=False)}
-                ---------------------
-                Question: {query}
-                Answer:
-            """
-        ),
-    ]
-
+    from messages import get_query_with_sources_messages
+    messages = get_query_with_sources_messages(query, relevant_docs)
     response = chat.invoke(messages, config={"callbacks": callbacks})
     return response
 
@@ -296,20 +217,20 @@ if(os.getenv("LANGCHAIN_API_KEY")):
 
 if __name__ == '__main__':
     arguments = docopt(__doc__, version='Search Agent 0.1')
-    #print(arguments)
-
 
     provider = arguments["--provider"]
     temperature = float(arguments["--temperature"])
-
+    domain=arguments["--domain"]
+    max_pages=arguments["--max_pages"]
+    output=arguments["--output"]
     query = arguments["SEARCH_QUERY"]
-
+
+    chat = get_chat_llm(provider, temperature)
+
    with console.status(f"[bold green]Optimizing query for search: {query}"):
         optimize_search_query = optimize_search_query(query)
-        console.log(f"Optimized search query: [bold blue]{optimize_search_query}")
+    console.log(f"Optimized search query: [bold blue]{optimize_search_query}")
 
-    domain=arguments["--domain"]
-    max_pages=arguments["--max_pages"]
     with console.status(f"[bold green]Searching sources using the optimized query: {optimize_search_query}"):
         sources = get_sources(optimize_search_query, max_pages=max_pages, domain=domain)
     console.log(f"Found {len(sources)} sources {'on ' + domain if domain else ''}")
 
@@ -329,5 +250,8 @@ if __name__ == '__main__':
     response = answer_query_with_sources(query, relevant_docs)
 
     console.rule(f"[bold green]Response from {provider}")
-
+    if output == "text":
+        console.print(response.content)
+    else:
+        console.print(Markdown(response.content))
     console.rule("[bold green]")
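With the new option in place, a run might look like this (queries are illustrative):

    search_agent.py -o text "Marie Curie timeline"
    search_agent.py --output=markdown --max_pages=5 "Should I use Kubernetes?"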
|