File size: 3,265 Bytes
7765fa9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fcf163
7765fa9
 
 
 
 
 
 
5fcf163
7765fa9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fcf163
 
 
 
 
 
 
 
 
7765fa9
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import os
from langchain_groq import ChatGroq
from langchain_community.vectorstores.faiss import FAISS
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceHubEmbeddings
from langchain.tools.retriever import create_retriever_tool
from langchain_community.tools import ArxivQueryRun
from langchain_community.utilities import ArxivAPIWrapper
from langchain.agents import tool
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, PromptTemplate, MessagesPlaceholder
from langchain import hub
from googlesearch import search
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup

def create_retriever_tool_agent(pdf_dir="./Pdfs"):
    """Build a retriever tool backed by a FAISS index over the PDFs in *pdf_dir*.

    Loads every PDF in the directory, splits it into overlapping chunks,
    embeds the chunks with a HuggingFace Hub model, and wraps the resulting
    FAISS retriever as a langchain tool named "Pdf_search".
    """
    docs = PyPDFDirectoryLoader(pdf_dir).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(docs)
    embedder = HuggingFaceHubEmbeddings(model="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = FAISS.from_documents(chunks, embedder)
    return create_retriever_tool(
        retriever=vector_store.as_retriever(),
        name="Pdf_search",
        description="Use to search information from the PDFs.",
    )

def create_arxiv_tool_agent():
    """Return an Arxiv search tool limited to 1 result and 200-char snippets."""
    wrapper = ArxivAPIWrapper(top_k_results=1, doc_content_chars_max=200)
    return ArxivQueryRun(api_wrapper=wrapper)

@tool
def google_search(input: str, num_results: int = 5):
    """Use to do google search. It is used when the AI's internal database does not have specific information requested by the user."""
    # list(tqdm(...)) materializes the search generator directly; the original
    # identity comprehension `[url for url in ...]` added nothing.
    urls = list(tqdm(search(input, num_results=num_results)))
    # Download the result pages and return their concatenated paragraph text.
    return fetch_content(urls)

def fetch_content(urls: list) -> str:
    """Download each URL and return the pages' paragraph text joined together.

    For every URL, collects the text of all <p> tags, truncated to the first
    1000 characters per page. Pages that fail to download, return an HTTP
    error status, or fail to parse are silently skipped (best-effort scrape).

    :param urls: list of URL strings to fetch.
    :return: single string of per-page excerpts joined by spaces ("" if none).
    """
    text = []
    for link in tqdm(urls):
        try:
            # timeout added so one unresponsive host cannot hang the whole loop
            response = requests.get(link, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            paragraphs = ' '.join(p.text for p in soup.find_all('p'))
            # cap each page's contribution to keep the combined context small
            paragraphs = paragraphs[:1000]
            if paragraphs:
                text.append(paragraphs)
        except Exception:
            # deliberate best-effort: skip any page that errors out
            continue
    return " ".join(text)

def get_prompt():
    """Return the agent prompt: a hub-pulled template with its messages replaced.

    Pulls "hwchase17/openai-functions-agent" from the LangChain hub (network
    call) to get a ChatPromptTemplate-shaped object, then overwrites its
    messages with a custom system instruction describing the tool-use order,
    an optional chat history placeholder, the human input, and the agent
    scratchpad required by the functions agent.
    """
    # NOTE(review): the pulled prompt's messages are fully replaced below; the
    # hub pull is kept only to obtain a correctly-typed prompt object.
    prompt = hub.pull("hwchase17/openai-functions-agent")
    prompt.messages = [
        SystemMessagePromptTemplate(prompt=PromptTemplate(
            input_variables=[],
            # "then use that tool" fixes the original "than" typo in the
            # instruction the LLM actually reads.
            template="""You are a helpful AI that uses sequence of tools to answer human questions. 
The sequence you follow:
first, use your internal database
second, use pdf search tool,
third, use arxiv tool,
fourth, use google search tool

You can use one tool at a time to search for information. Once a tool is used, do not switch to another tool.
If you know which tool is suitable to answer human question, then use that tool and don't follow the sequence"""
        )),
        # Optional so the agent also works on the first turn with no history.
        MessagesPlaceholder(variable_name='chat_history', optional=True),
        HumanMessagePromptTemplate(prompt=PromptTemplate(
            input_variables=['input'],
            template='{input}'
        )),
        # Required by the OpenAI-functions agent for intermediate tool steps.
        MessagesPlaceholder(variable_name='agent_scratchpad')
    ]
    return prompt