import requests
from bs4 import BeautifulSoup
import pinecone
from langchain_community.vectorstores import Pinecone
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from consts import (
    PINECONE_API_KEY,
    PINECONE_CLOUD,
    PINECONE_INDEX_NAME,
    PINECONE_DIMENSION,
    PINECONE_METRICS,
    CHUNK_OVERLAP,
    CHUNK_SIZE,
)
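
# Pipeline overview: scrape a fixed set of HackerEarth marketing pages and
# the PDFs under doc/, split both sources into overlapping chunks, embed the
# chunks with OpenAI embeddings, and store the vectors in a Pinecone index
# that can then be queried with similarity search.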

def get_content_from_webpage(url):
    try:
        page = requests.get(url, timeout=30)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')
        content = soup.select('div#content').pop().get_text(separator='\n', strip=True)
        return content
    except Exception as e:
        print(f"Exception occurred while trying to fetch webpage content: {e}")
        # Return an empty string so callers can safely concatenate the result.
        return ""

def get_webpages_content():
    webpages = [
        'https://www.hackerearth.com/recruit/tech-recruiters/',
        'https://www.hackerearth.com/recruit/hiring-managers/',
        'https://www.hackerearth.com/recruit/university-hiring/',
        'https://www.hackerearth.com/recruit/remote-hiring/',
        'https://www.hackerearth.com/recruit/learning-and-development/',
        'https://www.hackerearth.com/recruit/pricing/',
    ]
    documents = ""
    for webpage in webpages:
        documents += get_content_from_webpage(webpage)
    return documents

def read_doc(directory="doc/"):
    # Loads and splits every PDF in the directory with the default splitter.
    # Currently unused; process_pdf below does the same job with an explicit
    # chunk_size and chunk_overlap.
    file_loader = PyPDFDirectoryLoader(directory)
    documents = file_loader.load_and_split()
    return documents

def get_vector_search_index(chunks):
    embeddings = OpenAIEmbeddings()
    pinecone.init(
        api_key=PINECONE_API_KEY,
        environment=PINECONE_CLOUD
    )
    if PINECONE_INDEX_NAME in pinecone.list_indexes():
        # Reuse the existing index; chunks are not re-upserted in this branch.
        vector_search_index = Pinecone.from_existing_index(PINECONE_INDEX_NAME, embeddings)
    else:
        pinecone.create_index(
            PINECONE_INDEX_NAME,
            dimension=PINECONE_DIMENSION,
            metric=PINECONE_METRICS
        )
        vector_search_index = Pinecone.from_documents(
            chunks,
            embeddings,
            index_name=PINECONE_INDEX_NAME
        )
    return vector_search_index

def retrieve_query(documents, query, k=4):
    # Note: this reconnects to (or rebuilds) the Pinecone index on every call.
    index = get_vector_search_index(documents)
    matching_results = index.similarity_search(query=query, k=k)
    return matching_results

def process_pdf(directory='doc/', chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
    loader = PyPDFDirectoryLoader(directory)
    data = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    documents = text_splitter.split_documents(data)
    return documents

def chunk_data(documents, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    doc = text_splitter.create_documents([documents])
    return doc

def init_vdb():
    content = get_webpages_content()
    chunked_content = chunk_data(content)
    chunked_document = process_pdf()
    chunked = chunked_content + chunked_document
    index = get_vector_search_index(chunked)
    return index


if __name__ == "__main__":
    index = init_vdb()
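    # Illustrative usage, not part of the original commit: run a sample
    # similarity search against the freshly built index. The query text
    # and k value below are assumptions for demonstration only.
    sample_query = "How does HackerEarth support university hiring?"
    for doc in index.similarity_search(sample_query, k=2):
        print(doc.page_content[:200])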