import requests
from bs4 import BeautifulSoup
import pinecone
from langchain_community.vectorstores import Pinecone
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFDirectoryLoader
from consts import (
    PINECONE_API_KEY,
    PINECONE_CLOUD,
    PINECONE_INDEX_NAME,
    PINECONE_DIMENSION,
    PINECONE_METRICS,
    CHUNK_OVERLAP,
    CHUNK_SIZE,
)


def get_content_from_webpage(url):
    """Fetch a page and return the text of its main content div, or '' on failure."""
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        content = soup.select('div#content').pop().get_text(separator='\n', strip=True)
        return content
    except Exception as e:
        print(f"Exception occurred while trying to fetch {url}: {e}")
        return ""


def get_webpages_content():
    """Concatenate the scraped text of all product pages into a single string."""
    webpages = [
        'https://www.hackerearth.com/recruit/tech-recruiters/',
        'https://www.hackerearth.com/recruit/hiring-managers/',
        'https://www.hackerearth.com/recruit/university-hiring/',
        'https://www.hackerearth.com/recruit/remote-hiring/',
        'https://www.hackerearth.com/recruit/learning-and-development/',
        'https://www.hackerearth.com/recruit/pricing/',
    ]
    documents = ""
    for webpage in webpages:
        documents += get_content_from_webpage(webpage)
    return documents


def read_doc(directory="doc/"):
    """Load and split every PDF in `directory` with the loader's default splitter."""
    file_loader = PyPDFDirectoryLoader(directory)
    documents = file_loader.load_and_split()
    return documents


def get_vector_search_index(chunks):
    """Return a Pinecone vector store, creating and populating the index if it does not exist."""
    embeddings = OpenAIEmbeddings()
    pinecone.init(
        api_key=PINECONE_API_KEY,
        environment=PINECONE_CLOUD,
    )
    if PINECONE_INDEX_NAME in pinecone.list_indexes():
        vector_search_index = Pinecone.from_existing_index(PINECONE_INDEX_NAME, embeddings)
    else:
        pinecone.create_index(
            PINECONE_INDEX_NAME,
            dimension=PINECONE_DIMENSION,
            metric=PINECONE_METRICS,
        )
        vector_search_index = Pinecone.from_documents(
            chunks,
            embeddings,
            index_name=PINECONE_INDEX_NAME,
        )
    return vector_search_index


def retrieve_query(documents, query, k=4):
    """Return the k chunks most similar to `query` from the vector store."""
    index = get_vector_search_index(documents)
    matching_results = index.similarity_search(query=query, k=k)
    return matching_results


def process_pdf(directory='doc/', chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
    """Load PDFs from `directory` and split them into overlapping chunks."""
    loader = PyPDFDirectoryLoader(directory)
    data = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    documents = text_splitter.split_documents(data)
    return documents


def chunk_data(documents, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
    """Split a raw text string into Document chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    doc = text_splitter.create_documents([documents])
    return doc


def init_vdb():
    """Build (or reuse) the vector index from scraped webpages and local PDFs."""
    content = get_webpages_content()
    chunked_content = chunk_data(content)
    chunked_document = process_pdf()
    chunked = chunked_content + chunked_document
    index = get_vector_search_index(chunked)
    return index


init_vdb()
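
# --- Usage sketch (assumption: the index was populated by init_vdb() above) ---
# A minimal, illustrative query against the vector store; the question string is
# only an example, and similarity_search is the same call retrieve_query() wraps.
#
# index = init_vdb()
# results = index.similarity_search("How does HackerEarth Recruit support university hiring?", k=4)
# for doc in results:
#     print(doc.page_content)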