|
import os |
|
|
|
|
|
from langchain.embeddings import SentenceTransformerEmbeddings |
|
from langchain.vectorstores import Chroma |
|
import configparser |
|
from tqdm import tqdm |
|
from langchain.vectorstores import Pinecone |
|
from langchain.schema import Document |
|
import pinecone |
|
from dotenv import load_dotenv |
|
from llm import LLMManager |
|
|
|
class EmbeddingsManager:
    """Own the embedding model and retrieve LLM context from Pinecone indexes.

    On construction the manager loads environment variables from a ``.env``
    file, reads ``config.ini``, copies the search options from the supplied
    settings object, builds the sentence-transformer embedding model and
    initialises the Pinecone client.
    """

    def __init__(self, settings, emb="hkunlp/instructor-large"):
        """Set up configuration, the embedding model and the Pinecone client.

        Args:
            settings: Settings object exposing ``search_method``,
                ``n_doc_return`` and ``available_search_methods`` (read
                here), plus ``ai_assisted_search`` and
                ``default_ai_search_prompt`` (read by ``get_context``).
            emb: Name of the sentence-transformer model to load.

        Raises:
            configparser.NoSectionError / NoOptionError: if ``config.ini``
                lacks the ``RAG`` section or one of its required options
                (``ConfigParser.read`` silently ignores a missing file).
        """
        load_dotenv()

        # config.ini is resolved relative to the current working directory.
        self.config = configparser.ConfigParser()
        self.config.read("config.ini")

        self.set = settings

        # Snapshot of the search options at construction time.  The retrieval
        # methods mostly read the live values from ``self.set`` instead.
        self.search_method = self.set.search_method
        self.n_doc_return = self.set.n_doc_return
        self.ai_assisted_search = self.config.getboolean('RAG', 'default_ai_assisted_search')
        self.available_search_methods = self.set.available_search_methods
        self.text_split_size = self.config.getint('RAG', 'default_text_split_size')
        self.text_overlap = self.config.getint('RAG', 'default_text_overlap')

        self.vector_stores = self.get_vector_list()
        self.vector_stores_map = self.get_vector_map_list()

        self.embedding_model_name = emb

        # Keep downloaded model files in a "data" folder next to this module.
        current_dir = os.path.dirname(__file__)
        data_dir = os.path.join(current_dir, "data")
        os.environ['TRANSFORMERS_CACHE'] = data_dir
        PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
        PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV')
        self.embeddings_model = SentenceTransformerEmbeddings(
            model_name=self.embedding_model_name, cache_folder=data_dir
        )
        # NOTE(review): both values are None when the environment variables
        # are unset; they are passed to pinecone.init unchecked.
        pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

    def _section_values(self, section):
        """Return the value of every option in config ``section``.

        Returns an empty list when the section does not exist.
        """
        if section not in self.config:
            return []
        return [self.config.get(section, option) for option in self.config[section]]

    @staticmethod
    def _search(docsearch, method, query, k):
        """Run ``query`` against ``docsearch`` using the named search method.

        ``"Similarity"`` selects a plain similarity search; ``"MMR"`` and any
        other value select max-marginal-relevance search.
        """
        if method == "Similarity":
            return docsearch.similarity_search(query, k=k, fetch_metadata=True)
        return docsearch.max_marginal_relevance_search(query, k=k, fetch_metadata=True)

    def get_emb_list(self):
        """Return the values listed in the ``EMB`` config section ([] if absent)."""
        return self._section_values('EMB')

    def get_vector_list(self):
        """Return the values listed in the ``Vector_Stores`` config section ([] if absent)."""
        return self._section_values('Vector_Stores')

    def get_vector_map_list(self):
        """Return the values listed in the ``Vector_Stores_Map`` config section ([] if absent)."""
        return self._section_values('Vector_Stores_Map')

    def get_context(self, index, query, history):
        """Return the documents relevant to ``query`` from a Pinecone index.

        When ``settings.ai_assisted_search`` is truthy, the settings' search
        prompt is formatted with the question and history, printed, and sent
        to ``LLMManager.get_query_terms``; the returned terms are prepended to
        the query on their own line before searching.

        Args:
            index: Name of the existing Pinecone index to query.
            query: The user's question.
            history: Conversation history substituted into the search prompt.

        Returns:
            The result of the vector-store search, limited to
            ``settings.n_doc_return`` documents.
        """
        docsearch = Pinecone.from_existing_index(index, self.embeddings_model)

        if self.set.ai_assisted_search:
            prompt = self.set.default_ai_search_prompt.format(question=query, history=history)
            print(prompt)
            llm = LLMManager(self.set)
            query = llm.get_query_terms(prompt) + "\n" + query

        return self._search(docsearch, self.set.search_method, query, self.set.n_doc_return)

    def get_context_search(self, index, query):
        """Return the documents relevant to ``query`` without AI-assisted expansion.

        Args:
            index: Name of the existing Pinecone index to query.
            query: The search text.

        Returns:
            The result of the vector-store search.
        """
        docsearch = Pinecone.from_existing_index(index, self.embeddings_model)

        method = self.set.search_method
        # NOTE(review): the two known methods return a fixed 2 documents while
        # the fallback uses the n_doc_return captured at construction time.
        # Preserved as-is; confirm which limit is intended.
        k = 2 if method in ("MMR", "Similarity") else self.n_doc_return
        return self._search(docsearch, method, query, k)

    def get_formatted_context(self, index, query, history):
        """Return the relevant context rendered as one text block for the LLM.

        Each document contributes a ``DOCUMENT NAME=...`` line (taken from its
        ``source`` metadata) and a ``DOCUMENT CONTENT=...`` line, followed by
        a blank line.  An empty result yields an empty string.

        Raises:
            KeyError: if a returned document has no ``source`` metadata entry.
        """
        template = "DOCUMENT NAME={doc_name}\nDOCUMENT CONTENT={doc_content}\n\n"
        docs = self.get_context(index, query, history)
        return "".join(
            template.format(doc_name=doc.metadata["source"], doc_content=doc.page_content)
            for doc in docs
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Example of how to add documents to the vector store on Pinecone.
    from settings import SettingManager

    # Named to avoid shadowing the builtin ``set``.
    setting_manager = SettingManager()
    emb_manager = EmbeddingsManager(setting_manager, emb="hkunlp/instructor-large")
    # NOTE(review): EmbeddingsManager as shown in this file defines no
    # generate_vector_store method, so this call raises AttributeError unless
    # the method is provided elsewhere -- confirm before running.
    print(emb_manager.generate_vector_store("prohelper"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|