# app_to_vectorstore.py
import os

from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter

from modules import app_logger, file_utils
# Loader classes are resolved via app_constants.DOCUMENT_MAP; all necessary
# loader classes are assumed to be imported there.
from modules import app_constants

app_logger = app_logger.app_logger

# Workspace layout: temporary indexes, persistent indexes, and the record of
# already-processed files (one "md5,module,path" line per entry).
# os.path.join avoids relying on a trailing separator in WORKSPACE_DIRECTORY.
TEMP_DIR = os.path.join(app_constants.WORKSPACE_DIRECTORY, "tmp")
DB_DIR = os.path.join(app_constants.WORKSPACE_DIRECTORY, "db")
processed_files_record = os.path.join(app_constants.WORKSPACE_DIRECTORY, app_constants.PROCESSED_DOCS)

def load_documents_from_jsonl(file_path, loader_class):
    """Load a JSON Lines file with the given loader class; return None on failure."""
    try:
        # The loader is expected to accept JSONLoader-style options
        # (json_lines, text_content, jq_schema).
        loader = loader_class(file_path, json_lines=True, text_content=False, jq_schema='.')
        return loader.load()
    except Exception as e:
        app_logger.error(f"Error loading documents from JSONL file {file_path}: {e}")
        return None

def update_processed_files_record(file_md5, module, file_path):
    """Append one "md5,module,path" line to the processed-files record."""
    try:
        with open(processed_files_record, 'a') as file:  # 'a' mode creates the file if it doesn't exist
            file.write(f"{file_md5},{module},{file_path}\n")
    except Exception as e:
        app_logger.error(f"Error updating processed files record: {e}")

def is_file_processed(file_md5):
    """Return True if file_md5 appears as the first field of any record line."""
    if os.path.exists(processed_files_record):
        with open(processed_files_record, 'r') as file:
            for line in file:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines in the record file
                md5 = line.split(',', 1)[0]
                if md5 == file_md5:
                    return True
    return False
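
# Hypothetical round trip (illustrative MD5 and path, not real project data):
#   update_processed_files_record("d41d8cd98f00b204e9800998ecf8427e", "nav_playbooks", "docs/a.pdf")
#   is_file_processed("d41d8cd98f00b204e9800998ecf8427e")  # -> True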

def get_chroma_index(file_path, current_page="nav_playbooks", is_persistent=True):
    """Build a Chroma index for file_path; return True on success, False if skipped or failed."""
    app_logger.info(f"Starting get_chroma_index for {file_path}")

    # Skip files whose MD5 already appears in the processed-files record.
    file_md5 = file_utils.compute_md5(file_path)
    if is_file_processed(file_md5):
        app_logger.info(f"File {file_path} has already been processed. Skipping.")
        return False

    # Pick a loader class for the file extension from DOCUMENT_MAP.
    _, file_extension = os.path.splitext(file_path)
    loader_class = app_constants.DOCUMENT_MAP.get(file_extension.lower(), None)
    if not loader_class:
        app_logger.error(f"No suitable loader found for file type {file_extension}")
        return False

    embedding_model = app_constants.EMBEDDING_MODEL_NAME
    chunk_size = app_constants.CHUNK_SIZE
    chunk_overlap = app_constants.CHUNK_OVERLAP

    # Persistent indexes are named per page under DB_DIR; temporary ones are
    # named after the source file under TEMP_DIR.
    storage_dir = DB_DIR if is_persistent else TEMP_DIR
    base_filename = (
        f"{current_page}_chroma_db"
        if is_persistent
        else f"{os.path.splitext(os.path.basename(file_path))[0]}_chroma_db"
    )
    sanitized_base_filename = file_utils.sanitize_filename(base_filename)
    chroma_persist_directory = os.path.join(storage_dir, sanitized_base_filename)

    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    try:
        # JSONL files need the jq-based loader options; everything else uses
        # the loader's default constructor.
        if file_extension.lower() == '.jsonl':
            documents = load_documents_from_jsonl(file_path, loader_class)
        else:
            loader = loader_class(file_path)
            documents = loader.load()
        if not documents:
            app_logger.error(f"No documents loaded from {file_path}.")
            return False

        # Split into overlapping chunks before embedding.
        text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        docs = text_splitter.split_documents(documents)
        if not docs:
            app_logger.error(f"No documents to process after splitting from {file_path}.")
            return False

        db = Chroma.from_documents(docs, embeddings, persist_directory=chroma_persist_directory, client_settings=app_constants.CHROMA_SETTINGS)
        update_processed_files_record(file_md5, current_page, file_path)
        app_logger.info("Created index and saved to disk")
        db.persist()
    except Exception as e:
        app_logger.error(f"Error in get_chroma_index for {file_path}: {e}")
        return False
    app_logger.info("Completed get_chroma_index operation")
    return True
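
# Usage sketch (hypothetical): index a local file into the persistent store.
# The path and page name below are illustrative, not part of the module;
# app_constants.DOCUMENT_MAP is assumed to map ".pdf" to a loader class.
if __name__ == "__main__":
    sample_path = "workspace/docs/incident_response_playbook.pdf"  # hypothetical path
    if get_chroma_index(sample_path, current_page="nav_playbooks", is_persistent=True):
        app_logger.info(f"Indexed {sample_path} into {DB_DIR}")
    else:
        app_logger.info(f"Skipped or failed to index {sample_path}")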