# app_to_vectorstore.py
import os

from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter

from modules import app_logger, file_utils
# Loader classes are resolved via app_constants.DOCUMENT_MAP; all necessary
# loader classes are assumed to be imported there.
from modules import app_constants

app_logger = app_logger.app_logger

# Workspace layout: temporary indexes, persistent indexes, and the record of
# already-processed files (one "md5,module,path" line per entry).
# os.path.join avoids relying on a trailing separator in WORKSPACE_DIRECTORY.
TEMP_DIR = os.path.join(app_constants.WORKSPACE_DIRECTORY, "tmp")
DB_DIR = os.path.join(app_constants.WORKSPACE_DIRECTORY, "db")
processed_files_record = os.path.join(app_constants.WORKSPACE_DIRECTORY, app_constants.PROCESSED_DOCS)

def load_documents_from_jsonl(file_path, loader_class):
    """Load a JSON Lines file with the given loader class; return None on failure."""
    try:
        # The loader is expected to accept JSONLoader-style options
        # (json_lines, text_content, jq_schema).
        loader = loader_class(file_path, json_lines=True, text_content=False, jq_schema='.')
        return loader.load()
    except Exception as e:
        app_logger.error(f"Error loading documents from JSONL file {file_path}: {e}")
        return None

def update_processed_files_record(file_md5, module, file_path):
    """Append one "md5,module,path" line to the processed-files record."""
    try:
        with open(processed_files_record, 'a') as file:  # 'a' mode creates the file if it doesn't exist
            file.write(f"{file_md5},{module},{file_path}\n")
    except Exception as e:
        app_logger.error(f"Error updating processed files record: {e}")

def is_file_processed(file_md5):
    """Return True if file_md5 appears as the first field of any record line."""
    if os.path.exists(processed_files_record):
        with open(processed_files_record, 'r') as file:
            for line in file:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines in the record file
                md5 = line.split(',', 1)[0]
                if md5 == file_md5:
                    return True
    return False
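
# Hypothetical round trip (illustrative MD5 and path, not real project data):
#   update_processed_files_record("d41d8cd98f00b204e9800998ecf8427e", "nav_playbooks", "docs/a.pdf")
#   is_file_processed("d41d8cd98f00b204e9800998ecf8427e")  # -> True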

def get_chroma_index(file_path, current_page="nav_playbooks", is_persistent=True):
    """Build a Chroma index for file_path; return True on success, False if skipped or failed."""
    app_logger.info(f"Starting get_chroma_index for {file_path}")

    # Skip files whose MD5 already appears in the processed-files record.
    file_md5 = file_utils.compute_md5(file_path)
    if is_file_processed(file_md5):
        app_logger.info(f"File {file_path} has already been processed. Skipping.")
        return False

    # Pick a loader class for the file extension from DOCUMENT_MAP.
    _, file_extension = os.path.splitext(file_path)
    loader_class = app_constants.DOCUMENT_MAP.get(file_extension.lower(), None)
    if not loader_class:
        app_logger.error(f"No suitable loader found for file type {file_extension}")
        return False

    embedding_model = app_constants.EMBEDDING_MODEL_NAME
    chunk_size = app_constants.CHUNK_SIZE
    chunk_overlap = app_constants.CHUNK_OVERLAP

    # Persistent indexes are named per page under DB_DIR; temporary ones are
    # named after the source file under TEMP_DIR.
    storage_dir = DB_DIR if is_persistent else TEMP_DIR
    base_filename = (
        f"{current_page}_chroma_db"
        if is_persistent
        else f"{os.path.splitext(os.path.basename(file_path))[0]}_chroma_db"
    )
    sanitized_base_filename = file_utils.sanitize_filename(base_filename)
    chroma_persist_directory = os.path.join(storage_dir, sanitized_base_filename)

    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    try:
        # JSONL files need the jq-based loader options; everything else uses
        # the loader's default constructor.
        if file_extension.lower() == '.jsonl':
            documents = load_documents_from_jsonl(file_path, loader_class)
        else:
            loader = loader_class(file_path)
            documents = loader.load()
        if not documents:
            app_logger.error(f"No documents loaded from {file_path}.")
            return False

        # Split into overlapping chunks before embedding.
        text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        docs = text_splitter.split_documents(documents)
        if not docs:
            app_logger.error(f"No documents to process after splitting from {file_path}.")
            return False

        db = Chroma.from_documents(docs, embeddings, persist_directory=chroma_persist_directory, client_settings=app_constants.CHROMA_SETTINGS)
        update_processed_files_record(file_md5, current_page, file_path)
        app_logger.info("Created index and saved to disk")
        db.persist()
    except Exception as e:
        app_logger.error(f"Error in get_chroma_index for {file_path}: {e}")
        return False
    app_logger.info("Completed get_chroma_index operation")
    return True
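
# Usage sketch (hypothetical): index a local file into the persistent store.
# The path and page name below are illustrative, not part of the module;
# app_constants.DOCUMENT_MAP is assumed to map ".pdf" to a loader class.
if __name__ == "__main__":
    sample_path = "workspace/docs/incident_response_playbook.pdf"  # hypothetical path
    if get_chroma_index(sample_path, current_page="nav_playbooks", is_persistent=True):
        app_logger.info(f"Indexed {sample_path} into {DB_DIR}")
    else:
        app_logger.info(f"Skipped or failed to index {sample_path}")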