Spaces:
Sleeping
Sleeping
File size: 6,015 Bytes
ea83a52 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
import os
from typing import Optional, List
import shutil
from zipfile import ZipFile
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.embeddings import Embeddings
from document_processor import DocumentProcessor
class VectorStoreManager:
    """Manage a persisted FAISS vectorstore: creation, deletion, similarity
    search, source listing/extraction, incremental updates and zip export.

    Persisted stores live under ``database/<name>``.
    """

    def __init__(self, path: str, name: str, embeddings: Embeddings):
        """
        Args:
            path: directory containing the source documents (usually the
                "database" directory where the databases are stored).
            name: vectorstore name (usually the name of the database that
                contains the documents).
            embeddings: embeddings model used by the vectorstore.
        """
        self.path = path
        self.name = name
        self.embeddings = embeddings
        # Loaded lazily from disk on first use (see load_vectorstore).
        self.vectorstore = None

    def create_vectorstore(self) -> bool:
        """Build a FAISS index from the documents under ``self.path`` and
        persist it to ``database/<self.name>``.

        Returns:
            True on success (exceptions from the pipeline propagate).
        """
        documents = DocumentProcessor(self.path).files_to_texts()
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200, length_function=len
        )
        texts = text_splitter.split_documents(documents)
        self.vectorstore = FAISS.from_documents(
            documents=texts, embedding=self.embeddings
        )
        self.vectorstore.save_local(folder_path=os.path.join("database", self.name))
        return True

    def delete_vectorstore(self) -> bool:
        """Delete the persisted vectorstore directory.

        Returns:
            False when the directory does not exist, True otherwise.
        """
        try:
            # os.path.join for consistency with the other methods.
            shutil.rmtree(os.path.join("database", self.name))
        except FileNotFoundError:
            return False
        return True

    def search_similarity(self, query: str, fuente: Optional[str] = None) -> str:
        """Return up to 5 documents similar to *query*.

        The vectorstore must already exist on disk; it is loaded lazily the
        first time a method needs it.

        Args:
            query: query text.
            fuente: optional ``source`` metadata value to filter results by.

        Returns:
            str: stringified list of dicts with ``content``, ``title`` and
            ``source`` keys.
        """
        if not self.vectorstore:
            self.vectorstore = self.load_vectorstore()
        # filter=None is the library default, so a single call covers both cases.
        filtro = {"source": fuente} if fuente else None
        docs = self.vectorstore.similarity_search(query=query, k=5, filter=filtro)
        busqueda = [
            {
                "content": doc.page_content,
                "title": doc.metadata.get("title", None),
                "source": doc.metadata.get("source", None),
            }
            for doc in docs
        ]
        return str(busqueda)

    def list_sources(self) -> List[str]:
        """Return the distinct ``source`` metadata values present in the store.

        NOTE: reaches into the docstore's private ``_dict`` — no public API
        exposes all stored documents.
        """
        if not self.vectorstore:
            self.vectorstore = self.load_vectorstore()
        docstore_dict = self.vectorstore.docstore._dict
        return list(
            {doc.metadata.get("source", None) for doc in docstore_dict.values()}
        )

    def extract_texts_by_source(self, source: str) -> List[str]:
        """Return the ``page_content`` of every stored document whose
        ``source`` metadata equals *source*."""
        if not self.vectorstore:
            self.vectorstore = self.load_vectorstore()
        docstore_dict = self.vectorstore.docstore._dict
        return [
            document.page_content
            for document in docstore_dict.values()
            if document.metadata.get("source", None) == source
        ]

    def save_text_to_file_temp(self, source: str) -> bool:
        """Dump all texts for *source* into ``temp/<sanitized-source>.txt``.

        The ``temp`` folder is wiped and recreated on every call, so any
        previous contents are discarded.

        Returns:
            True on success, False on a filesystem error.
        """
        texts = self.extract_texts_by_source(source)
        carpeta = "temp"
        # Path separators in the source would otherwise escape the temp folder.
        target_source_safe = source.replace("\\", "_").replace("/", "_")
        file_path = os.path.join(carpeta, target_source_safe + ".txt")
        try:
            if os.path.exists(carpeta):
                shutil.rmtree(carpeta)
            os.makedirs(carpeta)
            with open(file_path, "w", encoding="utf-8") as file:
                for text in texts:
                    file.write(text)
                    file.write("\n")
            return True
        except OSError:
            # Only filesystem errors can occur in the try body; narrowed from
            # the original bare ``except Exception`` to avoid hiding bugs.
            return False

    def load_vectorstore(self) -> FAISS:
        """Load the persisted FAISS index from ``database/<self.name>``.

        ``allow_dangerous_deserialization`` is required because FAISS stores
        are pickled; only load stores created by this application.
        """
        return FAISS.load_local(
            folder_path=os.path.join("database", self.name),
            embeddings=self.embeddings,
            allow_dangerous_deserialization=True,
        )

    def add_files_vectorstore(self) -> Optional[FAISS]:
        """Index any documents found in the ``docs`` folder and merge them
        into the persisted vectorstore.

        Returns:
            None when the folder was missing (it is created for next time) or
            contained no documents; otherwise the updated vectorstore.
        """
        temp_folder = "docs"
        if not os.path.exists(temp_folder):
            os.makedirs(temp_folder)
            return None
        documents = DocumentProcessor(temp_folder).files_to_texts()
        if not documents:
            return None
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200, length_function=len
        )
        texts = text_splitter.split_documents(documents)
        self.vectorstore = self.load_vectorstore()
        self.vectorstore.add_documents(documents=texts)
        self.vectorstore.save_local(folder_path=os.path.join("database", self.name))
        return self.vectorstore

    def download_vectorstore(self) -> str:
        """Zip the persisted vectorstore folder into ``temp/vectorstore.zip``
        and return that path."""
        # Bug fix: the original crashed with FileNotFoundError when ``temp``
        # did not already exist (nothing else guarantees it here).
        os.makedirs("temp", exist_ok=True)
        store_dir = os.path.join("database", self.name)
        with ZipFile("temp/vectorstore.zip", "w") as zip_file:
            for root, _dirs, files in os.walk(store_dir):
                for file in files:
                    zip_file.write(os.path.join(root, file))
        return "temp/vectorstore.zip"
|