File size: 6,015 Bytes
ea83a52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import os
from typing import Optional, List
import shutil
from zipfile import ZipFile
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.embeddings import Embeddings

from document_processor import DocumentProcessor


class VectorStoreManager:
    """Manage a FAISS vectorstore: create, delete, update, and query it.

    Each vectorstore is persisted on disk under ``database/<name>``; texts
    extracted for download are staged under ``temp/``.
    """

    def __init__(self, path: str, name: str, embeddings: Embeddings):
        """Initialize the manager.

        Parameters:
        - path: directory containing the source documents (usually the
          "database" directory where the databases are stored).
        - name: name of the vectorstore (usually the name of the database
          that contains the documents).
        - embeddings: embedding model used for the vectorstore.
        """
        self.path = path
        self.name = name
        self.embeddings = embeddings
        self.vectorstore = None  # FAISS index, loaded lazily on first use

    def _store_dir(self) -> str:
        """Return the on-disk directory of this vectorstore."""
        return os.path.join("database", self.name)

    @staticmethod
    def _split_documents(documents):
        """Split raw documents into 1000-char chunks with 200-char overlap."""
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200, length_function=len
        )
        return splitter.split_documents(documents)

    def create_vectorstore(self) -> bool:
        """Build a FAISS index from the documents in ``self.path`` and persist it.

        Returns True on completion.
        """
        documents = DocumentProcessor(self.path).files_to_texts()
        texts = self._split_documents(documents)
        self.vectorstore = FAISS.from_documents(
            documents=texts, embedding=self.embeddings
        )
        self.vectorstore.save_local(folder_path=self._store_dir())
        return True

    def delete_vectorstore(self) -> bool:
        """Delete the persisted vectorstore directory.

        Returns False when the store did not exist, True otherwise.
        """
        try:
            shutil.rmtree(self._store_dir())
        except FileNotFoundError:
            return False
        return True

    def search_similarity(self, query: str, fuente: Optional[str] = None) -> str:
        """Return the 5 documents most similar to *query*.

        Loads the vectorstore from disk if it is not in memory yet.

        Parameters:
        - query: query text.
        - fuente: optional source filter; when given, only documents whose
          "source" metadata matches are searched.

        Returns:
        - str representation of a list of dicts with "content", "title" and
          "source" keys.
        """
        if not self.vectorstore:
            self.vectorstore = self.load_vectorstore()

        if fuente:
            documentos = self.vectorstore.similarity_search(
                query=query, k=5, filter={"source": fuente}
            )
        else:
            documentos = self.vectorstore.similarity_search(query=query, k=5)

        busqueda = [
            {
                "content": doc.page_content,
                "title": doc.metadata.get("title", None),
                "source": doc.metadata.get("source", None),
            }
            for doc in documentos
        ]
        return str(busqueda)

    def list_sources(self) -> List[str]:
        """Return the distinct "source" metadata values present in the store."""
        if not self.vectorstore:
            self.vectorstore = self.load_vectorstore()

        # docstore._dict maps doc ids to Documents; dedupe their sources.
        return list(
            {
                document.metadata.get("source", None)
                for document in self.vectorstore.docstore._dict.values()
            }
        )

    def extract_texts_by_source(self, source: str) -> List[str]:
        """Return the page contents of every chunk whose source equals *source*."""
        if not self.vectorstore:
            self.vectorstore = self.load_vectorstore()

        return [
            document.page_content
            for document in self.vectorstore.docstore._dict.values()
            if document.metadata.get("source", None) == source
        ]

    def save_text_to_file_temp(self, source: str) -> bool:
        """Dump all texts of *source* into ``temp/<sanitized-source>.txt``.

        The ``temp`` directory is wiped and recreated on every call.
        Returns True on success, False on any failure (best-effort contract).
        """
        texts = self.extract_texts_by_source(source)
        carpeta = "temp"
        # Source values may be paths; flatten separators into a safe filename.
        target_source_safe = source.replace("\\", "_").replace("/", "_")
        file_path = os.path.join(carpeta, target_source_safe + ".txt")

        try:
            if os.path.exists(carpeta):
                shutil.rmtree(carpeta)
            os.makedirs(carpeta)

            with open(file_path, "w", encoding="utf-8") as file:
                file.writelines(text + "\n" for text in texts)
            return True
        except Exception:
            # Deliberately broad: callers only consume the success flag.
            return False

    def load_vectorstore(self) -> FAISS:
        """Load the persisted FAISS index for ``self.name`` from disk."""
        return FAISS.load_local(
            folder_path=self._store_dir(),
            embeddings=self.embeddings,
            # Required by langchain to unpickle a locally-saved index;
            # safe here because the store was written by this class.
            allow_dangerous_deserialization=True,
        )

    def add_files_vectorstore(self) -> Optional[FAISS]:
        """Index any documents found in ``docs/`` into the existing store.

        Returns the updated vectorstore, or None when the folder was missing
        (it gets created for next time) or contained no documents.
        """
        temp_folder = "docs"
        if not os.path.exists(temp_folder):
            os.makedirs(temp_folder)
            return None

        documents = DocumentProcessor(temp_folder).files_to_texts()
        if not documents:
            return None

        texts = self._split_documents(documents)
        self.vectorstore = self.load_vectorstore()
        self.vectorstore.add_documents(documents=texts)
        self.vectorstore.save_local(folder_path=self._store_dir())
        return self.vectorstore

    def download_vectorstore(self) -> str:
        """Zip the persisted vectorstore folder into ``temp/vectorstore.zip``.

        Returns the path of the created zip file.
        """
        # Bug fix: the original raised FileNotFoundError when "temp" did not
        # exist (every other temp-writing method creates it first).
        os.makedirs("temp", exist_ok=True)
        with ZipFile("temp/vectorstore.zip", "w") as archive:
            for root, _dirs, files in os.walk(self._store_dir()):
                for file in files:
                    archive.write(os.path.join(root, file))
        return "temp/vectorstore.zip"