JairoDanielMT committed on
Commit
ea83a52
verified
1 Parent(s): 21c8aea

Upload 7 files

Files changed (7)
  1. Dockerfile +15 -0
  2. app.py +178 -0
  3. document_processor.py +36 -0
  4. embeddings.py +30 -0
  5. model.py +33 -0
  6. requirements.txt +18 -0
  7. vector_db.py +160 -0
Dockerfile ADDED
@@ -0,0 +1,15 @@
+ # Use a Python base image
+ FROM python:3.11
+ # Set the working directory
+ WORKDIR /code
+
+ # Copy the required files into the container
+ COPY ./requirements.txt /code/requirements.txt
+ RUN pip install --no-cache-dir -r /code/requirements.txt
+
+ COPY . .
+
+ RUN chmod -R 777 /code
+
+ # Command to run the application
+ CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,178 @@
+ from fastapi import FastAPI, HTTPException, Depends, File, UploadFile
+ from typing import List
+ from fastapi.responses import FileResponse
+ import urllib.parse
+ from embeddings import EmbeddingManager
+ from model import (
+     AddFilesRequest,
+     CreateVectorStoreRequest,
+     DeleteVectorStoreRequest,
+     DownloadVectorStoreRequest,
+     ListSourcesRequest,
+     SaveTempRequest,
+     SearchSimilarityRequest,
+ )
+ from vector_db import VectorStoreManager
+ import os
+ import shutil
+ from starlette.responses import RedirectResponse
+
+ app = FastAPI()
+
+
+ @app.get("/", include_in_schema=False)
+ async def redirect_to_docs():
+     return RedirectResponse(url="/docs")
+
+
+ # Create a single EmbeddingManager instance
+ embedding_manager = EmbeddingManager()
+ embeddings = embedding_manager.get_embeddings
+ path_docs = "docs"  # Temporary directory for uploaded files
+ path_db = "database"  # Directory where the vectorstores are stored
+
+
+ @app.post("/vectorstore/create", tags=["VectorStore"])
+ async def create_vectorstore(
+     create_request: CreateVectorStoreRequest = Depends(),  # Use the model as a dependency
+     files: List[UploadFile] = File(...),
+ ):
+     """Create a vectorstore from the uploaded documents."""
+     try:
+         if os.path.exists(path_docs):
+             shutil.rmtree(path_docs)
+         os.makedirs(path_docs)
+         for file in files:
+             file_path = os.path.join(path_docs, file.filename)
+             with open(file_path, "wb") as f:
+                 f.write(await file.read())
+         manager = VectorStoreManager(
+             path=path_docs, name=create_request.name, embeddings=embeddings
+         )
+         if manager.create_vectorstore():
+             shutil.rmtree(path_docs)
+             return {"message": "Vectorstore created successfully."}
+         shutil.rmtree(path_docs)
+         return {"message": "Failed to create vectorstore."}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @app.get("/vectorstore/search", tags=["Similarity Search"])
+ async def search_similarity(search_request: SearchSimilarityRequest = Depends()):
+     """Search for similar documents in the vectorstore."""
+     try:
+         manager = VectorStoreManager(
+             path=path_db,
+             name=search_request.name_database,
+             embeddings=embeddings,
+         )
+         search_request.query = str(urllib.parse.unquote(search_request.query))
+         result = manager.search_similarity(
+             query=search_request.query, fuente=search_request.fuente
+         )
+         return {"results": result}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @app.get("/vectorstore/sources", tags=["Sources"])
+ async def list_sources(list_request: ListSourcesRequest = Depends()):
+     try:
+         manager = VectorStoreManager(
+             path=path_db, name=list_request.nombre_db_vectorial, embeddings=embeddings
+         )
+         sources = manager.list_sources()
+         return {"sources": sources}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @app.post("/vectorstore/save_temp", tags=["Save Temp"])
+ async def save_text_to_file_temp(save_temp: SaveTempRequest = Depends()):
+     """Save the text of a specific source to a temporary file."""
+     try:
+         manager = VectorStoreManager(
+             path=path_db, name=save_temp.nombre_db_vectorial, embeddings=embeddings
+         )
+         saved = manager.save_text_to_file_temp(source=save_temp.fuente)
+         if saved:
+             return {"message": "Text saved to file successfully."}
+         else:
+             return {"message": "No text found to save."}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @app.post("/vectorstore/add_files", tags=["Add Files"])
+ async def add_files_vectorstore(
+     add_files_request: AddFilesRequest = Depends(), files: List[UploadFile] = File(...)
+ ):
+     try:
+         if os.path.exists(path_docs):
+             shutil.rmtree(path_docs)
+         os.makedirs(path_docs)
+
+         for file in files:
+             file_path = os.path.join(path_docs, file.filename)
+             with open(file_path, "wb") as f:
+                 f.write(await file.read())
+         manager = VectorStoreManager(
+             path=path_docs,
+             name=add_files_request.nombre_db_vectorial,
+             embeddings=embeddings,
+         )
+         if manager.add_files_vectorstore():
+             shutil.rmtree(path_docs)
+             return {"message": "Files added to vectorstore successfully."}
+         shutil.rmtree(path_docs)
+         return {"message": "Failed to add files to vectorstore."}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @app.delete("/vectorstore/delete", tags=["Delete VectorStore"])
+ async def delete_vectorstore(delete_request: DeleteVectorStoreRequest = Depends()):
+     """Delete the vectorstore and its data."""
+     try:
+         manager = VectorStoreManager(
+             path=path_db, name=delete_request.nombre_db_vectorial, embeddings=embeddings
+         )
+         if manager.delete_vectorstore():
+             return {"message": "Vectorstore deleted successfully."}
+         return {"message": "Failed to delete vectorstore."}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @app.post("/vectorstore/download", tags=["Download VectorStore"])
+ async def download_vectorstore(
+     download_request: DownloadVectorStoreRequest = Depends(),
+ ):
+     try:
+         manager = VectorStoreManager(
+             path=path_db,
+             name=download_request.nombre_db_vectorial,
+             embeddings=embeddings,
+         )
+         zip_path = manager.download_vectorstore()
+         return FileResponse(zip_path, filename="vectorstore.zip")
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ if __name__ == "__main__":
+     import os
+
+     try:
+         # Create all required folders if they do not exist
+         carpetas = [path_docs, path_db, "temp"]
+         for carpeta in carpetas:
+             if not os.path.exists(carpeta):
+                 os.makedirs(carpeta)
+         os.system("uvicorn app:app --port 7860 --host 0.0.0.0")
+     except KeyboardInterrupt:
+         print("Server stopped.")
+     except Exception as e:
+         print(e)
+         print("Failed to start server.")
document_processor.py ADDED
@@ -0,0 +1,36 @@
+ import os
+ from langchain_community.document_loaders import (
+     PyMuPDFLoader,
+     TextLoader,
+     Docx2txtLoader,
+     DirectoryLoader,
+ )
+
+ class DocumentProcessor:
+     def __init__(self, path: str):
+         self.path = path
+
+     def files_to_texts(self) -> list:
+         loaders_config = {
+             "*.pdf": PyMuPDFLoader,
+             "*.txt": (TextLoader, {"encoding": "utf-8"}),
+             "*.docx": Docx2txtLoader,
+             "*.doc": Docx2txtLoader,
+         }
+
+         loaders = [
+             DirectoryLoader(
+                 path=self.path,
+                 glob=glob,
+                 loader_cls=loader if isinstance(loader, type) else loader[0],
+                 loader_kwargs=loader[1] if isinstance(loader, tuple) else None,
+             )
+             for glob, loader in loaders_config.items()
+             if any(fname.endswith(glob[1:]) for fname in os.listdir(self.path))
+         ]
+
+         documents = []
+         for loader in loaders:
+             documents.extend(loader.load())
+
+         return documents
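
A usage sketch of DocumentProcessor, assuming a local "docs" directory that contains at least one supported file (.pdf, .txt, .docx or .doc); each loaded item is a LangChain Document with the originating file recorded in its metadata.

from document_processor import DocumentProcessor

processor = DocumentProcessor(path="docs")  # directory of uploaded files
documents = processor.files_to_texts()

print(f"Loaded {len(documents)} documents")
if documents:
    print(documents[0].metadata.get("source"))  # path of the file the text came from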
embeddings.py ADDED
@@ -0,0 +1,30 @@
+ from langchain_huggingface import HuggingFaceEmbeddings
+ import torch
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ class EmbeddingManager:
+     _instance = None
+
+     def __new__(cls, *args, **kwargs):
+         if cls._instance is None:
+             cls._instance = super().__new__(cls)
+             cls._instance.__initialized = False
+         return cls._instance
+
+     def __init__(self):
+         if self.__initialized:
+             return
+         self.__initialized = True
+         self.__embeddings = HuggingFaceEmbeddings(
+             model_name="jinaai/jina-embeddings-v2-base-es",
+             encode_kwargs={"normalize_embeddings": True},
+             model_kwargs={
+                 "device": device,
+             },
+         )
+
+     @property
+     def get_embeddings(self):
+         return self.__embeddings
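
A brief sketch of the singleton behaviour: repeated instantiation returns the same object, so the HuggingFace model is only loaded once, and the embeddings are exposed through the get_embeddings property. The sample query text is a placeholder.

from embeddings import EmbeddingManager

manager_a = EmbeddingManager()
manager_b = EmbeddingManager()
assert manager_a is manager_b  # same instance, model loaded once

embeddings = manager_a.get_embeddings  # property access, no parentheses
vector = embeddings.embed_query("texto de ejemplo")
print(len(vector))  # embedding dimensionality of jina-embeddings-v2-base-es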
model.py ADDED
@@ -0,0 +1,33 @@
+ from typing import Optional
+ from pydantic import BaseModel
+
+
+ class CreateVectorStoreRequest(BaseModel):
+     name: str
+
+
+ class SearchSimilarityRequest(BaseModel):
+     name_database: str
+     query: str
+     fuente: Optional[str] = None
+
+
+ class ListSourcesRequest(BaseModel):
+     nombre_db_vectorial: str
+
+
+ class SaveTempRequest(BaseModel):
+     nombre_db_vectorial: str
+
+
+ class AddFilesRequest(BaseModel):
+     nombre_db_vectorial: str
+
+
+ class DeleteVectorStoreRequest(BaseModel):
+     nombre_db_vectorial: str
+
+
+ class DownloadVectorStoreRequest(BaseModel):
+     nombre_db_vectorial: str
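
These request models are injected into the endpoints with Depends(), so their fields arrive as query parameters; on their own they behave like ordinary Pydantic models. A small sketch with placeholder values:

from model import SearchSimilarityRequest

# "fuente" is optional and defaults to None when no source filter is given
request = SearchSimilarityRequest(name_database="demo", query="installation steps")
print(request.name_database, request.query, request.fuente)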
requirements.txt ADDED
@@ -0,0 +1,18 @@
+ requests
+ fastapi
+ langchain
+ langchain-core
+ langchain-openai
+ langchain-community
+ langchain-huggingface
+ faiss-cpu
+ duckduckgo-search
+ uvicorn
+ einops
+ python-multipart
+ docx2txt
+ aiofiles
+ pdfplumber
+ python-docx
+ sentence-transformers
+ pymupdf
vector_db.py ADDED
@@ -0,0 +1,160 @@
+ import os
+ from typing import Optional, List
+ import shutil
+ from zipfile import ZipFile
+ from langchain_community.vectorstores import FAISS
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_core.embeddings import Embeddings
+
+ from document_processor import DocumentProcessor
+
+
+ class VectorStoreManager:
+     def __init__(self, path: str, name: str, embeddings: Embeddings):
+         """
+         Description: class for managing the vectorstore, including creating and deleting it
+         and searching for similar documents.
+
+         Parameters:
+         - path: str - path of the directory containing the documents (usually "database", the
+           directory where the databases are stored).
+         - name: str - name of the vectorstore (usually the name of the database that holds the documents).
+         - embeddings: Embeddings - embedding model for the vectorstore.
+
+         """
+         self.path = path
+         self.name = name
+         self.embeddings = embeddings
+         self.vectorstore = None
+
+     def create_vectorstore(self) -> bool:
+         documents = DocumentProcessor(self.path).files_to_texts()
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=1000, chunk_overlap=200, length_function=len
+         )
+         texts = text_splitter.split_documents(documents)
+         self.vectorstore = FAISS.from_documents(
+             documents=texts, embedding=self.embeddings
+         )
+         base_de_datos_dir = os.path.join("database", self.name)
+         self.vectorstore.save_local(folder_path=base_de_datos_dir)
+         return True
+
+     def delete_vectorstore(self) -> bool:
+         try:
+             shutil.rmtree(f"database/{self.name}")
+         except FileNotFoundError:
+             return False
+         return True
+
+     def search_similarity(self, query: str, fuente: Optional[str] = None) -> str:
+         """
+         Usage:
+         pass the query and, optionally, the source (fuente) to look up similar documents in the vectorstore.
+
+         Note: the vectorstore must be defined before the search can run.
+
+         Parameters:
+         query: str - query text.
+         fuente: str - source of the documents to search.
+
+         Returns:
+         str - similar documents.
+         """
+         if not self.vectorstore:
+             self.vectorstore = self.load_vectorstore()
+
+         if fuente:
+             filtro = {"source": fuente}
+             retriever = self.vectorstore.similarity_search(
+                 query=query, k=5, filter=filtro
+             )
+         else:
+             retriever = self.vectorstore.similarity_search(query=query, k=5)
+         busqueda = [
+             {
+                 "content": doc.page_content,
+                 "title": doc.metadata.get("title", None),
+                 "source": doc.metadata.get("source", None),
+             }
+             for doc in retriever
+         ]
+
+         return str(busqueda)
+
+     def list_sources(self) -> List[str]:
+         if not self.vectorstore:
+             self.vectorstore = self.load_vectorstore()
+
+         docstore_dict = self.vectorstore.docstore._dict
+         source_metadata = {}
+         for doc_id, document in docstore_dict.items():
+             source = document.metadata.get("source", None)
+             source_metadata[doc_id] = source
+
+         return list(set(source_metadata.values()))
+
+     def extract_texts_by_source(self, source: str) -> List[str]:
+         if not self.vectorstore:
+             self.vectorstore = self.load_vectorstore()
+
+         docstore_dict = self.vectorstore.docstore._dict
+         texts = []
+         for document in docstore_dict.values():
+             source_doc = document.metadata.get("source", None)
+             if source_doc == source:
+                 texts.append(document.page_content)
+         return texts
+
+     def save_text_to_file_temp(self, source: str) -> bool:
+         texts = self.extract_texts_by_source(source)
+         carpeta = "temp"
+         target_source_safe = source.replace("\\", "_").replace("/", "_")
+         file_path = os.path.join(carpeta, target_source_safe + ".txt")
+
+         try:
+             if os.path.exists(carpeta):
+                 shutil.rmtree(carpeta)
+             os.makedirs(carpeta)
+
+             with open(file_path, "w", encoding="utf-8") as file:
+                 for text in texts:
+                     file.write(text)
+                     file.write("\n")
+             return True
+         except Exception:
+             return False
+
+     def load_vectorstore(self) -> FAISS:
+         return FAISS.load_local(
+             folder_path=os.path.join("database", self.name),
+             embeddings=self.embeddings,
+             allow_dangerous_deserialization=True,
+         )
+
+     def add_files_vectorstore(self) -> Optional[FAISS]:
+         temp_folder = "docs"
+         if not os.path.exists(temp_folder):
+             os.makedirs(temp_folder)
+             return None
+
+         documents = DocumentProcessor(temp_folder).files_to_texts()
+         if not documents:
+             return None
+
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=1000, chunk_overlap=200, length_function=len
+         )
+         texts = text_splitter.split_documents(documents)
+         self.vectorstore = self.load_vectorstore()
+         self.vectorstore.add_documents(documents=texts)
+         self.vectorstore.save_local(folder_path=os.path.join("database", self.name))
+         return self.vectorstore
+
+     def download_vectorstore(self):
+         # Build a zip of the vectorstore folder inside the temp folder and return its path
+         with ZipFile("temp/vectorstore.zip", "w") as zipf:
+             for root, dirs, files in os.walk(f"database/{self.name}"):
+                 for file in files:
+                     zipf.write(os.path.join(root, file))
+         return "temp/vectorstore.zip"