Spaces:
Sleeping
Sleeping
thanhtung09t2
committed on
Commit
•
bcc5b22
1
Parent(s):
0859f1c
Upload vector_index
Browse files
api/vector_index/base.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Intermediary class that handles creating a VectorIndex from a database
|
2 |
+
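# Takes as parameters: the database type and the name of the embedding_model
# NOTE(review): get_vector_index below accepts db_name and force_new only - confirm which is intended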
|
3 |
+
from api.vector_index import chroma, milvus
|
4 |
+
|
5 |
+
def get_vector_index(db_name, force_new = False):
    """Return the vector index for the requested database backend.

    Args:
        db_name: Backend identifier; "chroma" and "milvus" are recognised.
        force_new: Forwarded unchanged to the backend's index builder.

    Returns:
        Whatever the selected backend builder returns.

    Raises:
        NotImplementedError: If ``db_name`` is not a recognised backend.
    """
    # The builder attribute is looked up only for the backend actually
    # requested, so an unsupported name never touches either module.
    if db_name == "chroma":
        build_index = chroma.ChromaVectorIndex
    elif db_name == "milvus":
        build_index = milvus.MilvusVectorIndex
    else:
        raise NotImplementedError("This type of index is not yet supported")
    return build_index(force_new)
|
api/vector_index/chroma.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import chromadb
|
2 |
+
import os
|
3 |
+
import shutil
|
4 |
+
|
5 |
+
from llama_index.core import Settings, SimpleDirectoryReader, StorageContext, VectorStoreIndex
|
6 |
+
from llama_index.vector_stores.chroma import ChromaVectorStore
|
7 |
+
from llama_index.core.node_parser import SemanticSplitterNodeParser
|
8 |
+
from llama_index.core.node_parser import SentenceSplitter
|
9 |
+
# Project root, read from the ROOT_PATH environment variable; importing this
# module raises KeyError when the variable is not set.
root_path = os.environ['ROOT_PATH']

# Directory that documents are moved into once they have been handled.
DOCUMENTS_DIRECTORY = os.path.join(root_path, "doc", "loaded")
# Directory that is scanned for documents to embed.
INPUT_DIRECTORY = os.path.join(root_path, "doc", "input")
# On-disk location handed to the persistent Chroma client.
DB_DIRECTORY = os.path.join(root_path, "chromadb")
# Name of the Chroma collection holding the embeddings.
COLLECTION_NAME = "SmartAgri"
# Metadata passed when the collection is created; presumably selects cosine
# distance for the HNSW index - confirm against the Chroma documentation.
DB_METADATA = {"hnsw:space": "cosine"}
|
16 |
+
|
17 |
+
def move_files(src, dst):
    """Move every file under ``src`` into ``dst``, mirroring the directory tree.

    Sub-directories of ``src`` (including empty ones) are recreated under
    ``dst``. A file whose destination path already exists is left untouched
    in ``src`` so the copy already present in ``dst`` is kept.

    Args:
        src: Root directory to move files from. If it does not exist,
            nothing happens.
        dst: Root directory to move files into. It is created on demand.
    """
    for root, _subdirs, files in os.walk(src):
        # Mirror the directory currently being walked inside the destination.
        # Doing this for every walked directory (not only for sub-directories)
        # guarantees the destination root itself exists before any file is
        # moved into it.
        target_dir = os.path.normpath(os.path.join(dst, os.path.relpath(root, src)))
        os.makedirs(target_dir, exist_ok=True)

        for file_name in files:
            dst_file = os.path.join(target_dir, file_name)

            # If the destination file already exists, keep the one in 'dst'.
            if os.path.exists(dst_file):
                continue
            shutil.move(os.path.join(root, file_name), dst_file)
|
31 |
+
|
32 |
+
def ChromaVectorIndex(force_new = False):
    """Build or load the ``VectorStoreIndex`` backed by the Chroma collection.

    Documents found in ``INPUT_DIRECTORY`` are split, embedded with
    ``Settings.embed_model`` and stored in the ``COLLECTION_NAME`` collection
    of the persistent Chroma database at ``DB_DIRECTORY``. When there is
    nothing to embed, the index is loaded from the existing collection.
    Afterwards the files in ``INPUT_DIRECTORY`` are moved to
    ``DOCUMENTS_DIRECTORY``.

    Side effect: when documents are embedded, ``Settings.text_splitter`` is
    replaced with the ``SentenceSplitter`` used here.

    Args:
        force_new: When true, move all previously loaded documents back to
            the input directory and drop the existing collection so that
            everything is embedded again from scratch.

    Returns:
        The ``VectorStoreIndex`` built from the documents or loaded from the
        vector store.
    """
    chroma_client = chromadb.PersistentClient(path=DB_DIRECTORY)
    if force_new:
        # Move everything back to the input directory so that all documents
        # are embedded again from the beginning.
        move_files(DOCUMENTS_DIRECTORY, INPUT_DIRECTORY)
        # Drop the current collection, but only if it exists: deleting a
        # missing collection raises, which would break force_new on a fresh
        # database. list_collections() yields collection objects or plain
        # names depending on the chromadb version, so handle both.
        existing_names = {
            getattr(collection, "name", collection)
            for collection in chroma_client.list_collections()
        }
        if COLLECTION_NAME in existing_names:
            chroma_client.delete_collection(COLLECTION_NAME)

    try:
        reader = SimpleDirectoryReader(input_dir=INPUT_DIRECTORY, recursive=True)
        documents = reader.load_data()
    except ValueError:
        # Deliberate best-effort: presumably raised when the input directory
        # is missing or contains no files - fall back to the stored index.
        documents = None

    chroma_collection = chroma_client.get_or_create_collection(
        COLLECTION_NAME,
        metadata=DB_METADATA,
    )
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

    if documents:
        Settings.text_splitter = SentenceSplitter(chunk_size=1500, chunk_overlap=500)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents(
            documents,
            transformations=[Settings.text_splitter],
            storage_context=storage_context,
            embed_model=Settings.embed_model,
        )
    else:
        index = VectorStoreIndex.from_vector_store(
            vector_store=vector_store,
            embed_model=Settings.embed_model,
        )

    # Move the inputs to the loaded directory only after the index has been
    # built, so a failure during embedding does not leave un-indexed files
    # marked as loaded.
    move_files(INPUT_DIRECTORY, DOCUMENTS_DIRECTORY)
    return index
|
api/vector_index/vector_index_config.py
ADDED
File without changes
|