Spaces:
Running
Running
import os | |
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection | |
# Access token for the Milvus endpoint, read from the environment so it is
# never committed to source. It is None when MILVUS_TOKEN is not set.
milvus_token = os.environ.get("MILVUS_TOKEN")

# Name of the collection that stores the RAG Bench chunk embeddings.
COLLECTION_NAME = "final_ragbench_document_embeddings"

# URI of the Milvus endpoint that connections are opened against.
MILVUS_CLOUD_URI = (
    "https://in03-7b4da1b7b588a88.serverless.gcp-us-west1.cloud.zilliz.com"
)
# Create the Milvus collection (schema plus vector index) into which the data is inserted.
def CreateMilvusDbSchema():
    """Connect to Milvus, declare the RAG Bench collection and index it.

    Steps, in order:
      1. Open the "default" connection with MILVUS_CLOUD_URI and
         milvus_token, then print the resolved connection address.
      2. Build a ten-field schema: a VARCHAR primary key, a VARCHAR document
         id, a 384-dimensional float vector, a VARCHAR dataset name and six
         FLOAT score columns.
      3. Instantiate the Collection named COLLECTION_NAME with that schema.
      4. Create an HNSW index with the COSINE metric on "chunk_embedding".

    Returns:
        The Collection object on which the index was created.

    NOTE(review): the success message is printed whenever Collection(...)
    and create_index(...) return without raising. How pymilvus treats a
    collection that already exists is not visible here -- confirm.
    """
    alias = "default"
    connections.connect(alias, uri=MILVUS_CLOUD_URI, token=milvus_token)
    print(connections.get_connection_addr(alias))

    def score_field(field_name):
        # All score columns share the same shape: a plain FLOAT field.
        return FieldSchema(name=field_name, dtype=DataType.FLOAT)

    # Field order is part of the schema, so the original order is kept.
    schema_fields = [
        # Primary key, supplied by the caller (auto_id is off).
        FieldSchema(
            name="chunk_doc_id",
            dtype=DataType.VARCHAR,
            max_length=350,
            is_primary=True,
            auto_id=False,
        ),
        # Document ID.
        FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=300),
        # Vector field holding the chunk embedding.
        FieldSchema(
            name="chunk_embedding",
            dtype=DataType.FLOAT_VECTOR,
            dim=384,
        ),
        *(
            score_field(field_name)
            for field_name in (
                "context_relevance",
                "context_utilization",
                "adherence",
            )
        ),
        # Dataset name.
        FieldSchema(
            name="dataset_name",
            dtype=DataType.VARCHAR,
            max_length=300,
        ),
        *(
            score_field(field_name)
            for field_name in (
                "relevance_score",
                "utilization_score",
                "completeness_score",
            )
        ),
    ]

    collection_schema = CollectionSchema(
        schema_fields,
        description="RAG Bench document vector collection",
    )
    rag_collection = Collection(name=COLLECTION_NAME, schema=collection_schema)

    # HNSW (Hierarchical Navigable Small World) index using cosine similarity.
    hnsw_build_params = {"M": 16, "efConstruction": 200}
    index_spec = {
        "index_type": "HNSW",
        "metric_type": "COSINE",
        "params": hnsw_build_params,
    }
    rag_collection.create_index("chunk_embedding", index_spec)

    print(f"Collection '{COLLECTION_NAME}' created successfully.")
    return rag_collection