Docs_QA_ColBERT_DSPy / colbert_utils.py
Didier Guillevic
Initial commit
1c18375
raw
history blame
1.29 kB
""" colbert_utils.py
Utilities for building (and using) a ColBERT (retrieval) model.
:author: Didier Guillevic
:email: [email protected]
:creation: 2024-12-21
"""
import logging
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Runs at import time: configures the root logger at INFO level
# (logging.basicConfig does nothing if the root logger already has handlers).
# NOTE(review): this call sits before the ragatouille import; presumably
# deliberate so that INFO messages are visible -- confirm before reordering.
logging.basicConfig(level=logging.INFO)
from ragatouille import RAGPretrainedModel
def build_colbert_model(
    documents: list[str],
    metadatas: list[dict[str, str]],
    pretrained_model: str = 'antoinelouis/colbert-xm',
    index_name: str = 'colbert_index',
    *,
    max_document_length: int = 180,
) -> RAGPretrainedModel:
    """Build a ColBERT model for retrieval.

    Loads ``pretrained_model`` and indexes ``documents`` (together with
    their ``metadatas``) under ``index_name``.

    Args:
        documents: list of documents to index
        metadatas: list of metadata for each document, one entry per
            document and in the same order as ``documents``
        pretrained_model: name of the pretrained model to use
        index_name: name of the index built with the given documents
        max_document_length: value forwarded to the ``index`` call as
            ``max_document_length`` (documents are indexed with
            ``split_documents=True``). Defaults to 180.

    Returns:
        the ColBERT retrieval model built with the given documents.

    Raises:
        ValueError: if ``metadatas`` does not contain exactly one entry
            per document.
    """
    # Fail fast: report a documents/metadatas mismatch before the
    # pretrained model is loaded.
    if metadatas is not None and len(metadatas) != len(documents):
        raise ValueError(
            f"metadatas must have one entry per document: got "
            f"{len(metadatas)} metadatas for {len(documents)} documents"
        )

    model = RAGPretrainedModel.from_pretrained(pretrained_model)
    model.index(
        collection=documents,
        # document_ids is deliberately not passed: the documents have no
        # unique IDs at the moment.
        document_metadatas=metadatas,
        index_name=index_name,
        max_document_length=max_document_length,
        split_documents=True,
        use_faiss=False,  # cannot get it to work...
    )
    return model