"""
colbert_utils.py

Utilities for building (and using) a ColBERT (retrieval) model.

:author: Didier Guillevic
:email: didier@guillevic.net
:creation: 2024-12-21
"""

import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

from ragatouille import RAGPretrainedModel


def build_colbert_model(
    documents: list[str],
    metadatas: list[dict[str, str]],
    pretrained_model: str = 'antoinelouis/colbert-xm',
    index_name: str = 'colbert_index',
    *,
    max_document_length: int = 180,
    split_documents: bool = True,
    use_faiss: bool = True,
) -> RAGPretrainedModel:
    """Build a ColBERT model for retrieval.

    Loads the pretrained model and indexes the given documents with it.

    Args:
        documents: list of documents to index
        metadatas: list of metadata for each document (one entry per
            document)
        pretrained_model: name of the pretrained model to use
        index_name: name of the index built with given documents
        max_document_length: value forwarded to ``model.index`` as
            ``max_document_length``
        split_documents: value forwarded to ``model.index`` as
            ``split_documents``
        use_faiss: value forwarded to ``model.index`` as ``use_faiss``;
            set to True if faiss is working properly in the current env

    Returns:
        the ColBERT retrieval model built with the given documents.

    Raises:
        ValueError: if ``metadatas`` is provided and its length differs
            from the length of ``documents``.
    """
    # Validate before the pretrained model is loaded, so that mismatched
    # inputs are reported up front rather than part-way through the build.
    if metadatas is not None and len(metadatas) != len(documents):
        raise ValueError(
            "metadatas must have the same length as documents "
            f"(got {len(metadatas)} metadatas for {len(documents)} documents)"
        )

    model = RAGPretrainedModel.from_pretrained(pretrained_model)

    # NOTE: document_ids is deliberately not passed: there are no unique IDs
    # for the documents at the moment.
    model.index(
        collection=documents,
        document_metadatas=metadatas,
        index_name=index_name,
        max_document_length=max_document_length,
        split_documents=split_documents,
        use_faiss=use_faiss,
    )

    return model