Spaces:
Running
Running
""" colbert_utils.py

Utilities for building (and using) a ColBERT (retrieval) model.

:author: Didier Guillevic
:email: [email protected]
:creation: 2024-12-21
"""
import logging

# Module-level logger. NOTE: basicConfig() runs at import time and sets up
# the root logger at INFO level for any process that imports this module.
# The statement order (logging configured before the ragatouille import) is
# kept exactly as in the original file.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

from ragatouille import RAGPretrainedModel
def build_colbert_model(
    documents: list[str],
    metadatas: list[dict[str, str]],
    pretrained_model: str = 'antoinelouis/colbert-xm',
    index_name: str = 'colbert_index',
    max_document_length: int = 180,
) -> RAGPretrainedModel:
    """Build a ColBERT model for retrieval.

    Loads the given pretrained model and indexes the documents with it.

    Args:
        documents: list of documents to index
        metadatas: list of metadata for each document (one entry per
            document, in the same order as ``documents``)
        pretrained_model: name of the pretrained model to use
        index_name: name of the index built with given documents
        max_document_length: forwarded unchanged to
            ``RAGPretrainedModel.index`` (presumably the maximum size of a
            document chunk, since ``split_documents=True`` -- confirm
            against the RAGatouille documentation). Defaults to 180, the
            value previously hard-coded here.

    Returns:
        the ColBERT retrieval model built with the given documents.

    Raises:
        ValueError: if ``metadatas`` is provided and does not contain
            exactly one entry per document.
    """
    # Fail fast, before the pretrained model is loaded: a length mismatch
    # means documents and metadata cannot be paired one-to-one.
    if metadatas is not None and len(metadatas) != len(documents):
        raise ValueError(
            f"metadatas must have one entry per document: got "
            f"{len(metadatas)} metadatas for {len(documents)} documents"
        )

    model = RAGPretrainedModel.from_pretrained(pretrained_model)
    model.index(
        collection=documents,
        # document_ids is intentionally not passed: there are no unique IDs
        # for the documents at the moment.
        document_metadatas=metadatas,
        index_name=index_name,
        max_document_length=max_document_length,
        split_documents=True,
        use_faiss=False,  # cannot get it to work...
    )
    return model