import os import random import math from colbert.utils.runs import Run from colbert.utils.parser import Arguments from colbert.indexing.faiss import index_faiss from colbert.indexing.loaders import load_doclens def main(): random.seed(12345) parser = Arguments(description='Faiss indexing for end-to-end retrieval with ColBERT.') parser.add_index_use_input() parser.add_argument('--sample', dest='sample', default=None, type=float) parser.add_argument('--slices', dest='slices', default=1, type=int) args = parser.parse() assert args.slices >= 1 assert args.sample is None or (0.0 < args.sample < 1.0), args.sample with Run.context(): args.index_path = os.path.join(args.index_root, args.index_name) assert os.path.exists(args.index_path), args.index_path num_embeddings = sum(load_doclens(args.index_path)) print("#> num_embeddings =", num_embeddings) if args.partitions is None: args.partitions = 1 << math.ceil(math.log2(8 * math.sqrt(num_embeddings))) print('\n\n') Run.warn("You did not specify --partitions!") Run.warn("Default computation chooses", args.partitions, "partitions (for {} embeddings)".format(num_embeddings)) print('\n\n') index_faiss(args) if __name__ == "__main__": main()