"""Minimal retrieval demo: embed a query and two passages with
openbmb/MiniCPM-Embedding-Light via FlagEmbedding, then rank the
passages by dot-product similarity of the (normalized) embeddings."""
from FlagEmbedding import FlagModel

model = FlagModel(
    "openbmb/MiniCPM-Embedding-Light",
    query_instruction_for_retrieval="Query: ",
    pooling_method="mean",
    trust_remote_code=True,
    normalize_embeddings=True,
    use_fp16=True,
)
# NOTE: for faster inference you can patch __init__() of FlagEmbedding's
# BaseEmbedder class so the underlying model is loaded with fp16 weights
# and flash-attention 2, e.g.:
#
#   self.model = AutoModel.from_pretrained(
#       model_name_or_path,
#       trust_remote_code=trust_remote_code,
#       cache_dir=cache_dir,
#       torch_dtype=torch.float16,                # add: load weights in fp16
#       attn_implementation="flash_attention_2",  # add: enable flash_attention_2
#   )

queries = ["中国的首都是哪里?"]  # "What is the capital of China?"
passages = ["beijing", "shanghai"]  # "北京", "上海"

# Queries get the retrieval instruction prefix; corpus passages do not.
query_embeddings = model.encode_queries(queries)
passage_embeddings = model.encode_corpus(passages)

# Embeddings are normalized, so the dot product is cosine similarity.
similarity = query_embeddings @ passage_embeddings.T
print(similarity.tolist())  # [[0.40356746315956116, 0.36183440685272217]]