""" https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/local.py https://github.com/awinml/llama-cpp-python-bindings python convert_hf_to_gguf.py --outtype f16 Qwen1.5-0.5B-Chat python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/ ./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "I believe the meaning of life is" -n 128 ./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -f prompt.txt -n 128 ./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "You are a helpful assistant" -cnv ## reference - https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/llms/llamacpp.py - https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/server.py - https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/app.py """ import json import copy import os from models.base_model import Simulator import llama_cpp # import llama_cpp.llama_tokenizer from transformers import AutoTokenizer from utils.logging_util import logger import config class Qwen2Simulator(Simulator): def __init__(self): self.hf_tokenizer = AutoTokenizer.from_pretrained( "/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/") local_path = "/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct-GGUF/qwen2-0_5b-instruct-fp16.gguf" if os.path.exists(local_path): self.llm = llama_cpp.Llama( # n_ctx, n_threads model_path=local_path, n_ctx=config.MAX_SEQUENCE_LENGTH, # # n_threads=None, # 默认会根据cpu数来设置 n_threads use_mlock=True, verbose=True, ) else: self.hf_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct") self.llm = llama_cpp.Llama.from_pretrained( repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF", filename="*fp16.gguf", n_ctx=config.MAX_SEQUENCE_LENGTH, use_mlock=True, verbose=False, ) logger.info(f"llm has been initialized: {self.llm}, " f"n_threads={self.llm.n_threads}, n_ctx={self.llm.n_ctx}, " f"env[CACHE]={os.environ.get('CACHE', None)}") self.generation_kwargs = dict( temperature=config.DEFAULT_TEMPERATURE, top_p=config.DEFAULT_TOP_P, top_k=config.DEFAULT_TOP_K, max_tokens=config.DEFAULT_MAX_TOKENS, repeat_penalty=1.1, # qwen2-0.5b-chat 有时内容生成结束没有<|im_end|>,直接跟 <|im_start|> stop=[ "<|im_end|>", "<|im_start|>", "<|endoftext|>", ], ) def tokenize(self, text): return self.llm.tokenize(text.encode("utf-8")) def generate_query(self, message, history_tokens, stream=True): """ """ # {% for message in messages %} # {% if loop.first and messages[0]['role'] != 'system' %} # {{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }} # {% endif %} # {{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}} # {% endfor %} # {% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", input_ids = history_tokens + self.tokenize( f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>\n<|im_start|>user\n" ) if stream: return self._stream_generate(input_ids) else: return self._generate(input_ids) def generate_response(self, message, history_tokens, stream=True): input_ids = history_tokens + self.tokenize( f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>\n<|im_start|>assistant\n" ) if stream: return self._stream_generate(input_ids) else: return self._generate(input_ids) def _stream_generate(self, input_ids): logger.info(f"generation_kwargs {self.generation_kwargs}") # self.llm.generate .set_cache 
        # .last_n_tokens_size  .reset  .ctx  ._ctx
        output = self.llm.create_completion(
            input_ids,
            stream=True,
            **self.generation_kwargs
        )
        generated_text = ""
        # TODO: check the finish_reason; if it is "length", shift the context window and continue generating.
        # TODO: also return the token ids.
        for out in output:
            stream = copy.deepcopy(out)
            if stream["choices"][0]["finish_reason"] is None:
                generated_text += stream["choices"][0]["text"]
                if "completion_text" in stream["choices"][0]:
                    # "completion_text" / "all_tokens" are non-standard fields (assumed to be
                    # provided by a customized llama_cpp build); fall back to the accumulated text otherwise.
                    yield stream["choices"][0]["completion_text"], stream["choices"][0]["all_tokens"]
                else:
                    logger.info("completion_text not found")
                    yield generated_text, None


bot = Qwen2Simulator()


if __name__ == "__main__":
    # messages = [
    #     {"role": "system", "content": "you are a helpful assistant"},
    #     {"role": "user", "content": "What is the capital of France?"}
    # ]
    # output = bot.generate_response(messages)
    # print(output)

    # generate_query expects the latest message plus the token ids of the preceding turns,
    # so the earlier messages are tokenized into history_tokens first.
    history = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "hi, what is your name"},
    ]
    history_tokens = []
    for msg in history:
        history_tokens += bot.tokenize(
            f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
        )
    last_message = {"role": "assistant", "content": "My name is Jordan"}
    print(list(bot.generate_query(last_message, history_tokens, stream=True)))
    print(bot.generate_query(last_message, history_tokens, stream=False))
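
    # Hedged follow-up sketch (illustrative only; the follow-up message below is not part of the
    # original demo): a generated user query can in turn be answered with generate_response,
    # again passing raw token ids rather than a message list so the chat template is never re-applied.
    response_history = history_tokens + bot.tokenize(
        "<|im_start|>assistant\nMy name is Jordan<|im_end|>\n"
    )
    next_user_message = {"role": "user", "content": "nice to meet you, Jordan"}
    for partial_text, _ in bot.generate_response(next_user_message, response_history, stream=True):
        print(partial_text)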