""" https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/local.py https://github.com/awinml/llama-cpp-python-bindings python convert_hf_to_gguf.py --outtype f16 Qwen1.5-0.5B-Chat python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/ ./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "I believe the meaning of life is" -n 128 ./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -f prompt.txt -n 128 ./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "You are a helpful assistant" -cnv ## reference - https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/llms/llamacpp.py - https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/server.py - https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/app.py """ import json import copy import os from models.base_model import Simulator import llama_cpp # import llama_cpp.llama_tokenizer from transformers import AutoTokenizer from utils.logging_util import logger import config class Qwen2Simulator(Simulator): def __init__(self): self.hf_tokenizer = AutoTokenizer.from_pretrained( "/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/") local_path = "/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct-GGUF/qwen2-0_5b-instruct-fp16.gguf" if os.path.exists(local_path): self.llm = llama_cpp.Llama( # n_ctx, n_threads model_path=local_path, n_ctx=config.MAX_SEQUENCE_LENGTH, # # n_threads=None, # 默认会根据cpu数来设置 n_threads use_mlock=True, verbose=True, ) else: self.hf_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct") self.llm = llama_cpp.Llama.from_pretrained( repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF", filename="*fp16.gguf", n_ctx=config.MAX_SEQUENCE_LENGTH, use_mlock=True, verbose=False, ) logger.info(f"llm has been initialized: {self.llm}, " f"n_threads={self.llm.n_threads}, n_ctx={self.llm.n_ctx}, " f"env[CACHE]={os.environ.get('CACHE', None)}") self.generation_kwargs = dict( temperature=config.DEFAULT_TEMPERATURE, top_p=config.DEFAULT_TOP_P, top_k=config.DEFAULT_TOP_K, max_tokens=config.DEFAULT_MAX_TOKENS, repeat_penalty=1.1, # qwen2-0.5b-chat 有时内容生成结束没有<|im_end|>,直接跟 <|im_start|> stop=[ "<|im_end|>", "<|im_start|>", "<|endoftext|>", ], ) def tokenize(self, text): return self.llm.tokenize(text.encode("utf-8")) def generate_query(self, message, history_tokens, stream=True): """ """ # {% for message in messages %} # {% if loop.first and messages[0]['role'] != 'system' %} # {{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }} # {% endif %} # {{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}} # {% endfor %} # {% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", input_ids = history_tokens + self.tokenize( f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>\n<|im_start|>user\n" ) if stream: return self._stream_generate(input_ids) else: return self._generate(input_ids) def generate_response(self, message, history_tokens, stream=True): input_ids = history_tokens + self.tokenize( f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>\n<|im_start|>assistant\n" ) if stream: return self._stream_generate(input_ids) else: return self._generate(input_ids) def _stream_generate(self, input_ids): logger.info(f"generation_kwargs {self.generation_kwargs}") # self.llm.generate .set_cache 
        # .last_n_tokens_size  .reset  .ctx  ._ctx
        output = self.llm.create_completion(
            input_ids,
            stream=True,
            **self.generation_kwargs
        )
        generated_text = ""
        # TODO: check the finish_reason; if it is "length", shift the context window and continue generating.
        # TODO: also return the token ids.
        for out in output:
            stream = copy.deepcopy(out)
            if stream["choices"][0]["finish_reason"] is None:
                generated_text += stream["choices"][0]["text"]
                if "completion_text" in stream["choices"][0]:
                    # "completion_text" / "all_tokens" are non-standard fields (assumed to be
                    # provided by a customized llama_cpp build); fall back to the accumulated text otherwise.
                    yield stream["choices"][0]["completion_text"], stream["choices"][0]["all_tokens"]
                else:
                    logger.info("completion_text not found")
                    yield generated_text, None


bot = Qwen2Simulator()


if __name__ == "__main__":
    # messages = [
    #     {"role": "system", "content": "you are a helpful assistant"},
    #     {"role": "user", "content": "What is the capital of France?"}
    # ]
    # output = bot.generate_response(messages)
    # print(output)

    # generate_query expects the latest message plus the token ids of the preceding turns,
    # so the earlier messages are tokenized into history_tokens first.
    history = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "hi, what is your name"},
    ]
    history_tokens = []
    for msg in history:
        history_tokens += bot.tokenize(
            f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
        )
    last_message = {"role": "assistant", "content": "My name is Jordan"}
    print(list(bot.generate_query(last_message, history_tokens, stream=True)))
    print(bot.generate_query(last_message, history_tokens, stream=False))
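
    # Hedged follow-up sketch (illustrative only; the follow-up message below is not part of the
    # original demo): a generated user query can in turn be answered with generate_response,
    # again passing raw token ids rather than a message list so the chat template is never re-applied.
    response_history = history_tokens + bot.tokenize(
        "<|im_start|>assistant\nMy name is Jordan<|im_end|>\n"
    )
    next_user_message = {"role": "user", "content": "nice to meet you, Jordan"}
    for partial_text, _ in bot.generate_response(next_user_message, response_history, stream=True):
        print(partial_text)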