import os
import time

import torch
import gradio as gr
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# Disable tokenizer parallelism (avoids fork warnings) and let the CUDA allocator
# grow segments on demand, which reduces fragmentation during long generations
os.environ["TOKENIZERS_PARALLELISM"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Global variables for model and tokenizer
model = None
tokenizer = None

def get_gpu_memory():
    return torch.cuda.memory_allocated() / 1024 / 1024  # Convert to MiB

class TorchTracemalloc:
    """Track the peak CUDA memory allocated inside a `with` block, in MiB."""

    def __init__(self):
        self.begin = 0
        self.peak = 0

    def __enter__(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()
        self.begin = get_gpu_memory()
        return self

    def __exit__(self, *exc):
        torch.cuda.synchronize()
        self.peak = (
            torch.cuda.max_memory_allocated() / 1024 / 1024
        )  # Convert to MiB

    def consumed(self):
        return self.peak - self.begin
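
# Illustrative usage (not wired into the UI below): the peak memory consumed by a
# block of CUDA work can be read back via consumed() after the block exits.
#
#     with TorchTracemalloc() as tt:
#         output = model.generate(...)   # any CUDA work
#     print(f"Peak CUDA memory used: {tt.consumed():.1f} MiB")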

def load_model_and_tokenizer(model_name, dtype, kv_bits):
    global model, tokenizer
    if model is None or tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        special_tokens = {"pad_token": "<PAD>"}
        tokenizer.add_special_tokens(special_tokens)

        config = AutoConfig.from_pretrained(model_name)
        if kv_bits != "unquantized":
            quantizer_path = f"codebooks/{model_name.split('/')[-1]}_{kv_bits}bit.xmad"
            setattr(config, "quantizer_path", quantizer_path)

        # Map shorthand dtype names to torch dtypes; fall back to float32 if unknown
        dtype_map = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
        torch_dtype = dtype_map.get(dtype, getattr(torch, dtype, torch.float32))
        model = AutoModelForCausalLM.from_pretrained(
            model_name, config=config, torch_dtype=torch_dtype, device_map="auto"
        )

        # Resize the embedding matrix if the added <PAD> token grew the vocabulary
        if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
            model.resize_token_embeddings(len(tokenizer))

        tokenizer.padding_side = "left"
        model.config.pad_token_id = tokenizer.pad_token_id

    return model, tokenizer

# Load the model and tokenizer once at startup (1-bit quantized configuration)
model, tokenizer = load_model_and_tokenizer("NousResearch/Hermes-2-Theta-Llama-3-8B", "fp16", "1")
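# (Passing kv_bits="unquantized" would skip the codebook lookup and load the stock checkpoint.)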

def process_dialog(dialog, model, tokenizer, max_tokens, temperature):
    prompt = tokenizer.apply_chat_template(
        dialog, tokenize=False, add_generation_prompt=True
    )
    tokenized_input_prompt_ids = tokenizer(
        prompt, return_tensors="pt"
    ).input_ids.to(model.device)

    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

    # Time the generation and record peak CUDA memory via TorchTracemalloc
    with TorchTracemalloc() as tt:
        start_time = time.time()
        with torch.no_grad():
            token_ids_for_each_answer = model.generate(
                tokenized_input_prompt_ids,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
            )
        torch.cuda.synchronize()
        end_time = time.time()

    # Keep only the newly generated tokens (drop the echoed prompt)
    response = token_ids_for_each_answer[0][
        tokenized_input_prompt_ids.shape[-1] :
    ]
    cleaned_response = tokenizer.decode(
        response,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    return cleaned_response

def respond(message, history, system_message, max_tokens, temperature):
    # Rebuild the conversation as role/content messages for the chat template
    dialog = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            dialog.append({"role": "user", "content": user_msg})
        if assistant_msg:
            dialog.append({"role": "assistant", "content": assistant_msg})
    dialog.append({"role": "user", "content": message})

    # gr.ChatInterface manages the chat history itself, so only the reply string is returned
    return process_dialog(dialog, model, tokenizer, max_tokens, temperature)

# Initialize Gradio ChatInterface
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
    ],
    theme="default",
    title="1bit llama3 by xMAD.ai",
    description="The first industrial level 1 bit quantization Llama3, we can achieve 800 tokens per second on NVIDIA V100 and 1200 on NVIDIA A100, 90% cost down of your cloud hosting cost",
    css=".scrollable { height: 400px; overflow-y: auto; padding: 10px; border: 1px solid #ccc; }"
)

if __name__ == "__main__":
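    # launch() also accepts e.g. server_name="0.0.0.0" or share=True to expose the app beyond localhost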
    demo.launch()