# Code adapted from https://huggingface.co./spaces/gokaygokay/Gemma-2-llamacpp
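"""Gradio chat demo for KhanomTanLLM Instruct GGUF models served with llama-cpp-python."""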
import spaces
import os
from llama_cpp import Llama
# from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
# from llama_cpp_agent.providers import LlamaCppPythonProvider
# from llama_cpp_agent.chat_history import BasicChatHistory
# from llama_cpp_agent.chat_history.messages import Roles
import gradio as gr
from huggingface_hub import hf_hub_download


# huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
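
# Download the quantized GGUF checkpoints into ./models at startup so the
# Llama constructor below can load them from a local path.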

hf_hub_download(
    repo_id="wannaphong/KhanomTanLLM-1B-Instruct-Q2_K-GGUF",
    filename="khanomtanllm-1b-instruct-q2_k.gguf",
    local_dir="./models"
)

hf_hub_download(
    repo_id="wannaphong/KhanomTanLLM-3B-Instruct-Q2_K-GGUF",
    filename="khanomtanllm-3b-instruct-q2_k.gguf",
    local_dir="./models"
)

# hf_hub_download(
#     repo_id="google/gemma-2-2b-it-GGUF",
#     filename="2b_it_v2.gguf",
#     local_dir="./models",
#     token=huggingface_token
# )



# Cache the loaded model and its filename across requests so the weights are
# only reloaded when the user switches models.
llm = None
llm_model = None
@spaces.GPU  # optionally @spaces.GPU(duration=120) to request a longer GPU slot
def respond(
    message,
    history: list[tuple[str, str]],
    model,
    system_message,
    max_tokens,
    temperature,
    min_p,
    top_p,
    top_k,
    repeat_penalty,
):
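    """Stream a chat completion from the selected local GGUF model.

    `history` is the list of (user, assistant) message pairs that
    gr.ChatInterface passes for earlier turns; the sampling arguments are
    forwarded directly to llama.cpp.
    """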
    # chat_template = MessagesFormatterType.MISTRAL

    global llm
    global llm_model
    
    # (Re)load the model only when the selected file changes between requests.
    if llm is None or llm_model != model:
        llm = Llama(
            model_path=f"models/{model}",
            flash_attn=True,
            # n_gpu_layers=81,
            n_batch=1024,
            n_ctx=2048,
        )
        llm_model = model

    # provider = LlamaCppPythonProvider(llm)

    # agent = LlamaCppAgent(
    #     provider,
    #     system_prompt=f"{system_message}",
    #     predefined_messages_formatter_type=chat_template,
    #     debug_output=True
    # )
    
    # settings = provider.get_provider_default_settings()
    # settings.temperature = temperature
    # settings.top_k = top_k
    # settings.top_p = top_p
    # settings.min_p = min_p
    # settings.max_tokens = max_tokens
    # settings.repeat_penalty = repeat_penalty
    # settings.stream = True

    # messages = BasicChatHistory()
    # Build the OpenAI-style message list: system prompt, previous turns, then
    # the new user message.
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    print(messages)

    stream = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        min_p=min_p,
        max_tokens=max_tokens,
        repeat_penalty=repeat_penalty,
        stream=True,
    )
    
    # Accumulate the streamed deltas and yield the running text so the UI
    # updates as tokens arrive.
    outputs = ""
    for chunk in stream:
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            outputs += delta["content"]
            yield outputs

description = """
"""

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Dropdown([
                'khanomtanllm-1b-instruct-q2_k.gguf',
                'khanomtanllm-3b-instruct-q2_k.gguf',
            ],
            value="khanomtanllm-1b-instruct-q2_k.gguf",
            label="Model"
        ),
        gr.Textbox(value="You are a helpful assistant.", label="System message", lines=6),
        gr.Slider(minimum=1, maximum=2048, value=2048, step=1, label="Max tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=2.0, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.7,
            step=0.05,
            label="min-p",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p",
        ),
        gr.Slider(
            minimum=0,
            maximum=100,
            value=40,
            step=1,
            label="Top-k",
        ),
        gr.Slider(
            minimum=0.0,
            maximum=2.0,
            value=1.1,
            step=0.1,
            label="Repetition penalty",
        ),
    ],
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
    submit_btn="Send",
    title="Chat with KhanomTanLLM using llama.cpp", 
    description=description,
    chatbot=gr.Chatbot(
        scale=1, 
        likeable=False,
        show_copy_button=True
    )
)

if __name__ == "__main__":
    demo.launch()
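
# Rough sketch of the environment this script assumes (package names only;
# the original Space does not pin versions):
#   pip install spaces gradio llama-cpp-python huggingface_hub
# Run locally with:
#   python app.py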