File size: 4,001 Bytes
9aa0a43
 
 
 
 
 
 
 
 
 
1199506
9aa0a43
 
 
 
 
1199506
9aa0a43
 
 
 
 
 
 
 
 
 
 
 
 
1199506
9aa0a43
 
 
 
 
 
 
 
1199506
9aa0a43
 
 
 
 
 
1199506
9aa0a43
 
 
 
 
 
 
1199506
9aa0a43
1199506
9aa0a43
 
 
 
 
 
 
 
 
 
 
1199506
9aa0a43
 
 
 
 
 
 
1199506
9aa0a43
 
 
 
1199506
9aa0a43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1199506
 
9aa0a43
 
1199506
9aa0a43
1199506
9aa0a43
 
1199506
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import spaces
import json
import subprocess
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
import gradio as gr
from huggingface_hub import hf_hub_download

# Fetch the quantized GGUF weights into ./models when the module is imported,
# so that respond() can open them from the "models/" directory.
hf_hub_download(
    "fluently-lm/FluentlyLM-Prinum-Q4_K_M-GGUF",
    "fluentlylm-prinum-q4_k_m.gguf",
    local_dir="./models",
)

@spaces.GPU(duration=90)
def respond(
    message,
    history: list[tuple[str, str]],
    model,
    system_message,
    max_tokens,
    temperature,
    top_p,
    top_k,
    repeat_penalty,
):
    """Stream a chat reply to ``message`` from a local GGUF model.

    A ``Llama`` instance is built from ``models/<model>``, wrapped in a
    ``LlamaCppAgent`` and asked for a streaming response. Earlier turns from
    ``history`` are replayed into a fresh ``BasicChatHistory`` beforehand.

    Args:
        message: The new user message to answer.
        history: Previous turns; element 0 of each entry is added as the
            user message and element 1 as the assistant message.
        model: File name of the GGUF model inside the ``models`` directory.
        system_message: Text used as the agent's system prompt.
        max_tokens: Copied to the sampling settings' ``max_tokens``.
        temperature: Copied to the sampling settings' ``temperature``.
        top_p: Copied to the sampling settings' ``top_p``.
        top_k: Copied to the sampling settings' ``top_k``.
        repeat_penalty: Copied to the sampling settings' ``repeat_penalty``.

    Yields:
        The reply text accumulated so far, once per streamed chunk.
    """
    # NOTE(review): the model is constructed on every call; consider caching
    # it between requests if load time turns out to matter.
    llama = Llama(
        model_path=f"models/{model}",
        flash_attn=True,
        n_gpu_layers=81,
        n_batch=1024,
        n_ctx=8192,
    )
    llm_provider = LlamaCppPythonProvider(llama)

    # NOTE(review): confirm that the GEMMA_2 formatter matches the prompt
    # format this model was trained with.
    chat_agent = LlamaCppAgent(
        llm_provider,
        system_prompt=f"{system_message}",
        predefined_messages_formatter_type=MessagesFormatterType.GEMMA_2,
        debug_output=True,
    )

    # Start from the provider defaults and overlay the values from the UI.
    sampling = llm_provider.get_provider_default_settings()
    overrides = {
        "temperature": temperature,
        "top_k": top_k,
        "top_p": top_p,
        "max_tokens": max_tokens,
        "repeat_penalty": repeat_penalty,
        "stream": True,
    }
    for field_name, value in overrides.items():
        setattr(sampling, field_name, value)

    # Replay every earlier turn as a user message followed by the assistant's
    # answer.
    past_turns = BasicChatHistory()
    for turn in history:
        for role, text in ((Roles.user, turn[0]), (Roles.assistant, turn[1])):
            past_turns.add_message({'role': role, 'content': text})

    chunks = chat_agent.get_chat_response(
        message,
        llm_sampling_settings=sampling,
        chat_history=past_turns,
        returns_streaming_generator=True,
        print_output=False,
    )

    # Yield the growing reply so the caller always receives the full text
    # produced so far rather than only the latest chunk.
    partial_reply = ""
    for chunk in chunks:
        partial_reply = partial_reply + chunk
        yield partial_reply

def create_interface(model_name, description):
    """Build the Gradio chat interface that drives ``respond``.

    The additional inputs are listed in the same order as the parameters of
    ``respond`` that follow ``message`` and ``history``: model, system
    message, max tokens, temperature, top-p, top-k and repetition penalty.

    Args:
        model_name: GGUF file name shown in the read-only "Model" box and
            used as the interface title.
        description: Text/HTML shown as the interface description.

    Returns:
        The configured ``gr.ChatInterface``.
    """
    additional_inputs = [
        gr.Textbox(value=model_name, label="Model", interactive=False),
        gr.Textbox(value="You are Fluently, a helpful assistant. You should think step-by-step. You should answer all question by prioritizing the principle of openness", label="System message"),
        gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p",
        ),
        gr.Slider(
            minimum=0,
            maximum=100,
            value=40,
            step=1,
            label="Top-k",
        ),
        gr.Slider(
            minimum=0.0,
            maximum=2.0,
            value=1.1,
            step=0.1,
            label="Repetition penalty",
        ),
    ]

    # Each inner list is ONE example row whose values map positionally onto
    # [message, *additional_inputs]. Putting all four prompts in a single row
    # would feed prompts 2-4 into the Model, System message and Max tokens
    # inputs, so every prompt gets its own single-element row instead.
    examples = [
        ["Hi! How are you?"],
        ["Write a short story about a scary island."],
        ["Prove that the force of gravity applies to all bodies in the Universe."],
        ["Give examples of how a quantum computer works."],
    ]

    # NOTE(review): retry_btn/undo_btn/clear_btn and Chatbot(likeable=...)
    # look like Gradio 4.x arguments - confirm the pinned Gradio version
    # before upgrading.
    return gr.ChatInterface(
        respond,
        additional_inputs=additional_inputs,
        retry_btn="Retry",
        undo_btn="Undo",
        clear_btn="Clear",
        submit_btn="Send",
        title=model_name,
        description=description,
        examples=examples,
        chatbot=gr.Chatbot(
            label=None,
            scale=1,
            likeable=True,
            show_copy_button=True,
        ),
    )

# Header shown above the chat. The opening <h2> tag is closed properly and the
# standard <b> element is used (HTML has no <bold> tag).
description = """<h2 align="center"><b>FluentlyLM Prinum</b> Demo</h2>"""
interface = create_interface('fluentlylm-prinum-q4_k_m.gguf', description)

# Host the chat interface inside a Blocks container exposed as `demo`.
with gr.Blocks() as demo:
    interface.render()

if __name__ == "__main__":
    demo.launch()