File size: 3,557 Bytes
bd0305e
 
015885c
bd0305e
 
 
d14c800
015885c
 
1019a35
d14c800
b373db2
6f60281
8a546d4
1019a35
 
26dd269
8a546d4
c58f313
 
14e5da3
c58f313
 
 
e4f8042
8a546d4
 
 
 
9b522e7
e4f8042
a33c796
14e5da3
8a546d4
c58f313
 
015885c
 
 
 
 
 
 
8a546d4
015885c
 
 
 
 
c58f313
14e5da3
 
015885c
c58f313
14e5da3
 
 
c58f313
 
1019a35
 
e4f8042
 
 
 
 
 
 
 
 
 
 
 
 
 
14e5da3
e4f8042
 
 
 
 
 
 
 
c58f313
b373db2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
import time
import numpy as np
from torch.nn import functional as F
import os
from threading import Thread

# Single checkpoint id shared by the model and tokenizer loads below.
_CHECKPOINT = "stabilityai/stablelm-2-zephyr-1_6b"

print("Starting to load the model to memory")
# Weights are loaded in float32; trust_remote_code lets the checkpoint's own
# modelling/tokenizer code be executed by transformers.
m = AutoModelForCausalLM.from_pretrained(
    _CHECKPOINT,
    torch_dtype=torch.float32,
    trust_remote_code=True,
)
tok = AutoTokenizer.from_pretrained(_CHECKPOINT, trust_remote_code=True)
# Text-generation pipeline wrapping the same model/tokenizer objects.
generator = pipeline("text-generation", model=m, tokenizer=tok)
print("Sucessfully loaded the model to the memory")


# Module-level placeholder string, initialised empty.
start_message = ""

def user(message, history):
    """Open a new conversation turn for ``message``.

    Args:
        message: Text the user just submitted.
        history: List of ``[user_text, assistant_text]`` pairs so far.

    Returns:
        A 2-tuple ``("", new_history)``: the empty string is the first
        element, and ``new_history`` is a new list equal to ``history`` plus
        one ``[message, ""]`` pair. ``history`` itself is not mutated.
    """
    pending_turn = [message, ""]
    extended_history = history + [pending_turn]
    return "", extended_history


def chat(message, history):
    """Stream the model's reply to ``message`` given the prior ``history``.

    Written as a generator so it can be used as a streaming chat handler:
    each yielded value is the full reply text produced so far.

    Args:
        message: The latest user message (not yet part of ``history``).
        history: Previous turns; each item is indexed as ``item[0]`` (user
            text) and ``item[1]`` (assistant text, skipped when ``None``).
            It is only read, never mutated.

    Yields:
        str: The accumulated assistant reply after each streamed chunk.

    Returns:
        The final reply text (available as ``StopIteration.value``).
    """
    # Rebuild the conversation in the role/content format expected by the
    # tokenizer's chat template.
    conversation = []
    for item in history:
        conversation.append({"role": "user", "content": item[0]})
        if item[1] is not None:
            conversation.append({"role": "assistant", "content": item[1]})
    conversation.append({"role": "user", "content": message})

    # add_generation_prompt=True makes the rendered prompt end with the
    # assistant header, so the model answers instead of continuing the
    # user's turn.
    prompt = tok.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True)
    model_inputs = tok([prompt], return_tensors="pt")

    streamer = TextIteratorStreamer(
        tok, timeout=10., skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,
        top_p=0.95,
        top_k=1000,
        temperature=0.75,
        num_beams=1,
    )
    # generate() blocks until completion, so it runs in a worker thread while
    # this generator drains the streamer.
    worker = Thread(target=m.generate, kwargs=generate_kwargs)
    worker.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        # Yield the reply text itself. ``history`` does not contain the
        # current turn, so writing into ``history[-1]`` would raise on the
        # first message and clobber the previous answer afterwards.
        yield partial_text
    return partial_text


# with gr.Blocks() as demo:
#     # history = gr.State([])
#     gr.Markdown("## Stable LM 2 Zephyr 1.6b")
#     gr.HTML('''<center><a href="https://huggingface.co./spaces/stabilityai/stablelm-2-1_6b-zephyr?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>Duplicate the Space to skip the queue and run in a private space</center>''')
#     chatbot = gr.Chatbot().style(height=500)
#     with gr.Row():
#         with gr.Column():
#             msg = gr.Textbox(label="Chat Message Box", placeholder="Chat Message Box",
#                              show_label=False).style(container=False)
#         with gr.Column():
#             with gr.Row():
#                 submit = gr.Button("Submit")
#                 stop = gr.Button("Stop")
#                 clear = gr.Button("Clear")

#     submit_event = msg.submit(fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False).then(
#         fn=chat, inputs=[chatbot], outputs=[chatbot], queue=True)
#     submit_click_event = submit.click(fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False).then(
#         fn=chat, inputs=[chatbot], outputs=[chatbot], queue=True)
#     stop.click(fn=None, inputs=None, outputs=None, cancels=[
#                submit_event, submit_click_event], queue=False)
#     clear.click(lambda: None, None, [chatbot], queue=False)
# Build the chat UI around the streaming ``chat`` handler defined in this
# file. (The previous ``fn=echo`` referenced a name that is never defined
# here and raised NameError at import time.)
demo = gr.ChatInterface(
    fn=chat,
    examples=["hello", "hola", "merhaba"],
    title="Stable LM 2 Zephyr 1.6b",
)
# Enable the request queue with the same limits as before, then start the app.
demo.queue(max_size=32, concurrency_count=2)
demo.launch()