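"""Gradio Space that chats with two OpenAI-compatible model endpoints side by
side, streaming both responses in parallel so their outputs can be compared
(e.g. two quantization levels of the same model)."""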
import os
import re
import logging
import gradio as gr
import openai
from itertools import zip_longest
print(os.environ)  # debug aid: dumps the full environment (including API keys) to the logs
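# Two OpenAI-compatible endpoints are configured from environment variables and
# stashed as ad-hoc attributes on the openai module (these are not real openai
# settings; the per-request values are passed explicitly to Completion.create
# below). Note that both endpoints read the same OPENAI_API_KEY.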
openai.api_base1 = os.environ.get("OPENAI_API_BASE1")
openai.api_base2 = os.environ.get("OPENAI_API_BASE2")
openai.api_key1 = os.environ.get("OPENAI_API_KEY")
openai.api_key2 = os.environ.get("OPENAI_API_KEY")
openai.api_model1 = os.environ.get("OPENAI_API_MODEL1")
openai.api_model2 = os.environ.get("OPENAI_API_MODEL2")
BASE_SYSTEM_MESSAGE = ""  # prepended to the user-editable system message
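# Stream a completion from an OpenAI-compatible endpoint. top_k and
# repetition_penalty are not part of the official OpenAI Completion API, but
# OpenAI-compatible servers commonly accept them as extensions.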
def make_prediction(prompt, api_model, api_key, api_base, max_tokens=None, temperature=None, top_p=None, top_k=None, repetition_penalty=None):
    completion = openai.Completion.create(
        model=api_model,
        api_key=api_key,
        api_base=api_base,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        stream=True,
        stop=["</s>", "<|im_end|>"])
    for chunk in completion:
        yield chunk["choices"][0]["text"]
def clear_chat(chat_history_state, chat_message):
    chat_history_state = []
    chat_message = ''
    return chat_history_state, chat_message
def user(message, history):
    history = history or []
    # Append the user's message to the conversation history
    history.append([message, ""])
    return message, history
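# Mirror the user's message into both model histories; returning "" as the
# first output clears the textbox.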
def user_double(message, history1, history2):
    history1 = history1 or []
    history2 = history2 or []
    history1.append([message, ""])
    history2.append([message, ""])
    return "", history1, history2
def chat(api_model, history, system_message, max_tokens, temperature, top_p, top_k, repetition_penalty, api_key, api_base):
    history = history or []
    messages = BASE_SYSTEM_MESSAGE + system_message.strip() + "\n" + \
        "\n".join(["\n".join(["### Instruction:\n" + item[0] + "\n\n", "### Response:\n" + item[1] + "\n\n"])
                   for item in history])
    # strip the last `<|end_of_turn|>` from the messages
    #messages = messages.rstrip("<|end_of_turn|>")
    # remove the trailing space from the assistant turn; some models emit a ZWSP if a space is left
    messages = messages.rstrip()
    prediction = make_prediction(
        messages,
        api_model,
        api_key,
        api_base,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
    )
    for tokens in prediction:
        # split each streamed chunk at whitespace boundaries so partial words render smoothly
        tokens = re.findall(r'(.*?)(\s|$)', tokens)
        for subtoken in tokens:
            subtoken = "".join(subtoken)
            # drop a leading "Response" if the model echoes the prompt header
            if subtoken.startswith("Response"):
                subtoken = subtoken[len("Response"):]
            answer = subtoken
            history[-1][1] += answer
            # stream the partial response to the UI
            yield history, history, ""
def chat_double(history1, history2, system_message, max_tokens, temperature, top_p, top_k, repetition_penalty):
    gen1 = chat(openai.api_model1, history1, system_message, max_tokens, temperature, top_p, top_k, repetition_penalty, openai.api_key1, openai.api_base1)
    gen2 = chat(openai.api_model2, history2, system_message, max_tokens, temperature, top_p, top_k, repetition_penalty, openai.api_key2, openai.api_base2)
    # Default values used once one of the generators is exhausted
    latest_chatbot1_out, latest_chat_history_state1_out = [["", ""]], [["", ""]]
    latest_chatbot2_out, latest_chat_history_state2_out = [["", ""]], [["", ""]]
    for out1, out2 in zip_longest(gen1, gen2, fillvalue=None):
        if out1 is not None:  # None means gen1 is exhausted
            chatbot1_out, chat_history_state1_out, _ = out1
            latest_chatbot1_out, latest_chat_history_state1_out = chatbot1_out, chat_history_state1_out
        if out2 is not None:  # None means gen2 is exhausted
            chatbot2_out, chat_history_state2_out, _ = out2
            latest_chatbot2_out, latest_chat_history_state2_out = chatbot2_out, chat_history_state2_out
        yield latest_chatbot1_out, latest_chatbot2_out, latest_chat_history_state1_out, latest_chat_history_state2_out, ""
start_message = ""
CSS ="""
.contain { display: flex; flex-direction: column; }
.gradio-container { height: 100vh !important; }
#component-0 { height: 100%; }
#chatbot { flex-grow: 1; overflow: auto; resize: vertical; }
#chatbot1 { flex-grow: 1; overflow: auto; resize: vertical; }
#chatbot2 { flex-grow: 1; overflow: auto; resize: vertical; }
"""
with gr.Blocks(css=CSS) as demo:
    with gr.Row():
        with gr.Column():
            gr.Markdown(f"""
            ## This demo is a quantized GPU chatbot of [WizardCoder-Python-34B-V1.0-GGUF](https://huggingface.co./TheBloke/WizardCoder-Python-34B-V1.0-GGUF)
            It runs two different quantization levels in parallel for comparison. Best run at temperature 0.
            """)
    with gr.Row():
        gr.Markdown("# 🔍 WizardCoder-Python-34B-V1.0-GGUF Playground Space! 🔎")
    with gr.Row():
        with gr.Column():
            #chatbot = gr.Chatbot().style(height=500)
            chatbot1 = gr.Chatbot(label="Chat1: " + openai.api_model1, elem_id="chatbot1")
        with gr.Column():
            chatbot2 = gr.Chatbot(label="Chat2: " + openai.api_model2, elem_id="chatbot2")
    with gr.Row():
        message = gr.Textbox(
            label="What do you want to chat about?",
            placeholder="Ask me anything.",
            lines=3,
        )
    with gr.Row():
        submit = gr.Button(value="Send message", variant="secondary").style(full_width=True)
        clear = gr.Button(value="New topic", variant="secondary").style(full_width=False)
        stop = gr.Button(value="Stop", variant="secondary").style(full_width=False)
    with gr.Accordion("Show Model Parameters", open=False):
        with gr.Row():
            with gr.Column():
                max_tokens = gr.Slider(20, 4000, label="Max Tokens", step=20, value=2000)
                temperature = gr.Slider(0.0, 2.0, label="Temperature", step=0.1, value=0.0)
                top_p = gr.Slider(0.0, 1.0, label="Top P", step=0.02, value=1.0)
                top_k = gr.Slider(-1, 100, label="Top K", step=1, value=0)
                repetition_penalty = gr.Slider(0.0, 2.0, label="Repetition Penalty", step=0.05, value=1.1)
        system_msg = gr.Textbox(
            start_message, label="System Message", interactive=True, visible=True,
            placeholder="System prompt. Provide instructions which you want the model to remember.", lines=3)

    chat_history_state1 = gr.State()
    chat_history_state2 = gr.State()

    # "New topic" clears both histories, both chat panes, and the textbox
    clear.click(clear_chat, inputs=[chat_history_state1, message], outputs=[chat_history_state1, message], queue=False)
    clear.click(clear_chat, inputs=[chat_history_state2, message], outputs=[chat_history_state2, message], queue=False)
    clear.click(lambda: None, None, chatbot1, queue=False)
    clear.click(lambda: None, None, chatbot2, queue=False)

    # Submit first records the message in both histories, then streams both replies
    submit_click_event = submit.click(
        fn=user_double, inputs=[message, chat_history_state1, chat_history_state2],
        outputs=[message, chat_history_state1, chat_history_state2], queue=True
    ).then(
        fn=chat_double,
        inputs=[chat_history_state1, chat_history_state2, system_msg, max_tokens, temperature, top_p, top_k, repetition_penalty],
        outputs=[chatbot1, chatbot2, chat_history_state1, chat_history_state2, message], queue=True
    )
    stop.click(fn=None, inputs=None, outputs=None, cancels=[submit_click_event], queue=False)

demo.queue(max_size=48, concurrency_count=8).launch(debug=True, server_name="0.0.0.0", server_port=7860)
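# Example local run (illustrative values only; any OpenAI-compatible server
# exposing a /v1/completions endpoint should work, and the model names below
# are assumptions, not the Space's actual deployment):
#   export OPENAI_API_BASE1=http://localhost:8000/v1
#   export OPENAI_API_BASE2=http://localhost:8001/v1
#   export OPENAI_API_KEY=sk-...
#   export OPENAI_API_MODEL1=wizardcoder-python-34b-v1.0.Q4_K_M
#   export OPENAI_API_MODEL2=wizardcoder-python-34b-v1.0.Q8_0
#   python app.py   # assuming this script is saved as app.py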