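"""Gradio Space that chats with two OpenAI-compatible model endpoints side by
side, streaming both responses in parallel so their outputs can be compared
(e.g. two quantization levels of the same model)."""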
import os
import re
import logging
import gradio as gr
import openai
from itertools import zip_longest
print(os.environ)  # debug aid: dumps the full environment (including API keys) to the logs
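# Two OpenAI-compatible endpoints are configured from environment variables and
# stashed as ad-hoc attributes on the openai module (these are not real openai
# settings; the per-request values are passed explicitly to Completion.create
# below). Note that both endpoints read the same OPENAI_API_KEY.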
openai.api_base1 = os.environ.get("OPENAI_API_BASE1")
openai.api_base2 = os.environ.get("OPENAI_API_BASE2")
openai.api_key1 = os.environ.get("OPENAI_API_KEY")
openai.api_key2 = os.environ.get("OPENAI_API_KEY")
openai.api_model1 = os.environ.get("OPENAI_API_MODEL1")
openai.api_model2 = os.environ.get("OPENAI_API_MODEL2")
BASE_SYSTEM_MESSAGE = ""  # prepended to the user-editable system message
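# Stream a completion from an OpenAI-compatible endpoint. top_k and
# repetition_penalty are not part of the official OpenAI Completion API, but
# OpenAI-compatible servers commonly accept them as extensions.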
def make_prediction(prompt, api_model, api_key, api_base, max_tokens=None, temperature=None, top_p=None, top_k=None, repetition_penalty=None):
    completion = openai.Completion.create(
        model=api_model,
        api_key=api_key,
        api_base=api_base,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        stream=True,
        stop=["</s>", "<|im_end|>"])
    for chunk in completion:
        yield chunk["choices"][0]["text"]
def clear_chat(chat_history_state, chat_message):
    chat_history_state = []
    chat_message = ''
    return chat_history_state, chat_message
def user(message, history):
    history = history or []
    # Append the user's message to the conversation history
    history.append([message, ""])
    return message, history
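# Mirror the user's message into both model histories; returning "" as the
# first output clears the textbox.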
def user_double(message, history1, history2):
    history1 = history1 or []
    history2 = history2 or []
    history1.append([message, ""])
    history2.append([message, ""])
    return "", history1, history2
def chat(api_model, history, system_message, max_tokens, temperature, top_p, top_k, repetition_penalty, api_key, api_base):
    history = history or []
    messages = BASE_SYSTEM_MESSAGE + system_message.strip() + "\n" + \
        "\n".join(["\n".join(["### Instruction:\n" + item[0] + "\n\n", "### Response:\n" + item[1] + "\n\n"])
                   for item in history])
    # strip the last `<|end_of_turn|>` from the messages
    #messages = messages.rstrip("<|end_of_turn|>")
    # remove the trailing space from the assistant turn; some models emit a ZWSP if a space is left
    messages = messages.rstrip()
    prediction = make_prediction(
        messages,
        api_model,
        api_key,
        api_base,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
    )
    for tokens in prediction:
        # split each streamed chunk at whitespace boundaries so partial words render smoothly
        tokens = re.findall(r'(.*?)(\s|$)', tokens)
        for subtoken in tokens:
            subtoken = "".join(subtoken)
            # drop a leading "Response" if the model echoes the prompt header
            if subtoken.startswith("Response"):
                subtoken = subtoken[len("Response"):]
            answer = subtoken
            history[-1][1] += answer
            # stream the partial response to the UI
            yield history, history, ""
def chat_double(history1, history2, system_message, max_tokens, temperature, top_p, top_k, repetition_penalty):
    gen1 = chat(openai.api_model1, history1, system_message, max_tokens, temperature, top_p, top_k, repetition_penalty, openai.api_key1, openai.api_base1)
    gen2 = chat(openai.api_model2, history2, system_message, max_tokens, temperature, top_p, top_k, repetition_penalty, openai.api_key2, openai.api_base2)
    # Default values used once one of the generators is exhausted
    latest_chatbot1_out, latest_chat_history_state1_out = [["", ""]], [["", ""]]
    latest_chatbot2_out, latest_chat_history_state2_out = [["", ""]], [["", ""]]
    for out1, out2 in zip_longest(gen1, gen2, fillvalue=None):
        if out1 is not None:  # None means gen1 is exhausted
            chatbot1_out, chat_history_state1_out, _ = out1
            latest_chatbot1_out, latest_chat_history_state1_out = chatbot1_out, chat_history_state1_out
        if out2 is not None:  # None means gen2 is exhausted
            chatbot2_out, chat_history_state2_out, _ = out2
            latest_chatbot2_out, latest_chat_history_state2_out = chatbot2_out, chat_history_state2_out
        yield latest_chatbot1_out, latest_chatbot2_out, latest_chat_history_state1_out, latest_chat_history_state2_out, ""
start_message = ""
CSS ="""
.contain { display: flex; flex-direction: column; }
.gradio-container { height: 100vh !important; }
#component-0 { height: 100%; }
#chatbot { flex-grow: 1; overflow: auto; resize: vertical; }
#chatbot1 { flex-grow: 1; overflow: auto; resize: vertical; }
#chatbot2 { flex-grow: 1; overflow: auto; resize: vertical; }
"""
with gr.Blocks(css=CSS) as demo:
    with gr.Row():
        with gr.Column():
            gr.Markdown(f"""
            ## This demo is a quantized GPU chatbot of [WizardCoder-Python-34B-V1.0-GGUF](https://huggingface.co./TheBloke/WizardCoder-Python-34B-V1.0-GGUF)
            It runs two different quantization levels in parallel for comparison. Best run at temperature 0.
            """)
    with gr.Row():
        gr.Markdown("# 🔍 WizardCoder-Python-34B-V1.0-GGUF Playground Space! 🔎")
    with gr.Row():
        with gr.Column():
            #chatbot = gr.Chatbot().style(height=500)
            chatbot1 = gr.Chatbot(label="Chat1: " + openai.api_model1, elem_id="chatbot1")
        with gr.Column():
            chatbot2 = gr.Chatbot(label="Chat2: " + openai.api_model2, elem_id="chatbot2")
    with gr.Row():
        message = gr.Textbox(
            label="What do you want to chat about?",
            placeholder="Ask me anything.",
            lines=3,
        )
    with gr.Row():
        submit = gr.Button(value="Send message", variant="secondary").style(full_width=True)
        clear = gr.Button(value="New topic", variant="secondary").style(full_width=False)
        stop = gr.Button(value="Stop", variant="secondary").style(full_width=False)
    with gr.Accordion("Show Model Parameters", open=False):
        with gr.Row():
            with gr.Column():
                max_tokens = gr.Slider(20, 4000, label="Max Tokens", step=20, value=2000)
                temperature = gr.Slider(0.0, 2.0, label="Temperature", step=0.1, value=0.0)
                top_p = gr.Slider(0.0, 1.0, label="Top P", step=0.02, value=1.0)
                top_k = gr.Slider(-1, 100, label="Top K", step=1, value=0)
                repetition_penalty = gr.Slider(0.0, 2.0, label="Repetition Penalty", step=0.05, value=1.1)
        system_msg = gr.Textbox(
            start_message, label="System Message", interactive=True, visible=True,
            placeholder="System prompt. Provide instructions which you want the model to remember.", lines=3)

    chat_history_state1 = gr.State()
    chat_history_state2 = gr.State()

    # "New topic" clears both histories, both chat panes, and the textbox
    clear.click(clear_chat, inputs=[chat_history_state1, message], outputs=[chat_history_state1, message], queue=False)
    clear.click(clear_chat, inputs=[chat_history_state2, message], outputs=[chat_history_state2, message], queue=False)
    clear.click(lambda: None, None, chatbot1, queue=False)
    clear.click(lambda: None, None, chatbot2, queue=False)

    # Submit first records the message in both histories, then streams both replies
    submit_click_event = submit.click(
        fn=user_double, inputs=[message, chat_history_state1, chat_history_state2],
        outputs=[message, chat_history_state1, chat_history_state2], queue=True
    ).then(
        fn=chat_double,
        inputs=[chat_history_state1, chat_history_state2, system_msg, max_tokens, temperature, top_p, top_k, repetition_penalty],
        outputs=[chatbot1, chatbot2, chat_history_state1, chat_history_state2, message], queue=True
    )
    stop.click(fn=None, inputs=None, outputs=None, cancels=[submit_click_event], queue=False)

demo.queue(max_size=48, concurrency_count=8).launch(debug=True, server_name="0.0.0.0", server_port=7860)
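# Example local run (illustrative values only; any OpenAI-compatible server
# exposing a /v1/completions endpoint should work, and the model names below
# are assumptions, not the Space's actual deployment):
#   export OPENAI_API_BASE1=http://localhost:8000/v1
#   export OPENAI_API_BASE2=http://localhost:8001/v1
#   export OPENAI_API_KEY=sk-...
#   export OPENAI_API_MODEL1=wizardcoder-python-34b-v1.0.Q4_K_M
#   export OPENAI_API_MODEL2=wizardcoder-python-34b-v1.0.Q8_0
#   python app.py   # assuming this script is saved as app.py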