wangrongsheng committed
Commit b62eec7
1 Parent(s): 2ff72d7

Upload 4 files

Files changed (4)
  1. app.py +235 -293
  2. model.py +74 -0
  3. requirements.txt +8 -9
  4. style.css +16 -0
app.py CHANGED
@@ -1,329 +1,271 @@
- """Credit to https://github.com/THUDM/ChatGLM2-6B/blob/main/web_demo.py while mistakes are mine."""
- # pylint: disable=broad-exception-caught, redefined-outer-name, missing-function-docstring, missing-module-docstring, too-many-arguments, line-too-long, invalid-name, redefined-builtin, redefined-argument-from-local
- # import gradio as gr
-
- # model_name = "models/THUDM/chatglm2-6b-int4"
- # gr.load(model_name).lauch()
-
- # %%writefile demo-4bit.py
-
- import os
- import time
- from textwrap import dedent
-
- import gradio as gr
- import mdtex2html
- import torch
- from loguru import logger
- from transformers import AutoModel, AutoTokenizer
-
- # fix timezone in Linux
- os.environ["TZ"] = "Asia/Shanghai"
- try:
-     time.tzset()  # type: ignore # pylint: disable=no-member
- except Exception:
-     # Windows
-     logger.warning("Windows, cant run time.tzset()")
-
- model_name = "wangrongsheng/IvyGPT-35"
- #model_name = "OpenMEDLab/PULSE-7bv5"
-
- RETRY_FLAG = False
-
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
- #model = AutoModel.from_pretrained(model_name, trust_remote_code=True).quantize(4).half().cuda()
- model = AutoModel.from_pretrained(model_name, trust_remote_code=True).half().cuda()
- model = model.eval()
-
- _ = """Override Chatbot.postprocess"""
-
-
- def postprocess(self, y):
-     if y is None:
-         return []
-     for i, (message, response) in enumerate(y):
-         y[i] = (
-             None if message is None else mdtex2html.convert((message)),
-             None if response is None else mdtex2html.convert(response),
-         )
-     return y
-
-
- gr.Chatbot.postprocess = postprocess
-
-
- def parse_text(text):
-     lines = text.split("\n")
-     lines = [line for line in lines if line != ""]
-     count = 0
-     for i, line in enumerate(lines):
-         if "```" in line:
-             count += 1
-             items = line.split("`")
-             if count % 2 == 1:
-                 lines[i] = f'<pre><code class="language-{items[-1]}">'
-             else:
-                 lines[i] = "<br></code></pre>"
-         else:
-             if i > 0:
-                 if count % 2 == 1:
-                     line = line.replace("`", r"\`")
-                     line = line.replace("<", "&lt;")
-                     line = line.replace(">", "&gt;")
-                     line = line.replace(" ", "&nbsp;")
-                     line = line.replace("*", "&ast;")
-                     line = line.replace("_", "&lowbar;")
-                     line = line.replace("-", "&#45;")
-                     line = line.replace(".", "&#46;")
-                     line = line.replace("!", "&#33;")
-                     line = line.replace("(", "&#40;")
-                     line = line.replace(")", "&#41;")
-                     line = line.replace("$", "&#36;")
-                 lines[i] = "<br>" + line
-     text = "".join(lines)
-     return text
-
-
- def predict(
-     RETRY_FLAG, input, chatbot, max_length, top_p, temperature, history, past_key_values
- ):
-     try:
-         chatbot.append((parse_text(input), ""))
-     except Exception as exc:
-         logger.error(exc)
-         logger.debug(f"{chatbot=}")
-         _ = """
-         if chatbot:
-             chatbot[-1] = (parse_text(input), str(exc))
-         yield chatbot, history, past_key_values
-         # """
-         yield chatbot, history, past_key_values
-     """
-     for response, history, past_key_values in model.stream_chat(
-         tokenizer,
-         input,
-         history,
-         past_key_values=past_key_values,
-         return_past_key_values=True,
-         max_length=max_length,
-         top_p=top_p,
-         temperature=temperature,
-     ):
-     """
-     for response, history in model.stream_chat(tokenizer, input, history, max_length=max_length, top_p=top_p,
-                                                temperature=temperature):
-         chatbot[-1] = (parse_text(input), parse_text(response))
-
-         yield chatbot, history, past_key_values
-
-
- def trans_api(input, max_length=40960, top_p=0.7, temperature=0.95):
-     if max_length < 10:
-         max_length = 40960
-     if top_p < 0.1 or top_p > 1:
-         top_p = 0.7
-     if temperature <= 0 or temperature > 1:
-         temperature = 0.01
-     try:
-         res, _ = model.chat(
-             tokenizer,
-             input,
-             history=[],
-             past_key_values=None,
-             max_length=max_length,
-             top_p=top_p,
-             temperature=temperature,
-         )
-         # logger.debug(f"{res=} \n{_=}")
-     except Exception as exc:
-         logger.error(f"{exc=}")
-         res = str(exc)
-
-     return res
-
-
- def reset_user_input():
-     return gr.update(value="")
-
-
- def reset_state():
-     return [], [], None
-
-
- # Delete last turn
- def delete_last_turn(chat, history):
-     if chat and history:
-         chat.pop(-1)
-         history.pop(-1)
-     return chat, history
-
-
- # Regenerate response
- def retry_last_answer(
-     user_input, chatbot, max_length, top_p, temperature, history, past_key_values
- ):
-     if chatbot and history:
-         # Removing the previous conversation from chat
-         chatbot.pop(-1)
-         # Setting up a flag to capture a retry
-         RETRY_FLAG = True
-         # Getting last message from user
-         user_input = history[-1][0]
-         # Removing bot response from the history
-         history.pop(-1)
-
-     yield from predict(
-         RETRY_FLAG,  # type: ignore
-         user_input,
-         chatbot,
-         max_length,
-         top_p,
-         temperature,
-         history,
-         past_key_values,
-     )
-
-
- with gr.Blocks(title="IvyGPT", theme=gr.themes.Soft(text_size="sm")) as demo:
188
- # gr.HTML("""<h1 align="center">ChatGLM2-6B-int4</h1>""")
189
- gr.HTML(
190
- """<h1 align="center">IvyGPT医疗对话大模型</h1>"""
191
- )
192
 
193
- with gr.Accordion("🎈 Info", open=False):
194
- _ = f"""
195
- ## 欢迎体验IvyGPT
196
 
197
- 近期在通用领域中出现的大语言模型(LLMs),例如ChatGPT,在遵循指令和产生类人响应方面表现出了显著的成功。然而,这样的大型语言模型并没有被广泛应用于医学领域,导致响应的准确性较差,无法提供关于医学诊断、药物等合理的建议。IvyGPT是一个医疗大语言模型,它在高质量的医学问答数据上进行了监督微调,并使用人类反馈的强化学习进行了训练。
198
-
199
- [模型下载地址](https://huggingface.co/wangrongsheng/)
200
- """
201
- gr.Markdown(dedent(_))
202
- chatbot = gr.Chatbot()
203
- with gr.Row():
204
- with gr.Column(scale=4):
205
- with gr.Column(scale=12):
206
- user_input = gr.Textbox(
207
- show_label=False,
208
- placeholder="Input...",
209
- ).style(container=False)
210
- RETRY_FLAG = gr.Checkbox(value=False, visible=False)
211
- with gr.Column(min_width=32, scale=1):
212
- with gr.Row():
213
- submitBtn = gr.Button("Submit", variant="primary")
214
- deleteBtn = gr.Button("删除最后一条对话", variant="secondary")
215
- retryBtn = gr.Button("重新生成Regenerate", variant="secondary")
216
- with gr.Column(scale=1):
217
- emptyBtn = gr.Button("Clear History")
218
- max_length = gr.Slider(
219
- 0,
220
- 32768,
221
- value=8192,
222
- step=1.0,
223
- label="Maximum length",
224
- interactive=True,
225
- )
226
- top_p = gr.Slider(
227
- 0, 1, value=0.85, step=0.01, label="Top P", interactive=True
228
- )
229
- temperature = gr.Slider(
230
- 0.01, 1, value=0.95, step=0.01, label="Temperature", interactive=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
 
233
- history = gr.State([])
234
- past_key_values = gr.State(None)
 
 
 
 
 
 
 
235
 
236
- user_input.submit(
237
- predict,
238
- [
239
- RETRY_FLAG,
240
- user_input,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  chatbot,
242
- max_length,
243
- top_p,
244
  temperature,
245
- history,
246
- past_key_values,
247
  ],
248
- [chatbot, history, past_key_values],
249
- show_progress="full",
250
  )
251
- submitBtn.click(
252
- predict,
253
- [
254
- RETRY_FLAG,
255
- user_input,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  chatbot,
257
- max_length,
258
- top_p,
259
  temperature,
260
- history,
261
- past_key_values,
262
  ],
263
- [chatbot, history, past_key_values],
264
- show_progress="full",
265
- api_name="predict",
266
- )
267
- submitBtn.click(reset_user_input, [], [user_input])
268
-
269
- emptyBtn.click(
270
- reset_state, outputs=[chatbot, history, past_key_values], show_progress="full"
271
  )
272
 
273
- retryBtn.click(
274
- retry_last_answer,
 
 
 
 
 
 
 
 
 
 
 
 
275
  inputs=[
276
- user_input,
277
  chatbot,
278
- max_length,
279
- top_p,
280
  temperature,
281
- history,
282
- past_key_values,
283
  ],
284
- # outputs = [chatbot, history, last_user_message, user_message]
285
- outputs=[chatbot, history, past_key_values],
286
  )
287
- deleteBtn.click(delete_last_turn, [chatbot, history], [chatbot, history])
288
-
289
- with gr.Accordion("Example inputs", open=True):
290
- examples = gr.Examples(
291
- examples=[
292
- ["熬夜对身体有什么危害? "],
293
- ["新冠肺炎怎么预防"],
294
- ["系统性红斑狼疮的危害和治疗方法是什么?"],
295
- ],
296
- inputs=[user_input],
297
- examples_per_page=50,
298
- )
299
 
300
- with gr.Accordion("For Chat/Translation API", open=False, visible=False):
301
- input_text = gr.Text()
302
- tr_btn = gr.Button("Go", variant="primary")
303
- out_text = gr.Text()
304
- tr_btn.click(
305
- trans_api,
306
- [input_text, max_length, top_p, temperature],
307
- out_text,
308
- # show_progress="full",
309
- api_name="tr",
310
- )
311
- _ = """
312
- input_text.submit(
313
- trans_api,
314
- [input_text, max_length, top_p, temperature],
315
- out_text,
316
- show_progress="full",
317
- api_name="tr1",
318
  )
319
- # """
320
 
321
- # demo.queue().launch(share=False, inbrowser=True)
322
- # demo.queue().launch(share=True, inbrowser=True, debug=True)
323
-
324
- # concurrency_count > 1 requires more memory, max_size: queue size
325
- # T4 medium: 30GB, model size: ~4G concurrency_count = 6
326
- # leave one for api access
327
- # reduce to 5 if OOM occurs to often
328
 
329
- demo.queue(concurrency_count=3, max_size=30).launch(debug=True)
 
+ from typing import Iterator
+
+ import gradio as gr
+ import torch
+
+ from model import get_input_token_length, run
+
+ DEFAULT_SYSTEM_PROMPT = """\
+ You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\
+ """
+ MAX_MAX_NEW_TOKENS = 2048
+ DEFAULT_MAX_NEW_TOKENS = 1024
+ MAX_INPUT_TOKEN_LENGTH = 4000
+
+ DESCRIPTION = """
+ # CareLlama-关怀羊驼
+
+ - CareLlama (关怀羊驼) is a medical large language model; it also brings together dozens of publicly available medical fine-tuning datasets and openly available medical LLMs to accelerate the development of medical LLMs.
+ - Medical LLM, Open Source Driven for a Healthy Future.
+ """
+
+ LICENSE = """
+ <p/>
+
+ ---
+ The resources of this project are for academic research only; commercial use is strictly prohibited. When using parts that involve third-party code, strictly follow the corresponding open-source licenses. Content generated by the model is affected by factors such as model computation, randomness, and loss of quantization precision, so this project cannot guarantee its accuracy. Even if the model's output conforms to medical facts, it must not be used as a basis for real medical diagnosis. This project accepts no legal liability for any content output by the model, nor for any losses that may arise from using the related resources and output results.
+ """
+
+ if not torch.cuda.is_available():
+     DESCRIPTION += '\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>'
+
+
+ def clear_and_save_textbox(message: str) -> tuple[str, str]:
+     return '', message
+
+
+ def display_input(message: str,
+                   history: list[tuple[str, str]]) -> list[tuple[str, str]]:
+     history.append((message, ''))
+     return history
+
+
+ def delete_prev_fn(
+         history: list[tuple[str, str]]) -> tuple[list[tuple[str, str]], str]:
+     try:
+         message, _ = history.pop()
+     except IndexError:
+         message = ''
+     return history, message or ''
+
+
+ def generate(
+     message: str,
+     history_with_input: list[tuple[str, str]],
+     system_prompt: str,
+     max_new_tokens: int,
+     temperature: float,
+     top_p: float,
+     top_k: int,
+ ) -> Iterator[list[tuple[str, str]]]:
+     if max_new_tokens > MAX_MAX_NEW_TOKENS:
+         raise ValueError
+
+     history = history_with_input[:-1]
+     generator = run(message, history, system_prompt, max_new_tokens, temperature, top_p, top_k)
+     try:
+         first_response = next(generator)
+         yield history + [(message, first_response)]
+     except StopIteration:
+         yield history + [(message, '')]
+     for response in generator:
+         yield history + [(message, response)]
+
+
+ def process_example(message: str) -> tuple[str, list[tuple[str, str]]]:
+     generator = generate(message, [], DEFAULT_SYSTEM_PROMPT, 1024, 1, 0.95, 50)
+     for x in generator:
+         pass
+     return '', x
+
+
+ def check_input_token_length(message: str, chat_history: list[tuple[str, str]], system_prompt: str) -> None:
+     input_token_length = get_input_token_length(message, chat_history, system_prompt)
+     if input_token_length > MAX_INPUT_TOKEN_LENGTH:
+         raise gr.Error(f'The accumulated input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH}). Clear your chat history and try again.')
+
+
+ with gr.Blocks(css='style.css') as demo:
+     gr.Markdown(DESCRIPTION)
+     gr.DuplicateButton(value='Duplicate Space for private use',
+                        elem_id='duplicate-button')
+
+     with gr.Group():
+         chatbot = gr.Chatbot(label='CareLlama')
+         with gr.Row():
+             textbox = gr.Textbox(
+                 container=False,
+                 show_label=False,
+                 placeholder='Enter your message...',
+                 scale=10,
              )
+             submit_button = gr.Button('Submit',
+                                       variant='primary',
+                                       scale=1,
+                                       min_width=0)
+     with gr.Row():
+         retry_button = gr.Button('🔄 Retry', variant='secondary')
+         undo_button = gr.Button('↩️ Undo', variant='secondary')
+         clear_button = gr.Button('🗑️ Clear', variant='secondary')
+
+     saved_input = gr.State()
+
+     with gr.Accordion(label='Advanced options', open=False):
+         system_prompt = gr.Textbox(label='System prompt',
+                                    value=DEFAULT_SYSTEM_PROMPT,
+                                    lines=6)
+         max_new_tokens = gr.Slider(
+             label='Max new tokens',
+             minimum=1,
+             maximum=MAX_MAX_NEW_TOKENS,
+             step=1,
+             value=DEFAULT_MAX_NEW_TOKENS,
+         )
+         temperature = gr.Slider(
+             label='Temperature',
+             minimum=0.1,
+             maximum=4.0,
+             step=0.1,
+             value=1.0,
+         )
+         top_p = gr.Slider(
+             label='Top-p (nucleus sampling)',
+             minimum=0.05,
+             maximum=1.0,
+             step=0.05,
+             value=0.95,
+         )
+         top_k = gr.Slider(
+             label='Top-k',
+             minimum=1,
+             maximum=1000,
+             step=1,
+             value=50,
+         )
+
+     gr.Examples(
+         examples=[
+             'Hello'
+         ],
+         inputs=textbox,
+         outputs=[textbox, chatbot],
+         fn=process_example,
+         cache_examples=True,
+     )
+
+     gr.Markdown(LICENSE)
+
+     textbox.submit(
+         fn=clear_and_save_textbox,
+         inputs=textbox,
+         outputs=[textbox, saved_input],
+         api_name=False,
+         queue=False,
+     ).then(
+         fn=display_input,
+         inputs=[saved_input, chatbot],
+         outputs=chatbot,
+         api_name=False,
+         queue=False,
+     ).then(
+         fn=check_input_token_length,
+         inputs=[saved_input, chatbot, system_prompt],
+         api_name=False,
+         queue=False,
+     ).success(
+         fn=generate,
+         inputs=[
+             saved_input,
              chatbot,
+             system_prompt,
+             max_new_tokens,
              temperature,
+             top_p,
+             top_k,
          ],
+         outputs=chatbot,
+         api_name=False,
      )
+
+     button_event_preprocess = submit_button.click(
+         fn=clear_and_save_textbox,
+         inputs=textbox,
+         outputs=[textbox, saved_input],
+         api_name=False,
+         queue=False,
+     ).then(
+         fn=display_input,
+         inputs=[saved_input, chatbot],
+         outputs=chatbot,
+         api_name=False,
+         queue=False,
+     ).then(
+         fn=check_input_token_length,
+         inputs=[saved_input, chatbot, system_prompt],
+         api_name=False,
+         queue=False,
+     ).success(
+         fn=generate,
+         inputs=[
+             saved_input,
              chatbot,
+             system_prompt,
+             max_new_tokens,
              temperature,
+             top_p,
+             top_k,
          ],
+         outputs=chatbot,
+         api_name=False,
      )

+     retry_button.click(
+         fn=delete_prev_fn,
+         inputs=chatbot,
+         outputs=[chatbot, saved_input],
+         api_name=False,
+         queue=False,
+     ).then(
+         fn=display_input,
+         inputs=[saved_input, chatbot],
+         outputs=chatbot,
+         api_name=False,
+         queue=False,
+     ).then(
+         fn=generate,
          inputs=[
+             saved_input,
              chatbot,
+             system_prompt,
+             max_new_tokens,
              temperature,
+             top_p,
+             top_k,
          ],
+         outputs=chatbot,
+         api_name=False,
      )
+
+     undo_button.click(
+         fn=delete_prev_fn,
+         inputs=chatbot,
+         outputs=[chatbot, saved_input],
+         api_name=False,
+         queue=False,
+     ).then(
+         fn=lambda x: x,
+         inputs=[saved_input],
+         outputs=textbox,
+         api_name=False,
+         queue=False,
      )

+     clear_button.click(
+         fn=lambda: ([], ''),
+         outputs=[chatbot, saved_input],
+         queue=False,
+         api_name=False,
+     )
+
+ demo.queue(max_size=20).launch()
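
Note on the pattern above: every user action in the new app.py runs as a chain of Gradio events. clear_and_save_textbox stashes the message and empties the box, display_input echoes the user turn into the chatbot, check_input_token_length raises gr.Error when the accumulated prompt is too long, and only .success() lets generation proceed. A minimal, self-contained sketch of that chain, with hypothetical stand-in helpers and assuming the gradio==3.37.0 API pinned in requirements.txt:

import gradio as gr

def save(message):
    # stand-in for clear_and_save_textbox: empty the box, stash the message
    return '', message

def show(message, history):
    # stand-in for display_input: echo the user turn with an empty reply
    return history + [(message, '')]

def check(message, history):
    # stand-in for check_input_token_length; raising gr.Error halts the chain
    if len(message) > 100:
        raise gr.Error('Input too long. Clear your chat history and try again.')

def reply(message, history):
    # stand-in for generate; the real app streams partial responses instead
    return history[:-1] + [(message, f'You said: {message}')]

with gr.Blocks() as sketch:
    chat = gr.Chatbot()
    box = gr.Textbox()
    saved = gr.State()
    box.submit(save, box, [box, saved], queue=False) \
       .then(show, [saved, chat], chat, queue=False) \
       .then(check, [saved, chat], queue=False) \
       .success(reply, [saved, chat], chat)

sketch.queue().launch()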
model.py ADDED
@@ -0,0 +1,74 @@
+ from threading import Thread
+ from typing import Iterator
+
+ import torch
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+
+ model_id = '../merge'
+
+ if torch.cuda.is_available():
+     config = AutoConfig.from_pretrained(model_id)
+     config.pretraining_tp = 1
+     model = AutoModelForCausalLM.from_pretrained(
+         model_id,
+         config=config,
+         torch_dtype=torch.float16,
+         load_in_4bit=True,
+         device_map='auto'
+     )
+ else:
+     model = None
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+
+ def get_prompt(message: str, chat_history: list[tuple[str, str]],
+                system_prompt: str) -> str:
+     texts = [f'<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n']
+     # The first user input is _not_ stripped
+     do_strip = False
+     for user_input, response in chat_history:
+         user_input = user_input.strip() if do_strip else user_input
+         do_strip = True
+         texts.append(f'{user_input} [/INST] {response.strip()} </s><s>[INST] ')
+     message = message.strip() if do_strip else message
+     texts.append(f'{message} [/INST]')
+     return ''.join(texts)
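
For reference, get_prompt above emits the standard Llama-2 chat template. An illustrative call with made-up strings (shown doctest-style) produces:

>>> get_prompt('How are you?', [('Hi', 'Hello! How can I help?')], 'Be brief.')
'<s>[INST] <<SYS>>\nBe brief.\n<</SYS>>\n\nHi [/INST] Hello! How can I help? </s><s>[INST] How are you? [/INST]'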
+
+
+ def get_input_token_length(message: str, chat_history: list[tuple[str, str]], system_prompt: str) -> int:
+     prompt = get_prompt(message, chat_history, system_prompt)
+     input_ids = tokenizer([prompt], return_tensors='np', add_special_tokens=False)['input_ids']
+     return input_ids.shape[-1]
+
+
+ def run(message: str,
+         chat_history: list[tuple[str, str]],
+         system_prompt: str,
+         max_new_tokens: int = 1024,
+         temperature: float = 0.8,
+         top_p: float = 0.95,
+         top_k: int = 50) -> Iterator[str]:
+     prompt = get_prompt(message, chat_history, system_prompt)
+     inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')
+
+     streamer = TextIteratorStreamer(tokenizer,
+                                     timeout=10.,
+                                     skip_prompt=True,
+                                     skip_special_tokens=True)
+     generate_kwargs = dict(
+         inputs,
+         streamer=streamer,
+         max_new_tokens=max_new_tokens,
+         do_sample=True,
+         top_p=top_p,
+         top_k=top_k,
+         temperature=temperature,
+         num_beams=1,
+     )
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
+     t.start()
+
+     outputs = []
+     for text in streamer:
+         outputs.append(text)
+         yield ''.join(outputs)
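
run() pushes model.generate onto a worker thread and yields the accumulating text from TextIteratorStreamer, so each yielded string is the full response so far. A minimal consumer sketch (hypothetical question; assumes a CUDA machine, since model stays None on CPU):

from model import run

answer = ''
for answer in run('What are the symptoms of the flu?',
                  chat_history=[],
                  system_prompt='You are a careful medical assistant.'):
    pass  # a UI would re-render the chat bubble with `answer` here
print(answer)  # the complete generated reply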
requirements.txt CHANGED
@@ -1,9 +1,8 @@
- protobuf
- transformers==4.30.2
- cpm_kernels
- torch>=2.0
- gradio
- mdtex2html
- sentencepiece
- accelerate
- loguru
+ accelerate==0.21.0
+ bitsandbytes==0.40.2
+ gradio==3.37.0
+ protobuf==3.20.3
+ scipy==1.11.1
+ sentencepiece==0.1.99
+ torch==2.0.1
+ transformers==4.31.0
 
style.css ADDED
@@ -0,0 +1,16 @@
+ h1 {
+   text-align: center;
+ }
+
+ #duplicate-button {
+   margin: auto;
+   color: white;
+   background: #1565c0;
+   border-radius: 100vh;
+ }
+
+ #component-0 {
+   max-width: 900px;
+   margin: auto;
+   padding-top: 1.5rem;
+ }