chiayewken committed
Commit 596d336 · 1 Parent(s): 9442fde

Add qwen2-vl streaming inference

Files changed (2)
  1. app.py +129 -38
  2. run_demo.py +0 -97
app.py CHANGED
@@ -1,13 +1,25 @@
+import hashlib
 import os
-from threading import Thread
-from typing import Iterator
+from pathlib import Path
+from typing import Iterator, Optional, List, Union
 
 import gradio as gr
 import spaces
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-
-from run_demo import ZeroShotChatTemplate
+from PIL import Image
+from pydantic import BaseModel
+from swift.llm import (
+    ModelType,
+    get_model_tokenizer,
+    get_default_template_type,
+    get_template,
+    inference,
+    inference_stream,
+)
+from transformers import (
+    Qwen2VLForConditionalGeneration,
+    PreTrainedTokenizer,
+)
 
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
@@ -29,15 +41,110 @@ As a derivate work of [Llama-3-8b-chat](https://huggingface.co/meta-llama/Meta-L
 this demo is governed by the original [license](https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/LICENSE) and [acceptable use policy](https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/USE_POLICY.md).
 """
 
+
+def save_image(image: Image.Image, folder: str) -> str:
+    image_hash = hashlib.md5(image.tobytes()).hexdigest()
+    path = Path(folder, f"{image_hash}.png")
+    path.parent.mkdir(exist_ok=True, parents=True)
+    if not path.exists():
+        image.save(path)
+    return str(path)
+
+
+def resize_image(image: Image.Image, max_size: int) -> Image.Image:
+    # Same as modeling.py resize_image
+    width, height = image.size
+    if width <= max_size and height <= max_size:
+        return image
+    if width > height:
+        new_width = max_size
+        new_height = round(height * max_size / width)
+    else:
+        new_height = max_size
+        new_width = round(width * max_size / height)
+    return image.resize((new_width, new_height), Image.LANCZOS)
+
+
+class EvalModel(BaseModel, arbitrary_types_allowed=True):
+    engine: str
+    timeout: int = 60
+    temperature: float = 0.0
+    max_output_tokens: int = 512
+
+    def run(self, inputs: List[Union[str, Image.Image]]) -> str:
+        raise NotImplementedError
+
+    def run_many(self, inputs: List[Union[str, Image.Image]], num: int) -> List[str]:
+        raise NotImplementedError
+
+
+class SwiftQwenModel(EvalModel):
+    # https://github.com/modelscope/ms-swift/blob/main/docs/source_en/Multi-Modal/qwen2-vl-best-practice.md
+    path: str = ""
+    model: Optional[Qwen2VLForConditionalGeneration] = None
+    tokenizer: Optional[PreTrainedTokenizer] = None
+    engine: str = ModelType.qwen2_vl_7b_instruct
+    image_size: int = 768
+    image_dir: str = "data/qwen_images"
+
+    def load(self):
+        if self.model is None or self.tokenizer is None:
+            self.model, self.tokenizer = get_model_tokenizer(
+                self.engine,
+                torch.bfloat16,
+                model_kwargs={"device_map": "auto"},
+                model_id_or_path=self.path or None,
+            )
+
+    def run(self, inputs: List[Union[str, Image.Image]]) -> str:
+        self.load()
+        template_type = get_default_template_type(self.engine)
+        self.model.generation_config.max_new_tokens = self.max_output_tokens
+        template = get_template(template_type, self.tokenizer)
+
+        text = "\n\n".join([x for x in inputs if isinstance(x, str)])
+        content = []
+        for x in inputs:
+            if isinstance(x, Image.Image):
+                path = save_image(resize_image(x, self.image_size), self.image_dir)
+                content.append(f"<img>{path}</img>")
+        content.append(text)
+
+        query = "".join(content)
+        response, history = inference(self.model, template, query)
+        return response
+
+    def run_stream(self, inputs: List[Union[str, Image.Image]]) -> Iterator[str]:
+        self.load()
+        template_type = get_default_template_type(self.engine)
+        self.model.generation_config.max_new_tokens = self.max_output_tokens
+        template = get_template(template_type, self.tokenizer)
+
+        text = "\n\n".join([x for x in inputs if isinstance(x, str)])
+        content = []
+        for x in inputs:
+            if isinstance(x, Image.Image):
+                path = save_image(resize_image(x, self.image_size), self.image_dir)
+                content.append(f"<img>{path}</img>")
+        content.append(text)
+
+        query = "".join(content)
+        generator = inference_stream(self.model, template, query)
+        print_idx = 0
+        print(f"query: {query}\nresponse: ", end="")
+        for response, history in generator:
+            delta = response[print_idx:]
+            print(delta, end="", flush=True)
+            print_idx = len(response)
+            yield delta
+
+
 if not torch.cuda.is_available():
     DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
 
 
 if torch.cuda.is_available():
-    model_id = "chiayewken/llama3-8b-gsm8k-rpo"
-    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    tokenizer.use_default_system_prompt = False
+    model = SwiftQwenModel()
 
 
 @spaces.GPU
@@ -51,32 +158,8 @@ def generate(
     top_k: int = 50,
     repetition_penalty: float = 1.2,
 ) -> Iterator[str]:
-    demo = ZeroShotChatTemplate()
-    prompt = demo.make_prompt(message)
-    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
-
-    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-    input_ids = input_ids.to(model.device)
-
-    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
-    generate_kwargs = dict(
-        {"input_ids": input_ids},
-        streamer=streamer,
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
-        top_p=top_p,
-        top_k=top_k,
-        temperature=temperature,
-        num_beams=1,
-        repetition_penalty=repetition_penalty,
-    )
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()
-
     outputs = []
-    for text in streamer:
+    for text in model.run_stream(inputs=[message]):
         outputs.append(text)
         yield "".join(outputs)
 
@@ -123,9 +206,15 @@ chat_interface = gr.ChatInterface(
     ],
     stop_btn=None,
     examples=[
-        ["Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?"],
-        ["Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?"],
-        ["Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?"],
+        [
+            "Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?"
+        ],
+        [
+            "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?"
+        ],
+        [
+            "Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?"
+        ],
     ],
     cache_examples=False,
     type="messages",
@@ -133,7 +222,9 @@ chat_interface = gr.ChatInterface(
 
 with gr.Blocks(css_paths="style.css", fill_height=True) as demo:
     gr.Markdown(DESCRIPTION)
-    gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
+    gr.DuplicateButton(
+        value="Duplicate Space for private use", elem_id="duplicate-button"
+    )
     chat_interface.render()
     gr.Markdown(LICENSE)
 
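For reference, the streaming path added above can be exercised outside of Gradio. The sketch below is illustrative only and not part of the commit: it assumes a CUDA GPU, the ms-swift dependency, that `SwiftQwenModel` from app.py is importable (or copied locally), and a hypothetical local image path.

```python
from PIL import Image

from app import SwiftQwenModel  # assumption: app.py can be imported without launching the demo

# Uses the defaults from the commit: ModelType.qwen2_vl_7b_instruct, bfloat16, device_map="auto".
model = SwiftQwenModel()

# Inputs may mix PIL images and strings; images are resized, cached to disk,
# and referenced in the query as <img>...</img> tags (see run_stream above).
image = Image.open("example.png")  # hypothetical image path
question = "Describe what is shown in this image."

# run_stream yields incremental text deltas; app.py accumulates them and
# re-yields the running string to gr.ChatInterface.
pieces = []
for delta in model.run_stream(inputs=[image, question]):
    pieces.append(delta)
print("".join(pieces))
```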
run_demo.py DELETED
@@ -1,97 +0,0 @@
-import re
-from typing import Optional, List
-
-import vllm
-from fire import Fire
-from pydantic import BaseModel
-from transformers import PreTrainedTokenizer, AutoTokenizer, AutoModelForCausalLM
-
-
-class ZeroShotChatTemplate:
-    # This is the default template used in llama-factory for training
-    texts: List[str] = []
-
-    @staticmethod
-    def make_prompt(prompt: str) -> str:
-        return f"Human: {prompt}\nAssistant: "
-
-    @staticmethod
-    def get_stopping_words() -> List[str]:
-        return ["Human:"]
-
-    @staticmethod
-    def extract_answer(text: str) -> str:
-        filtered = "".join([char for char in text if char.isdigit() or char == " "])
-        if not filtered.strip():
-            return text
-        return re.findall(pattern=r"\d+", string=filtered)[-1]
-
-
-class VLLMModel(BaseModel, arbitrary_types_allowed=True):
-    path_model: str
-    model: vllm.LLM = None
-    tokenizer: Optional[PreTrainedTokenizer] = None
-    max_input_length: int = 512
-    max_output_length: int = 512
-    stopping_words: Optional[List[str]] = None
-
-    def load(self):
-        if self.model is None:
-            self.model = vllm.LLM(model=self.path_model, trust_remote_code=True)
-        if self.tokenizer is None:
-            self.tokenizer = AutoTokenizer.from_pretrained(self.path_model)
-
-    def format_prompt(self, prompt: str) -> str:
-        self.load()
-        prompt = prompt.rstrip(" ")  # Llama is sensitive (eg "Answer:" vs "Answer: ")
-        return prompt
-
-    def make_kwargs(self, do_sample: bool, **kwargs) -> dict:
-        if self.stopping_words:
-            kwargs.update(stop=self.stopping_words)
-        params = vllm.SamplingParams(
-            temperature=0.5 if do_sample else 0.0,
-            max_tokens=self.max_output_length,
-            **kwargs,
-        )
-
-        outputs = dict(sampling_params=params, use_tqdm=False)
-        return outputs
-
-    def run(self, prompt: str) -> str:
-        prompt = self.format_prompt(prompt)
-        outputs = self.model.generate([prompt], **self.make_kwargs(do_sample=False))
-        pred = outputs[0].outputs[0].text
-        pred = pred.split("<|endoftext|>")[0]
-        return pred
-
-
-def upload_to_hub(path: str, repo_id: str):
-    tokenizer = AutoTokenizer.from_pretrained(path)
-    model = AutoModelForCausalLM.from_pretrained(path)
-    model.push_to_hub(repo_id)
-    tokenizer.push_to_hub(repo_id)
-
-
-def main(
-    question: str = "Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?",
-    **kwargs,
-):
-    model = VLLMModel(**kwargs)
-    demo = ZeroShotChatTemplate()
-    model.stopping_words = demo.get_stopping_words()
-
-    prompt = demo.make_prompt(question)
-    raw_outputs = model.run(prompt)
-    pred = demo.extract_answer(raw_outputs)
-    print(dict(question=question, prompt=prompt, raw_outputs=raw_outputs, pred=pred))
-
-
-"""
-p run_demo.py upload_to_hub outputs_paths/gsm8k_paths_llama3_8b_beta_03_rank_128/final chiayewken/llama3-8b-gsm8k-rpo
-p run_demo.py main --path_model chiayewken/llama3-8b-gsm8k-rpo
-"""
-
-
-if __name__ == "__main__":
-    Fire()