Spaces: Runtime error
Update app.py
app.py CHANGED

Old version (removed lines marked with -):
@@ -1,7 +1,6 @@
 import os
 from collections.abc import Iterator
 from threading import Thread
-
 import gradio as gr
 import spaces
 import torch
@@ -11,6 +10,7 @@ from PIL import Image
 import uuid
 import io

 DESCRIPTION = """
 # GWQ PREV
 """
@@ -21,25 +21,24 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

- ... (4 removed lines, old 24-27, content not recovered)
-    gemma_model_id,
     device_map="auto",
     torch_dtype=torch.bfloat16,
 )
-
-

-#
-
-
-
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to("cuda").eval()
-

 image_extensions = Image.registered_extensions()
 video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg", "wav", "gif", "webm", "m4v", "3gp")
@@ -72,19 +71,6 @@ def identify_and_save_blob(blob_path):
     except Exception as e:
         raise ValueError(f"An error occurred while processing the file: {e}")

-def process_vision_info(messages):
-    """Processes vision information (images or videos) from messages."""
-    image_inputs = []
-    video_inputs = []
-    for message in messages:
-        for content in message["content"]:
-            if content["type"] == "image":
-                image = Image.open(content["image"])
-                image_inputs.append(image)
-            elif content["type"] == "video":
-                video_inputs.append(content["video"])
-    return image_inputs, video_inputs
-
 @spaces.GPU()
 def generate(
     message: str,
@@ -94,26 +80,21 @@ def generate(
     top_p: float = 0.9,
     top_k: int = 50,
     repetition_penalty: float = 1.2,
-
 ) -> Iterator[str]:
-    if
-    #
- ... (11 removed lines, old 101-111, content not recovered)
-            print(e)
-            raise ValueError(
-                "Unsupported media type. Please upload an image or video."
-            )
-
     messages = [
         {
             "role": "user",
@@ -128,11 +109,11 @@ def generate(
             }
         ]

-        text =
             messages, tokenize=False, add_generation_prompt=True
         )
         image_inputs, video_inputs = process_vision_info(messages)
-        inputs =
             text=[text],
             images=image_inputs,
             videos=video_inputs,
@@ -141,11 +122,11 @@ def generate(
         ).to("cuda")

         streamer = TextIteratorStreamer(
-
         )
         generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)

-        thread = Thread(target=
         thread.start()

         buffer = ""
@@ -153,17 +134,17 @@ def generate(
             buffer += new_text
             yield buffer
     else:
-        #
         conversation = chat_history.copy()
         conversation.append({"role": "user", "content": message})

-        input_ids =
         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
             gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-        input_ids = input_ids.to(

-        streamer = TextIteratorStreamer(
         generate_kwargs = dict(
             {"input_ids": input_ids},
             streamer=streamer,
@@ -175,7 +156,7 @@ def generate(
             num_beams=1,
             repetition_penalty=repetition_penalty,
         )
-        t = Thread(target=
         t.start()

         outputs = []
@@ -183,72 +164,61 @@ def generate(
             outputs.append(text)
             yield "".join(outputs)

- ... (55 removed lines, old 186-240, content not recovered)
-        ["Explain the plot of Cinderella in a sentence."],
-        ["How many hours does it take a man to eat a Helicopter?"],
-        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
-    ],
-    cache_examples=False,
-    type="messages",
-    description=DESCRIPTION,
-    css_paths="style.css",
-    fill_height=True,
-    textbox=gr.MultimodalTextbox(),
-    multimodal=True,
-)

-

New version (added lines marked with +; unchanged lines between hunks shown as ...):
 import os
 from collections.abc import Iterator
 from threading import Thread
 import gradio as gr
 import spaces
 import torch
...
 import uuid
 import io

+# Text-only model setup
 DESCRIPTION = """
 # GWQ PREV
 """
...

 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

+model_id = "prithivMLmods/GWQ2b"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
     device_map="auto",
     torch_dtype=torch.bfloat16,
 )
+model.config.sliding_window = 4096
+model.eval()

+# Multimodal model setup
+MULTIMODAL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
+multimodal_model = Qwen2VLForConditionalGeneration.from_pretrained(
+    MULTIMODAL_MODEL_ID,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to("cuda").eval()
+multimodal_processor = AutoProcessor.from_pretrained(MULTIMODAL_MODEL_ID, trust_remote_code=True)

 image_extensions = Image.registered_extensions()
 video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg", "wav", "gif", "webm", "m4v", "3gp")
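Aside (not part of the commit): a minimal sketch of exercising the text-only model (GWQ2b) configured in the hunk above, reusing the model id, dtype, and chat-template call from the diff; the prompt and generation settings here are illustrative.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "prithivMLmods/GWQ2b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
).eval()

# Format one user turn with the model's chat template, then generate a short reply.
conversation = [{"role": "user", "content": "Hello there! How are you doing?"}]
input_ids = tokenizer.apply_chat_template(
    conversation, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
output_ids = model.generate(input_ids, max_new_tokens=64, do_sample=False)
print(tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True))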
...
     except Exception as e:
         raise ValueError(f"An error occurred while processing the file: {e}")

 @spaces.GPU()
 def generate(
     message: str,
...
     top_p: float = 0.9,
     top_k: int = 50,
     repetition_penalty: float = 1.2,
+    files: list = None,
 ) -> Iterator[str]:
+    if files and len(files) > 0:
+        # Multimodal input
+        media_path = files[0]
+        if media_path.endswith(tuple([i for i, f in image_extensions.items()])):
+            media_type = "image"
+        elif media_path.endswith(video_extensions):
+            media_type = "video"
+        else:
+            try:
+                media_path, media_type = identify_and_save_blob(media_path)
+            except Exception as e:
+                raise ValueError("Unsupported media type. Please upload an image or video.")
+
         messages = [
             {
                 "role": "user",
...
             }
         ]

+        text = multimodal_processor.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
         )
         image_inputs, video_inputs = process_vision_info(messages)
+        inputs = multimodal_processor(
             text=[text],
             images=image_inputs,
             videos=video_inputs,
...
         ).to("cuda")

         streamer = TextIteratorStreamer(
+            multimodal_processor, skip_prompt=True, **{"skip_special_tokens": True}
         )
         generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)

+        thread = Thread(target=multimodal_model.generate, kwargs=generation_kwargs)
         thread.start()

         buffer = ""
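Aside (not part of the commit): the multimodal branch above still calls process_vision_info(messages) (new line 115), while this same commit deletes the local definition of that helper (see the removed lines earlier). Unless the function is imported somewhere in the unchanged lines, this branch would raise a NameError, which would be consistent with the Space's "Runtime error" status. A sketch of keeping the helper available, reconstructed from the removed lines:

from PIL import Image

def process_vision_info(messages):
    """Collects PIL images and raw video references from chat messages."""
    image_inputs, video_inputs = [], []
    for message in messages:
        for content in message["content"]:
            if content["type"] == "image":
                image_inputs.append(Image.open(content["image"]))
            elif content["type"] == "video":
                video_inputs.append(content["video"])
    return image_inputs, video_inputs

# Alternative (an assumption, not shown in this diff): import the equivalent helper
# shipped with the Qwen2-VL examples, if it is in the Space's requirements:
# from qwen_vl_utils import process_vision_info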
...
             buffer += new_text
             yield buffer
     else:
+        # Text-only input
         conversation = chat_history.copy()
         conversation.append({"role": "user", "content": message})

+        input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
             gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+        input_ids = input_ids.to(model.device)

+        streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
         generate_kwargs = dict(
             {"input_ids": input_ids},
             streamer=streamer,
...
             num_beams=1,
             repetition_penalty=repetition_penalty,
         )
+        t = Thread(target=model.generate, kwargs=generate_kwargs)
         t.start()

         outputs = []
...
             outputs.append(text)
             yield "".join(outputs)

+demo = gr.ChatInterface(
+    fn=generate,
+    additional_inputs=[
+        gr.Slider(
+            label="Max new tokens",
+            minimum=1,
+            maximum=MAX_MAX_NEW_TOKENS,
+            step=1,
+            value=DEFAULT_MAX_NEW_TOKENS,
+        ),
+        gr.Slider(
+            label="Temperature",
+            minimum=0.1,
+            maximum=4.0,
+            step=0.1,
+            value=0.6,
+        ),
+        gr.Slider(
+            label="Top-p (nucleus sampling)",
+            minimum=0.05,
+            maximum=1.0,
+            step=0.05,
+            value=0.9,
+        ),
+        gr.Slider(
+            label="Top-k",
+            minimum=1,
+            maximum=1000,
+            step=1,
+            value=50,
+        ),
+        gr.Slider(
+            label="Repetition penalty",
+            minimum=1.0,
+            maximum=2.0,
+            step=0.05,
+            value=1.2,
+        ),
+    ],
+    stop_btn=None,
+    examples=[
+        ["Hello there! How are you doing?"],
+        ["Can you explain briefly to me what is the Python programming language?"],
+        ["Explain the plot of Cinderella in a sentence."],
+        ["How many hours does it take a man to eat a Helicopter?"],
+        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
+    ],
+    cache_examples=False,
+    type="messages",
+    description=DESCRIPTION,
+    css_paths="style.css",
+    fill_height=True,
+    multimodal=True,
+    textbox=gr.MultimodalTextbox(),
+)

+if __name__ == "__main__":
+    demo.queue(max_size=20).launch()
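For reference, a self-contained sketch of the multimodal path that the new generate() assembles, reduced to a single non-streaming call. The model id, processor usage, and chat-template call come from the diff; the image path, prompt, and the padding/return_tensors arguments are illustrative assumptions.

import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16
).to("cuda").eval()

# One user turn with an image plus a text instruction (example.jpg is a placeholder path).
messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": "example.jpg"},
        {"type": "text", "text": "Describe this image."},
    ],
}]
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
    text=[text],
    images=[Image.open("example.jpg")],
    videos=None,
    padding=True,
    return_tensors="pt",
).to("cuda")

output_ids = model.generate(**inputs, max_new_tokens=128)
reply = processor.batch_decode(
    output_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
)[0]
print(reply)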