update
- app_1p.py +0 -213
- vlms/llavanext.py +0 -43
- vlms/llavaov.py +0 -44
- vlms/magma.py +0 -40
- vlms/qwen2vl.py +0 -59
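
Note: the four deleted vlms/*.py files each wrapped a vision-language model behind the same generate_response(image, question) interface, and app_1p.py was a Gradio demo driving the Magma model directly. For context, a minimal sketch of how one of the removed wrappers was invoked (the frame path is illustrative, and the import assumes the vlms/ package was on the Python path; the question string is the one used in the deleted app_1p.py):

from PIL import Image
from vlms.magma import MagmaAgent  # deleted in this commit

agent = MagmaAgent(device="cuda")  # loads microsoft/Magma-8B as defined in the deleted file
frame = Image.open("assets/images/frame.png")  # hypothetical game screenshot
answer = agent.generate_response(frame, "Which mark is closer to green block? Answer with a single number.")
print(answer)
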
app_1p.py
DELETED
@@ -1,213 +0,0 @@
-import os
-# add a command for installing flash-attn
-os.system('pip install flash-attn --no-build-isolation')
-os.system("pip install gradio==4.44.1")
-
-import pygame
-import numpy as np
-import gradio as gr
-import time
-import torch
-from PIL import Image
-from transformers import AutoModelForCausalLM, AutoProcessor
-import re
-import random
-
-pygame.mixer.quit() # Disable sound
-
-# Constants
-WIDTH, HEIGHT = 800, 800
-GRID_SIZE = 80
-WHITE = (255, 255, 255)
-GREEN = (34, 139, 34) # Forest green - more like an apple
-RED = (200, 50, 50)
-BLACK = (0, 0, 0)
-GRAY = (128, 128, 128)
-YELLOW = (218, 165, 32) # Golden yellow color
-
-# Directions
-UP = (0, -1)
-DOWN = (0, 1)
-LEFT = (-1, 0)
-RIGHT = (1, 0)
-STATIC = (0, 0)
-
-ACTIONS = ["up", "down", "left", "right", "static"]
-
-# Load AI Model
-dtype = torch.bfloat16
-magma_model_id = "microsoft/Magma-8B"
-magam_model = AutoModelForCausalLM.from_pretrained(magma_model_id, trust_remote_code=True, torch_dtype=dtype)
-magma_processor = AutoProcessor.from_pretrained(magma_model_id, trust_remote_code=True)
-magam_model.to("cuda")
-
-magma_img = pygame.image.load("./assets/images/magma_game_thin.png")
-magma_img = pygame.transform.scale(magma_img, (GRID_SIZE, GRID_SIZE))
-
-class MagmaFindGPU:
-    def __init__(self):
-        self.reset()
-        self.step_count = 0
-
-    def reset(self):
-        self.snake = [(5, 5)]
-        self.direction = RIGHT
-        self.score = 0
-        self.game_over = False
-        self.step_count = 0
-        self.place_target()
-
-    def place_target(self):
-        while True:
-            target_x = np.random.randint(1, WIDTH // GRID_SIZE - 1)
-            target_y = np.random.randint(1, HEIGHT // GRID_SIZE - 1)
-            if (target_x, target_y) not in self.snake:
-                self.target = (target_x, target_y)
-                break
-
-    def step(self, action):
-        if action == "up":
-            self.direction = UP
-        elif action == "down":
-            self.direction = DOWN
-        elif action == "left":
-            self.direction = LEFT
-        elif action == "right":
-            self.direction = RIGHT
-        elif action == "static":
-            self.direction = STATIC
-
-        if self.game_over:
-            self.reset()
-            return self.render(), self.score
-
-        new_head = (self.snake[0][0] + self.direction[0], self.snake[0][1] + self.direction[1])
-
-        if new_head[0] < 0 or new_head[1] < 0 or new_head[0] >= WIDTH // GRID_SIZE or new_head[1] >= HEIGHT // GRID_SIZE:
-            self.game_over = True
-            return self.render(), self.score
-
-        self.snake = [new_head] # Keep only the head (single block snake)
-        self.step_count += 1
-
-        # Check if the target is covered by four surrounding squares
-        head_x, head_y = self.snake[0]
-        neighbors = set([(head_x, head_y - 1), (head_x, head_y + 1), (head_x - 1, head_y), (head_x + 1, head_y)])
-
-        if neighbors.issuperset(set([self.target])):
-            self.score += 1
-            self.place_target()
-
-        return self.render(), self.score
-
-    def render(self):
-        pygame.init()
-        surface = pygame.Surface((WIDTH, HEIGHT))
-        surface.fill(BLACK)
-
-        head_x, head_y = self.snake[0]
-        surface.blit(magma_img, (head_x * GRID_SIZE, head_y * GRID_SIZE))
-
-        # pygame.draw.rect(surface, RED, (self.snake[0][0] * GRID_SIZE, self.snake[0][1] * GRID_SIZE, GRID_SIZE, GRID_SIZE))
-        pygame.draw.rect(surface, GREEN, (self.target[0] * GRID_SIZE, self.target[1] * GRID_SIZE, GRID_SIZE, GRID_SIZE))
-
-        # Draw four surrounding squares with labels
-        head_x, head_y = self.snake[0]
-        neighbors = [(head_x, head_y - 1), (head_x, head_y + 1), (head_x - 1, head_y), (head_x + 1, head_y)]
-        labels = ["1", "2", "3", "4"]
-        font = pygame.font.Font(None, 48)
-
-        # clone surface
-        surface_nomark = surface.copy()
-        for i, (nx, ny) in enumerate(neighbors):
-            if 0 <= nx < WIDTH // GRID_SIZE and 0 <= ny < HEIGHT // GRID_SIZE:
-                pygame.draw.rect(surface, RED, (nx * GRID_SIZE, ny * GRID_SIZE, GRID_SIZE, GRID_SIZE), GRID_SIZE)
-                # pygame.draw.rect(surface_nomark, RED, (nx * GRID_SIZE, ny * GRID_SIZE, GRID_SIZE, GRID_SIZE), GRID_SIZE)
-
-                text = font.render(labels[i], True, WHITE)
-                text_rect = text.get_rect(center=(nx * GRID_SIZE + GRID_SIZE // 2, ny * GRID_SIZE + GRID_SIZE // 2))
-                surface.blit(text, text_rect)
-
-        return np.array(pygame.surfarray.array3d(surface_nomark)).swapaxes(0, 1), np.array(pygame.surfarray.array3d(surface)).swapaxes(0, 1)
-
-    def get_state(self):
-        return self.render()
-
-game = MagmaFindGPU()
-
-def play_game():
-    state, state_som = game.get_state()
-    pil_img = Image.fromarray(state_som)
-    convs = [
-        {"role": "system", "content": "You are an agent that can see, talk, and act. Avoid hitting the wall."},
-        {"role": "user", "content": "<image_start><image><image_end>\nWhich mark is closer to green block? Answer with a single number."},
-    ]
-    prompt = magma_processor.tokenizer.apply_chat_template(convs, tokenize=False, add_generation_prompt=True)
-    inputs = magma_processor(images=[pil_img], texts=prompt, return_tensors="pt")
-    inputs['pixel_values'] = inputs['pixel_values'].unsqueeze(0)
-    inputs['image_sizes'] = inputs['image_sizes'].unsqueeze(0)
-    inputs = inputs.to("cuda").to(dtype)
-    generation_args = {
-        "max_new_tokens": 10,
-        "temperature": 0.3,
-        "do_sample": True,
-        "use_cache": True,
-        "num_beams": 1,
-    }
-    with torch.inference_mode():
-        generate_ids = magam_model.generate(**inputs, **generation_args)
-    generate_ids = generate_ids[:, inputs["input_ids"].shape[-1] :]
-    action = magma_processor.decode(generate_ids[0], skip_special_tokens=True).strip()
-    # extract mark id fro action use re
-    match = re.search(r'\d+', action)
-    if match:
-        action = match.group(0)
-        if action.isdigit() and 1 <= int(action) <= 4:
-            action = ACTIONS[int(action) - 1]
-        else:
-            # random choose one from the pool
-            action = random.choice(ACTIONS[:-1])
-    else:
-        action = random.choice(ACTIONS[:-1])
-
-    img, score = game.step(action)
-    img = img[0]
-    return img, f"Score: {score}"
-
-def reset_game():
-    game.reset()
-    return game.render()[0], "Score: 0"
-
-MARKDOWN = """
-<div align="center">
-<img src="./assets/images/logo.png" alt="Magma Logo" style="margin-right: 5px; height: 80px;margin-top: -10px;">
-<h2>Magma: A Foundation Model for Multimodal AI Agents</h2>
-
-\[[arXiv Paper](https://www.arxiv.org/pdf/2502.13130)\] \[[Project Page](https://microsoft.github.io/Magma/)\] \[[Github Repo](https://github.com/microsoft/Magma)\] \[[Hugging Face Model](https://huggingface.co/microsoft/Magma-8B)\]
-
-This demo is powered by [Gradio](https://gradio.app/).
-
-<b>Goal: Collects the green blocks by automatically moving up, down, left and right.</b>
-
-</div>
-"""
-
-with gr.Blocks() as interface:
-    gr.Markdown(MARKDOWN)
-    with gr.Row():
-        image_output = gr.Image(label="Game Screen")
-        with gr.Column():
-            score_output = gr.Text(label="Score", elem_classes="large-text")
-            gr.HTML("""
-            <style>
-                .large-text textarea {
-                    font-size: 24px !important;
-                }
-            </style>
-            """)
-            start_btn = gr.Button("Start/Reset Game")
-
-    interface.load(fn=play_game, every=1, inputs=[], outputs=[image_output, score_output])
-    start_btn.click(fn=reset_game, inputs=[], outputs=[image_output, score_output])
-
-interface.launch()
vlms/llavanext.py
DELETED
@@ -1,43 +0,0 @@
-from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
-import torch
-import torch.nn as nn
-from PIL import Image
-import requests
-
-class LLaVANextAgent(nn.Module):
-    def __init__(self, device="cuda", dtype=torch.float16):
-        super().__init__()
-
-        self.processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
-        self.model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=dtype, low_cpu_mem_usage=True)
-        self.dtype = dtype
-        self.device = device
-
-        self.model.to(device)
-
-        self.generation_args = {
-            "max_new_tokens": 10,
-            "temperature": 0.3,
-            "do_sample": True,
-            "use_cache": True,
-            "num_beams": 1,
-        }
-
-    def generate_response(self, image, question):
-        conversation = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": question},
-                    {"type": "image"},
-                ],
-            },
-        ]
-        prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)
-        inputs = self.processor(images=image, text=prompt, return_tensors="pt").to(self.device)
-        # autoregressively complete prompt
-        self.model.generation_config.pad_token_id = self.processor.tokenizer.pad_token_id
-        with torch.inference_mode():
-            output = self.model.generate(**inputs, **self.generation_args)
-        output = output[:, inputs["input_ids"].shape[-1] :]
-        return self.processor.decode(output[0], skip_special_tokens=True).strip()
vlms/llavaov.py
DELETED
@@ -1,44 +0,0 @@
-from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
-import torch
-import torch.nn as nn
-from PIL import Image
-import requests
-
-class LLaVAOVAgent(nn.Module):
-    model_id = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
-    def __init__(self, device="cuda", dtype=torch.float16):
-        super().__init__()
-
-        self.processor = AutoProcessor.from_pretrained(self.model_id)
-        self.model = LlavaOnevisionForConditionalGeneration.from_pretrained(self.model_id, torch_dtype=dtype, low_cpu_mem_usage=True)
-        self.dtype = dtype
-        self.device = device
-
-        self.model.to(device)
-
-        self.generation_args = {
-            "max_new_tokens": 10,
-            "temperature": 0.3,
-            "do_sample": True,
-            "use_cache": True,
-            "num_beams": 1,
-        }
-
-    def generate_response(self, image, question):
-        conversation = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": question},
-                    {"type": "image"},
-                ],
-            },
-        ]
-        prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)
-        inputs = self.processor(images=image, text=prompt, return_tensors="pt").to(self.device)
-        # autoregressively complete prompt
-        self.model.generation_config.pad_token_id = self.processor.tokenizer.pad_token_id
-        with torch.inference_mode():
-            output = self.model.generate(**inputs, **self.generation_args)
-        output = output[:, inputs["input_ids"].shape[-1] :]
-        return self.processor.decode(output[0], skip_special_tokens=True).strip()
vlms/magma.py
DELETED
@@ -1,40 +0,0 @@
-from transformers import AutoModelForCausalLM, AutoProcessor
-import torch
-import torch.nn as nn
-from PIL import Image
-import requests
-
-model_id = "microsoft/Magma-8B"
-class MagmaAgent(nn.Module):
-    def __init__(self, device="cuda", dtype=torch.float16):
-        super().__init__()
-
-        self.model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype=dtype, low_cpu_mem_usage=True)
-        self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
-        self.dtype = dtype
-        self.device = device
-        self.model.to(device)
-
-        self.generation_args = {
-            "max_new_tokens": 10,
-            "temperature": 0.3,
-            "do_sample": True,
-            "use_cache": True,
-            "num_beams": 1,
-        }
-
-    def generate_response(self, image, question):
-        convs = [
-            {"role": "system", "content": "You are an agent that can see, talk, and act."},
-            {"role": "user", "content": "<image_start><image><image_end>\n{}".format(question)},
-        ]
-        prompt = self.processor.tokenizer.apply_chat_template(convs, tokenize=False, add_generation_prompt=True)
-        inputs = self.processor(images=[image], texts=prompt, return_tensors="pt").to(self.dtype).to(self.device)
-        inputs['pixel_values'] = inputs['pixel_values'].unsqueeze(0)
-        inputs['image_sizes'] = inputs['image_sizes'].unsqueeze(0)
-
-        with torch.inference_mode():
-            generate_ids = self.model.generate(**inputs, **self.generation_args)
-        generate_ids = generate_ids[:, inputs["input_ids"].shape[-1] :]
-        action = self.processor.decode(generate_ids[0], skip_special_tokens=True).strip()
-        return action
vlms/qwen2vl.py
DELETED
@@ -1,59 +0,0 @@
-from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
-from qwen_vl_utils import process_vision_info
-
-import torch
-import torch.nn as nn
-from PIL import Image
-import requests
-
-class Qwen2VLAgent(nn.Module):
-    model_id = "Qwen/Qwen2-VL-7B-Instruct"
-    def __init__(self, device="cuda", dtype=torch.float16):
-        super().__init__()
-
-        self.processor = AutoProcessor.from_pretrained(self.model_id)
-        self.model = Qwen2VLForConditionalGeneration.from_pretrained(self.model_id, torch_dtype=dtype, low_cpu_mem_usage=True)
-        self.dtype = dtype
-        self.device = device
-
-        self.model.to(device)
-
-        self.generation_args = {
-            "max_new_tokens": 10,
-            "temperature": 0.3,
-            "do_sample": True,
-            "use_cache": True,
-            "num_beams": 1,
-        }
-
-    def generate_response(self, image, question):
-        image.save('qwen25vl.png')
-        conversation = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": question},
-                    {"type": "image", "image": "qwen25vl.png"},
-                ],
-            },
-        ]
-
-        # Preparation for inference
-        text = self.processor.apply_chat_template(
-            conversation, tokenize=False, add_generation_prompt=True
-        )
-        image_inputs, video_inputs = process_vision_info(conversation)
-        inputs = self.processor(
-            text=[text],
-            images=image_inputs,
-            videos=video_inputs,
-            padding=True,
-            return_tensors="pt",
-        ).to(self.device)
-
-        # autoregressively complete prompt
-        self.model.generation_config.pad_token_id = self.processor.tokenizer.pad_token_id
-        with torch.inference_mode():
-            output = self.model.generate(**inputs, **self.generation_args)
-        output = output[:, inputs["input_ids"].shape[-1] :]
-        return self.processor.decode(output[0], skip_special_tokens=True).strip()