# Adapted from https://huggingface.co/nvidia/NVLM-D-72B#inference
"""Custom inference endpoint handler for the NVLM-D-72B vision-language model.

The module provides the image tiling / normalisation helpers used to build
the model's ``pixel_values`` input, a helper that builds a layer-to-GPU
``device_map``, and the ``EndpointHandler`` class that serves requests.
"""

import math
from io import BytesIO
from typing import Any, Dict, List

import requests
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

from huggingface_inference_toolkit.logging import logger

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

# Shared by both validation failures on the shape of ``data["inputs"]``.
_INVALID_INPUTS_MESSAGE = (
    "The request inputs must be a list of dictionaries with either the key"
    " 'prompt' or 'prompt' + 'image_url', and optionally including the key"
    " 'generation_config'."
)


def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Return the ``(columns, rows)`` grid whose ratio best matches the image.

    Args:
        aspect_ratio: Width / height of the source image.
        target_ratios: Iterable of candidate ``(columns, rows)`` pairs.
        width: Source image width in pixels.
        height: Source image height in pixels.
        image_size: Edge length in pixels of one square tile.

    Returns:
        The candidate with the smallest absolute aspect-ratio difference, or
        ``(1, 1)`` when ``target_ratios`` is empty.
    """
    best_ratio_diff = float("inf")
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        cols, rows = ratio[0], ratio[1]
        ratio_diff = abs(aspect_ratio - cols / rows)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            # Tie-break: switch to the later candidate only when the image
            # area exceeds half the area of that candidate's tile grid.
            if area > 0.5 * image_size * image_size * cols * rows:
                best_ratio = ratio
    return best_ratio


def dynamic_preprocess(
    image, min_num=1, max_num=12, image_size=448, use_thumbnail=False
):
    """Resize ``image`` to a tile grid and cut it into square tiles.

    Args:
        image: Image object exposing ``size``, ``resize`` and ``crop`` (a PIL
            image in this file).
        min_num: Minimum number of tiles in the grid.
        max_num: Maximum number of tiles in the grid.
        image_size: Edge length in pixels of each square tile.
        use_thumbnail: When true and more than one tile was produced, append
            an ``image_size`` x ``image_size`` resize of the whole image.

    Returns:
        List of tiles in row-major order, optionally followed by the thumbnail.
    """
    orig_width, orig_height = image.size
    # Aspect ratio of the incoming image.
    aspect_ratio = orig_width / orig_height

    # Every (columns, rows) grid whose tile count lies in [min_num, max_num],
    # ordered by tile count.
    target_ratios = {
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if min_num <= i * j <= max_num
    }
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # Pick the grid closest to the image's aspect ratio.
    cols, rows = find_closest_aspect_ratio(
        aspect_ratio,
        target_ratios,
        orig_width,
        orig_height,
        image_size,
    )

    # Resize so the image exactly fills the chosen grid.
    target_width = image_size * cols
    target_height = image_size * rows
    resized_img = image.resize((target_width, target_height))

    # Split the resized image into tiles, row by row.
    processed_images = []
    for index in range(cols * rows):
        row, col = divmod(index, cols)
        box = (
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size,
        )
        processed_images.append(resized_img.crop(box))

    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images


def build_transform(input_size):
    """Build the torchvision pipeline applied to every tile.

    The pipeline converts to RGB when needed, resizes to
    ``input_size`` x ``input_size`` with bicubic interpolation, converts to a
    tensor and normalises with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
    """
    return T.Compose(
        [
            T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
        ]
    )


def load_image(image_url, input_size=448, max_num=12, timeout=30.0):
    """Download an image and turn it into a stacked tensor of tiles.

    Args:
        image_url: URL the image is fetched from with an HTTP GET.
        input_size: Edge length in pixels of each tile.
        max_num: Maximum number of tiles (a thumbnail may be added on top).
        timeout: Seconds passed to ``requests.get`` so that an unresponsive
            server cannot block the handler indefinitely.

    Returns:
        ``torch.Tensor`` made by stacking the transformed tiles.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(image_url, timeout=timeout)
    # Fail on 4xx/5xx here instead of letting PIL choke on an error page.
    response.raise_for_status()
    image = Image.open(BytesIO(response.content)).convert("RGB")
    transform = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(
        image, image_size=input_size, use_thumbnail=True, max_num=max_num
    )
    return torch.stack([transform(tile) for tile in tiles])


def split_model(num_layers=80):
    """Build a ``device_map`` spreading the language-model layers over GPUs.

    Args:
        num_layers: Number of ``language_model.model.layers.*`` entries to
            distribute.

    Returns:
        Dict mapping module names to CUDA device indices.

    Raises:
        RuntimeError: If ``torch.cuda.device_count()`` reports no device.
    """
    world_size = torch.cuda.device_count()
    if world_size < 1:
        raise RuntimeError(
            "split_model requires at least one CUDA device, but none were found."
        )

    # Since the first GPU will be used for ViT, treat it as half a GPU.
    per_gpu = math.ceil(num_layers / (world_size - 0.5))
    num_layers_per_gpu = [per_gpu] * world_size
    num_layers_per_gpu[0] = math.ceil(per_gpu * 0.5)

    # NOTE: because of the ceil rounding the counts can add up to more than
    # ``num_layers``, so indices past the last layer may be emitted.
    device_map = {}
    layer_cnt = 0
    for gpu_index, layer_count in enumerate(num_layers_per_gpu):
        for _ in range(layer_count):
            device_map[f"language_model.model.layers.{layer_cnt}"] = gpu_index
            layer_cnt += 1

    # Vision tower, projector, embeddings, final norm, output heads and the
    # last decoder layer are all pinned to GPU 0.
    device_map["vision_model"] = 0
    device_map["mlp1"] = 0
    device_map["language_model.model.tok_embeddings"] = 0
    device_map["language_model.model.embed_tokens"] = 0
    device_map["language_model.output"] = 0
    device_map["language_model.model.norm"] = 0
    device_map["language_model.lm_head"] = 0
    device_map[f"language_model.model.layers.{num_layers - 1}"] = 0
    return device_map


class EndpointHandler:
    """Loads the model once and answers text-only or single-image prompts."""

    def __init__(self, model_dir: str, **kwargs: Any) -> None:
        """Load model and tokenizer from ``model_dir``.

        The model is loaded in bfloat16, placed according to
        ``split_model()`` and switched to eval mode. ``kwargs`` is accepted
        but not used.
        """
        self.model = AutoModel.from_pretrained(
            model_dir,
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            use_flash_attn=False,
            trust_remote_code=True,
            device_map=split_model(),
        ).eval()
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_dir, trust_remote_code=True, use_fast=False
        )

    def __call__(self, data: Dict[str, Any]) -> Dict[str, List[Any]]:
        """Run the model on every entry of ``data["inputs"]``.

        Args:
            data: Request body. Must hold a list under ``"inputs"`` (or the
                deprecated ``"instances"``, which is renamed in place). Each
                entry is a dict with ``"prompt"`` and optionally
                ``"image_url"`` and ``"generation_config"``.

        Returns:
            ``{"predictions": [...]}`` with one response per input, in order.

        Raises:
            ValueError: If ``"inputs"`` is missing, is not a list, or holds an
                entry that is not a dict containing ``"prompt"``.
        """
        logger.info(f"Received incoming request with {data=}")

        if "instances" in data:
            logger.warning("Using `instances` instead of `inputs` is deprecated.")
            data["inputs"] = data.pop("instances")

        if "inputs" not in data:
            raise ValueError(
                "The request body must contain a key 'inputs' with a list of inputs."
            )

        inputs = data["inputs"]
        if not isinstance(inputs, list):
            raise ValueError(_INVALID_INPUTS_MESSAGE)
        # Every entry is checked up front, so the loop below needs no
        # further per-item validation of 'prompt'.
        if not all(isinstance(item, dict) and "prompt" in item for item in inputs):
            raise ValueError(_INVALID_INPUTS_MESSAGE)

        predictions = []
        for item in inputs:
            generation_config = item.get(
                "generation_config", {"max_new_tokens": 1024, "do_sample": False}
            )

            if "image_url" not in item:
                # pure-text conversation
                response, _ = self.model.chat(
                    self.tokenizer,
                    None,
                    item["prompt"],
                    generation_config,
                    history=None,
                    return_history=True,
                )
            else:
                # single-image single-round conversation
                pixel_values = load_image(item["image_url"], max_num=6).to(
                    torch.bfloat16
                )
                # NOTE(review): the leading newline suggests an image
                # placeholder token may have been dropped from this prompt
                # prefix - TODO confirm against the model's `chat` method.
                response = self.model.chat(
                    self.tokenizer,
                    pixel_values,
                    f"\n{item['prompt']}",
                    generation_config,
                )

            predictions.append(response)

        return {"predictions": predictions}