FYI: Python code that runs locally on Apple Silicon Macs (tested successfully on M1 Max with 64 GB RAM)

#2
by MaxJob - opened

The code below (which includes minor adjustments to the code suggested on the model card) runs successfully on Apple Silicon given a sufficient amount of RAM (64 GB appears to be necessary to run SDXL 1.0; to be confirmed).

NOTE: Additionally, you will need to set the environment variable PYTORCH_ENABLE_MPS_FALLBACK=1 to use the CPU as a fallback.

Enjoy!

Make sure to first install the libraries: pip install accelerate transformers safetensors diffusers


import torch
import numpy as np
from PIL import Image

from transformers import DPTFeatureExtractor, DPTForDepthEstimation
from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, AutoencoderKL
from diffusers.utils import load_image

# Depth estimation: the DPT model is moved to the "mps" device; the matching
# feature extractor is loaded without a device move.
depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas")
depth_estimator = depth_estimator.to("mps")
feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")

# Depth ControlNet for SDXL, loaded from safetensors weights.
controlnet = ControlNetModel.from_pretrained(
    "diffusers/controlnet-depth-sdxl-1.0", use_safetensors=True
)
controlnet = controlnet.to("mps")

# VAE handed to the pipeline below through its ``vae`` argument.
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix")
vae = vae.to("mps")

# SDXL base pipeline wired up with the ControlNet and VAE loaded above.
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    controlnet=controlnet,
    vae=vae,
    use_safetensors=True,
)
pipe = pipe.to("mps")

def get_depth_map(image):
    """Estimate depth for ``image`` and return it as a 1024x1024 PIL image.

    The input is run through the module-level ``feature_extractor`` and
    ``depth_estimator``; the predicted depth is resized to 1024x1024 with
    bicubic interpolation, min/max-normalised to [0, 1], replicated onto
    three channels and converted to an 8-bit image.

    Args:
        image: Input accepted by ``feature_extractor(images=...)``.

    Returns:
        A ``PIL.Image.Image`` built from the first batch element of the
        normalised depth map, scaled to 0-255.
    """
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to("mps")
    # NOTE(review): autocast targets "cpu" while the tensors live on "mps";
    # kept as in the original snippet — confirm this is intended.
    with torch.no_grad(), torch.autocast("cpu"):
        depth_map = depth_estimator(pixel_values).predicted_depth

    depth_map = torch.nn.functional.interpolate(
        depth_map.unsqueeze(1),
        size=(1024, 1024),
        mode="bicubic",
        align_corners=False,
    )
    depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
    depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
    # A constant depth map has max == min; clamp the range so the division
    # yields zeros instead of NaN (which would corrupt the uint8 cast below).
    depth_range = torch.clamp(depth_max - depth_min, min=torch.finfo(depth_map.dtype).eps)
    depth_map = (depth_map - depth_min) / depth_range
    stacked = torch.cat([depth_map] * 3, dim=1)

    # Channels-last layout, first batch element only, as expected by PIL.
    pixels = stacked.permute(0, 2, 3, 1).cpu().numpy()[0]
    return Image.fromarray((pixels * 255.0).clip(0, 255).astype(np.uint8))

prompt = "stormtrooper lecture, photorealistic"
# Reference image whose depth map conditions the generation.
image = load_image(
    "https://huggingface.co/lllyasviel/sd-controlnet-depth/resolve/main/images/stormtrooper.png"
)
controlnet_conditioning_scale = 0.5  # recommended for good generalization

depth_image = get_depth_map(image)

images = pipe(
    prompt,
    image=depth_image,
    num_inference_steps=30,
    controlnet_conditioning_scale=controlnet_conditioning_scale,
).images
# Bare expression: shows the image in a notebook/REPL, no effect in a script.
images[0]

images[0].save("stormtrooper.png")


MaxJob changed discussion title from FYI: Python code that runs locally on Apple Silicon (tested successfully on M1 Max with 64 GB RAM) to FYI: Python code that runs locally on Apple Silicon Macs (tested successfully on M1 Max with 64 GB RAM)
🧨Diffusers org

Thanks for this!

Would you like to open a PR to the repository to include your suggestions?

This version doesn't include a depth-map pre-processor, so it uses a predefined depth map, but
it will run on an 8 GB M1 (very slowly: 50-ish seconds per iteration, using a fair bit of swap).
16 GB would be better.

from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, AutoencoderKL, StableDiffusionControlNetPipeline
from PIL import Image

from torch import mps
import torch

import fp16fixes

# Install the float16 workarounds before any model is created.
fp16fixes.fp16_fixes()
# Per the PyTorch docs a fraction of 0 lifts the MPS allocation limit —
# verify against the torch version in use.
torch.mps.set_per_process_memory_fraction(0.0)

prompt = "Photograph, scary looking willowy person at night under a full moon"
negative_prompt = 'low quality, bad quality, sketches'

# Pre-computed depth map; this script does no depth estimation of its own.
image = Image.open("inferno_from_midas.png")

controlnet_conditioning_scale = 0.5  # recommended for good generalization

controlnet = ControlNetModel.from_pretrained(
    "diffusers/controlnet-depth-sdxl-1.0",
    torch_dtype=torch.float16,
    variant='fp16',
).to('mps')


vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16).to('mps')
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    controlnet=controlnet,
    vae=vae,
    variant='fp16',
    torch_dtype=torch.float16,
).to('mps')

# Memory-saving switches exposed by the pipeline.
pipe.enable_attention_slicing()
pipe.enable_vae_slicing()
pipe.enable_vae_tiling()

images = pipe(
    prompt,
    # Pass the negative prompt defined above so it actually takes effect.
    negative_prompt=negative_prompt,
    image=image,
    controlnet_conditioning_scale=controlnet_conditioning_scale,
    num_inference_steps=30,
).images

images[0].save("hug_depth.png")

What fp16fixes.fp16_fixes() needs to do depends on the version of torch, but for general use it's:

import torch

def fp16_fixes():
    """Monkey-patch torch to work around float16 problems on the MPS backend.

    Three process-wide patches are installed:

    * ``torch.empty`` is replaced by ``torch.zeros`` (only when MPS is
      available).
    * ``torch.nn.functional.layer_norm`` is wrapped so that float16 inputs on
      an MPS device are computed in float32 and the result is cast back to
      float16; every other input goes straight to the original function.
    * ``torch.Tensor.permute`` is wrapped so that float16 results on an MPS
      device are made contiguous.

    Each call wraps the currently installed ``layer_norm`` again, so call
    this once at start-up.
    """
    if torch.backends.mps.is_available():
        torch.empty = torch.zeros

    _torch_layer_norm = torch.nn.functional.layer_norm

    def new_layer_norm(input, normalized_shape, weight=None, bias=None, eps=1e-05):
        if input.device.type == "mps" and input.dtype == torch.float16:
            # Upcast input and affine parameters, normalise in float32,
            # then return to half precision.
            input = input.float()
            if weight is not None:
                weight = weight.float()
            if bias is not None:
                bias = bias.float()
            return _torch_layer_norm(input, normalized_shape, weight, bias, eps).half()
        return _torch_layer_norm(input, normalized_shape, weight, bias, eps)

    torch.nn.functional.layer_norm = new_layer_norm

    def new_torch_tensor_permute(input, *dims):
        # Tensor.permute accepts both permute(0, 2, 1) and permute((0, 2, 1));
        # unwrap the single-sequence form before delegating.
        if len(dims) == 1 and isinstance(dims[0], (tuple, list)):
            dims = dims[0]
        result = torch.permute(input, tuple(dims))
        # Compare the device *type*: a torch.device object does not reliably
        # compare equal to the plain string "mps". This matches the check
        # used in new_layer_norm.
        if input.device.type == "mps" and input.dtype == torch.float16:
            result = result.contiguous()
        return result

    torch.Tensor.permute = new_torch_tensor_permute

The last time I tried a torch nightly, only the torch.empty fix was required for straight SDXL text2image, but I've also heard that
fp16 in torch is broken on macOS Sonoma :-(

Sign up or log in to comment