FYI: Python code that runs locally on Apple Silicon Macs (tested successfully on M1 Max with 64 GB RAM)
The code below (which includes minor adjustments to the code suggested on the model card) runs successfully on Apple Silicon given a sufficient amount of RAM (64 GB appears to be necessary to run SDXL 1.0; to be confirmed).
NOTE: Additionally, you will need to set the environment variable PYTORCH_ENABLE_MPS_FALLBACK=1
so that the CPU is used as a fallback for operations that the MPS backend does not support.
Enjoy!
Make sure to first install the libraries: pip install accelerate transformers safetensors diffusers
import torch
import numpy as np
from PIL import Image
from transformers import DPTFeatureExtractor, DPTForDepthEstimation
from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, AutoencoderKL
from diffusers.utils import load_image
# Depth-estimation model; moved to the Apple-Silicon GPU backend ("mps").
depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas")
depth_estimator = depth_estimator.to("mps")
# Pre-processor that turns an input image into the estimator's pixel tensor.
feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")

# Depth ControlNet for SDXL, loaded from safetensors weights.
controlnet = ControlNetModel.from_pretrained(
    "diffusers/controlnet-depth-sdxl-1.0", use_safetensors=True
)
controlnet = controlnet.to("mps")

# Replacement VAE passed to the pipeline below.
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix")
vae = vae.to("mps")

# SDXL base pipeline wired up with the ControlNet and VAE loaded above.
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    controlnet=controlnet,
    vae=vae,
    use_safetensors=True,
)
pipe = pipe.to("mps")
def get_depth_map(image):
    """Estimate a depth map for *image* and return it as an RGB PIL image.

    The input is pre-processed with the module-level ``feature_extractor``,
    run through the module-level ``depth_estimator`` on the "mps" device,
    resized to 1024x1024 with bicubic interpolation, min-max normalised and
    replicated across three channels.

    Args:
        image: Input accepted by ``feature_extractor(images=...)``.

    Returns:
        A ``PIL.Image.Image`` built from the uint8 depth map of the first
        (index 0) batch element.
    """
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to("mps")

    # Inference only: no gradients are needed.
    with torch.no_grad(), torch.autocast("cpu"):
        depth_map = depth_estimator(pixel_values).predicted_depth

    # Add a channel axis, then resize to the resolution used for generation.
    depth_map = torch.nn.functional.interpolate(
        depth_map.unsqueeze(1),
        size=(1024, 1024),
        mode="bicubic",
        align_corners=False,
    )

    # Per-sample min-max normalisation.
    depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
    depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
    # A perfectly flat depth map has max == min; clamp the denominator so the
    # division yields zeros instead of NaN.
    depth_range = (depth_max - depth_min).clamp_min(torch.finfo(depth_map.dtype).eps)
    depth_map = (depth_map - depth_min) / depth_range

    # Replicate the single depth channel three times to obtain an RGB image.
    rgb = torch.cat([depth_map] * 3, dim=1)
    # Channels-last layout on the CPU, first batch element only.
    rgb = rgb.permute(0, 2, 3, 1).cpu().numpy()[0]
    return Image.fromarray((rgb * 255.0).clip(0, 255).astype(np.uint8))
prompt = "stormtrooper lecture, photorealistic"
# Reference picture from which the conditioning depth map is derived.
# (The host name previously carried a stray trailing dot: "huggingface.co./".)
image = load_image(
    "https://huggingface.co/lllyasviel/sd-controlnet-depth/resolve/main/images/stormtrooper.png"
)
controlnet_conditioning_scale = 0.5  # recommended for good generalization

depth_image = get_depth_map(image)

# Generate with the depth map as the ControlNet conditioning image.
images = pipe(
    prompt,
    image=depth_image,
    num_inference_steps=30,
    controlnet_conditioning_scale=controlnet_conditioning_scale,
).images

# Bare expression: shows the image when run as the last line of a notebook
# cell; it has no effect when run as a script.
images[0]
images[0].save("stormtrooper.png")
Thanks for this!
Would you like to open a PR to the repository to include your suggestions?
Although this doesn't include a depth-map pre-processor, and so uses a predefined depth map,
it will run on an 8 GB M1 (but very slowly: 50-ish seconds per iteration, and using a fair bit of swap).
16 GB would be better.
from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, AutoencoderKL, StableDiffusionControlNetPipeline
from PIL import Image
from torch import mps
import torch
import fp16fixes
# Install the float16 monkey-patches before any model is created.
fp16fixes.fp16_fixes()
# Per the PyTorch documentation, a fraction of 0 means unlimited MPS
# allocations (the system may then swap or fail when memory runs out).
torch.mps.set_per_process_memory_fraction(0.0)

prompt = "Photograph, scary looking willowy person at night under a full moon"
negative_prompt = 'low quality, bad quality, sketches'
# Predefined depth map read from disk; no depth estimator is run here.
image = Image.open("inferno_from_midas.png")
controlnet_conditioning_scale = 0.5  # recommended for good generalization

# Load every component in float16 from the 'fp16' weight variants.
controlnet = ControlNetModel.from_pretrained(
    "diffusers/controlnet-depth-sdxl-1.0",
    torch_dtype=torch.float16,
    variant='fp16',
).to('mps')
vae = AutoencoderKL.from_pretrained(
    "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
).to('mps')
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    controlnet=controlnet,
    vae=vae,
    variant='fp16',
    torch_dtype=torch.float16,
).to('mps')

# Memory-saving options offered by the pipeline.
pipe.enable_attention_slicing()
pipe.enable_vae_slicing()
pipe.enable_vae_tiling()

images = pipe(
    prompt,
    # Previously defined but never handed to the pipeline.
    negative_prompt=negative_prompt,
    image=image,
    controlnet_conditioning_scale=controlnet_conditioning_scale,
    num_inference_steps=30,
).images
images[0].save("hug_depth.png")
What fp16fixes.fp16_fixes() needs to do depends on the version of torch, but for general use it is:
import torch
def fp16_fixes():
    """Monkey-patch torch to work around float16 problems on the MPS backend.

    Three patches are installed:

    * ``torch.empty`` is replaced by ``torch.zeros`` (only when the MPS
      backend is available).
    * ``torch.nn.functional.layer_norm`` is wrapped so that float16 MPS
      inputs are computed in float32 and the result is cast back to float16.
    * ``torch.Tensor.permute`` is wrapped so that the result for float16 MPS
      inputs is made contiguous.

    Tensors that are not float16-on-MPS pass through both wrappers unchanged.
    """
    if torch.backends.mps.is_available():
        torch.empty = torch.zeros

    _torch_layer_norm = torch.nn.functional.layer_norm

    def new_layer_norm(input, normalized_shape, weight=None, bias=None, eps=1e-05):
        if input.device.type == "mps" and input.dtype == torch.float16:
            # Up-cast the input and the optional affine parameters, run the
            # normalisation in float32, then return to float16.
            input = input.float()
            if weight is not None:
                weight = weight.float()
            if bias is not None:
                bias = bias.float()
            return _torch_layer_norm(input, normalized_shape, weight, bias, eps).half()
        return _torch_layer_norm(input, normalized_shape, weight, bias, eps)

    torch.nn.functional.layer_norm = new_layer_norm

    # Keep the original method so every calling convention it accepts
    # (``x.permute(0, 1)``, ``x.permute((0, 1))``, ``x.permute(dims=(0, 1))``)
    # keeps working through the wrapper.
    _torch_tensor_permute = torch.Tensor.permute

    def new_torch_tensor_permute(input, *dims, **kwargs):
        result = _torch_tensor_permute(input, *dims, **kwargs)
        # Compare the device *type* string, as the layer-norm wrapper does;
        # comparing the device object itself to "mps" does not match.
        if input.device.type == "mps" and input.dtype == torch.float16:
            result = result.contiguous()
        return result

    torch.Tensor.permute = new_torch_tensor_permute
The last time I tried using a torch nightly, only the torch.empty fix was required for a straight SDXL text2image, but I've also heard
that fp16 in torch is broken on macOS Sonoma :-(