"""Gradio demo: steer a pretrained StyleGAN3 generator with CLIP.

A text prompt (or an optional reference image) is embedded with CLIP, a good
starting latent is picked from random samples, and the latent is then
optimised so that CLIP embeddings of the generated image move towards the
target.  The final image and an MP4 of the optimisation are returned.
"""
import os
import subprocess
import sys


def _run_setup(command):
    """Run a start-up command given as an argument list; warn on failure.

    Failures are reported but not raised so that a restart with an already
    prepared environment keeps working (best-effort, like ``os.system``).
    """
    status = subprocess.run(command, check=False).returncode
    if status != 0:
        print(f"setup command failed ({status}): {' '.join(command)}",
              file=sys.stderr)
    return status


# Dependencies are installed at start-up, so the third-party imports below
# must stay *after* these commands.
# NOTE(review): the wheel index URL was missing after ``-f``; reconstructed
# as the standard PyTorch find-links page -- confirm.
_run_setup([
    sys.executable, "-m", "pip", "install", "--upgrade",
    "torch==1.9.1+cu111", "torchvision==0.10.1+cu111",
    "-f", "https://download.pytorch.org/whl/torch_stable.html",
])
# NOTE(review): the clone URL was missing; the editable install of ./CLIP
# right below implies the OpenAI CLIP repository -- confirm.
if not os.path.isdir("CLIP"):
    _run_setup(["git", "clone", "https://github.com/openai/CLIP"])
_run_setup([sys.executable, "-m", "pip", "install", "-e", "./CLIP"])
_run_setup([
    sys.executable, "-m", "pip", "install",
    "einops", "ninja", "scipy", "numpy", "Pillow", "tqdm",
    "imageio-ffmpeg", "imageio",
])

sys.path.append('./CLIP')

import io  # noqa: E402
import time  # noqa: E402
import pickle  # noqa: E402
import shutil  # noqa: E402

import numpy as np  # noqa: E402
from PIL import Image  # noqa: E402
import torch  # noqa: E402
import torch.nn.functional as F  # noqa: E402
import requests  # noqa: E402
import torchvision.transforms as transforms  # noqa: E402
import torchvision.transforms.functional as TF  # noqa: E402
import clip  # noqa: E402
from tqdm.notebook import tqdm  # noqa: E402
from torchvision.transforms import Compose, Resize, ToTensor, Normalize  # noqa: E402
from einops import rearrange  # noqa: E402
import gradio as gr  # noqa: E402
import imageio  # noqa: E402

print(torch.cuda.get_device_name(0))
device = torch.device('cuda:0')


def fetch(url_or_path):
    """Open a local path or an http(s) URL as a binary file object.

    URLs are downloaded fully into memory.  The returned stream is positioned
    at offset 0 so callers can read it directly.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        OSError: if a local path cannot be opened.
    """
    if str(url_or_path).startswith(('http://', 'https://')):
        response = requests.get(url_or_path, timeout=30)
        response.raise_for_status()
        # BytesIO(initial_bytes) starts at position 0, unlike write(), which
        # leaves the cursor at the end of the data.
        return io.BytesIO(response.content)
    return open(url_or_path, 'rb')


def fetch_model(url_or_path):
    """Make sure a model file is present locally and return its file name.

    The file is looked up by the basename of ``url_or_path`` in the current
    directory and downloaded with ``wget -c`` only when it is missing.

    Raises:
        subprocess.CalledProcessError: if the download fails.
    """
    basename = os.path.basename(url_or_path)
    if not os.path.exists(basename):
        # Argument list (no shell) so the URL is passed through verbatim.
        subprocess.run(["wget", "-c", url_or_path], check=True)
    return basename


def norm1(prompt):
    """Normalize to the unit sphere along the last dimension."""
    return prompt / prompt.square().sum(dim=-1, keepdim=True).sqrt()


def spherical_dist_loss(x, y):
    """Return a squared great-circle-style distance between ``x`` and ``y``.

    Both inputs are L2-normalised along the last dimension first; the result
    has that dimension reduced away.
    """
    x = F.normalize(x, dim=-1)
    y = F.normalize(y, dim=-1)
    return (x - y).norm(dim=-1).div(2).arcsin().pow(2).mul(2)


class MakeCutouts(torch.nn.Module):
    """Sample ``cutn`` random square crops and pool each to ``cut_size``."""

    def __init__(self, cut_size, cutn, cut_pow=1.):
        super().__init__()
        self.cut_size = cut_size  # output side length of every cutout
        self.cutn = cutn          # number of cutouts per call
        self.cut_pow = cut_pow    # exponent applied to the uniform size draw

    def forward(self, input):
        """Return all cutouts concatenated along the batch dimension.

        ``input`` is indexed as ``[:, :, y, x]``; the result stacks ``cutn``
        groups of ``input.shape[0]`` crops, each ``cut_size`` square.
        """
        sideY, sideX = input.shape[2:4]
        max_size = min(sideX, sideY)
        min_size = min(sideX, sideY, self.cut_size)
        cutouts = []
        for _ in range(self.cutn):
            size = int(torch.rand([]) ** self.cut_pow * (max_size - min_size) + min_size)
            offsetx = torch.randint(0, sideX - size + 1, ())
            offsety = torch.randint(0, sideY - size + 1, ())
            cutout = input[:, :, offsety:offsety + size, offsetx:offsetx + size]
            cutouts.append(F.adaptive_avg_pool2d(cutout, self.cut_size))
        # embed_image() un-flattens a "(cc n)" leading axis, so the cutouts
        # must come back as one concatenated batch rather than None.
        return torch.cat(cutouts)


make_cutouts = MakeCutouts(224, 32, 0.5)


def embed_image(image):
    """Embed random cutouts of an image batch with CLIP.

    Returns a tensor arranged as ``(cutouts, batch, channels)``.
    """
    n = image.shape[0]
    cutouts = make_cutouts(image)
    embeds = clip_model.embed_cutout(cutouts)
    embeds = rearrange(embeds, '(cc n) c -> cc n c', n=n)
    return embeds


def _embed_pil(image):
    """Return one CLIP embedding for a PIL image, averaged over cutouts."""
    tensor = TF.to_tensor(image.convert('RGB')).to(device).unsqueeze(0)
    return embed_image(tensor).mean(0).squeeze(0)


def embed_url(url):
    """Fetch an image from a URL or path and return its mean CLIP embedding."""
    # NOTE(review): the image-loading expression was garbled; reconstructed
    # as open -> convert('RGB') from the surviving "'RGB')" fragment.
    with fetch(url) as fd:
        # convert() forces the pixel data to load before the stream closes.
        image = Image.open(fd).convert('RGB')
    return _embed_pil(image)


class CLIP(object):
    """Frozen CLIP ViT-B/32 wrapper producing unit-norm embeddings."""

    def __init__(self):
        clip_arch = "ViT-B/32"
        self.model, _ = clip.load(clip_arch)
        self.model = self.model.requires_grad_(False)
        self.normalize = transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                                              std=[0.26862954, 0.26130258, 0.27577711])

    @torch.no_grad()
    def embed_text(self, prompt):
        """Normalized clip text embedding."""
        return norm1(self.model.encode_text(clip.tokenize(prompt).to(device)).float())

    def embed_cutout(self, image):
        """Normalized clip image embedding."""
        return norm1(self.model.encode_image(self.normalize(image)))


clip_model = CLIP()

# Load stylegan model
# NOTE(review): base_url was an empty string; reconstructed as the NVIDIA NGC
# location of the official StyleGAN3 checkpoints -- confirm.
base_url = "https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/"
model_name = "stylegan3-t-ffhqu-1024x1024.pkl"
# Alternative checkpoints:
#   "stylegan3-r-metfacesu-1024x1024.pkl"
#   "stylegan3-t-afhqv2-512x512.pkl"
network_url = base_url + model_name

# SECURITY: pickle.load executes arbitrary code from the file; only load
# checkpoints from a trusted source.
# NOTE(review): unpickling presumably needs the StyleGAN3 support packages
# (dnnlib, torch_utils) importable from this directory -- confirm.
with open(fetch_model(network_url), 'rb') as fp:
    G = pickle.load(fp)['G_ema'].to(device)

# Per-channel std of mapped latents, used to optimise in a whitened space.
with torch.no_grad():
    zs = torch.randn([10000, G.mapping.z_dim], device=device)
    w_stds = G.mapping(zs, None).std(0)


def inference(text, steps, image):
    """Generate an image matching a text prompt or a reference image.

    Args:
        text: prompt, used as the target when ``image`` is not given.
        steps: number of optimisation steps (at least 1).
        image: optional PIL image; when given it replaces the text target.

    Returns:
        ``(final PIL image, path to an MP4 of all intermediate frames)``.

    Raises:
        ValueError: if ``steps`` is smaller than 1.
    """
    steps = int(steps)  # UI sliders may deliver floats; range() needs an int
    if steps < 1:
        raise ValueError("steps must be at least 1")

    # NOTE(review): the image branch was garbled in the source; it now embeds
    # the PIL image the same way embed_url() does -- confirm.
    if image is not None:
        target = _embed_pil(image)
    else:
        target = clip_model.embed_text(text)

    torch.manual_seed(2)

    # Pick the best of 8 x 4 random latents as the starting point.
    with torch.no_grad():
        qs = []
        losses = []
        for _ in range(8):
            q = (G.mapping(torch.randn([4, G.mapping.z_dim], device=device), None,
                           truncation_psi=0.7) - G.mapping.w_avg) / w_stds
            images = G.synthesis(q * w_stds + G.mapping.w_avg)
            embeds = embed_image(images.add(1).div(2))
            loss = spherical_dist_loss(embeds, target).mean(0)
            best = torch.argmin(loss)
            qs.append(q[best])
            losses.append(loss[best])
        qs = torch.stack(qs)
        losses = torch.stack(losses)
        print(losses)
        print(losses.shape, qs.shape)
        q = qs[torch.argmin(losses)].unsqueeze(0)
        q.requires_grad_()

    q_ema = q
    opt = torch.optim.AdamW([q], lr=0.03, betas=(0.0, 0.999))

    all_frames = []
    loop = tqdm(range(steps))
    for _ in loop:
        opt.zero_grad()
        w = q * w_stds
        image = G.synthesis(w + G.mapping.w_avg, noise_mode='const')
        embed = embed_image(image.add(1).div(2))
        loss = spherical_dist_loss(embed, target).mean()
        loss.backward()
        opt.step()
        loop.set_postfix(loss=loss.item(), q_magnitude=q.std().item())

        # The preview is never back-propagated, so keep it (and the running
        # average) out of autograd to avoid an ever-growing graph.
        with torch.no_grad():
            q_ema = q_ema * 0.9 + q * 0.1
            preview = G.synthesis(q_ema * w_stds + G.mapping.w_avg, noise_mode='const')
        pil_image = TF.to_pil_image(preview[0].add(1).div(2).clamp(0, 1))
        all_frames.append(pil_image)

    # Context manager guarantees the video file is finalised even on error.
    with imageio.get_writer('test.mp4', fps=15) as writer:
        for frame in all_frames:
            writer.append_data(np.array(frame))
    return pil_image, "test.mp4"


title = "StyleGAN3+CLIP"
description = "Gradio demo for StyleGAN3+CLIP: Generates images (mostly faces) using StyleGAN3 with CLIP guidance. To use it, simply add your text, or click one of the examples to load them. Read more at the links below."
# NOTE(review): the links in this text were stripped from the source and are
# reconstructed -- confirm.
# TODO: the text originally started with a "Colab" link whose target could
# not be recovered; restore it.
article = (
    "<p style='text-align: center'>"
    "Written by nshepperd (https://github.com/nshepperd, https://twitter.com/nshepperd1). "
    "Thanks to Katherine Crowson (https://github.com/crowsonkb, https://twitter.com/RiversHaveWings) "
    "for coming up with many improved sampling tricks, as well as some of the code"
    "</p>"
)
examples = [['mario', 150, None]]

gr.Interface(
    inference,
    [
        "text",
        gr.inputs.Slider(minimum=50, maximum=200, step=1, default=150, label="steps"),
        gr.inputs.Image(type="pil", label="Image (Optional)", optional=True),
    ],
    [gr.outputs.Image(type="pil", label="Output"), "playable_video"],
    title=title,
    description=description,
    article=article,
    enable_queue=True,
    examples=examples,
).launch(debug=True)