danyloylo committed on
Commit b06793d
1 Parent(s): 2cbe6be

Upload 15 files

config.json ADDED
@@ -0,0 +1,42 @@
+ {
+   "_class_name": "ControlNetModel",
+   "_diffusers_version": "0.15.0.dev0",
+   "_name_or_path": "/home/josephcatrambone/ControlNet/models",
+   "act_fn": "silu",
+   "attention_head_dim": 8,
+   "block_out_channels": [
+     320,
+     640,
+     1280,
+     1280
+   ],
+   "class_embed_type": null,
+   "conditioning_embedding_out_channels": [
+     16,
+     32,
+     96,
+     256
+   ],
+   "controlnet_conditioning_channel_order": "rgb",
+   "cross_attention_dim": 768,
+   "down_block_types": [
+     "CrossAttnDownBlock2D",
+     "CrossAttnDownBlock2D",
+     "CrossAttnDownBlock2D",
+     "DownBlock2D"
+   ],
+   "downsample_padding": 1,
+   "flip_sin_to_cos": true,
+   "freq_shift": 0,
+   "in_channels": 4,
+   "layers_per_block": 2,
+   "mid_block_scale_factor": 1,
+   "norm_eps": 1e-05,
+   "norm_num_groups": 32,
+   "num_class_embeds": null,
+   "only_cross_attention": false,
+   "projection_class_embeddings_input_dim": null,
+   "resnet_time_scale_shift": "default",
+   "upcast_attention": null,
+   "use_linear_projection": false
+ }
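
The config above is a standard diffusers ControlNetModel description with SD 1.5 geometry (cross_attention_dim of 768, four latent input channels). A minimal loading sketch follows, assuming the diffusers ControlNetModel and StableDiffusionControlNetPipeline APIs from roughly the 0.15 era noted in _diffusers_version; the local path, base model id, conditioning image, and prompt are illustrative rather than part of this commit.

# Hedged sketch: load the uploaded diffusers-format ControlNet and drive an SD 1.5 pipeline with it.
import torch
from PIL import Image
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline

controlnet = ControlNetModel.from_pretrained(
    "./",  # repo root holding config.json + diffusion_pytorch_model.bin (illustrative local checkout)
    torch_dtype=torch.float16,
)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # assumed SD 1.5 base, matching cross_attention_dim == 768
    controlnet=controlnet,
    torch_dtype=torch.float16,
).to("cuda")

face_pose = Image.open("face_pose.png")  # illustrative: an annotation produced by laion_face_common below
result = pipe("a portrait photo", image=face_pose, num_inference_steps=20).images[0]
result.save("output.png")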
control_v2p_sd15_mediapipe_face.full.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a2a71953d7372d5585899b44693a7532ebbf80c091108ae2b8987ca93cc2dac2
+ size 8601300183
control_v2p_sd15_mediapipe_face.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2f2ccead3a8c0b9fbf9cad7b8eaa29834983ced916c766a92fb84db34ff29e43
+ size 1445239863
control_v2p_sd15_mediapipe_face.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5be501156709895f0b14a7ec76faae7cf0a105f76895252a2c69db541629628f
+ size 1445154814
control_v2p_sd15_mediapipe_face.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
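
This YAML is the model description consumed by cldm.model.create_model in the ControlNet codebase, and it pairs with the .full.ckpt uploaded above. A minimal sketch of that loading path, mirroring the top of gradio_face2image.py below but pointed at the sd15 filenames in this commit (the gradio script itself references sd21 paths, so treat these filenames as an assumption):

# Hedged sketch: build the ControlLDM described by this YAML and load the full checkpoint.
from cldm.model import create_model, load_state_dict
from cldm.ddim_hacked import DDIMSampler

model = create_model('./control_v2p_sd15_mediapipe_face.yaml').cpu()
model.load_state_dict(load_state_dict('./control_v2p_sd15_mediapipe_face.full.ckpt', location='cuda'))
model = model.cuda()
ddim_sampler = DDIMSampler(model)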
diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f63de389f776b75bb11f10487a187573aea84f9a51debd08f314bd084e7fb362
+ size 1445254969
diffusion_pytorch_model.fp16.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0c37b3dd41e956160909129b50f84fd938116550727b491192cbdbe6f896cd7b
+ size 722696633
diffusion_pytorch_model.fp16.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9fb50465b4fd7e15f0dc7df8031767e57309cfda2917082485bcf6c11bedb540
+ size 722598642
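
The .bin/.safetensors pairs above follow the diffusers "fp16" variant naming convention, so the half-precision weights should be selectable at load time. A small sketch, assuming the variant and use_safetensors arguments of from_pretrained are available in the diffusers release in use:

# Hedged sketch: select diffusion_pytorch_model.fp16.safetensors instead of the full-precision .bin.
import torch
from diffusers import ControlNetModel

controlnet = ControlNetModel.from_pretrained(
    "./",  # repo root, as in the earlier sketch
    variant="fp16",
    use_safetensors=True,
    torch_dtype=torch.float16,
)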
gradio_face2image.py ADDED
@@ -0,0 +1,105 @@
+ import os
+ from typing import Mapping
+
+ import gradio as gr
+ import numpy
+ import torch
+ import random
+ from PIL import Image
+
+ from cldm.model import create_model, load_state_dict
+ from cldm.ddim_hacked import DDIMSampler
+ from laion_face_common import generate_annotation
+ from share import *
+
+
+ model = create_model('./control_v2p_sd21_mediapipe_face.yaml').cpu()
+ model.load_state_dict(load_state_dict('./control_v2p_sd21_mediapipe_face.full.ckpt', location='cuda'))
+ model = model.cuda()
+ ddim_sampler = DDIMSampler(model)  # ControlNet _only_ works with DDIM.
+
+
+ def process(input_image: Image.Image, prompt, a_prompt, n_prompt, max_faces, num_samples, ddim_steps, guess_mode, strength, scale, seed, eta):
+     with torch.no_grad():
+         empty = generate_annotation(input_image, max_faces)
+         visualization = Image.fromarray(empty)  # Save to help debug.
+
+         empty = numpy.moveaxis(empty, 2, 0)  # h, w, c -> c, h, w
+         control = torch.from_numpy(empty.copy()).float().cuda() / 255.0
+         control = torch.stack([control for _ in range(num_samples)], dim=0)
+         # control = einops.rearrange(control, 'b h w c -> b c h w').clone()
+
+         # Sanity check the dimensions.
+         B, C, H, W = control.shape
+         assert C == 3
+         assert B == num_samples
+
+         if seed != -1:
+             random.seed(seed)
+             os.environ['PYTHONHASHSEED'] = str(seed)
+             numpy.random.seed(seed)
+             torch.manual_seed(seed)
+             torch.cuda.manual_seed(seed)
+             torch.backends.cudnn.deterministic = True
+
+         if config.save_memory:
+             model.low_vram_shift(is_diffusing=False)
+
+         cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]}
+         un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]}
+         shape = (4, H // 8, W // 8)
+
+         if config.save_memory:
+             model.low_vram_shift(is_diffusing=True)
+
+         model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13)  # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
+         samples, intermediates = ddim_sampler.sample(
+             ddim_steps,
+             num_samples,
+             shape,
+             cond,
+             verbose=False,
+             eta=eta,
+             unconditional_guidance_scale=scale,
+             unconditional_conditioning=un_cond
+         )
+
+         if config.save_memory:
+             model.low_vram_shift(is_diffusing=False)
+
+         x_samples = model.decode_first_stage(samples)
+         # x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(numpy.uint8)
+         x_samples = numpy.moveaxis((x_samples * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(numpy.uint8), 1, -1)  # b, c, h, w -> b, h, w, c
+         results = [visualization] + [x_samples[i] for i in range(num_samples)]
+
+     return results
+
+
+ block = gr.Blocks().queue()
+ with block:
+     with gr.Row():
+         gr.Markdown("## Control Stable Diffusion with a Facial Pose")
+     with gr.Row():
+         with gr.Column():
+             input_image = gr.Image(source='upload', type="numpy")
+             prompt = gr.Textbox(label="Prompt")
+             run_button = gr.Button(label="Run")
+             with gr.Accordion("Advanced options", open=False):
+                 num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
+                 max_faces = gr.Slider(label="Max Faces", minimum=1, maximum=5, value=1, step=1)
+                 strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01)
+                 guess_mode = gr.Checkbox(label='Guess Mode', value=False)
+                 ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
+                 scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1)
+                 seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
+                 eta = gr.Number(label="eta (DDIM)", value=0.0)
+                 a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed')
+                 n_prompt = gr.Textbox(label="Negative Prompt",
+                                       value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality')
+         with gr.Column():
+             result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
+     ips = [input_image, prompt, a_prompt, n_prompt, max_faces, num_samples, ddim_steps, guess_mode, strength, scale, seed, eta]
+     run_button.click(fn=process, inputs=ips, outputs=[result_gallery])
+
+
+ block.launch(server_name='0.0.0.0')
laion_face_common.py ADDED
@@ -0,0 +1,180 @@
+ from typing import Mapping
+
+ import mediapipe as mp
+ import numpy
+ from PIL import Image
+
+
+ mp_drawing = mp.solutions.drawing_utils
+ mp_drawing_styles = mp.solutions.drawing_styles
+ mp_face_detection = mp.solutions.face_detection  # Only for counting faces.
+ mp_face_mesh = mp.solutions.face_mesh
+ mp_face_connections = mp.solutions.face_mesh_connections.FACEMESH_TESSELATION
+ mp_hand_connections = mp.solutions.hands_connections.HAND_CONNECTIONS
+ mp_body_connections = mp.solutions.pose_connections.POSE_CONNECTIONS
+
+ DrawingSpec = mp.solutions.drawing_styles.DrawingSpec
+ PoseLandmark = mp.solutions.drawing_styles.PoseLandmark
+
+ f_thick = 2
+ f_rad = 1
+ right_iris_draw = DrawingSpec(color=(10, 200, 250), thickness=f_thick, circle_radius=f_rad)
+ right_eye_draw = DrawingSpec(color=(10, 200, 180), thickness=f_thick, circle_radius=f_rad)
+ right_eyebrow_draw = DrawingSpec(color=(10, 220, 180), thickness=f_thick, circle_radius=f_rad)
+ left_iris_draw = DrawingSpec(color=(250, 200, 10), thickness=f_thick, circle_radius=f_rad)
+ left_eye_draw = DrawingSpec(color=(180, 200, 10), thickness=f_thick, circle_radius=f_rad)
+ left_eyebrow_draw = DrawingSpec(color=(180, 220, 10), thickness=f_thick, circle_radius=f_rad)
+ mouth_draw = DrawingSpec(color=(10, 180, 10), thickness=f_thick, circle_radius=f_rad)
+ head_draw = DrawingSpec(color=(10, 200, 10), thickness=f_thick, circle_radius=f_rad)
+
+ # mp_face_mesh.FACEMESH_CONTOURS has all the items we care about.
+ face_connection_spec = {}
+ for edge in mp_face_mesh.FACEMESH_FACE_OVAL:
+     face_connection_spec[edge] = head_draw
+ for edge in mp_face_mesh.FACEMESH_LEFT_EYE:
+     face_connection_spec[edge] = left_eye_draw
+ for edge in mp_face_mesh.FACEMESH_LEFT_EYEBROW:
+     face_connection_spec[edge] = left_eyebrow_draw
+ # for edge in mp_face_mesh.FACEMESH_LEFT_IRIS:
+ #     face_connection_spec[edge] = left_iris_draw
+ for edge in mp_face_mesh.FACEMESH_RIGHT_EYE:
+     face_connection_spec[edge] = right_eye_draw
+ for edge in mp_face_mesh.FACEMESH_RIGHT_EYEBROW:
+     face_connection_spec[edge] = right_eyebrow_draw
+ # for edge in mp_face_mesh.FACEMESH_RIGHT_IRIS:
+ #     face_connection_spec[edge] = right_iris_draw
+ for edge in mp_face_mesh.FACEMESH_LIPS:
+     face_connection_spec[edge] = mouth_draw
+ iris_landmark_spec = {468: right_iris_draw, 473: left_iris_draw}
+
+
+ def draw_pupils(image, landmark_list, drawing_spec, halfwidth: int = 2):
+     """We have a custom function to draw the pupils because the mp.draw_landmarks method requires a parameter for all
+     landmarks. Until our PR is merged into mediapipe, we need this separate method."""
+     if len(image.shape) != 3:
+         raise ValueError("Input image must be H,W,C.")
+     image_rows, image_cols, image_channels = image.shape
+     if image_channels != 3:  # BGR channels
+         raise ValueError('Input image must contain three channel bgr data.')
+     for idx, landmark in enumerate(landmark_list.landmark):
+         if (
+                 (landmark.HasField('visibility') and landmark.visibility < 0.9) or
+                 (landmark.HasField('presence') and landmark.presence < 0.5)
+         ):
+             continue
+         if landmark.x >= 1.0 or landmark.x < 0 or landmark.y >= 1.0 or landmark.y < 0:
+             continue
+         image_x = int(image_cols*landmark.x)
+         image_y = int(image_rows*landmark.y)
+         draw_color = None
+         if isinstance(drawing_spec, Mapping):
+             if drawing_spec.get(idx) is None:
+                 continue
+             else:
+                 draw_color = drawing_spec[idx].color
+         elif isinstance(drawing_spec, DrawingSpec):
+             draw_color = drawing_spec.color
+         image[image_y-halfwidth:image_y+halfwidth, image_x-halfwidth:image_x+halfwidth, :] = draw_color
+
+
+ def reverse_channels(image):
+     """Given a numpy array in RGB form, convert to BGR. Will also convert from BGR to RGB."""
+     # im[:,:,::-1] is a neat hack to convert BGR to RGB by reversing the indexing order.
+     # im[:,:,::[2,1,0]] would also work but makes a copy of the data.
+     return image[:, :, ::-1]
+
+
+ def generate_annotation(
+         input_image: Image.Image,
+         max_faces: int,
+         min_face_size_pixels: int = 0,
+         return_annotation_data: bool = False
+ ):
+     """
+     Find up to 'max_faces' inside the provided input image.
+     If min_face_size_pixels is provided and nonzero it will be used to filter faces that occupy less than this many
+     pixels in the image.
+     If return_annotation_data is TRUE (default: false) then in addition to returning the 'detected face' image, three
+     additional parameters will be returned: faces before filtering, faces after filtering, and an annotation image.
+     The faces_before_filtering return value is the number of faces detected in an image with no filtering.
+     faces_after_filtering is the number of faces remaining after filtering small faces.
+
+     :return:
+     If 'return_annotation_data==True', returns (numpy array, numpy array, int, int).
+     If 'return_annotation_data==False' (default), returns a numpy array.
+     """
+     with mp_face_mesh.FaceMesh(
+             static_image_mode=True,
+             max_num_faces=max_faces,
+             refine_landmarks=True,
+             min_detection_confidence=0.5,
+     ) as facemesh:
+         img_rgb = numpy.asarray(input_image)
+         results = facemesh.process(img_rgb).multi_face_landmarks
+
+         faces_found_before_filtering = len(results)
+
+         # Filter faces that are too small
+         filtered_landmarks = []
+         for lm in results:
+             landmarks = lm.landmark
+             face_rect = [
+                 landmarks[0].x,
+                 landmarks[0].y,
+                 landmarks[0].x,
+                 landmarks[0].y,
+             ]  # Left, up, right, down.
+             for i in range(len(landmarks)):
+                 face_rect[0] = min(face_rect[0], landmarks[i].x)
+                 face_rect[1] = min(face_rect[1], landmarks[i].y)
+                 face_rect[2] = max(face_rect[2], landmarks[i].x)
+                 face_rect[3] = max(face_rect[3], landmarks[i].y)
+             if min_face_size_pixels > 0:
+                 face_width = abs(face_rect[2] - face_rect[0])
+                 face_height = abs(face_rect[3] - face_rect[1])
+                 face_width_pixels = face_width * input_image.size[0]
+                 face_height_pixels = face_height * input_image.size[1]
+                 face_size = min(face_width_pixels, face_height_pixels)
+                 if face_size >= min_face_size_pixels:
+                     filtered_landmarks.append(lm)
+             else:
+                 filtered_landmarks.append(lm)
+
+         faces_remaining_after_filtering = len(filtered_landmarks)
+
+         # Annotations are drawn in BGR for some reason, but we don't need to flip a zero-filled image at the start.
+         empty = numpy.zeros_like(img_rgb)
+
+         # Draw detected faces:
+         for face_landmarks in filtered_landmarks:
+             mp_drawing.draw_landmarks(
+                 empty,
+                 face_landmarks,
+                 connections=face_connection_spec.keys(),
+                 landmark_drawing_spec=None,
+                 connection_drawing_spec=face_connection_spec
+             )
+             draw_pupils(empty, face_landmarks, iris_landmark_spec, 2)
+
+         # Flip BGR back to RGB.
+         empty = reverse_channels(empty)
+
+         # We might have to generate a composite.
+         if return_annotation_data:
+             # Note that we're copying the input image AND flipping the channels so we can draw on top of it.
+             annotated = reverse_channels(numpy.asarray(input_image)).copy()
+             for face_landmarks in filtered_landmarks:
+                 mp_drawing.draw_landmarks(
+                     empty,
+                     face_landmarks,
+                     connections=face_connection_spec.keys(),
+                     landmark_drawing_spec=None,
+                     connection_drawing_spec=face_connection_spec
+                 )
+                 draw_pupils(empty, face_landmarks, iris_landmark_spec, 2)
+             annotated = reverse_channels(annotated)
+
+         if not return_annotation_data:
+             return empty
+         else:
+             return empty, annotated, faces_found_before_filtering, faces_remaining_after_filtering
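
A short usage sketch for generate_annotation, following its docstring; the input filename is illustrative:

# Hedged sketch: annotate a single portrait with the face-mesh drawing specs defined above.
from PIL import Image
from laion_face_common import generate_annotation

img = Image.open("portrait.jpg").convert("RGB")  # illustrative input

# Default call: returns only the H,W,3 uint8 annotation (mesh lines on a black background).
annotation = generate_annotation(img, max_faces=2)
Image.fromarray(annotation).save("face_pose.png")

# With return_annotation_data=True: also a composite over the input plus the before/after face counts.
annotation, composite, faces_before, faces_after = generate_annotation(
    img, max_faces=2, min_face_size_pixels=64, return_annotation_data=True
)
print(f"{faces_before} faces detected, {faces_after} kept after the size filter.")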
laion_face_dataset.py ADDED
@@ -0,0 +1,55 @@
+ import json
+ import numpy
+ import os
+ from PIL import Image
+ from torch.utils.data import Dataset
+
+
+ class LaionDataset(Dataset):
+     def __init__(self):
+         self.data = []
+         with open('./training/laion-face-processed/prompt.jsonl', 'rt') as f:
+             for line in f:
+                 self.data.append(json.loads(line))
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, idx):
+         item = self.data[idx]
+
+         source_filename = os.path.split(item['source'])[-1]
+         target_filename = os.path.split(item['target'])[-1]
+         prompt = item['prompt']
+
+         # If prompt is "" or null, make it something simple.
+         if not prompt:
+             print(f"Image with index {idx} / {source_filename} has no text.")
+             prompt = "an image"
+
+         source_image = Image.open('./training/laion-face-processed/source/' + source_filename).convert("RGB")
+         target_image = Image.open('./training/laion-face-processed/target/' + target_filename).convert("RGB")
+         # Resize the image so that the minimum edge is bigger than 512x512, then crop center.
+         # This may cut off some parts of the face image, but in general they're smaller than 512x512 and we still want
+         # to cover the literal edge cases.
+         img_size = source_image.size
+         scale_factor = 512/min(img_size)
+         source_image = source_image.resize((1+int(img_size[0]*scale_factor), 1+int(img_size[1]*scale_factor)))
+         target_image = target_image.resize((1+int(img_size[0]*scale_factor), 1+int(img_size[1]*scale_factor)))
+         img_size = source_image.size
+         left_padding = (img_size[0] - 512)//2
+         top_padding = (img_size[1] - 512)//2
+         source_image = source_image.crop((left_padding, top_padding, left_padding+512, top_padding+512))
+         target_image = target_image.crop((left_padding, top_padding, left_padding+512, top_padding+512))
+
+         source = numpy.asarray(source_image)
+         target = numpy.asarray(target_image)
+
+         # Normalize source images to [0, 1].
+         source = source.astype(numpy.float32) / 255.0
+
+         # Normalize target images to [-1, 1].
+         target = (target.astype(numpy.float32) / 127.5) - 1.0
+
+         return dict(jpg=target, txt=prompt, hint=source)
+
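
A quick sketch of what one sample from LaionDataset looks like, per the normalization comments above; it assumes the ./training/laion-face-processed layout the class expects is already populated:

# Hedged sketch: inspect a single training sample.
from laion_face_dataset import LaionDataset

ds = LaionDataset()
sample = ds[0]
# sample["hint"]: float32 source annotation in [0, 1], 512x512x3 after the resize and center crop.
# sample["jpg"]:  float32 target photo in [-1, 1], same shape.
# sample["txt"]:  the caption string ("an image" when the prompt was empty).
print(sample["hint"].shape, sample["jpg"].shape, sample["txt"])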
tool_download_face_targets.py ADDED
@@ -0,0 +1,86 @@
+ #!/usr/bin/python3
+ """
+ tool_download_face_targets.py
+
+ Reads in the metadata from the LAION images and begins downloading all images.
+ """
+
+ import json
+ import os
+ import sys
+ import time
+ import urllib
+ import urllib.request
+ try:
+     from tqdm import tqdm
+ except ImportError:
+     # Wrap this method into the identity.
+     print("TQDM not found. Progress will be quiet without 'verbose'.")
+     def tqdm(x):
+         return x
+
+
+ def main(logfile_path: str, verbose: bool = False, pause_between_fetches: float = 0.0):
+     """Open the metadata.json file from the training directory and fetch all target images."""
+     # Toggle a function pointer so we don't have to check verbosity everywhere.
+     def out(x):
+         pass
+     if verbose:
+         out = print
+
+     log = open(logfile_path, 'at')
+     skipped_image_count = 0
+     errored_image_count = 0
+     successful_image_count = 0
+     if not os.path.exists("training"):
+         print("ERROR: training directory does not exist in the current directory.")
+         print("Has the archive been unzipped?")
+         print("Are you running from the project root?")
+         return 2  # BASH: No such directory.
+     if not os.path.exists("training/laion-face-processed/metadata.json"):
+         print("ERROR: metadata.json was not found in training/laion-face-processed.")
+         return 2
+     with open("training/laion-face-processed/metadata.json", 'rt') as md_in:
+         metadata = json.load(md_in)
+     # Create the directory for targets if it does not exist.
+     if not os.path.exists("training/laion-face-processed/target"):
+         os.mkdir("training/laion-face-processed/target")
+     for image_id, image_data in tqdm(metadata.items()):
+         filename = f"training/laion-face-processed/target/{image_id}.jpg"
+         if os.path.exists(filename):
+             out(f"Skipping {image_id}: file exists.")
+             skipped_image_count += 1
+             continue
+         if not download_file(image_data['url'], filename, verbose):
+             error_message = f"Problem downloading {image_id}"
+             out(error_message)
+             log.write(error_message + "\n")
+             log.flush()  # Flush often in case we crash.
+             errored_image_count += 1
+         if pause_between_fetches > 0.0:
+             time.sleep(pause_between_fetches)
+         successful_image_count += 1
+     log.close()
+     print("Run success.")
+     print(f"{skipped_image_count} images skipped")
+     print(f"{errored_image_count} images failed to download")
+     print(f"{successful_image_count} images downloaded")
+
+
+ def download_file(url: str, output_path: str, verbose: bool = False) -> bool:
+     """Download the file with the given URL and save it to the specified path. Return true on success."""
+     try:
+         r = urllib.request.urlopen(url)
+         if not r.status == 200:
+             return False
+         with open(output_path, 'wb') as fout:
+             fout.write(r.read())
+         return True
+     except Exception as e:
+         if verbose:
+             print(e)
+         return False
+
+
+ if __name__ == "__main__":
+     main("downloads.log", verbose="-v" in sys.argv)
tool_generate_face_poses.py ADDED
@@ -0,0 +1,180 @@
+ import json
+ import os
+ import sys
+ from dataclasses import dataclass, field
+ from glob import glob
+ from typing import Mapping
+
+ from PIL import Image
+ from tqdm import tqdm
+
+ from laion_face_common import generate_annotation
+
+
+ @dataclass
+ class RunProgress:
+     pending: list = field(default_factory=list)
+     success: list = field(default_factory=list)
+     skipped_size: list = field(default_factory=list)
+     skipped_nsfw: list = field(default_factory=list)
+     skipped_noface: list = field(default_factory=list)
+     skipped_smallface: list = field(default_factory=list)
+
+
+ def main(
+         status_filename: str,
+         prompt_filename: str,
+         input_glob: str,
+         output_directory: str,
+         annotated_output_directory: str = "",
+         min_image_size: int = 384,
+         max_image_size: int = 32766,
+         min_face_size_pixels: int = 64,
+         prompt_mapping: dict = None,  # If present, maps a filename to a text prompt.
+ ):
+     status = RunProgress()
+
+     if os.path.exists(status_filename):
+         print("Continuing from checkpoint.")
+         # Restore a saved state:
+         status_temp = json.load(open(status_filename, 'rt'))
+         for k in status.__dict__.keys():
+             status.__setattr__(k, status_temp[k])
+         # Output label file:
+         pout = open(prompt_filename, 'at')
+     else:
+         print("Starting run.")
+         status = RunProgress()
+         status.pending = list(glob(input_glob))
+         # Output label file:
+         pout = open(prompt_filename, 'wt')
+         with open(status_filename, 'wt') as fout:
+             json.dump(status.__dict__, fout)
+
+     print(f"{len(status.pending)} images remaining")
+
+     # If we don't have a preexisting set of labels (like for ImageNet/MSCOCO), just null-fill the mapping.
+     # We will try on a per-image basis to see if there's a metadata .json.
+     if prompt_mapping is None:
+         prompt_mapping = dict()
+
+     step = 0
+     with tqdm(total=len(status.pending)) as pbar:
+         while len(status.pending) > 0:
+             full_filename = status.pending.pop()
+             pbar.update(1)
+             step += 1
+
+             if step % 100 == 0:
+                 # Checkpoint save:
+                 with open(status_filename, 'wt') as fout:
+                     json.dump(status.__dict__, fout)
+
+             _fpath, fname = os.path.split(full_filename)
+
+             # Make our output filenames.
+             # We used to do this here so we could check if a file existed before writing, then skip it, but since we
+             # have a 'status' that we cache and update, we no longer have to do this check.
+             annotation_filename = ""
+             if annotated_output_directory:
+                 annotation_filename = os.path.join(annotated_output_directory, fname)
+             output_filename = os.path.join(output_directory, fname)
+
+             # The LAION dataset has accompanying .json files with each image.
+             partial_filename, extension = os.path.splitext(full_filename)
+             candidate_json_fullpath = partial_filename + ".json"
+             image_metadata = {}
+             if os.path.exists(candidate_json_fullpath):
+                 try:
+                     image_metadata = json.load(open(candidate_json_fullpath, 'rt'))
+                 except Exception as e:
+                     print(e)
+             if "NSFW" in image_metadata:
+                 nsfw_marker = image_metadata.get("NSFW")  # This can be "", None, or other weird things.
+                 if nsfw_marker is not None and nsfw_marker.lower() != "unlikely":
+                     # Skip NSFW images.
+                     status.skipped_nsfw.append(full_filename)
+                     continue
+
+             # Try to get a prompt/caption from the metadata or the prompt mapping.
+             image_prompt = image_metadata.get("caption", prompt_mapping.get(fname, ""))
+
+             # Load image:
+             img = Image.open(full_filename).convert("RGB")
+             img_width = img.size[0]
+             img_height = img.size[1]
+             img_size = min(img.size[0], img.size[1])
+             if img_size < min_image_size or max(img_width, img_height) > max_image_size:
+                 status.skipped_size.append(full_filename)
+                 continue
+
+             # We re-initialize the detector every time because it has a habit of triggering weird race conditions.
+             empty, annotated, faces_before_filtering, faces_after_filtering = generate_annotation(
+                 img,
+                 max_faces=5,
+                 min_face_size_pixels=min_face_size_pixels,
+                 return_annotation_data=True
+             )
+             if faces_before_filtering == 0:
+                 # Skip images with no faces.
+                 status.skipped_noface.append(full_filename)
+                 continue
+             if faces_after_filtering == 0:
+                 # Skip images with no faces large enough
+                 status.skipped_smallface.append(full_filename)
+                 continue
+
+             Image.fromarray(empty).save(output_filename)
+             if annotation_filename:
+                 Image.fromarray(annotated).save(annotation_filename)
+
+             # See https://github.com/lllyasviel/ControlNet/blob/main/docs/train.md for the training file format.
+             # prompt.json
+             # a JSONL file with {"source": "source/0.jpg", "target": "target/0.jpg", "prompt": "..."}.
+             # a source/xxxxx.jpg or source/xxxx.png file for each of the inputs.
+             # a target/xxxxx.jpg for each of the outputs.
+             pout.write(json.dumps({
+                 "source": os.path.join(output_directory, fname),
+                 "target": full_filename,
+                 "prompt": image_prompt,
+             }) + "\n")
+             pout.flush()
+             status.success.append(full_filename)
+
+     # We do save every 100 iterations, but it's good to save on completion, too.
+     with open(status_filename, 'wt') as fout:
+         json.dump(status.__dict__, fout)
+
+     pout.close()
+     print("Done!")
+     print(f"{len(status.success)} images added to dataset.")
+     print(f"{len(status.skipped_size)} images rejected for size.")
+     print(f"{len(status.skipped_smallface)} images rejected for having faces too small.")
+     print(f"{len(status.skipped_noface)} images rejected for not having faces.")
+     print(f"{len(status.skipped_nsfw)} images rejected for NSFW.")
+
+
+ if __name__ == "__main__":
+     if len(sys.argv) >= 3 and "-h" not in sys.argv:
+         prompt_jsonl = sys.argv[1]
+         in_glob = sys.argv[2]  # Should probably be in a directory called "target/*.jpg".
+         output_dir = sys.argv[3]  # Should probably be a directory called "source".
+         annotation_dir = ""
+         if len(sys.argv) > 4:
+             annotation_dir = sys.argv[4]
+         main("generate_face_poses_checkpoint.json", prompt_jsonl, in_glob, output_dir, annotation_dir)
+     else:
+         print(f"""Usage:
+         python {sys.argv[0]} prompt.jsonl target/*.jpg source/ [annotated/]
+         source and target are slightly confusing in this context. We are writing the image names to prompt.jsonl, so
+         the naming system has to be consistent with what ControlNet expects. In ControlNet, the source is the input and
+         target is the output. We are generating source images from targets in this application, so the second argument
+         should be a folder full of images. The third argument should be 'source', where the images should be places.
+         Optionally, an 'annotated' directory can be provided. Augmented images will be placed here.
+
+         A checkpoint file named 'generate_face_poses_checkpoint.json' will be created in the place where the script is
+         run. If a run is cancelled, it can be resumed from this checkpoint.
+
+         If invoking the script from bash, do not forget to enclose globs with quotes. Example usage:
+         `python ./tool_generate_face_poses.py ./face_prompt.jsonl "/home/josephcatrambone/training_data/data-mscoco/images/train2017/*" /home/josephcatrambone/training_data/data-mscoco/images/source_2017/`
+         """)
train_laion_face.py ADDED
@@ -0,0 +1,46 @@
+ from share import *
+
+ import pytorch_lightning as pl
+ from torch.utils.data import DataLoader
+ from laion_face_dataset import LaionDataset
+ from cldm.logger import ImageLogger
+ from cldm.model import create_model, load_state_dict
+
+
+ # Configs
+ resume_path = './models/controlnet_sd21_laion_face.ckpt'
+ batch_size = 4
+ logger_freq = 2500
+ learning_rate = 1e-5
+ sd_locked = True
+ only_mid_control = False
+
+
+ # First use cpu to load models. Pytorch Lightning will automatically move it to GPUs.
+ model = create_model('./models/cldm_v21.yaml').cpu()
+ model.load_state_dict(load_state_dict(resume_path, location='cpu'))
+ model.learning_rate = learning_rate
+ model.sd_locked = sd_locked
+ model.only_mid_control = only_mid_control
+
+
+ # Save every so often:
+ ckpt_callback = pl.callbacks.ModelCheckpoint(
+     dirpath="./checkpoints/",
+     filename="ckpt_controlnet_sd21_{epoch}_{step}_{loss}",
+     monitor='train/loss_simple_step',
+     save_top_k=5,
+     every_n_train_steps=5000,
+     save_last=True,
+ )
+
+
+ # Misc
+ dataset = LaionDataset()
+ dataloader = DataLoader(dataset, num_workers=0, batch_size=batch_size, shuffle=True)
+ logger = ImageLogger(batch_frequency=logger_freq)
+ trainer = pl.Trainer(gpus=1, precision=32, callbacks=[logger, ckpt_callback])
+
+
+ # Train!
+ trainer.fit(model, dataloader)
train_laion_face_sd15.py ADDED
@@ -0,0 +1,42 @@
+ from share import *
+
+ import pytorch_lightning as pl
+ from torch.utils.data import DataLoader
+ from laion_face_dataset import LaionDataset
+ from cldm.logger import ImageLogger
+ from cldm.model import create_model, load_state_dict
+
+
+ # Configs
+ resume_path = './models/controlnet_sd15_laion_face.ckpt'
+ batch_size = 8
+ logger_freq = 2500
+ learning_rate = 1e-5
+ sd_locked = True
+ only_mid_control = False
+
+ # First use cpu to load models. Pytorch Lightning will automatically move it to GPUs.
+ model = create_model('./models/cldm_v15.yaml').cpu()
+ model.load_state_dict(load_state_dict(resume_path, location='cpu'))
+ model.learning_rate = learning_rate
+ model.sd_locked = sd_locked
+ model.only_mid_control = only_mid_control
+
+ # Save every so often:
+ ckpt_callback = pl.callbacks.ModelCheckpoint(
+     dirpath="./checkpoints/",
+     filename="controlnet_sd15_laion_face_{epoch}_{step}_{loss}.ckpt",
+     monitor='train/loss_simple_step',
+     save_top_k=5,
+     every_n_train_steps=5000,
+     save_last=True,
+ )
+
+ # Misc
+ dataset = LaionDataset()
+ dataloader = DataLoader(dataset, num_workers=0, batch_size=batch_size, shuffle=True)
+ logger = ImageLogger(batch_frequency=logger_freq)
+ trainer = pl.Trainer(gpus=1, precision=32, callbacks=[logger, ckpt_callback])
+
+ # Train!
+ trainer.fit(model, dataloader)