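# Video-to-video generation with Stable Diffusion XL and two ControlNets:
# TemporalNet (conditioned on the previously generated frame, for temporal
# consistency) and Canny (conditioned on edges of the current source frame,
# for structural consistency). The input video is split into frames, each
# frame is re-rendered with the prompt, and the results are saved as numbered PNGs.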
import os
import cv2
import torch
import argparse
from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
from diffusers.utils import load_image
import numpy as np
from PIL import Image

def split_video_into_frames(video_path, frames_dir):
    # Extract every frame of the input video as a numbered PNG
    if not os.path.exists(frames_dir):
        os.makedirs(frames_dir)
    print(f"Splitting video {video_path} into frames")
    vidcap = cv2.VideoCapture(video_path)
    success, image = vidcap.read()
    count = 0
    while success:
        frame_path = os.path.join(frames_dir, f"frame{count:04d}.png")
        cv2.imwrite(frame_path, image)
        success, image = vidcap.read()
        count += 1
    vidcap.release()

def frame_number(frame_filename):
    # Extract the frame number from the filename and convert it to an integer
    return int(frame_filename[5:-4])

def count_frame_images(frames_dir):
    # Count the frame images already present (0 if the directory does not exist yet)
    if not os.path.isdir(frames_dir):
        return 0
    frame_files = [f for f in os.listdir(frames_dir) if f.startswith('frame') and f.endswith('.png')]
    return len(frame_files)

# Argument parser
parser = argparse.ArgumentParser(description='Generate images based on video frames.')
parser.add_argument('--prompt', default='a woman', help='The Stable Diffusion prompt.')
parser.add_argument('--video_path', default='./None.mp4', help='Path to the input video file.')
parser.add_argument('--frames_dir', default='./frames', help='Directory to save the extracted video frames.')
parser.add_argument('--output_frames_dir', default='./output_frames', help='Directory to save the generated images.')
parser.add_argument('--init_image_path', default=None, help='Path to an optional initial conditioning image; defaults to the first extracted frame.')

args = parser.parse_args()
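
# Example invocation (script name and paths are illustrative):
#   python generate_frames.py --video_path ./input.mp4 --prompt "a woman" \
#       --frames_dir ./frames --output_frames_dir ./output_frames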

video_path = args.video_path
frames_dir = args.frames_dir
output_frames_dir = args.output_frames_dir
init_image_path = args.init_image_path
prompt = args.prompt

# If frame images do not already exist, split video into frames
if count_frame_images(frames_dir) == 0:
    split_video_into_frames(video_path, frames_dir)

# Create output frames directory if it doesn't exist
if not os.path.exists(output_frames_dir):
    os.makedirs(output_frames_dir)

# Load the initial conditioning image, if provided
if init_image_path:
    print(f"using image {init_image_path}")
    last_generated_image = load_image(init_image_path)
else:
    initial_frame_path = os.path.join(frames_dir, "frame0000.png")
    last_generated_image = load_image(initial_frame_path)

base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
controlnet1_path = "CiaraRowles/controlnet-temporalnet-sdxl-1.0"
controlnet2_path = "diffusers/controlnet-canny-sdxl-1.0"

controlnet = [
    ControlNetModel.from_pretrained(controlnet1_path, torch_dtype=torch.float16),
    ControlNetModel.from_pretrained(controlnet2_path, torch_dtype=torch.float16)
]
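# The order of ControlNets above must match the order of conditioning images
# passed to the pipeline call below (previous generated frame first, Canny
# edge map second), and likewise the controlnet_conditioning_scale list.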
#controlnet = ControlNetModel.from_pretrained(controlnet2_path, torch_dtype=torch.float16)

pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    base_model_path, controlnet=controlnet, torch_dtype=torch.float16
)

#pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
#pipe.enable_xformers_memory_efficient_attention()
# Offload model components to the CPU when idle to reduce peak VRAM usage
pipe.enable_model_cpu_offload()

# Fixed seed so results are reproducible between runs
generator = torch.manual_seed(7)

# Loop over the saved frames in numerical order
frame_files = sorted(os.listdir(frames_dir), key=frame_number)

for i, frame_file in enumerate(frame_files):
    # Load the original video frame; its Canny edges condition the second ControlNet,
    # while the previously generated frame conditions the first (TemporalNet)
    control_image_path = os.path.join(frames_dir, frame_file)
    control_image = load_image(control_image_path)
    
    canny_image = np.array(control_image)
    canny_image = cv2.Canny(canny_image, 25, 200)
    canny_image = canny_image[:, :, None]
    canny_image = np.concatenate([canny_image, canny_image, canny_image], axis=2)
    canny_image = Image.fromarray(canny_image)

    # Generate image
    image = pipe(
        prompt, num_inference_steps=20, generator=generator,
        image=[last_generated_image, canny_image],
        controlnet_conditioning_scale=[0.6, 0.7]
        #prompt, num_inference_steps=20, generator=generator, image=canny_image, controlnet_conditioning_scale=0.5
    ).images[0]
    
    # Save the generated image to output folder
    output_path = os.path.join(output_frames_dir, f"output{i:04d}.png")
    image.save(output_path)

    # Save the Canny image for reference
    canny_image_path = os.path.join(output_frames_dir, f"outputcanny{i:04d}.png")
    canny_image.save(canny_image_path)

    # Update the last_generated_image with the newly generated image for the next iteration
    last_generated_image = image

    print(f"Saved generated image for frame {i} to {output_path}")