Add video2video

#9
by multimodalart HF staff - opened
Files changed (6) hide show
  1. .gitattributes +1 -0
  2. app.py +124 -16
  3. horse.mp4 +3 -0
  4. kitten.mp4 +0 -0
  5. requirements.txt +1 -1
  6. train_running.mp4 +0 -0
.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  models/RealESRGAN_x4.pth filter=lfs diff=lfs merge=lfs -text
37
  models/flownet.pkl filter=lfs diff=lfs merge=lfs -text
 
 
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  models/RealESRGAN_x4.pth filter=lfs diff=lfs merge=lfs -text
37
  models/flownet.pkl filter=lfs diff=lfs merge=lfs -text
38
+ horse.mp4 filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -4,9 +4,16 @@ import random
4
  import threading
5
  import time
6
 
 
 
 
 
 
 
7
  import gradio as gr
8
  import torch
9
- from diffusers import CogVideoXPipeline, CogVideoXDDIMScheduler,CogVideoXDPMScheduler
 
10
  from datetime import datetime, timedelta
11
 
12
  from diffusers.image_processor import VaeImageProcessor
@@ -27,6 +34,8 @@ pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, timest
27
  pipe.transformer.to(memory_format=torch.channels_last)
28
  pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
29
 
 
 
30
  os.makedirs("./output", exist_ok=True)
31
  os.makedirs("./gradio_tmp", exist_ok=True)
32
 
@@ -46,6 +55,76 @@ Other times the user will not want modifications , but instead want a new image
46
  Video descriptions must have the same num of words as examples below. Extra words will be ignored.
47
  """
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  def convert_prompt(prompt: str, retry_times: int = 3) -> str:
51
  if not os.environ.get("OPENAI_API_KEY"):
@@ -96,9 +175,10 @@ def convert_prompt(prompt: str, retry_times: int = 3) -> str:
96
  return response.choices[0].message.content
97
  return prompt
98
 
99
-
100
  def infer(
101
  prompt: str,
 
 
102
  num_inference_steps: int,
103
  guidance_scale: float,
104
  seed: int = -1,
@@ -106,16 +186,30 @@ def infer(
106
  ):
107
  if seed == -1:
108
  seed = random.randint(0, 2 ** 8 - 1)
109
- video_pt = pipe(
110
- prompt=prompt,
111
- num_videos_per_prompt=1,
112
- num_inference_steps=num_inference_steps,
113
- num_frames=49,
114
- use_dynamic_cfg=True,
115
- output_type="pt",
116
- guidance_scale=guidance_scale,
117
- generator=torch.Generator(device="cpu").manual_seed(seed),
118
- ).frames
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
  return (video_pt, seed)
121
 
@@ -146,6 +240,7 @@ def delete_old_files():
146
 
147
 
148
  threading.Thread(target=delete_old_files, daemon=True).start()
 
149
 
150
  with gr.Blocks() as demo:
151
  gr.Markdown("""
@@ -170,6 +265,10 @@ with gr.Blocks() as demo:
170
  """)
171
  with gr.Row():
172
  with gr.Column():
 
 
 
 
173
  prompt = gr.Textbox(label="Prompt (Less than 200 Words)", placeholder="Enter your prompt here", lines=5)
174
 
175
  with gr.Row():
@@ -265,14 +364,18 @@ with gr.Blocks() as demo:
265
 
266
 
267
  def generate(prompt,
 
 
268
  seed_value,
269
  scale_status,
270
  rife_status,
271
- progress=gr.Progress(track_tqdm=True)
272
  ):
273
 
274
  latents, seed = infer(
275
  prompt,
 
 
276
  num_inference_steps=50, # NOT Changed
277
  guidance_scale=7.0, # NOT Changed
278
  seed=seed_value,
@@ -308,12 +411,17 @@ with gr.Blocks() as demo:
308
 
309
  generate_button.click(
310
  generate,
311
- inputs=[prompt, seed_param, enable_scale, enable_rife],
312
  outputs=[video_output, download_video_button, download_gif_button, seed_text],
313
  )
314
 
315
  enhance_button.click(enhance_prompt_func, inputs=[prompt], outputs=[prompt])
316
-
 
 
 
 
 
317
  if __name__ == "__main__":
318
  demo.queue(max_size=15)
319
- demo.launch()
 
4
  import threading
5
  import time
6
 
7
+ import cv2
8
+ import numpy as np
9
+ import tempfile
10
+ import imageio
11
+ import imageio_ffmpeg
12
+
13
  import gradio as gr
14
  import torch
15
+ from diffusers import CogVideoXPipeline, CogVideoXDDIMScheduler,CogVideoXDPMScheduler, CogVideoXVideoToVideoPipeline
16
+ from diffusers.utils import export_to_video, load_video
17
  from datetime import datetime, timedelta
18
 
19
  from diffusers.image_processor import VaeImageProcessor
 
34
  pipe.transformer.to(memory_format=torch.channels_last)
35
  pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
36
 
37
+ pipe_video = CogVideoXVideoToVideoPipeline.from_pretrained("THUDM/CogVideoX-5b", transformer=pipe.transformer, vae=pipe.vae, scheduler=pipe.scheduler, tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder, torch_dtype=torch.bfloat16)
38
+
39
  os.makedirs("./output", exist_ok=True)
40
  os.makedirs("./gradio_tmp", exist_ok=True)
41
 
 
55
  Video descriptions must have the same num of words as examples below. Extra words will be ignored.
56
  """
57
 
58
def resize_if_unfit(input_video, progress=gr.Progress(track_tqdm=True)):
    """Return a video path whose frames are 720x480.

    The dimensions of ``input_video`` are looked up with
    ``get_video_dimensions``. A video that is already exactly 720 wide and
    480 high is returned untouched; anything else is passed through
    ``center_crop_resize`` and the path it returns is handed back instead.

    ``progress`` is not read inside this function; it is only declared in
    the signature.
    """
    width, height = get_video_dimensions(input_video)

    # Already the expected size: nothing to convert.
    if (width, height) == (720, 480):
        return input_video

    return center_crop_resize(input_video)
66
+
67
def get_video_dimensions(input_video_path):
    """Return the ``'size'`` metadata entry for the video at ``input_video_path``.

    The value is taken from the first item produced by
    ``imageio_ffmpeg.read_frames``, which is used here as a metadata
    mapping. The caller in this file unpacks it as ``(width, height)``.
    """
    reader = imageio_ffmpeg.read_frames(input_video_path)
    try:
        # Only the first yielded item (the metadata) is needed; no frames
        # are consumed.
        metadata = next(reader)
    finally:
        # Close the generator explicitly so the reader is shut down right
        # away instead of lingering until it is garbage collected.
        reader.close()
    return metadata['size']
71
+
72
def center_crop_resize(input_video_path, target_width=720, target_height=480):
    """Scale, center-crop and subsample a video into a temporary MP4 file.

    Each kept frame is resized so that it covers ``target_width`` x
    ``target_height`` (the larger of the two scale factors is used), then
    cropped around its center to exactly that size. Frames are subsampled
    towards 8 fps, skipping at most 5 source frames between kept frames, and
    the skip is reduced while the source would otherwise yield fewer than 49
    frames. At most 49 frames are kept.

    Args:
        input_video_path: Path of the video to read.
        target_width: Width of the output frames, in pixels.
        target_height: Height of the output frames, in pixels.

    Returns:
        Path of a newly created ``.mp4`` temporary file (not deleted
        automatically) containing the processed frames at 8 fps.

    Raises:
        ValueError: If the frame dimensions of the input cannot be read
            (for example because the file could not be opened).
    """
    # Local import so this function does not depend on a file-level
    # ``import math`` being present.
    import math

    max_frames = 49
    target_fps = 8
    max_skip = 5

    cap = cv2.VideoCapture(input_video_path)
    try:
        orig_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        orig_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        orig_fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Fail with a clear message instead of a ZeroDivisionError below.
        if orig_width <= 0 or orig_height <= 0:
            raise ValueError(
                f"Could not read video dimensions from {input_video_path!r}"
            )

        # Scale so that both target dimensions are covered, then crop.
        resize_factor = max(target_width / orig_width, target_height / orig_height)

        # int() truncation of the float product can fall one pixel short of
        # the target; clamp so the crop below always has the full target size.
        inter_width = max(target_width, int(orig_width * resize_factor))
        inter_height = max(target_height, int(orig_height * resize_factor))

        # The crop window is the same for every frame, so compute it once.
        start_x = (inter_width - target_width) // 2
        start_y = (inter_height - target_height) // 2

        ideal_skip = max(0, math.ceil(orig_fps / target_fps) - 1)
        skip = min(max_skip, ideal_skip)

        # Skip fewer frames when the clip is too short to give max_frames.
        while (total_frames / (skip + 1)) < max_frames and skip > 0:
            skip -= 1

        processed_frames = []
        total_read = 0

        while len(processed_frames) < max_frames and total_read < total_frames:
            ret, frame = cap.read()
            if not ret:
                break

            if total_read % (skip + 1) == 0:
                resized = cv2.resize(
                    frame, (inter_width, inter_height), interpolation=cv2.INTER_AREA
                )
                processed_frames.append(
                    resized[start_y:start_y + target_height,
                            start_x:start_x + target_width]
                )

            total_read += 1
    finally:
        # Release the capture even when reading or resizing fails.
        cap.release()

    # Only the file name is needed; close the handle before OpenCV opens the
    # same path for writing.
    with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as temp_file:
        temp_video_path = temp_file.name

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(temp_video_path, fourcc, target_fps, (target_width, target_height))
    try:
        for frame in processed_frames:
            out.write(frame)
    finally:
        # Release the writer even if a write fails.
        out.release()

    return temp_video_path
128
 
129
  def convert_prompt(prompt: str, retry_times: int = 3) -> str:
130
  if not os.environ.get("OPENAI_API_KEY"):
 
175
  return response.choices[0].message.content
176
  return prompt
177
 
 
178
  def infer(
179
  prompt: str,
180
+ video_input: str,
181
+ video_strenght: float,
182
  num_inference_steps: int,
183
  guidance_scale: float,
184
  seed: int = -1,
 
186
  ):
187
  if seed == -1:
188
  seed = random.randint(0, 2 ** 8 - 1)
189
+ if(video_input):
190
+ video = load_video(video_input)[:49] # Limit to 49 frames
191
+ video_pt = pipe_video(
192
+ video=video,
193
+ prompt=prompt,
194
+ num_inference_steps=num_inference_steps,
195
+ num_videos_per_prompt=1,
196
+ strength=video_strenght,
197
+ use_dynamic_cfg=True,
198
+ output_type="pt",
199
+ guidance_scale=guidance_scale,
200
+ generator=torch.Generator(device="cpu").manual_seed(seed),
201
+ ).frames
202
+ else:
203
+ video_pt = pipe(
204
+ prompt=prompt,
205
+ num_videos_per_prompt=1,
206
+ num_inference_steps=num_inference_steps,
207
+ num_frames=49,
208
+ use_dynamic_cfg=True,
209
+ output_type="pt",
210
+ guidance_scale=guidance_scale,
211
+ generator=torch.Generator(device="cpu").manual_seed(seed),
212
+ ).frames
213
 
214
  return (video_pt, seed)
215
 
 
240
 
241
 
242
  threading.Thread(target=delete_old_files, daemon=True).start()
243
+ examples = [["horse.mp4"], ["kitten.mp4"], ["train_running.mp4"]]
244
 
245
  with gr.Blocks() as demo:
246
  gr.Markdown("""
 
265
  """)
266
  with gr.Row():
267
  with gr.Column():
268
+ with gr.Accordion("Video-to-video", open=False):
269
+ video_input = gr.Video(label="Input Video (will be cropped to 49 frames, 6 seconds at 8fps)")
270
+ strength = gr.Slider(0.1, 1.0, value=0.8, step=0.01, label="Strength")
271
+ examples_component = gr.Examples(examples, inputs=[video_input], cache_examples=False)
272
  prompt = gr.Textbox(label="Prompt (Less than 200 Words)", placeholder="Enter your prompt here", lines=5)
273
 
274
  with gr.Row():
 
364
 
365
 
366
  def generate(prompt,
367
+ video_input,
368
+ video_strenght,
369
  seed_value,
370
  scale_status,
371
  rife_status,
372
+ #progress=gr.Progress(track_tqdm=True)
373
  ):
374
 
375
  latents, seed = infer(
376
  prompt,
377
+ video_input,
378
+ video_strenght,
379
  num_inference_steps=50, # NOT Changed
380
  guidance_scale=7.0, # NOT Changed
381
  seed=seed_value,
 
411
 
412
  generate_button.click(
413
  generate,
414
+ inputs=[prompt, video_input, strength, seed_param, enable_scale, enable_rife],
415
  outputs=[video_output, download_video_button, download_gif_button, seed_text],
416
  )
417
 
418
  enhance_button.click(enhance_prompt_func, inputs=[prompt], outputs=[prompt])
419
+
420
+ video_input.upload(
421
+ resize_if_unfit,
422
+ inputs=[video_input],
423
+ outputs=[video_input]
424
+ )
425
  if __name__ == "__main__":
426
  demo.queue(max_size=15)
427
+ demo.launch()
horse.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c857bbc0d197c0751db9d6da9b5c85eafd163511ff9b0e10be65adf8ef9e352
3
+ size 453387
kitten.mp4 ADDED
Binary file (882 kB). View file
 
requirements.txt CHANGED
@@ -4,7 +4,7 @@ spandrel>=0.3.4
4
  tqdm>=4.66.5
5
  opencv-python>=4.10.0.84
6
  scikit-video>=1.1.11
7
- diffusers>=0.30.1
8
  transformers>=4.44.0
9
  accelerate>=0.33.0
10
  sentencepiece>=0.2.0
 
4
  tqdm>=4.66.5
5
  opencv-python>=4.10.0.84
6
  scikit-video>=1.1.11
7
+ git+https://github.com/huggingface/diffusers.git@3b5977dc29577cacbfec1d74221df4e28259a9bc
8
  transformers>=4.44.0
9
  accelerate>=0.33.0
10
  sentencepiece>=0.2.0
train_running.mp4 ADDED
Binary file (577 kB). View file