Migrated from GitHub
Files changed:
- .gitattributes +5 -0
- LICENSE +201 -0
- ORIGINAL_README.md +127 -0
- assets/Logo.png +0 -0
- assets/cut_and_drag_example_1.gif +3 -0
- assets/cut_and_drag_example_2.gif +3 -0
- assets/cut_and_drag_example_3.gif +3 -0
- assets/cut_and_drag_example_4.gif +3 -0
- assets/cut_and_drag_example_5.gif +3 -0
- cut_and_drag_gui.py +416 -0
- cut_and_drag_inference.py +489 -0
- make_warped_noise.py +93 -0
- requirements.txt +18 -0
- requirements_local.txt +12 -0
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/cut_and_drag_example_1.gif filter=lfs diff=lfs merge=lfs -text
+assets/cut_and_drag_example_2.gif filter=lfs diff=lfs merge=lfs -text
+assets/cut_and_drag_example_3.gif filter=lfs diff=lfs merge=lfs -text
+assets/cut_and_drag_example_4.gif filter=lfs diff=lfs merge=lfs -text
+assets/cut_and_drag_example_5.gif filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
ORIGINAL_README.md ADDED
@@ -0,0 +1,127 @@
<!-- # Go-with-the-Flow: Motion-Controllable Video Diffusion Models Using Real-Time Warped Noise -->

<p align="center">
<img src="assets/Logo.png" alt="Go-with-the-Flow: Motion-Controllable Video Diffusion Models Using Real-Time Warped Noise" width="100%">
</p>

[![Project Page](https://img.shields.io/badge/Project-Page-green?logo=googlechrome&logoColor=green)](https://eyeline-research.github.io/Go-with-the-Flow/)
[![Paper](https://img.shields.io/badge/Paper-arXiv-b31b1b?logo=arxiv&logoColor=red)](https://arxiv.org/abs/2501.08331)
[![YouTube Tutorial](https://img.shields.io/badge/YouTube-Tutorial-red?logo=youtube&logoColor=red)](https://www.youtube.com/watch?v=IO3pbQpT5F8)
[![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Go--with--the--Flow-blue)](https://huggingface.co/Eyeline-Research/Go-with-the-Flow/tree/main)

[Ryan Burgert](https://ryanndagreat.github.io)<sup>1,3</sup>, [Yuancheng Xu](https://yuancheng-xu.github.io)<sup>1,4</sup>, [Wenqi Xian](https://www.cs.cornell.edu/~wenqixian/)<sup>1</sup>, [Oliver Pilarski](https://www.linkedin.com/in/oliverpilarski/)<sup>1</sup>, [Pascal Clausen](https://www.linkedin.com/in/pascal-clausen-a179566a/?originalSubdomain=ch)<sup>1</sup>, [Mingming He](https://mingminghe.com/)<sup>1</sup>, [Li Ma](https://limacv.github.io/homepage/)<sup>1</sup>,

[Yitong Deng](https://yitongdeng.github.io)<sup>2,5</sup>, [Lingxiao Li](https://scholar.google.com/citations?user=rxQDLWcAAAAJ&hl=en)<sup>2</sup>, [Mohsen Mousavi](www.linkedin.com/in/mohsen-mousavi-0516a03)<sup>1</sup>, [Michael Ryoo](http://michaelryoo.com)<sup>3</sup>, [Paul Debevec](https://www.pauldebevec.com)<sup>1</sup>, [Ning Yu](https://ningyu1991.github.io)<sup>1†</sup>

<sup>1</sup>Netflix Eyeline Studios, <sup>2</sup>Netflix, <sup>3</sup>Stony Brook University, <sup>4</sup>University of Maryland, <sup>5</sup>Stanford University<br>
<sup>†</sup>Project Lead

### Table of Contents
- [Abstract](#abstract)
- [Quick Start: Cut-and-drag Motion Control](#quick-start-cut-and-drag-motion-control)
  - [Animation Template GUI (Local)](#1-animation-template-gui-local)
  - [Running Video Diffusion (GPU)](#2-running-video-diffusion-gpu)
- [TODO](#todo)
- [Citation](#citation)

<a name="abstract"></a>
## :book: Abstract

Go-with-the-Flow is an easy and efficient way to control the motion patterns of video diffusion models. It lets you decide how the camera and objects in a scene will move, and can even transfer motion patterns from one video to another.

We simply fine-tune a base model, with no changes to the original pipeline or architecture except one: instead of pure i.i.d. Gaussian noise, we use **warped noise**. Inference has exactly the same computational cost as running the base model.
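
In code, that is the whole trick: you call the pipeline exactly as usual, but pass the warped noise in as the initial latents. Below is a minimal sketch with 🤗 diffusers, not the actual entry point (that is `cut_and_drag_inference.py`, which also handles CPU offloading and noise downsampling); the file names `warped_noise.npy` and `first_frame.png` are placeholders, the LoRA file is one of the checkpoints listed in this repo, and the shapes are the CogVideoX defaults.

```python
# Sketch only: swap the usual i.i.d. Gaussian latents for warped noise.
import numpy as np
import torch
from diffusers import CogVideoXImageToVideoPipeline
from diffusers.utils import load_image

pipe = CogVideoXImageToVideoPipeline.from_pretrained(
    "THUDM/CogVideoX-5b-I2V", torch_dtype=torch.bfloat16
).to("cuda")  # the real script uses sequential CPU offloading for low-VRAM GPUs
pipe.load_lora_weights("I2V5B_final_i38800_nearest_lora_weights.safetensors")  # Go-with-the-Flow LoRA

# Placeholder file: a (13, 16, 60, 90) latent-noise tensor produced by the noise-warping step.
warped_noise = torch.tensor(np.load("warped_noise.npy"))[None].to("cuda", torch.bfloat16)

video = pipe(
    prompt="A duck splashing",
    image=load_image("first_frame.png"),
    num_frames=49,
    latents=warped_noise,  # <-- the only change versus ordinary sampling
).frames[0]
```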

If you create something cool with our model and want to share it on our website, email [email protected]. We will be creating a user-generated content section, starting with whoever submits the first video!

If you like this project, please give it a ★!

<a name="quick-start-cut-and-drag-motion-control"></a>
## :rocket: Quick Start: Cut-and-drag Motion Control

Cut-and-drag motion control lets you take an image and create a video by cutting out different parts of that image and dragging them around.

There are two parts: a GUI to create a crude animation (no GPU needed), then a diffusion script to turn that crude animation into a pretty one (requires a GPU).

**YouTube Tutorial**: [![YouTube Tutorial](https://img.shields.io/badge/YouTube-Tutorial-red?logo=youtube&logoColor=red)](https://www.youtube.com/watch?v=IO3pbQpT5F8)

Examples:

<p align="center">
<img src="assets/cut_and_drag_example_1.gif" width="80%">
<!-- <img src="assets/cut_and_drag_example_2.gif" width="80%"> -->
<img src="assets/cut_and_drag_example_3.gif" width="80%">
<img src="assets/cut_and_drag_example_4.gif" width="80%">
<img src="assets/cut_and_drag_example_5.gif" width="80%">
</p>

<a name="1-animation-template-gui-local"></a>
### 1. Animation Template GUI (Local)

1. Clone this repo, then `cd` into it.
2. Install the local requirements:

   `pip install -r requirements_local.txt`

3. Run the GUI:

   `python cut_and_drag_gui.py`

4. Follow the instructions shown in the GUI.

After completion, an MP4 file will be generated. You'll need to move this file to a computer with a decent GPU to continue.

<a name="2-running-video-diffusion-gpu"></a>
### 2. Running Video Diffusion (GPU)

1. Clone this repo on the machine with the GPU, then `cd` into it.
2. Install the requirements:

   `pip install -r requirements.txt`

3. Warp the noise (replace `<PATH TO VIDEO OR URL>` accordingly):

   `python make_warped_noise.py <PATH TO VIDEO OR URL> --output_folder noise_warp_output_folder`

4. Run inference:

   ```
   python cut_and_drag_inference.py noise_warp_output_folder \
       --prompt "A duck splashing" \
       --output_mp4_path "output.mp4" \
       --device "cuda" \
       --num_inference_steps 5
   ```

Adjust folder paths, prompts, and other hyperparameters as needed. The output will be saved as `output.mp4`.

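For reference, `noise_warp_output_folder` is read back by `load_sample_cartridge` inside `cut_and_drag_inference.py`. A rough sketch of what that loader expects is below; the file names come from the loader itself, and the 49-frame count is the CogVideoX default assumed throughout this repo.

```python
# Sketch of the folder contents consumed by cut_and_drag_inference.py.
import numpy as np
import einops

noise = np.load("noise_warp_output_folder/noises.npy")  # (F, H, W, C) warped noise, typically F=49
noise = einops.rearrange(noise, "F H W C -> F C H W")   # the script converts it to channel-first
# It is then downsampled from 49 video frames to the 13 latent frames CogVideoX uses
# ((13 - 1) * 4 + 1 = 49) and passed to the pipeline as `latents`; input.mp4 in the same
# folder provides the first frame / preview video.
print(noise.shape)
```
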
<a name="todo"></a>
## :clipboard: TODO

- [x] Upload All CogVideoX Models
- [x] Upload Cut-And-Drag Inference Code
- [x] Release to Arxiv
- [ ] Google Colab for people without GPUs
- [ ] Depth-Warping Inference Code
- [ ] T2V Motion Transfer Code
- [ ] ComfyUI Node
- [ ] Release 3D-to-Video Inference Code + Blender File
- [ ] Upload AnimateDiff Model
- [ ] Replicate Instance
- [ ] Fine-Tuning Code

<a name="citation"></a>
## :black_nib: Citation

If you use this in your research, please consider citing:

```
@misc{burgert2025gowiththeflowmotioncontrollablevideodiffusion,
      title={Go-with-the-Flow: Motion-Controllable Video Diffusion Models Using Real-Time Warped Noise},
      author={Ryan Burgert and Yuancheng Xu and Wenqi Xian and Oliver Pilarski and Pascal Clausen and Mingming He and Li Ma and Yitong Deng and Lingxiao Li and Mohsen Mousavi and Michael Ryoo and Paul Debevec and Ning Yu},
      year={2025},
      eprint={2501.08331},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2501.08331},
}
```
assets/Logo.png ADDED

assets/cut_and_drag_example_1.gif ADDED (Git LFS)

assets/cut_and_drag_example_2.gif ADDED (Git LFS)

assets/cut_and_drag_example_3.gif ADDED (Git LFS)

assets/cut_and_drag_example_4.gif ADDED (Git LFS)

assets/cut_and_drag_example_5.gif ADDED (Git LFS)
cut_and_drag_gui.py ADDED
@@ -0,0 +1,416 @@
from rp import *
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.widgets import Slider
from matplotlib.patches import Polygon as Polygon
import cv2
git_import('CommonSource')
import rp.git.CommonSource.noise_warp as nw
from easydict import EasyDict


def select_polygon(image):
    fig, ax = plt.subplots()
    ax.imshow(image)
    ax.set_title("Left click to add points. Right click to undo. Close the window to finish.")

    path = []

    def onclick(event):
        if event.button == 1:  # Left click
            if event.xdata is not None and event.ydata is not None:
                path.append((event.xdata, event.ydata))
                ax.clear()
                ax.imshow(image)
                ax.set_title("Left click to add points. Right click to undo. Close the window to finish.")
                for i in range(len(path)):
                    if i > 0:
                        ax.plot([path[i - 1][0], path[i][0]], [path[i - 1][1], path[i][1]], "r-")
                    ax.plot(path[i][0], path[i][1], "ro")
                if len(path) > 1:
                    ax.plot([path[-1][0], path[0][0]], [path[-1][1], path[0][1]], "r--")
                if len(path) > 2:
                    polygon = Polygon(path, closed=True, alpha=0.3, facecolor="r", edgecolor="r")
                    ax.add_patch(polygon)
                fig.canvas.draw()
        elif event.button == 3 and path:  # Right click
            path.pop()
            ax.clear()
            ax.imshow(image)
            ax.set_title("Left click to add points. Right click to undo. Close the window to finish.")
            for i in range(len(path)):
                if i > 0:
                    ax.plot([path[i - 1][0], path[i][0]], [path[i - 1][1], path[i][1]], "r-")
                ax.plot(path[i][0], path[i][1], "ro")
            if len(path) > 1:
                ax.plot([path[-1][0], path[0][0]], [path[-1][1], path[0][1]], "r--")
            if len(path) > 2:
                polygon = Polygon(path, closed=True, alpha=0.3, facecolor="r", edgecolor="r")
                ax.add_patch(polygon)
            fig.canvas.draw()

    cid = fig.canvas.mpl_connect("button_press_event", onclick)
    plt.show()
    fig.canvas.mpl_disconnect(cid)

    return path


def select_polygon_and_path(image):
    fig, ax = plt.subplots()
    ax.imshow(image)
    ax.set_title("Left click to add points. Right click to undo. Close the window to finish.")

    polygon_path = []
    movement_path = []

    cid = fig.canvas.mpl_connect("button_press_event", onclick)
    plt.show()
    fig.canvas.mpl_disconnect(cid)

    return polygon_path, movement_path


def select_path(image, polygon, num_frames=49):
    fig, ax = plt.subplots()
    plt.subplots_adjust(left=0.25, bottom=0.25)
    ax.imshow(image)
    ax.set_title("Left click to add points. Right click to undo. Close the window to finish.")

    path = []

    # Add sliders for final scale and rotation
    ax_scale = plt.axes([0.25, 0.1, 0.65, 0.03])
    ax_rot = plt.axes([0.25, 0.15, 0.65, 0.03])

    scale_slider = Slider(ax_scale, "Final Scale", 0.1, 5.0, valinit=1)
    rot_slider = Slider(ax_rot, "Final Rotation", -360, 360, valinit=0)

    scales = []
    rotations = []

    def interpolate_transformations(n_points):
        # scales = np.linspace(1, scale_slider.val, n_points)
        scales = np.exp(np.linspace(0, np.log(scale_slider.val), n_points))
        rotations = np.linspace(0, rot_slider.val, n_points)
        return scales, rotations

    def update_display():
        ax.clear()
        ax.imshow(image)
        ax.set_title("Left click to add points. Right click to undo. Close the window to finish.")

        n_points = len(path)
        if n_points < 1:
            fig.canvas.draw_idle()
            return

        # Interpolate scales and rotations over the total number of points
        scales[:], rotations[:] = interpolate_transformations(n_points)

        origin = np.array(path[0])

        for i in range(n_points):
            ax.plot(path[i][0], path[i][1], "bo")
            if i > 0:
                ax.plot([path[i - 1][0], path[i][0]], [path[i - 1][1], path[i][1]], "b-")
            # Apply transformation to the polygon
            transformed_polygon = apply_transformation(np.array(polygon), scales[i], rotations[i], origin)
            # Offset polygon to the current point relative to the first point
            position_offset = np.array(path[i]) - origin
            transformed_polygon += position_offset
            mpl_poly = Polygon(
                transformed_polygon,
                closed=True,
                alpha=0.3,
                facecolor="r",
                edgecolor="r",
            )
            ax.add_patch(mpl_poly)

        fig.canvas.draw_idle()

    def onclick(event):
        if event.inaxes != ax:
            return
        if event.button == 1:  # Left click
            path.append((event.xdata, event.ydata))
            update_display()
        elif event.button == 3 and path:  # Right click
            path.pop()
            update_display()

    def on_slider_change(val):
        update_display()

    scale_slider.on_changed(on_slider_change)
    rot_slider.on_changed(on_slider_change)

    scales, rotations = [], []

    cid_click = fig.canvas.mpl_connect("button_press_event", onclick)
    plt.show()
    fig.canvas.mpl_disconnect(cid_click)

    # Final interpolation after the window is closed
    n_points = num_frames
    if n_points > 0:
        scales, rotations = interpolate_transformations(n_points)
        rotations = [-x for x in rotations]
        path = as_numpy_array(path)
        path = as_numpy_array([linterp(path, i) for i in np.linspace(0, len(path) - 1, num=n_points)])

    return path, scales, rotations


def animate_polygon(image, polygon, path, scales, rotations, interp=cv2.INTER_LINEAR):
    frames = []
    transformed_polygons = []
    origin = np.array(path[0])

    h, w = image.shape[:2]

    for i in eta(range(len(path)), title="Creating frames for this layer..."):
        # Compute the affine transformation matrix
        theta = np.deg2rad(rotations[i])
        scale = scales[i]

        a11 = scale * np.cos(theta)
        a12 = -scale * np.sin(theta)
        a21 = scale * np.sin(theta)
        a22 = scale * np.cos(theta)

        # Compute translation components
        tx = path[i][0] - (a11 * origin[0] + a12 * origin[1])
        ty = path[i][1] - (a21 * origin[0] + a22 * origin[1])

        M = np.array([[a11, a12, tx], [a21, a22, ty]])

        # Apply the affine transformation to the image
        warped_image = cv2.warpAffine(
            image,
            M,
            (w, h),
            flags=interp,
            borderMode=cv2.BORDER_CONSTANT,
            borderValue=(0, 0, 0),
        )

        # Transform the polygon points
        polygon_np = np.array(polygon)
        ones = np.ones(shape=(len(polygon_np), 1))
        points_ones = np.hstack([polygon_np, ones])
        transformed_polygon = M.dot(points_ones.T).T
        transformed_polygons.append(transformed_polygon)

        # Create a mask for the transformed polygon
        mask = np.zeros((h, w), dtype=np.uint8)
        cv2.fillPoly(mask, [np.int32(transformed_polygon)], 255)

        # Extract the polygon area from the warped image
        rgba_image = cv2.cvtColor(warped_image, cv2.COLOR_BGR2BGRA)
        alpha_channel = np.zeros((h, w), dtype=np.uint8)
        alpha_channel[mask == 255] = 255
        rgba_image[:, :, 3] = alpha_channel

        # Set areas outside the polygon to transparent
        rgba_image[mask == 0] = (0, 0, 0, 0)

        frames.append(rgba_image)

    # return gather_vars("frames transformed_polygons")
    return EasyDict(frames=frames, transformed_polygons=transformed_polygons)


def apply_transformation(polygon, scale, rotation, origin):
    # Translate polygon to origin
    translated_polygon = polygon - origin
    # Apply scaling
    scaled_polygon = translated_polygon * scale
    # Apply rotation
    theta = np.deg2rad(rotation)
    rotation_matrix = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]])
    rotated_polygon = np.dot(scaled_polygon, rotation_matrix)
    # Translate back
    final_polygon = rotated_polygon + origin
    return final_polygon


# def cogvlm_caption_video(video_path, prompt="Please describe this video in detail."):
#     import rp.web_evaluator as wev
#
#     client = wev.Client("100.113.27.133")
#     result = client.evaluate("run_captioner(x,prompt=prompt)", x=video_path, prompt=prompt)
#     if result.errored:
#         raise result.error
#     return result.value


if __name__ == "__main__":
    fansi_print(big_ascii_text("Go With The Flow!"), "yellow green", "bold")

    image_path = input_conditional(
        fansi("First Frame: Enter Image Path or URL", "blue cyan", "italic bold underlined"),
        lambda x: is_a_file(x.strip()) or is_valid_url(x.strip()),
    ).strip()

    print("Using path: " + fansi_highlight_path(image_path))
    if is_video_file(image_path):
        fansi_print('Video path was given. Using first frame as image.')
        image = load_video(image_path, length=1)[0]
    else:
        image = load_image(image_path, use_cache=True)
    image = resize_image_to_fit(image, height=1440, allow_growth=False)

    rp.fansi_print("PRO TIP: Use this website to help write your captions: https://huggingface.co/spaces/THUDM/CogVideoX-5B-Space", 'blue cyan')
    prompt = input(fansi('Input the video caption >>> ', 'blue cyan', 'bold'))

    SCALE_FACTOR = 1
    # Adjust resolution to 720x480: resize then center-crop
    HEIGHT = 480 * SCALE_FACTOR
    WIDTH = 720 * SCALE_FACTOR
    image = resize_image_to_hold(image, height=HEIGHT, width=WIDTH)
    image = crop_image(image, height=HEIGHT, width=WIDTH, origin='center')
    title = input_default(
        fansi("Enter a title: ", "blue cyan", "italic bold underlined"),
        get_file_name(
            image_path,
            include_file_extension=False,
        ),
    )
    output_folder = make_directory(get_unique_copy_path(title))
    print("Output folder: " + fansi_highlight_path(output_folder))

    fansi_print("How many layers?", "blue cyan", "italic bold underlined"),
    num_layers = input_integer(
        minimum=1,
    )

    layer_videos = []
    layer_polygons = []
    layer_first_frame_masks = []
    layer_noises = []

    for layer_num in range(num_layers):
        layer_noise = np.random.randn(HEIGHT, WIDTH, 18).astype(np.float32)

        fansi_print(f'You are currently working on layer #{layer_num+1} of {num_layers}', 'yellow orange', 'bold')
        if True or not "polygon" in vars() or input_yes_no("New Polygon?"):
            polygon = select_polygon(image)
        if True or not "animation" in vars() or input_yes_no("New Animation?"):
            animation = select_path(image, polygon)

        animation_output = animate_polygon(image, polygon, *animation)

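        # Note: each 3-channel slice of the per-layer noise below is warped with cv2.INTER_NEAREST
        # (unlike the RGB frames above, which use the default bilinear warp). Presumably this keeps
        # every noise sample intact instead of averaging neighbours, which would shrink its variance;
        # the nw.regaussianize() pass near the end of the script then restores a clean Gaussian
        # distribution. The 18 noise channels are processed three at a time because animate_polygon
        # operates on 3-channel images.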
        noise_output_1 = as_numpy_array(animate_polygon(layer_noise[:,:,3*0:3*1], polygon, *animation, interp=cv2.INTER_NEAREST).frames)
        noise_output_2 = as_numpy_array(animate_polygon(layer_noise[:,:,3*1:3*2], polygon, *animation, interp=cv2.INTER_NEAREST).frames)
        noise_output_3 = as_numpy_array(animate_polygon(layer_noise[:,:,3*2:3*3], polygon, *animation, interp=cv2.INTER_NEAREST).frames)
        noise_output_4 = as_numpy_array(animate_polygon(layer_noise[:,:,3*3:3*4], polygon, *animation, interp=cv2.INTER_NEAREST).frames)
        noise_output_5 = as_numpy_array(animate_polygon(layer_noise[:,:,3*4:3*5], polygon, *animation, interp=cv2.INTER_NEAREST).frames)
        noise_output_6 = as_numpy_array(animate_polygon(layer_noise[:,:,3*5:3*6], polygon, *animation, interp=cv2.INTER_NEAREST).frames)
        noise_warp_output = np.concatenate(
            [
                noise_output_1[:,:,:,:3],
                noise_output_2[:,:,:,:3],
                noise_output_3[:,:,:,:3],
                noise_output_4[:,:,:,:3],
                noise_output_5[:,:,:,:3],
                noise_output_6[:,:,:,:1],
            ],
            axis=3,  # THWC
        )

        frames, transformed_polygons = destructure(animation_output)

        mask = get_image_alpha(frames[0]) > 0

        layer_polygons.append(transformed_polygons)
        layer_first_frame_masks.append(mask)
        layer_videos.append(frames)
        layer_noises.append(noise_warp_output)

    if True or input_yes_no("Inpaint background?"):
        total_mask = sum(layer_first_frame_masks).astype(bool)
        background = cv_inpaint_image(image, mask=total_mask)
    else:
        background = "https://t3.ftcdn.net/jpg/02/76/96/64/360_F_276966430_HsEI96qrQyeO4wkcnXtGZOm0Qu4TKCgR.jpg"
        background = load_image(background, use_cache=True)
        background = cv_resize_image(background, get_image_dimensions(image))
    background = as_rgba_image(background)

    ###
    output_frames = [
        overlay_images(
            background,
            *frame_layers,
        )
        for frame_layers in eta(list_transpose(layer_videos), title=fansi("Compositing all frames of the video...", 'green', 'bold'))
    ]
    output_frames = as_numpy_array(output_frames)

    output_video_file = save_video_mp4(output_frames, output_folder + '/' + title + ".mp4", video_bitrate="max")
    output_mask_file = save_video_mp4(
        [
            sum([get_image_alpha(x) for x in layers])
            for layers in list_transpose(layer_videos)
        ],
        output_folder + "/" + title + "_mask.mp4",
        video_bitrate="max",
    )

    ###
    fansi_print("Warping noise...", 'yellow green', 'bold italic')
    output_noises = np.random.randn(1, HEIGHT, WIDTH, 16)
    output_noises = np.repeat(output_noises, 49, axis=0)
    for layer_num in range(num_layers):
        fansi_print(f'Warping noise for layer #{layer_num+1} of {num_layers}', 'green', 'bold')
        for frame in eta(range(49), title='frame number'):
            noise_mask = get_image_alpha(layer_videos[layer_num][frame])[:,:,None] > 0
            noise_video_layer = layer_noises[layer_num][frame]
            output_noises[frame] *= (noise_mask == 0)
            output_noises[frame] += noise_video_layer * noise_mask
            # display_image((noise_mask * noise_video_layer)[:,:,:3])
            display_image(output_noises[frame][:,:,:3]/5+.5)

    import einops
    import torch
    torch_noises = torch.tensor(output_noises)
    torch_noises = einops.rearrange(torch_noises, 'F H W C -> F C H W')
    #
    small_torch_noises = []
    for i in eta(range(49), title='Regaussianizing'):
        torch_noises[i] = nw.regaussianize(torch_noises[i])[0]
        small_torch_noise = nw.resize_noise(torch_noises[i], (480//8, 720//8))
        small_torch_noises.append(small_torch_noise)
        # display_image(as_numpy_image(small_torch_noise[:3])/5+.5)
        display_image(as_numpy_image(torch_noises[i,:3])/5+.5)
    small_torch_noises = torch.stack(small_torch_noises)  # DOWNSAMPLED NOISE FOR CARTRIDGE!

    ###
    cartridge = {}
    cartridge['instance_noise'] = small_torch_noises.bfloat16()
    cartridge['instance_video'] = (as_torch_images(output_frames)*2-1).bfloat16()
    cartridge['instance_prompt'] = prompt
    output_cartridge_file = object_to_file(cartridge, output_folder + "/" + title + "_cartridge.pkl")

    ###

    output_polygons_file = output_folder + '/' + 'polygons.npy'
    polygons = as_numpy_array(layer_polygons)
    np.save(output_polygons_file, polygons)

    print()
    print(fansi('Saved outputs:', 'green', 'bold'))
    print(fansi(' - Saved video: ', 'green', 'bold'), fansi_highlight_path(get_relative_path(output_video_file)))
    print(fansi(' - Saved masks: ', 'green', 'bold'), fansi_highlight_path(get_relative_path(output_mask_file)))
    print(fansi(' - Saved shape: ', 'green', 'bold'), fansi_highlight_path(output_polygons_file))
    print(fansi(' - Saved cartridge: ', 'green', 'bold'), fansi_highlight_path(output_cartridge_file))

    print("Press CTRL+C to exit")

    display_video(video_with_progress_bar(output_frames), loop=True)
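
The `*_cartridge.pkl` written by the GUI above is what the GPU-side script loads. A minimal sketch of its layout, inferred from the code above and from the loader in `cut_and_drag_inference.py` (the path is a hypothetical example; the dtypes and shapes are the CogVideoX defaults used throughout this repo):

```python
import rp  # the inference script reads the pickle with rp.file_to_object

cartridge = rp.file_to_object("MyTitle/MyTitle_cartridge.pkl")  # whatever path the GUI printed
cartridge["instance_prompt"]  # the caption typed into the GUI
cartridge["instance_video"]   # torch.bfloat16, shape (49, 3, 480, 720), values in [-1, 1]
cartridge["instance_noise"]   # torch.bfloat16, shape (49, 16, 60, 90): warped, regaussianized noise
```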
cut_and_drag_inference.py ADDED
@@ -0,0 +1,489 @@
import rp
# from rp import *
import torch
import numpy as np
import einops
from diffusers import CogVideoXImageToVideoPipeline
from diffusers import CogVideoXVideoToVideoPipeline
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video, load_image
from icecream import ic
from diffusers import AutoencoderKLCogVideoX, CogVideoXImageToVideoPipeline, CogVideoXTransformer3DModel
from transformers import T5EncoderModel

import rp.git.CommonSource.noise_warp as nw

pipe_ids = dict(
    T2V5B="THUDM/CogVideoX-5b",
    T2V2B="THUDM/CogVideoX-2b",
    I2V5B="THUDM/CogVideoX-5b-I2V",
)

# From a bird's-eye view, a serene scene unfolds: a herd of deer gracefully navigates shallow, warm-hued waters, their silhouettes stark against the earthy tones. The deer, spread across the frame, cast elongated, well-defined shadows that accentuate their antlers, creating a mesmerizing play of light and dark. This aerial perspective captures the tranquil essence of the setting, emphasizing the harmonious contrast between the deer and their mirror-like reflections on the water's surface. The composition exudes a peaceful stillness, yet the subtle movement suggested by the shadows adds a dynamic layer to the natural beauty and symmetry of the moment.
base_url = 'https://huggingface.co/Eyeline-Research/Go-with-the-Flow'
lora_urls = dict(
    I2V5B_final_i30000_lora_weights = base_url+'I2V5B_final_i30000_lora_weights.safetensors',
    I2V5B_final_i38800_nearest_lora_weights = base_url+'I2V5B_final_i38800_nearest_lora_weights.safetensors',
    I2V5B_resum_blendnorm_0degrad_i13600_DATASET_lora_weights = base_url+'I2V5B_resum_blendnorm_0degrad_i13600_DATASET_lora_weights.safetensors',
    T2V2B_RDeg_i30000_lora_weights = base_url+'T2V2B_RDeg_i30000_lora_weights.safetensors',
    T2V5B_blendnorm_i18000_DATASET_lora_weights = base_url+'T2V5B_blendnorm_i18000_DATASET_lora_weights.safetensors',
    T2V5B_blendnorm_i25000_DATASET_nearest_lora_weights = base_url+'T2V5B_blendnorm_i25000_DATASET_nearest_lora_weights.safetensors',
)

dtype = torch.bfloat16

# https://medium.com/@ChatGLM/open-sourcing-cogvideox-a-step-towards-revolutionizing-video-generation-28fa4812699d
B, F, C, H, W = 1, 13, 16, 60, 90  # The defaults
num_frames = (F-1)*4+1  # https://miro.medium.com/v2/resize:fit:1400/format:webp/0*zxsAG1xks9pFIsoM
# Possible num_frames: 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49
assert num_frames == 49
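# Note: F = 13 latent frames corresponds to 49 pixel frames because CogVideoX's VAE compresses
# time 4x: num_frames = (F - 1) * 4 + 1 = 49. The 49-frame warped noise from the GUI or from
# make_warped_noise.py is therefore downsampled to 13 frames (see get_downtemp_noise below)
# before being passed to the pipeline as latents.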
40 |
+
|
41 |
+
@rp.memoized #Torch never manages to unload it from memory anyway
|
42 |
+
def get_pipe(model_name, device=None, low_vram=True):
|
43 |
+
"""
|
44 |
+
model_name is like "I2V5B", "T2V2B", or "T2V5B", or a LoRA name like "T2V2B_RDeg_i30000_lora_weights"
|
45 |
+
device is automatically selected if unspecified
|
46 |
+
low_vram, if True, will make the pipeline use CPU offloading
|
47 |
+
"""
|
48 |
+
|
49 |
+
if model_name in pipe_ids:
|
50 |
+
lora_name = None
|
51 |
+
pipe_name = model_name
|
52 |
+
else:
|
53 |
+
#By convention, we have lora_paths that start with the pipe names
|
54 |
+
rp.fansi_print(f"Getting pipe name from model_name={model_name}",'cyan','bold')
|
55 |
+
lora_name = model_name
|
56 |
+
pipe_name = lora_name.split('_')[0]
|
57 |
+
|
58 |
+
is_i2v = "I2V" in pipe_name # This is a convention I'm using right now
|
59 |
+
# is_v2v = "V2V" in pipe_name # This is a convention I'm using right now
|
60 |
+
|
61 |
+
# if is_v2v:
|
62 |
+
# old_pipe_name = pipe_name
|
63 |
+
# old_lora_name = lora_name
|
64 |
+
# if pipe_name is not None: pipe_name = pipe_name.replace('V2V','T2V')
|
65 |
+
# if lora_name is not None: lora_name = lora_name.replace('V2V','T2V')
|
66 |
+
# rp.fansi_print(f"V2V: {old_pipe_name} --> {pipe_name} &&& {old_lora_name} --> {lora_name}",'white','bold italic','red')
|
67 |
+
|
68 |
+
pipe_id = pipe_ids[pipe_name]
|
69 |
+
print(f"LOADING PIPE WITH device={device} pipe_name={pipe_name} pipe_id={pipe_id} lora_name={lora_name}" )
|
70 |
+
|
71 |
+
hub_model_id = pipe_ids[pipe_name]
|
72 |
+
|
73 |
+
transformer = CogVideoXTransformer3DModel.from_pretrained(hub_model_id, subfolder="transformer", torch_dtype=torch.bfloat16)
|
74 |
+
text_encoder = T5EncoderModel.from_pretrained(hub_model_id, subfolder="text_encoder", torch_dtype=torch.bfloat16)
|
75 |
+
vae = AutoencoderKLCogVideoX.from_pretrained(hub_model_id, subfolder="vae", torch_dtype=torch.bfloat16)
|
76 |
+
|
77 |
+
PipeClass = CogVideoXImageToVideoPipeline if is_i2v else CogVideoXPipeline
|
78 |
+
pipe = PipeClass.from_pretrained(hub_model_id, torch_dtype=torch.bfloat16, vae=vae,transformer=transformer,text_encoder=text_encoder)
|
79 |
+
|
80 |
+
if lora_name is not None:
|
81 |
+
lora_folder = rp.make_directory('lora_models')
|
82 |
+
lora_url = lora_urls[lora_name]
|
83 |
+
lora_path = rp.download_url(lora_url, lora_folder, show_progress=True, skip_existing=True)
|
84 |
+
assert rp.file_exists(lora_path), (lora_name, lora_path)
|
85 |
+
print(end="\tLOADING LORA WEIGHTS...",flush=True)
|
86 |
+
pipe.load_lora_weights(lora_path)
|
87 |
+
print("DONE!")
|
88 |
+
|
89 |
+
if device is None:
|
90 |
+
device = rp.select_torch_device()
|
91 |
+
|
92 |
+
if not low_vram:
|
93 |
+
print("\tUSING PIPE DEVICE", device)
|
94 |
+
pipe = pipe.to(device)
|
95 |
+
else:
|
96 |
+
print("\tUSING PIPE DEVICE WITH CPU OFFLOADING",device)
|
97 |
+
pipe=pipe.to('cpu')
|
98 |
+
pipe.enable_sequential_cpu_offload(device=device)
|
99 |
+
|
100 |
+
# pipe.vae.enable_tiling()
|
101 |
+
# pipe.vae.enable_slicing()
|
102 |
+
|
103 |
+
# Metadata
|
104 |
+
pipe.lora_name = lora_name
|
105 |
+
pipe.pipe_name = pipe_name
|
106 |
+
pipe.is_i2v = is_i2v
|
107 |
+
# pipe.is_v2v = is_v2v
|
108 |
+
|
109 |
+
return pipe
|
110 |
+
|
111 |
+
def get_downtemp_noise(noise, noise_downtemp_interp):
|
112 |
+
assert noise_downtemp_interp in {'nearest', 'blend', 'blend_norm', 'randn'}, noise_downtemp_interp
|
113 |
+
if noise_downtemp_interp == 'nearest' : return rp.resize_list(noise, 13)
|
114 |
+
elif noise_downtemp_interp == 'blend' : return downsamp_mean(noise, 13)
|
115 |
+
elif noise_downtemp_interp == 'blend_norm' : return normalized_noises(downsamp_mean(noise, 13))
|
116 |
+
elif noise_downtemp_interp == 'randn' : return torch.randn_like(rp.resize_list(noise, 13)) #Basically no warped noise, just r
|
117 |
+
else: assert False, 'impossible'
|
118 |
+
|
119 |
+
def downsamp_mean(x, l=13):
|
120 |
+
return torch.stack([rp.mean(u) for u in rp.split_into_n_sublists(x, l)])
|
121 |
+
|
122 |
+
def normalized_noises(noises):
|
123 |
+
#Noises is in TCHW form
|
124 |
+
return torch.stack([x / x.std(1, keepdim=True) for x in noises])
|
125 |
+
|
126 |
+
|
127 |
+
@rp.memoized
|
128 |
+
def load_sample_cartridge(
|
129 |
+
sample_path: str,
|
130 |
+
degradation=0,
|
131 |
+
noise_downtemp_interp='nearest',
|
132 |
+
image=None,
|
133 |
+
prompt=None,
|
134 |
+
#SETTINGS:
|
135 |
+
num_inference_steps=30,
|
136 |
+
guidance_scale=6,
|
137 |
+
):
|
138 |
+
"""
|
139 |
+
COMPLETELY FROM SAMPLE: Generate with /root/micromamba/envs/i2sb/lib/python3.8/site-packages/rp/git/CommonSource/notebooks/CogVidSampleGenerator.ipynb
|
140 |
+
EXAMPLE PATHS:
|
141 |
+
sample_path = '/root/micromamba/envs/i2sb/lib/python3.8/site-packages/rp/git/CommonSource/notebooks/CogVidX_Saved_Train_Samples/plus_pug.pkl'
|
142 |
+
sample_path = '/root/micromamba/envs/i2sb/lib/python3.8/site-packages/rp/git/CommonSource/notebooks/CogVidX_Saved_Train_Samples/amuse_chop.pkl'
|
143 |
+
sample_path = '/root/micromamba/envs/i2sb/lib/python3.8/site-packages/rp/git/CommonSource/notebooks/CogVidX_Saved_Train_Samples/chomp_shop.pkl'
|
144 |
+
sample_path = '/root/micromamba/envs/i2sb/lib/python3.8/site-packages/rp/git/CommonSource/notebooks/CogVidX_Saved_Train_Samples/ahead_job.pkl'
|
145 |
+
sample_path = rp.random_element(glob.glob('/root/micromamba/envs/i2sb/lib/python3.8/site-packages/rp/git/CommonSource/notebooks/CogVidX_Saved_Train_Samples/*.pkl'))
|
146 |
+
"""
|
147 |
+
|
148 |
+
#These could be args in the future. I can't think of a use case yet though, so I'll keep the signature clean.
|
149 |
+
noise=None
|
150 |
+
video=None
|
151 |
+
|
152 |
+
if rp.is_a_folder(sample_path):
|
153 |
+
#Was generated using the flow pipeline
|
154 |
+
print(end="LOADING CARTRIDGE FOLDER "+sample_path+"...")
|
155 |
+
|
156 |
+
noise_file=rp.path_join(sample_path,'noises.npy')
|
157 |
+
instance_noise = np.load(noise_file)
|
158 |
+
instance_noise = torch.tensor(instance_noise)
|
159 |
+
instance_noise = einops.rearrange(instance_noise, 'F H W C -> F C H W')
|
160 |
+
|
161 |
+
video_file=rp.path_join(sample_path,'input.mp4')
|
162 |
+
instance_video = rp.load_video(video_file)
|
163 |
+
instance_video = rp.as_torch_images(instance_video)
|
164 |
+
instance_video = instance_video * 2 - 1
|
165 |
+
|
166 |
+
sample = rp.as_easydict(
|
167 |
+
instance_prompt = '', #Please have some prompt to override this! Ideally the defualt would come from a VLM
|
168 |
+
instance_noise = instance_noise,
|
169 |
+
instance_video = instance_video,
|
170 |
+
)
|
171 |
+
|
172 |
+
print("DONE!")
|
173 |
+
|
174 |
+
else:
|
175 |
+
#Was generated using the Cut-And-Drag GUI
|
176 |
+
print(end="LOADING CARTRIDGE FILE "+sample_path+"...")
|
177 |
+
sample=rp.file_to_object(sample_path)
|
178 |
+
print("DONE!")
|
179 |
+
|
180 |
+
#SAMPLE EXAMPLE:
|
181 |
+
# >>> sample=file_to_object('/root/micromamba/envs/i2sb/lib/python3.8/site-packages/rp/git/CommonSource/notebooks/CogVidX_Saved_Train_Samples/ahead_job.pkl')
|
182 |
+
# >>> list(sample)?s --> ['instance_prompt', 'instance_video', 'instance_noise']
|
183 |
+
# >>> sample.instance_prompt?s --> A group of elk, including a dominant bull, is seen grazing and moving through...
|
184 |
+
# >>> sample.instance_noise.shape?s --> torch.Size([49, 16, 60, 90])
|
185 |
+
# >>> sample.instance_video.shape?s --> torch.Size([49, 3, 480, 720]) # Range: [-1, 1]
|
186 |
+
|
187 |
+
sample_noise = sample["instance_noise" ].to(dtype)
|
188 |
+
sample_video = sample["instance_video" ].to(dtype)
|
189 |
+
sample_prompt = sample["instance_prompt"]
|
190 |
+
|
191 |
+
sample_gif_path = sample_path+'.mp4'
|
192 |
+
if not rp.file_exists(sample_gif_path):
|
193 |
+
sample_gif_path = sample_path+'.gif' #The older scripts made this. Backwards compatibility.
|
194 |
+
if not rp.file_exists(sample_gif_path):
|
195 |
+
#Create one!
|
196 |
+
#Clientside warped noise does not come with a nice GIF so we make one here and now!
|
197 |
+
sample_gif_path = sample_path+'.mp4'
|
198 |
+
|
199 |
+
rp.fansi_print("MAKING SAMPLE PREVIEW VIDEO",'light blue green','underlined')
|
200 |
+
preview_sample_video=rp.as_numpy_images(sample_video)/2+.5
|
201 |
+
preview_sample_noise=rp.as_numpy_images(sample_noise)[:,:,:,:3]/5+.5
|
202 |
+
preview_sample_noise = rp.resize_images(preview_sample_noise, size=8, interp="nearest")
|
203 |
+
preview_sample=rp.horizontally_concatenated_videos(preview_sample_video,preview_sample_noise)
|
204 |
+
rp.save_video_mp4(preview_sample,sample_gif_path,video_bitrate='max',framerate=12)
|
205 |
+
rp.fansi_print("DONE MAKING SAMPLE PREVIEW VIDEO!",'light blue green','underlined')
|
206 |
+
|
207 |
+
#prompt=sample.instance_prompt
|
208 |
+
downtemp_noise = get_downtemp_noise(
|
209 |
+
sample_noise,
|
210 |
+
noise_downtemp_interp=noise_downtemp_interp,
|
211 |
+
)
|
212 |
+
downtemp_noise = downtemp_noise[None]
|
213 |
+
downtemp_noise = nw.mix_new_noise(downtemp_noise, degradation)
|
214 |
+
|
215 |
+
assert downtemp_noise.shape == (B, F, C, H, W), (downtemp_noise.shape,(B, F, C, H, W))
|
216 |
+
|
217 |
+
if image is None : sample_image = rp.as_pil_image(rp.as_numpy_image(sample_video[0].float()/2+.5))
|
218 |
+
elif isinstance(image, str) : sample_image = rp.as_pil_image(rp.as_rgb_image(rp.load_image(image)))
|
219 |
+
else : sample_image = rp.as_pil_image(rp.as_rgb_image(image))
|
220 |
+
|
221 |
+
metadata = rp.gather_vars('sample_path degradation downtemp_noise sample_gif_path sample_video sample_noise noise_downtemp_interp')
|
222 |
+
settings = rp.gather_vars('num_inference_steps guidance_scale'+0*'v2v_strength')
|
223 |
+
|
224 |
+
if noise is None: noise = downtemp_noise
|
225 |
+
if video is None: video = sample_video
|
226 |
+
if image is None: image = sample_image
|
227 |
+
if prompt is None: prompt = sample_prompt
|
228 |
+
|
229 |
+
assert noise.shape == (B, F, C, H, W), (noise.shape,(B, F, C, H, W))
|
230 |
+
|
231 |
+
return rp.gather_vars('prompt noise image video metadata settings')
|
232 |
+
|
233 |
+
def dict_to_name(d=None, **kwargs):
    """
    Used to generate MP4 file names

    EXAMPLE:
        >>> dict_to_name(dict(a=5,b='hello',c=None))
        ans = a=5,b=hello,c=None
        >>> name_to_dict(ans)
        ans = {'a': '5', 'b': 'hello', 'c': 'None'}
    """
    if d is None:
        d = {}
    d.update(kwargs)
    return ",".join("=".join(map(str, [key, value])) for key, value in d.items())

# def name_to_dict(name):
#     """
#     Useful for analyzing output MP4 files
#
#     EXAMPLE:
#         >>> dict_to_name(dict(a=5,b='hello',c=None))
#         ans = a=5,b=hello,c=None
#         >>> name_to_dict(ans)
#         ans = {'a': '5', 'b': 'hello', 'c': 'None'}
#     """
#     output=rp.as_easydict()
#     for entry in name.split(','):
#         key,value=entry.split('=',maxsplit=1)
#         output[key]=value
#     return output
#
#
def get_output_path(pipe, cartridge, subfolder:str, output_root:str):
    """
    Generates a unique output path for saving a generated video.

    Args:
        pipe: The video generation pipeline used.
        cartridge: Data used for generating the video.
        subfolder (str): Subfolder for saving the video.
        output_root (str): Root directory for output videos.

    Returns:
        String representing the unique path to save the video.
    """

    time = rp.millis()

    output_name = (
        dict_to_name(
            t=time,
            pipe=pipe.pipe_name,
            lora=pipe.lora_name,
            steps    = cartridge.settings.num_inference_steps,
            # strength = cartridge.settings.v2v_strength,
            degrad   = cartridge.metadata.degradation,
            downtemp = cartridge.metadata.noise_downtemp_interp,
            samp     = rp.get_file_name(rp.get_parent_folder(cartridge.metadata.sample_path), False),
        )
        + ".mp4"
    )

    output_path = rp.get_unique_copy_path(
        rp.path_join(
            rp.make_directory(
                rp.path_join(output_root, subfolder),
            ),
            output_name,
        ),
    )

    rp.fansi_print(f"OUTPUT PATH: {rp.fansi_highlight_path(output_path)}", "blue", "bold")

    return output_path

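# For illustration, a path this function would produce (hypothetical values for rp.millis(), pipe_name, and lora_name):
#     infer_outputs/default_subfolder/t=1712345678901,pipe=I2V5B,lora=i38800_nearest,steps=30,degrad=0.5,downtemp=nearest,samp=MySample.mp4
# If that exact path already exists, rp.get_unique_copy_path returns a non-colliding variant of it.
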
def run_pipe(
    pipe,
    cartridge,
    subfolder="first_subfolder",
    output_root: str = "infer_outputs",
    output_mp4_path=None, #This overrides subfolder and output_root if specified
):
    # output_mp4_path = output_mp4_path or get_output_path(pipe, cartridge, subfolder, output_root)

    if rp.file_exists(output_mp4_path):
        raise RuntimeError(f"{output_mp4_path} already exists! Please choose a different output file or delete that one. This script is designed not to clobber previous results.")

    if pipe.is_i2v:
        image = cartridge.image
        if isinstance(image, str):
            image = rp.load_image(image, use_cache=True)
        image = rp.as_pil_image(rp.as_rgb_image(image))

    # if pipe.is_v2v:
    #     print("Making v2v video...")
    #     v2v_video=cartridge.video
    #     v2v_video=rp.as_numpy_images(v2v_video) / 2 + .5
    #     v2v_video=rp.as_pil_images(v2v_video)

    print("NOISE SHAPE", cartridge.noise.shape)
    print("IMAGE", image)

    video = pipe(
        prompt=cartridge.prompt,
        **(dict(image   =image    ) if pipe.is_i2v else {}),
        # **(dict(strength=cartridge.settings.v2v_strength) if pipe.is_v2v else {}),
        # **(dict(video   =v2v_video) if pipe.is_v2v else {}),
        num_inference_steps=cartridge.settings.num_inference_steps,
        latents=cartridge.noise,

        guidance_scale=cartridge.settings.guidance_scale,
        # generator=torch.Generator(device=device).manual_seed(42),
    ).frames[0]

    export_to_video(video, output_mp4_path, fps=8)

    sample_gif = rp.load_video(cartridge.metadata.sample_gif_path)
    video = rp.as_numpy_images(video)
    prevideo = rp.horizontally_concatenated_videos(
        rp.resize_list(sample_gif, len(video)),
        video,
        origin='bottom right',
    )
    import textwrap
    prevideo = rp.labeled_images(
        prevideo,
        position="top",
        labels=cartridge.metadata.sample_path + "\n" + output_mp4_path + "\n\n" + rp.wrap_string_to_width(cartridge.prompt, 250),
        size_by_lines=True,
        text_color='light light light blue',
        # font='G:Lexend'
    )

    preview_mp4_path = output_mp4_path + "_preview.mp4"
    preview_gif_path = preview_mp4_path + ".gif"
    print(end=f"Saving preview MP4 to preview_mp4_path = {preview_mp4_path}...")
    rp.save_video_mp4(prevideo, preview_mp4_path, framerate=16, video_bitrate="max", show_progress=False)
    compressed_preview_mp4_path = rp.save_video_mp4(prevideo, output_mp4_path + "_preview_compressed.mp4", framerate=16, show_progress=False)
    print("done!")
    print(end=f"Saving preview gif to preview_gif_path = {preview_gif_path}...")
    rp.convert_to_gif_via_ffmpeg(preview_mp4_path, preview_gif_path, framerate=12, show_progress=False)
    print("done!")

    return rp.gather_vars('video output_mp4_path preview_mp4_path compressed_preview_mp4_path cartridge subfolder preview_gif_path')


# #prompt = "A little girl is riding a bicycle at high speed. Focused, detailed, realistic."
# prompt = "An old house by the lake with wooden plank siding and a thatched roof"
# prompt = "Soaring through deep space"
# prompt = "Swimming by the ruins of the titanic"
# prompt = "A camera flyby of a gigantic ice tower that a princess lives in, zooming in from far away from the castle into her dancing in the window"
# prompt = "A drone flyby of the grand canyon, aerial view"
# prompt = "A bunch of puppies running around a front lawn in a giant courtyard "
# #image = load_image(image=download_url_to_cache("https://media.sciencephoto.com/f0/22/69/89/f0226989-800px-wm.jpg"))

def main(
    sample_path,
    output_mp4_path: str,
    prompt=None,
    degradation=.5,
    model_name='I2V5B_final_i38800_nearest_lora_weights',

    low_vram=True,
    device: str = None,

    #BROADCASTABLE:
    noise_downtemp_interp='nearest',
    image=None,
    num_inference_steps=30,
    guidance_scale=6,
    # v2v_strength=.5, #Timestep for when using Vid2Vid. Only set to not-None when using a T2V model!
):
    """
    Main function to run the video generation pipeline with specified parameters.

    Args:
        model_name (str): Name of the pipeline to use ('T2V5B', 'T2V2B', 'I2V5B', etc).
        device (str or int, optional): Device to run the model on (e.g., 'cuda:0' or 0). If unspecified, the GPU with the most free VRAM will be chosen.
        low_vram (bool): Set to True if you have less than 32GB of VRAM. It enables model CPU offloading, which slows down inference but needs much less VRAM.
        sample_path (str or list): Broadcastable. Path(s) to the sample `.pkl` file(s) or folders containing noise.npy and input.mp4 files.
        output_mp4_path (str): Path the generated MP4 will be written to. Must not already exist.
        degradation (float or list): Broadcastable. Degradation level(s) for the noise warp (float between 0 and 1).
        noise_downtemp_interp (str or list): Broadcastable. Interpolation method(s) for down-temporal noise. Options: 'nearest', 'blend', 'blend_norm'.
        image (str, PIL.Image, or list, optional): Broadcastable. Image(s) to use as the initial frame(s). Can be a URL or a path to an image.
        prompt (str or list, optional): Broadcastable. Text prompt(s) for video generation.
        num_inference_steps (int or list): Broadcastable. Number of inference steps for the pipeline.
        guidance_scale (float or list): Broadcastable. Classifier-free guidance scale passed to the pipeline.
    """
    output_root = 'infer_outputs'      # Root directory where output videos will be saved.
    subfolder   = 'default_subfolder'  # Subfolder within output_root to save outputs.

    if device is None:
        device = rp.select_torch_device(reserve=True, prefer_used=True)
        rp.fansi_print(f"Selected torch device: {device}")


    cartridge_kwargs = rp.broadcast_kwargs(
        rp.gather_vars(
            "sample_path",
            "degradation",
            "noise_downtemp_interp",
            "image",
            "prompt",
            "num_inference_steps",
            "guidance_scale",
            # "v2v_strength",
        )
    )

    rp.fansi_print("cartridge_kwargs:", "cyan", "bold")
    print(
        rp.indentify(
            rp.with_line_numbers(
                rp.fansi_pygments(
                    rp.autoformat_json(cartridge_kwargs),
                    "json",
                ),
                align=True,
            )
        ),
    )

    # cartridges = [load_sample_cartridge(**x) for x in cartridge_kwargs]
    cartridges = rp.load_files(lambda x: load_sample_cartridge(**x), cartridge_kwargs, show_progress='eta:Loading Cartridges')

    pipe = get_pipe(model_name, device, low_vram=low_vram)

    output = []
    for cartridge in cartridges:
        pipe_out = run_pipe(
            pipe=pipe,
            cartridge=cartridge,
            output_root=output_root,
            subfolder=subfolder,
            output_mp4_path=output_mp4_path,
        )

        output.append(
            rp.as_easydict(
                rp.gather(
                    pipe_out,
                    [
                        "output_mp4_path",
                        "preview_mp4_path",
                        "compressed_preview_mp4_path",
                        "preview_gif_path",
                    ],
                    as_dict=True,
                )
            )
        )
    return output

if __name__ == '__main__':
    import fire
    fire.Fire(main)
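
# Example invocation (hypothetical paths; fire.Fire maps CLI flags onto main()'s parameters):
#     python cut_and_drag_inference.py \
#         --sample_path NoiseWarpOutputFolder \
#         --output_mp4_path infer_outputs/my_result.mp4 \
#         --prompt "A drone flyby of the grand canyon, aerial view" \
#         --degradation .5 --num_inference_steps 30
# The arguments marked "Broadcastable" also accept lists: rp.broadcast_kwargs expands them into one
# cartridge (and one generated video) per combination. Note that output_mp4_path is shared across
# runs, and run_pipe refuses to overwrite an existing file.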
make_warped_noise.py
ADDED
@@ -0,0 +1,93 @@
#Ryan Burgert 2024

#Setup:
#    Run this in a Jupyter Notebook on a computer with at least one GPU
#    `sudo apt install ffmpeg git`
#    `pip install rp`
#    The first time you run this it might be a bit slow (it will download necessary models)
#    The `rp` package will take care of installing the rest of the python packages for you

import rp

rp.r._pip_import_autoyes = True #Automatically install missing packages

rp.pip_import('fire')
rp.git_import('CommonSource') #If missing, installs code from https://github.com/RyannDaGreat/CommonSource
import rp.git.CommonSource.noise_warp as nw
import fire

def main(video:str, output_folder:str):
    """
    Takes a video URL or filepath and an output folder path.
    It resizes that video to height=480, width=720, 49 frames (CogVideoX's dimensions),
    then calculates warped noise at latent resolution (i.e. 1/8 of the width and height) with 16 channels.
    It saves that warped noise, the optical flows, and related preview videos and images to the output folder.
    The main file you need is <output_folder>/noises.npy, which holds the Gaussian noises in (H,W,C) form.
    """

    if rp.folder_exists(output_folder):
        raise RuntimeError(f"The given output_folder={repr(output_folder)} already exists! To avoid clobbering what might be in there, please specify a folder that doesn't exist so I can create one for you. Alternatively, you could delete that folder if you don't care what's in it.")

    FRAME = 2**-1 #We immediately resize the input frames by this factor, before calculating optical flow
        #The flow is calculated at (input size) × FRAME resolution.
        #Higher FRAME values result in slower optical flow calculation and higher intermediate noise resolution
        #Larger is not always better - watch the preview in Jupyter to see if it looks good!

    FLOW = 2**3 #Then, we use bilinear interpolation to upscale the flow by this factor
        #We warp the noise at (input size) × FRAME × FLOW resolution
        #The noise is then downsampled back to (input size)
        #Higher FLOW values result in more temporally consistent noise warping at the cost of higher VRAM usage and slower inference time
    LATENT = 8 #We further downsample the outputs by this amount - because 8 pixels wide corresponds to one latent wide in Stable Diffusion
        #The final output size is (input size) ÷ LATENT regardless of FRAME and FLOW

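    # Worked example with the defaults above (a sketch, assuming the 480x720 input the preprocessing below enforces):
    #     flow is computed at   480x720  × FRAME(1/2)               = 240x360
    #     noise is warped at    240x360  × FLOW(8)                  = 1920x2880
    #     downscale_factor    = round(FRAME × FLOW) × LATENT        = round(4) × 8 = 32
    #     final noise size    = 1920x2880 ÷ 32                      = 60x90, i.e. (input size) ÷ LATENT
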
    #LATENT = 1 #Uncomment this line for a prettier visualization! But for latent diffusion models, use LATENT=8

    #You can also use video files or URLs
    # video = "https://www.shutterstock.com/shutterstock/videos/1100085499/preview/stock-footage-bremen-germany-october-old-style-carousel-moving-on-square-in-city-horses-on-traditional.webm"

    # output_folder = "NoiseWarpOutputFolder"

    if isinstance(video, str):
        video = rp.load_video(video)

    #Preprocess the video
    video = rp.resize_list(video, length=49) #Stretch or squash video to 49 frames (CogVideoX's length)
    video = rp.resize_images_to_hold(video, height=480, width=720)
    video = rp.crop_images(video, height=480, width=720, origin='center') #Make the resolution 480x720 (CogVideoX's resolution)
    video = rp.as_numpy_array(video)


    #See this function's docstring for more information!
    output = nw.get_noise_from_video(
        video,
        remove_background=False, #Set this to True to matte the foreground - and force the background to have no flow
        visualize=True,          #Generates nice visualization videos and previews in Jupyter notebook
        save_files=True,         #Set this to False if you just want the noises without saving to a numpy file

        noise_channels=16,
        output_folder=output_folder,
        resize_frames=FRAME,
        resize_flow=FLOW,
        downscale_factor=round(FRAME * FLOW) * LATENT,
    )

    output.first_frame_path = rp.save_image(video[0], rp.path_join(output_folder, 'first_frame.png'))

    rp.save_video_mp4(video, rp.path_join(output_folder, 'input.mp4'), framerate=12, video_bitrate='max')

    #output.numpy_noises_downsampled = as_numpy_images(
    #    nw.resize_noise(
    #        as_torch_images(x),
    #        1 / 8,
    #    )for x
    #)
    #
    #output.numpy_noises_downsampled_path = path_join(output_folder, 'noises_downsampled.npy')
    #np.save(numpy_noises_downsampled_path, output.numpy_noises_downsampled)

    print("Noise shape:"  , output.numpy_noises.shape)
    print("Flow shape:"   , output.numpy_flows .shape)
    print("Output folder:", output.output_folder)

if __name__ == "__main__":
    fire.Fire(main)
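
# Example invocation (hypothetical paths; fire.Fire maps CLI flags onto main()'s parameters):
#     python make_warped_noise.py --video input_video.mp4 --output_folder NoiseWarpOutputFolder
# Afterwards NoiseWarpOutputFolder should contain noises.npy (the warped Gaussian noise), input.mp4
# (the 49-frame 480x720 resized video), first_frame.png, and the flow/preview files written by
# nw.get_noise_from_video. That is the kind of folder cut_and_drag_inference.py accepts as --sample_path.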
requirements.txt
ADDED
@@ -0,0 +1,18 @@
--index-url https://download.pytorch.org/whl/cu118
rp
torch
torchvision
diffusers
einops
easydict
transformers
accelerate
oldest-supported-numpy
sentencepiece
peft
opencv-contrib-python
imageio-ffmpeg
fire
moviepy
icecream
matplotlib
requirements_local.txt
ADDED
@@ -0,0 +1,12 @@
rp
easydict
oldest-supported-numpy
opencv-contrib-python
imageio-ffmpeg
fire
moviepy
icecream
matplotlib
art
torchvision
torch