Spaces: Runtime error

Haoxin Chen committed
Commit · 4949f04 · 1 Parent(s): d98ccdc

update videocrafter2

Files changed:
- app.py (+2 −37)
- configs/inference_t2v_512_v2.0.yaml (+77 −0)
- t2v_test.py (+8 −8)
app.py
CHANGED
@@ -3,7 +3,6 @@ import sys
 import gradio as gr
 # from demo_test import Text2Video, Image2Video
 from t2v_test import Text2Video
-from i2v_test import Image2Video
 sys.path.insert(1, os.path.join(sys.path[0], 'lvdm'))
 
 t2v_examples = [
@@ -15,18 +14,14 @@ t2v_examples = [
     ['Robot dancing in times square',25,12,1,16],
 ]
 
-i2v_examples = [
-    ['prompts/i2v_prompts/horse.png', 'horses are walking on the grassland', 50, 12, 1, 16]
-]
 
 def videocrafter_demo(result_dir='./tmp/'):
     text2video = Text2Video(result_dir)
-    image2video = Image2Video(result_dir)
     with gr.Blocks(analytics_enabled=False) as videocrafter_iface:
-        gr.Markdown("<div align='center'> <h2>
+        gr.Markdown("<div align='center'> <h2> VideoCrafter2: Overcoming Data Limitations for High-Quality Video Diffusion Models </span> </h2> \
                     <a style='font-size:18px;color: #000000' href='https://github.com/AILab-CVC/VideoCrafter'> Github </div>")
 
-        gr.Markdown("<b> You may duplicate the space and upgrade to GPU in settings for better performance and faster inference without waiting in the queue. <a style='display:inline-block' href='https://huggingface.co/spaces/VideoCrafter/VideoCrafter?duplicate=true'> <img src='https://bit.ly/3gLdBN6' alt='Duplicate Space'></a> </b>")
+        # gr.Markdown("<b> You may duplicate the space and upgrade to GPU in settings for better performance and faster inference without waiting in the queue. <a style='display:inline-block' href='https://huggingface.co/spaces/VideoCrafter/VideoCrafter?duplicate=true'> <img src='https://bit.ly/3gLdBN6' alt='Duplicate Space'></a> </b>")
         #######t2v#######
         with gr.Tab(label="Text2Video"):
             with gr.Column():
@@ -54,36 +49,6 @@ def videocrafter_demo(result_dir='./tmp/'):
                           inputs=[input_text,steps,cfg_scale,eta,fps],
                           outputs=[output_video_1],
                           )
-        #######image2video######
-        with gr.Tab(label='Image2Video'):
-            with gr.Column():
-                with gr.Row():
-                    with gr.Column():
-                        with gr.Row():
-                            i2v_input_image = gr.Image(label="Input Image").style(width=256)
-                        with gr.Row():
-                            i2v_input_text = gr.Text(label='Prompts')
-                        with gr.Row():
-                            i2v_eta = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, label='ETA', value=1.0, elem_id="i2v_eta")
-                            i2v_cfg_scale = gr.Slider(minimum=1.0, maximum=30.0, step=0.5, label='CFG Scale', value=12.0, elem_id="i2v_cfg_scale")
-                        with gr.Row():
-                            i2v_steps = gr.Slider(minimum=1, maximum=60, step=1, elem_id="i2v_steps", label="Sampling steps", value=50)
-                            i2v_fps = gr.Slider(minimum=4, maximum=32, step=1, elem_id="i2v_fps", label="Generative fps", value=16)
-                        i2v_end_btn = gr.Button("Send")
-                    with gr.Tab(label='Result'):
-                        with gr.Row():
-                            i2v_output_video = gr.Video(label="Generated Video").style(width=512)
-
-        gr.Examples(examples=i2v_examples,
-                    inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_fps],
-                    outputs=[i2v_output_video],
-                    fn = image2video.get_image,
-                    cache_examples=os.getenv('SYSTEM') == 'spaces',
-                    )
-        i2v_end_btn.click(inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_fps],
-                          outputs=[i2v_output_video],
-                          fn = image2video.get_image
-                          )
 
     return videocrafter_iface
 
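With the Image2Video tab removed, app.py now builds a single-tab Gradio Blocks app around Text2Video. For context, a minimal launcher sketch; the queue and server settings below are illustrative assumptions, not part of this commit, and the Space's own entry point may differ:

# Hedged sketch: launching the trimmed text-to-video demo.
# Assumes gradio 3.x and that app.py exposes videocrafter_demo();
# the queue size and bind address are assumptions, not from this diff.
from app import videocrafter_demo

iface = videocrafter_demo(result_dir='./tmp/')
iface.queue(max_size=12)                # serialize GPU requests (assumption)
iface.launch(server_name='0.0.0.0')     # bind for Spaces-style hosting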
configs/inference_t2v_512_v2.0.yaml
ADDED
@@ -0,0 +1,77 @@
+model:
+  target: lvdm.models.ddpm3d.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.012
+    num_timesteps_cond: 1
+    timesteps: 1000
+    first_stage_key: video
+    cond_stage_key: caption
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    image_size:
+    - 40
+    - 64
+    channels: 4
+    scale_by_std: false
+    scale_factor: 0.18215
+    use_ema: false
+    uncond_type: empty_seq
+    use_scale: true
+    scale_b: 0.7
+    unet_config:
+      target: lvdm.modules.networks.openaimodel3d.UNetModel
+      params:
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions:
+        - 4
+        - 2
+        - 1
+        num_res_blocks: 2
+        channel_mult:
+        - 1
+        - 2
+        - 4
+        - 4
+        num_head_channels: 64
+        transformer_depth: 1
+        context_dim: 1024
+        use_linear: true
+        use_checkpoint: true
+        temporal_conv: true
+        temporal_attention: true
+        temporal_selfatt_only: true
+        use_relative_position: false
+        use_causal_attention: false
+        temporal_length: 16
+        addition_attention: true
+        fps_cond: true
+    first_stage_config:
+      target: lvdm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 512
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+    cond_stage_config:
+      target: lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder
+      params:
+        freeze: true
+        layer: penultimate
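The new config follows the target/params convention of latent-diffusion-style codebases: each block names a dotted class path plus its constructor kwargs, and t2v_test.py pops the model block before building the model. A minimal sketch of how such a block is typically resolved; this instantiate_from_config is a generic reimplementation written for illustration, not necessarily the exact helper the lvdm package ships:

import importlib
from omegaconf import OmegaConf

def instantiate_from_config(cfg):
    # 'target' is a dotted class path; 'params' holds its constructor kwargs.
    module_name, cls_name = cfg["target"].rsplit(".", 1)
    cls = getattr(importlib.import_module(module_name), cls_name)
    return cls(**cfg.get("params", {}))

config = OmegaConf.load('configs/inference_t2v_512_v2.0.yaml')
model_config = config.pop("model", OmegaConf.create())  # as in t2v_test.py
# model = instantiate_from_config(model_config)  # requires the lvdm package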
t2v_test.py
CHANGED
@@ -12,8 +12,8 @@ class Text2Video():
         self.result_dir = result_dir
         if not os.path.exists(self.result_dir):
             os.mkdir(self.result_dir)
-        ckpt_path='checkpoints/
-        config_file='configs/
+        ckpt_path='checkpoints/base_512_v2/model.ckpt'
+        config_file='configs/inference_t2v_512_v2.0.yaml'
         config = OmegaConf.load(config_file)
         model_config = config.pop("model", OmegaConf.create())
         model_config['params']['unet_config']['params']['use_checkpoint']=False
@@ -40,7 +40,7 @@ class Text2Video():
         batch_size=1
         channels = model.model.diffusion_model.in_channels
         frames = model.temporal_length
-        h, w =
+        h, w = 320 // 8, 512 // 8
         noise_shape = [batch_size, channels, frames, h, w]
 
         #prompts = batch_size * [""]
@@ -61,15 +61,15 @@ class Text2Video():
         return os.path.join(self.result_dir, f"{prompt_str}.mp4")
 
     def download_model(self):
-        REPO_ID = 'VideoCrafter/
+        REPO_ID = 'VideoCrafter/VideoCrafter2'
         filename_list = ['model.ckpt']
-        if not os.path.exists('./checkpoints/
-            os.makedirs('./checkpoints/
+        if not os.path.exists('./checkpoints/base_512_v2/'):
+            os.makedirs('./checkpoints/base_512_v2/')
         for filename in filename_list:
-            local_file = os.path.join('./checkpoints/
+            local_file = os.path.join('./checkpoints/base_512_v2/', filename)
 
             if not os.path.exists(local_file):
-                hf_hub_download(repo_id=REPO_ID, filename=filename, local_dir='./checkpoints/
+                hf_hub_download(repo_id=REPO_ID, filename=filename, local_dir='./checkpoints/base_512_v2/', local_dir_use_symlinks=False)
 
 
 if __name__ == '__main__':
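The hard-coded h, w = 320 // 8, 512 // 8 encodes the autoencoder's 8x spatial downsampling, so the latent grid matches image_size: [40, 64] in the new config. A quick sanity check of the noise-shape arithmetic, using only values visible in this diff and the added yaml:

# Latent noise shape for the 512x320 VideoCrafter2 base model.
# channels and frames mirror the config (in_channels: 4, temporal_length: 16).
batch_size = 1
channels = 4
frames = 16
h, w = 320 // 8, 512 // 8        # VAE downsamples each spatial dim by 8
noise_shape = [batch_size, channels, frames, h, w]
assert (h, w) == (40, 64)        # agrees with image_size in the yaml
print(noise_shape)               # [1, 4, 16, 40, 64]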