fantaxy committed · Commit d1765b9 · verified · 1 Parent(s): ec476cf

Create app-backup.py

Files changed (1)
  1. app-backup.py +375 -0
app-backup.py ADDED
@@ -0,0 +1,375 @@
+ import spaces
+ from functools import lru_cache
+ import gradio as gr
+ from gradio_toggle import Toggle
+ import torch
+ from huggingface_hub import snapshot_download
+ from transformers import CLIPProcessor, CLIPModel, pipeline
+ import random
+ from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
+ from xora.models.transformers.transformer3d import Transformer3DModel
+ from xora.models.transformers.symmetric_patchifier import SymmetricPatchifier
+ from xora.schedulers.rf import RectifiedFlowScheduler
+ from xora.pipelines.pipeline_xora_video import XoraVideoPipeline
+ from transformers import T5EncoderModel, T5Tokenizer
+ from xora.utils.conditioning_method import ConditioningMethod
+ from pathlib import Path
+ import safetensors.torch
+ import json
+ import numpy as np
+ import cv2
+ from PIL import Image
+ import tempfile
+ import os
+ import gc
+ import csv
+ from datetime import datetime
+ from openai import OpenAI
+
+ # Initialize the Korean-to-English translation pipeline
+ translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")
+
+ # Disable TF32 and reduced-precision reductions so matmul/cuDNN paths run at full float32 precision
+ torch.backends.cuda.matmul.allow_tf32 = False
+ torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
+ torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
+ torch.backends.cudnn.allow_tf32 = False
+ torch.backends.cudnn.deterministic = False
+ torch.backends.cuda.preferred_blas_library = "cublas"
+ torch.set_float32_matmul_precision("highest")
+
+ MAX_SEED = np.iinfo(np.int32).max
+
+ # Load Hugging Face token if needed
+ hf_token = os.getenv("HF_TOKEN")
+ openai_api_key = os.getenv("OPENAI_API_KEY")
+ client = OpenAI(api_key=openai_api_key)
+
+ system_prompt_t2v_path = "assets/system_prompt_t2v.txt"
+ with open(system_prompt_t2v_path, "r") as f:
+     system_prompt_t2v = f.read()
+
+ # Set model download directory within Hugging Face Spaces
+ model_path = "asset"
+
+ commit_hash = 'c7c8ad4c2ddba847b94e8bfaefbd30bd8669fafc'
+
+ if not os.path.exists(model_path):
+     snapshot_download("Lightricks/LTX-Video", revision=commit_hash, local_dir=model_path, repo_type="model", token=hf_token)
+
+ # Global variables to load components
+ vae_dir = Path(model_path) / "vae"
+ unet_dir = Path(model_path) / "unet"
+ scheduler_dir = Path(model_path) / "scheduler"
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", cache_dir=model_path).to(torch.device("cuda:0"))
+ clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32", cache_dir=model_path)
+
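+ # Prompt preprocessing: prompts that contain Hangul syllables are routed through the
+ # Helsinki-NLP ko->en translator so the English-only text encoder receives English input.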
+ def process_prompt(prompt):
+     # Check whether the prompt contains Hangul syllables
+     if any(ord('가') <= ord(char) <= ord('힣') for char in prompt):
+         # Translate Korean to English
+         translated = translator(prompt)[0]['translation_text']
+         return translated
+     return prompt
+
+ def compute_clip_embedding(text=None):
+     inputs = clip_processor(text=text, return_tensors="pt", padding=True).to(device)
+     outputs = clip_model.get_text_features(**inputs)
+     embedding = outputs.detach().cpu().numpy().flatten().tolist()
+     return embedding
+
+ def load_vae(vae_dir):
+     vae_ckpt_path = vae_dir / "vae_diffusion_pytorch_model.safetensors"
+     vae_config_path = vae_dir / "config.json"
+     with open(vae_config_path, "r") as f:
+         vae_config = json.load(f)
+     vae = CausalVideoAutoencoder.from_config(vae_config)
+     vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
+     vae.load_state_dict(vae_state_dict)
+     return vae.to(device).to(torch.bfloat16)
+
+ def load_unet(unet_dir):
+     unet_ckpt_path = unet_dir / "unet_diffusion_pytorch_model.safetensors"
+     unet_config_path = unet_dir / "config.json"
+     transformer_config = Transformer3DModel.load_config(unet_config_path)
+     transformer = Transformer3DModel.from_config(transformer_config)
+     unet_state_dict = safetensors.torch.load_file(unet_ckpt_path)
+     transformer.load_state_dict(unet_state_dict, strict=True)
+     return transformer.to(device).to(torch.bfloat16)
+
+ def load_scheduler(scheduler_dir):
+     scheduler_config_path = scheduler_dir / "scheduler_config.json"
+     scheduler_config = RectifiedFlowScheduler.load_config(scheduler_config_path)
+     return RectifiedFlowScheduler.from_config(scheduler_config)
+
+ # Preset options for resolution and frame configuration
+ preset_options = [
+     {"label": "1216x704, 41 frames", "width": 1216, "height": 704, "num_frames": 41},
+     {"label": "1088x704, 49 frames", "width": 1088, "height": 704, "num_frames": 49},
+     {"label": "1056x640, 57 frames", "width": 1056, "height": 640, "num_frames": 57},
+     {"label": "448x448, 100 frames", "width": 448, "height": 448, "num_frames": 100},
+     {"label": "448x448, 200 frames", "width": 448, "height": 448, "num_frames": 200},
+     {"label": "448x448, 300 frames", "width": 448, "height": 448, "num_frames": 300},
+     {"label": "640x640, 80 frames", "width": 640, "height": 640, "num_frames": 80},
+     {"label": "640x640, 120 frames", "width": 640, "height": 640, "num_frames": 120},
+     {"label": "768x768, 64 frames", "width": 768, "height": 768, "num_frames": 64},
+     {"label": "768x768, 90 frames", "width": 768, "height": 768, "num_frames": 90},
+     {"label": "720x720, 64 frames", "width": 768, "height": 768, "num_frames": 64},
+     {"label": "720x720, 100 frames", "width": 768, "height": 768, "num_frames": 100},
+     {"label": "768x512, 97 frames", "width": 768, "height": 512, "num_frames": 97},
+     {"label": "512x512, 160 frames", "width": 512, "height": 512, "num_frames": 160},
+     {"label": "512x512, 200 frames", "width": 512, "height": 512, "num_frames": 200},
+ ]
+
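+ # Keep the advanced sliders in sync with the preset dropdown: a fixed preset fills in
+ # height/width/frame count and hides the sliders, while "Custom" clears them and shows the sliders.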
+ def preset_changed(preset):
+     if preset != "Custom":
+         selected = next(item for item in preset_options if item["label"] == preset)
+         return (
+             selected["height"],
+             selected["width"],
+             selected["num_frames"],
+             gr.update(visible=False),
+             gr.update(visible=False),
+             gr.update(visible=False),
+         )
+     else:
+         return (
+             None,
+             None,
+             None,
+             gr.update(visible=True),
+             gr.update(visible=True),
+             gr.update(visible=True),
+         )
+
+ # Load models
+ vae = load_vae(vae_dir)
+ unet = load_unet(unet_dir)
+ scheduler = load_scheduler(scheduler_dir)
+ patchifier = SymmetricPatchifier(patch_size=1)
+ text_encoder = T5EncoderModel.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder").to(torch.device("cuda:0"))
+ tokenizer = T5Tokenizer.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer")
+
+ pipeline = XoraVideoPipeline(
+     transformer=unet,
+     patchifier=patchifier,
+     text_encoder=text_encoder,
+     tokenizer=tokenizer,
+     scheduler=scheduler,
+     vae=vae,
+ ).to(torch.device("cuda:0"))
+
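+ # Optional prompt enhancement: when the toggle is on, the prompt is rewritten by an OpenAI
+ # chat completion using the text-to-video system prompt loaded above; any API failure falls
+ # back to the original prompt.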
+ def enhance_prompt_if_enabled(prompt, enhance_toggle):
+     if not enhance_toggle:
+         print("Enhance toggle is off, Prompt: ", prompt)
+         return prompt
+
+     messages = [
+         {"role": "system", "content": system_prompt_t2v},
+         {"role": "user", "content": prompt},
+     ]
+
+     try:
+         response = client.chat.completions.create(
+             model="gpt-4o-mini",
+             messages=messages,
+             max_tokens=200,
+         )
+         print("Enhanced Prompt: ", response.choices[0].message.content.strip())
+         return response.choices[0].message.content.strip()
+     except Exception as e:
+         print(f"Error: {e}")
+         return prompt
+
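+ # Main generation entry point. @spaces.GPU(duration=90) requests a ZeroGPU slot for up to
+ # 90 seconds per call; the function translates and validates the prompt, runs the Xora video
+ # pipeline, and encodes the resulting frames to an MP4 file with OpenCV.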
+ @spaces.GPU(duration=90)
+ def generate_video_from_text_90(
+     prompt="",
+     enhance_prompt_toggle=False,
+     negative_prompt="",
+     frame_rate=25,
+     seed=random.randint(0, MAX_SEED),
+     num_inference_steps=30,
+     guidance_scale=3.2,
+     height=768,
+     width=768,
+     num_frames=60,
+     progress=gr.Progress(),
+ ):
+     # Preprocess the prompts (Korean -> English)
+     prompt = process_prompt(prompt)
+     negative_prompt = process_prompt(negative_prompt)
+
+     if len(prompt.strip()) < 50:
+         raise gr.Error(
+             "Prompt must be at least 50 characters long. Please provide more details for the best results.",
+             duration=5,
+         )
+
+     prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle)
+
+     sample = {
+         "prompt": prompt,
+         "prompt_attention_mask": None,
+         "negative_prompt": negative_prompt,
+         "negative_prompt_attention_mask": None,
+         "media_items": None,
+     }
+
+     generator = torch.Generator(device="cuda").manual_seed(seed)
+
+     def gradio_progress_callback(self, step, timestep, kwargs):
+         progress((step + 1) / num_inference_steps)
+
+     try:
+         with torch.no_grad():
+             images = pipeline(
+                 num_inference_steps=num_inference_steps,
+                 num_images_per_prompt=1,
+                 guidance_scale=guidance_scale,
+                 generator=generator,
+                 output_type="pt",
+                 height=height,
+                 width=width,
+                 num_frames=num_frames,
+                 frame_rate=frame_rate,
+                 **sample,
+                 is_video=True,
+                 vae_per_channel_normalize=True,
+                 conditioning_method=ConditioningMethod.UNCONDITIONAL,
+                 mixed_precision=True,
+                 callback_on_step_end=gradio_progress_callback,
+             ).images
+     except Exception as e:
+         raise gr.Error(
+             f"An error occurred while generating the video. Please try again. Error: {e}",
+             duration=5,
+         )
+     finally:
+         torch.cuda.empty_cache()
+         gc.collect()
+
+     output_path = tempfile.mktemp(suffix=".mp4")
+     # (1, C, F, H, W) -> (F, H, W, C), scaled to uint8 for OpenCV
+     video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
+     video_np = (video_np * 255).astype(np.uint8)
+     height, width = video_np.shape[1:3]
+     out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (width, height))
+     for frame in video_np[..., ::-1]:  # RGB -> BGR for cv2
+         out.write(frame)
+     out.release()
+     del images
+     del video_np
+     torch.cuda.empty_cache()
+     return output_path
+
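+ # Advanced options accordion: seed, inference steps and guidance scale are always shown;
+ # the height/width/frame sliders start hidden and are only revealed for the "Custom" preset.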
+ def create_advanced_options():
+     with gr.Accordion("Step 4: Advanced Options (Optional)", open=False):
+         seed = gr.Slider(label="4.1 Seed", minimum=0, maximum=1000000, step=1, value=646373)
+         inference_steps = gr.Slider(label="4.2 Inference Steps", minimum=5, maximum=150, step=5, value=40)
+         guidance_scale = gr.Slider(label="4.3 Guidance Scale", minimum=1.0, maximum=10.0, step=0.1, value=4.2)
+
+         height_slider = gr.Slider(
+             label="4.4 Height",
+             minimum=256,
+             maximum=1024,
+             step=64,
+             value=768,
+             visible=False,
+         )
+         width_slider = gr.Slider(
+             label="4.5 Width",
+             minimum=256,
+             maximum=1024,
+             step=64,
+             value=768,
+             visible=False,
+         )
+         num_frames_slider = gr.Slider(
+             label="4.6 Number of Frames",
+             minimum=1,
+             maximum=500,
+             step=1,
+             value=60,
+             visible=False,
+         )
+
+         return [
+             seed,
+             inference_steps,
+             guidance_scale,
+             height_slider,
+             width_slider,
+             num_frames_slider,
+         ]
+
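+ # Gradio UI: a single column with prompt inputs, preset and frame-rate controls, the advanced
+ # accordion, a generate button wired to generate_video_from_text_90, and the video output.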
+ with gr.Blocks(theme=gr.themes.Soft()) as iface:
+
+     with gr.Column():
+         txt2vid_prompt = gr.Textbox(
+             label="Step 1: Enter Your Prompt (Korean or English)",
+             placeholder="Describe the video you want to generate (at least 50 characters)...",
+             value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up focused on the brown-haired woman's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene.",
+             lines=5,
+         )
+
+         txt2vid_enhance_toggle = Toggle(
+             label="Enhance Prompt",
+             value=False,
+             interactive=True,
+         )
+
+         txt2vid_negative_prompt = gr.Textbox(
+             label="Step 2: Enter Negative Prompt",
+             placeholder="Describe elements you do not want in the video...",
+             value="low quality, worst quality, deformed, distorted, damaged, motion blur, motion artifacts, fused fingers, incorrect anatomy, strange hands, ugly",
+             lines=2,
+         )
+
+         txt2vid_preset = gr.Dropdown(
+             choices=[p["label"] for p in preset_options],
+             value="512x512, 160 frames",
+             label="Step 3.1: Choose Resolution Preset",
+         )
+
+         txt2vid_frame_rate = gr.Slider(
+             label="Step 3.2: Frame Rate",
+             minimum=6,
+             maximum=60,
+             step=1,
+             value=20,
+         )
+
+         txt2vid_advanced = create_advanced_options()
+         txt2vid_generate = gr.Button(
+             "Step 5: Generate Video",
+             variant="primary",
+             size="lg",
+         )
+
+         txt2vid_output = gr.Video(label="Generated Output")
+
+     txt2vid_preset.change(
+         fn=preset_changed,
+         inputs=[txt2vid_preset],
+         outputs=txt2vid_advanced[3:],
+     )
+
+     txt2vid_generate.click(
+         fn=generate_video_from_text_90,
+         inputs=[
+             txt2vid_prompt,
+             txt2vid_enhance_toggle,
+             txt2vid_negative_prompt,
+             txt2vid_frame_rate,
+             *txt2vid_advanced,
+         ],
+         outputs=txt2vid_output,
+         concurrency_limit=1,
+         concurrency_id="generate_video",
+         queue=True,
+     )
+
+ iface.queue(max_size=64, default_concurrency_limit=1, api_open=False).launch(share=True, show_api=False)
+ # ===== Application Startup at 2024-12-20 01:30:34 =====