MohamedRashad committed on
Commit
32287b3
1 Parent(s): a8efd17

Add initial project structure with requirements and utility functions

.gitignore ADDED
@@ -0,0 +1,171 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # PyPI configuration file
+ .pypirc
app.py ADDED
@@ -0,0 +1,475 @@
+ import os
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ import os.path as osp
+ import time
+ import hashlib
+ import argparse
+ import shutil
+ import re
+ import random
+ from pathlib import Path
+ from typing import List
+
+ import cv2
+ import numpy as np
+ import pandas as pd
+ import torch
+ import torch.nn.functional as F
+ from PIL import Image, ImageEnhance
+ import PIL.Image as PImage
+ from torchvision.transforms.functional import to_tensor
+ from transformers import AutoTokenizer, T5EncoderModel, T5TokenizerFast, T5Tokenizer, T5ForConditionalGeneration
+ from huggingface_hub import hf_hub_download
+ import gradio as gr
+ import spaces
+
+ from models.infinity import Infinity
+ from models.basic import *
+ from utils.dynamic_resolution import dynamic_resolution_h_w, h_div_w_templates
+
+ torch._dynamo.config.cache_size_limit = 64
+
+ # Download weights if they are not already present locally
+ def download_weights(weights_path):
+     try:
+         model_file = weights_path / 'infinity_2b_reg.pth'
+         if not model_file.exists():
+             hf_hub_download(repo_id="FoundationVision/Infinity", filename="infinity_2b_reg.pth", local_dir=str(weights_path))
+
+         vae_file = weights_path / 'infinity_vae_d32reg.pth'
+         if not vae_file.exists():
+             hf_hub_download(repo_id="FoundationVision/Infinity", filename="infinity_vae_d32reg.pth", local_dir=str(weights_path))
+
+         # For the text encoder, we need to download the entire model
+         text_encoder_ckpt = weights_path / 'flan-t5-xl'
+         if not text_encoder_ckpt.exists():
+             tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
+             model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl")
+             tokenizer.save_pretrained(text_encoder_ckpt)
+             model.save_pretrained(text_encoder_ckpt)
+     except Exception as e:
+         print(f"Error downloading weights: {e}")
+
+ def extract_key_val(text):
+     pattern = r'<(.+?):(.+?)>'
+     matches = re.findall(pattern, text)
+     key_val = {}
+     for match in matches:
+         key_val[match[0]] = match[1].lstrip()
+     return key_val
+
+ def encode_prompt(text_tokenizer, text_encoder, prompt, enable_positive_prompt=False):
+     if enable_positive_prompt:
+         print(f'before positive_prompt aug: {prompt}')
+         prompt = aug_with_positive_prompt(prompt)
+         print(f'after positive_prompt aug: {prompt}')
+     print(f'prompt={prompt}')
+     captions = [prompt]
+     tokens = text_tokenizer(text=captions, max_length=512, padding='max_length', truncation=True, return_tensors='pt')  # todo: put this into dataset
+     input_ids = tokens.input_ids.cuda(non_blocking=True)
+     mask = tokens.attention_mask.cuda(non_blocking=True)
+     text_features = text_encoder(input_ids=input_ids, attention_mask=mask)['last_hidden_state'].float()
+     lens: List[int] = mask.sum(dim=-1).tolist()
+     cu_seqlens_k = F.pad(mask.sum(dim=-1).to(dtype=torch.int32).cumsum_(0), (1, 0))
+     Ltext = max(lens)
+     kv_compact = []
+     for len_i, feat_i in zip(lens, text_features.unbind(0)):
+         kv_compact.append(feat_i[:len_i])
+     kv_compact = torch.cat(kv_compact, dim=0)
+     text_cond_tuple = (kv_compact, lens, cu_seqlens_k, Ltext)
+     return text_cond_tuple
+
+ def aug_with_positive_prompt(prompt):
+     for key in ['man', 'woman', 'men', 'women', 'boy', 'girl', 'child', 'person', 'human', 'adult', 'teenager', 'employee',
+                 'employer', 'worker', 'mother', 'father', 'sister', 'brother', 'grandmother', 'grandfather', 'son', 'daughter']:
+         if key in prompt:
+             prompt = prompt + '. very smooth faces, good looking faces, face to the camera, perfect facial features'
+             break
+     return prompt
+
+ def enhance_image(image):
+     contrast_image = image.copy()
+     contrast_enhancer = ImageEnhance.Contrast(contrast_image)
+     contrast_image = contrast_enhancer.enhance(1.05)  # slightly boost contrast
+     color_image = contrast_image.copy()
+     color_enhancer = ImageEnhance.Color(color_image)
+     color_image = color_enhancer.enhance(1.05)  # slightly boost saturation
+     return color_image
+
+ def gen_one_img(
+     infinity_test,
+     vae,
+     text_tokenizer,
+     text_encoder,
+     prompt,
+     cfg_list=[],
+     tau_list=[],
+     negative_prompt='',
+     scale_schedule=None,
+     top_k=900,
+     top_p=0.97,
+     cfg_sc=3,
+     cfg_exp_k=0.0,
+     cfg_insertion_layer=-5,
+     vae_type=0,
+     gumbel=0,
+     softmax_merge_topk=-1,
+     gt_leak=-1,
+     gt_ls_Bl=None,
+     g_seed=None,
+     sampling_per_bits=1,
+     enable_positive_prompt=0,
+ ):
+     sstt = time.time()
+     if not isinstance(cfg_list, list):
+         cfg_list = [cfg_list] * len(scale_schedule)
+     if not isinstance(tau_list, list):
+         tau_list = [tau_list] * len(scale_schedule)
+     text_cond_tuple = encode_prompt(text_tokenizer, text_encoder, prompt, enable_positive_prompt)
+     if negative_prompt:
+         negative_label_B_or_BLT = encode_prompt(text_tokenizer, text_encoder, negative_prompt)
+     else:
+         negative_label_B_or_BLT = None
+     print(f'cfg: {cfg_list}, tau: {tau_list}')
+     with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16, cache_enabled=True):
+         stt = time.time()
+         _, _, img_list = infinity_test.autoregressive_infer_cfg(
+             vae=vae,
+             scale_schedule=scale_schedule,
+             label_B_or_BLT=text_cond_tuple, g_seed=g_seed,
+             B=1, negative_label_B_or_BLT=negative_label_B_or_BLT, force_gt_Bhw=None,
+             cfg_sc=cfg_sc, cfg_list=cfg_list, tau_list=tau_list, top_k=top_k, top_p=top_p,
+             returns_vemb=1, ratio_Bl1=None, gumbel=gumbel, norm_cfg=False,
+             cfg_exp_k=cfg_exp_k, cfg_insertion_layer=cfg_insertion_layer,
+             vae_type=vae_type, softmax_merge_topk=softmax_merge_topk,
+             ret_img=True, trunk_scale=1000,
+             gt_leak=gt_leak, gt_ls_Bl=gt_ls_Bl, inference_mode=True,
+             sampling_per_bits=sampling_per_bits,
+         )
+     print(f"cost: {time.time() - sstt}, infinity cost={time.time() - stt}")
+     img = img_list[0]
+     return img
+
+ def get_prompt_id(prompt):
+     md5 = hashlib.md5()
+     md5.update(prompt.encode('utf-8'))
+     prompt_id = md5.hexdigest()
+     return prompt_id
+
+ def save_slim_model(infinity_model_path, save_file=None, device='cpu', key='gpt_fsdp'):
+     print('[Save slim model]')
+     full_ckpt = torch.load(infinity_model_path, map_location=device)
+     infinity_slim = full_ckpt['trainer'][key]
+     # ema_state_dict = cpu_d['trainer'].get('gpt_ema_fsdp', state_dict)
+     if not save_file:
+         save_file = osp.splitext(infinity_model_path)[0] + '-slim.pth'
+     print(f'Save to {save_file}')
+     torch.save(infinity_slim, save_file)
+     print('[Save slim model] done')
+     return save_file
+
+ def load_tokenizer(t5_path=''):
+     print('[Loading tokenizer and text encoder]')
+     text_tokenizer: T5TokenizerFast = AutoTokenizer.from_pretrained(t5_path, revision=None, legacy=True)
+     text_tokenizer.model_max_length = 512
+     text_encoder: T5EncoderModel = T5EncoderModel.from_pretrained(t5_path, torch_dtype=torch.float16)
+     text_encoder.to('cuda')
+     text_encoder.eval()
+     text_encoder.requires_grad_(False)
+     return text_tokenizer, text_encoder
+
+ def load_infinity(
+     rope2d_each_sa_layer,
+     rope2d_normalized_by_hw,
+     use_scale_schedule_embedding,
+     pn,
+     use_bit_label,
+     add_lvl_embeding_only_first_block,
+     model_path='',
+     scale_schedule=None,
+     vae=None,
+     device='cuda',
+     model_kwargs=None,
+     text_channels=2048,
+     apply_spatial_patchify=0,
+     use_flex_attn=False,
+     bf16=False,
+ ):
+     print('[Loading Infinity]')
+     text_maxlen = 512
+     with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16, cache_enabled=True), torch.no_grad():
+         infinity_test: Infinity = Infinity(
+             vae_local=vae, text_channels=text_channels, text_maxlen=text_maxlen,
+             shared_aln=True, raw_scale_schedule=scale_schedule,
+             checkpointing='full-block',
+             customized_flash_attn=False,
+             fused_norm=True,
+             pad_to_multiplier=128,
+             use_flex_attn=use_flex_attn,
+             add_lvl_embeding_only_first_block=add_lvl_embeding_only_first_block,
+             use_bit_label=use_bit_label,
+             rope2d_each_sa_layer=rope2d_each_sa_layer,
+             rope2d_normalized_by_hw=rope2d_normalized_by_hw,
+             pn=pn,
+             apply_spatial_patchify=apply_spatial_patchify,
+             inference_mode=True,
+             train_h_div_w_list=[1.0],
+             **model_kwargs,
+         ).to(device=device)
+         print(f'[you selected Infinity with {model_kwargs=}] model size: {sum(p.numel() for p in infinity_test.parameters())/1e9:.2f}B, bf16={bf16}')
+
+         if bf16:
+             for block in infinity_test.unregistered_blocks:
+                 block.bfloat16()
+
+         infinity_test.eval()
+         infinity_test.requires_grad_(False)
+
+         infinity_test.cuda()
+         torch.cuda.empty_cache()
+
+         print('[Load Infinity weights]')
+         state_dict = torch.load(model_path, map_location=device)
+         print(infinity_test.load_state_dict(state_dict))
+         infinity_test.rng = torch.Generator(device=device)
+         return infinity_test
+
+ def transform(pil_img, tgt_h, tgt_w):
+     width, height = pil_img.size
+     if width / height <= tgt_w / tgt_h:
+         resized_width = tgt_w
+         resized_height = int(tgt_w / (width / height))
+     else:
+         resized_height = tgt_h
+         resized_width = int((width / height) * tgt_h)
+     pil_img = pil_img.resize((resized_width, resized_height), resample=PImage.LANCZOS)
+     # crop the center out
+     arr = np.array(pil_img)
+     crop_y = (arr.shape[0] - tgt_h) // 2
+     crop_x = (arr.shape[1] - tgt_w) // 2
+     im = to_tensor(arr[crop_y: crop_y + tgt_h, crop_x: crop_x + tgt_w])
+     return im.add(im).add_(-1)  # im + im - 1 == 2*im - 1: map [0, 1] to [-1, 1]
+
+ def joint_vi_vae_encode_decode(vae, image_path, scale_schedule, device, tgt_h, tgt_w):
+     pil_image = Image.open(image_path).convert('RGB')
+     inp = transform(pil_image, tgt_h, tgt_w)
+     inp = inp.unsqueeze(0).to(device)
+     scale_schedule = [(item[0], item[1], item[2]) for item in scale_schedule]
+     t1 = time.time()
+     h, z, _, all_bit_indices, _, infinity_input = vae.encode(inp, scale_schedule=scale_schedule)
+     t2 = time.time()
+     recons_img = vae.decode(z)[0]
+     if len(recons_img.shape) == 4:
+         recons_img = recons_img.squeeze(1)
+     print(f'recons: z.shape: {z.shape}, recons_img shape: {recons_img.shape}')
+     t3 = time.time()
+     print(f'vae encode takes {t2-t1:.2f}s, decode takes {t3-t2:.2f}s')
+     recons_img = (recons_img + 1) / 2
+     recons_img = recons_img.permute(1, 2, 0).mul_(255).cpu().numpy().astype(np.uint8)
+     gt_img = (inp[0] + 1) / 2
+     gt_img = gt_img.permute(1, 2, 0).mul_(255).cpu().numpy().astype(np.uint8)
+     print(recons_img.shape, gt_img.shape)
+     return gt_img, recons_img, all_bit_indices
+
+ def load_visual_tokenizer(args):
+     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+     # load vae
+     if args.vae_type in [16, 18, 20, 24, 32, 64]:
+         from models.bsq_vae.vae import vae_model
+         schedule_mode = "dynamic"
+         codebook_dim = args.vae_type
+         codebook_size = 2**codebook_dim
+         if args.apply_spatial_patchify:
+             patch_size = 8
+             encoder_ch_mult = [1, 2, 4, 4]
+             decoder_ch_mult = [1, 2, 4, 4]
+         else:
+             patch_size = 16
+             encoder_ch_mult = [1, 2, 4, 4, 4]
+             decoder_ch_mult = [1, 2, 4, 4, 4]
+         vae = vae_model(args.vae_path, schedule_mode, codebook_dim, codebook_size, patch_size=patch_size,
+                         encoder_ch_mult=encoder_ch_mult, decoder_ch_mult=decoder_ch_mult, test_mode=True).to(device)
+     else:
+         raise ValueError(f'vae_type={args.vae_type} not supported')
+     return vae
+
+ def load_transformer(vae, args):
+     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+     model_path = args.model_path
+     if args.checkpoint_type == 'torch':
+         # copy large model to local; save slim to local; and copy slim to nas; load local slim model
+         if osp.exists(args.cache_dir):
+             local_model_path = osp.join(args.cache_dir, 'tmp', model_path.replace('/', '_'))
+         else:
+             local_model_path = model_path
+         if args.enable_model_cache:
+             slim_model_path = model_path.replace('ar-', 'slim-')
+             local_slim_model_path = local_model_path.replace('ar-', 'slim-')
+             os.makedirs(osp.dirname(local_slim_model_path), exist_ok=True)
+             print(f'model_path: {model_path}, slim_model_path: {slim_model_path}')
+             print(f'local_model_path: {local_model_path}, local_slim_model_path: {local_slim_model_path}')
+             if not osp.exists(local_slim_model_path):
+                 if osp.exists(slim_model_path):
+                     print(f'copy {slim_model_path} to {local_slim_model_path}')
+                     shutil.copyfile(slim_model_path, local_slim_model_path)
+                 else:
+                     if not osp.exists(local_model_path):
+                         print(f'copy {model_path} to {local_model_path}')
+                         shutil.copyfile(model_path, local_model_path)
+                     save_slim_model(local_model_path, save_file=local_slim_model_path, device=device)
+                     print(f'copy {local_slim_model_path} to {slim_model_path}')
+                     if not osp.exists(slim_model_path):
+                         shutil.copyfile(local_slim_model_path, slim_model_path)
+                     os.remove(local_model_path)
+                     os.remove(model_path)
+             slim_model_path = local_slim_model_path
+         else:
+             slim_model_path = model_path
+         print(f'load checkpoint from {slim_model_path}')
+
+     if args.model_type == 'infinity_2b':
+         kwargs_model = dict(depth=32, embed_dim=2048, num_heads=2048//128, drop_path_rate=0.1, mlp_ratio=4, block_chunks=8)  # 2b model
+     elif args.model_type == 'infinity_layer12':
+         kwargs_model = dict(depth=12, embed_dim=768, num_heads=8, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4)
+     elif args.model_type == 'infinity_layer16':
+         kwargs_model = dict(depth=16, embed_dim=1152, num_heads=12, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4)
+     elif args.model_type == 'infinity_layer24':
+         kwargs_model = dict(depth=24, embed_dim=1536, num_heads=16, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4)
+     elif args.model_type == 'infinity_layer32':
+         kwargs_model = dict(depth=32, embed_dim=2080, num_heads=20, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4)
+     elif args.model_type == 'infinity_layer40':
+         kwargs_model = dict(depth=40, embed_dim=2688, num_heads=24, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4)
+     elif args.model_type == 'infinity_layer48':
+         kwargs_model = dict(depth=48, embed_dim=3360, num_heads=28, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4)
+     infinity = load_infinity(
+         rope2d_each_sa_layer=args.rope2d_each_sa_layer,
+         rope2d_normalized_by_hw=args.rope2d_normalized_by_hw,
+         use_scale_schedule_embedding=args.use_scale_schedule_embedding,
+         pn=args.pn,
+         use_bit_label=args.use_bit_label,
+         add_lvl_embeding_only_first_block=args.add_lvl_embeding_only_first_block,
+         model_path=slim_model_path,
+         scale_schedule=None,
+         vae=vae,
+         device=device,
+         model_kwargs=kwargs_model,
+         text_channels=args.text_channels,
+         apply_spatial_patchify=args.apply_spatial_patchify,
+         use_flex_attn=args.use_flex_attn,
+         bf16=args.bf16,
+     )
+     return infinity
+
+ # Set up paths
+ weights_path = Path(__file__).parent / 'weights'
+ weights_path.mkdir(exist_ok=True)
+ download_weights(weights_path)
+
+ # Device setup
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32
+
+ # Define args
+ args = argparse.Namespace(
+     pn='1M',
+     model_path=str(weights_path / 'infinity_2b_reg.pth'),
+     cfg_insertion_layer=0,
+     vae_type=32,
+     vae_path=str(weights_path / 'infinity_vae_d32reg.pth'),
+     add_lvl_embeding_only_first_block=1,
+     use_bit_label=1,
+     model_type='infinity_2b',
+     rope2d_each_sa_layer=1,
+     rope2d_normalized_by_hw=2,
+     use_scale_schedule_embedding=0,
+     sampling_per_bits=1,
+     text_encoder_ckpt=str(weights_path / 'flan-t5-xl'),
+     text_channels=2048,
+     apply_spatial_patchify=0,
+     h_div_w_template=1.000,
+     use_flex_attn=0,
+     cache_dir='/dev/shm',
+     checkpoint_type='torch',
+     seed=0,
+     bf16=1 if dtype == torch.bfloat16 else 0,
+     save_file='tmp.jpg',
+     enable_model_cache=False,
+ )
+
+ # Load models
+ text_tokenizer, text_encoder = load_tokenizer(t5_path=str(weights_path / 'flan-t5-xl'))
+ vae = load_visual_tokenizer(args)
+ infinity = load_transformer(vae, args)
+
+ # Define the image generation function
+ @spaces.GPU
+ def generate_image(prompt, cfg, tau, h_div_w, seed, enable_positive_prompt):
+     try:
+         args.prompt = prompt
+         args.cfg = cfg
+         args.tau = tau
+         args.h_div_w = h_div_w
+         args.seed = seed
+         args.enable_positive_prompt = enable_positive_prompt
+
+         # Find the closest h_div_w_template
+         h_div_w_template_ = h_div_w_templates[np.argmin(np.abs(h_div_w_templates - h_div_w))]
+
+         # Get scale_schedule based on h_div_w_template_
+         scale_schedule = dynamic_resolution_h_w[h_div_w_template_][args.pn]['scales']
+         scale_schedule = [(1, h, w) for (_, h, w) in scale_schedule]
+
+         # Generate the image
+         generated_image = gen_one_img(
+             infinity,
+             vae,
+             text_tokenizer,
+             text_encoder,
+             prompt,
+             g_seed=seed,
+             gt_leak=0,
+             gt_ls_Bl=None,
+             cfg_list=cfg,
+             tau_list=tau,
+             scale_schedule=scale_schedule,
+             cfg_insertion_layer=[args.cfg_insertion_layer],
+             vae_type=args.vae_type,
+             sampling_per_bits=args.sampling_per_bits,
+             enable_positive_prompt=enable_positive_prompt,
+         )
+
+         # Convert the image to RGB and uint8
+         image = generated_image.cpu().numpy()
+         image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+         image = np.uint8(image)
+
+         return image
+     except Exception as e:
+         print(f"Error generating image: {e}")
+         return None
+
+ # Set up Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("<h1><center>Infinity Image Generator</center></h1>")
+
+     with gr.Row():
+         prompt = gr.Textbox(label="Prompt", value="alien spaceship enterprise")
+         cfg = gr.Slider(label="CFG", minimum=1, maximum=10, step=0.5, value=3)
+         tau = gr.Slider(label="Tau", minimum=0.1, maximum=1.0, step=0.1, value=0.5)
+         h_div_w = gr.Slider(label="Aspect Ratio (Height/Width)", minimum=0.5, maximum=2.0, step=0.1, value=1.0)
+         seed = gr.Number(label="Seed", value=random.randint(0, 10000))
+         enable_positive_prompt = gr.Checkbox(label="Enable Positive Prompt", value=False)
+
+     generate_button = gr.Button("Generate Image")
+     output_image = gr.Image(label="Generated Image", type="pil")
+
+     generate_button.click(
+         generate_image,
+         inputs=[prompt, cfg, tau, h_div_w, seed, enable_positive_prompt],
+         outputs=output_image
+     )
+
+ # Launch the Gradio app
+ demo.launch()
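
Note: `generate_image` above snaps the requested aspect ratio to the nearest supported template before looking up a scale schedule. A minimal standalone sketch of that selection step (the template values below are illustrative placeholders; the real array comes from `utils.dynamic_resolution.h_div_w_templates`):

    import numpy as np

    # hypothetical template values, for illustration only
    h_div_w_templates = np.array([0.5, 0.667, 1.0, 1.5, 2.0])

    h_div_w = 1.1  # user-requested height/width ratio
    closest = h_div_w_templates[np.argmin(np.abs(h_div_w_templates - h_div_w))]
    print(closest)  # 1.0 -- the scale schedule for this template is then used for generation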
models/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import torch
+ from timm.loss import SoftTargetCrossEntropy
+
+ from timm.models.layers import DropPath
+
+ from .infinity import Infinity, sample_with_top_k_top_p_also_inplace_modifying_logits_
+
+ def _ex_repr(self):
+     return ', '.join(
+         f'{k}=' + (f'{v:g}' if isinstance(v, float) else str(v))
+         for k, v in vars(self).items()
+         if not k.startswith('_') and k != 'training'
+         and not isinstance(v, (torch.nn.Module, torch.Tensor))
+     )
+ for clz in (torch.nn.CrossEntropyLoss, SoftTargetCrossEntropy):  # no longer __repr__ DropPath with drop_prob
+     if hasattr(clz, 'extra_repr'):
+         clz.extra_repr = _ex_repr
+     else:
+         clz.__repr__ = lambda self: f'{type(self).__name__}({_ex_repr(self)})'
+
+ DropPath.__repr__ = lambda self: f'{type(self).__name__}(...)'
+
+ alias_dict = {}
+ for d in range(6, 40+2, 2):
+     alias_dict[f'd{d}'] = f'infinity_d{d}'
+ alias_dict_inv = {v: k for k, v in alias_dict.items()}
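
The alias loop at the end of `models/__init__.py` builds a fixed name mapping; an equivalent comprehension, shown only to make the result concrete:

    alias_dict = {f'd{d}': f'infinity_d{d}' for d in range(6, 42, 2)}
    # {'d6': 'infinity_d6', 'd8': 'infinity_d8', ..., 'd40': 'infinity_d40'}
    alias_dict_inv = {v: k for k, v in alias_dict.items()}
    assert alias_dict_inv['infinity_d40'] == 'd40'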
models/basic.py ADDED
@@ -0,0 +1,575 @@
+ """
+ Definitions of blocks of VAR transformer model.
+ """
+
+ import math
+ import os
+ from functools import partial
+ from typing import Optional, Tuple, Union
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import numpy as np
+ from timm.models.layers import DropPath, drop_path
+ from torch.utils.checkpoint import checkpoint
+
+ # Import flash_attn's attention
+ from flash_attn import flash_attn_func                  # q, k, or v: BLHc, ret: BLHc
+ from flash_attn import flash_attn_varlen_kvpacked_func  # qkv: N3Hc, ret: NHc
+
+ from torch.nn.functional import scaled_dot_product_attention as slow_attn  # q, k, v: BHLc
+
+ # Import flash_attn's fused ops
+ try:
+     from flash_attn.ops.layer_norm import dropout_add_layer_norm
+     from flash_attn.ops.rms_norm import dropout_add_rms_norm
+     from flash_attn.ops.rms_norm import rms_norm as rms_norm_impl
+     from flash_attn.ops.fused_dense import fused_mlp_func
+     flash_fused_op_installed = True
+ except ImportError:
+     dropout_add_layer_norm = dropout_add_rms_norm = fused_mlp_func = None
+     flash_fused_op_installed = False
+
+     def rms_norm_impl(x, weight, epsilon):
+         return (x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True).add_(epsilon))) * weight
+
+
+ def precompute_rope2d_freqs_grid(dim, dynamic_resolution_h_w, rope2d_normalized_by_hw, pad_to_multiplier=1, max_height=2048 // 16, max_width=2048 // 16, base=10000.0, device=None, scaling_factor=1.0):
+     # split the dimension into half, one for x and one for y
+     half_dim = dim // 2
+     inv_freq = 1.0 / (base ** (torch.arange(0, half_dim, 2, dtype=torch.int64).float().to(device) / half_dim))  # namely theta, 1 / (10000^(i/half_dim)), i=0,2,..., half_dim-2
+     t_height = torch.arange(max_height, device=device, dtype=torch.int64).type_as(inv_freq)
+     t_width = torch.arange(max_width, device=device, dtype=torch.int64).type_as(inv_freq)
+     t_height = t_height / scaling_factor
+     freqs_height = torch.outer(t_height, inv_freq)  # (max_height, dim / (1 for 1d, 2 for 2d, 3 for 3d) / 2), namely y*theta
+     t_width = t_width / scaling_factor
+     freqs_width = torch.outer(t_width, inv_freq)  # (max_width, dim / (1 for 1d, 2 for 2d, 3 for 3d) / 2), namely x*theta
+     freqs_grid_map = torch.concat([
+         freqs_height[:, None, :].expand(-1, max_width, -1),  # (max_height, max_width, dim / (1 for 1d, 2 for 2d, 3 for 3d) / 2)
+         freqs_width[None, :, :].expand(max_height, -1, -1),  # (max_height, max_width, dim / (1 for 1d, 2 for 2d, 3 for 3d) / 2)
+     ], dim=-1)  # (max_height, max_width, dim / (1 for 1d, 2 for 2d, 3 for 3d))
+     freqs_grid_map = torch.stack([torch.cos(freqs_grid_map), torch.sin(freqs_grid_map)], dim=0)
+     # (2, max_height, max_width, dim / (1 for 1d, 2 for 2d, 3 for 3d))
+
+     rope2d_freqs_grid = {}
+     for h_div_w in dynamic_resolution_h_w:
+         scale_schedule = dynamic_resolution_h_w[h_div_w]['1M']['scales']
+         _, ph, pw = scale_schedule[-1]
+         max_edge_length = freqs_grid_map.shape[1]
+         if ph >= pw:
+             uph, upw = max_edge_length, int(max_edge_length / ph * pw)
+         else:
+             uph, upw = int(max_edge_length / pw * ph), max_edge_length
+         rope_cache_list = []
+         for (_, ph, pw) in scale_schedule:
+             ph_mul_pw = ph * pw
+             if rope2d_normalized_by_hw == 1:  # downsample
+                 rope_cache = F.interpolate(freqs_grid_map[:, :uph, :upw, :].permute([0, 3, 1, 2]), size=(ph, pw), mode='bilinear', align_corners=True)
+                 rope_cache = rope_cache.permute([0, 2, 3, 1])  # (2, ph, pw, half_head_dim)
+             elif rope2d_normalized_by_hw == 2:  # star style
+                 _, uph, upw = scale_schedule[-1]
+                 indices = torch.stack([
+                     (torch.arange(ph) * (uph / ph)).reshape(ph, 1).expand(ph, pw),
+                     (torch.arange(pw) * (upw / pw)).reshape(1, pw).expand(ph, pw),
+                 ], dim=-1).round().int()  # (ph, pw, 2)
+                 indices = indices.reshape(-1, 2)  # (ph*pw, 2)
+                 rope_cache = freqs_grid_map[:, indices[:, 0], indices[:, 1], :]  # (2, ph*pw, half_head_dim)
+                 rope_cache = rope_cache.reshape(2, ph, pw, -1)
+             elif rope2d_normalized_by_hw == 0:
+                 rope_cache = freqs_grid_map[:, :ph, :pw, :]  # (2, ph, pw, half_head_dim)
+             else:
+                 raise ValueError(f'Unknown rope2d_normalized_by_hw: {rope2d_normalized_by_hw}')
+             rope_cache_list.append(rope_cache.reshape(2, ph_mul_pw, -1))
+         cat_rope_cache = torch.cat(rope_cache_list, 1)  # (2, seq_len, half_head_dim)
+         if cat_rope_cache.shape[1] % pad_to_multiplier:
+             pad = torch.zeros(2, pad_to_multiplier - cat_rope_cache.shape[1] % pad_to_multiplier, half_dim)
+             cat_rope_cache = torch.cat([cat_rope_cache, pad], dim=1)
+         cat_rope_cache = cat_rope_cache[:, None, None, None]  # (2, 1, 1, 1, seq_len, half_dim)
+         for pn in dynamic_resolution_h_w[h_div_w]:
+             scale_schedule = dynamic_resolution_h_w[h_div_w][pn]['scales']
+             tmp_scale_schedule = [(1, h, w) for _, h, w in scale_schedule]
+             rope2d_freqs_grid[str(tuple(tmp_scale_schedule))] = cat_rope_cache
+     return rope2d_freqs_grid
+
+
+ def apply_rotary_emb(q, k, scale_schedule, rope2d_freqs_grid, pad_to_multiplier, rope2d_normalized_by_hw, scale_ind):
+     qk = torch.stack((q, k), dim=0)  # (2, batch_size, heads, seq_len, head_dim)
+     device_type = qk.device.type
+     device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+     with torch.autocast(device_type=device_type, enabled=False):
+         seq_len = qk.shape[3]
+         start = 0
+         if scale_ind >= 1:
+             assert len(scale_schedule[0]) == 3
+             start = np.sum([item[0] * item[1] * item[2] for item in scale_schedule[:scale_ind]])
+         rope2d_freqs_grid[str(tuple(scale_schedule))] = rope2d_freqs_grid[str(tuple(scale_schedule))].to(qk.device)
+         assert start + seq_len <= rope2d_freqs_grid[str(tuple(scale_schedule))].shape[4]
+         rope_cache = rope2d_freqs_grid[str(tuple(scale_schedule))][:, :, :, :, start:start+seq_len]  # rope_cache shape: [2, 1, 1, 1, seq_len, half_head_dim]
+         qk = qk.reshape(*qk.shape[:-1], -1, 2)  # (2, batch_size, heads, seq_len, half_head_dim, 2)
+         qk = torch.stack([
+             rope_cache[0] * qk[..., 0] - rope_cache[1] * qk[..., 1],
+             rope_cache[1] * qk[..., 0] + rope_cache[0] * qk[..., 1],
+         ], dim=-1)  # (2, batch_size, heads, seq_len, half_head_dim, 2); here stack + reshape should not be concat
+         qk = qk.reshape(*qk.shape[:-2], -1)  # (2, batch_size, heads, seq_len, head_dim)
+         q, k = qk.unbind(dim=0)  # (batch_size, heads, seq_len, head_dim)
+     return q, k
+
+
+ class FastRMSNorm(nn.Module):
+     def __init__(self, C, eps=1e-6, elementwise_affine=True):
+         super().__init__()
+         self.C = C
+         self.eps = eps
+         self.elementwise_affine = elementwise_affine
+         if self.elementwise_affine:
+             self.weight = nn.Parameter(torch.ones(C))
+         else:
+             self.register_buffer('weight', torch.ones(C))
+
+     def forward(self, x):
+         src_type = x.dtype
+         return rms_norm_impl(x.float(), self.weight, epsilon=self.eps).to(src_type)
+
+     def extra_repr(self) -> str:
+         return f'C={self.C}, eps={self.eps:g}, elementwise_affine={self.elementwise_affine}'
+
+
+ def get_dropout_layer(p):
+     return nn.Dropout(p, inplace=True) if p > 0 else nn.Identity()
+
+
+ class FFN(nn.Module):
+     def __init__(self, in_features, hidden_features=None, out_features=None, drop=0., fused_mlp=False):
+         super().__init__()
+         self.fused_mlp_func = fused_mlp_func if fused_mlp else None
+         out_features = out_features or in_features
+         hidden_features = hidden_features or in_features
+         self.fc1 = nn.Linear(in_features, hidden_features)
+         self.act = nn.GELU(approximate='tanh')
+         self.fc2 = nn.Linear(hidden_features, out_features)
+         self.drop = get_dropout_layer(drop)
+         self.heuristic = -1
+
+     def forward(self, x):
+         if self.fused_mlp_func is not None:
+             return self.drop(self.fused_mlp_func(
+                 x=x,
+                 weight1=self.fc1.weight,
+                 weight2=self.fc2.weight,
+                 bias1=self.fc1.bias,
+                 bias2=self.fc2.bias,
+                 activation='gelu_approx',
+                 save_pre_act=self.training,
+                 return_residual=False,
+                 checkpoint_lvl=0,
+                 heuristic=self.heuristic,
+                 process_group=None,
+             ))
+         else:
+             return self.drop(self.fc2(self.act(self.fc1(x))))
+
+     def extra_repr(self) -> str:
+         return f'fused_mlp={self.fused_mlp_func is not None}'
+
+
+ class FFNSwiGLU(nn.Module):
+     def __init__(self, in_features, hidden_features, out_features=None, drop=0., fused_mlp=False):
+         super().__init__()
+         self.fused_mlp_func = None
+         hidden_features = round(2 * hidden_features / 3 / 256) * 256
+
+         out_features = out_features or in_features
+         self.fcg = nn.Linear(in_features, hidden_features, bias=False)
+         self.fc1 = nn.Linear(in_features, hidden_features, bias=False)
+         self.fc2 = nn.Linear(hidden_features, out_features, bias=False)
+         self.drop = get_dropout_layer(drop)
+
+     def forward(self, x):
+         return self.drop(self.fc2(F.silu(self.fcg(x), inplace=True).mul_(self.fc1(x))))
+
+     def extra_repr(self) -> str:
+         return f'fused_mlp={self.fused_mlp_func is not None}'
+
+
+ class SelfAttention(nn.Module):
+     def __init__(
+         self, embed_dim=768, num_heads=12,
+         proj_drop=0., tau=1, cos_attn=False, customized_flash_attn=True, use_flex_attn=False,
+         batch_size=2, pad_to_multiplier=1, rope2d_normalized_by_hw=0,
+     ):
+         """
+         :param embed_dim: model's width
+         :param num_heads: num heads of multi-head attention
+         :param proj_drop: always 0 for testing
+         :param tau: always 1
+         :param cos_attn: always True: during attention, q and k will be L2-normalized and scaled by a head-wise learnable parameter self.scale_mul_1H11
+         :param customized_flash_attn:
+         """
+         super().__init__()
+         assert embed_dim % num_heads == 0
+         self.using_flash = customized_flash_attn
+
+         self.num_heads, self.head_dim = num_heads, embed_dim // num_heads
+         self.tau, self.cos_attn = tau, cos_attn
+         if self.cos_attn:
+             self.scale = 1
+             size = (1, 1, self.num_heads, 1) if self.using_flash else (1, self.num_heads, 1, 1)
+             # size: 11H1 or 1H11
+             self.scale_mul_1H11 = nn.Parameter(torch.full(size=size, fill_value=4.0).log(), requires_grad=True)
+             self.max_scale_mul = torch.log(torch.tensor(100)).item()
+         else:
+             self.scale = 1 / math.sqrt(self.head_dim) / self.tau
+
+         self.mat_qkv = nn.Linear(embed_dim, embed_dim * 3, bias=False)
+         self.q_bias, self.v_bias = nn.Parameter(torch.zeros(embed_dim)), nn.Parameter(torch.zeros(embed_dim))
+         self.register_buffer('zero_k_bias', torch.zeros(embed_dim))
+
+         self.proj = nn.Linear(embed_dim, embed_dim)
+         self.proj_drop = get_dropout_layer(proj_drop)
+
+         self.caching = False   # kv caching: only used during inference
+         self.cached_k = None   # kv caching: only used during inference
+         self.cached_v = None   # kv caching: only used during inference
+
+         self.batch_size = batch_size
+         self.use_flex_attn = use_flex_attn
+         self.pad_to_multiplier = pad_to_multiplier
+
+         self.rope2d_normalized_by_hw = rope2d_normalized_by_hw
+
+     def kv_caching(self, enable: bool):  # kv caching: only used during inference
+         self.caching = enable
+         self.cached_k = None
+         self.cached_v = None
+
+     # NOTE: attn_bias_or_two_vector is None during inference
+     def forward(self, x, attn_bias_or_two_vector: Union[torch.Tensor, Tuple[torch.IntTensor, torch.IntTensor]], attn_fn=None, scale_schedule=None, rope2d_freqs_grid=None, scale_ind=0):
+         """
+         :param (fp32) x: shaped (B or batch_size, L or seq_length, C or hidden_dim); if seq-parallel is used, the `L` dim would be shared
+         :param (fp32) attn_bias_or_two_vector:
+             if not using_flash:
+                 a block-wise, lower-triangle matrix, like:
+                 [[[[0, -, -, -, -, -, -, -, -, -, -, -, -, -],
+                    [0, 0, 0, 0, 0, -, -, -, -, -, -, -, -, -],
+                    [0, 0, 0, 0, 0, -, -, -, -, -, -, -, -, -],
+                    [0, 0, 0, 0, 0, -, -, -, -, -, -, -, -, -],
+                    [0, 0, 0, 0, 0, -, -, -, -, -, -, -, -, -],
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]]
+                 where 0 means visible and - means invisible (-inf)
+             else:
+                 a tuple of two 1-dim int vector (VAR_visible_kvlen, VAR_invisible_qlen)
+         :return: shaped (B or batch_size, L or seq_length, C or hidden_dim); if seq-parallel is used, the `L` dim would be shared
+         """
+         # x: fp32
+         B, L, C = x.shape
+
+         # qkv: amp, bf16
+         qkv = F.linear(input=x, weight=self.mat_qkv.weight, bias=torch.cat((self.q_bias, self.zero_k_bias, self.v_bias))).view(B, L, 3, self.num_heads, self.head_dim)  # BL3Hc
+         if self.using_flash: q, k, v = qkv.unbind(dim=2); L_dim = 1            # q or k or v: all are shaped in (B:batch_size, L:seq_len, H:heads, c:head_dim)
+         else: q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(dim=0); L_dim = 2    # q or k or v: all are shaped in (B:batch_size, H:heads, L:seq_len, c:head_dim)
+
+         if self.cos_attn:   # always True
+             scale_mul = self.scale_mul_1H11.clamp_max(self.max_scale_mul).exp()  # 11H1 (flash), or 1H11 (not flash)
+             q = F.normalize(q, dim=-1, eps=1e-12).mul(scale_mul).contiguous()    # fp32
+             k = F.normalize(k, dim=-1, eps=1e-12).contiguous()                   # fp32
+             v = v.contiguous()   # bf16
+         else:   # be contiguous, to make kernel happy
+             q = q.contiguous()   # bf16
+             k = k.contiguous()   # bf16
+             v = v.contiguous()   # bf16
+         if rope2d_freqs_grid is not None:
+             q, k = apply_rotary_emb(q, k, scale_schedule, rope2d_freqs_grid, self.pad_to_multiplier, self.rope2d_normalized_by_hw, scale_ind)
+         if self.caching:   # kv caching: only used during inference
+             if self.cached_k is None: self.cached_k = k; self.cached_v = v
+             else: k = self.cached_k = torch.cat((self.cached_k, k), dim=L_dim); v = self.cached_v = torch.cat((self.cached_v, v), dim=L_dim)
+
+         if self.using_flash:
+             if attn_bias_or_two_vector is not None:  # training
+                 kw = dict(VAR_visible_kvlen=attn_bias_or_two_vector[0], VAR_invisible_qlen=attn_bias_or_two_vector[1])
+             else:  # inference (autoregressive sampling)
+                 kw = dict()
+             oup = flash_attn_func(q.to(v.dtype), k.to(v.dtype), v, dropout_p=0, softmax_scale=self.scale, **kw).view(B, L, C)
+         else:
+             # if self.cos_attn: q, k are in fp32; v is in bf16
+             # else: q, k, v are in bf16
+             if self.use_flex_attn and attn_fn is not None:
+                 oup = attn_fn(q, k, v, scale=self.scale).transpose(1, 2).reshape(B, L, C)
+             else:
+                 oup = slow_attn(query=q, key=k, value=v, scale=self.scale, attn_mask=attn_bias_or_two_vector, dropout_p=0).transpose(1, 2).reshape(B, L, C)
+             # oup: bf16
+
+         return self.proj_drop(self.proj(oup))
+
+     def extra_repr(self) -> str:
+         tail = ''
+         return f'using_flash={self.using_flash}, tau={self.tau}, cos_attn={self.cos_attn}{tail}'
+
+
+ class CrossAttention(nn.Module):
+     def __init__(
+         self, for_attn_pool=False, embed_dim=768, kv_dim=4096, num_heads=12,
+         proj_drop=0., cos_attn=False,
+     ):
+         """
+         :param for_attn_pool: only used in VAR.text_proj_for_sos
+         :param embed_dim: Q's dim
+         :param kv_dim: K's and V's dim
+         :param num_heads: num heads of multi-head attention
+         :param proj_drop: proj drop out
+         :param cos_attn: during attention, q and k will be L2-normalized and scaled by a head-wise learnable parameter self.scale_mul_1H11
+         """
+         cos_attn = False  # TODO: never use cos attn in cross attention with T5 kv
+         super().__init__()
+         self.for_attn_pool = for_attn_pool
+         self.embed_dim = embed_dim
+         self.kv_dim = kv_dim
+         assert embed_dim % num_heads == 0
+         self.num_heads, self.head_dim = num_heads, embed_dim // num_heads  # =64
+         self.cos_attn = cos_attn
+         if self.cos_attn:
+             self.scale = 1
+             self.scale_mul_1H1 = nn.Parameter(torch.full(size=(1, self.num_heads, 1, 1), fill_value=4.0).log(), requires_grad=True)
+             self.max_scale_mul = torch.log(torch.tensor(100)).item()
+         else:
+             self.scale = 1 / math.sqrt(self.head_dim)
+
+         if for_attn_pool:
+             q = torch.empty(1, self.num_heads, self.head_dim)
+             nn.init.trunc_normal_(q, mean=0, std=math.sqrt(1 / embed_dim / 3))
+             self.mat_q = nn.Parameter(q)
+         else:
+             self.mat_q = nn.Linear(embed_dim, embed_dim, bias=True)
+         self.mat_kv = nn.Linear(kv_dim, embed_dim*2, bias=False)
+         self.v_bias = nn.Parameter(torch.zeros(embed_dim))
+         self.register_buffer('zero_k_bias', torch.zeros(embed_dim))
+
+         self.proj = nn.Linear(embed_dim, embed_dim)
+         self.proj_drop = get_dropout_layer(proj_drop)
+
+     def forward(self, q, ca_kv):
+         """
+         :param q: shaped as (batch, seq_len, Q_dim)
+         :param ca_kv: contains several vectors, each of which is shaped as (len_i, KV_dim). We have [len_1xKV_dim, len_2xKV_dim, len_3xKV_dim, ...] and lens == [len_1, len_2, len_3, ...]
+             - kv_compact: shaped as (sum(lens), KV_dim)
+             - cu_seqlens_k: cumulated sum of lens
+             - max_seqlen_k: int, max(lens)
+         NOTE: seq_len (num of Qs) can reach 10k; but len_i (num of KVs) must <= 256
+
+         :return: shaped as (batch, seq_len, Q_dim)
+         """
+         kv_compact, cu_seqlens_k, max_seqlen_k = ca_kv
+         N = kv_compact.shape[0]
+
+         kv_compact = F.linear(kv_compact, weight=self.mat_kv.weight, bias=torch.cat((self.zero_k_bias, self.v_bias))).view(N, 2, self.num_heads, self.head_dim)  # NC => N2Hc
+         # attn_bias = xformers.ops.fmha.BlockDiagonalMask.from_seqlens
+
+         if not self.for_attn_pool:
+             B, Lq = q.shape[:2]
+             q_compact = self.mat_q(q).view(-1, self.num_heads, self.head_dim)
+         else:
+             B = cu_seqlens_k.shape[0] - 1
+             Lq = 1
+             q_compact = self.mat_q.repeat(B, 1, 1).to(dtype=kv_compact.dtype)
+
+         if self.cos_attn:   # always False
+             scale_mul = self.scale_mul_1H1.clamp_max(self.max_scale_mul).exp()
+             k, v = kv_compact.unbind(dim=1)
+             q_compact = F.normalize(q_compact, dim=-1).mul(scale_mul)
+             k = F.normalize(k, dim=-1)
+             kv_compact = torch.stack((k, v), dim=1)
+
+         q_compact = q_compact.contiguous()
+         kv_compact = kv_compact.contiguous()
+
+         cu_seqlens_q = torch.arange(0, Lq * (B+1), Lq, dtype=torch.int32, device=q_compact.device)
+         if q_compact.dtype == torch.float32:  # todo: fp16 or bf16?
+             oup = flash_attn_varlen_kvpacked_func(q=q_compact.to(dtype=torch.bfloat16), kv=kv_compact.to(dtype=torch.bfloat16), cu_seqlens_q=cu_seqlens_q, cu_seqlens_k=cu_seqlens_k, max_seqlen_q=Lq, max_seqlen_k=max_seqlen_k, dropout_p=0, softmax_scale=self.scale).reshape(B, Lq, -1)
+             oup = oup.float()
+         else:
+             oup = flash_attn_varlen_kvpacked_func(q=q_compact, kv=kv_compact, cu_seqlens_q=cu_seqlens_q, cu_seqlens_k=cu_seqlens_k, max_seqlen_q=Lq, max_seqlen_k=max_seqlen_k, dropout_p=0, softmax_scale=self.scale).reshape(B, Lq, -1)
+
+         return self.proj_drop(self.proj(oup))
+
+     def extra_repr(self) -> str:
+         return f'Cq={self.embed_dim}, Ckv={self.kv_dim}, cos_attn={self.cos_attn}'
+
+
+ class SelfAttnBlock(nn.Module):
+     def __init__(
+         self, embed_dim, kv_dim, cross_attn_layer_scale, cond_dim, act: bool, shared_aln: bool, norm_layer: partial,
+         num_heads, mlp_ratio=4., drop=0., drop_path=0., tau=1, cos_attn=False,
+         swiglu=False, customized_flash_attn=False, fused_mlp=False, fused_norm_func=None, checkpointing_sa_only=False,
+     ):
+         super(SelfAttnBlock, self).__init__()
+         self.C, self.D = embed_dim, cond_dim
+         self.drop_path_rate = drop_path
+         self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+         self.attn = SelfAttention(
+             embed_dim=embed_dim, num_heads=num_heads, proj_drop=drop, tau=tau, cos_attn=cos_attn, customized_flash_attn=customized_flash_attn,
+         )
+         self.using_swiglu = swiglu
+         self.ffn = (FFNSwiGLU if swiglu else FFN)(in_features=embed_dim, hidden_features=round(embed_dim * mlp_ratio / 256) * 256, drop=drop, fused_mlp=fused_mlp)
+
+         self.ln_wo_grad = norm_layer(embed_dim, elementwise_affine=False)
+         self.fused_norm_func = fused_norm_func
+         self.norm_eps = norm_layer.keywords.get('eps', 1e-6)
+
+         self.shared_aln = shared_aln
+         if self.shared_aln:
+             self.ada_gss = nn.Parameter(torch.randn(1, 1, 6, embed_dim) / embed_dim**0.5)
+         else:
+             lin = nn.Linear(cond_dim, 6*embed_dim)
+             self.ada_lin = nn.Sequential(nn.SiLU(inplace=False), lin) if act else nn.Sequential(lin)
+
+     # NOTE: attn_bias_or_two_vector is None during inference
+     def forward(self, x, cond_BD, ca_kv, attn_bias_or_two_vector):  # todo: minGPT and vqgan also uses pre-norm, just like this, while MaskGiT uses post-norm
+         with torch.cuda.amp.autocast(enabled=False):
+             if self.shared_aln:  # always True; (1, 1, 6, C) + (B, 1, 6, C)
+                 gamma1, gamma2, scale1, scale2, shift1, shift2 = (self.ada_gss + cond_BD).unbind(2)  # 116C + B16C =unbind(2)=> 6 B1C
+             else:
+                 gamma1, gamma2, scale1, scale2, shift1, shift2 = self.ada_lin(cond_BD).view(-1, 1, 6, self.C).unbind(2)
+
+             if self.fused_norm_func is None:
+                 x = x + self.drop_path(self.attn(self.ln_wo_grad(x.float()).mul(scale1.add(1)).add_(shift1), attn_bias_or_two_vector=attn_bias_or_two_vector).mul_(gamma1))
+                 x = x + self.drop_path(self.ffn(self.ln_wo_grad(x.float()).mul(scale2.add(1)).add_(shift2)).mul(gamma2))  # this mul(gamma2) cannot be in-placed cuz we possibly use FusedMLP
+             else:
+                 x = x + self.drop_path(self.attn(self.fused_norm_func(C=self.C, eps=self.norm_eps, x=x, scale=scale1, shift=shift1), attn_bias_or_two_vector=attn_bias_or_two_vector).mul_(gamma1))
+                 x = x + self.drop_path(self.ffn(self.fused_norm_func(C=self.C, eps=self.norm_eps, x=x, scale=scale2, shift=shift2)).mul(gamma2))  # this mul(gamma2) cannot be in-placed cuz we possibly use FusedMLP
+             return x
+
+     def extra_repr(self) -> str:
+         return f'shared_aln={self.shared_aln}, fused_norm={self.fused_norm_func is not None}'
+
+
+ class CrossAttnBlock(nn.Module):
+     def __init__(
+         self,
+         embed_dim, kv_dim, cross_attn_layer_scale, cond_dim, act: bool, shared_aln: bool, norm_layer: partial,
+         num_heads, mlp_ratio=4., drop=0., drop_path=0., tau=1, cos_attn=False,
+         swiglu=False, customized_flash_attn=False, fused_mlp=False, fused_norm_func=None, checkpointing_sa_only=False,
+         use_flex_attn=False, batch_size=2, pad_to_multiplier=1, apply_rope2d=False, rope2d_normalized_by_hw=False,
+     ):
+         super(CrossAttnBlock, self).__init__()
+         self.C, self.D = embed_dim, cond_dim
+         self.drop_path_rate = drop_path
+         self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+         self.sa = SelfAttention(
+             embed_dim=embed_dim, num_heads=num_heads, proj_drop=drop, tau=tau, cos_attn=cos_attn, customized_flash_attn=customized_flash_attn,
+             use_flex_attn=use_flex_attn, batch_size=batch_size, pad_to_multiplier=pad_to_multiplier, rope2d_normalized_by_hw=rope2d_normalized_by_hw,
+         )
+         self.ca = CrossAttention(embed_dim=embed_dim, kv_dim=kv_dim, num_heads=num_heads, proj_drop=drop, cos_attn=cos_attn)
+         self.using_swiglu = swiglu
+         self.ffn = (FFNSwiGLU if swiglu else FFN)(in_features=embed_dim, hidden_features=round(embed_dim * mlp_ratio / 256) * 256, drop=drop, fused_mlp=fused_mlp)
+
+         self.ln_wo_grad = norm_layer(embed_dim, elementwise_affine=False)
+         self.fused_norm_func = fused_norm_func
+         self.norm_eps = norm_layer.keywords.get('eps', 1e-6)
+         self.ca_norm = norm_layer(embed_dim, elementwise_affine=True)
+
+         self.shared_aln = shared_aln
+         if self.shared_aln:  # always True
+             self.ada_gss = nn.Parameter(torch.randn(1, 1, 6, embed_dim) / embed_dim**0.5)
+         else:
+             lin = nn.Linear(cond_dim, 6*embed_dim)
+             self.ada_lin = nn.Sequential(nn.SiLU(inplace=False), lin) if act else nn.Sequential(lin)
+
+         if cross_attn_layer_scale >= 0:
+             self.ca_gamma = nn.Parameter(cross_attn_layer_scale * torch.ones(embed_dim), requires_grad=True)
+         else:
+             self.ca_gamma = 1
+
+         self.checkpointing_sa_only = checkpointing_sa_only
+
+     # NOTE: attn_bias_or_two_vector is None during inference
+     def forward(self, x, cond_BD, ca_kv, attn_bias_or_two_vector, attn_fn=None, scale_schedule=None, rope2d_freqs_grid=None, scale_ind=0):  # todo: minGPT and vqgan also uses pre-norm, just like this, while MaskGiT uses post-norm
+         with torch.cuda.amp.autocast(enabled=False):  # disable half precision
+             if self.shared_aln:  # always True; (1, 1, 6, C) + (B, 1, 6, C)
+                 gamma1, gamma2, scale1, scale2, shift1, shift2 = (self.ada_gss + cond_BD).unbind(2)  # 116C + B16C =unbind(2)=> 6 B1C
+             else:
+                 gamma1, gamma2, scale1, scale2, shift1, shift2 = self.ada_lin(cond_BD).view(-1, 1, 6, self.C).unbind(2)
+
+             if self.fused_norm_func is None:
+                 x_sa = self.ln_wo_grad(x.float()).mul(scale1.add(1)).add_(shift1)
+                 if self.checkpointing_sa_only and self.training:
+                     x_sa = checkpoint(self.sa, x_sa, attn_bias_or_two_vector, attn_fn, scale_schedule, rope2d_freqs_grid, use_reentrant=False)
+                 else:
+                     x_sa = self.sa(x_sa, attn_bias_or_two_vector, attn_fn, scale_schedule, rope2d_freqs_grid)
+                 x = x + self.drop_path(x_sa.mul_(gamma1))
+                 x = x + self.ca(self.ca_norm(x), ca_kv).float().mul_(self.ca_gamma)
+                 x = x + self.drop_path(self.ffn(self.ln_wo_grad(x.float()).mul(scale2.add(1)).add_(shift2)).mul(gamma2))  # this mul(gamma2) cannot be in-placed cuz we possibly use FusedMLP
+             else:
+                 x_sa = self.fused_norm_func(C=self.C, eps=self.norm_eps, x=x, scale=scale1, shift=shift1)
+                 if self.checkpointing_sa_only and self.training:
+                     x_sa = checkpoint(self.sa, x_sa, attn_bias_or_two_vector, attn_fn, scale_schedule, rope2d_freqs_grid, use_reentrant=False)
+                 else:
+                     x_sa = self.sa(x_sa, attn_bias_or_two_vector, attn_fn, scale_schedule, rope2d_freqs_grid, scale_ind=scale_ind)
+                 x = x + self.drop_path(x_sa.mul_(gamma1))
+                 x = x + self.ca(self.ca_norm(x), ca_kv).float().mul_(self.ca_gamma)
+                 x = x + self.drop_path(self.ffn(self.fused_norm_func(C=self.C, eps=self.norm_eps, x=x, scale=scale2, shift=shift2)).mul(gamma2))  # this mul(gamma2) cannot be in-placed cuz we possibly use FusedMLP
+             return x
+
+     def extra_repr(self) -> str:
+         return f'shared_aln={self.shared_aln}, fused_norm={self.fused_norm_func is not None}, ca_gamma={"<learnable>" if isinstance(self.ca_gamma, nn.Parameter) else self.ca_gamma}'
+
+
+ class AdaLNBeforeHead(nn.Module):
+     def __init__(self, C, D, act: bool, norm_layer: partial, fused_norm_func=None):  # C: embed_dim, D: cond_dim
+         super().__init__()
+         self.C, self.D = C, D
+         self.ln_wo_grad = norm_layer(C, elementwise_affine=False)
+         self.fused_norm_func = fused_norm_func
+         self.norm_eps = norm_layer.keywords.get('eps', 1e-6)
+         lin = nn.Linear(D, 2*C)
+         self.ada_lin = nn.Sequential(nn.SiLU(inplace=False), lin) if act else nn.Sequential(lin)
+
+     def forward(self, x_BLC: torch.Tensor, cond_BD: Optional[torch.Tensor]):
+         scale, shift = self.ada_lin(cond_BD).view(-1, 1, 2, self.C).unbind(2)
+         if self.fused_norm_func is None:
+             return self.ln_wo_grad(x_BLC).mul(scale.add(1)).add_(shift)
+         else:
+             return self.fused_norm_func(C=self.C, eps=self.norm_eps, x=x_BLC, scale=scale, shift=shift)
+
+
+ def main():
+     dev = 'cpu'  # 'cuda' if torch.cuda.is_available() else 'cpu'
+     rng = torch.Generator(device=dev)
+     # for Li in ([1, 3, 5], [1, 3]):
+     rng.manual_seed(0)
+     B, H, cq, ckv = 4, 8, 64, 96
+     Cq = H*cq
+     Ckv = H*ckv
+
+     Li = [5, 4, 7, 6]
+     Lq = 10
+     L = max(Li)
+     attn_bias = torch.zeros(B, 1, Lq, L, device=dev)
+     for i, x in enumerate(Li):
+         attn_bias[i, 0, :, x:] = -torch.inf
+
+     q = torch.randn(B, Lq, H, cq, generator=rng, device=dev)
+     k = torch.randn(B, L, H, ckv, generator=rng, device=dev)
+     v = torch.randn(B, L, H, ckv, generator=rng, device=dev)
+     tq, tk, tv = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)  # BHLc
+
+     seqlen_k = torch.tensor(Li, dtype=torch.int32, device=dev)
+     cu_seqlens_k = F.pad(torch.cumsum(seqlen_k, dim=0, dtype=torch.int32), (1, 0))
+     kv = torch.stack([k, v], dim=2)
+     kv_compact = torch.cat([kv[i, :Li[i]] for i in range(B)], dim=0)
+
+     ca = CrossAttention(for_attn_pool=False, embed_dim=Cq, kv_dim=Ckv, num_heads=H)
+     ca(q, (kv_compact, cu_seqlens_k, max(Li))).mean().backward()
+
+
+ if __name__ == '__main__':
+     main()
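
For reference, `apply_rotary_emb` above rotates each (even, odd) feature pair of q and k by a precomputed angle from the 2-D RoPE grid. A self-contained sketch of just that rotation, with illustrative shapes (the cached `rope2d_freqs_grid` lookup is assumed already done):

    import torch

    def rotate_pairs(x, cos, sin):
        # x: (..., head_dim); cos/sin broadcast against (..., head_dim // 2)
        x = x.reshape(*x.shape[:-1], -1, 2)      # split features into (even, odd) pairs
        out = torch.stack([
            cos * x[..., 0] - sin * x[..., 1],   # standard 2-D rotation of each pair
            sin * x[..., 0] + cos * x[..., 1],
        ], dim=-1)
        return out.reshape(*out.shape[:-2], -1)

    q = torch.randn(1, 2, 4, 8)                  # (batch, heads, seq_len, head_dim)
    theta = torch.rand(1, 1, 4, 4)               # one angle per position and feature pair
    q_rot = rotate_pairs(q, torch.cos(theta), torch.sin(theta))
    assert q_rot.shape == q.shape                # rotation preserves shape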
models/bitwise_self_correction.py ADDED
@@ -0,0 +1,97 @@
1
+ import cv2
2
+ import os.path as osp
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+ import numpy as np
7
+
8
+
9
+ def labels2image(vae, all_indices, label_type='int_label', scale_schedule=None):
10
+ summed_codes, recons_imgs = vae.decode_from_indices(all_indices, scale_schedule, label_type)
11
+ recons_img = recons_imgs[0]
12
+ recons_img = (recons_img + 1) / 2
13
+ recons_img = recons_img.permute(1, 2, 0).mul_(255).cpu().numpy().astype(np.uint8)[:,:,::-1]
14
+ return recons_img
15
+
16
+ def features2image(vae, raw_features):
17
+ recons_imgs = vae.decode(raw_features.squeeze(-3))
18
+ recons_img = recons_imgs[0]
19
+ recons_img = (recons_img + 1) / 2
20
+ recons_img = recons_img.permute(1, 2, 0).mul_(255).cpu().numpy().astype(np.uint8)[:,:,::-1]
21
+ return recons_img
22
+
23
+ class BitwiseSelfCorrection(object):
24
+ def __init__(self, vae, args):
25
+ self.noise_apply_layers = args.noise_apply_layers
26
+ self.noise_apply_requant = args.noise_apply_requant
27
+ self.noise_apply_strength = args.noise_apply_strength
28
+ self.apply_spatial_patchify = args.apply_spatial_patchify
29
+ self.vae = vae
30
+ self.debug_bsc = args.debug_bsc
31
+
32
+ def flip_requant(self, vae_scale_schedule, inp_B3HW, raw_features, device):
33
+ with torch.amp.autocast('cuda', enabled = False):
34
+ B = raw_features.shape[0]
35
+ if raw_features.dim() == 4:
36
+ codes_out = raw_features.unsqueeze(2)
37
+ else:
38
+ codes_out = raw_features
39
+ cum_var_input = 0
40
+ gt_all_bit_indices = []
41
+ pred_all_bit_indices = []
42
+ x_BLC_wo_prefix = []
43
+ for si, (pt, ph, pw) in enumerate(vae_scale_schedule):
44
+ residual = codes_out - cum_var_input
45
+ if si != len(vae_scale_schedule)-1:
46
+ residual = F.interpolate(residual, size=vae_scale_schedule[si], mode=self.vae.quantizer.z_interplote_down).contiguous()
47
+ quantized, _, bit_indices, loss = self.vae.quantizer.lfq(residual) # quantized shape: [B, d_vae, 1, h, w], bit_indices shape: [B,1,h,w,d_vae]
48
+ gt_all_bit_indices.append(bit_indices)
49
+ if si < self.noise_apply_layers:
50
+ noise_apply_strength = np.random.randint(0, 100 * self.noise_apply_strength+1) * 0.01
51
+ mask = torch.rand(*bit_indices.shape).to(device) < noise_apply_strength
52
+ pred_bit_indices = bit_indices.clone()
53
+ pred_bit_indices[mask] = 1 - pred_bit_indices[mask]
54
+ pred_all_bit_indices.append(pred_bit_indices)
55
+ if self.noise_apply_requant:
56
+ quantized = self.vae.quantizer.lfq.indices_to_codes(pred_bit_indices, label_type = 'bit_label')
57
+ else:
58
+ pred_all_bit_indices.append(bit_indices)
59
+ cum_var_input = cum_var_input + F.interpolate(quantized, size=vae_scale_schedule[-1], mode=self.vae.quantizer.z_interplote_up).contiguous()
60
+ if si < len(vae_scale_schedule)-1:
61
+ this_scale_input = F.interpolate(cum_var_input, size=vae_scale_schedule[si+1], mode=self.vae.quantizer.z_interplote_up).contiguous()
62
+ if self.apply_spatial_patchify:
63
+ # (B,d,1,H,W) -> (B,d,H,W) -> (B,4d,H/2,W/2)
64
+ this_scale_input = torch.nn.functional.pixel_unshuffle(this_scale_input.squeeze(-3), 2)
65
+ x_BLC_wo_prefix.append(this_scale_input.reshape(*this_scale_input.shape[:2], -1).permute(0,2,1)) # (B,H/2*W/2,4C) or (B,H*W,C)
66
+
67
+ if self.apply_spatial_patchify:
68
+ gt_ms_idx_Bl = []
69
+ for item in gt_all_bit_indices:
70
+ # item shape: (B,1,H,W,d)
71
+ item = item.squeeze(1).permute(0,3,1,2) # (B,d,H,W)
72
+ # (B,d,H,W) -> (B,4d,H/2,W/2)
73
+ item = torch.nn.functional.pixel_unshuffle(item, 2)
74
+ # (B,4d,H/2,W/2) -> (B,H/2,W/2,4d) -> (B,H/2*W/2,4d)
75
+ item = item.permute(0,2,3,1).reshape(B, -1, 4*self.vae.codebook_dim)
76
+ gt_ms_idx_Bl.append(item)
77
+ else:
78
+ gt_ms_idx_Bl = [item.reshape(B, -1, self.vae.codebook_dim) for item in gt_all_bit_indices]
79
+ x_BLC_wo_prefix = torch.cat(x_BLC_wo_prefix, 1)
80
+
81
+ if self.debug_bsc:
82
+ self.visualize(vae_scale_schedule, inp_B3HW, gt_all_bit_indices, pred_all_bit_indices)
83
+
84
+ return x_BLC_wo_prefix, gt_ms_idx_Bl
85
+
86
+ def visualize(self, vae_scale_schedule, inp_B3HW, gt_all_bit_indices, pred_all_bit_indices):
87
+ gt_img = (inp_B3HW.squeeze(-3) + 1) / 2 * 255
88
+ gt_img = gt_img[0].permute(1,2,0).cpu().numpy().astype(np.uint8)[:,:,::-1]
89
+ recons_img_2 = labels2image(self.vae, gt_all_bit_indices, label_type='bit_label', scale_schedule=vae_scale_schedule)
90
+ recons_img_3 = labels2image(self.vae, pred_all_bit_indices, label_type='bit_label', scale_schedule=vae_scale_schedule)
91
+ cat_image = np.concatenate([gt_img, recons_img_2, recons_img_3], axis=1)
92
+ save_path = osp.abspath('non_teacher_force.jpg')
93
+ cv2.imwrite(save_path, cat_image)
94
+ print(f'Save to {save_path}')
95
+ import pdb; pdb.set_trace()
96
+ print(cat_image.shape)
97
+
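The heart of `flip_requant` is the self-correction corruption: at the first `noise_apply_layers` scales, ground-truth bits are flipped with a randomly drawn strength, and with `noise_apply_requant` the flipped bits are requantized so the running input reflects the corruption. A self-contained sketch of just the flipping step (shapes here are made up for illustration):

import torch
import numpy as np

bit_indices = torch.randint(0, 2, (2, 1, 4, 4, 8))  # (B, 1, H, W, d_vae), bits in {0, 1}
noise_apply_strength = np.random.randint(0, 31) * 0.01  # as if args.noise_apply_strength == 0.3
mask = torch.rand(*bit_indices.shape) < noise_apply_strength
pred_bit_indices = bit_indices.clone()
pred_bit_indices[mask] = 1 - pred_bit_indices[mask]  # flip the selected bits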
models/bsq_vae/conv.py ADDED
@@ -0,0 +1,71 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from einops import rearrange
4
+ import torch.nn.functional as F
5
+
6
+
7
+ class Conv(nn.Module):
8
+ def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, cnn_type="2d", causal_offset=0, temporal_down=False):
9
+ super().__init__()
10
+ self.cnn_type = cnn_type
11
+ self.slice_seq_len = 17
12
+
13
+ if cnn_type == "2d":
14
+ self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding)
15
+ if cnn_type == "3d":
16
+ if not temporal_down:
17
+ stride = (1, stride, stride)
18
+ else:
19
+ stride = (stride, stride, stride)
20
+ self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride=stride, padding=0)
21
+ if isinstance(kernel_size, int):
22
+ kernel_size = (kernel_size, kernel_size, kernel_size)
23
+ self.padding = (
24
+ kernel_size[0] - 1 + causal_offset, # Temporal causal padding
25
+ padding, # Height padding
26
+ padding # Width padding
27
+ )
28
+ self.causal_offset = causal_offset
29
+ self.stride = stride
30
+ self.kernel_size = kernel_size
31
+
32
+ def forward(self, x):
33
+ if self.cnn_type == "2d":
34
+ if x.ndim == 5:
35
+ B, C, T, H, W = x.shape
36
+ x = rearrange(x, "B C T H W -> (B T) C H W")
37
+ x = self.conv(x)
38
+ x = rearrange(x, "(B T) C H W -> B C T H W", T=T)
39
+ return x
40
+ else:
41
+ return self.conv(x)
42
+ if self.cnn_type == "3d":
43
+ assert self.stride[0] in (1, 2), "only temporal stride 1 or 2 is supported"
44
+ xs = []
45
+ for i in range(0, x.shape[2], self.slice_seq_len+self.stride[0]-1):
46
+ st = i
47
+ en = min(i+self.slice_seq_len, x.shape[2])
48
+ _x = x[:,:,st:en,:,:]
49
+ if i == 0:
50
+ _x = F.pad(_x, (self.padding[2], self.padding[2], # Width
51
+ self.padding[1], self.padding[1], # Height
52
+ self.padding[0], 0)) # Temporal
53
+ else:
54
+ padding_0 = self.kernel_size[0] - 1
55
+ _x = F.pad(_x, (self.padding[2], self.padding[2], # Width
56
+ self.padding[1], self.padding[1], # Height
57
+ padding_0, 0)) # Temporal
58
+ _x[:,:,:padding_0,
59
+ self.padding[1]:_x.shape[-2]-self.padding[1],
60
+ self.padding[2]:_x.shape[-1]-self.padding[2]] += x[:,:,i-padding_0:i,:,:]
61
+ _x = self.conv(_x)
62
+ xs.append(_x)
63
+ try:
64
+ x = torch.cat(xs, dim=2)
65
+ except torch.cuda.OutOfMemoryError: # GPU concat failed; stage the slices through pinned CPU memory
66
+ device = x.device
67
+ del x
68
+ xs = [_x.cpu().pin_memory() for _x in xs]
69
+ torch.cuda.empty_cache()
70
+ x = torch.cat([_x.cpu() for _x in xs], dim=2).to(device=device)
71
+ return x
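`Conv` makes the 3d path temporally causal by padding only the left of the time axis with `kernel_size[0] - 1` frames, and it processes long clips in slices of `slice_seq_len`, copying the left temporal context in by hand. The padding convention in isolation (a sketch; `F.pad` orders pads from the last dimension backwards, which is the easy part to get wrong):

import torch
import torch.nn.functional as F

x = torch.randn(1, 8, 5, 16, 16)  # (B, C, T, H, W)
kt, pad_hw = 3, 1                 # temporal kernel 3, spatial padding 1
# pad order: (W_left, W_right, H_left, H_right, T_left, T_right)
x = F.pad(x, (pad_hw, pad_hw, pad_hw, pad_hw, kt - 1, 0))  # causal: left-pad T only
print(x.shape)  # torch.Size([1, 8, 7, 18, 18])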
models/bsq_vae/dynamic_resolution.py ADDED
@@ -0,0 +1,32 @@
1
4
+
5
+ vae_stride = 16
6
+ ratio2hws = {
7
+ 1.000: [(1,1),(2,2),(4,4),(6,6),(8,8),(12,12),(16,16),(20,20),(24,24),(32,32),(40,40),(48,48),(64,64)],
8
+ 1.250: [(1,1),(2,2),(3,3),(5,4),(10,8),(15,12),(20,16),(25,20),(30,24),(35,28),(45,36),(55,44),(70,56)],
9
+ 1.333: [(1,1),(2,2),(4,3),(8,6),(12,9),(16,12),(20,15),(24,18),(28,21),(36,27),(48,36),(60,45),(72,54)],
10
+ 1.500: [(1,1),(2,2),(3,2),(6,4),(9,6),(15,10),(21,14),(27,18),(33,22),(39,26),(48,32),(63,42),(78,52)],
11
+ 1.750: [(1,1),(2,2),(3,3),(7,4),(11,6),(14,8),(21,12),(28,16),(35,20),(42,24),(56,32),(70,40),(84,48)],
12
+ 2.000: [(1,1),(2,2),(4,2),(6,3),(10,5),(16,8),(22,11),(30,15),(38,19),(46,23),(60,30),(74,37),(90,45)],
13
+ 2.500: [(1,1),(2,2),(5,2),(10,4),(15,6),(20,8),(25,10),(30,12),(40,16),(50,20),(65,26),(80,32),(100,40)],
14
+ 3.000: [(1,1),(2,2),(6,2),(9,3),(15,5),(21,7),(27,9),(36,12),(45,15),(54,18),(72,24),(90,30),(111,37)],
15
+ }
16
+ full_ratio2hws = {}
17
+ for ratio, hws in ratio2hws.items():
18
+ full_ratio2hws[ratio] = hws
19
+ full_ratio2hws[int(1/ratio*1000)/1000] = [(item[1], item[0]) for item in hws]
20
+
21
+ dynamic_resolution_h_w = {}
22
+ predefined_HW_Scales_dynamic = {}
23
+ for ratio in full_ratio2hws:
24
+ dynamic_resolution_h_w[ratio] = {}
25
+ for ind, leng in enumerate([7, 10, 13]):
26
+ h, w = full_ratio2hws[ratio][leng-1][0], full_ratio2hws[ratio][leng-1][1] # feature map size
27
+ pixel = (h * vae_stride, w * vae_stride) # The original image (H, W)
28
+ dynamic_resolution_h_w[ratio][pixel[1]] = {
29
+ 'pixel': pixel,
30
+ 'scales': full_ratio2hws[ratio][:leng]
31
+ } # W as key
32
+ predefined_HW_Scales_dynamic[(h, w)] = full_ratio2hws[ratio][:leng]
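`dynamic_resolution_h_w` is keyed by aspect ratio and then by image width, giving the pixel size and the scale schedule truncated to 7, 10, or 13 levels. A quick lookup (values follow directly from the tables above):

from models.bsq_vae.dynamic_resolution import dynamic_resolution_h_w

entry = dynamic_resolution_h_w[1.000][1024]  # square images at 1024x1024 (64x64 features, vae_stride 16)
print(entry['pixel'])        # (1024, 1024)
print(entry['scales'][:3])   # [(1, 1), (2, 2), (4, 4)]
print(len(entry['scales']))  # 13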
models/bsq_vae/flux_vqgan.py ADDED
@@ -0,0 +1,557 @@
1
+ import argparse
2
+ import os
3
+ import imageio
4
+ import torch
5
+ import numpy as np
6
+ from einops import rearrange
7
+ from torch import Tensor, nn
8
+ import torch.nn.functional as F
9
+ import torchvision
10
+ from torchvision import transforms
11
+ from safetensors.torch import load_file
12
+ import torch.utils.checkpoint as checkpoint
13
+
14
+ from .conv import Conv
15
+ from .multiscale_bsq import MultiScaleBSQ
16
+
17
+ ptdtype = {None: torch.float32, 'fp32': torch.float32, 'bf16': torch.bfloat16}
18
+
19
+ class Normalize(nn.Module):
20
+ def __init__(self, in_channels, norm_type, norm_axis="spatial"):
21
+ super().__init__()
22
+ self.norm_axis = norm_axis
23
+ assert norm_type in ['group', 'batch', "no"]
24
+ if norm_type == 'group':
25
+ if in_channels % 32 == 0:
26
+ self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
27
+ elif in_channels % 24 == 0:
28
+ self.norm = nn.GroupNorm(num_groups=24, num_channels=in_channels, eps=1e-6, affine=True)
29
+ else:
30
+ raise NotImplementedError
31
+ elif norm_type == 'batch':
32
+ self.norm = nn.SyncBatchNorm(in_channels, track_running_stats=False) # track_running_stats=True triggers an in-place grad RuntimeError here
33
+ elif norm_type == 'no':
34
+ self.norm = nn.Identity()
35
+
36
+ def forward(self, x):
37
+ if self.norm_axis == "spatial":
38
+ if x.ndim == 4:
39
+ x = self.norm(x)
40
+ else:
41
+ B, C, T, H, W = x.shape
42
+ x = rearrange(x, "B C T H W -> (B T) C H W")
43
+ x = self.norm(x)
44
+ x = rearrange(x, "(B T) C H W -> B C T H W", T=T)
45
+ elif self.norm_axis == "spatial-temporal":
46
+ x = self.norm(x)
47
+ else:
48
+ raise NotImplementedError
49
+ return x
50
+
51
+ def swish(x: Tensor) -> Tensor:
52
+ try:
53
+ return x * torch.sigmoid(x)
54
+ except torch.cuda.OutOfMemoryError: # activation does not fit on GPU; compute on CPU and move back
55
+ device = x.device
56
+ x = x.cpu().pin_memory()
57
+ return (x*torch.sigmoid(x)).to(device=device)
58
+
59
+
60
+ class AttnBlock(nn.Module):
61
+ def __init__(self, in_channels, norm_type='group', cnn_param=None):
62
+ super().__init__()
63
+ self.in_channels = in_channels
64
+
65
+ self.norm = Normalize(in_channels, norm_type, norm_axis=cnn_param["cnn_norm_axis"])
66
+
67
+ self.q = Conv(in_channels, in_channels, kernel_size=1)
68
+ self.k = Conv(in_channels, in_channels, kernel_size=1)
69
+ self.v = Conv(in_channels, in_channels, kernel_size=1)
70
+ self.proj_out = Conv(in_channels, in_channels, kernel_size=1)
71
+
72
+ def attention(self, h_: Tensor) -> Tensor:
73
+ B, _, T, _, _ = h_.shape
74
+ h_ = self.norm(h_)
75
+ h_ = rearrange(h_, "B C T H W -> (B T) C H W") # spatial attention only
76
+ q = self.q(h_)
77
+ k = self.k(h_)
78
+ v = self.v(h_)
79
+
80
+ b, c, h, w = q.shape
81
+ q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
82
+ k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
83
+ v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
84
+ h_ = nn.functional.scaled_dot_product_attention(q, k, v)
85
+
86
+ return rearrange(h_, "(b t) 1 (h w) c -> b c t h w", h=h, w=w, c=c, b=B, t=T)
87
+
88
+ def forward(self, x: Tensor) -> Tensor:
89
+ return x + self.proj_out(self.attention(x))
90
+
91
+
92
+ class ResnetBlock(nn.Module):
93
+ def __init__(self, in_channels: int, out_channels: int, norm_type='group', cnn_param=None):
94
+ super().__init__()
95
+ self.in_channels = in_channels
96
+ out_channels = in_channels if out_channels is None else out_channels
97
+ self.out_channels = out_channels
98
+
99
+ self.norm1 = Normalize(in_channels, norm_type, norm_axis=cnn_param["cnn_norm_axis"])
100
+ if cnn_param["res_conv_2d"] in ["half", "full"]:
101
+ self.conv1 = Conv(in_channels, out_channels, kernel_size=3, stride=1, padding=1, cnn_type="2d")
102
+ else:
103
+ self.conv1 = Conv(in_channels, out_channels, kernel_size=3, stride=1, padding=1, cnn_type=cnn_param["cnn_type"])
104
+ self.norm2 = Normalize(out_channels, norm_type, norm_axis=cnn_param["cnn_norm_axis"])
105
+ if cnn_param["res_conv_2d"] in ["full"]:
106
+ self.conv2 = Conv(out_channels, out_channels, kernel_size=3, stride=1, padding=1, cnn_type="2d")
107
+ else:
108
+ self.conv2 = Conv(out_channels, out_channels, kernel_size=3, stride=1, padding=1, cnn_type=cnn_param["cnn_type"])
109
+ if self.in_channels != self.out_channels:
110
+ self.nin_shortcut = Conv(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
111
+
112
+ def forward(self, x):
113
+ h = x
114
+ h = self.norm1(h)
115
+ h = swish(h)
116
+ h = self.conv1(h)
117
+
118
+ h = self.norm2(h)
119
+ h = swish(h)
120
+ h = self.conv2(h)
121
+
122
+ if self.in_channels != self.out_channels:
123
+ x = self.nin_shortcut(x)
124
+
125
+ return x + h
126
+
127
+
128
+ class Downsample(nn.Module):
129
+ def __init__(self, in_channels, cnn_type="2d", spatial_down=False, temporal_down=False):
130
+ super().__init__()
131
+ assert spatial_down
132
+ if cnn_type == "2d":
133
+ self.pad = (0,1,0,1)
134
+ if cnn_type == "3d":
135
+ self.pad = (0,1,0,1,0,0) # add padding to the right for h-axis and w-axis. No padding for t-axis
136
+ # no asymmetric padding in torch conv, must do it ourselves
137
+ self.conv = Conv(in_channels, in_channels, kernel_size=3, stride=2, padding=0, cnn_type=cnn_type, temporal_down=temporal_down)
138
+
139
+ def forward(self, x: Tensor):
140
+ x = nn.functional.pad(x, self.pad, mode="constant", value=0)
141
+ x = self.conv(x)
142
+ return x
143
+
144
+
145
+ class Upsample(nn.Module):
146
+ def __init__(self, in_channels, cnn_type="2d", spatial_up=False, temporal_up=False, use_pxsl=False):
147
+ super().__init__()
148
+ if cnn_type == "2d":
149
+ self.scale_factor = 2
150
+ self.causal_offset = 0
151
+ else:
152
+ assert spatial_up
153
+ if temporal_up:
154
+ self.scale_factor = (2,2,2)
155
+ self.causal_offset = -1
156
+ else:
157
+ self.scale_factor = (1,2,2)
158
+ self.causal_offset = 0
159
+ self.use_pxsl = use_pxsl
160
+ if self.use_pxsl:
161
+ self.conv = Conv(in_channels, in_channels*4, kernel_size=3, stride=1, padding=1, cnn_type=cnn_type, causal_offset=self.causal_offset)
162
+ self.pxsl = nn.PixelShuffle(2)
163
+ else:
164
+ self.conv = Conv(in_channels, in_channels, kernel_size=3, stride=1, padding=1, cnn_type=cnn_type, causal_offset=self.causal_offset)
165
+
166
+ def forward(self, x: Tensor):
167
+ if self.use_pxsl:
168
+ x = self.conv(x)
169
+ x = self.pxsl(x)
170
+ else:
171
+ try:
172
+ x = F.interpolate(x, scale_factor=self.scale_factor, mode="nearest")
173
+ except RuntimeError: # e.g. CUDA OOM on very large tensors; fall back to per-channel interpolation
174
+ # shard across channel
175
+ _xs = []
176
+ for i in range(x.shape[1]):
177
+ _x = F.interpolate(x[:,i:i+1,...], scale_factor=self.scale_factor, mode="nearest")
178
+ _xs.append(_x)
179
+ x = torch.cat(_xs, dim=1)
180
+ x = self.conv(x)
181
+ return x
182
+
183
+
184
+ class Encoder(nn.Module):
185
+ def __init__(
186
+ self,
187
+ ch: int,
188
+ ch_mult: list[int],
189
+ num_res_blocks: int,
190
+ z_channels: int,
191
+ in_channels = 3,
192
+ patch_size=8, temporal_patch_size=4,
193
+ norm_type='group', cnn_param=None,
194
+ use_checkpoint=False,
195
+ use_vae=True,
196
+ ):
197
+ super().__init__()
198
+ self.max_down = np.log2(patch_size)
199
+ self.temporal_max_down = np.log2(temporal_patch_size)
200
+ self.temporal_down_offset = self.max_down - self.temporal_max_down
201
+ self.ch = ch
202
+ self.num_resolutions = len(ch_mult)
203
+ self.num_res_blocks = num_res_blocks
204
+ self.in_channels = in_channels
205
+ self.cnn_param = cnn_param
206
+ self.use_checkpoint = use_checkpoint
207
+ # downsampling
208
+ # self.conv_in = Conv(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
209
+ # cnn_param["cnn_type"] = "2d" for images, cnn_param["cnn_type"] = "3d" for videos
210
+ if cnn_param["conv_in_out_2d"] == "yes": # "yes" for video
211
+ self.conv_in = Conv(in_channels, ch, kernel_size=3, stride=1, padding=1, cnn_type="2d")
212
+ else:
213
+ self.conv_in = Conv(in_channels, ch, kernel_size=3, stride=1, padding=1, cnn_type=cnn_param["cnn_type"])
214
+
215
+ in_ch_mult = (1,) + tuple(ch_mult)
216
+ self.in_ch_mult = in_ch_mult
217
+ self.down = nn.ModuleList()
218
+ block_in = self.ch
219
+ for i_level in range(self.num_resolutions):
220
+ block = nn.ModuleList()
221
+ attn = nn.ModuleList()
222
+ block_in = ch * in_ch_mult[i_level]
223
+ block_out = ch * ch_mult[i_level]
224
+ for _ in range(self.num_res_blocks):
225
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, norm_type=norm_type, cnn_param=cnn_param))
226
+ block_in = block_out
227
+ down = nn.Module()
228
+ down.block = block
229
+ down.attn = attn
230
+ # downsample, stride=1, stride=2, stride=2 for 4x8x8 Video VAE
231
+ spatial_down = True if i_level < self.max_down else False
232
+ temporal_down = True if i_level < self.max_down and i_level >= self.temporal_down_offset else False
233
+ if spatial_down or temporal_down:
234
+ down.downsample = Downsample(block_in, cnn_type=cnn_param["cnn_type"], spatial_down=spatial_down, temporal_down=temporal_down)
235
+ self.down.append(down)
236
+
237
+ # middle
238
+ self.mid = nn.Module()
239
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, norm_type=norm_type, cnn_param=cnn_param)
240
+ if cnn_param["cnn_attention"] == "yes":
241
+ self.mid.attn_1 = AttnBlock(block_in, norm_type, cnn_param=cnn_param)
242
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, norm_type=norm_type, cnn_param=cnn_param)
243
+
244
+ # end
245
+ self.norm_out = Normalize(block_in, norm_type, norm_axis=cnn_param["cnn_norm_axis"])
246
+ if cnn_param["conv_inner_2d"] == "yes":
247
+ self.conv_out = Conv(block_in, (int(use_vae) + 1) * z_channels, kernel_size=3, stride=1, padding=1, cnn_type="2d")
248
+ else:
249
+ self.conv_out = Conv(block_in, (int(use_vae) + 1) * z_channels, kernel_size=3, stride=1, padding=1, cnn_type=cnn_param["cnn_type"])
250
+
251
+ def forward(self, x, return_hidden=False):
252
+ if not self.use_checkpoint:
253
+ return self._forward(x, return_hidden=return_hidden)
254
+ else:
255
+ return checkpoint.checkpoint(self._forward, x, return_hidden, use_reentrant=False)
256
+
257
+ def _forward(self, x: Tensor, return_hidden=False) -> Tensor:
258
+ # downsampling
259
+ h0 = self.conv_in(x)
260
+ hs = [h0]
261
+ for i_level in range(self.num_resolutions):
262
+ for i_block in range(self.num_res_blocks):
263
+ h = self.down[i_level].block[i_block](hs[-1])
264
+ if len(self.down[i_level].attn) > 0:
265
+ h = self.down[i_level].attn[i_block](h)
266
+ hs.append(h)
267
+ if hasattr(self.down[i_level], "downsample"):
268
+ hs.append(self.down[i_level].downsample(hs[-1]))
269
+
270
+ # middle
271
+ h = hs[-1]
272
+ hs_mid = [h]
273
+ h = self.mid.block_1(h)
274
+ if self.cnn_param["cnn_attention"] == "yes":
275
+ h = self.mid.attn_1(h)
276
+ h = self.mid.block_2(h)
277
+ hs_mid.append(h)
278
+ # end
279
+ h = self.norm_out(h)
280
+ h = swish(h)
281
+ h = self.conv_out(h)
282
+ if return_hidden:
283
+ return h, hs, hs_mid
284
+ else:
285
+ return h
286
+
287
+
288
+ class Decoder(nn.Module):
289
+ def __init__(
290
+ self,
291
+ ch: int,
292
+ ch_mult: list[int],
293
+ num_res_blocks: int,
294
+ z_channels: int,
295
+ out_ch = 3,
296
+ patch_size=8, temporal_patch_size=4,
297
+ norm_type="group", cnn_param=None,
298
+ use_checkpoint=False,
299
+ use_freq_dec=False, # use frequency features for decoder
300
+ use_pxsf=False
301
+ ):
302
+ super().__init__()
303
+ self.max_up = np.log2(patch_size)
304
+ self.temporal_max_up = np.log2(temporal_patch_size)
305
+ self.temporal_up_offset = self.max_up - self.temporal_max_up
306
+ self.ch = ch
307
+ self.num_resolutions = len(ch_mult)
308
+ self.num_res_blocks = num_res_blocks
309
+ self.ffactor = 2 ** (self.num_resolutions - 1)
310
+ self.cnn_param = cnn_param
311
+ self.use_checkpoint = use_checkpoint
312
+ self.use_freq_dec = use_freq_dec
313
+ self.use_pxsf = use_pxsf
314
+
315
+ # compute in_ch_mult, block_in and curr_res at lowest res
316
+ block_in = ch * ch_mult[self.num_resolutions - 1]
317
+
318
+ # z to block_in
319
+ if cnn_param["conv_inner_2d"] == "yes":
320
+ self.conv_in = Conv(z_channels, block_in, kernel_size=3, stride=1, padding=1, cnn_type="2d")
321
+ else:
322
+ self.conv_in = Conv(z_channels, block_in, kernel_size=3, stride=1, padding=1, cnn_type=cnn_param["cnn_type"])
323
+
324
+ # middle
325
+ self.mid = nn.Module()
326
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, norm_type=norm_type, cnn_param=cnn_param)
327
+ if cnn_param["cnn_attention"] == "yes":
328
+ self.mid.attn_1 = AttnBlock(block_in, norm_type=norm_type, cnn_param=cnn_param)
329
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, norm_type=norm_type, cnn_param=cnn_param)
330
+
331
+ # upsampling
332
+ self.up = nn.ModuleList()
333
+ for i_level in reversed(range(self.num_resolutions)):
334
+ block = nn.ModuleList()
335
+ attn = nn.ModuleList()
336
+ block_out = ch * ch_mult[i_level]
337
+ for _ in range(self.num_res_blocks + 1):
338
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, norm_type=norm_type, cnn_param=cnn_param))
339
+ block_in = block_out
340
+ up = nn.Module()
341
+ up.block = block
342
+ up.attn = attn
343
+ # upsample, stride=1, stride=2, stride=2 for 4x8x8 Video VAE, offset 1 compared with encoder
344
+ # https://github.com/black-forest-labs/flux/blob/b4f689aaccd40de93429865793e84a734f4a6254/src/flux/modules/autoencoder.py#L228
345
+ spatial_up = True if 1 <= i_level <= self.max_up else False
346
+ temporal_up = True if 1 <= i_level <= self.max_up and i_level >= self.temporal_up_offset+1 else False
347
+ if spatial_up or temporal_up:
348
+ up.upsample = Upsample(block_in, cnn_type=cnn_param["cnn_type"], spatial_up=spatial_up, temporal_up=temporal_up, use_pxsl=self.use_pxsf)
349
+ self.up.insert(0, up) # prepend to get consistent order
350
+
351
+ # end
352
+ self.norm_out = Normalize(block_in, norm_type, norm_axis=cnn_param["cnn_norm_axis"])
353
+ if cnn_param["conv_in_out_2d"] == "yes":
354
+ self.conv_out = Conv(block_in, out_ch, kernel_size=3, stride=1, padding=1, cnn_type="2d")
355
+ else:
356
+ self.conv_out = Conv(block_in, out_ch, kernel_size=3, stride=1, padding=1, cnn_type=cnn_param["cnn_type"])
357
+
358
+ def forward(self, z):
359
+ if not self.use_checkpoint:
360
+ return self._forward(z)
361
+ else:
362
+ return checkpoint.checkpoint(self._forward, z, use_reentrant=False)
363
+
364
+ def _forward(self, z: Tensor) -> Tensor:
365
+ # z to block_in
366
+ h = self.conv_in(z)
367
+
368
+ # middle
369
+ h = self.mid.block_1(h)
370
+ if self.cnn_param["cnn_attention"] == "yes":
371
+ h = self.mid.attn_1(h)
372
+ h = self.mid.block_2(h)
373
+
374
+ # upsampling
375
+ for i_level in reversed(range(self.num_resolutions)):
376
+ for i_block in range(self.num_res_blocks + 1):
377
+ h = self.up[i_level].block[i_block](h)
378
+ if len(self.up[i_level].attn) > 0:
379
+ h = self.up[i_level].attn[i_block](h)
380
+ if hasattr(self.up[i_level], "upsample"):
381
+ h = self.up[i_level].upsample(h)
382
+
383
+ # end
384
+ h = self.norm_out(h)
385
+ h = swish(h)
386
+ h = self.conv_out(h)
387
+ return h
388
+
389
+
390
+ class AutoEncoder(nn.Module):
391
+ def __init__(self, args):
392
+ super().__init__()
393
+ self.args = args
394
+ cnn_param = dict(
395
+ cnn_type=args.cnn_type,
396
+ conv_in_out_2d=args.conv_in_out_2d,
397
+ res_conv_2d=args.res_conv_2d,
398
+ cnn_attention=args.cnn_attention,
399
+ cnn_norm_axis=args.cnn_norm_axis,
400
+ conv_inner_2d=args.conv_inner_2d,
401
+ )
402
+ self.encoder = Encoder(
403
+ ch=args.base_ch,
404
+ ch_mult=args.encoder_ch_mult,
405
+ num_res_blocks=args.num_res_blocks,
406
+ z_channels=args.codebook_dim,
407
+ patch_size=args.patch_size,
408
+ temporal_patch_size=args.temporal_patch_size,
409
+ cnn_param=cnn_param,
410
+ use_checkpoint=args.use_checkpoint,
411
+ use_vae=args.use_vae,
412
+ )
413
+ self.decoder = Decoder(
414
+ ch=args.base_ch,
415
+ ch_mult=args.decoder_ch_mult,
416
+ num_res_blocks=args.num_res_blocks,
417
+ z_channels=args.codebook_dim,
418
+ patch_size=args.patch_size,
419
+ temporal_patch_size=args.temporal_patch_size,
420
+ cnn_param=cnn_param,
421
+ use_checkpoint=args.use_checkpoint,
422
+ use_freq_dec=args.use_freq_dec,
423
+ use_pxsf=args.use_pxsf # pixelshuffle for upsampling
424
+ )
425
+ self.z_drop = nn.Dropout(args.z_drop)
426
+ self.scale_factor = 0.3611
427
+ self.shift_factor = 0.1159
428
+ self.codebook_dim = self.embed_dim = args.codebook_dim
429
+
430
+ self.gan_feat_weight = args.gan_feat_weight
431
+ self.video_perceptual_weight = args.video_perceptual_weight
432
+ self.recon_loss_type = args.recon_loss_type
433
+ self.l1_weight = args.l1_weight
434
+ self.use_vae = args.use_vae
435
+ self.kl_weight = args.kl_weight
436
+ self.lfq_weight = args.lfq_weight
437
+ self.image_gan_weight = args.image_gan_weight # image GAN loss weight
438
+ self.video_gan_weight = args.video_gan_weight # video GAN loss weight
439
+ self.perceptual_weight = args.perceptual_weight
440
+ self.flux_weight = args.flux_weight
441
+ self.cycle_weight = args.cycle_weight
442
+ self.cycle_feat_weight = args.cycle_feat_weight
443
+ self.cycle_gan_weight = args.cycle_gan_weight
444
+
445
+ self.flux_image_encoder = None
446
+
447
+ if not args.use_vae:
448
+ if args.quantizer_type == 'MultiScaleBSQ':
449
+ self.quantizer = MultiScaleBSQ(
450
+ dim = args.codebook_dim, # this is the input feature dimension, defaults to log2(codebook_size) if not defined
451
+ codebook_size = args.codebook_size, # codebook size, must be a power of 2
452
+ entropy_loss_weight = args.entropy_loss_weight, # how much weight to place on entropy loss
453
+ diversity_gamma = args.diversity_gamma, # within entropy loss, how much weight to give to diversity of codes, taken from https://arxiv.org/abs/1911.05894
454
+ preserve_norm=args.preserve_norm, # preserve norm of the input for BSQ
455
+ ln_before_quant=args.ln_before_quant, # use layer norm before quantization
456
+ ln_init_by_sqrt=args.ln_init_by_sqrt, # layer norm init value 1/sqrt(d)
457
+ commitment_loss_weight=args.commitment_loss_weight, # loss weight of commitment loss
458
+ new_quant=args.new_quant,
459
+ use_decay_factor=args.use_decay_factor,
460
+ mask_out=args.mask_out,
461
+ use_stochastic_depth=args.use_stochastic_depth,
462
+ drop_rate=args.drop_rate,
463
+ schedule_mode=args.schedule_mode,
464
+ keep_first_quant=args.keep_first_quant,
465
+ keep_last_quant=args.keep_last_quant,
466
+ remove_residual_detach=args.remove_residual_detach,
467
+ use_out_phi=args.use_out_phi,
468
+ use_out_phi_res=args.use_out_phi_res,
469
+ random_flip = args.random_flip,
470
+ flip_prob = args.flip_prob,
471
+ flip_mode = args.flip_mode,
472
+ max_flip_lvl = args.max_flip_lvl,
473
+ random_flip_1lvl = args.random_flip_1lvl,
474
+ flip_lvl_idx = args.flip_lvl_idx,
475
+ drop_when_test = args.drop_when_test,
476
+ drop_lvl_idx = args.drop_lvl_idx,
477
+ drop_lvl_num = args.drop_lvl_num,
478
+ )
479
+ self.quantize = self.quantizer
480
+ self.vocab_size = args.codebook_size
481
+ else:
482
+ raise NotImplementedError(f"{args.quantizer_type} not supported")
483
+
484
+
485
+ def forward(self, x):
486
+ is_image = x.ndim == 4
487
+ if not is_image:
488
+ B, C, T, H, W = x.shape
489
+ else:
490
+ B, C, H, W = x.shape
491
+ T = 1
492
+ enc_dtype = ptdtype[self.args.encoder_dtype]
493
+
494
+ with torch.amp.autocast("cuda", dtype=enc_dtype):
495
+ h, hs, hs_mid = self.encoder(x, return_hidden=True) # B C H W or B C T H W
496
+ hs = [_h.detach() for _h in hs]
497
+ hs_mid = [_h.detach() for _h in hs_mid]
498
+ h = h.to(dtype=torch.float32)
499
+ # print(z.shape)
500
+ # Multiscale LFQ
501
+ z, all_indices, all_bit_indices, residual_norm_per_scale, all_loss, var_input = self.quantizer(h) # MultiScaleBSQ.forward returns a 6-tuple
502
+ x_recon = self.decoder(z)
503
+ vq_output = {
504
+ "commitment_loss": torch.mean(all_loss) * self.lfq_weight, # here commitment loss is sum of commitment loss and entropy penalty
505
+ "encodings": all_indices,
506
+ }
507
+ return x_recon, vq_output
508
+
509
+ def encode_for_raw_features(self, x, scale_schedule, return_residual_norm_per_scale=False):
510
+ is_image = x.ndim == 4
511
+ if not is_image:
512
+ B, C, T, H, W = x.shape
513
+ else:
514
+ B, C, H, W = x.shape
515
+ T = 1
516
+
517
+ enc_dtype = ptdtype[self.args.encoder_dtype]
518
+ with torch.amp.autocast("cuda", dtype=enc_dtype):
519
+ h, hs, hs_mid = self.encoder(x, return_hidden=True) # B C H W or B C T H W
520
+
521
+ hs = [_h.detach() for _h in hs]
522
+ hs_mid = [_h.detach() for _h in hs_mid]
523
+ h = h.to(dtype=torch.float32)
524
+ return h, hs, hs_mid
525
+
526
+ def encode(self, x, scale_schedule, return_residual_norm_per_scale=False):
527
+ h, hs, hs_mid = self.encode_for_raw_features(x, scale_schedule, return_residual_norm_per_scale)
528
+ # Multiscale LFQ
529
+ z, all_indices, all_bit_indices, residual_norm_per_scale, all_loss, var_input = self.quantizer(h, scale_schedule=scale_schedule, return_residual_norm_per_scale=return_residual_norm_per_scale)
530
+ return h, z, all_indices, all_bit_indices, residual_norm_per_scale, var_input
531
+
532
+ def decode(self, z):
533
+ x_recon = self.decoder(z)
534
+ x_recon = torch.clamp(x_recon, min=-1, max=1)
535
+ return x_recon
536
+
537
+ def decode_from_indices(self, all_indices, scale_schedule, label_type):
538
+ summed_codes = 0
539
+ for idx_Bl in all_indices:
540
+ codes = self.quantizer.lfq.indices_to_codes(idx_Bl, label_type)
541
+ summed_codes += F.interpolate(codes, size=scale_schedule[-1], mode=self.quantizer.z_interplote_up)
542
+ assert summed_codes.shape[-3] == 1
543
+ x_recon = self.decoder(summed_codes.squeeze(-3))
544
+ x_recon = torch.clamp(x_recon, min=-1, max=1)
545
+ return summed_codes, x_recon
546
+
547
+ @staticmethod
548
+ def add_model_specific_args(parent_parser):
549
+ parser = argparse.ArgumentParser(parents=[parent_parser], add_help=False)
550
+ parser.add_argument("--flux_weight", type=float, default=0)
551
+ parser.add_argument("--cycle_weight", type=float, default=0)
552
+ parser.add_argument("--cycle_feat_weight", type=float, default=0)
553
+ parser.add_argument("--cycle_gan_weight", type=float, default=0)
554
+ parser.add_argument("--cycle_loop", type=int, default=0)
555
+ parser.add_argument("--z_drop", type=float, default=0.)
556
+ return parser
557
+
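`decode_from_indices` de-quantizes each scale and accumulates the upsampled codes at the finest resolution before a single decoder pass, mirroring the residual accumulation inside the quantizer. A toy sketch of that accumulation (random tensors stand in for `lfq.indices_to_codes` output; `'trilinear'` matches `z_interplote_up`):

import torch
import torch.nn.functional as F

scale_schedule = [(1, 1, 1), (1, 2, 2), (1, 4, 4)]
codes_per_scale = [torch.randn(1, 16, *s) for s in scale_schedule]  # (B, d_vae, t, h, w)
summed = sum(F.interpolate(c, size=scale_schedule[-1], mode='trilinear') for c in codes_per_scale)
print(summed.shape)  # torch.Size([1, 16, 1, 4, 4])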
models/bsq_vae/multiscale_bsq.py ADDED
@@ -0,0 +1,718 @@
1
+ """
2
+ Binary Spherical Quantization
3
+ Proposed in https://arxiv.org/abs/2406.07548
4
+
5
+ In the simplest setup, each dimension is quantized into {-1, 1}.
6
+ An entropy penalty is used to encourage utilization.
7
+ """
8
+
9
+ import random
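+ # Concretely: each latent channel is mapped to {-1, 1} with sign() and, in quantize_new,
+ # rescaled by 1/sqrt(d) onto the unit sphere; e.g. with d = 4,
+ # z = [0.3, -1.2, 0.7, -0.1] -> zhat = [0.5, -0.5, 0.5, -0.5].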
10
+ from math import log2, ceil
11
+ from functools import partial, cache
12
+ from collections import namedtuple
13
+ from contextlib import nullcontext
14
+
15
+ import torch.distributed as dist
16
+ from torch.distributed import nn as dist_nn
17
+
18
+ import torch
19
+ from torch import nn, einsum
20
+ import torch.nn.functional as F
21
+ from torch.nn import Module
22
+ from torch.amp import autocast
23
+ import numpy as np
24
+
25
+ from einops import rearrange, reduce, pack, unpack
26
+
27
+ # from einx import get_at
28
+
29
+ from .dynamic_resolution import predefined_HW_Scales_dynamic
30
+
31
+ # constants
32
+
33
+ Return = namedtuple('Return', ['quantized', 'indices', 'bit_indices', 'entropy_aux_loss'])
34
+
35
+ LossBreakdown = namedtuple('LossBreakdown', ['per_sample_entropy', 'batch_entropy', 'commitment'])
36
+
37
+ # distributed helpers
38
+
39
+ @cache
40
+ def is_distributed():
41
+ return dist.is_initialized() and dist.get_world_size() > 1
42
+
43
+ def maybe_distributed_mean(t):
44
+ if not is_distributed():
45
+ return t
46
+
47
+ dist_nn.all_reduce(t)
48
+ t = t / dist.get_world_size()
49
+ return t
50
+
51
+ # helper functions
52
+
53
+ def exists(v):
54
+ return v is not None
55
+
56
+ def identity(t):
57
+ return t
58
+
59
+ def default(*args):
60
+ for arg in args:
61
+ if exists(arg):
62
+ return arg() if callable(arg) else arg
63
+ return None
64
+
65
+ def round_up_multiple(num, mult):
66
+ return ceil(num / mult) * mult
67
+
68
+ def pack_one(t, pattern):
69
+ return pack([t], pattern)
70
+
71
+ def unpack_one(t, ps, pattern):
72
+ return unpack(t, ps, pattern)[0]
73
+
74
+ def l2norm(t):
75
+ return F.normalize(t, dim = -1)
76
+
77
+ # entropy
78
+
79
+ def log(t, eps = 1e-5):
80
+ return t.clamp(min = eps).log()
81
+
82
+ def entropy(prob):
83
+ return (-prob * log(prob)).sum(dim=-1)
84
+
85
+ # cosine sim linear
86
+
87
+ class CosineSimLinear(Module):
88
+ def __init__(
89
+ self,
90
+ dim_in,
91
+ dim_out,
92
+ scale = 1.
93
+ ):
94
+ super().__init__()
95
+ self.scale = scale
96
+ self.weight = nn.Parameter(torch.randn(dim_in, dim_out))
97
+
98
+ def forward(self, x):
99
+ x = F.normalize(x, dim = -1)
100
+ w = F.normalize(self.weight, dim = 0)
101
+ return (x @ w) * self.scale
102
+
103
+
104
+ def get_latent2scale_schedule(T: int, H: int, W: int, mode="original"):
105
+ assert mode in ["original", "dynamic", "dense", "same1", "same2", "same3"]
106
+ predefined_HW_Scales = {
107
+ # 256 * 256
108
+ (32, 32): [(1, 1), (2, 2), (3, 3), (4, 4), (6, 6), (9, 9), (13, 13), (18, 18), (24, 24), (32, 32)],
109
+ (16, 16): [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (8, 8), (10, 10), (13, 13), (16, 16)],
110
+ # 1024x1024
111
+ (64, 64): [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (7, 7), (9, 9), (12, 12), (16, 16), (21, 21), (27, 27), (36, 36), (48, 48), (64, 64)],
112
+
113
+ (36, 64): [(1, 1), (2, 2), (3, 3), (4, 4), (6, 6), (9, 12), (13, 16), (18, 24), (24, 32), (32, 48), (36, 64)],
114
+ }
115
+ if mode == "dynamic":
116
+ predefined_HW_Scales.update(predefined_HW_Scales_dynamic)
117
+ elif mode == "dense":
118
+ predefined_HW_Scales[(16, 16)] = [(x, x) for x in range(1, 16+1)]
119
+ predefined_HW_Scales[(32, 32)] = predefined_HW_Scales[(16, 16)] + [(20, 20), (24, 24), (28, 28), (32, 32)]
120
+ predefined_HW_Scales[(64, 64)] = predefined_HW_Scales[(32, 32)] + [(40, 40), (48, 48), (56, 56), (64, 64)]
121
+ elif mode.startswith("same"):
122
+ num_quant = int(mode[len("same"):])
123
+ predefined_HW_Scales[(16, 16)] = [(16, 16) for _ in range(num_quant)]
124
+ predefined_HW_Scales[(32, 32)] = [(32, 32) for _ in range(num_quant)]
125
+ predefined_HW_Scales[(64, 64)] = [(64, 64) for _ in range(num_quant)]
126
+
127
+ predefined_T_Scales = [1, 2, 3, 4, 5, 6, 7, 9, 11, 13, 15, 17, 17, 17, 17, 17]
128
+ patch_THW_shape_per_scale = predefined_HW_Scales[(H, W)]
129
+ if len(predefined_T_Scales) < len(patch_THW_shape_per_scale):
130
+ # print("warning: the length of predefined_T_Scales is less than the length of patch_THW_shape_per_scale!")
131
+ predefined_T_Scales += [predefined_T_Scales[-1]] * (len(patch_THW_shape_per_scale) - len(predefined_T_Scales))
132
+ patch_THW_shape_per_scale = [(min(T, t), h, w ) for (h, w), t in zip(patch_THW_shape_per_scale, predefined_T_Scales[:len(patch_THW_shape_per_scale)])]
133
+ return patch_THW_shape_per_scale
134
+
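+ # Example: get_latent2scale_schedule(T=1, H=16, W=16) returns [(1, 1, 1), (1, 2, 2), (1, 3, 3),
+ # (1, 4, 4), (1, 5, 5), (1, 6, 6), (1, 8, 8), (1, 10, 10), (1, 13, 13), (1, 16, 16)]:
+ # min(T, t) clamps every temporal size to 1 for single images.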
135
+ class LayerNorm(nn.Module):
136
+ r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
137
+ The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
138
+ shape (batch_size, height, width, channels) while channels_first corresponds to inputs
139
+ with shape (batch_size, channels, height, width).
140
+ normalized_shape: int
141
+ """
142
+ def __init__(self, normalized_shape, norm_weight=False, eps=1e-6, data_format="channels_first"):
143
+ super().__init__()
144
+ if norm_weight:
145
+ self.weight = nn.Parameter(torch.ones(normalized_shape)/(normalized_shape**0.5))
146
+ else:
147
+ self.weight = nn.Parameter(torch.ones(normalized_shape))
148
+ self.bias = nn.Parameter(torch.zeros(normalized_shape))
149
+ self.eps = eps
150
+ self.data_format = data_format
151
+ if self.data_format not in ["channels_last", "channels_first"]:
152
+ raise NotImplementedError
153
+ self.normalized_shape = (normalized_shape, )
154
+
155
+ def forward(self, x):
156
+ if self.data_format == "channels_last":
157
+ return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
158
+ elif self.data_format == "channels_first":
159
+ u = x.mean(1, keepdim=True)
160
+ s = (x - u).pow(2).mean(1, keepdim=True)
161
+ x = (x - u) / torch.sqrt(s + self.eps)
162
+ if x.ndim == 4: # (b, c, h, w)
163
+ x = self.weight[:, None, None] * x + self.bias[:, None, None]
164
+ elif x.ndim == 5: # (b, c, t, h, w)
165
+ x = self.weight[:, None, None, None] * x + self.bias[:, None, None, None]
166
+ else:
167
+ raise ValueError("the number of dimensions of the input should be 4 or 5")
168
+ return x
169
+
170
+ class MultiScaleBSQ(Module):
171
+ """ Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf """
172
+
173
+ def __init__(
174
+ self,
175
+ *,
176
+ dim,
177
+ codebook_size,
178
+ soft_clamp_input_value = None,
179
+ aux_loss = False, # intermediate auxiliary loss
180
+ ln_before_quant=False, # add a LN before multi-scale RQ
181
+ ln_init_by_sqrt=False, # weight init by 1/sqrt(d)
182
+ use_decay_factor=False,
183
+ use_stochastic_depth=False,
184
+ drop_rate=0.,
185
+ schedule_mode="original", # ["original", "dynamic", "dense"]
186
+ keep_first_quant=False,
187
+ keep_last_quant=False,
188
+ remove_residual_detach=False,
189
+ random_flip = False,
190
+ flip_prob = 0.5,
191
+ flip_mode = "stochastic", # "stochastic", "deterministic"
192
+ max_flip_lvl = 1,
193
+ random_flip_1lvl = False, # random flip one level each time
194
+ flip_lvl_idx = None,
195
+ drop_when_test=False,
196
+ drop_lvl_idx=None,
197
+ drop_lvl_num=0,
198
+ **kwargs
199
+ ):
200
+ super().__init__()
201
+ codebook_dim = int(log2(codebook_size))
202
+
203
+ requires_projection = codebook_dim != dim
204
+ self.project_in = nn.Linear(dim, codebook_dim) if requires_projection else nn.Identity()
205
+ self.project_out = nn.Linear(codebook_dim, dim) if requires_projection else nn.Identity()
206
+ self.has_projections = requires_projection
207
+ self.layernorm = LayerNorm(codebook_dim, norm_weight=ln_init_by_sqrt) if ln_before_quant else nn.Identity()
208
+ self.use_stochastic_depth = use_stochastic_depth
209
+ self.drop_rate = drop_rate
210
+ self.remove_residual_detach = remove_residual_detach
211
+ self.random_flip = random_flip
212
+ self.flip_prob = flip_prob
213
+ self.flip_mode = flip_mode
214
+ self.max_flip_lvl = max_flip_lvl
215
+ self.random_flip_1lvl = random_flip_1lvl
216
+ self.flip_lvl_idx = flip_lvl_idx
217
+ assert not (random_flip and random_flip_1lvl)
218
+ self.drop_when_test = drop_when_test
219
+ self.drop_lvl_idx = drop_lvl_idx
220
+ self.drop_lvl_num = drop_lvl_num
221
+ if self.drop_when_test:
222
+ assert drop_lvl_idx is not None
223
+ assert drop_lvl_num > 0
224
+
225
+ self.lfq = BSQ(
226
+ dim = codebook_dim,
227
+ codebook_scale = 1/np.sqrt(codebook_dim),
228
+ soft_clamp_input_value = soft_clamp_input_value,
229
+ # experimental_softplus_entropy_loss=True,
230
+ # entropy_loss_offset=2,
231
+ **kwargs
232
+ )
233
+
234
+ self.z_interplote_up = 'trilinear'
235
+ self.z_interplote_down = 'area'
236
+
237
+ self.use_decay_factor = use_decay_factor
238
+ self.schedule_mode = schedule_mode
239
+ self.keep_first_quant = keep_first_quant
240
+ self.keep_last_quant = keep_last_quant
241
+ if self.use_stochastic_depth and self.drop_rate > 0:
242
+ assert self.keep_first_quant or self.keep_last_quant
243
+
244
+ @property
245
+ def codebooks(self):
246
+ return self.lfq.codebook
247
+
248
+ def get_codes_from_indices(self, indices_list):
249
+ all_codes = []
250
+ for indices in indices_list:
251
+ codes = self.lfq.indices_to_codes(indices)
252
+ all_codes.append(codes)
253
+ _, _, T, H, W = all_codes[-1].size()
254
+ summed_codes = 0
255
+ for code in all_codes:
256
+ summed_codes += F.interpolate(code, size=(T, H, W), mode=self.z_interplote_up)
257
+ return summed_codes
258
+
259
+ def get_output_from_indices(self, indices):
260
+ codes = self.get_codes_from_indices(indices)
261
+ codes_summed = reduce(codes, 'q ... -> ...', 'sum')
262
+ return self.project_out(codes_summed)
263
+
264
+ def flip_quant(self, x):
265
+ assert self.flip_mode == 'stochastic'
266
+ flip_mask = torch.rand_like(x) < self.flip_prob
267
+ x = x.clone()
268
+ x[flip_mask] = -x[flip_mask]
269
+ return x
270
+
271
+ def forward(
272
+ self,
273
+ x,
274
+ scale_schedule=None,
275
+ mask = None,
276
+ return_all_codes = False,
277
+ return_residual_norm_per_scale = False
278
+ ):
279
+ if x.ndim == 4:
280
+ x = x.unsqueeze(2)
281
+ B, C, T, H, W = x.size()
282
+
283
+ if scale_schedule is None:
284
+ if self.schedule_mode.startswith("same"):
285
+ scale_num = int(self.schedule_mode[len("same"):])
286
+ assert T == 1
287
+ scale_schedule = [(1, H, W)] * scale_num
288
+ else:
289
+ scale_schedule = get_latent2scale_schedule(T, H, W, mode=self.schedule_mode)
290
+ scale_num = len(scale_schedule)
291
+
292
+ # x = self.project_in(x)
293
+ x = x.permute(0, 2, 3, 4, 1).contiguous() # (b, c, t, h, w) => (b, t, h, w, c)
294
+ x = self.project_in(x)
295
+ x = x.permute(0, 4, 1, 2, 3).contiguous() # (b, t, h, w, c) => (b, c, t, h, w)
296
+ x = self.layernorm(x)
297
+
298
+ quantized_out = 0.
299
+ residual = x
300
+
301
+ all_losses = []
302
+ all_indices = []
303
+ all_bit_indices = []
304
+ var_inputs = []
305
+ residual_norm_per_scale = []
306
+
307
+ # go through the layers
308
+ out_fact = init_out_fact = 1.0
309
+ # residual_list = []
310
+ # interpolate_residual_list = []
311
+ # quantized_list = []
312
+ if self.drop_when_test:
313
+ drop_lvl_start = self.drop_lvl_idx
314
+ drop_lvl_end = self.drop_lvl_idx + self.drop_lvl_num
315
+ scale_num = len(scale_schedule)
316
+ with autocast('cuda', enabled = False):
317
+ for si, (pt, ph, pw) in enumerate(scale_schedule):
318
+ out_fact = max(0.1, out_fact) if self.use_decay_factor else init_out_fact
319
+ if (pt, ph, pw) != (T, H, W):
320
+ interpolate_residual = F.interpolate(residual, size=(pt, ph, pw), mode=self.z_interplote_down)
321
+ else:
322
+ interpolate_residual = residual
323
+ if return_residual_norm_per_scale:
324
+ residual_norm_per_scale.append((torch.abs(interpolate_residual) < 0.05 * self.lfq.codebook_scale).sum() / interpolate_residual.numel())
325
+ # residual_list.append(torch.norm(residual.detach(), dim=1).mean())
326
+ # interpolate_residual_list.append(torch.norm(interpolate_residual.detach(), dim=1).mean())
327
+ if self.training and self.use_stochastic_depth and random.random() < self.drop_rate:
328
+ if (si == 0 and self.keep_first_quant) or (si == scale_num - 1 and self.keep_last_quant):
329
+ quantized, indices, bit_indices, loss = self.lfq(interpolate_residual) # also capture bit_indices for the shared bookkeeping below
+ quantized = quantized * out_fact
+ all_indices.append(indices)
333
+ else:
334
+ quantized = torch.zeros_like(interpolate_residual)
335
+ elif self.drop_when_test and drop_lvl_start <= si < drop_lvl_end:
336
+ continue
337
+ else:
338
+ # residual_norm = torch.norm(interpolate_residual.detach(), dim=1) # (b, t, h, w)
339
+ # print(si, residual_norm.min(), residual_norm.max(), residual_norm.mean())
340
+ quantized, indices, bit_indices, loss = self.lfq(interpolate_residual)
341
+ if self.random_flip and si < self.max_flip_lvl:
342
+ quantized = self.flip_quant(quantized)
343
+ if self.random_flip_1lvl and si == self.flip_lvl_idx:
344
+ quantized = self.flip_quant(quantized)
345
+ quantized = quantized * out_fact
346
+ all_indices.append(indices)
347
+ # quantized_list.append(torch.norm(quantized.detach(), dim=1).mean())
348
+ if (pt, ph, pw) != (T, H, W):
349
+ quantized = F.interpolate(quantized, size=(T, H, W), mode=self.z_interplote_up).contiguous()
350
+
351
+ if self.remove_residual_detach:
352
+ residual = residual - quantized
353
+ else:
354
+ residual = residual - quantized.detach()
355
+ quantized_out = quantized_out + quantized
356
+
357
+ all_bit_indices.append(bit_indices)
358
+ all_losses.append(loss)
359
+ if si != scale_num - 1:
360
+ var_inputs.append(F.interpolate(quantized_out, size=scale_schedule[si+1], mode=self.z_interplote_down).contiguous())
361
+
362
+ if self.use_decay_factor:
363
+ out_fact -= 0.1
364
+ # print("residual_list:", residual_list)
365
+ # print("interpolate_residual_list:", interpolate_residual_list)
366
+ # print("quantized_list:", quantized_list)
367
+ # import ipdb; ipdb.set_trace()
368
+ # project out, if needed
369
+ quantized_out = quantized_out.permute(0, 2, 3, 4, 1).contiguous() # (b, c, t, h, w) => (b, t, h, w, c)
370
+ quantized_out = self.project_out(quantized_out)
371
+ quantized_out = quantized_out.permute(0, 4, 1, 2, 3).contiguous() # (b, t, h, w, c) => (b, c, t, h, w)
372
+
373
+ # image
374
+ if quantized_out.size(2) == 1:
375
+ quantized_out = quantized_out.squeeze(2)
376
+
377
+ # stack all losses and indices
378
+
379
+ all_losses = torch.stack(all_losses, dim = -1)
380
+
381
+ ret = (quantized_out, all_indices, all_bit_indices, residual_norm_per_scale, all_losses, var_inputs)
382
+
383
+ if not return_all_codes:
384
+ return ret
385
+
386
+ # whether to return all codes from all codebooks across layers
387
+ all_codes = self.get_codes_from_indices(all_indices)
388
+
389
+ # will return all codes in shape (quantizer, batch, sequence length, codebook dimension)
390
+
391
+ return (*ret, all_codes)
392
+
393
+
394
+ class BSQ(Module):
395
+ def __init__(
396
+ self,
397
+ *,
398
+ dim = None,
399
+ codebook_size = None,
400
+ entropy_loss_weight = 0.1,
401
+ commitment_loss_weight = 0.25,
402
+ diversity_gamma = 1.,
403
+ straight_through_activation = nn.Identity(),
404
+ num_codebooks = 1,
405
+ keep_num_codebooks_dim = None,
406
+ codebook_scale = 1., # for residual LFQ, codebook scaled down by 2x at each layer
407
+ frac_per_sample_entropy = 1., # make less than 1. to only use a random fraction of the probs for per sample entropy
408
+ has_projections = None,
409
+ projection_has_bias = True,
410
+ soft_clamp_input_value = None,
411
+ cosine_sim_project_in = False,
412
+ cosine_sim_project_in_scale = None,
413
+ channel_first = None,
414
+ experimental_softplus_entropy_loss = False,
415
+ entropy_loss_offset = 5., # how much to shift the loss before softplus
416
+ spherical = True, # from https://arxiv.org/abs/2406.07548
417
+ force_quantization_f32 = True, # will force the quantization step to be full precision
418
+ inv_temperature = 100.0,
419
+ gamma0=1.0, gamma=1.0, zeta=1.0,
420
+ preserve_norm = False, # whether to preserve the original norm info
421
+ new_quant = False, # new quant function,
422
+ mask_out = False, # mask the output as 0 in some conditions
423
+ use_out_phi = False, # use output phi network
424
+ use_out_phi_res = False, # residual out phi
425
+ ):
426
+ super().__init__()
427
+
428
+ # some assert validations
429
+
430
+ assert exists(dim) or exists(codebook_size), 'either dim or codebook_size must be specified for LFQ'
431
+ assert not exists(codebook_size) or log2(codebook_size).is_integer(), f'your codebook size must be a power of 2 for lookup free quantization (suggested {2 ** ceil(log2(codebook_size))})'
432
+
433
+ codebook_size = default(codebook_size, lambda: 2 ** dim)
434
+ self.codebook_size = codebook_size
435
+
436
+ codebook_dim = int(log2(codebook_size))
437
+ codebook_dims = codebook_dim * num_codebooks
438
+ dim = default(dim, codebook_dims)
439
+ self.codebook_dims = codebook_dims
440
+
441
+ has_projections = default(has_projections, dim != codebook_dims)
442
+
443
+ if cosine_sim_project_in:
444
+ cosine_sim_project_in = default(cosine_sim_project_in_scale, codebook_scale)
445
+ project_in_klass = partial(CosineSimLinear, scale = cosine_sim_project_in)
446
+ else:
447
+ project_in_klass = partial(nn.Linear, bias = projection_has_bias)
448
+
449
+ self.project_in = project_in_klass(dim, codebook_dims) if has_projections else nn.Identity() # nn.Identity()
450
+ self.project_out = nn.Linear(codebook_dims, dim, bias = projection_has_bias) if has_projections else nn.Identity() # nn.Identity()
451
+ self.has_projections = has_projections
452
+
453
+ self.out_phi = nn.Linear(codebook_dims, codebook_dims) if use_out_phi else nn.Identity()
454
+ self.use_out_phi_res = use_out_phi_res
455
+ if self.use_out_phi_res:
456
+ self.out_phi_scale = nn.Parameter(torch.zeros(codebook_dims), requires_grad=True) # init as zero
457
+
458
+ self.dim = dim
459
+ self.codebook_dim = codebook_dim
460
+ self.num_codebooks = num_codebooks
461
+
462
+ keep_num_codebooks_dim = default(keep_num_codebooks_dim, num_codebooks > 1)
463
+ assert not (num_codebooks > 1 and not keep_num_codebooks_dim)
464
+ self.keep_num_codebooks_dim = keep_num_codebooks_dim
465
+
466
+ # channel first
467
+
468
+ self.channel_first = channel_first
469
+
470
+ # straight through activation
471
+
472
+ self.activation = straight_through_activation
473
+
474
+ # For BSQ (binary spherical quantization)
475
+ if not spherical:
476
+ raise ValueError("For BSQ, spherical must be True.")
477
+ self.persample_entropy_compute = 'analytical'
478
+ self.inv_temperature = inv_temperature
479
+ self.gamma0 = gamma0 # loss weight for entropy penalty
480
+ self.gamma = gamma # loss weight for entropy penalty
481
+ self.zeta = zeta # loss weight for entire entropy penalty
482
+ self.preserve_norm = preserve_norm
483
+ self.new_quant = new_quant
484
+ self.mask_out = mask_out
485
+
486
+ # entropy aux loss related weights
487
+
488
+ assert 0 < frac_per_sample_entropy <= 1.
489
+ self.frac_per_sample_entropy = frac_per_sample_entropy
490
+
491
+ self.diversity_gamma = diversity_gamma
492
+ self.entropy_loss_weight = entropy_loss_weight
493
+
494
+ # codebook scale
495
+
496
+ self.codebook_scale = codebook_scale
497
+
498
+ # commitment loss
499
+
500
+ self.commitment_loss_weight = commitment_loss_weight
501
+
502
+ # whether to soft clamp the input value from -value to value
503
+
504
+ self.soft_clamp_input_value = soft_clamp_input_value
505
+ assert not exists(soft_clamp_input_value) or soft_clamp_input_value >= codebook_scale
506
+
507
+ # whether to make the entropy loss positive through a softplus (experimental, please report if this worked or not in discussions)
508
+
509
+ self.entropy_loss_offset = entropy_loss_offset
510
+ self.experimental_softplus_entropy_loss = experimental_softplus_entropy_loss
511
+
512
+ # for no auxiliary loss, during inference
513
+
514
+ self.register_buffer('mask', 2 ** torch.arange(codebook_dim - 1, -1, -1))
515
+ self.register_buffer('zero', torch.tensor(0.), persistent = False)
516
+
517
+ # whether to force quantization step to be f32
518
+
519
+ self.force_quantization_f32 = force_quantization_f32
520
+
521
+ # codes
522
+
523
+ # all_codes = torch.arange(codebook_size)
524
+ # bits = ((all_codes[..., None].int() & self.mask) != 0).float()
525
+ # codebook = self.bits_to_codes(bits)
526
+
527
+ # self.register_buffer('codebook', codebook.float(), persistent = False)
528
+
529
+ def bits_to_codes(self, bits):
530
+ return bits * self.codebook_scale * 2 - self.codebook_scale
531
+
532
+ # @property
533
+ # def dtype(self):
534
+ # return self.codebook.dtype
535
+
536
+ def indices_to_codes(
537
+ self,
538
+ indices,
539
+ label_type = 'int_label',
540
+ project_out = True
541
+ ):
542
+ assert label_type in ['int_label', 'bit_label']
543
+ is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim))
544
+ should_transpose = default(self.channel_first, is_img_or_video)
545
+
546
+ if not self.keep_num_codebooks_dim:
547
+ if label_type == 'int_label':
548
+ indices = rearrange(indices, '... -> ... 1')
549
+ else:
550
+ indices = indices.unsqueeze(-2)
551
+
552
+ # indices to codes, which are bits of either -1 or 1
553
+
554
+ if label_type == 'int_label':
555
+ assert indices[..., None].int().min() >= 0 # int labels are non-negative
556
+ bits = ((indices[..., None].int() & self.mask) != 0).float() # .to(self.dtype)
557
+ else:
558
+ bits = indices
559
+
560
+ codes = self.bits_to_codes(bits)
561
+
562
+ codes = l2norm(codes) # must normalize when using BSQ
563
+
564
+ codes = rearrange(codes, '... c d -> ... (c d)')
565
+
566
+ # whether to project codes out to original dimensions
567
+ # if the input feature dimensions were not log2(codebook size)
568
+
569
+ if project_out:
570
+ codes = self.project_out(codes)
571
+
572
+ # rearrange codes back to original shape
573
+
574
+ if should_transpose:
575
+ codes = rearrange(codes, 'b ... d -> b d ...')
576
+
577
+ return codes
578
+
579
+ def quantize(self, z):
580
+ assert z.shape[-1] == self.codebook_dims, f"Expected {self.codebook_dims} dimensions, got {z.shape[-1]}"
581
+
582
+ zhat = torch.where(z > 0,
583
+ torch.tensor(1, dtype=z.dtype, device=z.device),
584
+ torch.tensor(-1, dtype=z.dtype, device=z.device))
585
+ return z + (zhat - z).detach()
586
+
587
+ def quantize_new(self, z):
588
+ assert z.shape[-1] == self.codebook_dims, f"Expected {self.codebook_dims} dimensions, got {z.shape[-1]}"
589
+
590
+ zhat = torch.where(z > 0,
591
+ torch.tensor(1, dtype=z.dtype, device=z.device),
592
+ torch.tensor(-1, dtype=z.dtype, device=z.device))
593
+
594
+ q_scale = 1. / (self.codebook_dims ** 0.5)
595
+ zhat = q_scale * zhat # on unit sphere
596
+
597
+ return z + (zhat - z).detach()
598
+
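Both quantize() and quantize_new() rely on the straight-through estimator: the forward pass emits hard ±1 codes while gradients flow back to z as if the rounding were the identity. A minimal standalone sketch of the trick (toy shapes, not tied to this class):

import torch

z = torch.randn(4, 16, requires_grad=True)
zhat = torch.where(z > 0, torch.ones_like(z), -torch.ones_like(z))
out = z + (zhat - z).detach()  # forward value equals zhat, but d(out)/dz == 1

out.sum().backward()
assert torch.allclose(out, zhat)                # hard codes in the forward pass
assert torch.equal(z.grad, torch.ones_like(z))  # gradient passes straight through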
599
+ def soft_entropy_loss(self, z):
600
+ if self.persample_entropy_compute == 'analytical':
601
+ # if self.l2_norm:
602
+ p = torch.sigmoid(-4 * z / (self.codebook_dims ** 0.5) * self.inv_temperature)
603
+ # else:
604
+ # p = torch.sigmoid(-4 * z * self.inv_temperature)
605
+ prob = torch.stack([p, 1-p], dim=-1) # (b, h, w, 18, 2)
606
+ per_sample_entropy = self.get_entropy(prob, dim=-1, normalize=False).sum(dim=-1).mean() # (b,h,w,18)->(b,h,w)->scalar
607
+ else: # the non-analytical path is not implemented here; `prob` would be undefined below
+ raise NotImplementedError(f"persample_entropy_compute={self.persample_entropy_compute!r} is not supported")
609
+
610
+ # macro average of the probability of each subgroup
611
+ avg_prob = reduce(prob, '... g d -> g d', 'mean') # (18, 2)
612
+ codebook_entropy = self.get_entropy(avg_prob, dim=-1, normalize=False)
613
+
614
+ # the approximation of the entropy is the sum of the entropy of each subgroup
615
+ return per_sample_entropy, codebook_entropy.sum(), avg_prob
616
+
617
+ def get_entropy(self, count, dim=-1, eps=1e-4, normalize=True):
618
+ if normalize: # normalize raw counts into probabilities
619
+ probs = (count + eps) / (count + eps).sum(dim=dim, keepdim=True)
620
+ else: # the input is already a probability distribution
621
+ probs = count
622
+ H = -(probs * torch.log(probs + 1e-8)).sum(dim=dim)
623
+ return H
624
+
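For intuition about what soft_entropy_loss optimizes: the per-sample entropy term is low when each bit probability saturates toward 0 or 1 (confident assignments), while the codebook entropy of the batch-averaged probabilities is high when every bit fires half the time (all codes get used). A toy check with two confident, complementary samples (numbers are illustrative):

import torch

def binary_entropy(p):
    # entropy of the {p, 1-p} pair, summed over the last dim, as in get_entropy
    probs = torch.stack([p, 1 - p], dim=-1)
    return -(probs * torch.log(probs + 1e-8)).sum(dim=-1)

p = torch.tensor([[0.99, 0.01], [0.01, 0.99]])     # (samples, bit groups)
per_sample = binary_entropy(p).sum(dim=-1).mean()  # ~0.11: each sample is confident
codebook = binary_entropy(p.mean(dim=0)).sum()     # ~1.39 (= 2 ln 2): bits used evenly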
625
+ def forward(
626
+ self,
627
+ x,
628
+ return_loss_breakdown = False,
629
+ mask = None,
630
+ entropy_weight=0.1
631
+ ):
632
+ """
633
+ einstein notation
634
+ b - batch
635
+ n - sequence (or flattened spatial dimensions)
636
+ d - feature dimension, which is also log2(codebook size)
637
+ c - number of codebooks
638
+ """
639
+
640
+ is_img_or_video = x.ndim >= 4
641
+ should_transpose = default(self.channel_first, is_img_or_video)
642
+
643
+ # standardize image or video into (batch, seq, dimension)
644
+
645
+ if should_transpose:
646
+ x = rearrange(x, 'b d ... -> b ... d')
647
+ x, ps = pack_one(x, 'b * d') # x.shape [b, hwt, c]
648
+
649
+ assert x.shape[-1] == self.dim, f'expected dimension of {self.dim} but received {x.shape[-1]}'
650
+
651
+ x = self.project_in(x)
652
+
653
+ # split out number of codebooks
654
+
655
+ x = rearrange(x, 'b n (c d) -> b n c d', c = self.num_codebooks)
656
+
657
+ x = l2norm(x)
658
+
659
+ # whether to force quantization step to be full precision or not
660
+
661
+ force_f32 = self.force_quantization_f32
662
+
663
+ quantization_context = partial(autocast, 'cuda', enabled = False) if force_f32 else nullcontext
664
+
665
+ indices = None
666
+ with quantization_context():
667
+
668
+ if force_f32:
669
+ orig_dtype = x.dtype
670
+ x = x.float()
671
+
672
+ # use straight-through gradients (optionally with custom activation fn) if training
673
+ if self.new_quant:
674
+ quantized = self.quantize_new(x)
+ else:
+ quantized = self.quantize(x) # fall back to the plain quantizer so `quantized` is always defined
675
+
676
+ # calculate indices
677
+ bit_indices = (quantized > 0).int()
678
+ entropy_penalty = persample_entropy = cb_entropy = self.zero
679
+ commit_loss = self.zero
680
+
681
+ # input back to original dtype if needed
682
+
683
+ if force_f32:
684
+ x = x.type(orig_dtype)
685
+
686
+ # merge back codebook dim
687
+ x = quantized # rename quantized to x for output
688
+ x = rearrange(x, 'b n c d -> b n (c d)')
689
+
690
+ # project out to feature dimension if needed
691
+
692
+ x = self.project_out(x)
693
+
694
+ # reconstitute image or video dimensions
695
+
696
+ if should_transpose:
697
+ x = unpack_one(x, ps, 'b * d')
698
+ x = rearrange(x, 'b ... d -> b d ...')
699
+
700
+ bit_indices = unpack_one(bit_indices, ps, 'b * c d')
701
+
702
+ # whether to remove single codebook dim
703
+
704
+ if not self.keep_num_codebooks_dim:
705
+ bit_indices = rearrange(bit_indices, '... 1 d -> ... d')
706
+
707
+ # complete aux loss
708
+
709
+ aux_loss = commit_loss * self.commitment_loss_weight + (self.zeta * entropy_penalty / self.inv_temperature)*entropy_weight
710
+ # returns
711
+
712
+ ret = Return(x, indices, bit_indices, aux_loss)
713
+
714
+ if not return_loss_breakdown:
715
+ return ret
716
+
717
+ return ret, LossBreakdown(persample_entropy, cb_entropy, commit_loss)
718
+
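For orientation, the label-to-code path above reduces to: a d-bit pattern in {0,1} is mapped to {-codebook_scale, +codebook_scale} by bits_to_codes, then normalized onto the unit sphere (the BSQ constraint). A standalone sketch with one 4-bit label, using F.normalize in place of the repo's l2norm helper:

import torch
import torch.nn.functional as F

codebook_scale = 1.0
bits = torch.tensor([[1., 0., 1., 1.]])             # one 4-bit label
codes = bits * codebook_scale * 2 - codebook_scale  # {0,1} -> {-1,+1}
codes = F.normalize(codes, dim=-1)                  # project onto the unit sphere
print(codes)                                        # [[0.5, -0.5, 0.5, 0.5]]
print(codes.norm(dim=-1))                           # [1.]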
models/bsq_vae/vae.py ADDED
@@ -0,0 +1,255 @@
1
+ import argparse
2
+ import torch
3
+
4
+ from infinity.models.bsq_vae.flux_vqgan import AutoEncoder
5
+
6
+ def load_cnn(model, state_dict, prefix, expand=False, use_linear=False):
7
+ delete_keys = []
8
+ loaded_keys = []
9
+ for key in state_dict:
10
+ if key.startswith(prefix):
11
+ _key = key[len(prefix):]
12
+ if _key in model.state_dict():
13
+ # load nn.Conv2d or nn.Linear to nn.Linear
14
+ if use_linear and (".q.weight" in key or ".k.weight" in key or ".v.weight" in key or ".proj_out.weight" in key):
15
+ load_weights = state_dict[key].squeeze()
16
+ elif _key.endswith(".conv.weight") and expand:
17
+ if model.state_dict()[_key].shape == state_dict[key].shape:
18
+ # 2D cnn to 2D cnn
19
+ load_weights = state_dict[key]
20
+ else:
21
+ # 2D cnn to 3D cnn
22
+ _expand_dim = model.state_dict()[_key].shape[2]
23
+ load_weights = state_dict[key].unsqueeze(2).repeat(1, 1, _expand_dim, 1, 1)
24
+ else:
25
+ load_weights = state_dict[key]
26
+ model.state_dict()[_key].copy_(load_weights)
27
+ delete_keys.append(key)
28
+ loaded_keys.append(prefix+_key)
29
+ # load nn.Conv2d to Conv class
30
+ conv_list = ["conv"] if use_linear else ["conv", ".q.", ".k.", ".v.", ".proj_out.", ".nin_shortcut."]
31
+ if any(k in _key for k in conv_list):
32
+ if _key.endswith(".weight"):
33
+ conv_key = _key.replace(".weight", ".conv.weight")
34
+ if conv_key and conv_key in model.state_dict():
35
+ if model.state_dict()[conv_key].shape == state_dict[key].shape:
36
+ # 2D cnn to 2D cnn
37
+ load_weights = state_dict[key]
38
+ else:
39
+ # 2D cnn to 3D cnn
40
+ _expand_dim = model.state_dict()[conv_key].shape[2]
41
+ load_weights = state_dict[key].unsqueeze(2).repeat(1, 1, _expand_dim, 1, 1)
42
+ model.state_dict()[conv_key].copy_(load_weights)
43
+ delete_keys.append(key)
44
+ loaded_keys.append(prefix+conv_key)
45
+ if _key.endswith(".bias"):
46
+ conv_key = _key.replace(".bias", ".conv.bias")
47
+ if conv_key and conv_key in model.state_dict():
48
+ model.state_dict()[conv_key].copy_(state_dict[key])
49
+ delete_keys.append(key)
50
+ loaded_keys.append(prefix+conv_key)
51
+ # load nn.GroupNorm to Normalize class
52
+ if "norm" in _key:
53
+ if _key.endswith(".weight"):
54
+ norm_key = _key.replace(".weight", ".norm.weight")
55
+ if norm_key and norm_key in model.state_dict():
56
+ model.state_dict()[norm_key].copy_(state_dict[key])
57
+ delete_keys.append(key)
58
+ loaded_keys.append(prefix+norm_key)
59
+ if _key.endswith(".bias"):
60
+ norm_key = _key.replace(".bias", ".norm.bias")
61
+ if norm_key and norm_key in model.state_dict():
62
+ model.state_dict()[norm_key].copy_(state_dict[key])
63
+ delete_keys.append(key)
64
+ loaded_keys.append(prefix+norm_key)
65
+
66
+ for key in delete_keys:
67
+ del state_dict[key]
68
+
69
+ return model, state_dict, loaded_keys
70
+
71
+
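The 2D-to-3D branches above inflate an image-model kernel for a video model by repeating it along a new temporal axis. A minimal reproduction of that unsqueeze/repeat step (shapes are illustrative):

import torch

w2d = torch.randn(64, 32, 3, 3)               # (out, in, kh, kw) from a 2D checkpoint
t = 3                                          # temporal size of the target 3D kernel
w3d = w2d.unsqueeze(2).repeat(1, 1, t, 1, 1)   # -> (out, in, t, kh, kw)
assert w3d.shape == (64, 32, 3, 3, 3)

Note that plain repetition, as load_cnn does it, scales the response to temporally constant inputs by t; dividing by t instead would preserve the 2D output magnitude.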
72
+ def vae_model(vqgan_ckpt, schedule_mode, codebook_dim, codebook_size, test_mode=True, patch_size=16, encoder_ch_mult=[1, 2, 4, 4, 4], decoder_ch_mult=[1, 2, 4, 4, 4],):
73
+ args=argparse.Namespace(
74
+ vqgan_ckpt=vqgan_ckpt,
75
+ sd_ckpt=None,
76
+ inference_type='image',
77
+ save='./imagenet_val_bsq',
78
+ save_prediction=True,
79
+ image_recon4video=False,
80
+ junke_old=False,
81
+ device='cuda',
82
+ max_steps=1000000.0,
83
+ log_every=1,
84
+ visu_every=1000,
85
+ ckpt_every=1000,
86
+ default_root_dir='',
87
+ compile='no',
88
+ ema='no',
89
+ lr=0.0001,
90
+ beta1=0.9,
91
+ beta2=0.95,
92
+ warmup_steps=0,
93
+ optim_type='Adam',
94
+ disc_optim_type=None,
95
+ lr_min=0.0,
96
+ warmup_lr_init=0.0,
97
+ max_grad_norm=1.0,
98
+ max_grad_norm_disc=1.0,
99
+ disable_sch=False,
100
+ patch_size=patch_size,
101
+ temporal_patch_size=4,
102
+ embedding_dim=256,
103
+ codebook_dim=codebook_dim,
104
+ num_quantizers=8,
105
+ quantizer_type='MultiScaleBSQ',
106
+ use_vae=False,
107
+ use_freq_enc=False,
108
+ use_freq_dec=False,
109
+ preserve_norm=False,
110
+ ln_before_quant=False,
111
+ ln_init_by_sqrt=False,
112
+ use_pxsf=False,
113
+ new_quant=True,
114
+ use_decay_factor=False,
115
+ mask_out=False,
116
+ use_stochastic_depth=False,
117
+ drop_rate=0.0,
118
+ schedule_mode=schedule_mode,
119
+ lr_drop=None,
120
+ lr_drop_rate=0.1,
121
+ keep_first_quant=False,
122
+ keep_last_quant=False,
123
+ remove_residual_detach=False,
124
+ use_out_phi=False,
125
+ use_out_phi_res=False,
126
+ use_lecam_reg=False,
127
+ lecam_weight=0.05,
128
+ perceptual_model='vgg16',
129
+ base_ch_disc=64,
130
+ random_flip=False,
131
+ flip_prob=0.5,
132
+ flip_mode='stochastic',
133
+ max_flip_lvl=1,
134
+ not_load_optimizer=False,
135
+ use_lecam_reg_zero=False,
136
+ freeze_encoder=False,
137
+ rm_downsample=False,
138
+ random_flip_1lvl=False,
139
+ flip_lvl_idx=0,
140
+ drop_when_test=False,
141
+ drop_lvl_idx=0,
142
+ drop_lvl_num=1,
143
+ disc_version='v1',
144
+ magvit_disc=False,
145
+ sigmoid_in_disc=False,
146
+ activation_in_disc='leaky_relu',
147
+ apply_blur=False,
148
+ apply_noise=False,
149
+ dis_warmup_steps=0,
150
+ dis_lr_multiplier=1.0,
151
+ dis_minlr_multiplier=False,
152
+ disc_channels=64,
153
+ disc_layers=3,
154
+ discriminator_iter_start=0,
155
+ disc_pretrain_iter=0,
156
+ disc_optim_steps=1,
157
+ disc_warmup=0,
158
+ disc_pool='no',
159
+ disc_pool_size=1000,
160
+ advanced_disc=False,
161
+ recon_loss_type='l1',
162
+ video_perceptual_weight=0.0,
163
+ image_gan_weight=1.0,
164
+ video_gan_weight=1.0,
165
+ image_disc_weight=0.0,
166
+ video_disc_weight=0.0,
167
+ l1_weight=4.0,
168
+ gan_feat_weight=0.0,
169
+ perceptual_weight=0.0,
170
+ kl_weight=0.0,
171
+ lfq_weight=0.0,
172
+ entropy_loss_weight=0.1,
173
+ commitment_loss_weight=0.25,
174
+ diversity_gamma=1,
175
+ norm_type='group',
176
+ disc_loss_type='hinge',
177
+ use_checkpoint=False,
178
+ precision='fp32',
179
+ encoder_dtype='fp32',
180
+ upcast_attention='',
181
+ upcast_tf32=False,
182
+ tokenizer='flux',
183
+ pretrained=None,
184
+ pretrained_mode='full',
185
+ inflation_pe=False,
186
+ init_vgen='no',
187
+ no_init_idis=False,
188
+ init_idis='keep',
189
+ init_vdis='no',
190
+ enable_nan_detector=False,
191
+ turn_on_profiler=False,
192
+ profiler_scheduler_wait_steps=10,
193
+ debug=True,
194
+ video_logger=False,
195
+ bytenas='',
196
+ username='',
197
+ seed=1234,
198
+ vq_to_vae=False,
199
+ load_not_strict=False,
200
+ zero=0,
201
+ bucket_cap_mb=40,
202
+ manual_gc_interval=1000,
203
+ data_path=[''],
204
+ data_type=[''],
205
+ dataset_list=['imagenet'],
206
+ fps=-1,
207
+ dataaug='resizecrop',
208
+ multi_resolution=False,
209
+ random_bucket_ratio=0.0,
210
+ sequence_length=16,
211
+ resolution=[256, 256],
212
+ batch_size=[1],
213
+ num_workers=0,
214
+ image_channels=3,
215
+ codebook_size=codebook_size,
216
+ codebook_l2_norm=True,
217
+ codebook_show_usage=True,
218
+ commit_loss_beta=0.25,
219
+ entropy_loss_ratio=0.0,
220
+ base_ch=128,
221
+ num_res_blocks=2,
222
+ encoder_ch_mult=encoder_ch_mult,
223
+ decoder_ch_mult=decoder_ch_mult,
224
+ dropout_p=0.0,
225
+ cnn_type='2d',
226
+ cnn_version='v1',
227
+ conv_in_out_2d='no',
228
+ conv_inner_2d='no',
229
+ res_conv_2d='no',
230
+ cnn_attention='no',
231
+ cnn_norm_axis='spatial',
232
+ flux_weight=0,
233
+ cycle_weight=0,
234
+ cycle_feat_weight=0,
235
+ cycle_gan_weight=0,
236
+ cycle_loop=0,
237
+ z_drop=0.0)
238
+
239
+ vae = AutoEncoder(args)
240
+ use_vae = vae.use_vae
241
+ if not use_vae:
242
+ num_codes = args.codebook_size
243
+ if isinstance(vqgan_ckpt, str):
244
+ state_dict = torch.load(args.vqgan_ckpt, map_location=torch.device("cpu"), weights_only=True)
245
+ else:
246
+ state_dict = args.vqgan_ckpt
247
+ if state_dict:
248
+ if args.ema == "yes":
249
+ vae, new_state_dict, loaded_keys = load_cnn(vae, state_dict["ema"], prefix="", expand=False)
250
+ else:
251
+ vae, new_state_dict, loaded_keys = load_cnn(vae, state_dict["vae"], prefix="", expand=False)
252
+ if test_mode:
253
+ vae.eval()
254
+ for p in vae.parameters():
+ p.requires_grad_(False)
255
+ return vae
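A typical call, sketched with placeholder values (the checkpoint path, schedule name, and codebook hyperparameters below are illustrative, not defaults from this repo):

vae = vae_model(
    vqgan_ckpt='weights/vae.pth',  # hypothetical path; a preloaded state dict also works
    schedule_mode='dynamic',       # assumed schedule name
    codebook_dim=32,
    codebook_size=2**32,           # BSQ: one binary code per codebook dimension
    test_mode=True,                # eval mode with all gradients frozen
)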
models/ema.py ADDED
@@ -0,0 +1,23 @@
1
+ import copy
2
+ import torch
3
+ from collections import OrderedDict
4
+
5
+
6
+ def get_ema_model(model):
7
+ ema_model = copy.deepcopy(model)
8
+ ema_model.eval()
9
+ for param in ema_model.parameters():
10
+ param.requires_grad = False
11
+ return ema_model
12
+
13
+ @torch.no_grad()
14
+ def update_ema(ema_model, model, decay=0.9999):
15
+ """
16
+ Step the EMA model towards the current model.
17
+ """
18
+ ema_params = OrderedDict(ema_model.named_parameters())
19
+ model_params = OrderedDict(model.named_parameters())
20
+
21
+ for name, param in model_params.items():
22
+ # TODO: Consider applying only to params that require_grad to avoid small numerical changes of pos_embed
23
+ ema_params[name].mul_(decay).add_(param.data, alpha=1 - decay)
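Typical wiring for these two helpers in a training loop (toy model, optimizer and loss, illustrative only):

import torch
import torch.nn as nn

model = nn.Linear(8, 8)
ema_model = get_ema_model(model)  # frozen deep copy in eval mode
opt = torch.optim.Adam(model.parameters(), lr=1e-3)

for _ in range(10):
    loss = model(torch.randn(4, 8)).pow(2).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()
    update_ema(ema_model, model, decay=0.9999)  # EMA weights trail the online weights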
models/flex_attn.py ADDED
@@ -0,0 +1,130 @@
1
+ """
2
+ Wrap torch's flex attention and handle mess info or potentially refactor
3
+ """
4
+ from functools import partial
5
+ import torch
6
+ import numpy as np
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ try:
10
+ from torch.nn.attention.flex_attention import flex_attention, create_block_mask
11
+ flex_attention_available = True
12
+ except ImportError:
13
+ print(f"[Warning] flex attention need pytorch 2.5.0+ but your version is {torch.__version__}")
14
+ flex_attention_available = False
15
+
16
+ def _causal_mask(b, h, q_idx, kv_idx):
17
+ return q_idx >= kv_idx
18
+
19
+ def _length_to_offsets(lengths, device):
20
+ """Converts a list of lengths to a list of offsets.
21
+
22
+ Args:
23
+ lengths: A list of lengths.
24
+
25
+ """
26
+ offsets = [0]
27
+ offsets.extend(lengths)
28
+ offsets = torch.tensor(offsets, device=device, dtype=torch.int32)
29
+ offsets = torch.cumsum(offsets, dim=-1)
30
+ return offsets
31
+
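For example, three documents of lengths 2, 4 and 3 become the cumulative offsets consumed by the mask generator below:

offsets = _length_to_offsets([2, 4, 3], device='cpu')
print(offsets)  # -> [0, 2, 6, 9]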
32
+ def _generate_var_mask_mod(offsets):
33
+ """Generates mask mods that apply to inputs to flex attention in the sequence stacked
34
+ format.
35
+
36
+ Args:
37
+ offsets: This tensor should be of shape (num_documents + 1);
38
+ this should contain the cumulative counts of document tokens.
39
+ e.g. if you have 3 documents of length 2, 4, 3 then
40
+ offsets = [0, 2, 6, 9]
41
+
42
+ Note:
43
+ What is the sequence stacked format? When assembling batches of inputs, we
44
+ take multiple sequences and stack them together to form 1 large sequence. We then
45
+ use masking to ensure that the attention scores are only applied to tokens within
46
+ the same document.
47
+ """
48
+
49
+ def _offsets_to_doc_ids_tensor(offsets):
50
+ device = offsets.device
51
+ counts = offsets[1:] - offsets[:-1]
52
+ return torch.repeat_interleave(
53
+ torch.arange(len(counts), device=device, dtype=torch.int32), counts
54
+ )
55
+
56
+ document_id = _offsets_to_doc_ids_tensor(offsets)
57
+
58
+ def var_mask_mod(b, h, q_idx, kv_idx):
59
+ same_doc = document_id[q_idx] == document_id[kv_idx]
60
+ causal_mask = _causal_mask(b, h, q_idx, kv_idx)
61
+ return same_doc | causal_mask
62
+
63
+ return var_mask_mod
64
+
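The resulting mask allows full attention inside a scale block and causal attention across blocks (same_doc OR causal), which is the VAR attention pattern. A quick check on CPU with the offsets from the example above:

import torch

offsets = _length_to_offsets([2, 4, 3], device='cpu')
mask_mod = _generate_var_mask_mod(offsets)
# positions 2..5 form document 1: q=3 may attend kv=5 even though q < kv
print(bool(mask_mod(0, 0, torch.tensor(3), torch.tensor(5))))  # True
# q=1 (document 0) may not attend forward into document 1
print(bool(mask_mod(0, 0, torch.tensor(1), torch.tensor(5))))  # False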
65
+ def _generate_var_infer_mask_with_kv_cache(lengths):
66
+ kv_len = sum(lengths)
67
+ def var_mask_mod(b, h, q_idx, kv_idx):
68
+ return kv_idx < kv_len
69
+
70
+ return var_mask_mod
71
+
72
+ class FlexAttn(nn.Module):
73
+ def __init__(
74
+ self, block_scales:list, mask_type:str, B, H, L:int, auto_padding=False
75
+ ):
76
+ """
77
+ :param block_scales: accepts VAR's (t, h, w) block sizes like [(1,1,1), (1,2,2), (1,3,3)]
79
+ :param mask_type: var / causal / var_infer_mask_with_kv_cache
79
+ :param B: batch size
80
+ :param H: heads num
81
+ :param L: sequence length
82
+ """
83
+ super().__init__()
84
+ if not flex_attention_available:
85
+ raise NotImplementedError((f"[Error] flex attention need pytorch 2.5.0+ but your version is {torch.__version__}"))
86
+
87
+ self.support_mask_type = ["var", "causal", "var_infer_mask_with_kv_cache"]
88
+ self.auto_padding = auto_padding
89
+
90
+ self.flex_attention = torch.compile(flex_attention)
91
+
92
+ self.block_scales = block_scales
93
+ self.lengths = [ x * y * z for x,y,z in block_scales]
94
+
95
+ self.offsets = _length_to_offsets(self.lengths, device='cuda')
96
+
97
+ # if L is padded to a multiple of 128, the block mask needs to cover the padding area
98
+ if self.offsets[-1] < L:
99
+ self.offsets = torch.cat((self.offsets, torch.tensor([L], device='cuda')), dim=0)
100
+
101
+ if mask_type == "var":
102
+ self.mask_mod = _generate_var_mask_mod(self.offsets)
103
+ self.block_mask = create_block_mask(self.mask_mod, B = B, H = H, Q_LEN = L, KV_LEN = L, device = 'cuda', _compile = True)
104
+ elif mask_type == "causal":
105
+ self.mask_mod = _causal_mask
106
+ self.block_mask = create_block_mask(self.mask_mod, B = B, H = H, Q_LEN = L, KV_LEN = L, device = 'cuda', _compile = True)
107
+ elif mask_type == 'var_infer_mask_with_kv_cache':
108
+ self.mask_mod = _generate_var_infer_mask_with_kv_cache(self.lengths)
109
+ self.block_mask = create_block_mask(self.mask_mod, B = B, H = H, Q_LEN = L, KV_LEN = L, device = 'cuda', _compile = True)
110
+ else:
111
+ raise NotImplementedError(f"{mask_type} not supportted in FlexAttn, support type:{self.support_mask_type}")
112
+
113
+
114
+ def forward(self, q, k, v, scale = None):
115
+ if self.auto_padding:
116
+ q_pad_len = (128 - q.shape[-2] % 128) % 128
117
+ kv_pad_len = (128 - k.shape[-2] % 128) % 128
118
+ q_pad = F.pad(q, (0, 0, 0, q_pad_len))
119
+ k_pad = F.pad(k, (0, 0, 0, kv_pad_len))
120
+ v_pad = F.pad(v, (0, 0, 0, kv_pad_len))
121
+ oup = self.flex_attention(q_pad.to(v_pad.dtype), k_pad.to(v_pad.dtype), v_pad, block_mask = self.block_mask, scale = scale)
122
+ if q_pad_len > 0:
123
+ oup = oup[:,:,:-q_pad_len]
124
+ else:
125
+ oup = self.flex_attention(q.to(v.dtype), k.to(v.dtype), v, block_mask = self.block_mask, scale = scale)
126
+ return oup
127
+
128
+ def extra_repr(self) -> str:
129
+ return f'block size:{self.block_scales}'
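A sketch of constructing and calling the wrapper (CUDA is required, since the block mask is created on 'cuda'; batch, head and length values are illustrative):

import torch

# three (t, h, w) scale blocks: 1 + 4 + 16 = 21 tokens, block mask built for L = 128
attn = FlexAttn(block_scales=[(1, 1, 1), (1, 2, 2), (1, 4, 4)],
                mask_type='var', B=2, H=8, L=128, auto_padding=True)

q = torch.randn(2, 8, 21, 64, device='cuda')
k = torch.randn(2, 8, 21, 64, device='cuda')
v = torch.randn(2, 8, 21, 64, device='cuda')
out = attn(q, k, v)  # (2, 8, 21, 64); auto_padding pads to 128 and strips it again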
models/fused_op.py ADDED
@@ -0,0 +1,27 @@
1
+ import gc
2
+ from copy import deepcopy
3
+ from typing import Union
4
+
5
+ import torch
6
+ from torch import nn as nn
7
+ from torch.nn import functional as F
8
+
9
+
10
+ @torch.compile(fullgraph=True)
11
+ def fused_rms_norm(x: torch.Tensor, weight: nn.Parameter, eps: float):
12
+ x = x.float()
13
+ return (x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True).add_(eps))) * weight
14
+
15
+
16
+ @torch.compile(fullgraph=True)
17
+ def fused_ada_layer_norm(C: int, eps: float, x: torch.Tensor, scale: torch.Tensor, shift: torch.Tensor):
18
+ x = x.float()
19
+ x = F.layer_norm(input=x, normalized_shape=(C,), weight=None, bias=None, eps=eps)
20
+ return x.mul(scale.add(1)).add_(shift)
21
+
22
+
23
+ @torch.compile(fullgraph=True)
24
+ def fused_ada_rms_norm(C: int, eps: float, x: torch.Tensor, scale: torch.Tensor, shift: torch.Tensor):
25
+ x = x.float()
26
+ x = (x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True).add_(eps)))
27
+ return x.mul(scale.add(1)).add_(shift)
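Each fused op is numerically just the standard norm followed by the (1 + scale, shift) modulation; the first call triggers compilation. A quick equivalence check against the unfused formulation (toy shapes):

import torch
import torch.nn.functional as F

C, eps = 16, 1e-6
x = torch.randn(2, 5, C)
scale = torch.randn(2, 1, C) * 0.01
shift = torch.randn(2, 1, C) * 0.01

ref = F.layer_norm(x.float(), (C,), eps=eps) * (1 + scale) + shift
out = fused_ada_layer_norm(C, eps, x, scale, shift)
print(torch.allclose(out, ref, atol=1e-5))  # True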
models/infinity.py ADDED
@@ -0,0 +1,795 @@
1
+ """
2
+ Definition of Infinity transformer model.
3
+ """
4
+
5
+ import math
6
+ import random
7
+ import time
8
+ from contextlib import nullcontext
9
+ from functools import partial
10
+ from typing import List, Optional, Tuple, Union, Dict, Any
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ import torch.nn.functional as F
15
+ from timm.models import register_model
16
+ from torch.utils.checkpoint import checkpoint
17
+ from PIL import Image
18
+ import numpy as np
19
+ from torch.nn.attention.flex_attention import flex_attention
20
+
21
+ import infinity.utils.dist as dist
22
+ from infinity.utils.dist import for_visualize
23
+ from infinity.models.basic import flash_attn_func, flash_fused_op_installed, AdaLNBeforeHead, CrossAttnBlock, SelfAttnBlock, CrossAttention, FastRMSNorm, precompute_rope2d_freqs_grid
24
+ from infinity.utils import misc
25
+ from infinity.models.flex_attn import FlexAttn
26
+ from infinity.utils.dynamic_resolution import dynamic_resolution_h_w, h_div_w_templates
27
+
28
+ try:
29
+ from infinity.models.fused_op import fused_ada_layer_norm, fused_ada_rms_norm
30
+ except Exception: # fused ops are optional; fall back to the unfused paths
31
+ fused_ada_layer_norm, fused_ada_rms_norm = None, None
32
+
33
+
34
+ class MultiInpIdentity(nn.Module):
35
+ def forward(self, x, *args, **kwargs):
36
+ return x
37
+
38
+
39
+ class TextAttentivePool(nn.Module):
40
+ def __init__(self, Ct5: int, D: int):
41
+ super().__init__()
42
+ self.Ct5, self.D = Ct5, D
43
+ if D > 4096:
44
+ self.head_dim = 64
45
+ else:
46
+ self.head_dim = 128
47
+
48
+ self.num_heads = Ct5 // self.head_dim
49
+ self.ca = CrossAttention(for_attn_pool=True, embed_dim=self.D, kv_dim=Ct5, num_heads=self.num_heads)
50
+ def forward(self, ca_kv):
51
+ return self.ca(None, ca_kv).squeeze(1)
52
+
53
+ class SharedAdaLin(nn.Linear):
54
+ def forward(self, cond_BD):
55
+ C = self.weight.shape[0] // 6
56
+ return super().forward(cond_BD).reshape(-1, 1, 6, C) # B16C
57
+
58
+
59
+ class MultipleLayers(nn.Module):
60
+ def __init__(self, ls, num_blocks_in_a_chunk, index):
61
+ super().__init__()
62
+ self.module = nn.ModuleList()
63
+ for i in range(index, index+num_blocks_in_a_chunk):
64
+ self.module.append(ls[i])
65
+
66
+ def forward(self, x, cond_BD, ca_kv, attn_bias_or_two_vector, attn_fn=None, scale_schedule=None, checkpointing_full_block=False, rope2d_freqs_grid=None):
67
+ h = x
68
+ for m in self.module:
69
+ if checkpointing_full_block:
70
+ h = torch.utils.checkpoint.checkpoint(m, h, cond_BD, ca_kv, attn_bias_or_two_vector, attn_fn, scale_schedule, rope2d_freqs_grid, use_reentrant=False)
71
+ else:
72
+ h = m(h, cond_BD, ca_kv, attn_bias_or_two_vector, attn_fn, scale_schedule, rope2d_freqs_grid)
73
+ return h
74
+
75
+ class Infinity(nn.Module):
76
+ def __init__(
77
+ self, vae_local,
78
+ text_channels=0, text_maxlen=0, # text-cond generation
79
+ selecting_idx=None, # class-cond generation
80
+ embed_dim=1024, depth=16, num_heads=16, mlp_ratio=4., # model's architecture
81
+ drop_rate=0., drop_path_rate=0., # drop out and drop path
82
+ norm_eps=1e-6, rms_norm=False, # norm layer
83
+ shared_aln=False, head_aln=True, # adaptive norm
84
+ cond_drop_rate=0.1, # for classifier-free guidance
85
+ rand_uncond=False,
86
+ cross_attn_layer_scale=-1., nm0=False, tau=1, cos_attn=True, swiglu=False,
87
+ raw_scale_schedule=(1, 2, 3, 4, 5, 6, 8, 10, 13, 16),
88
+ head_depth=1,
89
+ top_p=0.0, top_k=0.0,
90
+ customized_flash_attn=False, fused_mlp=False, fused_norm=False,
91
+ block_chunks=1,
92
+ checkpointing=None,
93
+ pad_to_multiplier=0,
94
+ use_flex_attn=False,
95
+ batch_size=2,
96
+ add_lvl_embeding_only_first_block=1,
97
+ use_bit_label=1,
98
+ rope2d_each_sa_layer=0,
99
+ rope2d_normalized_by_hw=0,
100
+ pn=None,
101
+ train_h_div_w_list=None,
102
+ video_frames=1,
103
+ always_training_scales=20,
104
+ apply_spatial_patchify = 0,
105
+ inference_mode=False,
106
+ ):
107
+ # set hyperparameters
108
+ self.C = embed_dim
109
+ self.inference_mode = inference_mode
110
+ self.apply_spatial_patchify = apply_spatial_patchify
111
+ if self.apply_spatial_patchify:
112
+ self.d_vae = vae_local.embed_dim * 4
113
+ else:
114
+ self.d_vae = vae_local.embed_dim
115
+ self.use_bit_label = use_bit_label
116
+ self.codebook_dim = self.d_vae
117
+ self.V = (self.codebook_dim * 2) if self.use_bit_label else vae_local.vocab_size
118
+ self.bit_mask = vae_local.quantizer.lfq.mask if self.use_bit_label else None
119
+ self.Ct5 = text_channels
120
+ self.depth = depth
121
+ self.num_heads = num_heads
122
+ self.batch_size = batch_size
123
+ self.mlp_ratio = mlp_ratio
124
+ self.cond_drop_rate = cond_drop_rate
125
+ self.norm_eps = norm_eps
126
+ self.prog_si = -1
127
+ self.pn = pn
128
+ self.train_h_div_w_list = train_h_div_w_list if train_h_div_w_list else h_div_w_templates
129
+ self.video_frames = video_frames
130
+ self.always_training_scales = always_training_scales
131
+
132
+ assert add_lvl_embeding_only_first_block in [0,1]
133
+ self.add_lvl_embeding_only_first_block = add_lvl_embeding_only_first_block
134
+ assert rope2d_each_sa_layer in [0,1]
135
+ self.rope2d_each_sa_layer = rope2d_each_sa_layer
136
+ self.rope2d_normalized_by_hw = rope2d_normalized_by_hw
137
+ print(f'self.codebook_dim: {self.codebook_dim}, self.add_lvl_embeding_only_first_block: {self.add_lvl_embeding_only_first_block}, \
138
+ self.use_bit_label: {self.use_bit_label}, self.rope2d_each_sa_layer: {rope2d_each_sa_layer}, self.rope2d_normalized_by_hw: {self.rope2d_normalized_by_hw}')
139
+ head_up_method = ''
140
+ word_patch_size = 1 if head_up_method in {'', 'no'} else 2
141
+ if word_patch_size > 1:
142
+ assert all(raw_pn % word_patch_size == 0 for raw_pn in raw_scale_schedule), f'raw_scale_schedule={raw_scale_schedule}, not compatible with word_patch_size={word_patch_size}'
143
+
144
+ self.checkpointing = checkpointing
145
+ self.pad_to_multiplier = max(1, pad_to_multiplier)
146
+
147
+ customized_kernel_installed = any('Infinity' in arg_name for arg_name in flash_attn_func.__code__.co_varnames)
148
+ self.customized_flash_attn = customized_flash_attn and customized_kernel_installed
149
+ if customized_flash_attn and not customized_kernel_installed:
150
+ import inspect, warnings
151
+ file_path = inspect.getsourcefile(flash_attn_func)
152
+ line_number = inspect.getsourcelines(flash_attn_func)[1]
153
+ info = (
154
+ f'>>>>>> Customized FlashAttention2 is not installed or compiled, but specified in args by --flash=1. Set customized_flash_attn = False. <<<<<<\n'
155
+ f'>>>>>> `flash_attn_func` is in [line {line_number}] [file {file_path}] <<<<<<\n'
156
+ f'>>>>>> {flash_attn_func.__code__.co_varnames=} <<<<<<\n'
157
+ )
158
+ warnings.warn(info, ImportWarning)
159
+ print(info, flush=True)
160
+
161
+ self.raw_scale_schedule = raw_scale_schedule # 'raw' means before any patchifying
162
+ self.first_l = 1
163
+ # solve top-p top-k sampling hyperparameters
164
+ self.top_p, self.top_k = max(min(top_p, 1), 0), (round(top_k * self.V) if 0 < top_k < 1 else round(top_k))
165
+ if self.top_p < 1e-5: self.top_p = 0
166
+ if self.top_k >= self.V or self.top_k <= 0: self.top_k = 0
167
+
168
+ t = torch.zeros(dist.get_world_size(), device=dist.get_device())
169
+ t[dist.get_rank()] = float(flash_fused_op_installed)
170
+ dist.barrier()
171
+ dist.allreduce(t)
172
+ assert round(t.sum().item()) in {0, dist.get_world_size()}, f'flash_fused_op_installed: {t}'
173
+
174
+ super().__init__()
175
+ self.rng = torch.Generator(device=dist.get_device())
176
+ self.maybe_record_function = nullcontext
177
+ self.text_maxlen = text_maxlen
178
+ self.t2i = text_channels != 0
179
+
180
+ # [inp & position embedding]
181
+ init_std = math.sqrt(1 / self.C / 3)
182
+ self.norm0_cond = nn.Identity()
183
+ if self.t2i:
184
+ self.selecting_idx = None
185
+ self.num_classes = 0
186
+ self.D = self.C
187
+
188
+ cfg_uncond = torch.empty(self.text_maxlen, self.Ct5)
189
+ rng = torch.Generator(device='cpu')
190
+ rng.manual_seed(0)
191
+ torch.nn.init.trunc_normal_(cfg_uncond, std=1.2, generator=rng)
192
+ cfg_uncond /= self.Ct5 ** 0.5
193
+ if rand_uncond:
194
+ self.register_buffer('cfg_uncond', cfg_uncond)
195
+ else:
196
+ self.cfg_uncond = nn.Parameter(cfg_uncond)
197
+
198
+ self.text_norm = FastRMSNorm(self.Ct5, elementwise_affine=True, eps=norm_eps)
199
+ self.text_proj_for_sos = TextAttentivePool(self.Ct5, self.D)
200
+ self.text_proj_for_ca = nn.Sequential(
201
+ nn.Linear(self.Ct5, self.D),
202
+ nn.GELU(approximate='tanh'),
203
+ nn.Linear(self.D, self.D),
204
+ )
205
+ else: # class-label cond
206
+ if selecting_idx is None:
207
+ num_classes = 1000
208
+ print(f'======= WARNING: selecting_idx not specified, set to 1/{num_classes} @ {dist.get_device()} =======')
209
+ selecting_idx = torch.full((1, num_classes), fill_value=1/num_classes, dtype=torch.float32, device=dist.get_device())
210
+ self.selecting_idx = selecting_idx
211
+ self.num_classes = selecting_idx.shape[-1]
212
+ self.D = self.C
213
+ self.class_emb = nn.Embedding(self.num_classes + 1, self.C)
214
+ nn.init.trunc_normal_(self.class_emb.weight.data, mean=0, std=init_std)
215
+
216
+ self.pos_start = nn.Parameter(torch.empty(1, self.first_l, self.C))
217
+ nn.init.trunc_normal_(self.pos_start.data, mean=0, std=init_std)
218
+ if self.rope2d_each_sa_layer:
219
+ rope2d_freqs_grid = precompute_rope2d_freqs_grid(dim=self.C//self.num_heads, dynamic_resolution_h_w=dynamic_resolution_h_w, pad_to_multiplier=self.pad_to_multiplier, rope2d_normalized_by_hw=self.rope2d_normalized_by_hw)
220
+ self.rope2d_freqs_grid = rope2d_freqs_grid
221
+ else:
222
+ raise ValueError(f'self.rope2d_each_sa_layer={self.rope2d_each_sa_layer} not implemented')
223
+ self.lvl_embed = nn.Embedding(15, self.C)
224
+ nn.init.trunc_normal_(self.lvl_embed.weight.data, mean=0, std=init_std)
225
+
226
+ # [input layers] input norm && input embedding
227
+ norm_layer = partial(FastRMSNorm if rms_norm else nn.LayerNorm, eps=norm_eps)
228
+ self.norm0_ve = norm_layer(self.d_vae) if nm0 else nn.Identity()
229
+ self.word_embed = nn.Linear(self.d_vae, self.C)
230
+
231
+ # [shared adaptive layernorm mapping network]
232
+ self.shared_ada_lin = nn.Sequential(nn.SiLU(inplace=False), SharedAdaLin(self.D, 6*self.C)) if shared_aln else nn.Identity()
233
+
234
+ # fused norm
235
+ if fused_norm:
236
+ fused_norm_func = fused_ada_rms_norm if rms_norm else fused_ada_layer_norm
237
+ if fused_norm_func is not None: # pre-compile
238
+ B = 2
239
+ x = torch.randn(B, 1, self.C).requires_grad_(True)
240
+ scale = torch.randn(B, 1, self.C).mul_(0.01).requires_grad_(True)
241
+ shift = torch.randn(B, 1, self.C).mul_(0.01).requires_grad_(True)
242
+ # fused_norm_func(C=self.C, eps=self.norm_eps, x=x, scale=scale, shift=shift).mean().backward()
243
+ del B, x, scale, shift
244
+ else:
245
+ fused_norm_func = None
246
+
247
+ # [backbone and head]
248
+ self.use_flex_attn = use_flex_attn
249
+ self.attn_fn_compile_dict = {}
250
+ self.batch_size = batch_size
251
+ if self.use_flex_attn:
252
+ self.attn_fn_compile_dict = self.compile_flex_attn()
253
+
254
+ self.drop_path_rate = drop_path_rate
255
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # dpr means drop path rate (linearly increasing)
256
+ self.unregistered_blocks = []
257
+ for block_idx in range(depth):
258
+ block = (CrossAttnBlock if self.t2i else SelfAttnBlock)(
259
+ embed_dim=self.C, kv_dim=self.D, cross_attn_layer_scale=cross_attn_layer_scale, cond_dim=self.D, act=True, shared_aln=shared_aln, norm_layer=norm_layer,
260
+ num_heads=num_heads, mlp_ratio=mlp_ratio, drop=drop_rate, drop_path=dpr[block_idx], tau=tau, cos_attn=cos_attn,
261
+ swiglu=swiglu, customized_flash_attn=self.customized_flash_attn, fused_mlp=fused_mlp, fused_norm_func=fused_norm_func,
262
+ checkpointing_sa_only=self.checkpointing == 'self-attn',
263
+ use_flex_attn=use_flex_attn, batch_size=batch_size, pad_to_multiplier=pad_to_multiplier, rope2d_normalized_by_hw=rope2d_normalized_by_hw,
264
+ )
265
+ self.unregistered_blocks.append(block)
266
+
267
+ # [head]
268
+ V = self.V
269
+ if head_aln:
270
+ self.head_nm = AdaLNBeforeHead(self.C, self.D, act=True, norm_layer=norm_layer, fused_norm_func=fused_norm_func)
271
+ self.head = nn.Linear(self.C, V) if head_depth == 1 else nn.Sequential(nn.Linear(self.C, self.C, bias=True), nn.GELU(approximate='tanh'), nn.Linear(self.C, V))
272
+ else:
273
+ self.head_nm = MultiInpIdentity()
274
+ self.head = nn.Sequential(norm_layer(self.C), nn.Linear(self.C, V)) if head_depth == 1 else nn.Sequential(norm_layer(self.C), nn.Linear(self.C, self.C, bias=True), nn.GELU(approximate='tanh'), nn.Linear(self.C, V))
275
+
276
+ self.num_block_chunks = block_chunks or 1
277
+ self.num_blocks_in_a_chunk = depth // block_chunks
278
+ print(f"{self.num_blocks_in_a_chunk=}, {depth=}, {block_chunks=}")
279
+ assert self.num_blocks_in_a_chunk * block_chunks == depth
280
+ if self.num_block_chunks == 1:
281
+ self.blocks = nn.ModuleList(self.unregistered_blocks)
282
+ else:
283
+ self.block_chunks = nn.ModuleList()
284
+ for i in range(self.num_block_chunks):
285
+ self.block_chunks.append(MultipleLayers(self.unregistered_blocks, self.num_blocks_in_a_chunk, i*self.num_blocks_in_a_chunk))
286
+ print(
287
+ f'\n[constructor] ==== customized_flash_attn={self.customized_flash_attn} (using_flash={sum((b.sa.using_flash if self.t2i else b.attn.using_flash) for b in self.unregistered_blocks)}/{self.depth}), fused_mlp={fused_mlp} (fused_mlp={sum(b.ffn.fused_mlp_func is not None for b in self.unregistered_blocks)}/{self.depth}) ==== \n'
288
+ f' [Infinity config ] embed_dim={embed_dim}, num_heads={num_heads}, depth={depth}, mlp_ratio={mlp_ratio}, swiglu={swiglu} num_blocks_in_a_chunk={self.num_blocks_in_a_chunk}\n'
289
+ f' [drop ratios] drop_rate={drop_rate}, drop_path_rate={drop_path_rate:g} ({torch.linspace(0, drop_path_rate, depth)})',
290
+ end='\n\n', flush=True
291
+ )
292
+
293
+
294
+ def compile_flex_attn(self):
295
+ attn_fn_compile_dict = {}
296
+ for h_div_w in self.train_h_div_w_list:
297
+ h_div_w_template = h_div_w_templates[np.argmin(np.abs(float(h_div_w) - h_div_w_templates))]
298
+ full_scale_schedule = dynamic_resolution_h_w[h_div_w_template][self.pn]['scales']
299
+ if self.inference_mode:
300
+ apply_flex_attn_scales = list(range(1, 1+len(full_scale_schedule)))
301
+ mask_type = "infinity_infer_mask_with_kv_cache"
302
+ auto_padding = True
303
+ else:
304
+ mask_type = 'var'
305
+ auto_padding = False
306
+ apply_flex_attn_scales = [min(self.always_training_scales, len(full_scale_schedule))]
307
+ for scales_num in apply_flex_attn_scales:
308
+ print(f'====== apply flex attn hdivw: {h_div_w} scales: {scales_num} ======')
309
+ scale_schedule = full_scale_schedule[:scales_num]
310
+ scale_schedule = [ (min(t, self.video_frames//4+1), h, w) for (t,h, w) in scale_schedule]
311
+ patchs_nums_tuple = tuple(scale_schedule)
312
+ SEQ_L = sum( pt * ph * pw for pt, ph, pw in patchs_nums_tuple)
313
+ aligned_L = SEQ_L+ (self.pad_to_multiplier - SEQ_L % self.pad_to_multiplier) if SEQ_L % self.pad_to_multiplier != 0 else SEQ_L
314
+ attn_fn = FlexAttn(block_scales = patchs_nums_tuple,
315
+ mask_type = mask_type,
316
+ B = self.batch_size,
317
+ H = self.num_heads,
318
+ L = aligned_L,
319
+ auto_padding=auto_padding)
320
+ attn_fn_compile_dict[patchs_nums_tuple] = attn_fn
321
+
322
+ if self.video_frames > 1: # append image attn_fn when self.video_frames > 1 (namely videos)
323
+ scale_schedule = [ (1, h, w) for (t,h, w) in scale_schedule]
324
+ patchs_nums_tuple = tuple(scale_schedule)
325
+ SEQ_L = sum( pt * ph * pw for pt, ph, pw in patchs_nums_tuple)
326
+ aligned_L = SEQ_L+ (self.pad_to_multiplier - SEQ_L % self.pad_to_multiplier) if SEQ_L % self.pad_to_multiplier != 0 else SEQ_L
327
+ attn_fn = FlexAttn(block_scales = patchs_nums_tuple,
328
+ mask_type = mask_type,
329
+ B = self.batch_size,
330
+ H = self.num_heads,
331
+ L = aligned_L)
332
+ attn_fn_compile_dict[patchs_nums_tuple] = attn_fn
333
+ return attn_fn_compile_dict
334
+
335
+ def get_logits(self, h: torch.Tensor, cond_BD: Optional[torch.Tensor]):
336
+ """
337
+ :param h: hidden_state, shaped (B or batch_size, L or seq_len, C or hidden_dim)
338
+ :param cond_BD: shaped (B or batch_size, D or cond_dim)
339
+ :return: logits, shaped (B or batch_size, L or seq_len, V or vocabulary_size)
341
+ """
342
+ with torch.amp.autocast('cuda', enabled=False):
343
+ return self.head(self.head_nm(h.float(), cond_BD.float()))
344
+
345
+ def add_lvl_embeding(self, feature, scale_ind, scale_schedule, need_to_pad=0):
346
+ bs, seq_len, c = feature.shape
347
+ patch_t, patch_h, patch_w = scale_schedule[scale_ind]
348
+ t_mul_h_mul_w = patch_t * patch_h * patch_w
349
+ assert t_mul_h_mul_w + need_to_pad == seq_len
350
+ feature[:, :t_mul_h_mul_w] += self.lvl_embed(scale_ind*torch.ones((bs, t_mul_h_mul_w),dtype=torch.int).to(feature.device))
351
+ return feature
352
+
353
+ def add_lvl_embeding_for_x_BLC(self, x_BLC, scale_schedule, need_to_pad=0):
354
+ ptr = 0
355
+ x_BLC_list = []
356
+ for scale_ind, patch_t_h_w in enumerate(scale_schedule):
357
+ scale_seq_len = np.array(patch_t_h_w).prod()
358
+ x_BLC_this_scale = x_BLC[:,ptr:ptr+scale_seq_len] # shape: [bs, patch_h*patch_w, c]
359
+ ptr += scale_seq_len
360
+ x_BLC_this_scale = self.add_lvl_embeding(x_BLC_this_scale, scale_ind, scale_schedule)
361
+ x_BLC_list.append(x_BLC_this_scale)
362
+ assert x_BLC.shape[1] == (ptr + need_to_pad), f'{x_BLC.shape[1]} != {ptr} + {need_to_pad}'
363
+ x_BLC_list.append(x_BLC[:,ptr:])
364
+ x_BLC = torch.cat(x_BLC_list, dim=1)
365
+ return x_BLC
366
+
367
+ def forward(self, label_B_or_BLT: Union[torch.LongTensor, Tuple[torch.FloatTensor, torch.IntTensor, int]], x_BLC_wo_prefix: torch.Tensor, scale_schedule: List[Tuple[int]],
368
+ cfg_infer=False,
369
+ **kwargs,
370
+ ) -> Union[torch.Tensor, List[torch.Tensor]]: # returns logits_BLV
371
+ """
372
+ label_B_or_BLT: label_B or (kv_compact, lens, cu_seqlens_k, max_seqlen_k)
373
+ :return: logits BLV, V is vocab_size
374
+ """
375
+ if cfg_infer:
376
+ return self.autoregressive_infer_cfg(label_B_or_BLT=label_B_or_BLT, scale_schedule=scale_schedule, **kwargs)
377
+
378
+ x_BLC_wo_prefix = x_BLC_wo_prefix.float() # input should be float32
379
+ B = x_BLC_wo_prefix.shape[0]
380
+
381
+ # [1. get input sequence x_BLC]
382
+ with torch.amp.autocast('cuda', enabled=False):
383
+ kv_compact, lens, cu_seqlens_k, max_seqlen_k = label_B_or_BLT
384
+ # drop cond
385
+ total = 0
386
+ for le in lens:
387
+ if random.random() < self.cond_drop_rate:
388
+ kv_compact[total:total+le] = self.cfg_uncond[:le]
389
+ total += le
390
+ must_on_graph = self.cfg_uncond[0, 0] * 0
391
+ kv_compact = self.text_norm(kv_compact).contiguous()
392
+ sos = cond_BD = self.text_proj_for_sos((kv_compact, cu_seqlens_k, max_seqlen_k)).float().contiguous() # cond_BD should be float32
393
+ kv_compact = self.text_proj_for_ca(kv_compact).contiguous()
394
+ kv_compact[0, 0] += must_on_graph
395
+ ca_kv = kv_compact, cu_seqlens_k, max_seqlen_k
396
+
397
+ cond_BD_or_gss = self.shared_ada_lin(cond_BD).contiguous() # gss: gamma, scale, shift; cond_BD_or_gss should be float32
398
+
399
+ sos = sos.unsqueeze(1).expand(B, 1, -1) + self.pos_start.expand(B, 1, -1)
400
+ x_BLC = torch.cat((sos, self.word_embed(self.norm0_ve(x_BLC_wo_prefix))), dim=1)
401
+
402
+ # [1.1. pad the seqlen dim]
403
+ l_end = x_BLC.shape[1]
404
+ need_to_pad = (l_end + self.pad_to_multiplier - 1) // self.pad_to_multiplier * self.pad_to_multiplier - l_end # 0
405
+
406
+ if self.customized_flash_attn:
407
+ Infinity_visible_kvlen = self.Infinity_visible_kvlen[:l_end]
408
+ Infinity_invisible_qlen = self.Infinity_invisible_qlen[:l_end]
409
+ attn_bias_or_two_vector = (Infinity_visible_kvlen, Infinity_invisible_qlen)
410
+ # todo: solve need_to_pad here
411
+ elif self.use_flex_attn:
412
+ if need_to_pad:
413
+ x_BLC = F.pad(x_BLC, (0, 0, 0, need_to_pad))
414
+ assert x_BLC.shape[-1] % 128 == 0, 'x_BLC.shape[-1] % 128 != 0'
415
+ attn_bias_or_two_vector = None
416
+ else:
417
+ d: torch.Tensor = torch.cat([torch.full((pn[0]*pn[1]*pn[2],), i) for i, pn in enumerate(scale_schedule)]).view(1, l_end, 1)
418
+ dT = d.transpose(1, 2) # dT: 11L
419
+ attn_bias_for_masking = torch.where(d >= dT, 0., -torch.inf).reshape(1, 1, l_end, l_end)
420
+ attn_bias = attn_bias_for_masking[:, :, :l_end, :l_end].contiguous() # attn_bias: 11LL
421
+ if need_to_pad:
422
+ attn_bias = F.pad(attn_bias, (0, need_to_pad, 0, need_to_pad), value=-torch.inf)
423
+ attn_bias[0, 0, l_end:, 0] = 0
424
+ x_BLC = F.pad(x_BLC, (0, 0, 0, need_to_pad))
425
+ attn_bias_or_two_vector = attn_bias.type_as(x_BLC).to(x_BLC.device)
426
+
427
+ if self.use_flex_attn:
428
+ attn_fn = self.attn_fn_compile_dict[tuple(scale_schedule)]
429
+ else:
430
+ attn_fn = None
431
+
432
+ # [2. block loop]
433
+ SelfAttnBlock.forward, CrossAttnBlock.forward # no-op expression with no runtime effect
434
+ checkpointing_full_block = self.checkpointing == 'full-block' and self.training
435
+ if self.num_block_chunks == 1:
436
+ for i, b in enumerate(self.blocks):
437
+ if self.add_lvl_embeding_only_first_block and i == 0:
438
+ x_BLC = self.add_lvl_embeding_for_x_BLC(x_BLC, scale_schedule, need_to_pad)
439
+ if not self.add_lvl_embeding_only_first_block:
440
+ x_BLC = self.add_lvl_embeding_for_x_BLC(x_BLC, scale_schedule, need_to_pad)
441
+ if checkpointing_full_block:
442
+ x_BLC = torch.utils.checkpoint.checkpoint(b, x_BLC, cond_BD_or_gss, ca_kv, attn_bias_or_two_vector, attn_fn, scale_schedule, self.rope2d_freqs_grid, use_reentrant=False)
443
+ else:
444
+ x_BLC = b(x=x_BLC, cond_BD=cond_BD_or_gss, ca_kv=ca_kv, attn_bias_or_two_vector=attn_bias_or_two_vector, attn_fn=attn_fn, scale_schedule=scale_schedule, rope2d_freqs_grid=self.rope2d_freqs_grid)
445
+ else:
446
+ for i, chunk in enumerate(self.block_chunks): # this path
447
+ if self.add_lvl_embeding_only_first_block and i == 0:
448
+ x_BLC = self.add_lvl_embeding_for_x_BLC(x_BLC, scale_schedule, need_to_pad)
449
+ if not self.add_lvl_embeding_only_first_block:
450
+ x_BLC = self.add_lvl_embeding_for_x_BLC(x_BLC, scale_schedule, need_to_pad)
451
+ x_BLC = chunk(x=x_BLC, cond_BD=cond_BD_or_gss, ca_kv=ca_kv, attn_bias_or_two_vector=attn_bias_or_two_vector, attn_fn=attn_fn, scale_schedule=scale_schedule, checkpointing_full_block=checkpointing_full_block, rope2d_freqs_grid=self.rope2d_freqs_grid)
452
+
453
+ # [3. unpad the seqlen dim, and then get logits]
454
+ return self.get_logits(x_BLC[:, :l_end], cond_BD) # return logits BLV, V is vocab_size
455
+
456
+ @torch.no_grad()
457
+ def autoregressive_infer_cfg(
458
+ self,
459
+ vae=None,
460
+ scale_schedule=None,
461
+ label_B_or_BLT=None,
462
+ B=1, negative_label_B_or_BLT=None, force_gt_Bhw=None,
463
+ g_seed=None, cfg_list=[], tau_list=[], cfg_sc=3, top_k=0, top_p=0.0,
464
+ returns_vemb=0, ratio_Bl1=None, gumbel=0, norm_cfg=False,
465
+ cfg_exp_k: float=0.0, cfg_insertion_layer=[-5],
466
+ vae_type=0, softmax_merge_topk=-1, ret_img=False,
467
+ trunk_scale=1000,
468
+ gt_leak=0, gt_ls_Bl=None,
469
+ inference_mode=False,
470
+ save_img_path=None,
471
+ sampling_per_bits=1,
472
+ ): # returns List[idx_Bl]
473
+ if g_seed is None: rng = None
474
+ else: self.rng.manual_seed(g_seed); rng = self.rng
475
+ assert len(cfg_list) >= len(scale_schedule)
476
+ assert len(tau_list) >= len(scale_schedule)
477
+
478
+ # scale_schedule is used by infinity, vae_scale_schedule is used by the vae; when spatial patchify is applied,
480
+ # scale_schedule is converted to vae_scale_schedule by multiplying h and w by 2
480
+ if self.apply_spatial_patchify:
481
+ vae_scale_schedule = [(pt, 2*ph, 2*pw) for pt, ph, pw in scale_schedule]
482
+ else:
483
+ vae_scale_schedule = scale_schedule
484
+
485
+ kv_compact, lens, cu_seqlens_k, max_seqlen_k = label_B_or_BLT
486
+ if any(np.array(cfg_list) != 1):
487
+ bs = 2*B
488
+ if not negative_label_B_or_BLT:
489
+ kv_compact_un = kv_compact.clone()
490
+ total = 0
491
+ for le in lens:
492
+ kv_compact_un[total:total+le] = (self.cfg_uncond)[:le]
493
+ total += le
494
+ kv_compact = torch.cat((kv_compact, kv_compact_un), dim=0)
495
+ cu_seqlens_k = torch.cat((cu_seqlens_k, cu_seqlens_k[1:]+cu_seqlens_k[-1]), dim=0)
496
+ else:
497
+ kv_compact_un, lens_un, cu_seqlens_k_un, max_seqlen_k_un = negative_label_B_or_BLT
498
+ kv_compact = torch.cat((kv_compact, kv_compact_un), dim=0)
499
+ cu_seqlens_k = torch.cat((cu_seqlens_k, cu_seqlens_k_un[1:]+cu_seqlens_k[-1]), dim=0)
500
+ max_seqlen_k = max(max_seqlen_k, max_seqlen_k_un)
501
+ else:
502
+ bs = B
503
+
504
+ kv_compact = self.text_norm(kv_compact)
505
+ sos = cond_BD = self.text_proj_for_sos((kv_compact, cu_seqlens_k, max_seqlen_k)) # sos shape: [2, 4096]
506
+ kv_compact = self.text_proj_for_ca(kv_compact) # kv_compact shape: [304, 4096]
507
+ ca_kv = kv_compact, cu_seqlens_k, max_seqlen_k
508
+ last_stage = sos.unsqueeze(1).expand(bs, 1, -1) + self.pos_start.expand(bs, 1, -1)
509
+
510
+ with torch.amp.autocast('cuda', enabled=False):
511
+ cond_BD_or_gss = self.shared_ada_lin(cond_BD.float()).float().contiguous()
512
+ accu_BChw, cur_L, ret = None, 0, [] # current length, list of reconstructed images
513
+ idx_Bl_list, idx_Bld_list = [], []
514
+
515
+ if inference_mode:
516
+ for b in self.unregistered_blocks: (b.sa if isinstance(b, CrossAttnBlock) else b.attn).kv_caching(True)
517
+ else:
518
+ assert self.num_block_chunks > 1
519
+ for block_chunk_ in self.block_chunks:
520
+ for module in block_chunk_.module.module:
521
+ (module.sa if isinstance(module, CrossAttnBlock) else module.attn).kv_caching(True)
522
+
523
+ abs_cfg_insertion_layers = []
524
+ add_cfg_on_logits, add_cfg_on_probs = False, False
525
+ leng = len(self.unregistered_blocks)
526
+ for item in cfg_insertion_layer:
527
+ if item == 0: # add cfg on logits
528
+ add_cfg_on_logits = True
529
+ elif item == 1: # add cfg on probs
530
+ add_cfg_on_probs = True # todo in the future, we may want to add cfg on logits and probs
531
+ elif item < 0: # determine to add cfg at item-th layer's output
532
+ assert leng+item > 0, f'cfg_insertion_layer: {item} is not valid since len(unregistered_blocks)={leng}'
533
+ abs_cfg_insertion_layers.append(leng+item)
534
+ else:
535
+ raise ValueError(f'cfg_insertion_layer: {item} is not valid')
536
+
537
+ num_stages_minus_1 = len(scale_schedule)-1
538
+ summed_codes = 0
539
+ for si, pn in enumerate(scale_schedule): # si: i-th segment
540
+ cfg = cfg_list[si]
541
+ if si >= trunk_scale:
542
+ break
543
+ cur_L += np.array(pn).prod()
544
+
545
+ need_to_pad = 0
546
+ attn_fn = None
547
+ if self.use_flex_attn:
548
+ # need_to_pad = (self.pad_to_multiplier - cur_L % self.pad_to_multiplier) % self.pad_to_multiplier
549
+ # if need_to_pad:
550
+ # last_stage = F.pad(last_stage, (0, 0, 0, need_to_pad))
551
+ attn_fn = self.attn_fn_compile_dict.get(tuple(scale_schedule[:(si+1)]), None)
552
+
553
+ # assert self.attn_bias_for_masking[:, :, last_L:cur_L, :cur_L].sum() == 0, f'AR with {(self.attn_bias_for_masking[:, :, last_L:cur_L, :cur_L] != 0).sum()} / {self.attn_bias_for_masking[:, :, last_L:cur_L, :cur_L].numel()} mask item'
554
+ layer_idx = 0
555
+ for block_idx, b in enumerate(self.block_chunks):
556
+ # last_stage shape: [4, 1, 2048], cond_BD_or_gss.shape: [4, 1, 6, 2048], ca_kv[0].shape: [64, 2048], ca_kv[1].shape [5], ca_kv[2]: int
557
+ if self.add_lvl_embeding_only_first_block and block_idx == 0:
558
+ last_stage = self.add_lvl_embeding(last_stage, si, scale_schedule, need_to_pad=need_to_pad)
559
+ if not self.add_lvl_embeding_only_first_block:
560
+ last_stage = self.add_lvl_embeding(last_stage, si, scale_schedule, need_to_pad=need_to_pad)
561
+
562
+ for m in b.module:
563
+ last_stage = m(x=last_stage, cond_BD=cond_BD_or_gss, ca_kv=ca_kv, attn_bias_or_two_vector=None, attn_fn=attn_fn, scale_schedule=scale_schedule, rope2d_freqs_grid=self.rope2d_freqs_grid, scale_ind=si)
564
+ if (cfg != 1) and (layer_idx in abs_cfg_insertion_layers):
565
+ # print(f'add cfg={cfg} on {layer_idx}-th layer output')
566
+ last_stage = cfg * last_stage[:B] + (1-cfg) * last_stage[B:]
567
+ last_stage = torch.cat((last_stage, last_stage), 0)
568
+ layer_idx += 1
569
+
570
+ if (cfg != 1) and add_cfg_on_logits:
571
+ # print(f'add cfg on add_cfg_on_logits')
572
+ logits_BlV = self.get_logits(last_stage, cond_BD).mul(1/tau_list[si])
573
+ logits_BlV = cfg * logits_BlV[:B] + (1-cfg) * logits_BlV[B:]
574
+ else:
575
+ logits_BlV = self.get_logits(last_stage[:B], cond_BD[:B]).mul(1/tau_list[si])
576
+
577
+ if self.use_bit_label:
578
+ tmp_bs, tmp_seq_len = logits_BlV.shape[:2]
579
+ logits_BlV = logits_BlV.reshape(tmp_bs, -1, 2)
580
+ idx_Bld = sample_with_top_k_top_p_also_inplace_modifying_logits_(logits_BlV, rng=rng, top_k=top_k or self.top_k, top_p=top_p or self.top_p, num_samples=1)[:, :, 0]
581
+ idx_Bld = idx_Bld.reshape(tmp_bs, tmp_seq_len, -1)
582
+ else:
583
+ idx_Bl = sample_with_top_k_top_p_also_inplace_modifying_logits_(logits_BlV, rng=rng, top_k=top_k or self.top_k, top_p=top_p or self.top_p, num_samples=1)[:, :, 0]
584
+ if vae_type != 0:
585
+ assert returns_vemb
586
+ if si < gt_leak:
587
+ idx_Bld = gt_ls_Bl[si]
588
+ else:
589
+ assert pn[0] == 1
590
+ idx_Bld = idx_Bld.reshape(B, pn[1], pn[2], -1) # shape: [B, h, w, d] or [B, h, w, 4d]
591
+ if self.apply_spatial_patchify: # unpatchify operation
592
+ idx_Bld = idx_Bld.permute(0,3,1,2) # [B, 4d, h, w]
593
+ idx_Bld = torch.nn.functional.pixel_shuffle(idx_Bld, 2) # [B, d, 2h, 2w]
594
+ idx_Bld = idx_Bld.permute(0,2,3,1) # [B, 2h, 2w, d]
595
+ idx_Bld = idx_Bld.unsqueeze(1) # [B, 1, h, w, d] or [B, 1, 2h, 2w, d]
596
+
597
+ idx_Bld_list.append(idx_Bld)
598
+ codes = vae.quantizer.lfq.indices_to_codes(idx_Bld, label_type='bit_label') # [B, d, 1, h, w] or [B, d, 1, 2h, 2w]
599
+ if si != num_stages_minus_1:
600
+ summed_codes += F.interpolate(codes, size=vae_scale_schedule[-1], mode=vae.quantizer.z_interplote_up)
601
+ last_stage = F.interpolate(summed_codes, size=vae_scale_schedule[si+1], mode=vae.quantizer.z_interplote_down) # [B, d, 1, h, w] or [B, d, 1, 2h, 2w]
602
+ last_stage = last_stage.squeeze(-3) # [B, d, h, w] or [B, d, 2h, 2w]
603
+ if self.apply_spatial_patchify: # patchify operation
604
+ last_stage = torch.nn.functional.pixel_unshuffle(last_stage, 2) # [B, 4d, h, w]
605
+ last_stage = last_stage.reshape(*last_stage.shape[:2], -1) # [B, d, h*w] or [B, 4d, h*w]
606
+ last_stage = torch.permute(last_stage, [0,2,1]) # [B, h*w, d] or [B, h*w, 4d]
607
+ else:
608
+ summed_codes += codes
609
+ else:
610
+ if si < gt_leak:
611
+ idx_Bl = gt_ls_Bl[si]
612
+ h_BChw = self.quant_only_used_in_inference[0].embedding(idx_Bl).float() # BlC
613
+
614
+ # h_BChw = h_BChw.float().transpose_(1, 2).reshape(B, self.d_vae, scale_schedule[si][0], scale_schedule[si][1])
615
+ h_BChw = h_BChw.transpose_(1, 2).reshape(B, self.d_vae, scale_schedule[si][0], scale_schedule[si][1], scale_schedule[si][2])
616
+ ret.append(h_BChw if returns_vemb != 0 else idx_Bl)
617
+ idx_Bl_list.append(idx_Bl)
618
+ if si != num_stages_minus_1:
619
+ accu_BChw, last_stage = self.quant_only_used_in_inference[0].one_step_fuse(si, num_stages_minus_1+1, accu_BChw, h_BChw, scale_schedule)
620
+
621
+ if si != num_stages_minus_1:
622
+ last_stage = self.word_embed(self.norm0_ve(last_stage))
623
+ last_stage = last_stage.repeat(bs//B, 1, 1)
624
+
625
+ if inference_mode:
626
+ for b in self.unregistered_blocks: (b.sa if isinstance(b, CrossAttnBlock) else b.attn).kv_caching(False)
627
+ else:
628
+ assert self.num_block_chunks > 1
629
+ for block_chunk_ in self.block_chunks:
630
+ for module in block_chunk_.module.module:
631
+ (module.sa if isinstance(module, CrossAttnBlock) else module.attn).kv_caching(False)
632
+
633
+ if not ret_img:
634
+ return ret, idx_Bl_list, []
635
+
636
+ if vae_type != 0:
637
+ img = vae.decode(summed_codes.squeeze(-3))
638
+ else:
639
+ img = vae.viz_from_ms_h_BChw(ret, scale_schedule=scale_schedule, same_shape=True, last_one=True)
640
+
641
+ img = (img + 1) / 2
642
+ img = img.permute(0, 2, 3, 1).mul_(255).to(torch.uint8).flip(dims=(3,))
643
+ return ret, idx_Bl_list, img
644
+
645
+ @for_visualize
646
+ def vis_key_params(self, ep):
647
+ return
648
+
649
+ def load_state_dict(self, state_dict: Dict[str, Any], strict=False, assign=False):
650
+ for k in state_dict:
651
+ if 'cfg_uncond' in k:
652
+ old, new = state_dict[k], self.cfg_uncond.data
653
+ min_tlen = min(old.shape[0], new.shape[0])
654
+ if min_tlen == old.shape[0]:
655
+ state_dict[k] = torch.cat((old.to(device=new.device, dtype=new.dtype), new[min_tlen:]))
656
+ else:
657
+ state_dict[k] = old[:min_tlen]
658
+
659
+ for buf_name in ('lvl_1L', 'attn_bias_for_masking', 'Infinity_visible_kvlen', 'Infinity_invisible_qlen'):
660
+ state_dict.pop(buf_name, None)
661
+ if hasattr(self, buf_name):
662
+ state_dict[buf_name] = getattr(self, buf_name)
663
+
664
+ return super().load_state_dict(state_dict=state_dict, strict=strict, assign=assign)
665
+
666
+ def special_init(
667
+ self,
668
+ aln_init: float,
669
+ aln_gamma_init: float,
670
+ scale_head: float,
671
+ scale_proj: int,
672
+ ):
673
+ # init head's norm
674
+ if isinstance(self.head_nm, AdaLNBeforeHead):
675
+ self.head_nm.ada_lin[-1].weight.data.mul_(aln_init) # there's no gamma for head
676
+ if hasattr(self.head_nm.ada_lin[-1], 'bias') and self.head_nm.ada_lin[-1].bias is not None:
677
+ self.head_nm.ada_lin[-1].bias.data.zero_()
678
+
679
+ # init head's proj
680
+ if scale_head >= 0:
681
+ if isinstance(self.head, nn.Linear):
682
+ self.head.weight.data.mul_(scale_head)
683
+ self.head.bias.data.zero_()
684
+ elif isinstance(self.head, nn.Sequential):
685
+ self.head[-1].weight.data.mul_(scale_head)
686
+ self.head[-1].bias.data.zero_()
687
+
688
+ depth = len(self.unregistered_blocks)
689
+ for block_idx, sab in enumerate(self.unregistered_blocks):
690
+ sab: Union[SelfAttnBlock, CrossAttnBlock]
691
+ # init proj
692
+ scale = 1 / math.sqrt(2*depth if scale_proj == 1 else 2*(1 + block_idx))
693
+ if scale_proj == 1:
694
+ if self.t2i:
695
+ sab.sa.proj.weight.data.mul_(scale)
696
+ sab.ca.proj.weight.data.mul_(scale)
697
+ else:
698
+ sab.attn.proj.weight.data.mul_(scale)
699
+ sab.ffn.fc2.weight.data.mul_(scale)
700
+ # if sab.using_swiglu:
701
+ # nn.init.ones_(sab.ffn.fcg.bias)
702
+ # nn.init.trunc_normal_(sab.ffn.fcg.weight, std=1e-5)
703
+
704
+ # init ada_lin
705
+ if hasattr(sab, 'ada_lin'):
706
+ lin = sab.ada_lin[-1]
707
+ lin.weight.data[:2*self.C].mul_(aln_gamma_init) # init gamma
708
+ lin.weight.data[2*self.C:].mul_(aln_init) # init scale and shift
709
+ if hasattr(lin, 'bias') and lin.bias is not None:
710
+ lin.bias.data.zero_()
711
+ elif hasattr(sab, 'ada_gss'):
712
+ sab.ada_gss.data[:, :, :2, :].mul_(aln_gamma_init) # init gamma
713
+ sab.ada_gss.data[:, :, 2:, :].mul_(aln_init) # init scale and shift
714
+
715
+ def extra_repr(self):
716
+ return f'drop_path_rate={self.drop_path_rate}'
717
+
718
+ def get_layer_id_and_scale_exp(self, para_name: str):
719
+ raise NotImplementedError
720
+
721
+
722
+ def sample_with_top_k_top_p_also_inplace_modifying_logits_(logits_BlV: torch.Tensor, top_k: int = 0, top_p: float = 0.0, rng=None, num_samples=1) -> torch.Tensor: # return idx, shaped (B, l)
723
+ B, l, V = logits_BlV.shape
724
+ if top_k > 0:
725
+ top_k = min(top_k, V)
726
+ idx_to_remove = logits_BlV < logits_BlV.topk(top_k, largest=True, sorted=False, dim=-1)[0].amin(dim=-1, keepdim=True)
727
+ logits_BlV.masked_fill_(idx_to_remove, -torch.inf)
728
+ if top_p > 0:
729
+ sorted_logits, sorted_idx = logits_BlV.sort(dim=-1, descending=False)
730
+ sorted_idx_to_remove = sorted_logits.softmax(dim=-1).cumsum_(dim=-1) <= (1 - top_p)
731
+ sorted_idx_to_remove[..., -1:] = False
732
+ logits_BlV.masked_fill_(sorted_idx_to_remove.scatter(sorted_idx.ndim - 1, sorted_idx, sorted_idx_to_remove), -torch.inf)
733
+ # sample (flatten to 2D first, since torch.multinomial only accepts 2D tensors)
734
+ replacement = num_samples >= 0
735
+ num_samples = abs(num_samples)
736
+ return torch.multinomial(logits_BlV.softmax(dim=-1).view(-1, V), num_samples=num_samples, replacement=replacement, generator=rng).view(B, l, num_samples)
737
+
738
+ def sampling_with_top_k_top_p_also_inplace_modifying_probs_(probs_BlV: torch.Tensor, top_k: int = 0, top_p: float = 0.0, rng=None, num_samples=1) -> torch.Tensor: # return idx, shaped (B, l)
739
+ B, l, V = probs_BlV.shape
740
+ if top_k > 0:
741
+ top_k = min(top_k, V)
742
+ idx_to_remove = probs_BlV < probs_BlV.topk(top_k, largest=True, sorted=False, dim=-1)[0].amin(dim=-1, keepdim=True)
743
+ probs_BlV.masked_fill_(idx_to_remove, 0)
744
+ if top_p > 0:
745
+ sorted_probs, sorted_idx = probs_BlV.sort(dim=-1, descending=False)
746
+ sorted_idx_to_remove = sorted_probs.softmax(dim=-1).cumsum_(dim=-1) <= (1 - top_p)
747
+ sorted_idx_to_remove[..., -1:] = False
748
+ probs_BlV.masked_fill_(sorted_idx_to_remove.scatter(sorted_idx.ndim - 1, sorted_idx, sorted_idx_to_remove), 0)
749
+ # sample (flatten to 2D first, since torch.multinomial only accepts 2D tensors)
750
+ probs_BlV = probs_BlV / probs_BlV.sum(-1, keepdim=True) # renormalize after masking
751
+ replacement = num_samples >= 0
752
+ num_samples = abs(num_samples)
753
+ return torch.multinomial(probs_BlV.view(-1, V), num_samples=num_samples, replacement=replacement, generator=rng).view(B, l, num_samples)
754
+
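A quick sketch of calling the sampler above; the tensor sizes are illustrative, and note that the helper modifies the logits it is given in place:

    import torch
    logits = torch.randn(2, 4, 8)  # (B, l, V), made-up sizes
    idx = sample_with_top_k_top_p_also_inplace_modifying_logits_(logits, top_k=3, top_p=0.9, num_samples=1)
    print(idx.shape)  # torch.Size([2, 4, 1])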
755
+
756
+ def get_params_num(d, w, mlp):
757
+ m = round(mlp * w / 256) * 256
758
+ s = d * (w**2 * 8 + w*m * 2) # sa+ca, mlp
759
+ s += w**2 * 6 # saln
760
+ s += 4096 * w # pred
761
+ s += 32 * w # we
762
+
763
+ Ct5 = 4096
764
+ s += Ct5*w * 4 # T5 attn pool
765
+ s += Ct5*w + w*w # T5 mlp
766
+ return f'{s/1e9:.2f}B'
767
+
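As a sanity check, plugging the 2B configuration registered below (depth 32, width 2048, MLP ratio 4) into this estimator:

    print(get_params_num(32, 2048, 4))  # -> '2.23B', consistent with the infinity_2b alias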
768
+
769
+ TIMM_KEYS = {'img_size', 'pretrained', 'pretrained_cfg', 'pretrained_cfg_overlay', 'global_pool'}
770
+
771
+ @register_model
772
+ def infinity_2b(depth=32, embed_dim=2048, num_heads=2048//128, drop_path_rate=0.1, **kwargs):
+ return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS})
773
+
774
+ @register_model
775
+ def infinity_20b(depth=58, embed_dim=4608, num_heads=4608//128, drop_path_rate=0.25, **kwargs):
+ return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS})
776
+
777
+ # model configuration for scaling Infinity transformer
778
+ @register_model
779
+ def infinity_layer12(depth=12, embed_dim=768, num_heads=8, drop_path_rate=0.1, **kwargs):
780
+ return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS})
781
+ @register_model
782
+ def infinity_layer16(depth=16, embed_dim=1152, num_heads=12, drop_path_rate=0.1, **kwargs):
783
+ return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS})
784
+ @register_model
785
+ def infinity_layer24(depth=24, embed_dim=1536, num_heads=16, drop_path_rate=0.1, **kwargs):
786
+ return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS})
787
+ @register_model
788
+ def infinity_layer32(depth=32, embed_dim=2080, num_heads=20, drop_path_rate=0.1, **kwargs):
789
+ return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS})
790
+ @register_model
791
+ def infinity_layer40(depth=40, embed_dim=2688, num_heads=24, drop_path_rate=0.1, **kwargs):
792
+ return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS})
793
+ @register_model
794
+ def infinity_layer48(depth=48, embed_dim=3360, num_heads=28, drop_path_rate=0.1, **kwargs):
795
+ return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **{k: v for k, v in kwargs.items() if k not in TIMM_KEYS})
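Because these builders are registered with timm, a model can be created by name; a minimal sketch (the keyword arguments Infinity actually requires, e.g. the text-encoder width, are omitted here and would be forwarded through **kwargs):

    from timm import create_model
    gpt = create_model('infinity_2b', drop_path_rate=0.1)  # kwargs are passed through to Infinity(...)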
models/init_param.py ADDED
@@ -0,0 +1,33 @@
1
+ import torch.nn as nn
2
+
3
+
4
+ def init_weights(model: nn.Module, conv_std_or_gain: float = 0.02, other_std: float = 0.02):
5
+ """
6
+ :param model: the model to initialize
7
+ :param conv_std_or_gain: how to init every conv layer `m`
8
+ > 0: nn.init.trunc_normal_(m.weight.data, std=conv_std_or_gain)
9
+ < 0: nn.init.xavier_normal_(m.weight.data, gain=-conv_std_or_gain)
10
+ :param other_std: how to init every linear layer or embedding layer
11
+ use nn.init.trunc_normal_(m.weight.data, std=other_std)
12
+ """
13
+ skip = abs(conv_std_or_gain) > 10
14
+ if skip: return
15
+ print(f'[init_weights] {type(model).__name__} with {"std" if conv_std_or_gain > 0 else "gain"}={abs(conv_std_or_gain):g}')
16
+ for m in model.modules():
17
+ if isinstance(m, nn.Linear):
18
+ nn.init.trunc_normal_(m.weight.data, std=other_std)
19
+ if m.bias is not None:
20
+ nn.init.constant_(m.bias.data, 0.)
21
+ elif isinstance(m, nn.Embedding):
22
+ nn.init.trunc_normal_(m.weight.data, std=other_std)
23
+ if m.padding_idx is not None:
24
+ m.weight.data[m.padding_idx].zero_()
25
+ elif isinstance(m, (nn.Conv1d, nn.Conv2d, nn.ConvTranspose1d, nn.ConvTranspose2d)):
26
+ nn.init.trunc_normal_(m.weight.data, std=conv_std_or_gain) if conv_std_or_gain > 0 else nn.init.xavier_normal_(m.weight.data, gain=-conv_std_or_gain) # todo: StyleSwin: (..., gain=.02)
27
+ if hasattr(m, 'bias') and m.bias is not None:
28
+ nn.init.constant_(m.bias.data, 0.)
29
+ elif isinstance(m, (nn.LayerNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm, nn.GroupNorm, nn.InstanceNorm1d, nn.InstanceNorm2d, nn.InstanceNorm3d)):
30
+ if m.bias is not None:
31
+ nn.init.constant_(m.bias.data, 0.)
32
+ if m.weight is not None:
33
+ nn.init.constant_(m.weight.data, 1.)
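A small usage sketch for init_weights (layer sizes are arbitrary):

    import torch.nn as nn
    net = nn.Sequential(nn.Linear(8, 16), nn.GELU(), nn.Linear(16, 8))
    init_weights(net, conv_std_or_gain=0.02, other_std=0.02)  # trunc-normal init for the Linear layers
    init_weights(net, conv_std_or_gain=-0.02)  # conv layers (if any) would get xavier_normal_ with gain=0.02
    init_weights(net, conv_std_or_gain=100.0)  # |value| > 10 skips initialization entirely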
models/t5.py ADDED
@@ -0,0 +1,369 @@
1
+ import re
2
+ import torch
3
+ import os
4
+ import traceback
5
+ import numpy as np
6
+ from huggingface_hub import hf_hub_download
7
+ from transformers import AutoTokenizer, T5EncoderModel
8
+
9
+ import ftfy
10
+ import html
11
+ from bs4 import BeautifulSoup
12
+ import urllib.parse as ul
13
+
14
+
15
+ class T5Embedder:
16
+
17
+ available_models = ['t5-v1_1-xxl']
18
+ bad_punct_regex = re.compile(r'['+'#®•©™&@·º½¾¿¡§~'+'\)'+'\('+'\]'+'\['+'\}'+'\{'+'\|'+'\\'+'\/'+'\*' + r']{1,}') # noqa
19
+
20
+ def __init__(self, device, dir_or_name='t5-v1_1-xxl', *, local_cache=False, cache_dir=None, hf_token=None, use_text_preprocessing=True,
21
+ t5_model_kwargs=None, torch_dtype=torch.bfloat16, use_offload_folder=None, model_max_length=512, padding="max_length", clean_caption_func_name="clean_caption"):
22
+ self.device = torch.device(device)
23
+ self.torch_dtype = torch_dtype
24
+ if t5_model_kwargs is None:
25
+ t5_model_kwargs = {'low_cpu_mem_usage': True, 'torch_dtype': self.torch_dtype}
26
+ if use_offload_folder is not None:
27
+ t5_model_kwargs['offload_folder'] = use_offload_folder
28
+ t5_model_kwargs['device_map'] = {
29
+ 'shared': self.device,
30
+ 'encoder.embed_tokens': self.device,
31
+ 'encoder.block.0': self.device,
32
+ 'encoder.block.1': self.device,
33
+ 'encoder.block.2': self.device,
34
+ 'encoder.block.3': self.device,
35
+ 'encoder.block.4': self.device,
36
+ 'encoder.block.5': self.device,
37
+ 'encoder.block.6': self.device,
38
+ 'encoder.block.7': self.device,
39
+ 'encoder.block.8': self.device,
40
+ 'encoder.block.9': self.device,
41
+ 'encoder.block.10': self.device,
42
+ 'encoder.block.11': self.device,
43
+ 'encoder.block.12': 'disk',
44
+ 'encoder.block.13': 'disk',
45
+ 'encoder.block.14': 'disk',
46
+ 'encoder.block.15': 'disk',
47
+ 'encoder.block.16': 'disk',
48
+ 'encoder.block.17': 'disk',
49
+ 'encoder.block.18': 'disk',
50
+ 'encoder.block.19': 'disk',
51
+ 'encoder.block.20': 'disk',
52
+ 'encoder.block.21': 'disk',
53
+ 'encoder.block.22': 'disk',
54
+ 'encoder.block.23': 'disk',
55
+ 'encoder.final_layer_norm': 'disk',
56
+ 'encoder.dropout': 'disk',
57
+ }
58
+ else:
59
+ t5_model_kwargs['device_map'] = {'shared': self.device, 'encoder': self.device}
60
+
61
+ self.use_text_preprocessing = use_text_preprocessing
62
+ self.hf_token = hf_token
63
+ self.cache_dir = cache_dir or os.path.expanduser('~/.cache/IF_')
64
+ self.dir_or_name = dir_or_name
65
+ tokenizer_path, path = dir_or_name, dir_or_name
66
+ if local_cache:
67
+ cache_dir = os.path.join(self.cache_dir, dir_or_name)
68
+ tokenizer_path, path = cache_dir, cache_dir
69
+ elif dir_or_name in self.available_models:
70
+ cache_dir = os.path.join(self.cache_dir, dir_or_name)
71
+ for filename in [
72
+ 'config.json', 'special_tokens_map.json', 'spiece.model', 'tokenizer_config.json',
73
+ 'pytorch_model.bin.index.json', 'pytorch_model-00001-of-00002.bin', 'pytorch_model-00002-of-00002.bin'
74
+ ]:
75
+ hf_hub_download(repo_id=f'DeepFloyd/{dir_or_name}', filename=filename, cache_dir=cache_dir,
76
+ force_filename=filename, token=self.hf_token)
77
+ tokenizer_path, path = cache_dir, cache_dir
78
+ else:
79
+ cache_dir = os.path.join(self.cache_dir, 't5-v1_1-xxl')
80
+ for filename in [
81
+ 'config.json', 'special_tokens_map.json', 'spiece.model', 'tokenizer_config.json',
82
+ ]:
83
+ hf_hub_download(repo_id='DeepFloyd/t5-v1_1-xxl', filename=filename, cache_dir=cache_dir,
84
+ force_filename=filename, token=self.hf_token)
85
+ tokenizer_path = cache_dir
86
+
87
+ print(f"Loading T5 from {tokenizer_path}")
88
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
89
+ self.model = T5EncoderModel.from_pretrained(path, **t5_model_kwargs).eval()
90
+ self.model_max_length = model_max_length
91
+ self.padding = padding
92
+ self.clean_caption_func = getattr(self, clean_caption_func_name)
93
+
94
+ @torch.no_grad()
95
+ def get_text_embeddings(self, texts):
96
+ import time
97
+ start_time = time.time()
98
+
99
+ texts = [self.text_preprocessing(text) for text in texts]
100
+ # print("text_preprocessing: ", time.time() - start_time)
101
+
102
+ text_tokens_and_mask = self.tokenizer(
103
+ texts,
104
+ max_length=self.model_max_length,
105
+ padding=self.padding,
106
+ truncation=True,
107
+ return_attention_mask=True,
108
+ add_special_tokens=True,
109
+ return_tensors='pt'
110
+ )
111
+
112
+ # print("tokenizer: ", time.time() - start_time)
113
+
114
+ text_tokens_and_mask['input_ids'] = text_tokens_and_mask['input_ids'].to(self.device)
115
+ text_tokens_and_mask['attention_mask'] = text_tokens_and_mask['attention_mask'].to(self.device)
116
+
117
+ with torch.no_grad():
118
+ text_encoder_embs = self.model(
119
+ input_ids=text_tokens_and_mask['input_ids'],
120
+ attention_mask=text_tokens_and_mask['attention_mask'],
121
+ )['last_hidden_state'].detach()
122
+
123
+ # print("model: ", time.time() - start_time)
124
+ return text_encoder_embs, text_tokens_and_mask['attention_mask'], text_tokens_and_mask['input_ids'], texts
125
+
126
+ def text_preprocessing(self, text):
127
+ if self.use_text_preprocessing:
128
+ try:
129
+ # The exact text cleaning used at the training stage (the cleaning pass is applied twice):
130
+ text = self.clean_caption_func(text)
131
+ text = self.clean_caption_func(text)
132
+ return text
133
+ except Exception as e:
134
+ print(f"Error in text preprocessing: {e} with text: {text}")
135
+ print(traceback.format_exc())
136
+ return text
137
+ else:
138
+ return text.lower().strip()
139
+
140
+ @staticmethod
141
+ def basic_clean(text):
142
+ text = ftfy.fix_text(text)
143
+ text = html.unescape(html.unescape(text))
144
+ return text.strip()
145
+
146
+ def clean_caption(self, caption):
147
+ caption = str(caption)
148
+ caption = ul.unquote_plus(caption)
149
+ caption = caption.strip().lower()
150
+ caption = re.sub('<person>', 'person', caption)
151
+ # urls:
152
+ caption = re.sub(
153
+ r'\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))', # noqa
154
+ '', caption) # regex for urls
155
+ caption = re.sub(
156
+ r'\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))', # noqa
157
+ '', caption) # regex for urls
158
+ # html:
159
+ try:
160
+ caption = BeautifulSoup(caption, features='html.parser').text
161
+ except Exception as e:
162
+ print(f"Error parsing caption:{caption} with html.parser: {e}")
163
+
164
+ # @<nickname>
165
+ caption = re.sub(r'@[\w\d]+\b', '', caption)
166
+
167
+ # 31C0—31EF CJK Strokes
168
+ # 31F0—31FF Katakana Phonetic Extensions
169
+ # 3200—32FF Enclosed CJK Letters and Months
170
+ # 3300—33FF CJK Compatibility
171
+ # 3400—4DBF CJK Unified Ideographs Extension A
172
+ # 4DC0—4DFF Yijing Hexagram Symbols
173
+ # 4E00—9FFF CJK Unified Ideographs
174
+ caption = re.sub(r'[\u31c0-\u31ef]+', '', caption)
175
+ caption = re.sub(r'[\u31f0-\u31ff]+', '', caption)
176
+ caption = re.sub(r'[\u3200-\u32ff]+', '', caption)
177
+ caption = re.sub(r'[\u3300-\u33ff]+', '', caption)
178
+ caption = re.sub(r'[\u3400-\u4dbf]+', '', caption)
179
+ caption = re.sub(r'[\u4dc0-\u4dff]+', '', caption)
180
+ caption = re.sub(r'[\u4e00-\u9fff]+', '', caption)
181
+ #######################################################
182
+
183
+ # normalize all types of dash to "-"
184
+ caption = re.sub(
185
+ r'[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+', # noqa
186
+ '-', caption)
187
+
188
+ # standardize quotation marks
189
+ caption = re.sub(r'[`´«»“”¨]', '"', caption)
190
+ caption = re.sub(r'[‘’]', "'", caption)
191
+
192
+ # &quot;
193
+ caption = re.sub(r'&quot;?', '', caption)
194
+ # &amp
195
+ caption = re.sub(r'&amp', '', caption)
196
+
197
+ # ip addresses:
198
+ caption = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' ', caption)
199
+
200
+ # article ids:
201
+ caption = re.sub(r'\d:\d\d\s+$', '', caption)
202
+
203
+ # \n
204
+ caption = re.sub(r'\\n', ' ', caption)
205
+
206
+ # "#123"
207
+ caption = re.sub(r'#\d{1,3}\b', '', caption)
208
+ # "#12345.."
209
+ caption = re.sub(r'#\d{5,}\b', '', caption)
210
+ # "123456.."
211
+ caption = re.sub(r'\b\d{6,}\b', '', caption)
212
+ # filenames:
213
+ caption = re.sub(r'[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)', '', caption)
214
+
215
+ #
216
+ caption = re.sub(r'[\"\']{2,}', r'"', caption) # """AUSVERKAUFT"""
217
+ caption = re.sub(r'[\.]{2,}', r' ', caption) # """AUSVERKAUFT"""
218
+
219
+ caption = re.sub(self.bad_punct_regex, r' ', caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
220
+ caption = re.sub(r'\s+\.\s+', r' ', caption) # " . "
221
+
222
+ # this-is-my-cute-cat / this_is_my_cute_cat
223
+ regex2 = re.compile(r'(?:\-|\_)')
224
+ if len(re.findall(regex2, caption)) > 3:
225
+ caption = re.sub(regex2, ' ', caption)
226
+
227
+ caption = self.basic_clean(caption)
228
+
229
+ caption = re.sub(r'\b[a-zA-Z]{1,3}\d{3,15}\b', '', caption) # jc6640
230
+ caption = re.sub(r'\b[a-zA-Z]+\d+[a-zA-Z]+\b', '', caption) # jc6640vc
231
+ caption = re.sub(r'\b\d+[a-zA-Z]+\d+\b', '', caption) # 6640vc231
232
+
233
+ caption = re.sub(r'(worldwide\s+)?(free\s+)?shipping', '', caption)
234
+ caption = re.sub(r'(free\s)?download(\sfree)?', '', caption)
235
+ caption = re.sub(r'\bclick\b\s(?:for|on)\s\w+', '', caption)
236
+ caption = re.sub(r'\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?', '', caption)
237
+ caption = re.sub(r'\bpage\s+\d+\b', '', caption)
238
+
239
+ caption = re.sub(r'\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b', r' ', caption) # j2d1a2a...
240
+
241
+ caption = re.sub(r'\b\d+\.?\d*[xх×]\d+\.?\d*\b', '', caption)
242
+
243
+ caption = re.sub(r'\b\s+\:\s+', r': ', caption)
244
+ caption = re.sub(r'(\D[,\./])\b', r'\1 ', caption)
245
+ caption = re.sub(r'\s+', ' ', caption)
246
+
247
+ caption = caption.strip()
248
+
249
+ caption = re.sub(r'^[\"\']([\w\W]+)[\"\']$', r'\1', caption)
250
+ caption = re.sub(r'^[\'\_,\-\:;]', r'', caption)
251
+ caption = re.sub(r'[\'\_,\-\:\-\+]$', r'', caption)
252
+ caption = re.sub(r'^\.\S+$', '', caption)
253
+
254
+ return caption.strip()
255
+
256
+
+ def clean_caption_simplify(self, caption):
+ # convert the caption to a string
+ caption = str(caption)
+
+ # decode URL-encoded sequences
+ caption = ul.unquote_plus(caption)
+
+ # strip leading/trailing whitespace and lowercase
+ caption = caption.strip().lower()
+
+ # replace '<person>' with 'person'
+ caption = re.sub('<person>', 'person', caption)
+
+ # remove URLs
+ caption = re.sub(
+ r'\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))',
+ '', caption) # matches URLs starting with http:// or https://
+ caption = re.sub(
+ r'\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))',
+ '', caption) # matches URLs starting with www.
+
+ # parse the HTML and drop the tags
+ caption = BeautifulSoup(caption, features='html.parser').text
+
+ # remove @nickname mentions
+ caption = re.sub(r'@[\w\d]+\b', '', caption)
+
+ # remove characters from CJK-related Unicode ranges
+ caption = re.sub(r'[\u31c0-\u31ef]+', '', caption) # CJK Strokes
+ caption = re.sub(r'[\u31f0-\u31ff]+', '', caption) # Katakana Phonetic Extensions
+ caption = re.sub(r'[\u3200-\u32ff]+', '', caption) # Enclosed CJK Letters and Months
+ caption = re.sub(r'[\u3300-\u33ff]+', '', caption) # CJK Compatibility
+ caption = re.sub(r'[\u3400-\u4dbf]+', '', caption) # CJK Unified Ideographs Extension A
+ caption = re.sub(r'[\u4dc0-\u4dff]+', '', caption) # Yijing Hexagram Symbols
+ caption = re.sub(r'[\u4e00-\u9fff]+', '', caption) # CJK Unified Ideographs
+
+ # normalize all types of dash to "-"
+ caption = re.sub(
+ r'[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+',
+ '-', caption) # matches the various Unicode dashes
+
+ # standardize the different kinds of quotation marks
+ caption = re.sub(r'[`´«»“”¨]', '"', caption) # replace assorted double quotes with the standard one
+ caption = re.sub(r'[‘’]', "'", caption) # replace left/right single quotes with the standard one
+
+ # remove &quot; and &amp
+ caption = re.sub(r'&quot;?', '', caption) # remove the HTML entity &quot;
+ caption = re.sub(r'&amp', '', caption) # remove the HTML entity &amp
+
+ # remove IP addresses
+ caption = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' ', caption) # matches IPv4 addresses
+
+ # remove article-id formats
+ caption = re.sub(r'\d:\d\d\s+$', '', caption) # matches trailing patterns like '1:23 '
+
+ # remove escaped \n sequences
+ caption = re.sub(r'\\n', ' ', caption)
+
+ # remove tags of specific formats
+ # caption = re.sub(r'#\d{1,3}\b', '', caption) # #123: '#' followed by 1-3 digits
+ # caption = re.sub(r'#\d{5,}\b', '', caption) # #12345..: '#' followed by 5+ digits
+ # caption = re.sub(r'\b\d{6,}\b', '', caption) # 123456..: bare runs of 6+ digits
+
+ # remove filenames
+ caption = re.sub(r'[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)', '', caption) # matches complete image/video filenames, name plus extension
+
+ # collapse repeated quotes and dots
+ caption = re.sub(r'[\"\']{2,}', r'"', caption) # runs of quotes become a single double quote
+ caption = re.sub(r'[\.]{2,}', r' ', caption) # runs of dots become a space
+
+ # clean junk punctuation with the shared regex
+ caption = re.sub(self.bad_punct_regex, r' ', caption) # the custom junk-punctuation pattern
+ caption = re.sub(r'\s+\.\s+', r' ', caption) # remove a dot surrounded by spaces
+
+ # break up text that has too many dashes or underscores
+ regex2 = re.compile(r'(?:\-|\_)')
+ if len(re.findall(regex2, caption)) > 3:
+ caption = re.sub(regex2, ' ', caption)
+
+ # basic cleaning
+ caption = self.basic_clean(caption)
+
+ # remove short strings of specific formats
+ # caption = re.sub(r'\b[a-zA-Z]{1,3}\d{3,15}\b', '', caption) # up to 3 letters followed by 3+ digits
+ # caption = re.sub(r'\b[a-zA-Z]+\d+[a-zA-Z]+\b', '', caption) # letter-digit-letter strings
+ # caption = re.sub(r'\b\d+[a-zA-Z]+\d+\b', '', caption) # digit-letter-digit strings
+
+ # remove specific advertising or call-to-action phrases
+ # caption = re.sub(r'(worldwide\s+)?(free\s+)?shipping', '', caption) # 'worldwide free shipping', 'free shipping'
+ # caption = re.sub(r'(free\s)?download(\sfree)?', '', caption) # 'free download', 'download free'
+ # caption = re.sub(r'\bclick\b\s(?:for|on)\s\w+', '', caption) # 'click for ...' or 'click on ...'
+ # caption = re.sub(r'\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?', '', caption) # bare file extensions, possibly followed by 'image(s)'
+ # caption = re.sub(r'\bpage\s+\d+\b', '', caption) # 'page 123'
+
+ # remove strings with complex digit/letter patterns
+ # caption = re.sub(r'\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b', r' ', caption) # 123A456B789
+
+ # remove dimension-like identifiers, e.g. 1920x1080
+ caption = re.sub(r'\b\d+\.?\d*[xх×]\d+\.?\d*\b', '', caption)
+
+ # fix extra whitespace and punctuation
+ caption = re.sub(r'\b\s+\:\s+', r': ', caption)
+ caption = re.sub(r'(\D[,\./])\b', r'\1 ', caption)
+ caption = re.sub(r'\s+', ' ', caption)
+
+ # trim stray characters at the ends
+ caption = caption.strip()
+ caption = re.sub(r'^[\"\']([\w\W]+)[\"\']$', r'\1', caption)
+ caption = re.sub(r'^[\'\_,\-\:;]', r'', caption)
+ caption = re.sub(r'[\'\_,\-\:\-\+]$', r'', caption)
+ caption = re.sub(r'^\.\S+$', '', caption)
+
+ return caption.strip()
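End to end, the embedder is used roughly as follows (the device, the prompt, and access to the DeepFloyd checkpoint are assumptions; the shape follows t5-v1_1-xxl with max_length padding):

    import torch
    t5 = T5Embedder(device='cuda', dir_or_name='t5-v1_1-xxl', torch_dtype=torch.bfloat16, model_max_length=512)
    embs, mask, ids, cleaned = t5.get_text_embeddings(['A photo of a cat ***NEW*** https://example.com'])
    print(embs.shape)  # torch.Size([1, 512, 4096])
    print(cleaned[0])  # roughly 'a photo of a cat new' after the two cleaning passes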
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ torch
+ opencv-python
+ numpy
+ gradio
+ huggingface-hub
+ transformers
+ spaces
utils/amp_opt.py ADDED
@@ -0,0 +1,187 @@
1
+ import math
2
+ import os
3
+ import signal
4
+ import sys
5
+ import time
6
+ from typing import List, Optional, Tuple, Union
7
+
8
+ import torch
9
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
10
+ # from memory_profiler import profile
11
+
12
+ import infinity.utils.dist as dist
13
+ from infinity.utils import misc
14
+
15
+ class NullCtx:
16
+ def __enter__(self):
17
+ pass
18
+
19
+ def __exit__(self, exc_type, exc_val, exc_tb):
20
+ pass
21
+
22
+
23
+ def handle_timeout(signum, frame):
24
+ raise TimeoutError('took too long')
25
+
26
+
27
+ def per_param_clip_grad_norm_(parameters, thresh: float, stable=False, fp=None) -> Tuple[float, float]:
28
+ skipped, max_grad = [], 0
29
+ for pi, p in enumerate(parameters):
30
+ if p.grad is not None:
31
+ g = p.grad.data.norm(2).item() + 1e-7
32
+ max_grad = max(max_grad, g)
33
+ clip_coef = thresh / g
34
+ if clip_coef < 1:
35
+ if stable and clip_coef < 0.2:
36
+ skipped.append(clip_coef)
37
+ p.grad.data.mul_(0) # todo NOTE: inf.mul_(0)==nan will shrink the scale ratio, but inf.zero_()==0 won't
38
+ else:
39
+ p.grad.data.mul_(clip_coef)
40
+
41
+ # if fp is not None: fp.write(f'[per_param_clip_grad_norm_:47] finished.\n'); fp.flush()
42
+ return 0 if len(skipped) == 0 else math.log10(max(min(skipped), 1e-7)), max_grad
43
+
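A self-contained sketch of the per-parameter clipper on a toy module (the threshold is illustrative):

    import torch
    import torch.nn as nn
    net = nn.Linear(4, 4)
    net(torch.randn(2, 4)).sum().backward()
    log_min_coef, max_grad = per_param_clip_grad_norm_(net.parameters(), thresh=0.5)
    print(max_grad)  # largest per-parameter grad norm observed before clipping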
44
+
45
+ class AmpOptimizer:
46
+ def __init__(
47
+ self,
48
+ model_name_3letters: str, mixed_precision: int,
49
+ optimizer: torch.optim.Optimizer, model_maybe_fsdp: Union[torch.nn.Module, FSDP],
50
+ r_accu: float, grad_clip: float, zero: int,
51
+ ):
52
+ self.enable_amp = mixed_precision > 0
53
+ self.zero = zero
54
+ if self.enable_amp:
55
+ self.using_fp16_rather_bf16 = mixed_precision != 2
56
+ self.max_sc = float(mixed_precision if mixed_precision > 128 else 32768)
57
+
58
+ # todo: on both V100 and A100, torch.get_autocast_gpu_dtype() returns fp16, not bf16.
59
+ self.amp_ctx = torch.autocast('cuda', enabled=True, dtype=torch.float16 if self.using_fp16_rather_bf16 else torch.bfloat16, cache_enabled=self.zero == 0) # todo: cache_enabled=False
60
+ if self.using_fp16_rather_bf16:
61
+ self.scaler = torch.cuda.amp.GradScaler(init_scale=2. ** 11, growth_interval=1000)
62
+ else:
63
+ self.scaler = None
64
+ else:
65
+ self.using_fp16_rather_bf16 = True
66
+ self.amp_ctx = NullCtx()
67
+ self.scaler = None
68
+
69
+ t = torch.zeros(dist.get_world_size())
70
+ t[dist.get_rank()] = float(self.enable_amp)
71
+ dist.allreduce(t)
72
+ assert round(t.sum().item()) in {0, dist.get_world_size()}, f'enable_amp: {t}'
73
+
74
+ t = torch.zeros(dist.get_world_size())
75
+ t[dist.get_rank()] = float(self.using_fp16_rather_bf16)
76
+ dist.allreduce(t)
77
+ assert round(t.sum().item()) in {0, dist.get_world_size()}, f'using_fp16_rather_bf16: {t}'
78
+
79
+ self.model_name_3letters = model_name_3letters
80
+ self.optimizer, self.model_maybe_fsdp = optimizer, model_maybe_fsdp
81
+ self.r_accu = r_accu
82
+
83
+ self.paras = self.names = ... # todo: solve EMA-related code
84
+
85
+ self.grad_clip, self.grad_clip_we = grad_clip, 0 # todo: disable wclip
86
+ if self.grad_clip > 100:
87
+ self.grad_clip %= 100
88
+ self.per_param = True
89
+ else:
90
+ self.per_param = False
91
+ self.per_param = False # todo: disable wclip
92
+
93
+ self.early_clipping = grad_clip > 0 and not hasattr(optimizer, 'global_grad_norm')
94
+ self.late_clipping = grad_clip > 0 and hasattr(optimizer, 'global_grad_norm') # deepspeed's optimizer
95
+
96
+ self.fp = None
97
+ self.last_orig_norm: torch.Tensor = torch.tensor(0.1)
98
+
99
+ @torch.no_grad()
100
+ def log_param(self, ep: int):
101
+ if self.zero == 0:
102
+ for name, values in get_param_for_log(self.model_name_3letters, self.model_maybe_fsdp.named_parameters()).items():
103
+ values: List[float]
104
+ if len(values) == 1: # e.g., cls token will only have one value
105
+ values.append(values[0])
106
+ else:
107
+ ...
108
+ # todo: log params
109
+
110
+ # @profile(precision=4, stream=open('amp_sc.log', 'w+'))
111
+ def backward_clip_step(
112
+ self, ep: int, it: int, g_it: int, stepping: bool, logging_params: bool, loss: torch.Tensor, clip_decay_ratio=1, stable=False,
113
+ ) -> Tuple[torch.Tensor, Optional[float]]:
114
+ # backward
115
+ loss = loss.mul(self.r_accu) # r_accu == 1.0 / n_gradient_accumulation
116
+ orig_norm = scaler_sc = None
117
+ # if self.fp is not None:
118
+ # if g_it % 20 == 0: self.fp.seek(0); self.fp.truncate(0)
119
+ if self.scaler is not None:
120
+ self.scaler.scale(loss).backward(retain_graph=False, create_graph=False) # retain_graph=retain_graph, create_graph=create_graph
121
+ else:
122
+ loss.backward(retain_graph=False, create_graph=False)
123
+ # if self.fp is not None: self.fp.write(f'[backward_clip_step:131] [it{it}, g_it{g_it}] after backward\n'); self.fp.flush()
124
+
125
+ # clip gradients then step optimizer
126
+ if stepping:
127
+ if self.scaler is not None: self.scaler.unscale_(self.optimizer) # now the gradient can be correctly got
128
+ # if self.fp is not None: self.fp.write(f'[backward_clip_step:137] [it{it}, g_it{g_it}] after scaler.unscale_\n'); self.fp.flush()
129
+
130
+ skipped, orig_norm = 0, self.last_orig_norm
131
+ # try:
132
+ if self.fp is not None:
133
+ if g_it % 10 == 0: self.fp.seek(0); self.fp.truncate(0)
134
+ self.fp.write(f'<ep{ep} it{it} {g_it}>\n'); self.fp.flush()
135
+ if self.early_clipping:
136
+ c = self.grad_clip * clip_decay_ratio
137
+ if self.zero:
138
+ orig_norm: Optional[torch.Tensor] = self.model_maybe_fsdp.clip_grad_norm_(c)
139
+ else:
140
+ orig_norm: Optional[torch.Tensor] = torch.nn.utils.clip_grad_norm_(self.model_maybe_fsdp.parameters(), c)
141
+
142
+ # if self.fp is not None: self.fp.write(f'[backward_clip_step:175] [it{it}, g_it{g_it}] before opt step\n'); self.fp.flush()
143
+ if self.scaler is not None:
144
+ self.scaler: torch.cuda.amp.GradScaler
145
+ if self.zero:
146
+ # synchronize found_inf_per_device before calling step, so that even if only some ranks found inf on their sharded params, all other ranks will know
147
+ # otherwise, when saving FSDP optimizer state, it will cause AssertionError saying "Different ranks have different values for step."
148
+ for optimizer_state in self.scaler._per_optimizer_states.values():
149
+ for t in optimizer_state['found_inf_per_device'].values():
150
+ dist.allreduce(t) # ideally, each rank only has one single t; so no need to use async allreduce
151
+
152
+ self.scaler.step(self.optimizer)
153
+ scaler_sc: Optional[float] = self.scaler.get_scale()
154
+ if scaler_sc > self.max_sc: # fp16 will overflow when >65536, so multiply 32768 could be dangerous
155
+ # print(f'[fp16 scaling] too large loss scale {scaler_sc}! (clip to {self.max_sc:g})')
156
+ self.scaler.update(new_scale=self.max_sc)
157
+ else:
158
+ self.scaler.update()
159
+ try:
160
+ scaler_sc = float(math.log2(scaler_sc))
161
+ except Exception as e:
162
+ print(f'[scaler_sc = {scaler_sc}]\n' * 15, flush=True)
163
+ time.sleep(1)
164
+ print(f'[scaler_sc = {scaler_sc}]\n' * 15, flush=True)
165
+ raise e
166
+ else:
167
+ self.optimizer.step()
168
+
169
+ if self.late_clipping:
170
+ orig_norm: Optional[torch.Tensor] = self.optimizer.global_grad_norm
171
+ self.last_orig_norm = orig_norm
172
+ # no zero_grad calling here, gonna log those gradients!
173
+ return orig_norm, scaler_sc
174
+
175
+ def state_dict(self):
176
+ return {
177
+ 'optimizer': self.optimizer.state_dict()
178
+ } if self.scaler is None else {
179
+ 'scaler': self.scaler.state_dict(),
180
+ 'optimizer': self.optimizer.state_dict()
181
+ }
182
+
183
+ def load_state_dict(self, state, strict=True):
184
+ if self.scaler is not None:
185
+ try: self.scaler.load_state_dict(state['scaler'])
186
+ except Exception as e: print(f'[fp16 load_state_dict err] {e}')
187
+ self.optimizer.load_state_dict(state['optimizer'])
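A sketch of driving AmpOptimizer with gradient accumulation; the toy model, the data, and a single-process fallback for the dist helpers are assumptions (mixed_precision=2 selects bf16, so no GradScaler is involved):

    import torch
    import torch.nn as nn
    net = nn.Linear(8, 8).cuda()
    opt = torch.optim.AdamW(net.parameters(), lr=1e-4)
    amp_opt = AmpOptimizer('gpt', mixed_precision=2, optimizer=opt, model_maybe_fsdp=net, r_accu=0.25, grad_clip=2.0, zero=0)
    for it in range(8):
        with amp_opt.amp_ctx:
            loss = net(torch.randn(2, 8, device='cuda')).pow(2).mean()
        stepping = (it + 1) % 4 == 0  # step once every 4 micro-batches
        norm, log2_scale = amp_opt.backward_clip_step(ep=0, it=it, g_it=it, stepping=stepping, logging_params=False, loss=loss)
        if stepping:
            opt.zero_grad(set_to_none=True)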
utils/arg_util.py ADDED
@@ -0,0 +1,482 @@
1
+ import json
2
+ import math
3
+ import os
4
+ import random
5
+ import subprocess
6
+ import sys
7
+ import time
8
+ from collections import OrderedDict, deque
9
+ from typing import Optional, Union
10
+
11
+ import numpy as np
12
+ import torch
13
+ from tap import Tap
14
+
15
+ import infinity.utils.dist as dist
16
+
17
+
18
+ class Args(Tap):
19
+ local_out_path: str = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'local_output') # directory for saving checkpoints
20
+ data_path: str = '' # dataset
21
+ bed: str = '' # extra directory to copy checkpoints to, in addition to local_out_path
22
+ vae_ckpt: str = '' # VAE ckpt
23
+ exp_name: str = '' # experiment name
24
+ ds: str = 'oi' # only used in GPT training::load_viz_data & FID benchmark
25
+ model: str = '' # for VAE training, 'b' or any other for GPT training
26
+ short_cap_prob: float = 0.2 # prob for training with short captions
27
+ project_name: str = 'Infinity' # name of wandb project
28
+ tf32: bool = True # whether to use TensorFloat32
29
+ auto_resume: bool = True # whether to automatically resume from the last checkpoint found in args.bed
30
+ rush_resume: str = '' # pretrained infinity checkpoint
31
+ nowd: int = 1 # whether to disable weight decay on sparse params (like class token)
32
+ enable_hybrid_shard: bool = False # whether to use hybrid FSDP
33
+ inner_shard_degree: int = 1 # inner degree for FSDP
34
+ zero: int = 0 # ds zero
35
+ buck: str = 'chunk' # =0 for using module-wise
36
+ fsdp_orig: bool = True
37
+ enable_checkpointing: str = None # checkpointing strategy: full-block, self-attn
38
+ pad_to_multiplier: int = 1 # >1 for padding the seq len to a multiplier of this
39
+ log_every_iter: bool = False
40
+ checkpoint_type: str = 'torch' # checkpoint_type: torch, onmistore
41
+ seed: int = None # 3407
42
+ rand: bool = True # actual seed = seed + (dist.get_rank()*512 if rand else 0)
43
+ device: str = 'cpu'
44
+ task_id: str = '2493513'
45
+ trial_id: str = '7260554'
46
+ robust_run_id: str = '00'
47
+ ckpt_trials = []
48
+ real_trial_id: str = '7260552'
49
+ chunk_nodes: int = None
50
+ is_master_node: bool = None
51
+ # dir
52
+ log_txt_path: str = ''
53
+ t5_path: str = '' # if not specified: automatically find from all bytenas
54
+ online_t5: bool = True # whether to use online t5 or load local features
55
+ # GPT
56
+ sdpa_mem: bool = True # whether to use with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_math=False, enable_mem_efficient=True)
57
+ tfast: int = 0 # compile GPT
58
+ model_alias: str = 'b' # [automatically set; don't specify this]
59
+ rms: bool = False
60
+ aln: float = 1e-3 # multiplier of ada_lin.w's initialization
61
+ alng: float = -1 # multiplier of ada_lin.w[gamma channels]'s initialization, -1: the same as aln
62
+ saln: bool = False # whether to use a shared adaln layer
63
+ haln: bool = True # whether to use a specific adaln layer in head layer
64
+ nm0: bool = False # norm before word proj linear
65
+ tau: float = 1 # tau of self attention in GPT
66
+ cos: bool = True # cosine attn as in swin v2
67
+ swi: bool = False # whether to use FFNSwiGLU, instead of vanilla FFN
68
+ dp: float = -1
69
+ drop: float = 0.0 # GPT's dropout (VAE's is --vd)
70
+ hd: int = 0
71
+ ca_gamma: float = -1 # >=0 for using layer-scale for cross attention
72
+ diva: int = 1 # rescale_attn_fc_weights
73
+ hd0: float = 0.02 # head.w *= hd0
74
+ dec: int = 1 # dec depth
75
+ cum: int = 3 # cumulating fea map as GPT TF input, 0: not cum; 1: cum @ next hw, 2: cum @ final hw
76
+ rwe: bool = False # random word emb
77
+ tp: float = 0.0 # top-p
78
+ tk: float = 0.0 # top-k
79
+ tini: float = 0.02 # init parameters
80
+ cfg: float = 0.1 # >0: classifier-free guidance, drop cond with prob cfg
81
+ rand_uncond = False # whether to use a random, unlearnable uncond embedding
82
+ ema: float = 0.9999 # VAE's ema ratio, not VAR's. 0.9977844 == 0.5 ** (32 / (10 * 1000)) from gans, 0.9999 from SD
83
+ tema: float = 0 # 0.9999 in DiffiT, DiT
84
+ fp16: int = 0 # 1: fp16, 2: bf16, >2: fp16's max scaling multiplier. todo: remember to force quantize-related features to fp32, and preferably keep the residual in fp32 too (per flash-attention); does nn.Conv2d have a use_float16 parameter?
85
+ fuse: bool = False # whether to use fused mlp
86
+ fused_norm: bool = False # whether to use fused norm
87
+ flash: bool = False # whether to use customized flash-attn kernel
88
+ xen: bool = False # whether to use xentropy
89
+ use_flex_attn: bool = False # whether to use flex_attn to speedup training
90
+ stable: bool = False
91
+ gblr: float = 1e-4
92
+ dblr: float = None # =gblr if is None
93
+ tblr: float = 6e-4
94
+ glr: float = None
95
+ dlr: float = None
96
+ tlr: float = None # vqgan: 4e-5
97
+ gwd: float = 0.005
98
+ dwd: float = 0.0005
99
+ twd: float = 0.005 # vqgan: 0.01
100
+ gwde: float = 0
101
+ dwde: float = 0
102
+ twde: float = 0
103
+ ls: float = 0.0 # label smooth
104
+ lz: float = 0.0 # z loss from PaLM = 1e-4 todo
105
+ eq: int = 0 # equalized loss
106
+ ep: int = 100
107
+ wp: float = 0
108
+ wp0: float = 0.005
109
+ wpe: float = 0.3 # 0.001, final cosine lr = wpe * peak lr
110
+ sche: str = '' # cos, exp, lin
111
+ log_freq: int = 50 # log frequency in the stdout
112
+ gclip: float = 6. # <=0 for not grad clip VAE
113
+ dclip: float = 6. # <=0 for not grad clip discriminator
114
+ tclip: float = 2. # <=0 for not grad clip GPT; >100 for per-param clip (%= 100 automatically)
115
+ cdec: bool = False # decay the grad clip thresholds of GPT and GPT's word embed
116
+ opt: str = 'adamw' # lion: https://cloud.tencent.com/developer/article/2336657?areaId=106001 lr=5e-5 (4x lower than Adam's) and wd=0.8 (8x higher than Adam's); e.g., with a small batch_size, Lion underperforms AdamW
117
+ ada: str = '' # adam's beta0 and beta1 for VAE or GPT, '0_0.99' from style-swin and magvit, '0.5_0.9' from VQGAN
118
+ dada: str = '' # adam's beta0 and beta1 for discriminator
119
+ oeps: float = 0 # adam's eps, pixart uses 1e-10
120
+ afuse: bool = True # fused adam
121
+ # data
122
+ pn: str = '' # pixel nums, choose from 0.06M, 0.25M, 1M
123
+ scale_schedule: tuple = None # [automatically set; don't specify this] = tuple(map(int, args.pn.replace('-', '_').split('_')))
124
+ patch_size: int = None # [automatically set; don't specify this] = 2 ** (len(args.scale_schedule) - 1)
125
+ resos: tuple = None # [automatically set; don't specify this]
126
+ data_load_reso: int = None # [automatically set; don't specify this]
127
+ workers: int = 0 # num workers; 0: auto, -1: don't use multiprocessing in DataLoader
128
+ lbs: int = 0 # local batch size; if lbs != 0, bs will be ignored, and will be reset as round(args.lbs / args.ac) * dist.get_world_size()
129
+ bs: int = 0 # global batch size; if lbs != 0, bs will be ignored
130
+ batch_size: int = 0 # [automatically set; don't specify this] batch size per GPU = round(args.bs / args.ac / dist.get_world_size())
131
+ glb_batch_size: int = 0 # [automatically set; don't specify this] global batch size = args.batch_size * dist.get_world_size()
132
+ ac: int = 1 # gradient accumulation
133
+ r_accu: float = 1.0 # [automatically set; don't specify this] = 1 / args.ac
134
+ norm_eps: float = 1e-6 # norm eps for infinity
135
+ tlen: int = 512 # truncate text embedding to this length
136
+ Ct5: int = 2048 # feature dimension of text encoder
137
+ use_bit_label: int = 1 # pred bitwise labels or index-wise labels
138
+ bitloss_type: str = 'mean' # mean or sum
139
+ dynamic_resolution_across_gpus: int = 1 # allow dynamic resolution across gpus
140
+ enable_dynamic_length_prompt: int = 0 # enable dynamic length prompt during training
141
+ use_streaming_dataset: int = 0 # use streaming dataset
142
+ iterable_data_buffersize: int = 90000 # streaming dataset buffer size
143
+ save_model_iters_freq: int = 1000 # save model iter freq
144
+ noise_apply_layers: int = -1 # Bitwise Self-Correction: apply noise to layers, -1 means not apply noise
145
+ noise_apply_strength: float = -1 # Bitwise Self-Correction: apply noise strength, -1 means not apply noise
146
+ noise_apply_requant: int = 1 # Bitwise Self-Correction: requant after apply noise
147
+ rope2d_each_sa_layer: int = 0 # apply rope2d to each self-attention layer
148
+ rope2d_normalized_by_hw: int = 1 # apply normalized rope2d
149
+ use_fsdp_model_ema: int = 0 # use fsdp model ema
150
+ add_lvl_embeding_only_first_block: int = 1 # apply lvl pe embedding only first block or each block
151
+ reweight_loss_by_scale: int = 0 # reweight loss by scale
152
+ always_training_scales: int = 100 # trunc training scales
153
+ vae_type: int = 1 # here 16/32/64 is bsq vae of different quant bits
154
+ fake_vae_input: bool = False # fake vae input for debug
155
+ model_init_device: str = 'cuda' # model_init_device
156
+ prefetch_factor: int = 2 # prefetch_factor for dataset
157
+ apply_spatial_patchify: int = 0 # apply apply_spatial_patchify or not
158
+ debug_bsc: int = 0 # save figs and set breakpoint for debug bsc and check input
159
+ task_type: str = 't2i' # task type: t2i or t2v
160
+
161
+
162
+ ############################ Attention! The following arguments and configurations are set automatically, you can skip reading the following part ###############################
163
+ ############################ Attention! The following arguments and configurations are set automatically, you can skip reading the following part ###############################
164
+ ############################ Attention! The following arguments and configurations are set automatically, you can skip reading the following part ###############################
165
+
166
+
167
+ # would be automatically set in runtime
168
+ branch: str = subprocess.check_output(f'git symbolic-ref --short HEAD 2>/dev/null || git rev-parse HEAD', shell=True).decode('utf-8').strip() or '[unknown]' # [automatically set; don't specify this]
169
+ commit_id: str = '' # subprocess.check_output(f'git rev-parse HEAD', shell=True).decode('utf-8').strip() or '[unknown]' # [automatically set; don't specify this]
170
+ commit_msg: str = ''# (subprocess.check_output(f'git log -1', shell=True).decode('utf-8').strip().splitlines() or ['[unknown]'])[-1].strip() # [automatically set; don't specify this]
171
+ cmd: str = ' '.join(a.replace('--exp_name=', '').replace('--exp_name ', '') for a in sys.argv[7:]) # [automatically set; don't specify this]
172
+ tag: str = 'UK' # [automatically set; don't specify this]
173
+ acc_all: float = None # [automatically set; don't specify this]
174
+ acc_real: float = None # [automatically set; don't specify this]
175
+ acc_fake: float = None # [automatically set; don't specify this]
176
+ last_Lnll: float = None # [automatically set; don't specify this]
177
+ last_L1: float = None # [automatically set; don't specify this]
178
+ last_Ld: float = None # [automatically set; don't specify this]
179
+ last_wei_g: float = None # [automatically set; don't specify this]
180
+ grad_boom: str = None # [automatically set; don't specify this]
181
+ diff: float = None # [automatically set; don't specify this]
182
+ diffs: str = '' # [automatically set; don't specify this]
183
+ diffs_ema: str = None # [automatically set; don't specify this]
184
+ ca_performance: str = '' # [automatically set; don't specify this]
185
+ cur_phase: str = '' # [automatically set; don't specify this]
186
+ cur_it: str = '' # [automatically set; don't specify this]
187
+ cur_ep: str = '' # [automatically set; don't specify this]
188
+ remain_time: str = '' # [automatically set; don't specify this]
189
+ finish_time: str = '' # [automatically set; don't specify this]
190
+ iter_speed: float = None # [automatically set; don't specify this]
191
+ img_per_day: float = None # [automatically set; don't specify this]
192
+ max_nvidia_smi: float = 0 # [automatically set; don't specify this]
193
+ max_memory_allocated: float = None # [automatically set; don't specify this]
194
+ max_memory_reserved: float = None # [automatically set; don't specify this]
195
+ num_alloc_retries: int = None # [automatically set; don't specify this]
196
+ MFU: float = None # [automatically set; don't specify this]
197
+ HFU: float = None # [automatically set; don't specify this]
198
+ # ==================================================================================================================
199
+ # ======================== ignore these parts below since they are only for debug use ==============================
200
+ # ==================================================================================================================
201
+ dbg_modified: bool = False
202
+ dbg_ks: bool = False
203
+ dbg_ks_last = None
204
+ dbg_ks_fp = None
205
+ def dbg_ks_this_line(self, g_it: int):
206
+ if self.dbg_ks:
207
+ if self.dbg_ks_last is None:
208
+ self.dbg_ks_last = deque(maxlen=6)
209
+
210
+ from utils.misc import time_str
211
+ self.dbg_ks_fp.seek(0)
212
+ f_back = sys._getframe().f_back
213
+ file_desc = f'{f_back.f_code.co_filename:24s}'[-24:]
214
+ info = f'{time_str()} ({file_desc}, line{f_back.f_lineno:-4d})'
215
+ if g_it is not None:
216
+ info += f' [g_it: {g_it}]'
217
+
218
+ self.dbg_ks_last.append(info)
219
+ self.dbg_ks_fp.write('\n'.join(self.dbg_ks_last) + '\n')
220
+ self.dbg_ks_fp.flush()
221
+
222
+ dbg: bool = 'KEVIN_LOCAL' in os.environ # only used when debug about unused param in DDP
223
+ ks: bool = False
224
+ nodata: bool = False # if True, will set nova=True as well
225
+ nodata_tlen: int = 320
226
+ nova: bool = False # no val, no FID
227
+ prof: int = 0 # profile
228
+ prof_freq: int = 50 # profile
229
+ tos_profiler_file_prefix: str = 'vgpt_default/'
230
+ profall: int = 0
231
+ @property
232
+ def is_vae_visualization_only(self) -> bool:
233
+ return self.v_seed > 0
234
+ v_seed: int = 0 # v_seed != 0 means the visualization-only mode
235
+ @property
236
+ def is_gpt_visualization_only(self) -> bool:
237
+ return self.g_seed > 0
238
+ g_seed: int = 0 # g_seed != 0 means the visualization-only mode
239
+ # ==================================================================================================================
240
+ # ======================== ignore these parts above since they are only for debug use ==============================
241
+ # ==================================================================================================================
242
+
243
+ @property
244
+ def gpt_training(self):
245
+ return len(self.model) > 0
246
+
247
+ def set_initial_seed(self, benchmark: bool):
248
+ torch.backends.cudnn.enabled = True
249
+ torch.backends.cudnn.benchmark = benchmark
250
+ if self.seed is None:
251
+ torch.backends.cudnn.deterministic = False
252
+ else:
253
+ seed = self.seed + (dist.get_rank()*512 if self.rand else 0)
254
+ torch.backends.cudnn.deterministic = True
255
+ os.environ['PYTHONHASHSEED'] = str(seed)
256
+ random.seed(seed)
257
+ np.random.seed(seed)
258
+ torch.manual_seed(seed)
259
+ if torch.cuda.is_available():
260
+ torch.cuda.manual_seed(seed)
261
+ torch.cuda.manual_seed_all(seed)
262
+
263
+ def get_different_generator_for_each_rank(self) -> Optional[torch.Generator]: # for random augmentation
264
+ if self.seed is None:
265
+ return None
266
+ g = torch.Generator()
267
+ g.manual_seed(self.seed + dist.get_rank()*512)
268
+ return g
269
+
270
+ def compile_model(self, m, fast):
271
+ if fast == 0:
272
+ return m
273
+ return torch.compile(m, mode={
274
+ 1: 'reduce-overhead',
275
+ 2: 'max-autotune',
276
+ 3: 'default',
277
+ }[fast]) if hasattr(torch, 'compile') else m
278
+
279
+ def dump_log(self):
280
+ if not dist.is_local_master():
281
+ return
282
+ nd = {'is_master': dist.is_visualizer()}
283
+ r_trial, trial = str(self.real_trial_id), str(self.trial_id)
284
+ for k, v in {
285
+ 'name': self.exp_name, 'tag': self.tag, 'cmd': self.cmd, 'commit': self.commit_id, 'branch': self.branch,
286
+ 'Lnll': self.last_Lnll, 'L1': self.last_L1,
287
+ 'Ld': self.last_Ld,
288
+ 'acc': self.acc_all, 'acc_r': self.acc_real, 'acc_f': self.acc_fake,
289
+ 'weiG': self.last_wei_g if (self.last_wei_g is None or math.isfinite(self.last_wei_g)) else -23333,
290
+ 'grad': self.grad_boom,
291
+
292
+ 'cur': self.cur_phase, 'cur_ep': self.cur_ep, 'cur_it': self.cur_it,
293
+ 'rema': self.remain_time, 'fini': self.finish_time, 'last_upd': time.strftime("%Y-%m-%d %H:%M", time.localtime()),
294
+ 'bsep': f'{self.glb_batch_size}/{self.ep}',
295
+ 'G_lrwd': f'{self.glr:.1e}'.replace('.0', '').replace('-0', '-').replace('+0', '+') + f'/{self.gwd:g}',
296
+ 'D_lrwd': f'{self.dlr:.1e}'.replace('.0', '').replace('-0', '-').replace('+0', '+') + f'/{self.dwd:g}',
297
+ 'T_lrwd': f'{self.tlr:.1e}'.replace('.0', '').replace('-0', '-').replace('+0', '+') + f'/{self.twd:g}',
298
+ 'diff': self.diff, 'diffs': self.diffs, 'diffs_ema': self.diffs_ema if self.diffs_ema else None,
299
+ 'opt': self.opt,
300
+ 'is_master_node': self.is_master_node,
301
+ }.items():
302
+ if hasattr(v, 'item'):v = v.item()
303
+ if v is None or (isinstance(v, str) and len(v) == 0): continue
304
+ nd[k] = v
305
+ if r_trial == trial:
306
+ nd.pop('trial', None)
307
+
308
+ with open(self.log_txt_path, 'w') as fp:
309
+ json.dump(nd, fp, indent=2)
310
+
311
+ def touch_log(self): # listener will kill me if log_txt_path is not updated for 120s
312
+ os.utime(self.log_txt_path) # about 2e-6 sec
313
+
314
+ def state_dict(self, key_ordered=True) -> Union[OrderedDict, dict]:
315
+ d = (OrderedDict if key_ordered else dict)()
316
+ # self.as_dict() would contain methods, but we only need variables
317
+ for k in self.class_variables.keys():
318
+ if k not in {'device', 'dbg_ks_fp'}: # these are not serializable
319
+ d[k] = getattr(self, k)
320
+ return d
321
+
322
+ def load_state_dict(self, d: Union[OrderedDict, dict, str]):
323
+ if isinstance(d, str): # for compatibility with old version
324
+ d: dict = eval('\n'.join([l for l in d.splitlines() if '<bound' not in l and 'device(' not in l]))
325
+ for k in d.keys():
326
+ if k in {'is_large_model', 'gpt_training'}:
327
+ continue
328
+ try:
329
+ setattr(self, k, d[k])
330
+ except Exception as e:
331
+ print(f'k={k}, v={d[k]}')
332
+ raise e
333
+
334
+ @staticmethod
335
+ def set_tf32(tf32: bool):
336
+ if torch.cuda.is_available():
337
+ torch.backends.cudnn.allow_tf32 = bool(tf32)
338
+ torch.backends.cuda.matmul.allow_tf32 = bool(tf32)
339
+ if hasattr(torch, 'set_float32_matmul_precision'):
340
+ torch.set_float32_matmul_precision('high' if tf32 else 'highest')
341
+ print(f'[tf32] [precis] torch.get_float32_matmul_precision(): {torch.get_float32_matmul_precision()}')
342
+ print(f'[tf32] [ conv ] torch.backends.cudnn.allow_tf32: {torch.backends.cudnn.allow_tf32}')
343
+ print(f'[tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: {torch.backends.cuda.matmul.allow_tf32}')
344
+
345
+ def __str__(self):
346
+ s = []
347
+ for k in self.class_variables.keys():
348
+ if k not in {'device', 'dbg_ks_fp'}: # these are not serializable
349
+ s.append(f' {k:20s}: {getattr(self, k)}')
350
+ s = '\n'.join(s)
351
+ return f'{{\n{s}\n}}\n'
352
+
353
+
354
+ def init_dist_and_get_args():
355
+ for i in range(len(sys.argv)):
356
+ if sys.argv[i].startswith('--local-rank=') or sys.argv[i].startswith('--local_rank='):
357
+ del sys.argv[i]
358
+ break
359
+ args = Args(explicit_bool=True).parse_args(known_only=True)
360
+ args.chunk_nodes = int(os.environ.get('CK', '') or '0')
361
+
362
+ if len(args.extra_args) > 0 and args.is_master_node == 0:
363
+ print(f'======================================================================================')
364
+ print(f'=========================== WARNING: UNEXPECTED EXTRA ARGS ===========================\n{args.extra_args}')
365
+ print(f'=========================== WARNING: UNEXPECTED EXTRA ARGS ===========================')
366
+ print(f'======================================================================================\n\n')
367
+
368
+ args.set_tf32(args.tf32)
369
+ if args.dbg:
370
+ torch.autograd.set_detect_anomaly(True)
371
+
372
+ try: os.makedirs(args.bed, exist_ok=True)
373
+ except: pass
374
+ try: os.makedirs(args.local_out_path, exist_ok=True)
375
+ except: pass
376
+
377
+ day3 = 60*24*3
378
+ dist.init_distributed_mode(local_out_path=args.local_out_path, fork=False, timeout_minutes=day3 if int(os.environ.get('LONG_DBG', '0') or '0') > 0 else 30)
379
+
380
+ args.tlen = max(args.tlen, args.nodata_tlen)
381
+ if args.zero and args.tema != 0:
382
+ args.tema = 0
383
+ print(f'======================================================================================')
384
+ print(f'======================== WARNING: args.tema:=0, due to zero={args.zero} ========================')
385
+ print(f'======================================================================================\n\n')
386
+
387
+ if args.nodata:
388
+ args.nova = True
389
+
390
+ if not args.tos_profiler_file_prefix.endswith('/'): args.tos_profiler_file_prefix += '/'
391
+
392
+ if args.alng < 0:
393
+ args.alng = args.aln
394
+
395
+ args.device = dist.get_device()
396
+ args.r_accu = 1 / args.ac # gradient accumulation
397
+ args.data_load_reso = None
398
+ args.rand |= args.seed is None
399
+ args.sche = args.sche or ('lin0' if args.gpt_training else 'cos')
400
+ if args.wp == 0:
401
+ args.wp = args.ep * 1/100
402
+
403
+ di = {
404
+ 'b': 'bilinear', 'c': 'bicubic', 'n': 'nearest', 'a': 'area', 'aa': 'area+area',
405
+ 'at': 'auto', 'auto': 'auto',
406
+ 'v': 'vae',
407
+ 'x': 'pix', 'xg': 'pix_glu', 'gx': 'pix_glu', 'g': 'pix_glu'
408
+ }
409
+
410
+ args.ada = args.ada or ('0.9_0.96' if args.gpt_training else '0.5_0.9')
411
+ args.dada = args.dada or args.ada
412
+ args.opt = args.opt.lower().strip()
413
+
414
+ if args.lbs:
415
+ bs_per_gpu = args.lbs / args.ac
416
+ else:
417
+ bs_per_gpu = args.bs / args.ac / dist.get_world_size()
418
+ bs_per_gpu = round(bs_per_gpu)
419
+ args.batch_size = bs_per_gpu
420
+ args.bs = args.glb_batch_size = args.batch_size * dist.get_world_size()
421
+ args.workers = min(args.workers, bs_per_gpu)
422
+ args.dblr = args.dblr or args.gblr
423
+ args.glr = args.ac * args.gblr * args.glb_batch_size / 256
424
+ args.dlr = args.ac * args.dblr * args.glb_batch_size / 256
425
+ args.tlr = args.ac * args.tblr * args.glb_batch_size / 256
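+ # linear scaling rule, e.g. ac=1, tblr=6e-4, glb_batch_size=512 -> tlr = 6e-4 * 512 / 256 = 1.2e-3 (double the batch, double the peak lr)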
426
+ args.gwde = args.gwde or args.gwd
427
+ args.dwde = args.dwde or args.dwd
428
+ args.twde = args.twde or args.twd
429
+
430
+ if args.dbg_modified:
431
+ torch.autograd.set_detect_anomaly(True)
432
+ args.dbg_ks &= dist.is_local_master()
433
+ if args.dbg_ks:
434
+ args.dbg_ks_fp = open(os.path.join(args.local_out_path, 'dbg_ks.txt'), 'w')
435
+
436
+ # gpt args
437
+ if args.gpt_training:
438
+ assert args.vae_ckpt, 'VAE ckpt must be specified when training GPT'
439
+ from infinity.models import alias_dict, alias_dict_inv
440
+ if args.model in alias_dict:
441
+ args.model = alias_dict[args.model]
442
+ args.model_alias = alias_dict_inv[args.model]
443
+ else:
444
+ args.model_alias = args.model
445
+ args.model = f'infinity_{args.model}'
446
+
447
+ args.task_id = '123'
448
+ args.trial_id = '123'
449
+ args.robust_run_id = '0'
450
+ args.log_txt_path = os.path.join(args.local_out_path, 'log.txt')
451
+
452
+ ls = [] # candidate auto-resume trial ids
453
+ if 'AUTO_RESUME' in os.environ:
454
+ ls.append(int(os.environ['AUTO_RESUME']))
455
+ ls = sorted(ls, reverse=True)
456
+ ls = [str(i) for i in ls]
457
+ args.ckpt_trials = ls
458
+ args.real_trial_id = args.trial_id if len(ls) == 0 else str(ls[-1])
459
+
460
+ args.enable_checkpointing = None if args.enable_checkpointing in [False, 0, "0"] else args.enable_checkpointing
461
+ args.enable_checkpointing = "full-block" if args.enable_checkpointing in [True, 1, "1"] else args.enable_checkpointing
462
+ assert args.enable_checkpointing in [None, "full-block", "full-attn", "self-attn"], \
463
+ f"only support no-checkpointing or full-block/full-attn checkpointing, but got {args.enable_checkpointing}."
464
+
465
+ if len(args.exp_name) == 0:
466
+ args.exp_name = os.path.basename(args.bed) or 'test_exp'
467
+
468
+ if '-' in args.exp_name:
469
+ args.tag, args.exp_name = args.exp_name.split('-', maxsplit=1)
470
+ else:
471
+ args.tag = 'UK'
472
+
473
+ if dist.is_master():
474
+ os.system(f'rm -rf {os.path.join(args.bed, "ready-node*")} {os.path.join(args.local_out_path, "ready-node*")}')
475
+
476
+ if args.sdpa_mem:
477
+ from torch.backends.cuda import enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp
478
+ enable_flash_sdp(True)
479
+ enable_mem_efficient_sdp(True)
480
+ enable_math_sdp(False)
481
+
482
+ return args
utils/csv_util.py ADDED
@@ -0,0 +1,20 @@
1
+ import os
2
+ import os.path as osp
3
+ import csv
4
+
5
+ import numpy as np
6
+
7
+
8
+ def write_dicts2csv_file(input_dict_list, csv_filename):
9
+ os.makedirs(osp.dirname(csv_filename), exist_ok=True)
10
+ with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
11
+ fieldnames = input_dict_list[0].keys()
12
+ writer = csv.DictWriter(file, fieldnames=fieldnames)
13
+ writer.writeheader()
14
+ writer.writerows(input_dict_list)
15
+ print(f'"{csv_filename}" has been written.')
16
+
17
+ def load_csv_as_dicts(csv_filename):
18
+ with open(csv_filename, mode='r', newline='', encoding='utf-8') as csvfile:
19
+ reader = csv.DictReader(csvfile)
20
+ return list(reader)
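
A quick round-trip with these two helpers might look as follows (paths hypothetical). Note that `csv.DictReader` returns every field as a string, and `write_dicts2csv_file` expects a path with a directory component, since it calls `os.makedirs` on the dirname:

```python
rows = [{'name': 'a', 'score': 1}, {'name': 'b', 'score': 2}]
write_dicts2csv_file(rows, 'out/metrics.csv')    # creates out/ if needed
loaded = load_csv_as_dicts('out/metrics.csv')
assert loaded[0] == {'name': 'a', 'score': '1'}  # values come back as strings
```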
utils/dist.py ADDED
@@ -0,0 +1,326 @@
1
+ import datetime
2
+ import functools
3
+ import os
4
+ import sys
5
+ from typing import List
6
+ from typing import Union
7
+
8
+ import pytz
9
+ import torch
10
+ import torch.distributed as tdist
11
+ import torch.multiprocessing as mp
12
+
13
+
14
+ __rank, __local_rank, __world_size, __device = 0, 0, 1, 'cpu'
15
+ __rank_str_zfill = '0'
16
+ __initialized = False
17
+
18
+
19
+ def initialized():
20
+ return __initialized
21
+
22
+
23
+ def __initialize(fork=False, backend='nccl', gpu_id_if_not_distributed=0, timeout_minutes=30):
24
+ global __device
25
+ if not torch.cuda.is_available():
26
+ print(f'[dist initialize] cuda is not available, use cpu instead', file=sys.stderr)
27
+ return
28
+ elif 'RANK' not in os.environ:
29
+ torch.cuda.set_device(gpu_id_if_not_distributed)
30
+ __device = torch.empty(1).cuda().device
31
+ print(f'[dist initialize] env variable "RANK" is not set, use {__device} as the device', file=sys.stderr)
32
+ return
33
+ # then 'RANK' must exist
34
+ global_rank, num_gpus = int(os.environ['RANK']), torch.cuda.device_count()
35
+ local_rank = global_rank % num_gpus
36
+ torch.cuda.set_device(local_rank)
37
+
38
+ # ref: https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/dist_utils.py#L29
39
+ """
40
+ if mp.get_start_method(allow_none=True) is None:
41
+ method = 'fork' if fork else 'spawn'
42
+ print(f'[dist initialize] mp method={method}')
43
+ mp.set_start_method(method)
44
+ """
45
+ tdist.init_process_group(backend=backend, timeout=datetime.timedelta(seconds=timeout_minutes * 60))
46
+
47
+ global __rank, __local_rank, __world_size, __initialized, __rank_str_zfill
48
+ __local_rank = local_rank
49
+ __rank, __world_size = tdist.get_rank(), tdist.get_world_size()
50
+ __rank_str_zfill = str(__rank).zfill(len(str(__world_size)))
51
+ __device = torch.device(local_rank)
52
+ __initialized = True
53
+
54
+ assert tdist.is_initialized(), 'torch.distributed is not initialized!'
55
+ print(f'[lrk={get_local_rank()}, rk={get_rank()}]')
56
+
57
+
58
+ def get_rank():
59
+ return __rank
60
+
61
+
62
+ def get_rank_given_group(group: tdist.ProcessGroup):
63
+ return tdist.get_rank(group=group)
64
+
65
+
66
+ def get_rank_str_zfill():
67
+ return __rank_str_zfill
68
+
69
+
70
+ def get_local_rank():
71
+ return __local_rank
72
+
73
+
74
+ def get_world_size():
75
+ return __world_size
76
+
77
+
78
+ def get_device():
79
+ return __device
80
+
81
+
82
+ def set_gpu_id(gpu_id: int):
83
+ if gpu_id is None: return
84
+ global __device
85
+ if isinstance(gpu_id, (str, int)):
86
+ torch.cuda.set_device(int(gpu_id))
87
+ __device = torch.empty(1).cuda().device
88
+ else:
89
+ raise NotImplementedError
90
+
91
+
92
+ def is_master():
93
+ return __rank == 0
94
+
95
+
96
+ def is_local_master():
97
+ return __local_rank == 0
98
+
99
+
100
+ def is_visualizer():
101
+ return __rank == 0
102
+ # return __rank == max(__world_size - 8, 0)
103
+
104
+
105
+ def parallelize(net, syncbn=False):
106
+ if syncbn:
107
+ net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)
108
+ net = net.cuda()
109
+ net = torch.nn.parallel.DistributedDataParallel(net, device_ids=[get_local_rank()], find_unused_parameters=False, broadcast_buffers=False)
110
+ return net
111
+
112
+
113
+ def new_group(ranks: List[int]):
114
+ if __initialized:
115
+ return tdist.new_group(ranks=ranks)
116
+ return None
117
+
118
+
119
+ def new_local_machine_group():
120
+ if __initialized:
121
+ cur_subgroup, subgroups = tdist.new_subgroups()
122
+ return cur_subgroup
123
+ return None
124
+
125
+
126
+ def barrier():
127
+ if __initialized:
128
+ tdist.barrier()
129
+
130
+
131
+ def allreduce(t: torch.Tensor, async_op=False):
132
+ if __initialized:
133
+ if not t.is_cuda:
134
+ cu = t.detach().cuda()
135
+ ret = tdist.all_reduce(cu, async_op=async_op)
136
+ t.copy_(cu.cpu())
137
+ else:
138
+ ret = tdist.all_reduce(t, async_op=async_op)
139
+ return ret
140
+ return None
141
+
142
+
143
+ def allgather(t: torch.Tensor, cat=True) -> Union[List[torch.Tensor], torch.Tensor]:
144
+ if __initialized:
145
+ if not t.is_cuda:
146
+ t = t.cuda()
147
+ ls = [torch.empty_like(t) for _ in range(__world_size)]
148
+ tdist.all_gather(ls, t)
149
+ else:
150
+ ls = [t]
151
+ if cat:
152
+ ls = torch.cat(ls, dim=0)
153
+ return ls
154
+
155
+
156
+ def allgather_diff_shape(t: torch.Tensor, cat=True) -> Union[List[torch.Tensor], torch.Tensor]:
157
+ if __initialized:
158
+ if not t.is_cuda:
159
+ t = t.cuda()
160
+
161
+ t_size = torch.tensor(t.size(), device=t.device)
162
+ ls_size = [torch.empty_like(t_size) for _ in range(__world_size)]
163
+ tdist.all_gather(ls_size, t_size)
164
+
165
+ max_B = max(size[0].item() for size in ls_size)
166
+ pad = max_B - t_size[0].item()
167
+ if pad:
168
+ pad_size = (pad, *t.size()[1:])
169
+ t = torch.cat((t, t.new_empty(pad_size)), dim=0)
170
+
171
+ ls_padded = [torch.empty_like(t) for _ in range(__world_size)]
172
+ tdist.all_gather(ls_padded, t)
173
+ ls = []
174
+ for t, size in zip(ls_padded, ls_size):
175
+ ls.append(t[:size[0].item()])
176
+ else:
177
+ ls = [t]
178
+ if cat:
179
+ ls = torch.cat(ls, dim=0)
180
+ return ls
181
+
182
+
183
+ def broadcast(t: torch.Tensor, src_rank) -> None:
184
+ if __initialized:
185
+ if not t.is_cuda:
186
+ cu = t.detach().cuda()
187
+ tdist.broadcast(cu, src=src_rank)
188
+ t.copy_(cu.cpu())
189
+ else:
190
+ tdist.broadcast(t, src=src_rank)
191
+
192
+
193
+ def dist_fmt_vals(val: float, fmt: Union[str, None] = '%.2f') -> Union[torch.Tensor, List]:
194
+ if not initialized():
195
+ return torch.tensor([val]) if fmt is None else [fmt % val]
196
+
197
+ ts = torch.zeros(__world_size)
198
+ ts[__rank] = val
199
+ allreduce(ts)
200
+ if fmt is None:
201
+ return ts
202
+ return [fmt % v for v in ts.cpu().numpy().tolist()]
203
+
204
+
205
+ def master_only(func):
206
+ @functools.wraps(func)
207
+ def wrapper(*args, **kwargs):
208
+ force = kwargs.pop('force', False)
209
+ if force or is_master():
210
+ ret = func(*args, **kwargs)
211
+ else:
212
+ ret = None
213
+ barrier()
214
+ return ret
215
+ return wrapper
216
+
217
+
218
+ def local_master_only(func):
219
+ @functools.wraps(func)
220
+ def wrapper(*args, **kwargs):
221
+ force = kwargs.pop('force', False)
222
+ if force or is_local_master():
223
+ ret = func(*args, **kwargs)
224
+ else:
225
+ ret = None
226
+ barrier()
227
+ return ret
228
+ return wrapper
229
+
230
+
231
+ def for_visualize(func):
232
+ @functools.wraps(func)
233
+ def wrapper(*args, **kwargs):
234
+ if is_visualizer():
235
+ # with torch.no_grad():
236
+ ret = func(*args, **kwargs)
237
+ else:
238
+ ret = None
239
+ return ret
240
+ return wrapper
241
+
242
+
243
+ def finalize():
244
+ if __initialized:
245
+ tdist.destroy_process_group()
246
+
247
+
248
+ def init_distributed_mode(local_out_path, fork=False, only_sync_master=False, timeout_minutes=30):
249
+ try:
250
+ __initialize(fork=fork, timeout_minutes=timeout_minutes)
251
+ barrier()
252
+ except RuntimeError as e:
253
+ print(f'{"!"*80} dist init error (NCCL Error?), stopping training! {"!"*80}', flush=True)
254
+ raise e
255
+
256
+ if local_out_path is not None: os.makedirs(local_out_path, exist_ok=True)
257
+ _change_builtin_print(is_local_master())
258
+ if (is_master() if only_sync_master else is_local_master()) and local_out_path is not None and len(local_out_path):
259
+ sys.stdout, sys.stderr = BackupStreamToFile(local_out_path, for_stdout=True), BackupStreamToFile(local_out_path, for_stdout=False)
260
+
261
+
262
+ def _change_builtin_print(is_master):
263
+ import builtins as __builtin__
264
+
265
+ builtin_print = __builtin__.print
266
+ if type(builtin_print) != type(open):
267
+ return
268
+
269
+ def prt(*args, **kwargs):
270
+ force = kwargs.pop('force', False)
271
+ clean = kwargs.pop('clean', False)
272
+ deeper = kwargs.pop('deeper', False)
273
+ if is_master or force:
274
+ if not clean:
275
+ f_back = sys._getframe().f_back
276
+ if deeper and f_back.f_back is not None:
277
+ f_back = f_back.f_back
278
+ file_desc = f'{f_back.f_code.co_filename:24s}'[-24:]
279
+ time_str = datetime.datetime.now(tz=pytz.timezone('Asia/Shanghai')).strftime('[%m-%d %H:%M:%S]')
280
+ builtin_print(f'{time_str} ({file_desc}, line{f_back.f_lineno:-4d})=>', *args, **kwargs)
281
+ else:
282
+ builtin_print(*args, **kwargs)
283
+
284
+ __builtin__.print = prt
285
+
286
+
287
+ class BackupStreamToFile(object):
288
+ def __init__(self, local_output_dir, for_stdout=True):
289
+ self.for_stdout = for_stdout
290
+ self.terminal_stream = sys.stdout if for_stdout else sys.stderr
291
+ fname = os.path.join(local_output_dir, 'b1_stdout.txt' if for_stdout else 'b2_stderr.txt')
292
+ existing = os.path.exists(fname)
293
+ self.file_stream = open(fname, 'a')
294
+ if existing:
295
+ time_str = datetime.datetime.now(tz=pytz.timezone('Asia/Shanghai')).strftime('[%m-%d %H:%M:%S]')
296
+ self.file_stream.write('\n'*7 + '='*55 + f' RESTART {time_str} ' + '='*55 + '\n')
297
+ self.file_stream.flush()
298
+ os.system(f'ln -s {fname} /opt/tiger/run_trial/ >/dev/null 2>&1')
299
+ self.enabled = True
300
+
301
+ def write(self, message):
302
+ self.terminal_stream.write(message)
303
+ self.file_stream.write(message)
304
+
305
+ def flush(self):
306
+ self.terminal_stream.flush()
307
+ self.file_stream.flush()
308
+
309
+ def isatty(self):
310
+ return True
311
+
312
+ def close(self):
313
+ if not self.enabled:
314
+ return
315
+ self.enabled = False
316
+ self.file_stream.flush()
317
+ self.file_stream.close()
318
+ if self.for_stdout:
319
+ sys.stdout = self.terminal_stream
320
+ sys.stdout.flush()
321
+ else:
322
+ sys.stderr = self.terminal_stream
323
+ sys.stderr.flush()
324
+
325
+ def __del__(self):
326
+ self.close()
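
A hedged usage sketch, assuming the module is importable as `infinity.utils.dist` and the script is launched under `torchrun` so that `RANK` is set:

```python
# torchrun --nproc_per_node=8 demo.py
import torch
import infinity.utils.dist as dist

dist.init_distributed_mode(local_out_path='local_output')  # also redirects stdout/stderr to files
t = torch.full((1,), float(dist.get_rank()), device=dist.get_device())
gathered = dist.allgather(t)      # cat=True -> a single tensor of shape (world_size,)
if dist.is_master():
    print(gathered)               # tensor([0., 1., ..., world_size - 1])
dist.finalize()
```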
utils/dynamic_resolution.py ADDED
@@ -0,0 +1,73 @@
1
+ import json
2
+ import numpy as np
3
+ import tqdm
4
+
5
+ vae_stride = 16
6
+ ratio2hws = {
7
+ 1.000: [(1,1),(2,2),(4,4),(6,6),(8,8),(12,12),(16,16),(20,20),(24,24),(32,32),(40,40),(48,48),(64,64)],
8
+ 1.250: [(1,1),(2,2),(3,3),(5,4),(10,8),(15,12),(20,16),(25,20),(30,24),(35,28),(45,36),(55,44),(70,56)],
9
+ 1.333: [(1,1),(2,2),(4,3),(8,6),(12,9),(16,12),(20,15),(24,18),(28,21),(36,27),(48,36),(60,45),(72,54)],
10
+ 1.500: [(1,1),(2,2),(3,2),(6,4),(9,6),(15,10),(21,14),(27,18),(33,22),(39,26),(48,32),(63,42),(78,52)],
11
+ 1.750: [(1,1),(2,2),(3,3),(7,4),(11,6),(14,8),(21,12),(28,16),(35,20),(42,24),(56,32),(70,40),(84,48)],
12
+ 2.000: [(1,1),(2,2),(4,2),(6,3),(10,5),(16,8),(22,11),(30,15),(38,19),(46,23),(60,30),(74,37),(90,45)],
13
+ 2.500: [(1,1),(2,2),(5,2),(10,4),(15,6),(20,8),(25,10),(30,12),(40,16),(50,20),(65,26),(80,32),(100,40)],
14
+ 3.000: [(1,1),(2,2),(6,2),(9,3),(15,5),(21,7),(27,9),(36,12),(45,15),(54,18),(72,24),(90,30),(111,37)],
15
+ }
16
+ predefined_t = [1, 2, 3, 4, 5, 6, 7, 9, 11, 13, 15, 17, 21]
17
+
18
+ full_ratio2hws = {}
19
+ for ratio, hws in ratio2hws.items():
20
+ full_ratio2hws[ratio] = hws
21
+ if ratio != 1.000:
22
+ full_ratio2hws[int(1/ratio*1000)/1000] = [(item[1], item[0]) for item in hws]
23
+
24
+ dynamic_resolution_h_w = {}
25
+ for ratio in full_ratio2hws:
26
+ dynamic_resolution_h_w[ratio] ={}
27
+ for ind, leng in enumerate([7, 10, 12, 13]):
28
+ h_div_w = full_ratio2hws[ratio][leng-1][0] / full_ratio2hws[ratio][leng-1][1]
29
+ assert np.abs(h_div_w-ratio) < 0.01, f'{full_ratio2hws[ratio][leng-1]}: {h_div_w} != {ratio}'
30
+ pixel = (full_ratio2hws[ratio][leng-1][0] * vae_stride, full_ratio2hws[ratio][leng-1][1] * vae_stride)
31
+ if ind == 0:
32
+ total_pixels = '0.06M'
33
+ elif ind == 1:
34
+ total_pixels = '0.25M'
35
+ elif ind == 2:
36
+ total_pixels = '0.60M'
37
+ else:
38
+ total_pixels = '1M'
39
+
40
+ scales = full_ratio2hws[ratio][:leng]
41
+ scales = [ (t, h, w) for t, (h, w) in zip(predefined_t, scales) ]
42
+ dynamic_resolution_h_w[ratio][total_pixels] = {
43
+ 'pixel': pixel,
44
+ 'scales': scales
45
+ }
46
+
47
+ h_div_w_templates = []
48
+ for h_div_w in dynamic_resolution_h_w.keys():
49
+ h_div_w_templates.append(h_div_w)
50
+ h_div_w_templates = np.array(h_div_w_templates)
51
+
52
+ def get_h_div_w_template2indices(h_div_w_list, h_div_w_templates):
53
+ indices = list(range(len(h_div_w_list)))
54
+ h_div_w_template2indices = {}
55
+ pbar = tqdm.tqdm(total=len(indices), desc='get_h_div_w_template2indices...')
56
+ for h_div_w, index in zip(h_div_w_list, indices):
57
+ pbar.update(1)
58
+ nearest_h_div_w_template_ = h_div_w_templates[np.argmin(np.abs(h_div_w-h_div_w_templates))]
59
+ if nearest_h_div_w_template_ not in h_div_w_template2indices:
60
+ h_div_w_template2indices[nearest_h_div_w_template_] = []
61
+ h_div_w_template2indices[nearest_h_div_w_template_].append(index)
62
+ for h_div_w_template_, sub_indices in h_div_w_template2indices.items():
63
+ h_div_w_template2indices[h_div_w_template_] = np.array(sub_indices)
64
+ return h_div_w_template2indices
65
+
66
+ if __name__ == '__main__':
67
+ for h_div_w_template in dynamic_resolution_h_w:
68
+ for total_pixels in dynamic_resolution_h_w[h_div_w_template]:
69
+ scales = np.array(dynamic_resolution_h_w[h_div_w_template][total_pixels]['scales'])
70
+ seq_len = np.sum(scales[:,0]*scales[:,1])
71
+ if total_pixels == '1M':
72
+ string = f'{h_div_w_template}, {total_pixels}, {dynamic_resolution_h_w[h_div_w_template][total_pixels]}, seq_len: {seq_len}'.replace(', ', ',')
73
+ print(string)
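
Assuming the module is importable as `utils.dynamic_resolution`, a lookup for a 720x1280 (h x w) image might look like this; the values in the comments are only what the tables above imply:

```python
import numpy as np
from utils.dynamic_resolution import dynamic_resolution_h_w, h_div_w_templates

h_div_w = 720 / 1280                      # 0.5625 -> nearest template is 0.5
nearest = h_div_w_templates[np.argmin(np.abs(h_div_w - h_div_w_templates))]
entry = dynamic_resolution_h_w[nearest]['1M']
print(entry['pixel'])   # final (h, w) in pixels at the 1M-pixel budget
print(entry['scales'])  # per-scale (t, h, w) token-grid sizes
```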
utils/large_file_util.py ADDED
@@ -0,0 +1,70 @@
1
+ import os
2
+ import os.path as osp
3
+ import time
4
+ import itertools
5
+ import shutil
6
+ import glob
7
+ import argparse
8
+
9
+ import tqdm
10
+ import numpy as np
11
+ import threading
12
+
13
+ def save_lines(lines, filename):
14
+ os.makedirs(osp.dirname(filename), exist_ok=True)
15
+ with open(filename, 'w') as f:
16
+ f.writelines(lines)
17
+ del lines
18
+
19
+ def get_part_jsonls(filepath, total_line_number, parts=512):
20
+ dirname, filename, ext = osp.dirname(filepath), osp.splitext(osp.basename(filepath))[0], osp.splitext(osp.basename(filepath))[1]
21
+ if parts == 1:
22
+ return False, {1: filepath}
23
+ save_dir = osp.join(dirname, f'{parts:04d}_parts')
24
+ chunk_id2save_files = {}
25
+ missing = False
26
+ chunk_size = int(total_line_number/parts)
27
+ for chunk_id in range(1, parts+1):
28
+ if chunk_id == parts:
29
+ num_of_lines = total_line_number - chunk_size * (parts-1)
30
+ else:
31
+ num_of_lines = chunk_size
32
+ chunk_id2save_files[chunk_id] = osp.join(save_dir, f'{filename}_{chunk_id:04d}_{parts:04d}_{num_of_lines:09d}{ext}')
33
+ if not osp.exists(chunk_id2save_files[chunk_id]):
34
+ missing = True
35
+ return missing, chunk_id2save_files
36
+
37
+ def split_large_txt_files(filepath, chunk_id2save_files):
38
+ thread_list = []
39
+ chunk_id = 1
40
+ with open(filepath, 'r') as f:
41
+ chunk = []
42
+ pbar = tqdm.tqdm(total=len(chunk_id2save_files))
43
+ for line in f:
44
+ chunk.append(line)
45
+ cur_chunk_size = int(osp.splitext(osp.basename(chunk_id2save_files[chunk_id]))[0].split('_')[-1])
46
+ if len(chunk) >= cur_chunk_size:
47
+ pbar.update(1)
48
+ thread_list.append(threading.Thread(target=save_lines, args=(chunk, chunk_id2save_files[chunk_id])))
49
+ thread_list[-1].start()
50
+ chunk = []
51
+ chunk_id += 1
52
+ if len(chunk):
53
+ raise RuntimeError(f'{len(chunk)} lines left over after splitting {filepath}; the line count in the filename may be wrong')
54
+ assert not len(chunk)
55
+ for thread in thread_list:
56
+ thread.join()
57
+
58
+ if __name__ == '__main__':
59
+ parser = argparse.ArgumentParser()
60
+ parser.add_argument('--jsonl_folder', type=str, default='')
61
+ parser.add_argument('--parts', type=int, default=600)
62
+ args = parser.parse_args()
63
+ for jsonl_filepath in sorted(glob.glob(osp.join(args.jsonl_folder, '*.jsonl'))):
64
+ print(jsonl_filepath)
65
+ t1 = time.time()
66
+ line_num = int(jsonl_filepath.split('_')[-1].split('.')[0])
67
+ missing, chunk_id2save_files = get_part_jsonls(jsonl_filepath, line_num, parts=args.parts)
68
+ split_large_txt_files(jsonl_filepath, chunk_id2save_files)
69
+ t2 = time.time()
70
+ print(f'split takes {t2-t1}s')
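
The splitter relies on the convention that a jsonl's line count is the last `_`-separated field of its name. A hypothetical invocation:

```python
# data/captions_000123456.jsonl -> 512 part files under data/0512_parts/
path, n_lines = 'data/captions_000123456.jsonl', 123456
missing, chunk_id2save_files = get_part_jsonls(path, n_lines, parts=512)
if missing:  # only re-split when some part file is absent
    split_large_txt_files(path, chunk_id2save_files)
```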
utils/load.py ADDED
@@ -0,0 +1,100 @@
1
+ #!/usr/bin/python3
2
+ import gc
3
+ import os
4
+ import os.path as osp
5
+ import random
6
+ import sys
7
+ from copy import deepcopy
8
+ from typing import Tuple, Union
9
+
10
+ import colorama
11
+ import torch
12
+ import yaml
13
+
14
+ import infinity.utils.dist as dist
15
+
16
+ from infinity.models import Infinity
17
+ from infinity.models.ema import get_ema_model
18
+ from infinity.utils import arg_util, misc
19
+ from infinity.utils.misc import os_system
20
+
21
+
22
+ def build_vae_gpt(args: arg_util.Args, vae_st: dict, skip_gpt: bool, force_flash=False, device='cuda'):
23
+ if args.vae_type in [8,16,18,20,24,32,64,128]:
24
+ from infinity.models.bsq_vae.vae import vae_model
25
+ schedule_mode = "dynamic"
26
+ codebook_dim = args.vae_type # 18
27
+ codebook_size = 2**codebook_dim
28
+ if args.apply_spatial_patchify:
29
+ patch_size = 8
30
+ encoder_ch_mult=[1, 2, 4, 4]
31
+ decoder_ch_mult=[1, 2, 4, 4]
32
+ else:
33
+ patch_size = 16
34
+ encoder_ch_mult=[1, 2, 4, 4, 4]
35
+ decoder_ch_mult=[1, 2, 4, 4, 4]
36
+ vae_local = vae_model(vae_st, schedule_mode, codebook_dim, codebook_size, patch_size=patch_size,
37
+ encoder_ch_mult=encoder_ch_mult, decoder_ch_mult=decoder_ch_mult, test_mode=True).to(args.device)
38
+ if args.fake_vae_input:
39
+ vae_local.encoder = None
40
+ vae_local.decoder = None
41
+ torch.cuda.empty_cache()
42
+ else:
43
+ raise ValueError(f"vae_type {args.vae_type} not supported")
44
+ if force_flash: args.flash = True
45
+ gpt_kw = dict(
46
+ pretrained=False, global_pool='',
47
+ text_channels=args.Ct5, text_maxlen=args.tlen,
48
+ norm_eps=args.norm_eps, rms_norm=args.rms,
49
+ shared_aln=args.saln, head_aln=args.haln,
50
+ cond_drop_rate=args.cfg, rand_uncond=args.rand_uncond, drop_rate=args.drop,
51
+ cross_attn_layer_scale=args.ca_gamma, nm0=args.nm0, tau=args.tau, cos_attn=args.cos, swiglu=args.swi,
52
+ raw_scale_schedule=args.scale_schedule,
53
+ head_depth=args.dec,
54
+ top_p=args.tp, top_k=args.tk,
55
+ customized_flash_attn=args.flash, fused_mlp=args.fuse, fused_norm=args.fused_norm,
56
+ checkpointing=args.enable_checkpointing,
57
+ pad_to_multiplier=args.pad_to_multiplier,
58
+ use_flex_attn=args.use_flex_attn,
59
+ batch_size=args.batch_size,
60
+ add_lvl_embeding_only_first_block=args.add_lvl_embeding_only_first_block,
61
+ use_bit_label=args.use_bit_label,
62
+ rope2d_each_sa_layer=args.rope2d_each_sa_layer,
63
+ rope2d_normalized_by_hw=args.rope2d_normalized_by_hw,
64
+ pn=args.pn,
65
+ train_h_div_w_list=args.train_h_div_w_list,
66
+ always_training_scales=args.always_training_scales,
67
+ apply_spatial_patchify=args.apply_spatial_patchify,
68
+ )
69
+ if args.dp >= 0: gpt_kw['drop_path_rate'] = args.dp
70
+ if args.hd > 0: gpt_kw['num_heads'] = args.hd
71
+
72
+ print(f'[create gpt_wo_ddp] constructor kw={gpt_kw}\n')
73
+ gpt_kw['vae_local'] = vae_local
74
+
75
+ model_str = args.model.replace('vgpt', 'infinity') # legacy
76
+ print(f"{model_str=}")
77
+ if model_str.rsplit('c', maxsplit=1)[-1].isdecimal():
78
+ model_str, block_chunks = model_str.rsplit('c', maxsplit=1)
79
+ block_chunks = int(block_chunks)
80
+ else:
81
+ block_chunks = 1
82
+ gpt_kw['block_chunks'] = block_chunks
83
+
84
+ from infinity.models import Infinity
85
+ from timm.models import create_model
86
+ gpt_wo_ddp: Infinity = create_model(model_str, **gpt_kw)
87
+ if args.use_fsdp_model_ema:
88
+ gpt_wo_ddp_ema = get_ema_model(gpt_wo_ddp)
89
+ else:
90
+ gpt_wo_ddp_ema = None
91
+ gpt_wo_ddp = gpt_wo_ddp.to(device)
92
+
93
+ assert all(not p.requires_grad for p in vae_local.parameters())
94
+ assert all(p.requires_grad for n, p in gpt_wo_ddp.named_parameters())
95
+
96
+ return vae_local, gpt_wo_ddp, gpt_wo_ddp_ema
97
+
98
+
99
+ if __name__ == '__main__':
100
+ ld(sys.argv[1])  # NOTE: `ld` is not defined in this module
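
The `c<N>` suffix handling above encodes the number of block chunks in the model string. A self-contained sketch of just that convention (the model names here are hypothetical):

```python
def parse_block_chunks(model_str: str) -> tuple:
    # a trailing decimal after the last 'c' selects the block-chunk count
    if model_str.rsplit('c', maxsplit=1)[-1].isdecimal():
        name, chunks = model_str.rsplit('c', maxsplit=1)
        return name, int(chunks)
    return model_str, 1

assert parse_block_chunks('infinity_2bc8') == ('infinity_2b', 8)
assert parse_block_chunks('infinity_2b') == ('infinity_2b', 1)
```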
utils/lr_control.py ADDED
@@ -0,0 +1,148 @@
1
+ import math
2
+ from pprint import pformat
3
+ from typing import Tuple, List, Dict, Union
4
+
5
+ import torch.nn
6
+ import infinity.utils.dist as dist
7
+
8
+
9
+ def lr_wd_annealing(sche_type: str, optimizer, peak_lr, wd, wd_end, cur_it, wp_it, max_it, wp0=0.005, wpe=0.001):
10
+ """Decay the learning rate with half-cycle cosine after warmup"""
11
+ wp_it = round(wp_it)
12
+
13
+ if cur_it < wp_it:
14
+ cur_lr = wp0 + (1-wp0) * cur_it / wp_it
15
+ else:
16
+ pasd = (cur_it - wp_it) / (max_it-1 - wp_it) # [0, 1]
17
+ rest = 1 - pasd # [1, 0]
18
+ if sche_type == 'cos':
19
+ cur_lr = wpe + (1-wpe) * (0.5 + 0.5 * math.cos(math.pi * pasd))
20
+ elif sche_type == 'lin':
21
+ T = 0.15; max_rest = 1-T
22
+ if pasd < T: cur_lr = 1
23
+ else: cur_lr = wpe + (1-wpe) * rest / max_rest # 1 to wpe
24
+ elif sche_type == 'lin0':
25
+ T = 0.05; max_rest = 1-T
26
+ if pasd < T: cur_lr = 1
27
+ else: cur_lr = wpe + (1-wpe) * rest / max_rest
28
+ elif sche_type == 'lin00':
29
+ cur_lr = wpe + (1-wpe) * rest
30
+ elif sche_type.startswith('lin'):
31
+ T = float(sche_type[3:]); max_rest = 1-T
32
+ wpe_mid = wpe + (1-wpe) * max_rest
33
+ wpe_mid = (1 + wpe_mid) / 2
34
+ if pasd < T: cur_lr = 1 + (wpe_mid-1) * pasd / T
35
+ else: cur_lr = wpe + (wpe_mid-wpe) * rest / max_rest
36
+ elif sche_type == 'exp':
37
+ T = 0.15; max_rest = 1-T
38
+ if pasd < T: cur_lr = 1
39
+ else:
40
+ expo = (pasd-T) / max_rest * math.log(wpe)
41
+ cur_lr = math.exp(expo)
42
+ else:
43
+ raise NotImplementedError(f'unknown sche_type {sche_type}')
44
+
45
+ cur_lr *= peak_lr
46
+ pasd = cur_it / (max_it-1)
47
+ cur_wd = wd_end + (wd - wd_end) * (0.5 + 0.5 * math.cos(math.pi * pasd))
48
+
49
+ inf = 1e6
50
+ min_lr, max_lr = inf, -1
51
+ min_wd, max_wd = inf, -1
52
+ for param_group in optimizer.param_groups:
53
+ param_group['lr'] = cur_lr * param_group.get('lr_sc', 1) # 'lr_sc' could be assigned
54
+ max_lr = max(max_lr, param_group['lr'])
55
+ min_lr = min(min_lr, param_group['lr'])
56
+
57
+ param_group['weight_decay'] = cur_wd * param_group.get('wd_sc', 1)
58
+ max_wd = max(max_wd, param_group['weight_decay'])
59
+ if param_group['weight_decay'] > 0:
60
+ min_wd = min(min_wd, param_group['weight_decay'])
61
+
62
+ if min_lr == inf: min_lr = -1
63
+ if min_wd == inf: min_wd = -1
64
+ return min_lr, max_lr, min_wd, max_wd
65
+
66
+
67
+ def filter_params(model, ndim_dict, nowd_keys=(), lr_scale=0.0) -> Tuple[
68
+ List[str], List[torch.nn.Parameter], List[Dict[str, Union[torch.nn.Parameter, float]]]
69
+ ]:
70
+ with_lr_scale = hasattr(model, 'get_layer_id_and_scale_exp') and 0 < lr_scale <= 1
71
+ print(f'[get_param_groups][lr decay] with_lr_scale={with_lr_scale}, lr_scale={lr_scale}')
72
+ para_groups, para_groups_dbg = {}, {}
73
+ names, paras = [], []
74
+ names_no_grad = []
75
+ count, numel = 0, 0
76
+ for name, para in model.named_parameters():
77
+ name = name.replace('_fsdp_wrapped_module.', '')
78
+ if not para.requires_grad:
79
+ names_no_grad.append(name)
80
+ continue # frozen weights
81
+ count += 1
82
+ numel += para.numel()
83
+ names.append(name)
84
+ paras.append(para)
85
+
86
+ if ndim_dict.get(name, 2) == 1 or name.endswith('bias') or any(k in name for k in nowd_keys):
87
+ cur_wd_sc, group_name = 0., 'ND'
88
+ # elif any(k in name for k in small_wd_keys):
89
+ # cur_wd_sc, group_name = small_wd, 'small_decay'
90
+ else:
91
+ cur_wd_sc, group_name = 1., 'D'
92
+
93
+ if with_lr_scale:
94
+ layer_id, scale_exp = model.get_layer_id_and_scale_exp(name)
95
+ group_name = f'layer{layer_id}_' + group_name
96
+ cur_lr_sc = lr_scale ** scale_exp
97
+ dbg = f'[layer {layer_id}][sc = {lr_scale} ** {scale_exp}]'
98
+ else:
99
+ cur_lr_sc = 1.
100
+ dbg = f'[no scale]'
101
+
102
+ if group_name not in para_groups:
103
+ para_groups[group_name] = {'params': [], 'wd_sc': cur_wd_sc, 'lr_sc': cur_lr_sc}
104
+ para_groups_dbg[group_name] = {'params': [], 'wd_sc': cur_wd_sc, 'lr_sc': dbg}
105
+ para_groups[group_name]['params'].append(para)
106
+ para_groups_dbg[group_name]['params'].append(name)
107
+
108
+ for g in para_groups_dbg.values():
109
+ g['params'] = pformat(', '.join(g['params']), width=200)
110
+
111
+ print(f'[get_param_groups] param_groups = \n{pformat(para_groups_dbg, indent=2, width=240)}\n')
112
+
113
+ for rk in range(dist.get_world_size()):
114
+ dist.barrier()
115
+ if dist.get_rank() == rk:
116
+ print(f'[get_param_groups][rank{dist.get_rank()}] {type(model).__name__=} {count=}, {numel=}', flush=True, force=True)
117
+ print('')
118
+
119
+ assert len(names_no_grad) == 0, f'[get_param_groups] names_no_grad = \n{pformat(names_no_grad, indent=2, width=240)}\n'
120
+ del ndim_dict
121
+ return names, paras, list(para_groups.values())
122
+
123
+
124
+ def plot():
125
+ import matplotlib.pyplot as plt
126
+ import torch.nn as nn
127
+ from torch.optim import SGD
128
+ # for sche in ('lin', 'lin0', 'lin00', 'lin0.5', 'lin0.75'):
129
+ for sche in ('lin0', ):
130
+ op = SGD(nn.Linear(3, 4).parameters(), lr=1e-3)
131
+ it, lr = [], []
132
+ iters = 500
133
+ wp_it, max_it = 1 * iters, 10 * iters
134
+ for cur_it in range(max_it):
135
+ it.append(cur_it)
136
+ lr.append(lr_wd_annealing(sche, op, 0.1, 1e-5, 1e-5, cur_it, wp_it, max_it, wpe=0.3)[0])
137
+
138
+ plt.figure()
139
+ plt.title(sche)
140
+ plt.plot(it, lr, 'b', label=sche)
141
+ plt.xlabel('it'), plt.ylabel('lr')
142
+ plt.legend()
143
+
144
+ plt.savefig('lr.jpg')
145
+
146
+
147
+ if __name__ == '__main__':
148
+ plot()
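
A dry run of `lr_wd_annealing` on a toy optimizer shows the warmup-then-cosine shape (all numbers here are illustrative, and the module's own imports are assumed to resolve):

```python
import torch.nn as nn
from torch.optim import SGD

opt = SGD(nn.Linear(3, 4).parameters(), lr=0.0)
for cur_it in (0, 50, 100, 550, 999):   # 100 warmup iters, 1000 total
    min_lr, max_lr, min_wd, max_wd = lr_wd_annealing(
        'cos', opt, peak_lr=0.1, wd=0.05, wd_end=0.0,
        cur_it=cur_it, wp_it=100, max_it=1000)
    print(f'it={cur_it:4d}  lr={max_lr:.5f}  wd={max_wd:.5f}')
```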
utils/misc.py ADDED
@@ -0,0 +1,397 @@
1
+ import datetime
2
+ import functools
3
+ import math
4
+ import os
5
+ import random
6
+ import subprocess
7
+ import sys
8
+ import threading
9
+ import time
10
+ from collections import defaultdict, deque
11
+ from typing import Iterator, List, Tuple
12
+
13
+ import numpy as np
14
+ import pytz
15
+ import torch
16
+ import torch.distributed as tdist
17
+ import torch.nn.functional as F
18
+
19
+ import infinity.utils.dist as dist
20
+
21
+ os_system = functools.partial(subprocess.call, shell=True)
22
+ def echo(info):
23
+ os_system(f'echo "[$(date "+%m-%d-%H:%M:%S")] ({os.path.basename(sys._getframe().f_back.f_code.co_filename)}, line{sys._getframe().f_back.f_lineno})=> {info}"')
24
+ def os_system_get_stdout(cmd):
25
+ return subprocess.run(cmd, shell=True, stdout=subprocess.PIPE).stdout.decode('utf-8')
26
+ def os_system_get_stdout_stderr(cmd):
27
+ cnt = 0
28
+ while True:
29
+ try:
30
+ sp = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=30)
31
+ except subprocess.TimeoutExpired:
32
+ cnt += 1
33
+ print(f'[fetch free_port file] timeout cnt={cnt}')
34
+ else:
35
+ return sp.stdout.decode('utf-8'), sp.stderr.decode('utf-8')
36
+
37
+
38
+ def is_pow2n(x):
39
+ return x > 0 and (x & (x - 1) == 0)
40
+
41
+
42
+ def time_str(fmt='[%m-%d %H:%M:%S]'):
43
+ return datetime.datetime.now(tz=pytz.timezone('Asia/Shanghai')).strftime(fmt)
44
+
45
+
46
+ class DistLogger(object):
47
+ def __init__(self, lg):
48
+ self._lg = lg
49
+
50
+ @staticmethod
51
+ def do_nothing(*args, **kwargs):
52
+ pass
53
+
54
+ def __getattr__(self, attr: str):
55
+ return getattr(self._lg, attr) if self._lg is not None else DistLogger.do_nothing
56
+
57
+ class TensorboardLogger(object):
58
+ def __init__(self, log_dir, filename_suffix):
59
+ try: import tensorflow_io as tfio
60
+ except: pass
61
+ from torch.utils.tensorboard import SummaryWriter
62
+ self.writer = SummaryWriter(log_dir=log_dir, filename_suffix=filename_suffix)
63
+ self.step = 0
64
+
65
+ def set_step(self, step=None):
66
+ if step is not None:
67
+ self.step = step
68
+ else:
69
+ self.step += 1
70
+
71
+ def loggable(self):
72
+ return self.step == 0 or (self.step + 1) % 500 == 0
73
+
74
+ def update(self, head='scalar', step=None, **kwargs):
75
+ if step is None:
76
+ step = self.step
77
+ if not self.loggable(): return
78
+ for k, v in kwargs.items():
79
+ if v is None: continue
80
+ if hasattr(v, 'item'): v = v.item()
81
+ self.writer.add_scalar(f'{head}/{k}', v, step)
82
+
83
+ def log_tensor_as_distri(self, tag, tensor1d, step=None):
84
+ if step is None:
85
+ step = self.step
86
+ if not self.loggable(): return
87
+ try:
88
+ self.writer.add_histogram(tag=tag, values=tensor1d, global_step=step)
89
+ except Exception as e:
90
+ print(f'[log_tensor_as_distri writer.add_histogram failed]: {e}')
91
+
92
+ def log_image(self, tag, img_chw, step=None):
93
+ if step is None:
94
+ step = self.step
95
+ if not self.loggable(): return
96
+ self.writer.add_image(tag, img_chw, step, dataformats='CHW')
97
+
98
+ def flush(self):
99
+ self.writer.flush()
100
+
101
+ def close(self):
102
+ self.writer.close()
103
+
104
+
105
+ class Low_GPU_usage(object):
106
+ def __init__(self, files, sleep_secs, verbose):
107
+ pass
108
+
109
+ def early_stop(self):
110
+ pass
111
+
112
+ def __enter__(self):
113
+ return self
114
+
115
+ def __exit__(self, exc_type, exc_val, exc_tb):
116
+ pass
117
+
118
+ class TouchingDaemonDontForgetToStartMe(threading.Thread):
119
+ def __init__(self, files: List[str], sleep_secs: int, verbose=False):
120
+ super().__init__(daemon=True)
121
+ self.files = tuple(files)
122
+ self.sleep_secs = sleep_secs
123
+ self.is_finished = False
124
+ self.verbose = verbose
125
+
126
+ f_back = sys._getframe().f_back
127
+ file_desc = f'{f_back.f_code.co_filename:24s}'[-24:]
128
+ self.print_prefix = f' ({file_desc}, line{f_back.f_lineno:-4d}) @daemon@ '
129
+
130
+ def finishing(self):
131
+ self.is_finished = True
132
+
133
+ def run(self) -> None:
134
+ kw = {}
135
+ if tdist.is_initialized(): kw['clean'] = True
136
+
137
+ stt = time.time()
138
+ if self.verbose: print(f'{time_str()}{self.print_prefix}[TouchingDaemon tid={threading.get_native_id()}] start touching {self.files} per {self.sleep_secs}s ...', **kw)
139
+ while not self.is_finished:
140
+ for f in self.files:
141
+ if os.path.exists(f):
142
+ try:
143
+ os.utime(f)
144
+ fp = open(f, 'a')
145
+ fp.close()
146
+ except: pass
147
+ time.sleep(self.sleep_secs)
148
+
149
+ if self.verbose: print(f'{time_str()}{self.print_prefix}[TouchingDaemon tid={threading.get_native_id()}] finish touching after {time.time()-stt:.1f} secs {self.files} per {self.sleep_secs}s. ', **kw)
150
+
151
+
152
+ class SmoothedValue(object):
153
+ """Track a series of values and provide access to smoothed values over a
154
+ window or the global series average.
155
+ """
156
+
157
+ def __init__(self, window_size=30, fmt=None):
158
+ if fmt is None:
159
+ fmt = "{median:.4f} ({global_avg:.4f})"
160
+ self.deque = deque(maxlen=window_size)
161
+ self.total = 0.0
162
+ self.count = 0
163
+ self.fmt = fmt
164
+
165
+ def update(self, value, n=1):
166
+ self.deque.append(value)
167
+ self.count += n
168
+ self.total += value * n
169
+
170
+ def synchronize_between_processes(self):
171
+ """
172
+ Warning: does not synchronize the deque!
173
+ """
174
+ t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
175
+ tdist.barrier()
176
+ tdist.all_reduce(t)
177
+ t = t.tolist()
178
+ self.count = int(t[0])
179
+ self.total = t[1]
180
+
181
+ @property
182
+ def median(self):
183
+ return np.median(self.deque) if len(self.deque) else 0
184
+
185
+ @property
186
+ def avg(self):
187
+ return sum(self.deque) / (len(self.deque) or 1)
188
+
189
+ @property
190
+ def global_avg(self):
191
+ return self.total / (self.count or 1)
192
+
193
+ @property
194
+ def max(self):
195
+ return max(self.deque) if len(self.deque) else 0
196
+
197
+ @property
198
+ def value(self):
199
+ return self.deque[-1] if len(self.deque) else 0
200
+
201
+ def time_preds(self, counts) -> Tuple[float, str, str]:
202
+ remain_secs = counts * self.median
203
+ return remain_secs, str(datetime.timedelta(seconds=round(remain_secs))), time.strftime("%Y-%m-%d %H:%M", time.localtime(time.time() + remain_secs))
204
+
205
+ def __str__(self):
206
+ return self.fmt.format(median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value)
207
+
208
+
209
+ class MetricLogger(object):
210
+ def __init__(self):
211
+ self.meters = defaultdict(SmoothedValue)
212
+ self.iter_end_t = time.time()
213
+ self.log_iters = set()
214
+ self.log_every_iter = False
215
+
216
+ def update(self, **kwargs):
217
+ # if it != 0 and it not in self.log_iters: return
218
+ for k, v in kwargs.items():
219
+ if v is None: continue
220
+ if hasattr(v, 'item'): v = v.item()
221
+ # assert isinstance(v, (float, int)), type(v)
222
+ self.meters[k].update(v)
223
+
224
+ def __getattr__(self, attr):
225
+ if attr in self.meters:
226
+ return self.meters[attr]
227
+ if attr in self.__dict__:
228
+ return self.__dict__[attr]
229
+ raise AttributeError("'{}' object has no attribute '{}'".format(
230
+ type(self).__name__, attr))
231
+
232
+ def __str__(self):
233
+ loss_str = []
234
+ for name, meter in self.meters.items():
235
+ if len(meter.deque):
236
+ loss_str.append(
237
+ "{}: {}".format(name, str(meter))
238
+ )
239
+ return ' '.join(loss_str)
240
+
241
+ def synchronize_between_processes(self):
242
+ for meter in self.meters.values():
243
+ meter.synchronize_between_processes()
244
+
245
+ def add_meter(self, name, meter):
246
+ self.meters[name] = meter
247
+
248
+ def log_every(self, start_it, max_iters, itrt, log_freq, log_every_iter=False, header=''):  # also handles logging and skipping of iterations before start_it
249
+ start_it = start_it % max_iters
250
+ self.log_iters = set(range(start_it, max_iters, log_freq))
251
+ self.log_iters.add(start_it)
252
+ self.log_iters.add(max_iters-1)
253
+ self.log_iters.add(max_iters)
254
+ self.log_every_iter = log_every_iter
255
+ self.iter_end_t = time.time()
256
+ self.iter_time = SmoothedValue(fmt='{value:.4f}')
257
+ self.data_time = SmoothedValue(fmt='{value:.3f}')
258
+ header_fmt = header + ': [{0:' + str(len(str(max_iters))) + 'd}/{1}]'
259
+
260
+ start_time = time.time()
261
+ if isinstance(itrt, Iterator) and not hasattr(itrt, 'preload') and not hasattr(itrt, 'set_epoch'):
262
+ for it in range(start_it, max_iters):
263
+ obj = next(itrt)
264
+ if it < start_it: continue
265
+ self.data_time.update(time.time() - self.iter_end_t)
266
+ yield it, obj
267
+ self.iter_time.update(time.time() - self.iter_end_t)
268
+ if self.log_every_iter or it in self.log_iters:
269
+ eta_seconds = self.iter_time.avg * (max_iters - it)
270
+ print(f'{header_fmt.format(it, max_iters)} eta: {str(datetime.timedelta(seconds=int(eta_seconds)))} {str(self)} T: {self.iter_time.value:.3f}s dataT: {self.data_time.value*1e3:.1f}ms', flush=True)
271
+ self.iter_end_t = time.time()
272
+ else:
273
+ if isinstance(itrt, int): itrt = range(itrt)
274
+ for it, obj in enumerate(itrt):
275
+ if it < start_it:
276
+ self.iter_end_t = time.time()
277
+ continue
278
+ self.data_time.update(time.time() - self.iter_end_t)
279
+ yield it, obj
280
+ self.iter_time.update(time.time() - self.iter_end_t)
281
+ if self.log_every_iter or it in self.log_iters:
282
+ eta_seconds = self.iter_time.avg * (max_iters - it)
283
+ print(f'{header_fmt.format(it, max_iters)} eta: {str(datetime.timedelta(seconds=int(eta_seconds)))} {str(self)} T: {self.iter_time.value:.3f}s dataT: {self.data_time.value*1e3:.1f}ms', flush=True)
284
+ self.iter_end_t = time.time()
285
+ cost = time.time() - start_time
286
+ cost_str = str(datetime.timedelta(seconds=int(cost)))
287
+ print(f'{header} Cost of this ep: {cost_str} ({cost / (max_iters-start_it):.3f} s / it)', flush=True)
288
+
289
+
290
+ class NullDDP(torch.nn.Module):
291
+ def __init__(self, module, *args, **kwargs):
292
+ super(NullDDP, self).__init__()
293
+ self.module = module
294
+ self.require_backward_grad_sync = False
295
+
296
+ def forward(self, *args, **kwargs):
297
+ return self.module(*args, **kwargs)
298
+
299
+
300
+ def build_2d_sincos_position_embedding(h, w, embed_dim, temperature=10000., sc=0, verbose=True): # (1, hw**2, embed_dim)
301
+ # DiT: sc=0
302
+ # DETR: sc=2?
303
+ grid_w = torch.arange(w, dtype=torch.float32)
304
+ grid_h = torch.arange(h, dtype=torch.float32)
305
+ grid_w, grid_h = torch.meshgrid([grid_w, grid_h], indexing='ij')
306
+ if sc == 0:
307
+ scale = 1
308
+ elif sc == 1:
309
+ scale = math.pi * 2 / w
310
+ else:
311
+ scale = 1 / w
312
+ grid_w = scale * grid_w.reshape(h*w, 1) # scale * [0, 0, 0, 1, 1, 1, 2, 2, 2]
313
+ grid_h = scale * grid_h.reshape(h*w, 1) # scale * [0, 1, 2, 0, 1, 2, 0, 1, 2]
314
+
315
+ assert embed_dim % 4 == 0, f'Embed dimension ({embed_dim}) must be divisible by 4 for 2D sin-cos position embedding!'
316
+ pos_dim = embed_dim // 4
317
+ omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
318
+ omega = (-math.log(temperature) * omega).exp()
319
+ # omega == (1/T) ** (arange(pos_dim) / pos_dim), a vector only dependent on C
320
+ out_w = grid_w * omega.view(1, pos_dim) # out_w: scale * [0*ome, 0*ome, 0*ome, 1*ome, 1*ome, 1*ome, 2*ome, 2*ome, 2*ome]
321
+ out_h = grid_h * omega.view(1, pos_dim) # out_h: scale * [0*ome, 1*ome, 2*ome, 0*ome, 1*ome, 2*ome, 0*ome, 1*ome, 2*ome]
322
+ pos_emb = torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)], dim=1)[None, :, :]
323
+ if verbose: print(f'[build_2d_sincos_position_embedding @ {h} x {w}] scale_type={sc}, temperature={temperature:g}, shape={pos_emb.shape}')
324
+ return pos_emb # (1, hw**2, embed_dim)
325
+
326
+
327
+ if __name__ == '__main__':
328
+ import seaborn as sns
329
+ import matplotlib.pyplot as plt
330
+ cmap_div = sns.color_palette('icefire', as_cmap=True)
331
+
332
+ scs = [0, 1, 2]
333
+ temps = [20, 50, 100, 1000]
334
+ reso = 3.0
335
+ RR, CC = len(scs), len(temps)
336
+ plt.figure(figsize=(CC * reso, RR * reso)) # figsize=(16, 16)
337
+ for row, sc in enumerate(scs):
338
+ for col, temp in enumerate(temps):
339
+ name = f'sc={sc}, T={temp}'
340
+ hw, C = 16, 512
341
+ N = hw*hw
342
+ pe = build_2d_sincos_position_embedding(hw, hw, C, temperature=temp, sc=sc, verbose=False)[0]  # (N, C) = (hw*hw, C)
343
+
344
+ hw2 = 16
345
+ N2 = hw2*hw2
346
+ pe2 = build_2d_sincos_position_embedding(hw2, hw2, C, temperature=temp, sc=sc, verbose=False)[0]  # (N2, C) = (hw2*hw2, C)
347
+ # pe2 = pe2.flip(dims=(0,))
348
+ bchw, bchw2 = F.normalize(pe.view(hw, hw, C).permute(2, 0, 1).unsqueeze(0), dim=1), F.normalize(pe2.view(hw2, hw2, C).permute(2, 0, 1).unsqueeze(0), dim=1)
349
+ dis = [
350
+ f'{F.mse_loss(bchw, F.interpolate(bchw2, size=bchw.shape[-2], mode=inter)).item():.3f}'
351
+ for inter in ('bilinear', 'bicubic', 'nearest')
352
+ ]
353
+ dis += [
354
+ f'{F.mse_loss(F.interpolate(bchw, size=bchw2.shape[-2], mode=inter), bchw2).item():.3f}'
355
+ for inter in ('area', 'nearest')
356
+ ]
357
+ print(f'[{name:^20s}] dis: {dis}')
358
+ """
359
+ [ sc=0, T=20 ] dis: ['0.010', '0.011', '0.011', '0.009', '0.010']
360
+ [ sc=0, T=100 ] dis: ['0.007', '0.007', '0.007', '0.006', '0.007']
361
+ [ sc=0, T=1000 ] dis: ['0.005', '0.005', '0.005', '0.004', '0.005']
362
+ [ sc=0, T=10000 ] dis: ['0.004', '0.004', '0.004', '0.003', '0.004']
363
+ [ sc=1, T=20 ] dis: ['0.007', '0.008', '0.008', '0.007', '0.008']
364
+ [ sc=1, T=100 ] dis: ['0.005', '0.005', '0.005', '0.005', '0.005']
365
+ [ sc=1, T=1000 ] dis: ['0.003', '0.003', '0.003', '0.003', '0.003']
366
+ [ sc=1, T=10000 ] dis: ['0.003', '0.003', '0.003', '0.003', '0.003']
367
+ [ sc=2, T=20 ] dis: ['0.000', '0.000', '0.000', '0.000', '0.000']
368
+ [ sc=2, T=100 ] dis: ['0.000', '0.000', '0.000', '0.000', '0.000']
369
+ [ sc=2, T=1000 ] dis: ['0.000', '0.000', '0.000', '0.000', '0.000']
370
+ [ sc=2, T=10000 ] dis: ['0.000', '0.000', '0.000', '0.000', '0.000']
371
+ Process finished with exit code 0
372
+ """
373
+
374
+ pe = torch.from_numpy(cmap_div(pe.T.numpy())[:, :, :3]) # C, N, 3
375
+ tar_h, tar_w = 1024, 1024
376
+ pe = pe.repeat_interleave(tar_w//pe.shape[0], dim=0).repeat_interleave(tar_h//pe.shape[1], dim=1)
377
+ plt.subplot(RR, CC, 1+row*CC+col)
378
+ plt.title(name)
379
+ plt.xlabel('hxw'), plt.ylabel('C')
380
+ plt.xticks([]), plt.yticks([])
381
+ plt.imshow(pe.mul(255).round().clamp(0, 255).byte().numpy())
382
+ plt.tight_layout(h_pad=0.02)
383
+ plt.show()
384
+
385
+
386
+ def check_randomness(args):
387
+ U = 16384
388
+ t = torch.zeros(dist.get_world_size(), 4, dtype=torch.float32, device=args.device)
389
+ t0 = torch.zeros(1, dtype=torch.float32, device=args.device).random_(U)
390
+ t[dist.get_rank(), 0] = float(random.randrange(U))
391
+ t[dist.get_rank(), 1] = float(np.random.randint(U))
392
+ t[dist.get_rank(), 2] = float(torch.randint(0, U, (1,))[0])
393
+ t[dist.get_rank(), 3] = float(t0[0])
394
+ dist.allreduce(t)
395
+ for rk in range(1, dist.get_world_size()):
396
+ assert torch.allclose(t[rk - 1], t[rk]), f't={t}'
397
+ del t0, t, U
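
A single-process sketch of `MetricLogger`, assuming the module's own imports resolve (it pulls in `infinity.utils.dist`):

```python
logger = MetricLogger()
for it, batch in logger.log_every(start_it=0, max_iters=100, itrt=range(100),
                                  log_freq=20, header='toy'):
    logger.update(loss=1.0 / (it + 1))   # any keyword becomes a SmoothedValue meter
print(str(logger))                       # "loss: <median> (<global_avg>)"
```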
utils/save_and_load.py ADDED
@@ -0,0 +1,150 @@
1
+ import gc
2
+ import os
3
+ import subprocess
4
+ import time
5
+ import re
6
+ from typing import List, Optional, Tuple
7
+
8
+ import torch
9
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
10
+
11
+ import glob
12
+ import shutil
13
+ from infinity.utils import arg_util
14
+ import infinity.utils.dist as dist
15
+
16
+
17
+ def glob_with_epoch_iter(pattern, recursive=False):
18
+ def extract_ep_iter(filename):
19
+ match = re.search(r'ep(\d+)-iter(\d+)', filename)
20
+ if match:
21
+ ep = int(match.group(1))
22
+ iter_idx = int(match.group(2))
23
+ return ep, iter_idx
24
+ return 0, 0
25
+ return sorted(glob.glob(pattern, recursive=recursive), key=lambda x: extract_ep_iter(os.path.basename(x)), reverse=True)
26
+
27
+
28
+ def glob_with_global_step(pattern, recursive=False):
29
+ def extract_ep_iter(filename):
30
+ match = re.search(r'global_step_(\d+)', filename)
31
+ if match:
32
+ iter_idx = int(match.group(1))
33
+ return iter_idx
34
+ return 0
35
+ return sorted(glob.glob(pattern, recursive=recursive), key=lambda x: extract_ep_iter(os.path.basename(x)), reverse=True)
36
+
37
+
38
+ class CKPTSaver(object):
39
+ def __init__(self, is_master: bool, eval_milestone: List[Tuple[float, float]]):
40
+ self.is_master = is_master
41
+ self.time_stamp = torch.tensor([time.time() - 1e5, time.time()], device=dist.get_device())
42
+ self.sp_also: subprocess.Popen = None
43
+ self.sp_best: subprocess.Popen = None
44
+ self.sp_backup: subprocess.Popen = None
45
+ self.acc_str, self.eval_milestone = '[no acc str]', eval_milestone
46
+
47
+ def sav(
48
+ self, args: arg_util.Args, g_it: int, next_ep: int, next_it: int, trainer,
49
+ acc_str: Optional[str] = None, eval_milestone: Optional[List[Tuple[float, float]]] = None,
50
+ also_save_to: str = None, best_save_to: str = None,
51
+ ):
52
+ self.time_stamp[1] = time.time()
53
+ dist.broadcast(self.time_stamp, src_rank=0)
54
+ last_save_time, cur_time = self.time_stamp.cpu().tolist()
55
+
56
+ auto_save = cur_time - last_save_time > 20 * 60
57
+ need_save = also_save_to is not None or best_save_to is not None or next_ep == args.ep or auto_save
58
+ if not need_save:
59
+ return
60
+
61
+ if acc_str is not None: self.acc_str = acc_str
62
+ if eval_milestone is not None: self.eval_milestone = eval_milestone
63
+
64
+ fname = f'ar-ckpt-giter{g_it//1000:03d}K-ep{next_ep}-iter{next_it}-last.pth' if args.gpt_training else f'ckpt-last.pth'
65
+ local_out_ckpt = os.path.join(args.local_out_path, fname)
66
+
67
+ # NOTE: all rank should call this state_dict(), not master only!
68
+ trainer_state = trainer.state_dict()
69
+
70
+ if self.is_master:
71
+ stt = time.time()
72
+ torch.save({
73
+ 'args': args.state_dict(),
74
+ 'gpt_training': args.gpt_training,
75
+ 'arch': args.model if args.gpt_training else args.vv,
76
+ 'epoch': next_ep,
77
+ 'iter': next_it,
78
+ 'trainer': trainer_state,
79
+ 'acc_str': self.acc_str,
80
+ 'milestones': self.eval_milestone,
81
+ }, local_out_ckpt)
82
+
83
+ print(f'[CKPTSaver][rank00] start: {also_save_to=} {best_save_to=} {(next_ep == args.ep)=} {auto_save=} | see {local_out_ckpt}', flush=True)
84
+ print(f'[CKPTSaver][rank00] dbg: {args.bed=}', flush=True)
85
+ if auto_save:
86
+ if self.sp_backup is not None:
87
+ self.sp_backup.wait(timeout=300); self.sp_backup.kill(); self.sp_backup.communicate()
88
+ self.time_stamp[0] = time.time()
89
+
90
+ def auto_sync(source_filename, target_filename):
91
+ cmd = f'cp -r {source_filename} {target_filename}'
92
+ self.sp_backup = subprocess.Popen(cmd, shell=True, bufsize=-1)
93
+ print(f'[CKPTSaver] auto_save cmd: {cmd}', flush=True)
94
+
95
+ local_files = glob.glob(f"{args.local_out_path}/*")
96
+ for filename in local_files:
97
+ basename = os.path.basename(filename)
98
+ target_filename = f'{args.bed}/{basename}'
99
+ if basename.endswith('.pth'):
100
+ if not os.path.isfile(target_filename):
101
+ auto_sync(filename, target_filename)
102
+ else:
103
+ auto_sync(filename, target_filename)
104
+ cost = time.time() - stt
105
+ print(f'[CKPTSaver][rank00] cost: {cost:.2f}s', flush=True)
106
+
107
+ del trainer_state
108
+ time.sleep(3), gc.collect(), torch.cuda.empty_cache(), time.sleep(3)
109
+ dist.barrier()
110
+
111
+
112
+ def auto_resume(args: arg_util.Args, pattern='ckpt*.pth') -> Tuple[List[str], int, int, str, List[Tuple[float, float]], dict, dict]:
113
+ info = []
114
+ resume = ''
115
+ if args.auto_resume:
116
+ for dd in (args.local_out_path, args.bed):
117
+ all_ckpt = glob_with_epoch_iter(os.path.join(dd, pattern))
118
+ if len(all_ckpt): break
119
+ if len(all_ckpt) == 0:
120
+ info.append(f'[auto_resume] no ckpt found @ {pattern}')
121
+ info.append(f'[auto_resume quit]')
122
+ else:
123
+ resume = all_ckpt[0]
124
+ info.append(f'[auto_resume] auto load from @ {resume} ...')
125
+ else:
126
+ info.append(f'[auto_resume] disabled')
127
+ info.append(f'[auto_resume quit]')
128
+
129
+ if len(resume) == 0:
130
+ return info, 0, 0, '[no acc str]', [], {}, {}
131
+
132
+ print(f'auto resume from {resume}')
133
+
134
+ try:
135
+ ckpt = torch.load(resume, map_location='cpu')
136
+ except Exception as e:
137
+ info.append(f'[auto_resume] failed, {e} @ {resume}')
138
+ if len(all_ckpt) < 2:
139
+ return info, 0, 0, '[no acc str]', [], {}, {}
140
+ try: # another chance to load from bytenas
141
+ ckpt = torch.load(all_ckpt[1], map_location='cpu')
142
+ except Exception as e:
143
+ info.append(f'[auto_resume] failed, {e} @ {all_ckpt[1]}')
144
+ return info, 0, 0, '[no acc str]', [], {}, {}
145
+
146
+ dist.barrier()
147
+ ep, it = ckpt['epoch'], ckpt['iter']
148
+ eval_milestone = ckpt.get('milestones', [])
149
+ info.append(f'[auto_resume success] resume from ep{ep}, it{it}, eval_milestone: {eval_milestone}')
150
+ return info, ep, it, ckpt.get('acc_str', '[no acc str]'), eval_milestone, ckpt['trainer'], ckpt['args']
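
`auto_resume` takes the first element of `glob_with_epoch_iter`, i.e. the checkpoint with the highest `(ep, iter)` pair. The sort key behaves like this self-contained equivalent (filenames hypothetical):

```python
import re

def ep_iter_key(fn: str):
    m = re.search(r'ep(\d+)-iter(\d+)', fn)
    return (int(m.group(1)), int(m.group(2))) if m else (0, 0)

names = ['ar-ckpt-ep2-iter999.pth', 'ar-ckpt-ep10-iter50.pth', 'ar-ckpt-ep2-iter100.pth']
assert sorted(names, key=ep_iter_key, reverse=True)[0] == 'ar-ckpt-ep10-iter50.pth'
```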
utils/wandb_utils.py ADDED
@@ -0,0 +1,55 @@
1
+ import wandb
2
+ import torch
3
+ from torchvision.utils import make_grid
4
+ import torch.distributed as dist
5
+ from PIL import Image
6
+ import os
7
+ import argparse
8
+ import hashlib
9
+ import math
10
+
11
+
12
+ def is_main_process():
13
+ return dist.get_rank() == 0
14
+
15
+ def namespace_to_dict(namespace):
16
+ return {
17
+ k: namespace_to_dict(v) if isinstance(v, argparse.Namespace) else v
18
+ for k, v in vars(namespace).items()
19
+ }
20
+
21
+
22
+ def generate_run_id(exp_name):
23
+ # https://stackoverflow.com/questions/16008670/how-to-hash-a-string-into-8-digits
24
+ return str(int(hashlib.sha256(exp_name.encode('utf-8')).hexdigest(), 16) % 10 ** 8)
25
+
26
+
27
+ def initialize(args, entity, exp_name, project_name):
28
+ config_dict = namespace_to_dict(args)
29
+ wandb.login(key=os.environ["WANDB_KEY"])
30
+ wandb.init(
31
+ entity=entity,
32
+ project=project_name,
33
+ name=exp_name,
34
+ config=config_dict,
35
+ id=generate_run_id(exp_name),
36
+ resume="allow",
37
+ )
38
+
39
+
40
+ def log(stats, step=None):
41
+ if is_main_process():
42
+ wandb.log({k: v for k, v in stats.items()}, step=step)
43
+
44
+
45
+ def log_image(name, sample, step=None):
46
+ if is_main_process():
47
+ sample = array2grid(sample)
48
+ wandb.log({f"{name}": wandb.Image(sample), "train_step": step})
49
+
50
+
51
+ def array2grid(x):
52
+ nrow = round(math.sqrt(x.size(0)))
53
+ x = make_grid(x, nrow=nrow, normalize=True, value_range=(-1,1))
54
+ x = x.mul(255).add_(0.5).clamp_(0,255).permute(1,2,0).to('cpu', torch.uint8).numpy()
55
+ return x
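
A hypothetical logging flow: `initialize` expects a `WANDB_KEY` environment variable, and `log`/`log_image` assume `torch.distributed` is already initialized, since they gate on rank 0. The entity and project names below are placeholders:

```python
import argparse

args = argparse.Namespace(lr=1e-4, bs=256)
initialize(args, entity='my-team', exp_name='infinity-debug', project_name='infinity')
log({'loss': 0.42}, step=1)   # only rank 0 actually sends to wandb
```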