diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..da22ac41ca7b479d9b6947186152a7ae64de8d7c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+/.idea
+/.vscode
+**/checkpoint*/
+**/__pycache__/
+**/generated*/
+**/wandb/
+**/full_model.pth
+/rubbish
+**/*cache*
+/workspace/classinput/Qwen25llm/
diff --git a/README.md b/README.md
index ab1026189d4daa3c13229a3ffce8d7b35a755eb3..1c58d1e2760d01dd4ea0b74b5817e1c3c995390e 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,79 @@
----
-license: unknown
----
+# Recurrent Parameter Generation
+The official repository of the paper [Recurrent Diffusion for Large-Scale Parameter Generation]().
+
+
+## Introduction
+Parameter generation has long struggled to scale, significantly limiting its applications.
+In this study, we introduce Recurrent diffusion for large-scale Parameter Generation, or RPG,
+which models large-scale parameter generation through a recurrent diffusion process.
+We divide the trained parameters into non-overlapping parts and propose a recurrent model to learn their relationships.
+The outputs of this recurrent model, serving as conditions, are then fed into a diffusion model to generate neural network parameters.
+Using only a single GPU, our method can generate parameters for popular vision and language models, such as ConvNeXt-L and LoRA parameters for LLaMA-7B.
+Across various architectures and tasks, the generated parameters consistently achieve performance comparable to that of trained networks.
+Additionally, our approach demonstrates potential in generating models capable of handling unseen tasks,
+indicating that recurrent diffusion greatly enhances the practicality of parameter generation.
+
+
+
+
+
+
+
+
+## Environment
+Before you get started, you need to set up a conda environment.
+1. Create your conda environment.
+```shell
+conda create -n rpg python=3.11
+conda activate rpg
+conda install pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 pytorch-cuda=12.1 -c pytorch -c nvidia
+```
+2. Install mamba-ssm. (If you run into compilation issues, refer to the [official mamba-ssm repository](https://github.com/state-spaces/mamba) for details.)
+```shell
+pip install mamba-ssm[causal-conv1d]
+pip install causal-conv1d
+```
+3. Install the other dependencies for this repository.
+```shell
+git lfs install
+git clone https://huggingface.co./MTDoven/Recurrent-Parameter-Generation
+cd Recurrent-Parameter-Generation
+pip install -r requirements.txt
+```
+
+
+
+## Quick Start
+1. Modify your config file.
+```shell
+# Set up your configs interactively.
+python ./workspace/set_configs.py
+```
+
+2. Download the checkpoint datasets.
+```shell
+# Download the ViTTiny1022 dataset to /path/to/your/download/ViTTiny1022
+mv /path/to/your/download/ViTTiny1022/* ./dataset/condition_classinput_vittiny/
+```
+
+3. Generate with the RPG model.
+```shell
+cd ./workspace
+CUDA_VISIBLE_DEVICES=0 python ./classinput/launch.py
+# CUDA_VISIBLE_DEVICES= python ./classinput/launch.py
+```
+
+You can find more information on [GitHub](https://github.com/NUS-HPC-AI-Lab/Recurrent-Parameter-Generation).
+
+
+
+
+## Acknowledgment
+coming soon...
+
+
+## Citation
+coming soon...
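The introduction above describes the RPG pipeline only in prose, so here is a minimal PyTorch sketch of the recurrent-diffusion idea for illustration. The tokenization helper, the GRU stand-in for the recurrent backbone (the environment installs `mamba-ssm`, so the real backbone is presumably an SSM), the toy MLP denoiser, and all sizes are assumptions rather than the repository's implementation; the actual generation entry point is `./workspace/classinput/launch.py` from the Quick Start.

```python
import torch
import torch.nn as nn


class RecurrentConditioner(nn.Module):
    """Reads parameter tokens in order and emits one condition vector per token."""
    def __init__(self, token_dim, cond_dim):
        super().__init__()
        self.proj = nn.Linear(token_dim, cond_dim)
        self.rnn = nn.GRU(cond_dim, cond_dim, batch_first=True)  # stand-in for the recurrent backbone

    def forward(self, tokens):                      # tokens: [batch, num_tokens, token_dim]
        cond, _ = self.rnn(self.proj(tokens))
        return cond                                 # [batch, num_tokens, cond_dim]


class TokenDenoiser(nn.Module):
    """Predicts the noise added to each parameter token, conditioned on the recurrent output."""
    def __init__(self, token_dim, cond_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(token_dim + cond_dim + 1, 256), nn.SiLU(),
            nn.Linear(256, token_dim),
        )

    def forward(self, noisy_tokens, cond, t):
        t = t.view(-1, 1, 1).expand(-1, noisy_tokens.size(1), 1)
        return self.net(torch.cat([noisy_tokens, cond, t], dim=-1))


def flatten_to_tokens(state_dict, token_dim):
    """Divide trained parameters into non-overlapping, fixed-size parts ("tokens")."""
    flat = torch.cat([v.flatten() for v in state_dict.values()])
    pad = (-flat.numel()) % token_dim               # zero-pad so the vector splits evenly
    flat = torch.cat([flat, flat.new_zeros(pad)])
    return flat.view(1, -1, token_dim)              # [1, num_tokens, token_dim]


# Toy walk-through of one denoising training step on a random "checkpoint".
token_dim, cond_dim = 64, 32
fake_ckpt = {"w": torch.randn(130, 7), "b": torch.randn(19)}
tokens = flatten_to_tokens(fake_ckpt, token_dim)

conditioner = RecurrentConditioner(token_dim, cond_dim)
denoiser = TokenDenoiser(token_dim, cond_dim)

t = torch.rand(tokens.size(0))                      # diffusion time in [0, 1)
noise = torch.randn_like(tokens)
noisy = torch.sqrt(1 - t).view(-1, 1, 1) * tokens + torch.sqrt(t).view(-1, 1, 1) * noise
cond = conditioner(tokens)                          # recurrent pass over the parameter tokens
loss = nn.functional.mse_loss(denoiser(noisy, cond, t), noise)
print(tokens.shape, cond.shape, loss.item())
```

At generation time, the same recurrent conditions would drive iterative denoising from pure noise and the resulting tokens would be reshaped back into a state dict; the per-task scripts under `dataset/` in this diff produce the checkpoint collections that such a model is trained on.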
+ diff --git a/checkpoint/generalization.pth b/checkpoint/generalization.pth new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dataset/__init__.py b/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c393c3937c55da45a48e1e80ff79cc4b205719cf --- /dev/null +++ b/dataset/__init__.py @@ -0,0 +1 @@ +from .register import * \ No newline at end of file diff --git a/dataset/cifar100_resnet18bn/model.py b/dataset/cifar100_resnet18bn/model.py new file mode 100644 index 0000000000000000000000000000000000000000..9396a42825b91e47d606462da56582937b434f35 --- /dev/null +++ b/dataset/cifar100_resnet18bn/model.py @@ -0,0 +1,27 @@ +import torch.nn as nn +import torch +import timm +import os + + +def Model(): + model = timm.create_model("resnet18", pretrained=True) + model.fc = nn.Linear(512, 100) + if os.path.exists(os.path.join(os.path.dirname(__file__), "full_model.pth")): + model.load_state_dict(torch.load(os.path.join(os.path.dirname(__file__), "full_model.pth"), map_location="cpu")) + for k, v in model.named_parameters(): + if k in ["layer4.1.bn1.weight", "layer4.1.bn1.bias", "layer4.1.bn2.weight", "layer4.1.bn2.bias"]: + v.requires_grad = True + else: # requires_grad = False + v.requires_grad = False + return model, model.fc + + +if __name__ == "__main__": + model, _ = Model() + print(model) + num_param = 0 + for k, v in model.named_parameters(): + num_param += v.numel() + print(k) + print("num_param:", num_param) \ No newline at end of file diff --git a/dataset/cifar100_resnet18bn/prepare.py b/dataset/cifar100_resnet18bn/prepare.py new file mode 100644 index 0000000000000000000000000000000000000000..9bbd5c6f03c850d4594ae5d1a524338f925091ae --- /dev/null +++ b/dataset/cifar100_resnet18bn/prepare.py @@ -0,0 +1,192 @@ +# set global seed +import random +import numpy as np +import torch +seed = SEED = 20 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + + +try: # relative import + from model import Model +except ImportError: + from .model import Model + +# import +import torch.nn as nn +from torch import optim +from torch.optim import lr_scheduler +from torch.utils.data import DataLoader +import torchvision.transforms as transforms +from torchvision.datasets import CIFAR100 as Dataset +from tqdm.auto import tqdm +import os +import warnings +warnings.filterwarnings("ignore", category=UserWarning) + +# load additional config +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) + + + + +# config +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +config = { + "dataset_root": "from_additional_config", + "batch_size": 500 if __name__ == "__main__" else 200, + "num_workers": 32, + "learning_rate": 0.0005, + "weight_decay": 0.000005, + "epochs": 200, + "save_learning_rate": 0.0, + "total_save_number": 1, + "tag": os.path.basename(os.path.dirname(__file__)), +} +config.update(additional_config) + + + + +# Data +dataset = Dataset( + root=config["dataset_root"], + download=True, + train=True, + transform=transforms.Compose([ + transforms.Resize(80), + transforms.RandomHorizontalFlip(), + transforms.AutoAugment(policy=transforms.AutoAugmentPolicy("cifar10")), + transforms.ToTensor(), + 
transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ]) +) +train_loader = DataLoader( + dataset=dataset, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + pin_memory=True, +) +test_loader = DataLoader( + dataset=Dataset( + root=config["dataset_root"], + download=True, + train=False, + transform=transforms.Compose([ + transforms.Resize(80), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ])), + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=False, + pin_memory=True, +) + +# Model +model, head = Model() +model = model.to(device) +criterion = nn.CrossEntropyLoss() +pre_optimizer = optim.AdamW( + head.parameters(), + lr=0.001, + weight_decay=config["weight_decay"], +) +optimizer = optim.AdamW( + model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = lr_scheduler.CosineAnnealingLR( + optimizer, + T_max=config["epochs"], + eta_min=config["save_learning_rate"], +) + + + + +# Training +def train(model=model, optimizer=optimizer, scheduler=scheduler): + model.train() + for batch_idx, (inputs, targets) in tqdm(enumerate(train_loader), + total=len(dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + if scheduler is not None: + scheduler.step() + +# test +@torch.no_grad() +def test(model=model): + model.eval() + all_targets = [] + all_predicts = [] + test_loss = 0 + correct = 0 + total = 0 + for batch_idx, (inputs, targets) in tqdm(enumerate(test_loader), + total=len(test_loader.dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + # to logging losses + all_targets.extend(targets.flatten().tolist()) + test_loss += loss.item() + _, predicts = outputs.max(1) + all_predicts.extend(predicts.flatten().tolist()) + total += targets.size(0) + correct += predicts.eq(targets).sum().item() + loss = test_loss / (batch_idx + 1) + acc = correct / total + print(f"Loss: {loss:.4f} | Acc: {acc:.4f}") + model.train() + return loss, acc, all_targets, all_predicts + +# save train +def save_train(model=model, optimizer=optimizer): + model.train() + for batch_idx, (inputs, targets) in enumerate(train_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + # Save checkpoint + _, acc, _, _ = test(model=model) + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + save_state = {key: value.cpu().to(torch.float32) for key, value in model.state_dict().items()} + torch.save(save_state, f"full_model.pth") + print("save:", f"full_model.pth") + + + + +# main +if __name__ == '__main__': + test(model=model) + train(model=model, optimizer=pre_optimizer, scheduler=scheduler) + train(model=model, optimizer=pre_optimizer, scheduler=scheduler) + for epoch in range(config["epochs"]): + train(model=model, optimizer=optimizer, scheduler=scheduler) + test(model=model) + save_train(model=model, optimizer=optimizer) \ No newline 
at end of file diff --git a/dataset/cifar100_resnet18bn/test.py b/dataset/cifar100_resnet18bn/test.py new file mode 100644 index 0000000000000000000000000000000000000000..2abb8013e55dbb69016bcc94eeee1d8d4c9814db --- /dev/null +++ b/dataset/cifar100_resnet18bn/test.py @@ -0,0 +1,28 @@ +import os +import sys +if __name__ == "__main__": + from train import * +else: # relative import + from .train import * + + + + +try: + test_item = sys.argv[1] +except IndexError: + assert __name__ == "__main__" + test_item = "./checkpoint" +test_items = [] +if os.path.isdir(test_item): + for item in os.listdir(test_item): + item = os.path.join(test_item, item) + test_items.append(item) +elif os.path.isfile(test_item): + test_items.append(test_item) + + +for item in test_items: + state = torch.load(item, map_location="cpu") + model.load_state_dict({key: value.to(torch.float32).to(device) for key, value in state.items()}, strict=False) + loss, acc, all_targets, all_predicts = test(model=model) \ No newline at end of file diff --git a/dataset/cifar100_resnet18bn/train.py b/dataset/cifar100_resnet18bn/train.py new file mode 100644 index 0000000000000000000000000000000000000000..15fbe91378f9fe5885fece83141d7d61c41c8607 --- /dev/null +++ b/dataset/cifar100_resnet18bn/train.py @@ -0,0 +1,195 @@ +# set global seed +import random +import numpy as np +import torch +seed = SEED = 20 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + + +try: # relative import + from model import Model +except ImportError: + from .model import Model + +# import +import torch.nn as nn +from torch import optim +from torch.optim import lr_scheduler +from torch.utils.data import DataLoader +import torchvision.transforms as transforms +from torchvision.datasets import CIFAR100 as Dataset +from tqdm.auto import tqdm +import os +import warnings +warnings.filterwarnings("ignore", category=UserWarning) + +# load additional config +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) + + + + +# config +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +config = { + "dataset_root": "from_additional_config", + "batch_size": 100 if __name__ == "__main__" else 200, + "num_workers": 4, + "learning_rate": 0.01, + "weight_decay": 5e-6, + "epochs": 1, + "save_learning_rate": 0.01, + "total_save_number": 200, + "tag": os.path.basename(os.path.dirname(__file__)), +} +config.update(additional_config) + + + + +# Data +dataset = Dataset( + root=config["dataset_root"], + download=True, + train=True, + transform=transforms.Compose([ + transforms.Resize(80), + transforms.RandomHorizontalFlip(), + transforms.RandAugment(), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ]) +) +train_loader = DataLoader( + dataset=dataset, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + pin_memory=True, + persistent_workers=False, +) +test_loader = DataLoader( + dataset=Dataset( + root=config["dataset_root"], + download=True, + train=False, + transform=transforms.Compose([ + transforms.Resize(80), + transforms.CenterCrop(80), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ])), + batch_size=config["batch_size"], + 
num_workers=config["num_workers"], + shuffle=False, + pin_memory=True, + persistent_workers=False, + pin_memory_device="cuda", +) + +# Model +model, head = Model() +model = model.to(device) +criterion = nn.CrossEntropyLoss() +optimizer = optim.AdamW( + model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = lr_scheduler.CosineAnnealingLR( + optimizer, + T_max=config["epochs"], + eta_min=config["save_learning_rate"], +) + + + + +# Training +def train(model=model, optimizer=optimizer, scheduler=scheduler): + model.train() + for batch_idx, (inputs, targets) in tqdm(enumerate(train_loader), + total=len(dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + if scheduler is not None: + scheduler.step() + +# test +@torch.no_grad() +def test(model=model): + model.eval() + all_targets = [] + all_predicts = [] + test_loss = 0 + correct = 0 + total = 0 + for batch_idx, (inputs, targets) in tqdm(enumerate(test_loader), + total=len(test_loader.dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + # to logging losses + all_targets.extend(targets.flatten().tolist()) + test_loss += loss.item() + _, predicts = outputs.max(1) + all_predicts.extend(predicts.flatten().tolist()) + total += targets.size(0) + correct += predicts.eq(targets).sum().item() + loss = test_loss / (batch_idx + 1) + acc = correct / total + print(f"Loss: {loss:.4f} | Acc: {acc:.4f}\n") + model.train() + return loss, acc, all_targets, all_predicts + +# save train +def save_train(model=model, optimizer=optimizer): + model.train() + saved_number = 0 + for batch_idx, (inputs, targets) in enumerate(train_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + # Save checkpoint + if batch_idx % (len(dataset) // train_loader.batch_size // config["total_save_number"]) == 0: + _, acc, _, _ = test(model=model) + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + save_state = {key: value.cpu().to(torch.float32) for key, value in model.state_dict().items() \ + if key in ["layer4.1.bn1.weight", "layer4.1.bn1.bias", "layer4.1.bn2.weight", "layer4.1.bn2.bias"]} + torch.save(save_state, f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + print("save:", f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + saved_number += 1 + if saved_number >= config["total_save_number"]: + break + + + + +# main +if __name__ == '__main__': + test(model=model) + for epoch in range(config["epochs"]): + train(model=model, optimizer=optimizer, scheduler=scheduler) + test(model=model) + save_train(model=model, optimizer=optimizer) \ No newline at end of file diff --git a/dataset/cifar10_cnnmedium/model.py b/dataset/cifar10_cnnmedium/model.py new file mode 100644 index 0000000000000000000000000000000000000000..83b27f81382b6536a6e0cf67e459f19f3c14ec93 --- /dev/null +++ b/dataset/cifar10_cnnmedium/model.py @@ -0,0 +1,48 @@ +import torch 
+import torch.nn as nn +from torch.nn import functional as F +import timm + + +class CNNMedium(nn.Module): + def __init__(self): + super().__init__() + self.module = nn.Sequential( + nn.Conv2d(3, 16, 3), + nn.MaxPool2d(2, 2), + nn.LeakyReLU(), + nn.Conv2d(16, 32, 3), + nn.MaxPool2d(2, 2), + nn.LeakyReLU(), + nn.Conv2d(32, 15, 3), + nn.MaxPool2d(2, 2), + nn.LeakyReLU(), + nn.Flatten(start_dim=1), + ) + self.head = nn.Sequential( + nn.Linear(60, 20), + nn.LeakyReLU(), + nn.Linear(20, 10), + ) + + def forward(self, x): + x = self.module(x) + x = self.head(x) + return x + + +def Model(): + model = CNNMedium() + return model, model.head + + +if __name__ == "__main__": + model, _ = Model() + x = torch.ones([4, 3, 32, 32]) + y = model(x) + print(y.shape) + print(model) + num_param = 0 + for v in model.parameters(): + num_param += v.numel() + print("num_param:", num_param) diff --git a/dataset/cifar10_cnnmedium/test.py b/dataset/cifar10_cnnmedium/test.py new file mode 100644 index 0000000000000000000000000000000000000000..4dd304532ca5ea17bf916c2a3a7ec15a03051a82 --- /dev/null +++ b/dataset/cifar10_cnnmedium/test.py @@ -0,0 +1,28 @@ +import os +import sys +if __name__ == "__main__": + from train import * +else: # relative import + from .train import * + + + + +try: + test_item = sys.argv[1] +except IndexError: + assert __name__ == "__main__" + test_item = "./checkpoint" +test_items = [] +if os.path.isdir(test_item): + for item in os.listdir(test_item): + item = os.path.join(test_item, item) + test_items.append(item) +elif os.path.isfile(test_item): + test_items.append(test_item) + + +for item in test_items: + state = torch.load(item, map_location="cpu") + model.load_state_dict({key: value.to(torch.float32).to(device) for key, value in state.items()}) + loss, acc, all_targets, all_predicts = test(model=model) \ No newline at end of file diff --git a/dataset/cifar10_cnnmedium/train.py b/dataset/cifar10_cnnmedium/train.py new file mode 100644 index 0000000000000000000000000000000000000000..c9079f81e001b82c5604e1deeb78bb446170a05a --- /dev/null +++ b/dataset/cifar10_cnnmedium/train.py @@ -0,0 +1,192 @@ +# set global seed +import random +import numpy as np +import torch +seed = SEED = 20 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + + +try: # relative import + from model import Model +except ImportError: + from .model import Model + +# import +import torch.nn as nn +from torch import optim +from torch.optim import lr_scheduler +from torch.utils.data import DataLoader +import torchvision.transforms as transforms +from torchvision.datasets import CIFAR10 as Dataset +from tqdm.auto import tqdm +import os +import warnings +warnings.filterwarnings("ignore", category=UserWarning) + +# load additional config +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) + + + + +# config +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +config = { + "dataset_root": "from_additional_config", + "batch_size": 500 if __name__ == "__main__" else 200, + "num_workers": 32, + "learning_rate": 1e-2, + "weight_decay": 0.00666, + "epochs": 50, + "save_learning_rate": 1e-5, + "total_save_number": 50, + "tag": os.path.basename(os.path.dirname(__file__)), +} +config.update(additional_config) + + + + +# Data +dataset = Dataset( + 
root=config["dataset_root"], + download=True, + train=True, + transform=transforms.Compose([ + transforms.Resize(32), + transforms.RandomCrop(32), + transforms.RandomHorizontalFlip(), + transforms.AutoAugment(policy=transforms.AutoAugmentPolicy("cifar10")), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ]) +) +train_loader = DataLoader( + dataset=dataset, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + pin_memory=True, + persistent_workers=True, +) +test_loader = DataLoader( + dataset=Dataset( + root=config["dataset_root"], + download=True, + train=False, + transform=transforms.Compose([ + transforms.Resize(32), + transforms.CenterCrop(32), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ])), + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=False, + pin_memory=True, + persistent_workers=True, + pin_memory_device="cuda", +) + +# Model +model, head = Model() +model = model.to(device) +criterion = nn.CrossEntropyLoss() +optimizer = optim.SGD( + model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], + momentum=0.9, +) +scheduler = lr_scheduler.CosineAnnealingLR( + optimizer, + T_max=config["epochs"], + eta_min=config["save_learning_rate"], +) + + + + +# Training +def train(model=model, optimizer=optimizer, scheduler=scheduler): + model.train() + for batch_idx, (inputs, targets) in tqdm(enumerate(train_loader), + total=len(dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + if scheduler is not None: + scheduler.step() + +# test +@torch.no_grad() +def test(model=model): + model.eval() + all_targets = [] + all_predicts = [] + test_loss = 0 + correct = 0 + total = 0 + for batch_idx, (inputs, targets) in tqdm(enumerate(test_loader), + total=len(test_loader.dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + # to logging losses + all_targets.extend(targets.flatten().tolist()) + test_loss += loss.item() + _, predicts = outputs.max(1) + all_predicts.extend(predicts.flatten().tolist()) + total += targets.size(0) + correct += predicts.eq(targets).sum().item() + loss = test_loss / (batch_idx + 1) + acc = correct / total + print(f"Loss: {loss:.4f} | Acc: {acc:.4f}\n") + model.train() + return loss, acc, all_targets, all_predicts + +# save train +def save_train(model=model, optimizer=optimizer): + model.train() + for batch_idx, (inputs, targets) in enumerate(train_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + # Save checkpoint + if batch_idx % (len(dataset) // train_loader.batch_size // config["total_save_number"]) == 0: + _, acc, _, _ = test(model=model) + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + save_state = {key: value.cpu().to(torch.float32) for key, value in model.state_dict().items()} + torch.save(save_state, 
f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + print("save:", f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + + + + +# main +if __name__ == '__main__': + test(model=model) + for epoch in range(config["epochs"]): + train(model=model, optimizer=optimizer, scheduler=scheduler) + test(model=model) + save_train(model=model, optimizer=optimizer) \ No newline at end of file diff --git a/dataset/cifar10_cnnsmall/model.py b/dataset/cifar10_cnnsmall/model.py new file mode 100644 index 0000000000000000000000000000000000000000..0be423a3052e3616ba2a6427255c4467169aa8fc --- /dev/null +++ b/dataset/cifar10_cnnsmall/model.py @@ -0,0 +1,48 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F +import timm + + +class CNNSmall(nn.Module): + def __init__(self): + super().__init__() + self.module = nn.Sequential( + nn.Conv2d(3, 8, 5), + nn.MaxPool2d(2, 2), + nn.LeakyReLU(), + nn.Conv2d(8, 6, 5), + nn.MaxPool2d(2, 2), + nn.LeakyReLU(), + nn.Conv2d(6, 4, 2), + nn.LeakyReLU(), + nn.Flatten(start_dim=1), + ) + self.head = nn.Sequential( + nn.Linear(36, 20), + nn.LeakyReLU(), + nn.Linear(20, 10), + ) + + def forward(self, x): + x = F.interpolate(x, (28, 28), mode='bilinear') + x = self.module(x) + x = self.head(x) + return x + + +def Model(): + model = CNNSmall() + return model, model.head + + +if __name__ == "__main__": + model, _ = Model() + x = torch.ones([4, 3, 28, 28]) + y = model(x) + print(y.shape) + print(model) + num_param = 0 + for v in model.parameters(): + num_param += v.numel() + print("num_param:", num_param) diff --git a/dataset/cifar10_cnnsmall/test.py b/dataset/cifar10_cnnsmall/test.py new file mode 100644 index 0000000000000000000000000000000000000000..4dd304532ca5ea17bf916c2a3a7ec15a03051a82 --- /dev/null +++ b/dataset/cifar10_cnnsmall/test.py @@ -0,0 +1,28 @@ +import os +import sys +if __name__ == "__main__": + from train import * +else: # relative import + from .train import * + + + + +try: + test_item = sys.argv[1] +except IndexError: + assert __name__ == "__main__" + test_item = "./checkpoint" +test_items = [] +if os.path.isdir(test_item): + for item in os.listdir(test_item): + item = os.path.join(test_item, item) + test_items.append(item) +elif os.path.isfile(test_item): + test_items.append(test_item) + + +for item in test_items: + state = torch.load(item, map_location="cpu") + model.load_state_dict({key: value.to(torch.float32).to(device) for key, value in state.items()}) + loss, acc, all_targets, all_predicts = test(model=model) \ No newline at end of file diff --git a/dataset/cifar10_cnnsmall/train.py b/dataset/cifar10_cnnsmall/train.py new file mode 100644 index 0000000000000000000000000000000000000000..4a9274e1f884cdbc9775ce1385efa2dbfd26d8eb --- /dev/null +++ b/dataset/cifar10_cnnsmall/train.py @@ -0,0 +1,192 @@ +# set global seed +import random +import numpy as np +import torch +seed = SEED = 20 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + + +try: # relative import + from model import Model +except ImportError: + from .model import Model + +# import +import torch.nn as nn +from torch import optim +from torch.optim import lr_scheduler +from torch.utils.data import DataLoader +import torchvision.transforms as transforms +from torchvision.datasets import CIFAR10 as Dataset +from tqdm.auto import tqdm +import os 
+import warnings +warnings.filterwarnings("ignore", category=UserWarning) + +# load additional config +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) + + + + +# config +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +config = { + "dataset_root": "from_additional_config", + "batch_size": 500 if __name__ == "__main__" else 200, + "num_workers": 32, + "learning_rate": 1e-2, + "weight_decay": 0.001, + "epochs": 50, + "save_learning_rate": 1e-5, + "total_save_number": 50, + "tag": os.path.basename(os.path.dirname(__file__)), +} +config.update(additional_config) + + + + +# Data +dataset = Dataset( + root=config["dataset_root"], + download=True, + train=True, + transform=transforms.Compose([ + transforms.Resize(32), + transforms.RandomCrop(32), + transforms.RandomHorizontalFlip(), + transforms.AutoAugment(policy=transforms.AutoAugmentPolicy("cifar10")), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ]) +) +train_loader = DataLoader( + dataset=dataset, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + pin_memory=True, + persistent_workers=True, +) +test_loader = DataLoader( + dataset=Dataset( + root=config["dataset_root"], + download=True, + train=False, + transform=transforms.Compose([ + transforms.Resize(32), + transforms.CenterCrop(32), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ])), + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=False, + pin_memory=True, + persistent_workers=True, + pin_memory_device="cuda", +) + +# Model +model, head = Model() +model = model.to(device) +criterion = nn.CrossEntropyLoss() +optimizer = optim.SGD( + model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], + momentum=0.9, +) +scheduler = lr_scheduler.CosineAnnealingLR( + optimizer, + T_max=config["epochs"], + eta_min=config["save_learning_rate"], +) + + + + +# Training +def train(model=model, optimizer=optimizer, scheduler=scheduler): + model.train() + for batch_idx, (inputs, targets) in tqdm(enumerate(train_loader), + total=len(dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + if scheduler is not None: + scheduler.step() + +# test +@torch.no_grad() +def test(model=model): + model.eval() + all_targets = [] + all_predicts = [] + test_loss = 0 + correct = 0 + total = 0 + for batch_idx, (inputs, targets) in tqdm(enumerate(test_loader), + total=len(test_loader.dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + # to logging losses + all_targets.extend(targets.flatten().tolist()) + test_loss += loss.item() + _, predicts = outputs.max(1) + all_predicts.extend(predicts.flatten().tolist()) + total += targets.size(0) + correct += predicts.eq(targets).sum().item() + loss = test_loss / (batch_idx + 1) + acc = correct / total + print(f"Loss: {loss:.4f} | Acc: {acc:.4f}\n") + model.train() + return loss, acc, all_targets, all_predicts + +# save train +def 
save_train(model=model, optimizer=optimizer): + model.train() + for batch_idx, (inputs, targets) in enumerate(train_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + # Save checkpoint + if batch_idx % (len(dataset) // train_loader.batch_size // config["total_save_number"]) == 0: + _, acc, _, _ = test(model=model) + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + save_state = {key: value.cpu().to(torch.float32) for key, value in model.state_dict().items()} + torch.save(save_state, f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + print("save:", f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + + + + +# main +if __name__ == '__main__': + test(model=model) + for epoch in range(config["epochs"]): + train(model=model, optimizer=optimizer, scheduler=scheduler) + test(model=model) + save_train(model=model, optimizer=optimizer) \ No newline at end of file diff --git a/dataset/cifar10_mobilenetv3/model.py b/dataset/cifar10_mobilenetv3/model.py new file mode 100644 index 0000000000000000000000000000000000000000..246e1adf12820eac6958aeae1e867c2ff36376b8 --- /dev/null +++ b/dataset/cifar10_mobilenetv3/model.py @@ -0,0 +1,21 @@ +import torch.nn as nn +import timm + + +def Model(): + model = timm.create_model("mobilenetv3_large_100", pretrained=True) + model.classifier = nn.Linear(1280, 10) + for name, param in model.named_parameters(): + if "bn" in name: + # print(f"freeze {name}") + param.requires_grad = False + return model, model.classifier + + +if __name__ == "__main__": + model, _ = Model() + print(model) + num_param = 0 + for v in model.parameters(): + num_param += v.numel() + print("num_param:", num_param) diff --git a/dataset/cifar10_mobilenetv3/test.py b/dataset/cifar10_mobilenetv3/test.py new file mode 100644 index 0000000000000000000000000000000000000000..4dd304532ca5ea17bf916c2a3a7ec15a03051a82 --- /dev/null +++ b/dataset/cifar10_mobilenetv3/test.py @@ -0,0 +1,28 @@ +import os +import sys +if __name__ == "__main__": + from train import * +else: # relative import + from .train import * + + + + +try: + test_item = sys.argv[1] +except IndexError: + assert __name__ == "__main__" + test_item = "./checkpoint" +test_items = [] +if os.path.isdir(test_item): + for item in os.listdir(test_item): + item = os.path.join(test_item, item) + test_items.append(item) +elif os.path.isfile(test_item): + test_items.append(test_item) + + +for item in test_items: + state = torch.load(item, map_location="cpu") + model.load_state_dict({key: value.to(torch.float32).to(device) for key, value in state.items()}) + loss, acc, all_targets, all_predicts = test(model=model) \ No newline at end of file diff --git a/dataset/cifar10_mobilenetv3/train.py b/dataset/cifar10_mobilenetv3/train.py new file mode 100644 index 0000000000000000000000000000000000000000..8eea65f54eeeb9d3d834e7748e0355fc6a96c1bf --- /dev/null +++ b/dataset/cifar10_mobilenetv3/train.py @@ -0,0 +1,199 @@ +# set global seed +import random +import numpy as np +import torch +seed = SEED = 20 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + + +try: # relative import + from model import Model 
+except ImportError: + from .model import Model + +# import +import torch.nn as nn +from torch import optim +from torch.optim import lr_scheduler +from torch.utils.data import DataLoader +import torchvision.transforms as transforms +from torchvision.datasets import CIFAR10 as Dataset +from tqdm.auto import tqdm +import os +import warnings +warnings.filterwarnings("ignore", category=UserWarning) + +# load additional config +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) + + + + +# config +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +config = { + "dataset_root": "from_additional_config", + "batch_size": 500 if __name__ == "__main__" else 200, + "num_workers": 4, + "learning_rate": 3e-3, + "weight_decay": 0.1, + "epochs": 5, + "save_learning_rate": 1e-6, + "total_save_number": 50, + "tag": os.path.basename(os.path.dirname(__file__)), +} +config.update(additional_config) + + + + +# Data +dataset = Dataset( + root=config["dataset_root"], + download=True, + train=True, + transform=transforms.Compose([ + transforms.Resize(224), + transforms.RandomCrop(224), + transforms.RandomHorizontalFlip(), + transforms.AutoAugment(policy=transforms.AutoAugmentPolicy("cifar10")), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ]) +) +train_loader = DataLoader( + dataset=dataset, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + pin_memory=True, + persistent_workers=True, +) +test_loader = DataLoader( + dataset=Dataset( + root=config["dataset_root"], + download=True, + train=False, + transform=transforms.Compose([ + transforms.Resize(224), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ])), + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=False, + pin_memory=True, + persistent_workers=True, + pin_memory_device="cuda", +) + +# Model +model, head = Model() +model = model.to(device) +criterion = nn.CrossEntropyLoss() +pre_optimizer = optim.AdamW( + head.parameters(), + lr=0.05, + weight_decay=0.01, +) +optimizer = optim.AdamW( + model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = lr_scheduler.CosineAnnealingLR( + optimizer, + T_max=config["epochs"], + eta_min=config["save_learning_rate"], +) + + + + +# Training +def train(model=model, optimizer=optimizer, scheduler=scheduler): + model.train() + for batch_idx, (inputs, targets) in tqdm(enumerate(train_loader), + total=len(dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + if scheduler is not None: + scheduler.step() + +# test +@torch.no_grad() +def test(model=model): + model.eval() + all_targets = [] + all_predicts = [] + test_loss = 0 + correct = 0 + total = 0 + for batch_idx, (inputs, targets) in tqdm(enumerate(test_loader), + total=len(test_loader.dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + # to logging losses + 
all_targets.extend(targets.flatten().tolist()) + test_loss += loss.item() + _, predicts = outputs.max(1) + all_predicts.extend(predicts.flatten().tolist()) + total += targets.size(0) + correct += predicts.eq(targets).sum().item() + loss = test_loss / (batch_idx + 1) + acc = correct / total + print(f"Loss: {loss:.4f} | Acc: {acc:.4f}\n") + model.train() + return loss, acc, all_targets, all_predicts + +# save train +def save_train(model=model, optimizer=optimizer): + model.train() + for batch_idx, (inputs, targets) in enumerate(train_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + # Save checkpoint + if batch_idx % (len(dataset) // train_loader.batch_size // config["total_save_number"]) == 0: + _, acc, _, _ = test(model=model) + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + save_state = {key: value.cpu().to(torch.float32) for key, value in model.state_dict().items()} + torch.save(save_state, f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + print("save:", f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + + + + +# main +if __name__ == '__main__': + test(model=model) + for _ in range(1): + train(model=model, optimizer=pre_optimizer) + test(model=model) + for epoch in range(config["epochs"]): + train(model=model, optimizer=optimizer, scheduler=scheduler) + test(model=model) + save_train(model=model, optimizer=optimizer) \ No newline at end of file diff --git a/dataset/cifar10_resnet18/model.py b/dataset/cifar10_resnet18/model.py new file mode 100644 index 0000000000000000000000000000000000000000..1e8a86289e1efb4444d80e6bce990e977d8676ad --- /dev/null +++ b/dataset/cifar10_resnet18/model.py @@ -0,0 +1,17 @@ +import torch.nn as nn +import timm + + +def Model(): + model = timm.create_model("resnet18", pretrained=True) + model.fc = nn.Linear(512, 10) + return model, model.fc + + +if __name__ == "__main__": + model, _ = Model() + print(model) + num_param = 0 + for v in model.parameters(): + num_param += v.numel() + print("num_param:", num_param) diff --git a/dataset/cifar10_resnet18/test.py b/dataset/cifar10_resnet18/test.py new file mode 100644 index 0000000000000000000000000000000000000000..4dd304532ca5ea17bf916c2a3a7ec15a03051a82 --- /dev/null +++ b/dataset/cifar10_resnet18/test.py @@ -0,0 +1,28 @@ +import os +import sys +if __name__ == "__main__": + from train import * +else: # relative import + from .train import * + + + + +try: + test_item = sys.argv[1] +except IndexError: + assert __name__ == "__main__" + test_item = "./checkpoint" +test_items = [] +if os.path.isdir(test_item): + for item in os.listdir(test_item): + item = os.path.join(test_item, item) + test_items.append(item) +elif os.path.isfile(test_item): + test_items.append(test_item) + + +for item in test_items: + state = torch.load(item, map_location="cpu") + model.load_state_dict({key: value.to(torch.float32).to(device) for key, value in state.items()}) + loss, acc, all_targets, all_predicts = test(model=model) \ No newline at end of file diff --git a/dataset/cifar10_resnet18/train.py b/dataset/cifar10_resnet18/train.py new file mode 100644 index 0000000000000000000000000000000000000000..cbcfeb1684bfe28cb7eb942f45657dccdf30e45f --- /dev/null +++ b/dataset/cifar10_resnet18/train.py @@ -0,0 +1,191 @@ +# set global seed +import 
random +import numpy as np +import torch +seed = SEED = 20 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + + +try: # relative import + from model import Model +except ImportError: + from .model import Model + +# import +import torch.nn as nn +from torch import optim +from torch.optim import lr_scheduler +from torch.utils.data import DataLoader +import torchvision.transforms as transforms +from torchvision.datasets import CIFAR10 as Dataset +from tqdm.auto import tqdm +import os +import warnings +warnings.filterwarnings("ignore", category=UserWarning) + +# load additional config +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) + + + + +# config +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +config = { + "dataset_root": "from_additional_config", + "batch_size": 500 if __name__ == "__main__" else 200, + "num_workers": 32, + "learning_rate": 3e-3, + "weight_decay": 0.1, + "epochs": 50, + "save_learning_rate": 1e-5, + "total_save_number": 50, + "tag": os.path.basename(os.path.dirname(__file__)), +} +config.update(additional_config) + + + + +# Data +dataset = Dataset( + root=config["dataset_root"], + download=True, + train=True, + transform=transforms.Compose([ + transforms.Resize(64), + transforms.RandomCrop(64), + transforms.RandomHorizontalFlip(), + transforms.AutoAugment(policy=transforms.AutoAugmentPolicy("cifar10")), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ]) +) +train_loader = DataLoader( + dataset=dataset, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + pin_memory=True, + persistent_workers=True, +) +test_loader = DataLoader( + dataset=Dataset( + root=config["dataset_root"], + download=True, + train=False, + transform=transforms.Compose([ + transforms.Resize(64), + transforms.CenterCrop(64), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ])), + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=False, + pin_memory=True, + persistent_workers=True, + pin_memory_device="cuda", +) + +# Model +model, head = Model() +model = model.to(device) +criterion = nn.CrossEntropyLoss() +optimizer = optim.AdamW( + model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = lr_scheduler.CosineAnnealingLR( + optimizer, + T_max=config["epochs"], + eta_min=config["save_learning_rate"], +) + + + + +# Training +def train(model=model, optimizer=optimizer, scheduler=scheduler): + model.train() + for batch_idx, (inputs, targets) in tqdm(enumerate(train_loader), + total=len(dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + if scheduler is not None: + scheduler.step() + +# test +@torch.no_grad() +def test(model=model): + model.eval() + all_targets = [] + all_predicts = [] + test_loss = 0 + correct = 0 + total = 0 + for batch_idx, (inputs, targets) in tqdm(enumerate(test_loader), + total=len(test_loader.dataset) // config["batch_size"]): + 
inputs, targets = inputs.to(device), targets.to(device) + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + # to logging losses + all_targets.extend(targets.flatten().tolist()) + test_loss += loss.item() + _, predicts = outputs.max(1) + all_predicts.extend(predicts.flatten().tolist()) + total += targets.size(0) + correct += predicts.eq(targets).sum().item() + loss = test_loss / (batch_idx + 1) + acc = correct / total + print(f"Loss: {loss:.4f} | Acc: {acc:.4f}\n") + model.train() + return loss, acc, all_targets, all_predicts + +# save train +def save_train(model=model, optimizer=optimizer): + model.train() + for batch_idx, (inputs, targets) in enumerate(train_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + # Save checkpoint + if batch_idx % (len(dataset) // train_loader.batch_size // config["total_save_number"]) == 0: + _, acc, _, _ = test(model=model) + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + save_state = {key: value.cpu().to(torch.float32) for key, value in model.state_dict().items()} + torch.save(save_state, f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + print("save:", f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + + + + +# main +if __name__ == '__main__': + test(model=model) + for epoch in range(config["epochs"]): + train(model=model, optimizer=optimizer, scheduler=scheduler) + test(model=model) + save_train(model=model, optimizer=optimizer) \ No newline at end of file diff --git a/dataset/cifar10_vitbase/model.py b/dataset/cifar10_vitbase/model.py new file mode 100644 index 0000000000000000000000000000000000000000..0a6250ca9de58f26c2e7c73e070c39dd80d23555 --- /dev/null +++ b/dataset/cifar10_vitbase/model.py @@ -0,0 +1,17 @@ +import torch.nn as nn +import timm + + +def Model(): + model = timm.create_model("vit_base_patch16_224", pretrained=True) + model.head = nn.Linear(768, 10) + return model, model.head + + +if __name__ == "__main__": + model, _ = Model() + print(model) + num_param = 0 + for v in model.parameters(): + num_param += v.numel() + print("num_param:", num_param) diff --git a/dataset/cifar10_vitbase/test.py b/dataset/cifar10_vitbase/test.py new file mode 100644 index 0000000000000000000000000000000000000000..4dd304532ca5ea17bf916c2a3a7ec15a03051a82 --- /dev/null +++ b/dataset/cifar10_vitbase/test.py @@ -0,0 +1,28 @@ +import os +import sys +if __name__ == "__main__": + from train import * +else: # relative import + from .train import * + + + + +try: + test_item = sys.argv[1] +except IndexError: + assert __name__ == "__main__" + test_item = "./checkpoint" +test_items = [] +if os.path.isdir(test_item): + for item in os.listdir(test_item): + item = os.path.join(test_item, item) + test_items.append(item) +elif os.path.isfile(test_item): + test_items.append(test_item) + + +for item in test_items: + state = torch.load(item, map_location="cpu") + model.load_state_dict({key: value.to(torch.float32).to(device) for key, value in state.items()}) + loss, acc, all_targets, all_predicts = test(model=model) \ No newline at end of file diff --git a/dataset/cifar10_vitbase/train.py b/dataset/cifar10_vitbase/train.py new file mode 100644 index 
0000000000000000000000000000000000000000..5ebebdbad7361a7beaa6bee4d98c4a944d51da98 --- /dev/null +++ b/dataset/cifar10_vitbase/train.py @@ -0,0 +1,199 @@ +# set global seed +import random +import numpy as np +import torch +seed = SEED = 20 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + + +try: # relative import + from model import Model +except ImportError: + from .model import Model + +# import +import torch.nn as nn +from torch import optim +from torch.optim import lr_scheduler +from torch.utils.data import DataLoader +import torchvision.transforms as transforms +from torchvision.datasets import CIFAR10 as Dataset +from tqdm.auto import tqdm +import os +import warnings +warnings.filterwarnings("ignore", category=UserWarning) + +# load additional config +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) + + + + +# config +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +config = { + "dataset_root": "from_additional_config", + "batch_size": 500 if __name__ == "__main__" else 200, + "num_workers": 32, + "learning_rate": 3e-5, + "weight_decay": 0.1, + "epochs": 7, + "save_learning_rate": 1e-5, + "total_save_number": 50, + "tag": os.path.basename(os.path.dirname(__file__)), +} +config.update(additional_config) + + + + +# Data +dataset = Dataset( + root=config["dataset_root"], + download=True, + train=True, + transform=transforms.Compose([ + transforms.Resize(224), + transforms.RandomCrop(224), + transforms.RandomHorizontalFlip(), + transforms.AutoAugment(policy=transforms.AutoAugmentPolicy("cifar10")), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ]) +) +train_loader = DataLoader( + dataset=dataset, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + pin_memory=True, + persistent_workers=True, +) +test_loader = DataLoader( + dataset=Dataset( + root=config["dataset_root"], + download=True, + train=False, + transform=transforms.Compose([ + transforms.Resize(224), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ])), + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=False, + pin_memory=True, + persistent_workers=True, + pin_memory_device="cuda", +) + +# Model +model, head = Model() +model = model.to(device) +criterion = nn.CrossEntropyLoss() +pre_optimizer = optim.AdamW( + head.parameters(), + lr=0.05, + weight_decay=0.01, +) +optimizer = optim.AdamW( + model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = lr_scheduler.CosineAnnealingLR( + optimizer, + T_max=config["epochs"], + eta_min=config["save_learning_rate"], +) + + + + +# Training +def train(model=model, optimizer=optimizer, scheduler=scheduler): + model.train() + for batch_idx, (inputs, targets) in tqdm(enumerate(train_loader), + total=len(dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + if scheduler is not None: + scheduler.step() + +# 
test +@torch.no_grad() +def test(model=model): + model.eval() + all_targets = [] + all_predicts = [] + test_loss = 0 + correct = 0 + total = 0 + for batch_idx, (inputs, targets) in tqdm(enumerate(test_loader), + total=len(test_loader.dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + # to logging losses + all_targets.extend(targets.flatten().tolist()) + test_loss += loss.item() + _, predicts = outputs.max(1) + all_predicts.extend(predicts.flatten().tolist()) + total += targets.size(0) + correct += predicts.eq(targets).sum().item() + loss = test_loss / (batch_idx + 1) + acc = correct / total + print(f"Loss: {loss:.4f} | Acc: {acc:.4f}\n") + model.train() + return loss, acc, all_targets, all_predicts + +# save train +def save_train(model=model, optimizer=optimizer): + model.train() + for batch_idx, (inputs, targets) in enumerate(train_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + # Save checkpoint + if batch_idx % (len(dataset) // train_loader.batch_size // config["total_save_number"]) == 0: + _, acc, _, _ = test(model=model) + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + save_state = {key: value.cpu().to(torch.float32) for key, value in model.state_dict().items()} + torch.save(save_state, f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + print("save:", f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + + + + +# main +if __name__ == '__main__': + test(model=model) + for _ in range(3): + train(model=model, optimizer=pre_optimizer) + test(model=model) + for epoch in range(config["epochs"]): + train(model=model, optimizer=optimizer, scheduler=scheduler) + test(model=model) + save_train(model=model, optimizer=optimizer) \ No newline at end of file diff --git a/dataset/condition_classinput_inference/dataset.py b/dataset/condition_classinput_inference/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..b74fb89eb76c655cb9293addc4f68802a3f76b74 --- /dev/null +++ b/dataset/condition_classinput_inference/dataset.py @@ -0,0 +1,41 @@ +import re +import sys +from torch.utils.data import Dataset +from torchvision.datasets import CIFAR10 +import torchvision.transforms as transforms + + +class BinaryClassifierDataset(Dataset): + def __init__(self, root, train, optimize_class: list): + self.optimize_class = optimize_class + self.dataset = CIFAR10( + root=root, + train=train, + download=True, + transform=transforms.Compose([ + transforms.Resize(224), + transforms.RandomHorizontalFlip(), + transforms.AutoAugment(policy=transforms.AutoAugmentPolicy("cifar10")), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ]) + ) + + def __getitem__(self, index): + img, origin_target = self.dataset[index] + target = 1 if origin_target in self.optimize_class else 0 + return img, target + + def __len__(self): + return self.dataset.__len__() + + +def get_optimize_class(): + try: # get string + string = sys.argv[1] + except IndexError: + RuntimeError("sys.argv[1] not found") + class_int_string = str(re.search(r'class(\d+)', string).group(1)).zfill(4) + one_hot_string = 
bin(int(class_int_string))[2:].zfill(10) + optimize_class = [index for index, i in enumerate(one_hot_string) if i == "1"] + return list(optimize_class), class_int_string \ No newline at end of file diff --git a/dataset/condition_classinput_inference/model.py b/dataset/condition_classinput_inference/model.py new file mode 100644 index 0000000000000000000000000000000000000000..5b239d01334940fc96c3c75cc01ef4c523035d15 --- /dev/null +++ b/dataset/condition_classinput_inference/model.py @@ -0,0 +1,25 @@ +import torch +import torch.nn as nn +import timm + + +def Model(): + model = timm.create_model("vit_tiny_patch16_224", pretrained=True) + model.head = nn.Sequential( + nn.Linear(192, 192, bias=True), + nn.SiLU(), + nn.Linear(192, 2, bias=False), + ) + for param in model.head.parameters(): + param = nn.Parameter(torch.ones_like(param) / 192) + param.requires_grad = True + return model, model.head + + +if __name__ == "__main__": + model, _ = Model() + print(model) + num_param = 0 + for v in model.parameters(): + num_param += v.numel() + print("num_param:", num_param) diff --git a/dataset/condition_classinput_inference/test.py b/dataset/condition_classinput_inference/test.py new file mode 100644 index 0000000000000000000000000000000000000000..3ae4dde68516651f46782854c5aaaf8e0856500c --- /dev/null +++ b/dataset/condition_classinput_inference/test.py @@ -0,0 +1,30 @@ +import os +import sys +if __name__ == "__main__": + from train import * +else: # relative import + from .train import * + + + + +try: + test_item = sys.argv[1] +except IndexError: + assert __name__ == "__main__" + test_item = "./checkpoint_test" +test_items = [] +if os.path.isdir(test_item): + for item in os.listdir(test_item): + item = os.path.join(test_item, item) + test_items.append(item) +elif os.path.isfile(test_item): + test_items.append(test_item) + + + + +for item in test_items: + state = torch.load(item, map_location="cpu") + model.load_state_dict({key: value.to(torch.float32).to(device) for key, value in state.items()}) + loss, acc, all_targets, all_predicts = test(model=model) \ No newline at end of file diff --git a/dataset/condition_classinput_inference/train.py b/dataset/condition_classinput_inference/train.py new file mode 100644 index 0000000000000000000000000000000000000000..ff7ed3ec9036df72eb26e060138dd822683a02c4 --- /dev/null +++ b/dataset/condition_classinput_inference/train.py @@ -0,0 +1,209 @@ +# set global seed +import random +import numpy as np +import torch +seed = SEED = 20 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + + +try: # relative import + from model import Model + from dataset import BinaryClassifierDataset as Dataset + from dataset import get_optimize_class +except ImportError: + from .model import Model + from .dataset import BinaryClassifierDataset as Dataset + from .dataset import get_optimize_class + +# import +import torch.nn as nn +from torch import optim +from torch.optim import lr_scheduler +from torch.utils.data import DataLoader +from torch.nn import functional as F +import os +import sys +import warnings +warnings.filterwarnings("ignore", category=UserWarning) + +# load additional config +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) + + + + +# config +device = torch.device("cuda" if 
torch.cuda.is_available() else "cpu") +config = { + "dataset_root": "from_additional_config", + "batch_size": 500 if __name__ == "__main__" else 50, + "num_workers": 16, + "pre_learning_rate": 0.01, + "learning_rate": 1e-4, + "pre_epochs": 2, + "epochs": 13, + "weight_decay": 0.1, + "save_learning_rate": 2e-5, + "total_save_number": 5, + "tag": os.path.basename(os.path.dirname(__file__)), + "optimize_class": get_optimize_class()[0], + "optimize_class_int": get_optimize_class()[1], +} +config.update(additional_config) +print("Training/Testing:", config["optimize_class"]) + + + + +# Data +dataset = Dataset( + root=config["dataset_root"], + train=True, + optimize_class=config["optimize_class"], +) +train_loader = DataLoader( + dataset=dataset, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + pin_memory=True, + persistent_workers=True, +) +test_loader = DataLoader( + dataset=Dataset( + root=config["dataset_root"], + train=False, + optimize_class=config["optimize_class"], + ), + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=False, +) + +# Model +model, head = Model() +model = model.to(device) +class FocalLoss(nn.Module): + def __init__(self, weight=None, gamma=2): + super(FocalLoss, self).__init__() + self.weight = weight + self.gamma = gamma + def forward(self, input, target): + ce_loss = F.cross_entropy(input, target, reduction='none', weight=self.weight) + pt = torch.exp(-ce_loss) + focal_loss = (1 - pt) ** self.gamma * ce_loss + return focal_loss.mean() +criterion = FocalLoss() + +# Optimizer +head_optimizer = optim.AdamW( + head.parameters(), + lr=config["pre_learning_rate"], + weight_decay=config["weight_decay"], +) +optimizer = optim.AdamW( + model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = lr_scheduler.CosineAnnealingLR( + optimizer, + T_max=config["epochs"], + eta_min=config["save_learning_rate"], +) + + + + +# Training +def train(model=model, optimizer=optimizer, scheduler=scheduler): + model.train() + for batch_idx, (inputs, targets) in enumerate(train_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + if scheduler is not None: + scheduler.step() + +# test +@torch.no_grad() +def test(model=model): + model.eval() + all_targets = [] + all_predicts = [] + test_loss = 0 + correct = 0 + total = 0 + for batch_idx, (inputs, targets) in enumerate(test_loader): + inputs, targets = inputs.to(device), targets.to(device) + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + # to logging losses + all_targets.extend(targets.flatten().tolist()) + test_loss += loss.item() + _, predicts = outputs.max(1) + all_predicts.extend(predicts.flatten().tolist()) + total += targets.size(0) + correct += predicts.eq(targets).sum().item() + loss = test_loss / (batch_idx + 1) + acc = correct / total + print(f"Loss: {loss:.4f} | Acc: {acc:.4f}\n") + model.train() + return loss, acc, all_targets, all_predicts + +# save train +def save_train(model=model, optimizer=optimizer): + data_loader = DataLoader( + dataset=dataset, + batch_size=min(len(dataset) // config["total_save_number"], config["batch_size"]), + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + ) + 
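    # save_train is intended to take a handful of extra steps at the low, end-of-schedule
    # learning rate ("save_learning_rate") and to write one checkpoint per batch, yielding
    # `total_save_number` nearby-but-distinct parameter snapshots. The batch size above is
    # capped at len(dataset) // total_save_number so the loader supplies at least that many
    # batches before the loop below breaks out.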
model.train() + for batch_idx, (inputs, targets) in enumerate(data_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + # Save checkpoint + _, acc, _, _ = test(model=model) + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + save_state = {key: value.cpu().to(torch.float32) for key, value in model.state_dict().items()} + torch.save(save_state, f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_class{config['optimize_class_int']}_{config['tag']}.pth") + print("save:", f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_class{config['optimize_class_int']}_{config['tag']}.pth") + # exit loop + if batch_idx+1 == config["total_save_number"]: + break + + + + +# main +if __name__ == '__main__': + for epoch in range(config["pre_epochs"]): + train(model=model, optimizer=head_optimizer, scheduler=None) + # test(model=model) + for epoch in range(config["epochs"]): + train(model=model, optimizer=optimizer, scheduler=scheduler) + # test(model=model) + save_train(model=model, optimizer=optimizer) diff --git a/dataset/condition_classinput_vittiny/dataset.py b/dataset/condition_classinput_vittiny/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..b74fb89eb76c655cb9293addc4f68802a3f76b74 --- /dev/null +++ b/dataset/condition_classinput_vittiny/dataset.py @@ -0,0 +1,41 @@ +import re +import sys +from torch.utils.data import Dataset +from torchvision.datasets import CIFAR10 +import torchvision.transforms as transforms + + +class BinaryClassifierDataset(Dataset): + def __init__(self, root, train, optimize_class: list): + self.optimize_class = optimize_class + self.dataset = CIFAR10( + root=root, + train=train, + download=True, + transform=transforms.Compose([ + transforms.Resize(224), + transforms.RandomHorizontalFlip(), + transforms.AutoAugment(policy=transforms.AutoAugmentPolicy("cifar10")), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ]) + ) + + def __getitem__(self, index): + img, origin_target = self.dataset[index] + target = 1 if origin_target in self.optimize_class else 0 + return img, target + + def __len__(self): + return self.dataset.__len__() + + +def get_optimize_class(): + try: # get string + string = sys.argv[1] + except IndexError: + RuntimeError("sys.argv[1] not found") + class_int_string = str(re.search(r'class(\d+)', string).group(1)).zfill(4) + one_hot_string = bin(int(class_int_string))[2:].zfill(10) + optimize_class = [index for index, i in enumerate(one_hot_string) if i == "1"] + return list(optimize_class), class_int_string \ No newline at end of file diff --git a/dataset/condition_classinput_vittiny/detail.py b/dataset/condition_classinput_vittiny/detail.py new file mode 100644 index 0000000000000000000000000000000000000000..aa5a84de541e435ea83b7ce1ad7ceafd371d40fa --- /dev/null +++ b/dataset/condition_classinput_vittiny/detail.py @@ -0,0 +1,58 @@ +import os +import sys +if __name__ == "__main__": + from train import * +else: # relative import + from .train import * +from torchvision.datasets import CIFAR10 +from torchvision import transforms + + + + +try: + test_item = sys.argv[1] +except IndexError: + assert __name__ == "__main__" + test_item = "./generated" +test_items = [] +if os.path.isdir(test_item): + for item in os.listdir(test_item): + item = os.path.join(test_item, item) + 
test_items.append(item) +elif os.path.isfile(test_item): + test_items.append(test_item) + + + + +original_dataset = CIFAR10( + root=config["dataset_root"], + train=False, + download=True, + transform=transforms.Compose([ + transforms.Resize(224), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ]) +) +original_targets = [original_dataset[i][1] for i in range(len(original_dataset))] +original_targets = torch.tensor(original_targets, dtype=torch.long) + + + + +for item in test_items: + state = torch.load(item, map_location="cpu") + model.load_state_dict({key: value.to(torch.float32).to(device) for key, value in state.items()}) + loss, acc, all_targets, all_predicts = test(model=model) + all_targets, all_predicts = torch.tensor(all_targets), torch.tensor(all_predicts) + + for class_idx in range(10): + class_mask = torch.where(original_targets == class_idx, 1, 0) + total_number = torch.sum(class_mask) + correct = torch.where(all_targets == all_predicts, 1, 0) + class_correct = class_mask * correct + correct_number = torch.sum(class_correct) + class_acc = correct_number.item() / total_number.item() + print(f"class{class_idx}:", class_acc) \ No newline at end of file diff --git a/dataset/condition_classinput_vittiny/finetune.py b/dataset/condition_classinput_vittiny/finetune.py new file mode 100644 index 0000000000000000000000000000000000000000..639dfaa1c114cdcb43560f25c784305f9f913d07 --- /dev/null +++ b/dataset/condition_classinput_vittiny/finetune.py @@ -0,0 +1,215 @@ +# set global seed +import time +print("time stamp:", time.time()) +import random +import numpy as np +import torch +seed = SEED = 20 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + + +try: # relative import + from model import Model + from dataset import BinaryClassifierDataset as Dataset + from dataset import get_optimize_class +except ImportError: + from .model import Model + from .dataset import BinaryClassifierDataset as Dataset + from .dataset import get_optimize_class + +# import +import torch.nn as nn +from torch import optim +from torch.optim import lr_scheduler +from torch.utils.data import DataLoader +from torch.nn import functional as F +import os +import sys +import warnings +warnings.filterwarnings("ignore", category=UserWarning) + +# load additional config +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) + + + + +# config +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +config = { + "dataset_root": "from_additional_config", + "batch_size": 500 if __name__ == "__main__" else 50, + "num_workers": 16, + "pre_learning_rate": 0.01, + "learning_rate": 2e-5, + "pre_epochs": 0, + "epochs": 50, + "weight_decay": 0.1, + "save_learning_rate": 1e-6, + "total_save_number": 5, + "tag": os.path.basename(os.path.dirname(__file__)), + "optimize_class": get_optimize_class()[0], + "optimize_class_int": get_optimize_class()[1], +} +config.update(additional_config) +print("Training:", config["optimize_class"]) + + + + +# Data +dataset = Dataset( + root=config["dataset_root"], + train=True, + optimize_class=config["optimize_class"], +) +train_loader = DataLoader( + dataset=dataset, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + 
pin_memory=True, + persistent_workers=True, +) +test_loader = DataLoader( + dataset=Dataset( + root=config["dataset_root"], + train=False, + optimize_class=config["optimize_class"], + ), + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=False, +) + +# Model +model, head = Model() +model.load_state_dict(torch.load(sys.argv[1], map_location="cpu", weights_only=True)) +model = model.to(device) +class FocalLoss(nn.Module): + def __init__(self, weight=None, gamma=2): + super(FocalLoss, self).__init__() + self.weight = weight + self.gamma = gamma + def forward(self, input, target): + ce_loss = F.cross_entropy(input, target, reduction='none', weight=self.weight) + pt = torch.exp(-ce_loss) + focal_loss = (1 - pt) ** self.gamma * ce_loss + return focal_loss.mean() +criterion = FocalLoss() + +# Optimizer +head_optimizer = optim.AdamW( + head.parameters(), + lr=config["pre_learning_rate"], + weight_decay=config["weight_decay"], +) +optimizer = optim.AdamW( + model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = lr_scheduler.CosineAnnealingLR( + optimizer, + T_max=config["epochs"], + eta_min=config["save_learning_rate"], +) + + + + +# Training +def train(model=model, optimizer=optimizer, scheduler=scheduler): + model.train() + for batch_idx, (inputs, targets) in enumerate(train_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + if scheduler is not None: + scheduler.step() + +# test +@torch.no_grad() +def test(model=model): + model.eval() + all_targets = [] + all_predicts = [] + test_loss = 0 + correct = 0 + total = 0 + for batch_idx, (inputs, targets) in enumerate(test_loader): + inputs, targets = inputs.to(device), targets.to(device) + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + # to logging losses + all_targets.extend(targets.flatten().tolist()) + test_loss += loss.item() + _, predicts = outputs.max(1) + all_predicts.extend(predicts.flatten().tolist()) + total += targets.size(0) + correct += predicts.eq(targets).sum().item() + loss = test_loss / (batch_idx + 1) + acc = correct / total + print(f"Loss: {loss:.4f} | Acc: {acc:.4f}\n") + model.train() + return loss, acc, all_targets, all_predicts + +# save train +def save_train(model=model, optimizer=optimizer): + data_loader = DataLoader( + dataset=dataset, + batch_size=min(len(dataset) // config["total_save_number"], config["batch_size"]), + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + ) + model.train() + for batch_idx, (inputs, targets) in enumerate(data_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + # Save checkpoint + # _, acc, _, _ = test(model=model) + acc = 1.0 + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + save_state = {key: value.cpu().to(torch.float32) for key, value in model.state_dict().items()} + torch.save(save_state, f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_class{config['optimize_class_int']}_{config['tag']}.pth") + print("save:", 
f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_class{config['optimize_class_int']}_{config['tag']}.pth") + # exit loop + if batch_idx+1 == config["total_save_number"]: + break + + + + +# main +if __name__ == '__main__': + test(model=model) + for epoch in range(config["pre_epochs"]): + train(model=model, optimizer=head_optimizer, scheduler=None) + test(model=model) + for epoch in range(config["epochs"]): + train(model=model, optimizer=optimizer, scheduler=scheduler) + test(model=model) + # save_train(model=model, optimizer=optimizer) +print("time stamp:", time.time()) diff --git a/dataset/condition_classinput_vittiny/model.py b/dataset/condition_classinput_vittiny/model.py new file mode 100644 index 0000000000000000000000000000000000000000..5b239d01334940fc96c3c75cc01ef4c523035d15 --- /dev/null +++ b/dataset/condition_classinput_vittiny/model.py @@ -0,0 +1,25 @@ +import torch +import torch.nn as nn +import timm + + +def Model(): + model = timm.create_model("vit_tiny_patch16_224", pretrained=True) + model.head = nn.Sequential( + nn.Linear(192, 192, bias=True), + nn.SiLU(), + nn.Linear(192, 2, bias=False), + ) + for param in model.head.parameters(): + param = nn.Parameter(torch.ones_like(param) / 192) + param.requires_grad = True + return model, model.head + + +if __name__ == "__main__": + model, _ = Model() + print(model) + num_param = 0 + for v in model.parameters(): + num_param += v.numel() + print("num_param:", num_param) diff --git a/dataset/condition_classinput_vittiny/split.sh b/dataset/condition_classinput_vittiny/split.sh new file mode 100644 index 0000000000000000000000000000000000000000..b85c7cfbfe73b0d8f1a496da3566d113ff8c1305 --- /dev/null +++ b/dataset/condition_classinput_vittiny/split.sh @@ -0,0 +1,28 @@ +mkdir checkpoint_test +mkdir checkpoint_train +mkdir generated + +mv ./checkpoint/*class0314* ./checkpoint_test +mv ./checkpoint/*class0482* ./checkpoint_test +mv ./checkpoint/*class0589* ./checkpoint_test +mv ./checkpoint/*class0197* ./checkpoint_test +mv ./checkpoint/*class0462* ./checkpoint_test +mv ./checkpoint/*class0111* ./checkpoint_test +mv ./checkpoint/*class0101* ./checkpoint_test +mv ./checkpoint/*class0278* ./checkpoint_test +mv ./checkpoint/*class0793* ./checkpoint_test +mv ./checkpoint/*class0279* ./checkpoint_test +mv ./checkpoint/*class0653* ./checkpoint_test +mv ./checkpoint/*class0238* ./checkpoint_test +mv ./checkpoint/*class1001* ./checkpoint_test +mv ./checkpoint/*class0141* ./checkpoint_test +mv ./checkpoint/*class0884* ./checkpoint_test +mv ./checkpoint/*class0592* ./checkpoint_test +mv ./checkpoint/*class0502* ./checkpoint_test +mv ./checkpoint/*class0643* ./checkpoint_test +mv ./checkpoint/*class0383* ./checkpoint_test +mv ./checkpoint/*class0128* ./checkpoint_test + +mv ./checkpoint/* ./checkpoint_train + +rm checkpoint -r \ No newline at end of file diff --git a/dataset/condition_classinput_vittiny/test.py b/dataset/condition_classinput_vittiny/test.py new file mode 100644 index 0000000000000000000000000000000000000000..3ae4dde68516651f46782854c5aaaf8e0856500c --- /dev/null +++ b/dataset/condition_classinput_vittiny/test.py @@ -0,0 +1,30 @@ +import os +import sys +if __name__ == "__main__": + from train import * +else: # relative import + from .train import * + + + + +try: + test_item = sys.argv[1] +except IndexError: + assert __name__ == "__main__" + test_item = "./checkpoint_test" +test_items = [] +if os.path.isdir(test_item): + for item in os.listdir(test_item): + item = os.path.join(test_item, item) + test_items.append(item) +elif 
os.path.isfile(test_item): + test_items.append(test_item) + + + + +for item in test_items: + state = torch.load(item, map_location="cpu") + model.load_state_dict({key: value.to(torch.float32).to(device) for key, value in state.items()}) + loss, acc, all_targets, all_predicts = test(model=model) \ No newline at end of file diff --git a/dataset/condition_classinput_vittiny/train.py b/dataset/condition_classinput_vittiny/train.py new file mode 100644 index 0000000000000000000000000000000000000000..4995025d1487b45675f1481743fbfae45883e06e --- /dev/null +++ b/dataset/condition_classinput_vittiny/train.py @@ -0,0 +1,212 @@ +# set global seed +import time +print("time stamp:", time.time()) +import random +import numpy as np +import torch +seed = SEED = 20 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + + +try: # relative import + from model import Model + from dataset import BinaryClassifierDataset as Dataset + from dataset import get_optimize_class +except ImportError: + from .model import Model + from .dataset import BinaryClassifierDataset as Dataset + from .dataset import get_optimize_class + +# import +import torch.nn as nn +from torch import optim +from torch.optim import lr_scheduler +from torch.utils.data import DataLoader +from torch.nn import functional as F +import os +import sys +import warnings +warnings.filterwarnings("ignore", category=UserWarning) + +# load additional config +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) + + + + +# config +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +config = { + "dataset_root": "from_additional_config", + "batch_size": 500 if __name__ == "__main__" else 50, + "num_workers": 16, + "pre_learning_rate": 0.01, + "learning_rate": 1e-4, + "pre_epochs": 2, + "epochs": 13, + "weight_decay": 0.1, + "save_learning_rate": 2e-5, + "total_save_number": 5, + "tag": os.path.basename(os.path.dirname(__file__)), + "optimize_class": get_optimize_class()[0], + "optimize_class_int": get_optimize_class()[1], +} +config.update(additional_config) +print("Training:", config["optimize_class"]) + + + + +# Data +dataset = Dataset( + root=config["dataset_root"], + train=True, + optimize_class=config["optimize_class"], +) +train_loader = DataLoader( + dataset=dataset, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + pin_memory=True, + persistent_workers=True, +) +test_loader = DataLoader( + dataset=Dataset( + root=config["dataset_root"], + train=False, + optimize_class=config["optimize_class"], + ), + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=False, +) + +# Model +model, head = Model() +model = model.to(device) +class FocalLoss(nn.Module): + def __init__(self, weight=None, gamma=2): + super(FocalLoss, self).__init__() + self.weight = weight + self.gamma = gamma + def forward(self, input, target): + ce_loss = F.cross_entropy(input, target, reduction='none', weight=self.weight) + pt = torch.exp(-ce_loss) + focal_loss = (1 - pt) ** self.gamma * ce_loss + return focal_loss.mean() +criterion = FocalLoss() + +# Optimizer +head_optimizer = optim.AdamW( + head.parameters(), + lr=config["pre_learning_rate"], + weight_decay=config["weight_decay"], +) +optimizer = optim.AdamW( + 
model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = lr_scheduler.CosineAnnealingLR( + optimizer, + T_max=config["epochs"], + eta_min=config["save_learning_rate"], +) + + + + +# Training +def train(model=model, optimizer=optimizer, scheduler=scheduler): + model.train() + for batch_idx, (inputs, targets) in enumerate(train_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + if scheduler is not None: + scheduler.step() + +# test +@torch.no_grad() +def test(model=model): + model.eval() + all_targets = [] + all_predicts = [] + test_loss = 0 + correct = 0 + total = 0 + for batch_idx, (inputs, targets) in enumerate(test_loader): + inputs, targets = inputs.to(device), targets.to(device) + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + # to logging losses + all_targets.extend(targets.flatten().tolist()) + test_loss += loss.item() + _, predicts = outputs.max(1) + all_predicts.extend(predicts.flatten().tolist()) + total += targets.size(0) + correct += predicts.eq(targets).sum().item() + loss = test_loss / (batch_idx + 1) + acc = correct / total + print(f"Loss: {loss:.4f} | Acc: {acc:.4f}\n") + model.train() + return loss, acc, all_targets, all_predicts + +# save train +def save_train(model=model, optimizer=optimizer): + data_loader = DataLoader( + dataset=dataset, + batch_size=min(len(dataset) // config["total_save_number"], config["batch_size"]), + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + ) + model.train() + for batch_idx, (inputs, targets) in enumerate(data_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + # Save checkpoint + _, acc, _, _ = test(model=model) + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + save_state = {key: value.cpu().to(torch.float32) for key, value in model.state_dict().items()} + torch.save(save_state, f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_class{config['optimize_class_int']}_{config['tag']}.pth") + print("save:", f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_class{config['optimize_class_int']}_{config['tag']}.pth") + # exit loop + if batch_idx+1 == config["total_save_number"]: + break + + + + +# main +if __name__ == '__main__': + for epoch in range(config["pre_epochs"]): + train(model=model, optimizer=head_optimizer, scheduler=None) + # test(model=model) + for epoch in range(config["epochs"]): + train(model=model, optimizer=optimizer, scheduler=scheduler) + # test(model=model) + save_train(model=model, optimizer=optimizer) +print("time stamp:", time.time()) diff --git a/dataset/condition_classinput_vittiny/train.sh b/dataset/condition_classinput_vittiny/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..4428136cca889370d4b5356ad5e13a9a8644640c --- /dev/null +++ b/dataset/condition_classinput_vittiny/train.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +start=1 +end=1022 + +for i in $(seq $start $end) +do + python train.py class$i + sleep 1 +done \ No newline at end of file diff --git a/dataset/condition_imageinput_vittiny/README.md 
b/dataset/condition_imageinput_vittiny/README.md new file mode 100644 index 0000000000000000000000000000000000000000..64ca56505e78e674465a7a6c2af2417405690fc7 --- /dev/null +++ b/dataset/condition_imageinput_vittiny/README.md @@ -0,0 +1 @@ +Code for condition_imageinput_vittiny is coming... \ No newline at end of file diff --git a/dataset/condition_imageinput_vittiny/dataset.py b/dataset/condition_imageinput_vittiny/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..bf24596b8b43fc3348b7373929a1c75b6fa800aa --- /dev/null +++ b/dataset/condition_imageinput_vittiny/dataset.py @@ -0,0 +1,46 @@ +import re +import sys +from torch.utils.data import Dataset +from torchvision.datasets import CIFAR10 +import torchvision.transforms as transforms + + + + +class BinaryClassifierDataset(Dataset): + def __init__(self, root, train, optimize_class): + optimize_class = [optimize_class,] if isinstance(optimize_class, int) else optimize_class + self.optimize_class = optimize_class + self.dataset = CIFAR10( + root=root, + train=train, + download=True, + transform=transforms.Compose([ + transforms.Resize(224), + transforms.RandomHorizontalFlip(), + transforms.AutoAugment(policy=transforms.AutoAugmentPolicy("cifar10")), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ]) + ) + + def __getitem__(self, index): + img, origin_target = self.dataset[index] + target = 1 if origin_target in self.optimize_class else 0 + return img, target + + def __len__(self): + return self.dataset.__len__() + + + + +def get_optimize_class(): + try: # get string + string = sys.argv[1] + except IndexError: + RuntimeError("sys.argv[1] not found") + class_int_string = str(re.search(r'class(\d+)', string).group(1)).zfill(4) + one_hot_string = bin(int(class_int_string))[2:].zfill(10) + optimize_class = [index for index, i in enumerate(one_hot_string) if i == "1"] + return list(optimize_class), class_int_string \ No newline at end of file diff --git a/dataset/condition_imageinput_vittiny/model.py b/dataset/condition_imageinput_vittiny/model.py new file mode 100644 index 0000000000000000000000000000000000000000..30e2b86a6e1854c38a70b4c0e1eb39e17fda83be --- /dev/null +++ b/dataset/condition_imageinput_vittiny/model.py @@ -0,0 +1,18 @@ +import torch +import torch.nn as nn +import timm + + +def Model(): + model = timm.create_model("vit_tiny_patch16_224", pretrained=True) + model.head = nn.Linear(192, 2) + return model, model.head + + +if __name__ == "__main__": + model, _ = Model() + print(model) + num_param = 0 + for v in model.parameters(): + num_param += v.numel() + print("num_param:", num_param) \ No newline at end of file diff --git a/dataset/condition_imageinput_vittiny/test.py b/dataset/condition_imageinput_vittiny/test.py new file mode 100644 index 0000000000000000000000000000000000000000..042ef89eb0ba4d428b89a088a6121a8f59771d79 --- /dev/null +++ b/dataset/condition_imageinput_vittiny/test.py @@ -0,0 +1,30 @@ +import os +import sys +if __name__ == "__main__": + from train import * +else: # relative import + from .train import * + + + + +try: + test_item = sys.argv[1] +except IndexError: + assert __name__ == "__main__" + test_item = "./checkpoint" +test_items = [] +if os.path.isdir(test_item): + for item in os.listdir(test_item): + item = os.path.join(test_item, item) + test_items.append(item) +elif os.path.isfile(test_item): + test_items.append(test_item) + + + + +for item in test_items: + state = torch.load(item, map_location="cpu") + model.load_state_dict({key: 
value.to(torch.float32).to(device) for key, value in state.items()}) + loss, acc, all_targets, all_predicts = test(model=model) \ No newline at end of file diff --git a/dataset/condition_imageinput_vittiny/train.py b/dataset/condition_imageinput_vittiny/train.py new file mode 100644 index 0000000000000000000000000000000000000000..11526d904f1d0ab67b2a70ea766a1f059e5a45d3 --- /dev/null +++ b/dataset/condition_imageinput_vittiny/train.py @@ -0,0 +1,208 @@ +# set global seed +import random +import numpy as np +import torch +seed = SEED = 20 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +try: # relative import + from model import Model + from dataset import BinaryClassifierDataset as Dataset + from dataset import get_optimize_class +except ImportError: + from .model import Model + from .dataset import BinaryClassifierDataset as Dataset + from .dataset import get_optimize_class + +# import +import torch.nn as nn +from torch import optim +from torch.optim import lr_scheduler +from torch.utils.data import DataLoader +from torch.nn import functional as F +import os +import sys +import warnings +warnings.filterwarnings("ignore", category=UserWarning) + +# load additional config +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) + + + + +# config +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +config = { + "dataset_root": "from_additional_config", + "batch_size": 250 if __name__ == "__main__" else 50, + "num_workers": 20, + "pre_learning_rate": 0.01, + "learning_rate": 3e-5, + "pre_epochs": 2, + "epochs": 13, + "weight_decay": 0.1, + "save_learning_rate": 1e-5, + "total_save_number": 10, + "tag": os.path.basename(os.path.dirname(__file__)), + "optimize_class": get_optimize_class()[0], + "optimize_class_int": get_optimize_class()[1], +} +config.update(additional_config) +print("Training:", config["optimize_class"]) + + + + +# Data +dataset = Dataset( + root=config["dataset_root"], + train=True, + optimize_class=config["optimize_class"], +) +train_loader = DataLoader( + dataset=dataset, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + pin_memory=True, + persistent_workers=True, +) +test_loader = DataLoader( + dataset=Dataset( + root=config["dataset_root"], + train=False, + optimize_class=config["optimize_class"], + ), + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=False, +) + +# Model +model, head = Model() +model = model.to(device) +class FocalLoss(nn.Module): + def __init__(self, weight=None, gamma=2): + super(FocalLoss, self).__init__() + self.weight = weight + self.gamma = gamma + def forward(self, input, target): + ce_loss = F.cross_entropy(input, target, reduction='none', weight=self.weight) + pt = torch.exp(-ce_loss) + focal_loss = (1 - pt) ** self.gamma * ce_loss + return focal_loss.mean() +criterion = FocalLoss() + +# Optimizer +head_optimizer = optim.AdamW( + head.parameters(), + lr=config["pre_learning_rate"], + weight_decay=config["weight_decay"], +) +optimizer = optim.AdamW( + model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = lr_scheduler.CosineAnnealingLR( + optimizer, + T_max=config["epochs"], + eta_min=config["save_learning_rate"], +) 
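# Training proceeds in two phases (see the main block at the bottom of this file): a short
# warm-up that updates only the new 2-way head via head_optimizer at pre_learning_rate,
# followed by full fine-tuning of ViT-Tiny with a cosine schedule that decays from
# learning_rate down to save_learning_rate, after which save_train() collects checkpoints.
# train.sh invokes this script as `python train.py class<N>`, where the binary expansion of N
# selects the CIFAR-10 classes treated as the positive class (decoded by get_optimize_class
# in dataset.py).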
+ + + + +# Training +def train(model=model, optimizer=optimizer, scheduler=scheduler): + model.train() + for batch_idx, (inputs, targets) in enumerate(train_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + if scheduler is not None: + scheduler.step() + +# test +@torch.no_grad() +def test(model=model): + model.eval() + all_targets = [] + all_predicts = [] + test_loss = 0 + correct = 0 + total = 0 + for batch_idx, (inputs, targets) in enumerate(test_loader): + inputs, targets = inputs.to(device), targets.to(device) + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + # to logging losses + all_targets.extend(targets.flatten().tolist()) + test_loss += loss.item() + _, predicts = outputs.max(1) + all_predicts.extend(predicts.flatten().tolist()) + total += targets.size(0) + correct += predicts.eq(targets).sum().item() + loss = test_loss / (batch_idx + 1) + acc = correct / total + print(f"Loss: {loss:.4f} | Acc: {acc:.4f}\n") + model.train() + return loss, acc, all_targets, all_predicts + +# save train +def save_train(model=model, optimizer=optimizer): + data_loader = DataLoader( + dataset=dataset, + batch_size=min(len(dataset) // config["total_save_number"], config["batch_size"]), + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + ) + model.train() + for batch_idx, (inputs, targets) in enumerate(data_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + # Save checkpoint + _, acc, _, _ = test(model=model) + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + save_state = {key: value.cpu().to(torch.float32) for key, value in model.state_dict().items()} + torch.save(save_state, f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_class{config['optimize_class_int']}_{config['tag']}.pth") + print("save:", f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_class{config['optimize_class_int']}_{config['tag']}.pth") + # exit loop + if batch_idx+1 == config["total_save_number"]: + break + + + + +# main +if __name__ == '__main__': + for epoch in range(config["pre_epochs"]): + train(model=model, optimizer=head_optimizer, scheduler=None) + test(model=model) + for epoch in range(config["epochs"]): + train(model=model, optimizer=optimizer, scheduler=scheduler) + test(model=model) + save_train(model=model, optimizer=optimizer) \ No newline at end of file diff --git a/dataset/condition_imageinput_vittiny/train.sh b/dataset/condition_imageinput_vittiny/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..bcbf50d673c88b84fc04550e974ea7e8a3d0503a --- /dev/null +++ b/dataset/condition_imageinput_vittiny/train.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +start=0 +end=9 + +for i in $(seq $start $end) +do + power=$((2**i)) + CUDA_VISIBLE_DEVICES=5 python train.py class$power + sleep 1 +done \ No newline at end of file diff --git a/dataset/condition_permutation_vittiny/model.py b/dataset/condition_permutation_vittiny/model.py new file mode 100644 index 0000000000000000000000000000000000000000..4152b3bdf144dc7502209025a0d541d1e83702ee --- /dev/null +++ 
b/dataset/condition_permutation_vittiny/model.py @@ -0,0 +1,18 @@ +import torch +import torch.nn as nn +import timm + + +def Model(): + model = timm.create_model("vit_tiny_patch16_224", pretrained=False) + model.head = nn.Linear(192, 10) + return model, model.head + + +if __name__ == "__main__": + model, _ = Model() + print(model) + num_param = 0 + for v in model.parameters(): + num_param += v.numel() + print("num_param:", num_param) \ No newline at end of file diff --git a/dataset/condition_permutation_vittiny/test.py b/dataset/condition_permutation_vittiny/test.py new file mode 100644 index 0000000000000000000000000000000000000000..51d908747843f1b87c396f2ed7e20998f63b4c01 --- /dev/null +++ b/dataset/condition_permutation_vittiny/test.py @@ -0,0 +1,31 @@ +import os +import sys +if __name__ == "__main__": + from train import * +else: # relative import + from .train import * + + + + +try: + test_item = sys.argv[1] +except IndexError: + assert __name__ == "__main__" + test_item = "./checkpoint" +test_items = [] +if os.path.isdir(test_item): + for item in os.listdir(test_item): + item = os.path.join(test_item, item) + test_items.append(item) +elif os.path.isfile(test_item): + test_items.append(test_item) + + + + +for item in test_items: + print(f"testing: {item}") + state = torch.load(item, map_location="cpu") + model.load_state_dict({key: value.to(torch.float32).to(device) for key, value in state.items()}) + loss, acc, all_targets, all_predicts = test(model=model) \ No newline at end of file diff --git a/dataset/condition_permutation_vittiny/train.py b/dataset/condition_permutation_vittiny/train.py new file mode 100644 index 0000000000000000000000000000000000000000..f0d4a6e7daf264d865a4c3a526e24214b1368583 --- /dev/null +++ b/dataset/condition_permutation_vittiny/train.py @@ -0,0 +1,210 @@ +# set global seed +import time +print("time stamp:", time.time()) +import random +import numpy as np +import torch +import re +import sys +if __name__ == "__main__": + def get_permutation_state(): + try: # get string + string = sys.argv[1] + except IndexError: + RuntimeError("sys.argv[1] not found") + class_int_string = str(re.search(r'class(\d+)', string).group(1)).zfill(4) + return int(class_int_string) + seed = SEED = get_permutation_state() +else: # when testing + seed = SEED = 0 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) +print("Seed:", SEED) + +try: # relative import + from model import Model +except ImportError: + from .model import Model + +# import +import torch.nn as nn +from torch import optim +from torch.optim import lr_scheduler +from torch.utils.data import DataLoader +from torchvision.datasets import CIFAR10 as Dataset +from torchvision import transforms +from torch.nn import functional as F +import warnings +warnings.filterwarnings("ignore", category=UserWarning) + +# load additional config +import os +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) + + + + +# config +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +config = { + "dataset_root": "from_additional_config", + "batch_size": 250 if __name__ == "__main__" else 50, + "num_workers": 16, + "learning_rate": 5e-3, + "epochs": 200, + "weight_decay": 0.1, + "save_learning_rate": 2e-5, + "total_save_number": 5, + "tag": 
os.path.basename(os.path.dirname(__file__)), +} +config.update(additional_config) + + + + +# Data +dataset = Dataset( + root=config["dataset_root"], + train=True, + download=True, + transform=transforms.Compose([ + transforms.Resize(224), + transforms.RandomCrop(224, padding=32), + transforms.RandomHorizontalFlip(), + transforms.AutoAugment(policy=transforms.AutoAugmentPolicy("cifar10")), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2471, 0.2435, 0.2616)), + ]) +) +train_loader = DataLoader( + dataset=dataset, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + pin_memory=True, + persistent_workers=True, +) +test_loader = DataLoader( + dataset=Dataset( + root=config["dataset_root"], + train=False, + download=True, + transform=transforms.Compose([ + transforms.Resize(224), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2471, 0.2435, 0.2616)), + ])), + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=False, +) + +# Model +model, head = Model() +model = model.to(device) +criterion = nn.CrossEntropyLoss() + +# Optimizer +optimizer = optim.AdamW( + model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = lr_scheduler.CosineAnnealingLR( + optimizer, + T_max=config["epochs"], + eta_min=config["save_learning_rate"], +) + + + + +# Training +def train(model=model, optimizer=optimizer, scheduler=scheduler): + model.train() + for batch_idx, (inputs, targets) in enumerate(train_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + if scheduler is not None: + scheduler.step() + +# test +@torch.no_grad() +def test(model=model): + model.eval() + all_targets = [] + all_predicts = [] + test_loss = 0 + correct = 0 + total = 0 + for batch_idx, (inputs, targets) in enumerate(test_loader): + inputs, targets = inputs.to(device), targets.to(device) + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + # to logging losses + all_targets.extend(targets.flatten().tolist()) + test_loss += loss.item() + _, predicts = outputs.max(1) + all_predicts.extend(predicts.flatten().tolist()) + total += targets.size(0) + correct += predicts.eq(targets).sum().item() + loss = test_loss / (batch_idx + 1) + acc = correct / total + print(f"Loss: {loss:.4f} | Acc: {acc:.4f}\n") + model.train() + return loss, acc, all_targets, all_predicts + +# save train +def save_train(model=model, optimizer=optimizer): + data_loader = DataLoader( + dataset=dataset, + batch_size=min(len(dataset) // config["total_save_number"], config["batch_size"]), + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + ) + model.train() + for batch_idx, (inputs, targets) in enumerate(data_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + # Save checkpoint + _, acc, _, _ = test(model=model) + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + save_state = {key: value.cpu().to(torch.float32) for key, value in model.state_dict().items()} + 
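        # The state dict is moved to CPU and cast back to float32 so every saved checkpoint
        # shares a uniform dtype; the filename below encodes the batch index, the test
        # accuracy, and the permutation seed (reused as the "class" field of the name).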
torch.save(save_state, f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_class{SEED:04d}_{config['tag']}.pth") + print("save:", f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_class{SEED:04d}_{config['tag']}.pth") + # exit loop + if batch_idx+1 == config["total_save_number"]: + break + + + + +# main +if __name__ == '__main__': + for epoch in range(config["epochs"]): + train(model=model, optimizer=optimizer, scheduler=scheduler) + test(model=model) + save_train(model=model, optimizer=optimizer) +print("time stamp:", time.time()) \ No newline at end of file diff --git a/dataset/condition_permutation_vittiny/train.sh b/dataset/condition_permutation_vittiny/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..a4d2ec3d9e31fa426efcb7f3e2ddbb1dc93d7b9d --- /dev/null +++ b/dataset/condition_permutation_vittiny/train.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +start=0 +end=19 + +for i in $(seq $start $end) +do + python train.py class$i + sleep 1 +done \ No newline at end of file diff --git a/dataset/config.json b/dataset/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c2d5b532fbdea1f156db78b7a932cb33425535c2 --- /dev/null +++ b/dataset/config.json @@ -0,0 +1 @@ +{"dataset_root": "path_to_your_dataset", "imagenet_root": {"train": null, "test": null}, "dora_root": "/home/wangkai/arpgen/DoRA/commonsense_reasoning", "dora_env_name": "dora_llama"} \ No newline at end of file diff --git a/dataset/dataset.py b/dataset/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..08cb4d884d6364659c0593f2e20540eee0be032e --- /dev/null +++ b/dataset/dataset.py @@ -0,0 +1,327 @@ +import torch +import einops +from torch.utils.data import Dataset +from torchvision.datasets import CIFAR10 +from torchvision import transforms +import os +import math +import random +import json +from abc import ABC +import pickle + + + + +def pad_to_length(x, common_factor, **config): + if x.numel() % common_factor == 0: + return x.flatten() + # print(f"padding {x.shape} according to {common_factor}") + full_length = (x.numel() // common_factor + 1) * common_factor + padding_length = full_length - len(x.flatten()) + padding = torch.full([padding_length, ], dtype=x.dtype, device=x.device, fill_value=config["fill_value"]) + x = torch.cat((x.flatten(), padding), dim=0) + return x + +def layer_to_token(x, common_factor, **config): + if config["granularity"] == 2: # split by output + if x.numel() <= common_factor: + return pad_to_length(x.flatten(), common_factor, **config)[None] + dim2 = x[0].numel() + dim1 = x.shape[0] + if dim2 <= common_factor: + i = int(dim1 / (common_factor / dim2)) + while True: + if dim1 % i == 0 and dim2 * (dim1 // i) <= common_factor: + output = x.view(-1, dim2 * (dim1 // i)) + output = [pad_to_length(item, common_factor, **config) for item in output] + return torch.stack(output, dim=0) + i += 1 + else: # dim2 > common_factor + output = [layer_to_token(item, common_factor, **config) for item in x] + return torch.cat(output, dim=0) + elif config["granularity"] == 1: # split by layer + return pad_to_length(x.flatten(), common_factor, **config).view(-1, common_factor) + elif config["granularity"] == 0: # flatten directly + return x.flatten() + else: # NotImplementedError + raise NotImplementedError("granularity: 0: flatten directly, 1: split by layer, 2: split by output dim") + + +def token_to_layer(tokens, shape, **config): + common_factor = tokens.shape[-1] + if config["granularity"] == 2: # split by output + num_element = 
math.prod(shape) + if num_element <= common_factor: + param = tokens[0][:num_element].view(shape) + tokens = tokens[1:] + return param, tokens + dim2 = num_element // shape[0] + dim1 = shape[0] + if dim2 <= common_factor: + i = int(dim1 / (common_factor / dim2)) + while True: + if dim1 % i == 0 and dim2 * (dim1 // i) <= common_factor: + item_per_token = dim2 * (dim1 // i) + length = num_element // item_per_token + output = [item[:item_per_token] for item in tokens[:length]] + param = torch.cat(output, dim=0).view(shape) + tokens = tokens[length:] + return param, tokens + i += 1 + else: # dim2 > common_factor + output = [] + for i in range(shape[0]): + param, tokens = token_to_layer(tokens, shape[1:], **config) + output.append(param.flatten()) + param = torch.cat(output, dim=0).view(shape) + return param, tokens + elif config["granularity"] == 1: # split by layer + num_element = math.prod(shape) + token_num = num_element // common_factor if num_element % common_factor == 0 \ + else num_element // common_factor + 1 + param = tokens.flatten()[:num_element].view(shape) + tokens = tokens[token_num:] + return param, tokens + elif config["granularity"] == 0: # flatten directly + num_element = math.prod(shape) + param = tokens.flatten()[:num_element].view(shape) + tokens = pad_to_length(tokens.flatten()[num_element:], + common_factor, fill_value=torch.nan).view(-1, common_factor) + return param, tokens + else: # NotImplementedError + raise NotImplementedError("granularity: 0: flatten directly, 1: split by layer, 2: split by output dim") + + +def positional_embedding_2d(dim1, dim2, d_model): + assert d_model % 4 == 0, f"Cannot use sin/cos positional encoding with odd dimension {d_model}" + pe = torch.zeros(d_model, dim1, dim2) + d_model = int(d_model / 2) # Each dimension use half of d_model + div_term = torch.exp(torch.arange(0., d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / d_model)) + pos_w = torch.arange(0., dim2).unsqueeze(1) + pos_h = torch.arange(0., dim1).unsqueeze(1) + pe[0:d_model:2, :, :] = torch.sin(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, dim1, 1) + pe[1:d_model:2, :, :] = torch.cos(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, dim1, 1) + pe[d_model::2, :, :] = torch.sin(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, dim2) + pe[d_model+1::2, :, :] = torch.cos(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, dim2) + return pe.permute(1, 2, 0) + + +def positional_embedding_1d(dim1, d_model): + pe = torch.zeros(dim1, d_model) + position = torch.arange(0, dim1, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + return pe + + + + +class BaseDataset(Dataset, ABC): + data_path = None + generated_path = None + test_command = None + config = { + "fill_value": torch.nan, + "granularity": 1, # 0: flatten directly, 1: split by layer, 2: split by output + "pe_granularity": 2, # 0: no embedding, 1: 1d embedding, 2: 2d embedding + } + + def __init__(self, checkpoint_path=None, dim_per_token=8192, **kwargs): + if not os.path.exists(self.data_path): + os.makedirs(self.data_path, exist_ok=False) + if self.generated_path is not None and not os.path.exists(os.path.dirname(self.generated_path)): + os.makedirs(os.path.dirname(self.generated_path)) + self.config.update(kwargs) + checkpoint_path = self.data_path if checkpoint_path is None else checkpoint_path + assert 
os.path.exists(checkpoint_path) + self.dim_per_token = dim_per_token + self.structure = None # set in get_structure() + self.sequence_length = None # set in get_structure() + # load checkpoint_list + checkpoint_list = os.listdir(checkpoint_path) + self.checkpoint_list = list([os.path.join(checkpoint_path, item) for item in checkpoint_list]) + self.length = self.real_length = len(self.checkpoint_list) + self.set_infinite_dataset() + # get structure + structure_cache_file = os.path.join(os.path.dirname(self.data_path), "structure.cache") + try: # try to load cache file + assert os.path.exists(structure_cache_file) + with open(structure_cache_file, "rb") as f: + print(f"Loading cache from {structure_cache_file}") + cache_file = pickle.load(f) + if len(self.checkpoint_list) != 0: + assert set(cache_file["checkpoint_list"]) == set(self.checkpoint_list) + self.structure = cache_file["structure"] + else: # empty checkpoint_list, only generate + print("Cannot find any trained checkpoint, loading cache file for generating!") + self.structure = cache_file["structure"] + fake_diction = {key: torch.zeros(item[0]) for key, item in self.structure.items()} + torch.save(fake_diction, os.path.join(checkpoint_path, "fake_checkpoint.pth")) + self.checkpoint_list.append(os.path.join(checkpoint_path, "fake_checkpoint.pth")) + self.length = self.real_length = len(self.checkpoint_list) + self.set_infinite_dataset() + os.system(f"rm {os.path.join(checkpoint_path, 'fake_checkpoint.pth')}") + except AssertionError: # recompute cache file + print("==> Organizing structure..") + self.structure = self.get_structure() + with open(structure_cache_file, "wb") as f: + pickle.dump({"structure": self.structure, "checkpoint_list": self.checkpoint_list}, f) + # get sequence_length + self.sequence_length = self.get_sequence_length() + + def get_sequence_length(self): + fake_diction = {key: torch.zeros(item[0]) for key, item in self.structure.items()} + # get sequence_length + param = self.preprocess(fake_diction) + self.sequence_length = param.size(0) + return self.sequence_length + + def get_structure(self): + # get structure + checkpoint_list = self.checkpoint_list + structures = [{} for _ in range(len(checkpoint_list))] + for i, checkpoint in enumerate(checkpoint_list): + diction = torch.load(checkpoint, map_location="cpu") + for key, value in diction.items(): + if ("num_batches_tracked" in key) or (value.numel() == 1) or not torch.is_floating_point(value): + structures[i][key] = (value.shape, value, None) + elif "running_var" in key: + pre_mean = value.mean() * 0.95 + value = torch.log(value / pre_mean + 0.05) + structures[i][key] = (value.shape, pre_mean, value.mean(), value.std()) + else: # conv & linear + structures[i][key] = (value.shape, value.mean(), value.std()) + final_structure = {} + structure_diction = torch.load(checkpoint_list[0], map_location="cpu") + for key, param in structure_diction.items(): + if ("num_batches_tracked" in key) or (param.numel() == 1) or not torch.is_floating_point(param): + final_structure[key] = (param.shape, param, None) + elif "running_var" in key: + value = [param.shape, 0., 0., 0.] + for structure in structures: + for i in [1, 2, 3]: + value[i] += structure[key][i] + for i in [1, 2, 3]: + value[i] /= len(structures) + final_structure[key] = tuple(value) + else: # conv & linear + value = [param.shape, 0., 0.] 
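                # The per-layer mean/std collected below are averaged over every checkpoint in
                # the dataset, so preprocess() can z-normalize (and postprocess() de-normalize)
                # each conv/linear tensor with statistics shared across all samples;
                # running_var tensors were already log-transformed in the branch above.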
+ for structure in structures: + for i in [1, 2]: + value[i] += structure[key][i] + for i in [1, 2]: + value[i] /= len(structures) + final_structure[key] = tuple(value) + self.structure = final_structure + return self.structure + + def set_infinite_dataset(self, max_num=None): + if max_num is None: + max_num = self.length * 1000000 + self.length = max_num + return self + + @property + def max_permutation_state(self): + return self.real_length + + def get_position_embedding(self, positional_embedding_dim=None): + if positional_embedding_dim is None: + positional_embedding_dim = self.dim_per_token // 2 + assert self.structure is not None, "run get_structure before get_position_embedding" + if self.config["pe_granularity"] == 2: + print("Use 2d positional embedding") + positional_embedding_index = [] + for key, item in self.structure.items(): + if ("num_batches_tracked" in key) or (item[-1] is None): + continue + else: # conv & linear + shape, *_ = item + fake_param = torch.ones(size=shape) + fake_param = layer_to_token(fake_param, self.dim_per_token, **self.config) + positional_embedding_index.append(list(range(fake_param.size(0)))) + dim1 = len(positional_embedding_index) + dim2 = max([len(token_per_layer) for token_per_layer in positional_embedding_index]) + full_pe = positional_embedding_2d(dim1, dim2, positional_embedding_dim) + positional_embedding = [] + for layer_index, token_indexes in enumerate(positional_embedding_index): + for token_index in token_indexes: + this_pe = full_pe[layer_index, token_index] + positional_embedding.append(this_pe) + positional_embedding = torch.stack(positional_embedding) + return positional_embedding + elif self.config["pe_granularity"] == 1: + print("Use 1d positional embedding") + return positional_embedding_1d(self.sequence_length, positional_embedding_dim) + elif self.config["pe_granularity"] == 0: + print("Not use positional embedding") + return torch.zeros_like(self.__getitem__(0)) + else: # NotImplementedError + raise NotImplementedError("pe_granularity: 0: no embedding, 1: 1d embedding, 2: 2d embedding") + + def __len__(self): + return self.length + + def __getitem__(self, index): + index = index % self.real_length + diction = torch.load(self.checkpoint_list[index], map_location="cpu") + param = self.preprocess(diction) + return param, index + + def save_params(self, params, save_path): + diction = self.postprocess(params.cpu().to(torch.float32)) + torch.save(diction, save_path) + + def preprocess(self, diction: dict, **kwargs) -> torch.Tensor: + param_list = [] + for key, value in diction.items(): + if ("num_batches_tracked" in key) or (value.numel() == 1) or not torch.is_floating_point(value): + continue + elif "running_var" in key: + shape, pre_mean, mean, std = self.structure[key] + value = torch.log(value / pre_mean + 0.05) + else: # normal + shape, mean, std = self.structure[key] + value = (value - mean) / std + value = layer_to_token(value, self.dim_per_token, **self.config) + param_list.append(value) + param = torch.cat(param_list, dim=0) + if self.config["granularity"] == 0: # padding directly process tail + param = pad_to_length(param, self.dim_per_token, **self.config).view(-1, self.dim_per_token) + # print("Sequence length:", param.size(0)) + return param.to(torch.float32) + + def postprocess(self, params: torch.Tensor, **kwargs) -> dict: + diction = {} + params = params if len(params.shape) == 2 else params.squeeze(0) + for key, item in self.structure.items(): + if ("num_batches_tracked" in key) or (item[-1] is None): + shape, mean, 
std = item + diction[key] = mean + continue + elif "running_var" in key: + shape, pre_mean, mean, std = item + else: # conv & linear + shape, mean, std = item + this_param, params = token_to_layer(params, shape, **self.config) + this_param = this_param * std + mean + if "running_var" in key: + this_param = torch.clip(torch.exp(this_param) - 0.05, min=0.001) * pre_mean + diction[key] = this_param + return diction + + +class ConditionalDataset(BaseDataset, ABC): + def _extract_condition(self, index: int): + name = self.checkpoint_list[index] + condition_list = os.path.basename(name).split("_") + return condition_list + + def __getitem__(self, index): + index = index % self.real_length + diction = torch.load(self.checkpoint_list[index], map_location="cpu") + condition = self._extract_condition(index) + param = self.preprocess(diction) + return param, condition \ No newline at end of file diff --git a/dataset/downtask_detection/README.md b/dataset/downtask_detection/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a5157b97a6b139556c41dd0d06e2b5bf46bac5a5 --- /dev/null +++ b/dataset/downtask_detection/README.md @@ -0,0 +1 @@ +Code for segmentation is coming... \ No newline at end of file diff --git a/dataset/downtask_detection/test.sh b/dataset/downtask_detection/test.sh new file mode 100644 index 0000000000000000000000000000000000000000..7a5baf76ce9ffb59abca2ec111d5c16f8d090d6a --- /dev/null +++ b/dataset/downtask_detection/test.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +source /path/to/miniconda3/bin/activate /path/to/miniconda3/envs/environment + +CLUSTER=True \ +DETECTRON2_DATASETS="/path/to/" \ +PYTHONPATH="$(dirname $0)/Detection":$PYTHONPATH \ +python $(dirname $0)/Detection/tools/lazyconfig_train_net.py --config-file $(dirname $0)/Detection/projects/ViTDet/configs/COCO/our_vit_b_100ep.py --finetune "VIT_BASE_IN21K" \ +--num-gpus 1 \ +--fulltune \ +--eval-only "train.init_checkpoint='$1'" diff --git a/dataset/downtask_dora_r16/adapter_config.json b/dataset/downtask_dora_r16/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..65200892e36afccefad839aedfd2d75566eda515 --- /dev/null +++ b/dataset/downtask_dora_r16/adapter_config.json @@ -0,0 +1,23 @@ +{ + "Wdecompose_target_modules": null, + "base_model_name_or_path": "yahma/llama-7b-hf", + "bias": "none", + "dora_simple": true, + "enable_lora": null, + "fan_in_fan_out": false, + "inference_mode": true, + "lora_alpha": 32, + "lora_dropout": 0.05, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "DORA", + "r": 16, + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "up_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/dataset/downtask_dora_r16/test.py b/dataset/downtask_dora_r16/test.py new file mode 100644 index 0000000000000000000000000000000000000000..eb077a99c501d286ab8db2fd09e1c17cf313eefd --- /dev/null +++ b/dataset/downtask_dora_r16/test.py @@ -0,0 +1,92 @@ +import os +import re +import sys +import time +import shutil +from _thread import start_new_thread +cuda_visible_devices = os.environ["CUDA_VISIBLE_DEVICES"] +assert len(cuda_visible_devices) == 1, "Only support test on one GPU." 
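# This script evaluates a single generated DoRA adapter: it copies adapter_config.json and
# the given checkpoint (renamed to adapter_model.bin) into an evaluation directory, switches
# to the DoRA commonsense-reasoning repo given by "dora_root" in dataset/config.json, runs
# llama_7B_Dora_eval.sh inside the conda env named by "dora_env_name", and finally parses the
# last reported accuracy from each result .txt file in conclude().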
+RANK = 16 + + + + +checkpoint_path = sys.argv[1] +assert os.path.exists(checkpoint_path) and os.path.isfile(checkpoint_path) +checkpoint_path = os.path.abspath(checkpoint_path) +adapter_config_path = os.path.join(os.path.dirname(__file__), "adapter_config.json") +assert os.path.exists(adapter_config_path) +evaluate_path = f"./finetuned_result/dora_r{RANK}/evaluating" + + +# change root +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) +root = additional_config["dora_root"] +sys.path.append(root) +os.chdir(root) +print(f"\033[91mWe are working under: {root}\033[0m") + + +# copy file +os.makedirs(evaluate_path, exist_ok=True) +if len(os.listdir(evaluate_path)) != 0: + os.system(f"rm {evaluate_path}/* -rf") +shutil.copy(adapter_config_path, os.path.join(evaluate_path, "adapter_config.json")) +shutil.copy(checkpoint_path, os.path.join(evaluate_path, "adapter_model.bin")) +evaluate_path = os.path.abspath(evaluate_path) + + + + +def find_last_accuracy_value_in_result_txt(file_pointer): + original_position = file_pointer.tell() + file_pointer.seek(0, 2) + end_position = file_pointer.tell() + buffer_size = 1024 + while end_position > 0: + start_position = max(0, end_position - buffer_size) + file_pointer.seek(start_position) + chunk = file_pointer.read(end_position - start_position) + lines = chunk.splitlines(True) # True keeps the newline character with the line + if start_position > 0: + first_line_in_chunk = lines[0] + lines[0] = file_pointer.readline() + first_line_in_chunk + for line in reversed(lines): + match = re.search(r"accuracy\s\d+\s+(\d+\.\d+)", line) + if match: + accuracy_value = float(match.group(1)) + file_pointer.seek(original_position) + return accuracy_value + end_position = start_position + file_pointer.seek(original_position) + return None + + +def conclude(path=evaluate_path): + print("\n\n\n\n\n==================== CONCLUDE ======================\n") + files = os.listdir(path) + files.sort() + for file in files: + file = os.path.join(path, file) + if ".txt" in file: + name = os.path.basename(file).split(".")[0] + with open(file, "r") as f: + value = find_last_accuracy_value_in_result_txt(f) + value *= 100. + print(f"{name}: {value:.3f}") + + + + +# start testing +activate_path = shutil.which('conda')[:-5] + "activate" +env_path = shutil.which('conda')[:-9] + f"envs/{additional_config['dora_env_name']}" +os.system( + f"bash -c \"source {activate_path} {env_path} && " + + f"sh llama_7B_Dora_eval.sh {evaluate_path} {cuda_visible_devices}\"" +) +conclude(evaluate_path) +print() diff --git a/dataset/downtask_dora_r16/train.py b/dataset/downtask_dora_r16/train.py new file mode 100644 index 0000000000000000000000000000000000000000..a2cf3b7b643db857f485d8be4116f9e1b6344ed7 --- /dev/null +++ b/dataset/downtask_dora_r16/train.py @@ -0,0 +1,109 @@ +import os +import re +import sys +import time +import torch +import shutil +from _thread import start_new_thread +cuda_visible_devices = os.environ["CUDA_VISIBLE_DEVICES"] +assert len(cuda_visible_devices) == 1, "Only support train on one GPU." 
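+# Fine-tune LLaMA-7B with DoRA at this rank; RANK and 2 * RANK are forwarded to
+# llama_7B_Dora.sh, and the background threads below harvest intermediate adapters
+# from ./finetuned_result/dora_r{RANK} into this directory's checkpoint/ folder.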
+RANK = 16 + + + + +checkpoint_path = os.path.join(os.path.dirname(__file__), "checkpoint") +if not os.path.exists(checkpoint_path): + os.makedirs(checkpoint_path, exist_ok=False) +generated_path = os.path.join(os.path.dirname(__file__), "generated") +if not os.path.exists(generated_path): + os.makedirs(generated_path, exist_ok=False) +adapter_config_path = os.path.join(os.path.dirname(__file__), "adapter_config.json") + + +# change root +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) +root = additional_config["dora_root"] +sys.path.append(root) +os.chdir(root) +print(f"\033[91mWe are working under: {root}\033[0m") +if os.path.exists(f"./finetuned_result/dora_r{RANK}"): + print(f"\033[91mWARNING: ./finetuned_result/dora_r{RANK} existed!\033[0m") + input("\033[91mPress ENTER to clear this dir...\033[0m") + os.system(f"rm ./finetuned_result/dora_r{RANK}/* -rf") + + + + +exit_flag = False + +def move_to_checkpoint(): + global exit_flag + index = 1 + finished_list = [] + while exit_flag is False: + father = f"./finetuned_result/dora_r{RANK}" + if not os.path.exists(father): + time.sleep(1) + continue + item_list = os.listdir(father) + for item in item_list: + src = os.path.join(father, item) + if not os.path.isdir(src): + continue # is file saved in the end + if item[:4] == "tmp-": + continue # is a tmp file + if src in finished_list: + continue # have been processed + finished_list.append(src) + try: # deleted before loaded + shutil.copy(os.path.join(src, "adapter_config.json"), adapter_config_path) + src = os.path.join(src, "adapter_model.bin") + diction = torch.load(src, map_location="cpu", weights_only=False) + dst = os.path.join(checkpoint_path, f"{str(index).zfill(7)}.pth") + torch.save(diction, dst) + except Exception as e: + print(f"\033[91mWARNING: encountered {e} and ignored.\033[0m") + continue + print(f"Moved {src} to {dst}.") + index += 1 + time.sleep(1) +start_new_thread(move_to_checkpoint, ()) + + +def remove_early_checkpoint(): + global exit_flag + while exit_flag is False: + item_list = [item for item in os.listdir(checkpoint_path) if item.endswith('.pth')] + if len(item_list) <= 50: + time.sleep(10) + continue + def extract_number(filename): + match = re.search(r'(\d+).pth', filename) + return int(match.group(1)) if match else -1 + sorted_items = sorted(item_list, key=extract_number) + num_to_remove = len(sorted_items) - 50 + for i in range(num_to_remove): + file_to_remove = os.path.join(checkpoint_path, sorted_items[i]) + os.remove(file_to_remove) + print(f"\033[91mRemoved: {file_to_remove}\033[0m") + time.sleep(10) +start_new_thread(remove_early_checkpoint, ()) + + + + +# start training +activate_path = shutil.which('conda')[:-5] + "activate" +env_path = shutil.which('conda')[:-9] + f"envs/{additional_config['dora_env_name']}" +os.system( + f"bash -c \"source {activate_path} {env_path} && " + + f"sh llama_7B_Dora.sh {RANK} {RANK*2} ./finetuned_result/dora_r{RANK} {cuda_visible_devices}\"" +) +# noinspection PyRedeclaration +time.sleep(5) +exit_flag = True +time.sleep(20) \ No newline at end of file diff --git a/dataset/downtask_dora_r4/adapter_config.json b/dataset/downtask_dora_r4/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3b438b504b34d6be5c633ccf3c2919dde9fc910e --- /dev/null +++ b/dataset/downtask_dora_r4/adapter_config.json @@ -0,0 +1,23 @@ +{ + "Wdecompose_target_modules": null, + 
"base_model_name_or_path": "yahma/llama-7b-hf", + "bias": "none", + "dora_simple": true, + "enable_lora": null, + "fan_in_fan_out": false, + "inference_mode": true, + "lora_alpha": 8, + "lora_dropout": 0.05, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "DORA", + "r": 4, + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "up_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/dataset/downtask_dora_r4/test.py b/dataset/downtask_dora_r4/test.py new file mode 100644 index 0000000000000000000000000000000000000000..19fe92d404519e61d438b2992cae79c36510e935 --- /dev/null +++ b/dataset/downtask_dora_r4/test.py @@ -0,0 +1,92 @@ +import os +import re +import sys +import time +import shutil +from _thread import start_new_thread +cuda_visible_devices = os.environ["CUDA_VISIBLE_DEVICES"] +assert len(cuda_visible_devices) == 1, "Only support test on one GPU." +RANK = 4 + + + + +checkpoint_path = sys.argv[1] +assert os.path.exists(checkpoint_path) and os.path.isfile(checkpoint_path) +checkpoint_path = os.path.abspath(checkpoint_path) +adapter_config_path = os.path.join(os.path.dirname(__file__), "adapter_config.json") +assert os.path.exists(adapter_config_path) +evaluate_path = f"./finetuned_result/dora_r{RANK}/evaluating" + + +# change root +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) +root = additional_config["dora_root"] +sys.path.append(root) +os.chdir(root) +print(f"\033[91mWe are working under: {root}\033[0m") + + +# copy file +os.makedirs(evaluate_path, exist_ok=True) +if len(os.listdir(evaluate_path)) != 0: + os.system(f"rm {evaluate_path}/* -rf") +shutil.copy(adapter_config_path, os.path.join(evaluate_path, "adapter_config.json")) +shutil.copy(checkpoint_path, os.path.join(evaluate_path, "adapter_model.bin")) +evaluate_path = os.path.abspath(evaluate_path) + + + + +def find_last_accuracy_value_in_result_txt(file_pointer): + original_position = file_pointer.tell() + file_pointer.seek(0, 2) + end_position = file_pointer.tell() + buffer_size = 1024 + while end_position > 0: + start_position = max(0, end_position - buffer_size) + file_pointer.seek(start_position) + chunk = file_pointer.read(end_position - start_position) + lines = chunk.splitlines(True) # True keeps the newline character with the line + if start_position > 0: + first_line_in_chunk = lines[0] + lines[0] = file_pointer.readline() + first_line_in_chunk + for line in reversed(lines): + match = re.search(r"accuracy\s\d+\s+(\d+\.\d+)", line) + if match: + accuracy_value = float(match.group(1)) + file_pointer.seek(original_position) + return accuracy_value + end_position = start_position + file_pointer.seek(original_position) + return None + + +def conclude(path=evaluate_path): + print("\n\n\n\n\n==================== CONCLUDE ======================\n") + files = os.listdir(path) + files.sort() + for file in files: + file = os.path.join(path, file) + if ".txt" in file: + name = os.path.basename(file).split(".")[0] + with open(file, "r") as f: + value = find_last_accuracy_value_in_result_txt(f) + value *= 100. 
+ print(f"{name}: {value:.3f}") + + + + +# start testing +activate_path = shutil.which('conda')[:-5] + "activate" +env_path = shutil.which('conda')[:-9] + f"envs/{additional_config['dora_env_name']}" +os.system( + f"bash -c \"source {activate_path} {env_path} && " + + f"sh llama_7B_Dora_eval.sh {evaluate_path} {cuda_visible_devices}\"" +) +conclude(evaluate_path) +print() diff --git a/dataset/downtask_dora_r4/train.py b/dataset/downtask_dora_r4/train.py new file mode 100644 index 0000000000000000000000000000000000000000..c07fea4ee86180e7b9e4a312d9d6509be84f82b5 --- /dev/null +++ b/dataset/downtask_dora_r4/train.py @@ -0,0 +1,109 @@ +import os +import re +import sys +import time +import torch +import shutil +from _thread import start_new_thread +cuda_visible_devices = os.environ["CUDA_VISIBLE_DEVICES"] +assert len(cuda_visible_devices) == 1, "Only support train on one GPU." +RANK = 4 + + + + +checkpoint_path = os.path.join(os.path.dirname(__file__), "checkpoint") +if not os.path.exists(checkpoint_path): + os.makedirs(checkpoint_path, exist_ok=False) +generated_path = os.path.join(os.path.dirname(__file__), "generated") +if not os.path.exists(generated_path): + os.makedirs(generated_path, exist_ok=False) +adapter_config_path = os.path.join(os.path.dirname(__file__), "adapter_config.json") + + +# change root +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) +root = additional_config["dora_root"] +sys.path.append(root) +os.chdir(root) +print(f"\033[91mWe are working under: {root}\033[0m") +if os.path.exists(f"./finetuned_result/dora_r{RANK}"): + print(f"\033[91mWARNING: ./finetuned_result/dora_r{RANK} existed!\033[0m") + input("\033[91mPress ENTER to clear this dir...\033[0m") + os.system(f"rm ./finetuned_result/dora_r{RANK}/* -rf") + + + + +exit_flag = False + +def move_to_checkpoint(): + global exit_flag + index = 1 + finished_list = [] + while exit_flag is False: + father = f"./finetuned_result/dora_r{RANK}" + if not os.path.exists(father): + time.sleep(1) + continue + item_list = os.listdir(father) + for item in item_list: + src = os.path.join(father, item) + if not os.path.isdir(src): + continue # is file saved in the end + if item[:4] == "tmp-": + continue # is a tmp file + if src in finished_list: + continue # have been processed + finished_list.append(src) + try: # deleted before loaded + shutil.copy(os.path.join(src, "adapter_config.json"), adapter_config_path) + src = os.path.join(src, "adapter_model.bin") + diction = torch.load(src, map_location="cpu", weights_only=False) + dst = os.path.join(checkpoint_path, f"{str(index).zfill(7)}.pth") + torch.save(diction, dst) + except Exception as e: + print(f"\033[91mWARNING: encountered {e} and ignored.\033[0m") + continue + print(f"Moved {src} to {dst}.") + index += 1 + time.sleep(1) +start_new_thread(move_to_checkpoint, ()) + + +def remove_early_checkpoint(): + global exit_flag + while exit_flag is False: + item_list = [item for item in os.listdir(checkpoint_path) if item.endswith('.pth')] + if len(item_list) <= 50: + time.sleep(10) + continue + def extract_number(filename): + match = re.search(r'(\d+).pth', filename) + return int(match.group(1)) if match else -1 + sorted_items = sorted(item_list, key=extract_number) + num_to_remove = len(sorted_items) - 50 + for i in range(num_to_remove): + file_to_remove = os.path.join(checkpoint_path, sorted_items[i]) + os.remove(file_to_remove) + print(f"\033[91mRemoved: 
{file_to_remove}\033[0m") + time.sleep(10) +start_new_thread(remove_early_checkpoint, ()) + + + + +# start training +activate_path = shutil.which('conda')[:-5] + "activate" +env_path = shutil.which('conda')[:-9] + f"envs/{additional_config['dora_env_name']}" +os.system( + f"bash -c \"source {activate_path} {env_path} && " + + f"sh llama_7B_Dora.sh {RANK} {RANK*2} ./finetuned_result/dora_r{RANK} {cuda_visible_devices}\"" +) +# noinspection PyRedeclaration +time.sleep(5) +exit_flag = True +time.sleep(20) \ No newline at end of file diff --git a/dataset/downtask_dora_r64/adapter_config.json b/dataset/downtask_dora_r64/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..74252d5d0fc55b3a5c8661205e10cfdd4de972fe --- /dev/null +++ b/dataset/downtask_dora_r64/adapter_config.json @@ -0,0 +1,23 @@ +{ + "Wdecompose_target_modules": null, + "base_model_name_or_path": "yahma/llama-7b-hf", + "bias": "none", + "dora_simple": true, + "enable_lora": null, + "fan_in_fan_out": false, + "inference_mode": true, + "lora_alpha": 128, + "lora_dropout": 0.05, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "DORA", + "r": 64, + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "up_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/dataset/downtask_dora_r64/test.py b/dataset/downtask_dora_r64/test.py new file mode 100644 index 0000000000000000000000000000000000000000..ef5649d5ddf96d840922b4e2f0dc5860ffb2b99e --- /dev/null +++ b/dataset/downtask_dora_r64/test.py @@ -0,0 +1,92 @@ +import os +import re +import sys +import time +import shutil +from _thread import start_new_thread +cuda_visible_devices = os.environ["CUDA_VISIBLE_DEVICES"] +assert len(cuda_visible_devices) == 1, "Only support test on one GPU." 
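+# Same evaluation flow as the r4/r16 variants; RANK below must match "r": 64 in
+# this directory's adapter_config.json. conclude() reports the last "accuracy"
+# entry of each result .txt file as a percentage.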
+RANK = 64 + + + + +checkpoint_path = sys.argv[1] +assert os.path.exists(checkpoint_path) and os.path.isfile(checkpoint_path) +checkpoint_path = os.path.abspath(checkpoint_path) +adapter_config_path = os.path.join(os.path.dirname(__file__), "adapter_config.json") +assert os.path.exists(adapter_config_path) +evaluate_path = f"./finetuned_result/dora_r{RANK}/evaluating" + + +# change root +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) +root = additional_config["dora_root"] +sys.path.append(root) +os.chdir(root) +print(f"\033[91mWe are working under: {root}\033[0m") + + +# copy file +os.makedirs(evaluate_path, exist_ok=True) +if len(os.listdir(evaluate_path)) != 0: + os.system(f"rm {evaluate_path}/* -rf") +shutil.copy(adapter_config_path, os.path.join(evaluate_path, "adapter_config.json")) +shutil.copy(checkpoint_path, os.path.join(evaluate_path, "adapter_model.bin")) +evaluate_path = os.path.abspath(evaluate_path) + + + + +def find_last_accuracy_value_in_result_txt(file_pointer): + original_position = file_pointer.tell() + file_pointer.seek(0, 2) + end_position = file_pointer.tell() + buffer_size = 1024 + while end_position > 0: + start_position = max(0, end_position - buffer_size) + file_pointer.seek(start_position) + chunk = file_pointer.read(end_position - start_position) + lines = chunk.splitlines(True) # True keeps the newline character with the line + if start_position > 0: + first_line_in_chunk = lines[0] + lines[0] = file_pointer.readline() + first_line_in_chunk + for line in reversed(lines): + match = re.search(r"accuracy\s\d+\s+(\d+\.\d+)", line) + if match: + accuracy_value = float(match.group(1)) + file_pointer.seek(original_position) + return accuracy_value + end_position = start_position + file_pointer.seek(original_position) + return None + + +def conclude(path=evaluate_path): + print("\n\n\n\n\n==================== CONCLUDE ======================\n") + files = os.listdir(path) + files.sort() + for file in files: + file = os.path.join(path, file) + if ".txt" in file: + name = os.path.basename(file).split(".")[0] + with open(file, "r") as f: + value = find_last_accuracy_value_in_result_txt(f) + value *= 100. + print(f"{name}: {value:.3f}") + + + + +# start testing +activate_path = shutil.which('conda')[:-5] + "activate" +env_path = shutil.which('conda')[:-9] + f"envs/{additional_config['dora_env_name']}" +os.system( + f"bash -c \"source {activate_path} {env_path} && " + + f"sh llama_7B_Dora_eval.sh {evaluate_path} {cuda_visible_devices}\"" +) +conclude(evaluate_path) +print() diff --git a/dataset/downtask_dora_r64/train.py b/dataset/downtask_dora_r64/train.py new file mode 100644 index 0000000000000000000000000000000000000000..83115fd3d4fda38935ff8b934e25bcb8faf13a9a --- /dev/null +++ b/dataset/downtask_dora_r64/train.py @@ -0,0 +1,109 @@ +import os +import re +import sys +import time +import torch +import shutil +from _thread import start_new_thread +cuda_visible_devices = os.environ["CUDA_VISIBLE_DEVICES"] +assert len(cuda_visible_devices) == 1, "Only support train on one GPU." 
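+# Same training flow as the r4/r16 variants, at rank 64; only the 50 most recent
+# harvested checkpoints are kept (see remove_early_checkpoint below).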
+RANK = 64 + + + + +checkpoint_path = os.path.join(os.path.dirname(__file__), "checkpoint") +if not os.path.exists(checkpoint_path): + os.makedirs(checkpoint_path, exist_ok=False) +generated_path = os.path.join(os.path.dirname(__file__), "generated") +if not os.path.exists(generated_path): + os.makedirs(generated_path, exist_ok=False) +adapter_config_path = os.path.join(os.path.dirname(__file__), "adapter_config.json") + + +# change root +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) +root = additional_config["dora_root"] +sys.path.append(root) +os.chdir(root) +print(f"\033[91mWe are working under: {root}\033[0m") +if os.path.exists(f"./finetuned_result/dora_r{RANK}"): + print(f"\033[91mWARNING: ./finetuned_result/dora_r{RANK} existed!\033[0m") + input("\033[91mPress ENTER to clear this dir...\033[0m") + os.system(f"rm ./finetuned_result/dora_r{RANK}/* -rf") + + + + +exit_flag = False + +def move_to_checkpoint(): + global exit_flag + index = 1 + finished_list = [] + while exit_flag is False: + father = f"./finetuned_result/dora_r{RANK}" + if not os.path.exists(father): + time.sleep(1) + continue + item_list = os.listdir(father) + for item in item_list: + src = os.path.join(father, item) + if not os.path.isdir(src): + continue # is file saved in the end + if item[:4] == "tmp-": + continue # is a tmp file + if src in finished_list: + continue # have been processed + finished_list.append(src) + try: # deleted before loaded + shutil.copy(os.path.join(src, "adapter_config.json"), adapter_config_path) + src = os.path.join(src, "adapter_model.bin") + diction = torch.load(src, map_location="cpu", weights_only=False) + dst = os.path.join(checkpoint_path, f"{str(index).zfill(7)}.pth") + torch.save(diction, dst) + except Exception as e: + print(f"\033[91mWARNING: encountered {e} and ignored.\033[0m") + continue + print(f"Moved {src} to {dst}.") + index += 1 + time.sleep(1) +start_new_thread(move_to_checkpoint, ()) + + +def remove_early_checkpoint(): + global exit_flag + while exit_flag is False: + item_list = [item for item in os.listdir(checkpoint_path) if item.endswith('.pth')] + if len(item_list) <= 50: + time.sleep(10) + continue + def extract_number(filename): + match = re.search(r'(\d+).pth', filename) + return int(match.group(1)) if match else -1 + sorted_items = sorted(item_list, key=extract_number) + num_to_remove = len(sorted_items) - 50 + for i in range(num_to_remove): + file_to_remove = os.path.join(checkpoint_path, sorted_items[i]) + os.remove(file_to_remove) + print(f"\033[91mRemoved: {file_to_remove}\033[0m") + time.sleep(10) +start_new_thread(remove_early_checkpoint, ()) + + + + +# start training +activate_path = shutil.which('conda')[:-5] + "activate" +env_path = shutil.which('conda')[:-9] + f"envs/{additional_config['dora_env_name']}" +os.system( + f"bash -c \"source {activate_path} {env_path} && " + + f"sh llama_7B_Dora.sh {RANK} {RANK*2} ./finetuned_result/dora_r{RANK} {cuda_visible_devices}\"" +) +# noinspection PyRedeclaration +time.sleep(5) +exit_flag = True +time.sleep(20) \ No newline at end of file diff --git a/dataset/downtask_segmentation/README.md b/dataset/downtask_segmentation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a5157b97a6b139556c41dd0d06e2b5bf46bac5a5 --- /dev/null +++ b/dataset/downtask_segmentation/README.md @@ -0,0 +1 @@ +Code for segmentation is coming... 
\ No newline at end of file diff --git a/dataset/downtask_segmentation/convert.py b/dataset/downtask_segmentation/convert.py new file mode 100644 index 0000000000000000000000000000000000000000..b1997ae0bc8cd37a4c7bc50a42c57b2cd0c57482 --- /dev/null +++ b/dataset/downtask_segmentation/convert.py @@ -0,0 +1,16 @@ +import numpy as np +import torch +import sys +import os + +file = sys.argv[1] +model = torch.load(file, map_location="cpu") +if 'meta' in model.keys(): + print("this file need not to convert.") + exit(0) +else: # this is a raw checkpoint + meta_file = os.path.join(os.path.dirname(__file__), "Segmentation/example.pth") + meta_data = torch.load(meta_file, map_location="cpu")['meta'] + model = {'meta': meta_data, "state_dict": model} + torch.save(model, file) + print("converted to test-able file.") diff --git a/dataset/downtask_segmentation/reverse.py b/dataset/downtask_segmentation/reverse.py new file mode 100644 index 0000000000000000000000000000000000000000..3cdcadd3cd92466887d3eac7c38e225129470145 --- /dev/null +++ b/dataset/downtask_segmentation/reverse.py @@ -0,0 +1,12 @@ +import numpy as np +import torch +import sys + +file = sys.argv[1] + +try: + model = torch.load(file, map_location='cpu')['state_dict'] + torch.save(model, file) + print('this file has been reversed.') +except KeyError: + print('this file need not to reverse.') diff --git a/dataset/downtask_segmentation/test.sh b/dataset/downtask_segmentation/test.sh new file mode 100644 index 0000000000000000000000000000000000000000..6fc4812ef9122ef09034e4627bbe1483baa0fc65 --- /dev/null +++ b/dataset/downtask_segmentation/test.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +source /path/to/miniconda3/bin/activate /path/to/miniconda3/envs/environment + +python ./convert.py "$1" + +PYTHONPATH=/path/to/Segmentation:$PYTHONPATH \ + python /path/to/Segmentation/tools/test.py \ + /path/to/Segmentation/configs/beit/upernet/our_vit.py \ + "$1" \ + --launcher none \ + --eval "mIoU" + +python ./reverse.py "$1" + diff --git a/dataset/imagenet_convnextatto/model.py b/dataset/imagenet_convnextatto/model.py new file mode 100644 index 0000000000000000000000000000000000000000..650d4a115b5b87c10f1019144acbd3cfa362d68c --- /dev/null +++ b/dataset/imagenet_convnextatto/model.py @@ -0,0 +1,16 @@ +import torch.nn as nn +import timm + + +def Model(): + model = timm.create_model("convnext_atto", pretrained=True) + return model, model.head.fc + + +if __name__ == "__main__": + model, _ = Model() + print(model) + num_param = 0 + for v in model.parameters(): + num_param += v.numel() + print("num_param:", num_param) diff --git a/dataset/imagenet_convnextatto/test.py b/dataset/imagenet_convnextatto/test.py new file mode 100644 index 0000000000000000000000000000000000000000..4dd304532ca5ea17bf916c2a3a7ec15a03051a82 --- /dev/null +++ b/dataset/imagenet_convnextatto/test.py @@ -0,0 +1,28 @@ +import os +import sys +if __name__ == "__main__": + from train import * +else: # relative import + from .train import * + + + + +try: + test_item = sys.argv[1] +except IndexError: + assert __name__ == "__main__" + test_item = "./checkpoint" +test_items = [] +if os.path.isdir(test_item): + for item in os.listdir(test_item): + item = os.path.join(test_item, item) + test_items.append(item) +elif os.path.isfile(test_item): + test_items.append(test_item) + + +for item in test_items: + state = torch.load(item, map_location="cpu") + model.load_state_dict({key: value.to(torch.float32).to(device) for key, value in state.items()}) + loss, acc, all_targets, all_predicts = 
test(model=model) \ No newline at end of file diff --git a/dataset/imagenet_convnextatto/train.py b/dataset/imagenet_convnextatto/train.py new file mode 100644 index 0000000000000000000000000000000000000000..87aa842f6f0742f0bcb1c4ca1059c34869cec357 --- /dev/null +++ b/dataset/imagenet_convnextatto/train.py @@ -0,0 +1,187 @@ +# set global seed +import random +import numpy as np +import torch +seed = SEED = 20 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + + +try: # relative import + from model import Model +except ImportError: + from .model import Model + +# import +import torch.nn as nn +from torch import optim +from torch.optim import lr_scheduler +from torch.utils.data import DataLoader +import torchvision.transforms as transforms +from torchvision.datasets import ImageFolder as Dataset +from tqdm.auto import tqdm +import os +import warnings +warnings.filterwarnings("ignore", category=UserWarning) + +# load additional config +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) + + + + +# config +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +config = { + "dataset_root": "from_additional_config", + "batch_size": 256 if __name__ == "__main__" else 200, + "num_workers": 16, + "learning_rate": 1e-6, + "weight_decay": 0.1, + "epochs": 0, + "save_learning_rate": 1e-6, + "total_save_number": 50, + "tag": os.path.basename(os.path.dirname(__file__)), +} +config.update(additional_config) + + + + +# Data +dataset = Dataset( + root=config["imagenet_root"]["train"], + transform=transforms.Compose([ + transforms.Resize(224), + transforms.RandomCrop(224), + transforms.RandomHorizontalFlip(), + transforms.RandAugment(), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ]) +) +train_loader = DataLoader( + dataset=dataset, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + pin_memory=True, + persistent_workers=True, +) +test_loader = DataLoader( + dataset=Dataset( + root=config["imagenet_root"]["test"], + transform=transforms.Compose([ + transforms.Resize(224), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ])), + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=False, + pin_memory=True, + persistent_workers=True, + pin_memory_device="cuda", +) + +# Model +model, head = Model() +model = model.to(device) +criterion = nn.CrossEntropyLoss() +optimizer = optim.AdamW( + model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = lr_scheduler.CosineAnnealingLR( + optimizer, + T_max=config["epochs"], + eta_min=config["save_learning_rate"], +) + + + + +# Training +def train(model=model, optimizer=optimizer, scheduler=scheduler): + model.train() + for batch_idx, (inputs, targets) in tqdm(enumerate(train_loader), + total=len(dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + if scheduler is not None: + scheduler.step() + +# 
test +@torch.no_grad() +def test(model=model): + model.eval() + all_targets = [] + all_predicts = [] + test_loss = 0 + correct = 0 + total = 0 + for batch_idx, (inputs, targets) in tqdm(enumerate(test_loader), + total=len(test_loader.dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + # to logging losses + all_targets.extend(targets.flatten().tolist()) + test_loss += loss.item() + _, predicts = outputs.max(1) + all_predicts.extend(predicts.flatten().tolist()) + total += targets.size(0) + correct += predicts.eq(targets).sum().item() + loss = test_loss / (batch_idx + 1) + acc = correct / total + print(f"Loss: {loss:.4f} | Acc: {acc:.4f}\n") + model.train() + return loss, acc, all_targets, all_predicts + +# save train +def save_train(model=model, optimizer=optimizer): + model.train() + for batch_idx, (inputs, targets) in enumerate(train_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + # Save checkpoint + if batch_idx % (len(dataset) // train_loader.batch_size // config["total_save_number"]) == 0: + _, acc, _, _ = test(model=model) + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + save_state = {key: value.cpu().to(torch.float32) for key, value in model.state_dict().items()} + torch.save(save_state, f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + print("save:", f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + + + + +# main +if __name__ == '__main__': + test(model=model) + for epoch in range(config["epochs"]): + train(model=model, optimizer=optimizer, scheduler=scheduler) + test(model=model) + save_train(model=model, optimizer=optimizer) \ No newline at end of file diff --git a/dataset/imagenet_convnextlarge/model.py b/dataset/imagenet_convnextlarge/model.py new file mode 100644 index 0000000000000000000000000000000000000000..5e7800fce1e3c9faae811eef7f673837fc4d64b0 --- /dev/null +++ b/dataset/imagenet_convnextlarge/model.py @@ -0,0 +1,16 @@ +import torch.nn as nn +import timm + + +def Model(): + model = timm.create_model("convnext_large", pretrained=True) + return model, model.head + + +if __name__ == "__main__": + model, _ = Model() + print(model) + num_param = 0 + for v in model.parameters(): + num_param += v.numel() + print("num_param:", num_param) diff --git a/dataset/imagenet_convnextlarge/test.py b/dataset/imagenet_convnextlarge/test.py new file mode 100644 index 0000000000000000000000000000000000000000..4dd304532ca5ea17bf916c2a3a7ec15a03051a82 --- /dev/null +++ b/dataset/imagenet_convnextlarge/test.py @@ -0,0 +1,28 @@ +import os +import sys +if __name__ == "__main__": + from train import * +else: # relative import + from .train import * + + + + +try: + test_item = sys.argv[1] +except IndexError: + assert __name__ == "__main__" + test_item = "./checkpoint" +test_items = [] +if os.path.isdir(test_item): + for item in os.listdir(test_item): + item = os.path.join(test_item, item) + test_items.append(item) +elif os.path.isfile(test_item): + test_items.append(test_item) + + +for item in test_items: + state = torch.load(item, map_location="cpu") + model.load_state_dict({key: value.to(torch.float32).to(device) for 
key, value in state.items()}) + loss, acc, all_targets, all_predicts = test(model=model) \ No newline at end of file diff --git a/dataset/imagenet_convnextlarge/train.py b/dataset/imagenet_convnextlarge/train.py new file mode 100644 index 0000000000000000000000000000000000000000..db34ecd50bbaac86cb98e2cbe24d3592189f587f --- /dev/null +++ b/dataset/imagenet_convnextlarge/train.py @@ -0,0 +1,187 @@ +# set global seed +import random +import numpy as np +import torch +seed = SEED = 20 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + + +try: # relative import + from model import Model +except ImportError: + from .model import Model + +# import +import torch.nn as nn +from torch import optim +from torch.optim import lr_scheduler +from torch.utils.data import DataLoader +import torchvision.transforms as transforms +from torchvision.datasets import ImageFolder as Dataset +from tqdm.auto import tqdm +import os +import warnings +warnings.filterwarnings("ignore", category=UserWarning) + +# load additional config +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) + + + + +# config +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +config = { + "dataset_root": "from_additional_config", + "batch_size": 256 if __name__ == "__main__" else 100, + "num_workers": 16, + "learning_rate": 3e-5, + "weight_decay": 0.1, + "epochs": 1, + "save_learning_rate": 3e-5, + "total_save_number": 50, + "tag": os.path.basename(os.path.dirname(__file__)), +} +config.update(additional_config) + + + + +# Data +dataset = Dataset( + root=config["imagenet_root"]["train"], + transform=transforms.Compose([ + transforms.Resize(224), + transforms.RandomCrop(224), + transforms.RandomHorizontalFlip(), + transforms.RandAugment(), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ]) +) +train_loader = DataLoader( + dataset=dataset, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + pin_memory=True, + persistent_workers=True, +) +test_loader = DataLoader( + dataset=Dataset( + root=config["imagenet_root"]["test"], + transform=transforms.Compose([ + transforms.Resize(224), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ])), + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=False, + pin_memory=True, + persistent_workers=True, + pin_memory_device="cuda", +) + +# Model +model, head = Model() +model = model.to(device) +criterion = nn.CrossEntropyLoss() +optimizer = optim.AdamW( + model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = lr_scheduler.CosineAnnealingLR( + optimizer, + T_max=config["epochs"], + eta_min=config["save_learning_rate"], +) + + + + +# Training +def train(model=model, optimizer=optimizer, scheduler=scheduler): + model.train() + for batch_idx, (inputs, targets) in tqdm(enumerate(train_loader), + total=len(dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + if 
scheduler is not None: + scheduler.step() + +# test +@torch.no_grad() +def test(model=model): + model.eval() + all_targets = [] + all_predicts = [] + test_loss = 0 + correct = 0 + total = 0 + for batch_idx, (inputs, targets) in tqdm(enumerate(test_loader), + total=len(test_loader.dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + # to logging losses + all_targets.extend(targets.flatten().tolist()) + test_loss += loss.item() + _, predicts = outputs.max(1) + all_predicts.extend(predicts.flatten().tolist()) + total += targets.size(0) + correct += predicts.eq(targets).sum().item() + loss = test_loss / (batch_idx + 1) + acc = correct / total + print(f"Loss: {loss:.4f} | Acc: {acc:.4f}\n") + model.train() + return loss, acc, all_targets, all_predicts + +# save train +def save_train(model=model, optimizer=optimizer): + model.train() + for batch_idx, (inputs, targets) in enumerate(train_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + # Save checkpoint + if batch_idx % (len(dataset) // train_loader.batch_size // config["total_save_number"]) == 0: + _, acc, _, _ = test(model=model) + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + save_state = {key: value.cpu().to(torch.float32) for key, value in model.state_dict().items()} + torch.save(save_state, f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + print("save:", f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + + + + +# main +if __name__ == '__main__': + test(model=model) + for epoch in range(config["epochs"]): + train(model=model, optimizer=optimizer, scheduler=scheduler) + test(model=model) + save_train(model=model, optimizer=optimizer) \ No newline at end of file diff --git a/dataset/imagenet_resnet18/model.py b/dataset/imagenet_resnet18/model.py new file mode 100644 index 0000000000000000000000000000000000000000..a7c19ade0485fb11998f364765805f7baa9b4adb --- /dev/null +++ b/dataset/imagenet_resnet18/model.py @@ -0,0 +1,16 @@ +import torch.nn as nn +import timm + + +def Model(): + model = timm.create_model("resnet18", pretrained=True) + return model, model.fc + + +if __name__ == "__main__": + model, _ = Model() + print(model) + num_param = 0 + for v in model.parameters(): + num_param += v.numel() + print("num_param:", num_param) diff --git a/dataset/imagenet_resnet18/test.py b/dataset/imagenet_resnet18/test.py new file mode 100644 index 0000000000000000000000000000000000000000..4dd304532ca5ea17bf916c2a3a7ec15a03051a82 --- /dev/null +++ b/dataset/imagenet_resnet18/test.py @@ -0,0 +1,28 @@ +import os +import sys +if __name__ == "__main__": + from train import * +else: # relative import + from .train import * + + + + +try: + test_item = sys.argv[1] +except IndexError: + assert __name__ == "__main__" + test_item = "./checkpoint" +test_items = [] +if os.path.isdir(test_item): + for item in os.listdir(test_item): + item = os.path.join(test_item, item) + test_items.append(item) +elif os.path.isfile(test_item): + test_items.append(test_item) + + +for item in test_items: + state = torch.load(item, map_location="cpu") + model.load_state_dict({key: 
value.to(torch.float32).to(device) for key, value in state.items()}) + loss, acc, all_targets, all_predicts = test(model=model) \ No newline at end of file diff --git a/dataset/imagenet_resnet18/train.py b/dataset/imagenet_resnet18/train.py new file mode 100644 index 0000000000000000000000000000000000000000..40055ba58746c8c7a9e07c2c742fb28791750be2 --- /dev/null +++ b/dataset/imagenet_resnet18/train.py @@ -0,0 +1,187 @@ +# set global seed +import random +import numpy as np +import torch +seed = SEED = 20 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + + +try: # relative import + from model import Model +except ImportError: + from .model import Model + +# import +import torch.nn as nn +from torch import optim +from torch.optim import lr_scheduler +from torch.utils.data import DataLoader +import torchvision.transforms as transforms +from torchvision.datasets import ImageFolder as Dataset +from tqdm.auto import tqdm +import os +import warnings +warnings.filterwarnings("ignore", category=UserWarning) + +# load additional config +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) + + + + +# config +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +config = { + "dataset_root": "from_additional_config", + "batch_size": 500 if __name__ == "__main__" else 200, + "num_workers": 16, + "learning_rate": 1e-5, + "weight_decay": 0.1, + "epochs": 0, + "save_learning_rate": 1e-5, + "total_save_number": 50, + "tag": os.path.basename(os.path.dirname(__file__)), +} +config.update(additional_config) + + + + +# Data +dataset = Dataset( + root=config["imagenet_root"]["train"], + transform=transforms.Compose([ + transforms.Resize(224), + transforms.RandomCrop(224), + transforms.RandomHorizontalFlip(), + transforms.RandAugment(), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ]) +) +train_loader = DataLoader( + dataset=dataset, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + pin_memory=True, + persistent_workers=True, +) +test_loader = DataLoader( + dataset=Dataset( + root=config["imagenet_root"]["test"], + transform=transforms.Compose([ + transforms.Resize(224), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ])), + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=False, + pin_memory=True, + persistent_workers=True, + pin_memory_device="cuda", +) + +# Model +model, head = Model() +model = model.to(device) +criterion = nn.CrossEntropyLoss() +optimizer = optim.AdamW( + model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = lr_scheduler.CosineAnnealingLR( + optimizer, + T_max=config["epochs"], + eta_min=config["save_learning_rate"], +) + + + + +# Training +def train(model=model, optimizer=optimizer, scheduler=scheduler): + model.train() + for batch_idx, (inputs, targets) in tqdm(enumerate(train_loader), + total=len(dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, 
targets) + loss.backward() + optimizer.step() + if scheduler is not None: + scheduler.step() + +# test +@torch.no_grad() +def test(model=model): + model.eval() + all_targets = [] + all_predicts = [] + test_loss = 0 + correct = 0 + total = 0 + for batch_idx, (inputs, targets) in tqdm(enumerate(test_loader), + total=len(test_loader.dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + # to logging losses + all_targets.extend(targets.flatten().tolist()) + test_loss += loss.item() + _, predicts = outputs.max(1) + all_predicts.extend(predicts.flatten().tolist()) + total += targets.size(0) + correct += predicts.eq(targets).sum().item() + loss = test_loss / (batch_idx + 1) + acc = correct / total + print(f"Loss: {loss:.4f} | Acc: {acc:.4f}\n") + model.train() + return loss, acc, all_targets, all_predicts + +# save train +def save_train(model=model, optimizer=optimizer): + model.train() + for batch_idx, (inputs, targets) in enumerate(train_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + # Save checkpoint + if batch_idx % (len(dataset) // train_loader.batch_size // config["total_save_number"]) == 0: + _, acc, _, _ = test(model=model) + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + save_state = {key: value.cpu().to(torch.float32) for key, value in model.state_dict().items()} + torch.save(save_state, f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + print("save:", f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + + + + +# main +if __name__ == '__main__': + test(model=model) + for epoch in range(config["epochs"]): + train(model=model, optimizer=optimizer, scheduler=scheduler) + test(model=model) + save_train(model=model, optimizer=optimizer) \ No newline at end of file diff --git a/dataset/imagenet_resnet50/model.py b/dataset/imagenet_resnet50/model.py new file mode 100644 index 0000000000000000000000000000000000000000..06e5ab1d3f0b9ac59d881926979760769d33f80b --- /dev/null +++ b/dataset/imagenet_resnet50/model.py @@ -0,0 +1,16 @@ +import torch.nn as nn +import timm + + +def Model(): + model = timm.create_model("resnet50_gn", pretrained=True) + return model, model.fc + + +if __name__ == "__main__": + model, _ = Model() + print(model) + num_param = 0 + for v in model.parameters(): + num_param += v.numel() + print("num_param:", num_param) diff --git a/dataset/imagenet_resnet50/test.py b/dataset/imagenet_resnet50/test.py new file mode 100644 index 0000000000000000000000000000000000000000..4dd304532ca5ea17bf916c2a3a7ec15a03051a82 --- /dev/null +++ b/dataset/imagenet_resnet50/test.py @@ -0,0 +1,28 @@ +import os +import sys +if __name__ == "__main__": + from train import * +else: # relative import + from .train import * + + + + +try: + test_item = sys.argv[1] +except IndexError: + assert __name__ == "__main__" + test_item = "./checkpoint" +test_items = [] +if os.path.isdir(test_item): + for item in os.listdir(test_item): + item = os.path.join(test_item, item) + test_items.append(item) +elif os.path.isfile(test_item): + test_items.append(test_item) + + +for item in test_items: + state = torch.load(item, map_location="cpu") + 
model.load_state_dict({key: value.to(torch.float32).to(device) for key, value in state.items()}) + loss, acc, all_targets, all_predicts = test(model=model) \ No newline at end of file diff --git a/dataset/imagenet_resnet50/train.py b/dataset/imagenet_resnet50/train.py new file mode 100644 index 0000000000000000000000000000000000000000..a8a6e6e283adc05dd9876d82b1e40fa4d9ff9690 --- /dev/null +++ b/dataset/imagenet_resnet50/train.py @@ -0,0 +1,187 @@ +# set global seed +import random +import numpy as np +import torch +seed = SEED = 21 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + + +try: # relative import + from model import Model +except ImportError: + from .model import Model + +# import +import torch.nn as nn +from torch import optim +from torch.optim import lr_scheduler +from torch.utils.data import DataLoader +import torchvision.transforms as transforms +from torchvision.datasets import ImageFolder as Dataset +from tqdm.auto import tqdm +import os +import warnings +warnings.filterwarnings("ignore", category=UserWarning) + +# load additional config +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) + + + + +# config +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +config = { + "dataset_root": "from_additional_config", + "batch_size": 200 if __name__ == "__main__" else 200, + "num_workers": 4, + "learning_rate": 1e-6, + "weight_decay": 0.1, + "epochs": 0, + "save_learning_rate": 1e-6, + "total_save_number": 50, + "tag": os.path.basename(os.path.dirname(__file__)), +} +config.update(additional_config) + + + + +# Data +dataset = Dataset( + root=config["imagenet_root"]["train"], + transform=transforms.Compose([ + transforms.Resize(224), + transforms.RandomCrop(224), + transforms.RandomHorizontalFlip(), + transforms.RandAugment(), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ]) +) +train_loader = DataLoader( + dataset=dataset, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + pin_memory=True, + persistent_workers=True, +) +test_loader = DataLoader( + dataset=Dataset( + root=config["imagenet_root"]["test"], + transform=transforms.Compose([ + transforms.Resize(224), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ])), + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=False, + pin_memory=True, + persistent_workers=True, + pin_memory_device="cuda", +) + +# Model +model, head = Model() +model = model.to(device) +criterion = nn.CrossEntropyLoss() +optimizer = optim.AdamW( + model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = lr_scheduler.CosineAnnealingLR( + optimizer, + T_max=config["epochs"], + eta_min=config["save_learning_rate"], +) + + + + +# Training +def train(model=model, optimizer=optimizer, scheduler=scheduler): + model.train() + for batch_idx, (inputs, targets) in tqdm(enumerate(train_loader), + total=len(dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + 
loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + if scheduler is not None: + scheduler.step() + +# test +@torch.no_grad() +def test(model=model): + model.eval() + all_targets = [] + all_predicts = [] + test_loss = 0 + correct = 0 + total = 0 + for batch_idx, (inputs, targets) in tqdm(enumerate(test_loader), + total=len(test_loader.dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + # to logging losses + all_targets.extend(targets.flatten().tolist()) + test_loss += loss.item() + _, predicts = outputs.max(1) + all_predicts.extend(predicts.flatten().tolist()) + total += targets.size(0) + correct += predicts.eq(targets).sum().item() + loss = test_loss / (batch_idx + 1) + acc = correct / total + print(f"Loss: {loss:.4f} | Acc: {acc:.4f}\n") + model.train() + return loss, acc, all_targets, all_predicts + +# save train +def save_train(model=model, optimizer=optimizer): + model.train() + for batch_idx, (inputs, targets) in enumerate(train_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + # Save checkpoint + if batch_idx % (len(dataset) // train_loader.batch_size // config["total_save_number"]) == 0: + _, acc, _, _ = test(model=model) + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + save_state = {key: value.cpu().to(torch.float32) for key, value in model.state_dict().items()} + torch.save(save_state, f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + print("save:", f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + + + + +# main +if __name__ == '__main__': + test(model=model) + for epoch in range(config["epochs"]): + train(model=model, optimizer=optimizer, scheduler=scheduler) + test(model=model) + save_train(model=model, optimizer=optimizer) \ No newline at end of file diff --git a/dataset/imagenet_vitbase/model.py b/dataset/imagenet_vitbase/model.py new file mode 100644 index 0000000000000000000000000000000000000000..ebdc36fd2cb90b352fdf61a828e718872bc27d77 --- /dev/null +++ b/dataset/imagenet_vitbase/model.py @@ -0,0 +1,16 @@ +import torch.nn as nn +import timm + + +def Model(): + model = timm.create_model("vit_base_patch16_224", pretrained=True) + return model, model.head + + +if __name__ == "__main__": + model, _ = Model() + print(model) + num_param = 0 + for v in model.parameters(): + num_param += v.numel() + print("num_param:", num_param) diff --git a/dataset/imagenet_vitbase/test.py b/dataset/imagenet_vitbase/test.py new file mode 100644 index 0000000000000000000000000000000000000000..4dd304532ca5ea17bf916c2a3a7ec15a03051a82 --- /dev/null +++ b/dataset/imagenet_vitbase/test.py @@ -0,0 +1,28 @@ +import os +import sys +if __name__ == "__main__": + from train import * +else: # relative import + from .train import * + + + + +try: + test_item = sys.argv[1] +except IndexError: + assert __name__ == "__main__" + test_item = "./checkpoint" +test_items = [] +if os.path.isdir(test_item): + for item in os.listdir(test_item): + item = os.path.join(test_item, item) + test_items.append(item) +elif os.path.isfile(test_item): + test_items.append(test_item) + + +for item in test_items: + state = 
torch.load(item, map_location="cpu") + model.load_state_dict({key: value.to(torch.float32).to(device) for key, value in state.items()}) + loss, acc, all_targets, all_predicts = test(model=model) \ No newline at end of file diff --git a/dataset/imagenet_vitbase/train.py b/dataset/imagenet_vitbase/train.py new file mode 100644 index 0000000000000000000000000000000000000000..59c6cc6b09b67e86dc75362c4fb0c802e81d60e8 --- /dev/null +++ b/dataset/imagenet_vitbase/train.py @@ -0,0 +1,189 @@ +# set global seed +import random +import numpy as np +import torch +seed = SEED = 20 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + + +try: # relative import + from model import Model +except ImportError: + from .model import Model + +# import +import torch.nn as nn +from torch import optim +from torch.optim import lr_scheduler +from torch.utils.data import DataLoader +import torchvision.transforms as transforms +from torchvision.datasets import ImageFolder as Dataset +from tqdm.auto import tqdm +import os +import warnings +warnings.filterwarnings("ignore", category=UserWarning) + +# load additional config +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) + + + + +# config +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +config = { + "dataset_root": "from_additional_config", + "batch_size": 100 if __name__ == "__main__" else 200, + "num_workers": 8, + "learning_rate": 3e-6, + "weight_decay": 0.1, + "epochs": 0, + "save_learning_rate": 3e-6, + "total_save_number": 50, + "tag": os.path.basename(os.path.dirname(__file__)), +} +config.update(additional_config) + + + + +# Data +dataset = Dataset( + root=config["imagenet_root"]["train"], + transform=transforms.Compose([ + transforms.Resize(224), + transforms.RandomCrop(224), + transforms.RandomHorizontalFlip(), + transforms.RandAugment(), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ]) +) +train_loader = DataLoader( + dataset=dataset, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + pin_memory=True, + persistent_workers=True, +) +test_loader = DataLoader( + dataset=Dataset( + root=config["imagenet_root"]["test"], + transform=transforms.Compose([ + transforms.Resize(224), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ])), + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=False, + pin_memory=True, + persistent_workers=True, + pin_memory_device="cuda", +) + +# Model +model, head = Model() +model = model.to(device) +criterion = nn.CrossEntropyLoss() +optimizer = optim.AdamW( + model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = lr_scheduler.CosineAnnealingLR( + optimizer, + T_max=config["epochs"], + eta_min=config["save_learning_rate"], +) + + + + +# Training +def train(model=model, optimizer=optimizer, scheduler=scheduler): + model.train() + for batch_idx, (inputs, targets) in tqdm(enumerate(train_loader), + total=len(dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = 
model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + if scheduler is not None: + scheduler.step() + +# test +@torch.no_grad() +def test(model=model): + model.eval() + all_targets = [] + all_predicts = [] + test_loss = 0 + correct = 0 + total = 0 + for batch_idx, (inputs, targets) in tqdm(enumerate(test_loader), + total=len(test_loader.dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + # to logging losses + all_targets.extend(targets.flatten().tolist()) + test_loss += loss.item() + _, predicts = outputs.max(1) + all_predicts.extend(predicts.flatten().tolist()) + total += targets.size(0) + correct += predicts.eq(targets).sum().item() + # if batch_idx >= 50: + # break + loss = test_loss / (batch_idx + 1) + acc = correct / total + print(f"Loss: {loss:.4f} | Acc: {acc:.4f}\n") + model.train() + return loss, acc, all_targets, all_predicts + +# save train +def save_train(model=model, optimizer=optimizer): + model.train() + for batch_idx, (inputs, targets) in enumerate(train_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + # Save checkpoint + if batch_idx % (len(dataset) // train_loader.batch_size // config["total_save_number"]) == 0: + _, acc, _, _ = test(model=model) + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + save_state = {key: value.cpu().to(torch.float32) for key, value in model.state_dict().items()} + torch.save(save_state, f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + print("save:", f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + + + + +# main +if __name__ == '__main__': + test(model=model) + for epoch in range(config["epochs"]): + train(model=model, optimizer=optimizer, scheduler=scheduler) + test(model=model) + save_train(model=model, optimizer=optimizer) \ No newline at end of file diff --git a/dataset/imagenet_vitsmall/model.py b/dataset/imagenet_vitsmall/model.py new file mode 100644 index 0000000000000000000000000000000000000000..41d14b86494dd93af8b368dfb9f6e0d91de1d226 --- /dev/null +++ b/dataset/imagenet_vitsmall/model.py @@ -0,0 +1,16 @@ +import torch.nn as nn +import timm + + +def Model(): + model = timm.create_model("vit_small_patch16_224", pretrained=True) + return model, model.head + + +if __name__ == "__main__": + model, _ = Model() + print(model) + num_param = 0 + for v in model.parameters(): + num_param += v.numel() + print("num_param:", num_param) diff --git a/dataset/imagenet_vitsmall/test.py b/dataset/imagenet_vitsmall/test.py new file mode 100644 index 0000000000000000000000000000000000000000..4dd304532ca5ea17bf916c2a3a7ec15a03051a82 --- /dev/null +++ b/dataset/imagenet_vitsmall/test.py @@ -0,0 +1,28 @@ +import os +import sys +if __name__ == "__main__": + from train import * +else: # relative import + from .train import * + + + + +try: + test_item = sys.argv[1] +except IndexError: + assert __name__ == "__main__" + test_item = "./checkpoint" +test_items = [] +if os.path.isdir(test_item): + for item in os.listdir(test_item): + item = os.path.join(test_item, item) + test_items.append(item) +elif os.path.isfile(test_item): + 
test_items.append(test_item) + + +for item in test_items: + state = torch.load(item, map_location="cpu") + model.load_state_dict({key: value.to(torch.float32).to(device) for key, value in state.items()}) + loss, acc, all_targets, all_predicts = test(model=model) \ No newline at end of file diff --git a/dataset/imagenet_vitsmall/train.py b/dataset/imagenet_vitsmall/train.py new file mode 100644 index 0000000000000000000000000000000000000000..022395e8149c845e75914b8f5a1fd21f34823fb5 --- /dev/null +++ b/dataset/imagenet_vitsmall/train.py @@ -0,0 +1,187 @@ +# set global seed +import random +import numpy as np +import torch +seed = SEED = 20 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + + +try: # relative import + from model import Model +except ImportError: + from .model import Model + +# import +import torch.nn as nn +from torch import optim +from torch.optim import lr_scheduler +from torch.utils.data import DataLoader +import torchvision.transforms as transforms +from torchvision.datasets import ImageFolder as Dataset +from tqdm.auto import tqdm +import os +import warnings +warnings.filterwarnings("ignore", category=UserWarning) + +# load additional config +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) + + + + +# config +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +config = { + "dataset_root": "from_additional_config", + "batch_size": 200 if __name__ == "__main__" else 200, + "num_workers": 16, + "learning_rate": 1e-6, + "weight_decay": 0.1, + "epochs": 0, + "save_learning_rate": 1e-6, + "total_save_number": 50, + "tag": os.path.basename(os.path.dirname(__file__)), +} +config.update(additional_config) + + + + +# Data +dataset = Dataset( + root=config["imagenet_root"]["train"], + transform=transforms.Compose([ + transforms.Resize(224), + transforms.RandomCrop(224), + transforms.RandomHorizontalFlip(), + transforms.RandAugment(), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ]) +) +train_loader = DataLoader( + dataset=dataset, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + pin_memory=True, + persistent_workers=True, +) +test_loader = DataLoader( + dataset=Dataset( + root=config["imagenet_root"]["test"], + transform=transforms.Compose([ + transforms.Resize(224), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ])), + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=False, + pin_memory=True, + persistent_workers=True, + pin_memory_device="cuda", +) + +# Model +model, head = Model() +model = model.to(device) +criterion = nn.CrossEntropyLoss() +optimizer = optim.AdamW( + model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = lr_scheduler.CosineAnnealingLR( + optimizer, + T_max=config["epochs"], + eta_min=config["save_learning_rate"], +) + + + + +# Training +def train(model=model, optimizer=optimizer, scheduler=scheduler): + model.train() + for batch_idx, (inputs, targets) in tqdm(enumerate(train_loader), + total=len(dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with 
torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + if scheduler is not None: + scheduler.step() + +# test +@torch.no_grad() +def test(model=model): + model.eval() + all_targets = [] + all_predicts = [] + test_loss = 0 + correct = 0 + total = 0 + for batch_idx, (inputs, targets) in tqdm(enumerate(test_loader), + total=len(test_loader.dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + # to logging losses + all_targets.extend(targets.flatten().tolist()) + test_loss += loss.item() + _, predicts = outputs.max(1) + all_predicts.extend(predicts.flatten().tolist()) + total += targets.size(0) + correct += predicts.eq(targets).sum().item() + loss = test_loss / (batch_idx + 1) + acc = correct / total + print(f"Loss: {loss:.4f} | Acc: {acc:.4f}\n") + model.train() + return loss, acc, all_targets, all_predicts + +# save train +def save_train(model=model, optimizer=optimizer): + model.train() + for batch_idx, (inputs, targets) in enumerate(train_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + # Save checkpoint + if batch_idx % (len(dataset) // train_loader.batch_size // config["total_save_number"]) == 0: + _, acc, _, _ = test(model=model) + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + save_state = {key: value.cpu().to(torch.float32) for key, value in model.state_dict().items()} + torch.save(save_state, f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + print("save:", f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + + + + +# main +if __name__ == '__main__': + test(model=model) + for epoch in range(config["epochs"]): + train(model=model, optimizer=optimizer, scheduler=scheduler) + test(model=model) + save_train(model=model, optimizer=optimizer) \ No newline at end of file diff --git a/dataset/imagenet_vittiny/model.py b/dataset/imagenet_vittiny/model.py new file mode 100644 index 0000000000000000000000000000000000000000..953285b6a81ac2443acf08199bf85dfa8c332a17 --- /dev/null +++ b/dataset/imagenet_vittiny/model.py @@ -0,0 +1,16 @@ +import torch.nn as nn +import timm + + +def Model(): + model = timm.create_model("vit_tiny_patch16_224", pretrained=True) + return model, model.head + + +if __name__ == "__main__": + model, _ = Model() + print(model) + num_param = 0 + for v in model.parameters(): + num_param += v.numel() + print("num_param:", num_param) diff --git a/dataset/imagenet_vittiny/test.py b/dataset/imagenet_vittiny/test.py new file mode 100644 index 0000000000000000000000000000000000000000..4dd304532ca5ea17bf916c2a3a7ec15a03051a82 --- /dev/null +++ b/dataset/imagenet_vittiny/test.py @@ -0,0 +1,28 @@ +import os +import sys +if __name__ == "__main__": + from train import * +else: # relative import + from .train import * + + + + +try: + test_item = sys.argv[1] +except IndexError: + assert __name__ == "__main__" + test_item = "./checkpoint" +test_items = [] +if os.path.isdir(test_item): + for item in os.listdir(test_item): + item = os.path.join(test_item, item) + test_items.append(item) +elif 
os.path.isfile(test_item): + test_items.append(test_item) + + +for item in test_items: + state = torch.load(item, map_location="cpu") + model.load_state_dict({key: value.to(torch.float32).to(device) for key, value in state.items()}) + loss, acc, all_targets, all_predicts = test(model=model) \ No newline at end of file diff --git a/dataset/imagenet_vittiny/train.py b/dataset/imagenet_vittiny/train.py new file mode 100644 index 0000000000000000000000000000000000000000..5243c6bcee2c585d4de83d5cbd8621c4d379bba7 --- /dev/null +++ b/dataset/imagenet_vittiny/train.py @@ -0,0 +1,187 @@ +# set global seed +import random +import numpy as np +import torch +seed = SEED = 20 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + + +try: # relative import + from model import Model +except ImportError: + from .model import Model + +# import +import torch.nn as nn +from torch import optim +from torch.optim import lr_scheduler +from torch.utils.data import DataLoader +import torchvision.transforms as transforms +from torchvision.datasets import ImageFolder as Dataset +from tqdm.auto import tqdm +import os +import warnings +warnings.filterwarnings("ignore", category=UserWarning) + +# load additional config +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json") +with open(config_file, "r") as f: + additional_config = json.load(f) + + + + +# config +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +config = { + "dataset_root": "from_additional_config", + "batch_size": 50 if __name__ == "__main__" else 200, + "num_workers": 16, + "learning_rate": 1e-5, + "weight_decay": 0.1, + "epochs": 0, + "save_learning_rate": 1e-5, + "total_save_number": 50, + "tag": os.path.basename(os.path.dirname(__file__)), +} +config.update(additional_config) + + + + +# Data +dataset = Dataset( + root=config["imagenet_root"]["train"], + transform=transforms.Compose([ + transforms.Resize(224), + transforms.RandomCrop(224), + transforms.RandomHorizontalFlip(), + transforms.RandAugment(), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ]) +) +train_loader = DataLoader( + dataset=dataset, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=True, + drop_last=True, + pin_memory=True, + persistent_workers=True, +) +test_loader = DataLoader( + dataset=Dataset( + root=config["imagenet_root"]["test"], + transform=transforms.Compose([ + transforms.Resize(224), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ])), + batch_size=config["batch_size"], + num_workers=config["num_workers"], + shuffle=False, + pin_memory=True, + persistent_workers=True, + pin_memory_device="cuda", +) + +# Model +model, head = Model() +model = model.to(device) +criterion = nn.CrossEntropyLoss() +optimizer = optim.AdamW( + model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = lr_scheduler.CosineAnnealingLR( + optimizer, + T_max=config["epochs"], + eta_min=config["save_learning_rate"], +) + + + + +# Training +def train(model=model, optimizer=optimizer, scheduler=scheduler): + model.train() + for batch_idx, (inputs, targets) in tqdm(enumerate(train_loader), + total=len(dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + 
optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + if scheduler is not None: + scheduler.step() + +# test +@torch.no_grad() +def test(model=model): + model.eval() + all_targets = [] + all_predicts = [] + test_loss = 0 + correct = 0 + total = 0 + for batch_idx, (inputs, targets) in tqdm(enumerate(test_loader), + total=len(test_loader.dataset) // config["batch_size"]): + inputs, targets = inputs.to(device), targets.to(device) + with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + # to logging losses + all_targets.extend(targets.flatten().tolist()) + test_loss += loss.item() + _, predicts = outputs.max(1) + all_predicts.extend(predicts.flatten().tolist()) + total += targets.size(0) + correct += predicts.eq(targets).sum().item() + loss = test_loss / (batch_idx + 1) + acc = correct / total + print(f"Loss: {loss:.4f} | Acc: {acc:.4f}\n") + model.train() + return loss, acc, all_targets, all_predicts + +# save train +def save_train(model=model, optimizer=optimizer): + model.train() + for batch_idx, (inputs, targets) in enumerate(train_loader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + # Save checkpoint + if batch_idx % (len(dataset) // train_loader.batch_size // config["total_save_number"]) == 0: + _, acc, _, _ = test(model=model) + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + save_state = {key: value.cpu().to(torch.float32) for key, value in model.state_dict().items()} + torch.save(save_state, f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + print("save:", f"checkpoint/{str(batch_idx).zfill(4)}_acc{acc:.4f}_seed{seed:04d}_{config['tag']}.pth") + + + + +# main +if __name__ == '__main__': + test(model=model) + for epoch in range(config["epochs"]): + train(model=model, optimizer=optimizer, scheduler=scheduler) + test(model=model) + save_train(model=model, optimizer=optimizer) \ No newline at end of file diff --git a/dataset/register.py b/dataset/register.py new file mode 100644 index 0000000000000000000000000000000000000000..138371989cc39caa8653a14447ee9e83e1a68345 --- /dev/null +++ b/dataset/register.py @@ -0,0 +1,173 @@ +import os +import torch +from .dataset import BaseDataset, ConditionalDataset +import json +config_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), "workspace/config.json") +with open(config_file, "r") as f: + running_config = json.load(f) +test_gpu_ids = running_config["test_gpu_ids"] + + + + +class ImageNet_ResNet18(BaseDataset): + data_path = "./dataset/imagenet_resnet18/checkpoint" + generated_path = "./dataset/imagenet_resnet18/generated/generated_model.pth" + test_command = f"CUDA_VISIBLE_DEVICES={test_gpu_ids} python ./dataset/imagenet_resnet18/test.py " + \ + "./dataset/imagenet_resnet18/generated/generated_model.pth" + +class ImageNet_ResNet50(BaseDataset): + data_path = "./dataset/imagenet_resnet50/checkpoint" + generated_path = "./dataset/imagenet_resnet50/generated/generated_model.pth" + test_command = f"CUDA_VISIBLE_DEVICES={test_gpu_ids} python ./dataset/imagenet_resnet50/test.py " + \ + "./dataset/imagenet_resnet50/generated/generated_model.pth" + +class 
ImageNet_ViTTiny(BaseDataset): + data_path = "./dataset/imagenet_vittiny/checkpoint" + generated_path = "./dataset/imagenet_vittiny/generated/generated_model.pth" + test_command = f"CUDA_VISIBLE_DEVICES={test_gpu_ids} python ./dataset/imagenet_vittiny/test.py " + \ + "./dataset/imagenet_vittiny/generated/generated_model.pth" + +class ImageNet_ViTSmall(BaseDataset): + data_path = "./dataset/imagenet_vitsmall/checkpoint" + generated_path = "./dataset/imagenet_vitsmall/generated/generated_model.pth" + test_command = f"CUDA_VISIBLE_DEVICES={test_gpu_ids} python ./dataset/imagenet_vitsmall/test.py " + \ + "./dataset/imagenet_vitsmall/generated/generated_model.pth" + +class ImageNet_ViTBase(BaseDataset): + data_path = "./dataset/imagenet_vitbase/checkpoint" + generated_path = "./dataset/imagenet_vitbase/generated/generated_model.pth" + test_command = f"CUDA_VISIBLE_DEVICES={test_gpu_ids} python ./dataset/imagenet_vitbase/test.py " + \ + "./dataset/imagenet_vitbase/generated/generated_model.pth" + +class ImageNet_ConvNextAtto(BaseDataset): + data_path = "./dataset/imagenet_convnextatto/checkpoint" + generated_path = "./dataset/imagenet_convnextatto/generated/generated_model.pth" + test_command = f"CUDA_VISIBLE_DEVICES={test_gpu_ids} python ./dataset/imagenet_convnextatto/test.py " + \ + "./dataset/imagenet_convnextatto/generated/generated_model.pth" + +class ImageNet_ConvNextLarge(BaseDataset): + data_path = "./dataset/imagenet_convnextlarge/checkpoint" + generated_path = "./dataset/imagenet_convnextlarge/generated/generated_model.pth" + test_command = f"CUDA_VISIBLE_DEVICES={test_gpu_ids} python ./dataset/imagenet_convnextlarge/test.py " + \ + "./dataset/imagenet_convnextlarge/generated/generated_model.pth" + +class CocoDetection(BaseDataset): + data_path = "./dataset/downtask_detection/checkpoint" + generated_path = "./dataset/downtask_detection/generated/generated_model.pth" + test_command = "echo \"Code for testing is coming soon!\n\"" + # test_command = "bash ./dataset/downtask_detection/test.sh " + \ + # "./dataset/downtask_detection/generated/generated_model.pth" + +class ADE20KSegmentation(BaseDataset): + data_path = "./dataset/downtask_segmentation/checkpoint" + generated_path = "./dataset/downtask_segmentation/generated/generated_model.pth" + test_command = "echo \"Code for testing is coming soon!\n\"" + # test_command = "bash ./dataset/downtask_segmentation/test.sh " + \ + # "./dataset/downtask_segmentation/generated/generated_model.pth" + +class DoRACommonSenseReasoningR4(BaseDataset): + data_path = "./dataset/downtask_dora_r4/checkpoint" + generated_path = "./dataset/downtask_dora_r4/generated/generated_model.pth" + test_command = f"CUDA_VISIBLE_DEVICES={test_gpu_ids} python ./dataset/downtask_dora_r4/test.py " + \ + "./dataset/downtask_dora_r4/generated/generated_model.pth" + +class DoRACommonSenseReasoningR16(BaseDataset): + data_path = "./dataset/downtask_dora_r16/checkpoint" + generated_path = "./dataset/downtask_dora_r16/generated/generated_model.pth" + test_command = f"CUDA_VISIBLE_DEVICES={test_gpu_ids} python ./dataset/downtask_dora_r16/test.py " + \ + "./dataset/downtask_dora_r16/generated/generated_model.pth" + +class DoRACommonSenseReasoningR64(BaseDataset): + data_path = "./dataset/downtask_dora_r64/checkpoint" + generated_path = "./dataset/downtask_dora_r64/generated/generated_model.pth" + test_command = f"CUDA_VISIBLE_DEVICES={test_gpu_ids} python ./dataset/downtask_dora_r64/test.py " + \ + "./dataset/downtask_dora_r64/generated/generated_model.pth" + +class 
Cifar10_ResNet18(BaseDataset): + data_path = "./dataset/cifar10_resnet18/checkpoint" + generated_path = "./dataset/cifar10_resnet18/generated/generated_model.pth" + test_command = f"CUDA_VISIBLE_DEVICES={test_gpu_ids} python ./dataset/cifar10_resnet18/test.py " + \ + "./dataset/cifar10_resnet18/generated/generated_model.pth" + +class Cifar10_MobileNetv3(BaseDataset): + data_path = "./dataset/cifar10_mobilenetv3/checkpoint" + generated_path = "./dataset/cifar10_mobilenetv3/generated/generated_model.pth" + test_command = f"CUDA_VISIBLE_DEVICES={test_gpu_ids} python ./dataset/cifar10_mobilenetv3/test.py " + \ + "./dataset/cifar10_mobilenetv3/generated/generated_model.pth" + +class Cifar10_ViTBase(BaseDataset): + data_path = "./dataset/cifar10_vitbase/checkpoint" + generated_path = "./dataset/cifar10_vitbase/generated/generated_model.pth" + test_command = f"CUDA_VISIBLE_DEVICES={test_gpu_ids} python ./dataset/cifar10_vitbase/test.py " + \ + "./dataset/cifar10_vitbase/generated/generated_model.pth" + +class Cifar10_CNNSmall(BaseDataset): + data_path = "./dataset/cifar10_cnnsmall/checkpoint" + generated_path = "./dataset/cifar10_cnnsmall/generated/generated_model.pth" + test_command = f"CUDA_VISIBLE_DEVICES={test_gpu_ids} python ./dataset/cifar10_cnnsmall/test.py " + \ + "./dataset/cifar10_cnnsmall/generated/generated_model.pth" + +class Cifar10_CNNMedium(BaseDataset): + data_path = "./dataset/cifar10_cnnmedium/checkpoint" + generated_path = "./dataset/cifar10_cnnmedium/generated/generated_model.pth" + test_command = f"CUDA_VISIBLE_DEVICES={test_gpu_ids} python ./dataset/cifar10_cnnmedium/test.py " + \ + "./dataset/cifar10_cnnmedium/generated/generated_model.pth" + +class Cifar100_ResNet18BN(BaseDataset): + data_path = "./dataset/cifar100_resnet18bn/checkpoint" + generated_path = "./dataset/cifar100_resnet18bn/generated/generated_model.pth" + test_command = f"CUDA_VISIBLE_DEVICES={test_gpu_ids} python ./dataset/cifar100_resnet18bn/test.py " + \ + "./dataset/cifar100_resnet18bn/generated/generated_model.pth" + + + + +class Permutation_ViTTiny(ConditionalDataset): + data_path = "./dataset/condition_permutation_vittiny/checkpoint" + generated_path = "./dataset/condition_permutation_vittiny/generated/generated_model.pth" + test_command = f"CUDA_VISIBLE_DEVICES={test_gpu_ids} python ./dataset/condition_permutation_vittiny/test.py " + \ + "./dataset/condition_permutation_vittiny/generated/generated_model.pth" + + def _extract_condition(self, index: int): + condition = super()._extract_condition(index)[2][5:] + return int(condition) + + + + +class ClassInput_ViTTiny(ConditionalDataset): + def _extract_condition(self, index: int): + condition = super()._extract_condition(index)[2][5:] + one_hot_string = bin(int(condition))[2:].zfill(10) + optimize_class = [index for index, i in enumerate(one_hot_string) if i == "1"] + indicator_tensor = torch.zeros(size=(10,)) + for i in optimize_class: + indicator_tensor[i] = 1.0 + return indicator_tensor + +class ClassInput_ViTTiny_Train(ClassInput_ViTTiny): + data_path = "./dataset/condition_classinput_vittiny/checkpoint_train" + generated_path = None + test_command = None + +class ClassInput_ViTTiny_Test(ClassInput_ViTTiny): + data_path = "./dataset/condition_classinput_vittiny/checkpoint_test" + generated_path = "./dataset/condition_classinput_vittiny/generated/generated_model_class{}.pth" + test_command = f"CUDA_VISIBLE_DEVICES={test_gpu_ids} python ./dataset/condition_classinput_vittiny/test.py " + \ + 
"./dataset/condition_classinput_vittiny/generated/generated_model_class{}.pth" + + + + + + +# #################################### user-defined dataset classes here #################################### +# +# class YourDatasetName(BaseDataset): +# data_path = "./dataset/your_dataset_name/checkpoint" +# generated_path = "./dataset/your_dataset_name/generated/generated_model.pth" +# test_command = f"CUDA_VISIBLE_DEVICES={test_gpu_ids} python ./dataset/your_dataset_name/test.py " + \ +# "./dataset/your_dataset_name/generated/generated_model.pth" +# +# #################################### user-defined dataset classes here #################################### \ No newline at end of file diff --git a/model/__init__.py b/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..52f659d25a8e52579e390a690d4e29ebd9b692de --- /dev/null +++ b/model/__init__.py @@ -0,0 +1,228 @@ +import torch +from abc import ABC +from torch import nn +from torch.nn import functional as F +from .diffusion import DiffusionLoss, DDIMSampler, DDPMSampler +from .transformer import TransformerModel +from .mamba import MambaModel +from .lstm import LstmModel +from .gatemlp import GMLPModel + + + + +class ModelDiffusion(nn.Module, ABC): + config = {} + + def __init__(self, sequence_length): + super().__init__() + DiffusionLoss.config = self.config + self.criteria = DiffusionLoss() + if self.config.get("post_d_model") is None: + assert self.config["d_model"] == self.config["condition_dim"] + self.sequence_length = sequence_length + # to define model after this function + self.to_condition = nn.Linear(self.config["d_condition"], self.config["d_model"]) + self.to_permutation_state = nn.Embedding(self.config["num_permutation"], self.config["d_model"]) + self.to_permutation_state.weight = \ + nn.Parameter(torch.ones_like(self.to_permutation_state.weight) / self.config["d_model"]) + + def forward(self, output_shape=None, x_0=None, condition=None, permutation_state=None, **kwargs): + # condition + if condition is not None: + assert len(condition.shape) == 2 + assert condition.shape[-1] == self.config["d_condition"] + condition = self.to_condition(condition.to(self.device)[:, None, :]) + else: # not use condition + condition = self.to_condition(torch.zeros(size=(1, 1, 1), device=self.device)) + # process + if kwargs.get("sample"): + if permutation_state is not False: + permutation_state = torch.randint(0, self.to_permutation_state.num_embeddings, (1,), device=self.device) + permutation_state = self.to_permutation_state(permutation_state)[:, None, :] + else: # permutation state == False + permutation_state = 0. + return self.sample(x=None, condition=condition+permutation_state) + else: # train + if permutation_state is not None: + permutation_state = self.to_permutation_state(permutation_state)[:, None, :] + else: # not use permutation state + permutation_state = 0. 
+ # Given condition c and ground truth token x, compute loss + c = self.model(output_shape, condition+permutation_state) + loss = self.criteria(x=x_0, c=c, **kwargs) + return loss + + @torch.no_grad() + def sample(self, x=None, condition=None): + z = self.model([1, self.sequence_length, self.config["d_model"]], condition) + if x is None: + x = torch.randn((1, self.sequence_length, self.config["model_dim"]), device=z.device) + x = self.criteria.sample(x, z) + return x + + @property + def device(self): + return next(self.parameters()).device + + +class ModelMSELoss(nn.Module, ABC): + config = {} + + def __init__(self, sequence_length): + super().__init__() + if self.config.get("post_d_model") is None: + assert self.config["d_model"] == self.config["condition_dim"] + self.sequence_length = sequence_length + # to define model after this function + self.to_condition = nn.Linear(self.config["d_condition"], self.config["d_model"]) + self.to_permutation_state = nn.Embedding(self.config["num_permutation"], self.config["d_model"]) + self.to_permutation_state.weight = \ + nn.Parameter(torch.ones_like(self.to_permutation_state.weight) / self.config["d_model"]) + + def forward(self, output_shape=None, x_0=None, condition=None, permutation_state=None, **kwargs): + # condition + if condition is not None: + assert len(condition.shape) == 2 + assert condition.shape[-1] == self.config["d_condition"] + condition = self.to_condition(condition.to(self.device)[:, None, :]) + else: # not use condition + condition = self.to_condition(torch.zeros(size=(1, 1, 1), device=self.device)) + # process + if kwargs.get("sample"): + if permutation_state is not False: + permutation_state = torch.randint(0, self.to_permutation_state.num_embeddings, (1,), device=self.device) + permutation_state = self.to_permutation_state(permutation_state)[:, None, :] + else: # permutation state == False + permutation_state = 0. + return self.sample(x=None, condition=condition+permutation_state) + else: # train + if permutation_state is not None: + permutation_state = self.to_permutation_state(permutation_state)[:, None, :] + else: # not use permutation state + permutation_state = 0. + # Given condition c and ground truth token x, compute loss + c = self.model(output_shape, condition+permutation_state) + assert c.shape[-1] == x_0.shape[-1], "d_model should be equal to dim_per_token" + # preprocess nan to zero + mask = torch.isnan(x_0) + x_0 = torch.nan_to_num(x_0, 0.) 
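+ # NaN entries mark padded positions: they are zeroed for the forward pass, re-masked in the
+ # loss below, and excluded from the average via nanmean()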
+ # get the gradient + loss = F.mse_loss(c, x_0, reduction="none") + loss[mask] = torch.nan + return loss.nanmean() + + @torch.no_grad() + def sample(self, x=None, condition=None): + z = self.model([1, self.sequence_length, self.config["d_model"]], condition) + return z + + @property + def device(self): + return next(self.parameters()).device + + + + +class MambaDiffusion(ModelDiffusion): + def __init__(self, sequence_length, positional_embedding): + super().__init__(sequence_length=sequence_length) + MambaModel.config = self.config + self.model = MambaModel(positional_embedding=positional_embedding) + + +class TransformerDiffusion(ModelDiffusion): + def __init__(self, sequence_length, positional_embedding): + super().__init__(sequence_length=sequence_length) + TransformerModel.config = self.config + self.model = TransformerModel(positional_embedding=positional_embedding) + + +class LstmDiffusion(ModelDiffusion): + def __init__(self, sequence_length, positional_embedding): + super().__init__(sequence_length=sequence_length) + LstmModel.config = self.config + self.model = LstmModel(positional_embedding=positional_embedding) + + +class GMLPDiffusion(ModelDiffusion): + def __init__(self, sequence_length, positional_embedding): + super().__init__(sequence_length=sequence_length) + GMLPModel.config = self.config + self.model = GMLPModel(positional_embedding=positional_embedding) + + + + +class MambaMSELoss(ModelMSELoss): + def __init__(self, sequence_length, positional_embedding): + super().__init__(sequence_length=sequence_length) + MambaModel.config = self.config + self.model = MambaModel(positional_embedding=positional_embedding) + + + + +class ClassConditionMambaDiffusion(MambaDiffusion): + def __init__(self, sequence_length, positional_embedding, input_class=10): + super().__init__(sequence_length, positional_embedding) + self.get_condition = nn.Sequential( + nn.Linear(input_class, self.config["d_condition"]), + nn.SiLU(), + ) # to condition + self.to_permutation_state = nn.Embedding(self.config["num_permutation"], self.config["d_model"]) + # condition module + self.to_condition_linear = nn.Linear(self.config["d_condition"], self.config["d_model"]) + to_condition_gate = torch.zeros(size=(1, sequence_length, 1)) + to_condition_gate[:, -8:, :] = 1. 
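+ # binary gate over sequence positions: only the last 8 tokens receive the projected class
+ # condition (applied in _to_condition below); the remaining positions get no condition signal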
+ self.register_buffer("to_condition_gate", to_condition_gate) + # reset to_condition + del self.to_condition + self.to_condition = self._to_condition + + def forward(self, output_shape=None, x_0=None, condition=None, **kwargs): + condition = self.get_condition(condition.to(self.device)) + return super().forward(output_shape=output_shape, x_0=x_0, condition=condition, **kwargs) + + def _to_condition(self, x): + assert len(x.shape) == 3 + x = self.to_condition_linear(x) + x = x * self.to_condition_gate + return x + + +class ClassConditionMambaDiffusionFull(MambaDiffusion): + def __init__(self, sequence_length, positional_embedding, input_class=10, init_noise_intensity=1e-4): + super().__init__(sequence_length, positional_embedding) + self.get_condition = nn.Sequential( + nn.Linear(input_class, self.config["d_condition"]), + nn.LayerNorm(self.config["d_condition"]), + ) # to condition + self.to_permutation_state = nn.Embedding(self.config["num_permutation"], self.config["d_model"]) + # condition module + self.to_condition_linear = nn.Linear(self.config["d_condition"], self.config["d_model"]) + self.to_condition_conv = nn.Sequential( + nn.Conv1d(1, sequence_length, 9, 1, 4), + nn.GroupNorm(num_groups=1, num_channels=sequence_length), + nn.Conv1d(sequence_length, sequence_length, 9, 1, 4), + ) # [batch_size, sequence_length, d_model] + # reset to_condition + del self.to_condition + + def forward(self, output_shape=None, x_0=None, condition=None, **kwargs): + if kwargs.get("pre_training"): + self.to_condition = self._zero_condition + condition = None + else: # train with condition + self.to_condition = self._to_condition + condition = self.get_condition(condition.to(self.device)) + return super().forward(output_shape=output_shape, x_0=x_0, condition=condition, **kwargs) + + def _to_condition(self, x): + assert len(x.shape) == 3 + x = self.to_condition_linear(x) # [batch_size, 1, d_model] + x = self.to_condition_conv(x) # [batch_size, sequence_length, d_model] + return x + + def _zero_condition(self, x): + return torch.zeros(size=(x.shape[0], self.sequence_length, self.config["d_model"]), device=x.device) \ No newline at end of file diff --git a/model/denoiser.py b/model/denoiser.py new file mode 100644 index 0000000000000000000000000000000000000000..e8cd66bb612f49fb420e67195ae39c79de7f590d --- /dev/null +++ b/model/denoiser.py @@ -0,0 +1,96 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F +import math + + +class TimestepEmbedder(nn.Module): + def __init__(self, hidden_dim, frequency_embedding_size=256, max_period=10000): + super().__init__() + assert frequency_embedding_size % 2 == 0 + self.frequency_embedding_size = frequency_embedding_size + self.mlp = nn.Sequential( + nn.Linear(frequency_embedding_size, hidden_dim, bias=True), + nn.SiLU(), + nn.Linear(hidden_dim, hidden_dim, bias=True) + ) # FIXME: this is too big! Why this is such necessary? 
+ half = frequency_embedding_size // 2 + freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half) / half) + self.register_buffer("freqs", freqs) + + def forward(self, t): + args = t[:, None].float() * self.freqs + t_freq = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + t_emb = self.mlp(t_freq) + return t_emb + + +class ConditionalUNet(nn.Module): + def __init__(self, layer_channels: list, model_dim: int, condition_dim: int, kernel_size: int): + super().__init__() + self.time_embedder = TimestepEmbedder(hidden_dim=model_dim) + self.condi_embedder = nn.Linear(condition_dim, model_dim) + # FIXME: condi_embedder is calculated for 1000 times as same, but it does not work in recurrent module, why? + self.encoder_list = nn.ModuleList([]) + for i in range(len(layer_channels) // 2 + 1): + self.encoder_list.append(nn.ModuleList([ + nn.Conv1d(layer_channels[i], layer_channels[i+1], kernel_size, 1, kernel_size // 2), + nn.Sequential(nn.BatchNorm1d(layer_channels[i+1]), nn.ELU()) + ])) + self.decoder_list = nn.ModuleList([]) + for i in range(len(layer_channels) // 2 + 1, len(layer_channels) - 1): + self.decoder_list.append(nn.ModuleList([ + nn.Conv1d(layer_channels[i], layer_channels[i+1], kernel_size, 1, kernel_size // 2), + nn.Sequential(nn.BatchNorm1d(layer_channels[i+1]), nn.ELU()) + if layer_channels[i+1] != 1 else nn.Identity(), + ])) + + def forward(self, x, t, c): + x = x[:, None, :] + t = self.time_embedder(t)[:, None, :] + c = self.condi_embedder(c)[:, None, :] + x_list = [] + for i, (module, activation) in enumerate(self.encoder_list): + x = module((x + c) * t) + x = activation(x) + if i < len(self.encoder_list) - 2: + x_list.append(x) + for i, (module, activation) in enumerate(self.decoder_list): + x = x + x_list[-i-1] + x = module((x + c) * t) + x = activation(x) + return x[:, 0, :] + + +class OneDimCNN(nn.Module): + def __init__(self, layer_channels: list, model_dim: int, kernel_size: int): + super().__init__() + self.time_embedder = TimestepEmbedder(hidden_dim=model_dim) + self.encoder_list = nn.ModuleList([]) + for i in range(len(layer_channels) // 2 + 1): + self.encoder_list.append(nn.ModuleList([ + nn.Conv1d(layer_channels[i], layer_channels[i+1], kernel_size, 1, kernel_size // 2), + nn.Sequential(nn.BatchNorm1d(layer_channels[i+1]), nn.ELU()) + ])) + self.decoder_list = nn.ModuleList([]) + for i in range(len(layer_channels) // 2 + 1, len(layer_channels) - 1): + self.decoder_list.append(nn.ModuleList([ + nn.Conv1d(layer_channels[i], layer_channels[i+1], kernel_size, 1, kernel_size // 2), + nn.Sequential(nn.BatchNorm1d(layer_channels[i+1]), nn.ELU()) + if layer_channels[i+1] != 1 else nn.Identity(), + ])) + + def forward(self, x, t, c=0.): + x = x[:, None, :] + t = self.time_embedder(t)[:, None, :] + x_list = [] + for i, (module, activation) in enumerate(self.encoder_list): + x = module(x + t) + x = activation(x) + if i < len(self.encoder_list) - 2: + x_list.append(x) + for i, (module, activation) in enumerate(self.decoder_list): + x = x + x_list[-i-1] + x = module(x + t) + x = activation(x) + return x[:, 0, :] \ No newline at end of file diff --git a/model/diffusion.py b/model/diffusion.py new file mode 100644 index 0000000000000000000000000000000000000000..e607c190dac7f281edda7c6ba1b5dfc2cd713180 --- /dev/null +++ b/model/diffusion.py @@ -0,0 +1,219 @@ +import torch +from torch import nn +import torch.nn.functional as F +from .denoiser import ConditionalUNet +import numpy as np + + +def extract(v, i, shape): + out = torch.gather(v, index=i, dim=0) + out = 
out.to(device=i.device, dtype=torch.float32) + # reshape to (batch_size, 1, 1, 1, 1, ...) for broadcasting purposes. + out = out.view([i.shape[0]] + [1] * (len(shape) - 1)) + return out + + +class GaussianDiffusionTrainer(nn.Module): + def __init__(self, model: nn.Module, beta: tuple[int, int], T: int): + super().__init__() + self.model = model + self.T = T + # generate T steps of beta + self.register_buffer("beta_t", torch.linspace(*beta, T, dtype=torch.float32)) + # calculate the cumulative product of $\alpha$ , named $\bar{\alpha_t}$ in paper + alpha_t = 1.0 - self.beta_t + alpha_t_bar = torch.cumprod(alpha_t, dim=0) + # calculate and store two coefficient of $q(x_t | x_0)$ + self.register_buffer("signal_rate", torch.sqrt(alpha_t_bar)) + self.register_buffer("noise_rate", torch.sqrt(1.0 - alpha_t_bar)) + + def forward(self, x_0, z, **kwargs): + # preprocess nan to zero + mask = torch.isnan(x_0) + x_0 = torch.nan_to_num(x_0, 0.) + # get a random training step $t \sim Uniform({1, ..., T})$ + t = torch.randint(self.T, size=(x_0.shape[0],), device=x_0.device) + # generate $\epsilon \sim N(0, 1)$ + epsilon = torch.randn_like(x_0) + # predict the noise added from $x_{t-1}$ to $x_t$ + x_t = (extract(self.signal_rate, t, x_0.shape) * x_0 + + extract(self.noise_rate, t, x_0.shape) * epsilon) + epsilon_theta = self.model(x_t, t, z) + # get the gradient + loss = F.mse_loss(epsilon_theta, epsilon, reduction="none") + loss[mask] = torch.nan + return loss.nanmean() + + +class DDPMSampler(nn.Module): + def __init__(self, model: nn.Module, beta: tuple[int, int], T: int): + super().__init__() + self.model = model + self.T = T + # generate T steps of beta + self.register_buffer("beta_t", torch.linspace(*beta, T, dtype=torch.float32)) + # calculate the cumulative product of $\alpha$ , named $\bar{\alpha_t}$ in paper + alpha_t = 1.0 - self.beta_t + alpha_t_bar = torch.cumprod(alpha_t, dim=0) + alpha_t_bar_prev = F.pad(alpha_t_bar[:-1], (1, 0), value=1.0) + self.register_buffer("coeff_1", torch.sqrt(1.0 / alpha_t)) + self.register_buffer("coeff_2", self.coeff_1 * (1.0 - alpha_t) / torch.sqrt(1.0 - alpha_t_bar)) + self.register_buffer("posterior_variance", self.beta_t * (1.0 - alpha_t_bar_prev) / (1.0 - alpha_t_bar)) + + @torch.no_grad() + def cal_mean_variance(self, x_t, t, c): + # """ Calculate the mean and variance for $q(x_{t-1} | x_t, x_0)$ """ + epsilon_theta = self.model(x_t, t, c) + mean = extract(self.coeff_1, t, x_t.shape) * x_t - extract(self.coeff_2, t, x_t.shape) * epsilon_theta + # var is a constant + var = extract(self.posterior_variance, t, x_t.shape) + return mean, var + + @torch.no_grad() + def sample_one_step(self, x_t, time_step, c): + # """ Calculate $x_{t-1}$ according to $x_t$ """ + t = torch.full((x_t.shape[0],), time_step, device=x_t.device, dtype=torch.long) + mean, var = self.cal_mean_variance(x_t, t, c) + z = torch.randn_like(x_t) if time_step > 0 else 0 + x_t_minus_one = mean + torch.sqrt(var) * z + if torch.isnan(x_t_minus_one).int().sum() != 0: + raise ValueError("nan in tensor!") + return x_t_minus_one + + @torch.no_grad() + def forward(self, x_t, c, only_return_x_0=True, interval=1, **kwargs): + x = [x_t] + for time_step in reversed(range(self.T)): + x_t = self.sample_one_step(x_t, time_step, c) + if not only_return_x_0 and ((self.T - time_step) % interval == 0 or time_step == 0): + x.append(x_t) + if only_return_x_0: + return x_t # [batch_size, channels, height, width] + return torch.stack(x, dim=1) # [batch_size, sample, channels, height, width] + + +class 
DDIMSampler(nn.Module): + def __init__(self, model: nn.Module, beta: tuple[int, int], T: int): + super().__init__() + self.model = model + self.T = T + # generate T steps of beta + beta_t = torch.linspace(*beta, T, dtype=torch.float32) + # calculate the cumulative product of $\alpha$ , named $\bar{\alpha_t}$ in paper + alpha_t = 1.0 - beta_t + self.register_buffer("alpha_t_bar", torch.cumprod(alpha_t, dim=0)) + + @torch.no_grad() + def sample_one_step(self, x_t, time_step, c, prev_time_step, eta): + t = torch.full((x_t.shape[0],), time_step, device=x_t.device, dtype=torch.long) + prev_t = torch.full((x_t.shape[0],), prev_time_step, device=x_t.device, dtype=torch.long) + # get current and previous alpha_cumprod + alpha_t = extract(self.alpha_t_bar, t, x_t.shape) + alpha_t_prev = extract(self.alpha_t_bar, prev_t, x_t.shape) + # predict noise using model + epsilon_theta_t = self.model(x_t, t, c) + # calculate x_{t-1} + sigma_t = eta * torch.sqrt((1 - alpha_t_prev) / (1 - alpha_t) * (1 - alpha_t / alpha_t_prev)) + epsilon_t = torch.randn_like(x_t) + x_t_minus_one = (torch.sqrt(alpha_t_prev / alpha_t) * x_t + + (torch.sqrt(1 - alpha_t_prev - sigma_t ** 2) - torch.sqrt( + (alpha_t_prev * (1 - alpha_t)) / alpha_t)) * epsilon_theta_t + + sigma_t * epsilon_t) + return x_t_minus_one + + @torch.no_grad() + def forward(self, x_t, c, steps=60, method="linear", eta=0.05, only_return_x_0=True, interval=1, **kwargs): + if steps == 0: + return c + if method == "linear": + a = self.T // steps + time_steps = np.asarray(list(range(0, self.T, a))) + elif method == "quadratic": + time_steps = (np.linspace(0, np.sqrt(self.T * 0.8), steps) ** 2).astype(np.int) + else: # NotImplementedError + raise NotImplementedError(f"sampling method {method} is not implemented!") + # add one to get the final alpha values right (the ones from first scale to data during sampling) + time_steps = time_steps + 1 + # previous sequence + time_steps_prev = np.concatenate([[0], time_steps[:-1]]) + x = [x_t] + for i in reversed(range(0, steps)): + x_t = self.sample_one_step(x_t, time_steps[i], c, time_steps_prev[i], eta) + if not only_return_x_0 and ((steps - i) % interval == 0 or i == 0): + x.append(x_t) + if only_return_x_0: + return x_t # [batch_size x channels, dim] + return torch.stack(x, dim=1) # [batch_size x channels, sample, dim] + + + + +class DiffusionLoss(nn.Module): + config = {} + + def __init__(self): + super().__init__() + self.net = ConditionalUNet( + layer_channels=self.config["layer_channels"], + model_dim=self.config["model_dim"], + condition_dim=self.config["condition_dim"], + kernel_size=self.config["kernel_size"], + ) + self.diffusion_trainer = GaussianDiffusionTrainer( + model=self.net, + beta=self.config["beta"], + T=self.config["T"] + ) + self.diffusion_sampler = self.config["sample_mode"]( + model=self.net, + beta=self.config["beta"], + T=self.config["T"] + ) + + def forward(self, x, c, **kwargs): + if kwargs.get("parameter_weight_decay"): + x = x * (1.0 - kwargs["parameter_weight_decay"]) + # Given condition z and ground truth token x, compute loss + x = x.view(-1, x.size(-1)) + c = c.view(-1, c.size(-1)) + real_batch = x.size(0) + batch = self.config.get("diffusion_batch") + if self.config.get("forward_once"): + random_indices = torch.randperm(x.size(0))[:batch] + x, c = x[random_indices], c[random_indices] + real_batch = x.size(0) + if batch is not None and real_batch > batch: + loss = 0. 
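+ # accumulate over chunks of size `diffusion_batch`: each chunk's loss is weighted by its token
+ # count and the sum is divided by real_batch, approximating a single full-batch pass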
+ num_loops = x.size(0) // batch if x.size(0) % batch != 0 else x.size(0) // batch - 1 + for _ in range(num_loops): + loss += self.diffusion_trainer(x[:batch], c[:batch], **kwargs) * batch + x, c = x[batch:], c[batch:] + loss += self.diffusion_trainer(x, c, **kwargs) * x.size(0) + loss = loss / real_batch + else: # all as a batch + loss = self.diffusion_trainer(x, c, **kwargs) + return loss + + @torch.no_grad() + def sample(self, x, c, **kwargs): + # Given condition and noise, sample x using reverse diffusion process + # Given condition z and ground truth token x, compute loss + batch = self.config.get("diffusion_batch") + # if batch is not None: + # batch = max(batch, 256) + x_shape = x.shape + x = x.view(-1, x.size(-1)) + c = c.view(-1, c.size(-1)) + if kwargs.get("only_return_x_0") is False: + diffusion_steps = self.diffusion_sampler(x, c, **kwargs) + return torch.permute(diffusion_steps, (1, 0, 2)) # [sample, 1 x channels, dim] + if batch is not None and x.size(0) > batch: + result = [] + num_loops = x.size(0) // batch if x.size(0) % batch != 0 else x.size(0) // batch - 1 + for _ in range(num_loops): + result.append(self.diffusion_sampler(x[:batch], c[:batch], **kwargs)) + x, c = x[batch:], c[batch:] + result.append(self.diffusion_sampler(x, c, **kwargs)) + return torch.cat(result, dim=0).view(x_shape) + else: # all as a batch + return self.diffusion_sampler(x, c, **kwargs).view(x_shape) \ No newline at end of file diff --git a/model/gatemlp.py b/model/gatemlp.py new file mode 100644 index 0000000000000000000000000000000000000000..417ed41a061de533fe692933c4a37ffeafab2735 --- /dev/null +++ b/model/gatemlp.py @@ -0,0 +1,42 @@ +import torch +from torch import nn + + + + +class GateMLP(nn.Module): + def __init__(self, d_model, expand): + super().__init__() + self.proj_1 = nn.Linear(d_model, d_model * expand, bias=False) + self.proj_2 = nn.Linear(d_model, d_model * expand, bias=False) + self.proj_3 = nn.Linear(d_model * expand, d_model, bias=True) + self.layer_norm = nn.LayerNorm(d_model) + + def forward(self, x): + x, x1 = self.proj_1(x), self.proj_2(x) + x = x * torch.sigmoid(x1) + x = self.proj_3(x) + x = self.layer_norm(x) + return x + + +class GMLPModel(nn.Module): + config = {} + + def __init__(self, positional_embedding): + super().__init__() + gmlp_config = { + "d_model": self.config["d_model"], + "expand": self.config["expand"], + } + self.gmlp_forward = nn.Sequential(*[GateMLP(**gmlp_config) for _ in range(self.config["num_layers"])]) + pe = positional_embedding[None, :, :] + if self.config.get("trainable_pe"): + self.pe = nn.Parameter(pe) + else: # fixed positional embedding + self.register_buffer("pe", pe) + + def forward(self, output_shape, condition=None): + assert len(condition.shape) == 3 + x = self.gmlp_forward(self.pe.repeat(output_shape[0], 1, 1) + condition) + return x diff --git a/model/lstm.py b/model/lstm.py new file mode 100644 index 0000000000000000000000000000000000000000..29264a7a0cc7e55f094f44aa3d00742f159c469a --- /dev/null +++ b/model/lstm.py @@ -0,0 +1,27 @@ +import torch +from torch import nn +import math + + +class LstmModel(nn.Module): + config = {} + + def __init__(self, positional_embedding): + super().__init__() + self.lstm_forward = nn.LSTM( + input_size=self.config["d_model"], + hidden_size=self.config["d_model"], + num_layers=self.config["num_layers"], + dropout=self.config["dropout"], + bias=True, + batch_first=True,) + pe = positional_embedding[None, :, :] + if self.config.get("trainable_pe"): + self.pe = nn.Parameter(pe) + else: # fixed positional 
embedding + self.register_buffer("pe", pe) + + def forward(self, output_shape, condition=None): + assert len(condition.shape) == 3 + x, _ = self.lstm_forward(self.pe.repeat(output_shape[0], 1, 1) + condition) + return x.contiguous() diff --git a/model/mamba.py b/model/mamba.py new file mode 100644 index 0000000000000000000000000000000000000000..a8808588e1d10edcde521879f1c1671724538270 --- /dev/null +++ b/model/mamba.py @@ -0,0 +1,28 @@ +import torch +from torch import nn +from mamba_ssm import Mamba2 as Mamba +import math + + +class MambaModel(nn.Module): + config = {} + + def __init__(self, positional_embedding): + super().__init__() + mamba_config = { + "d_model": self.config["d_model"], + "d_state": self.config["d_state"], + "d_conv": self.config["d_conv"], + "expand": self.config["expand"], + } + self.mamba_forward = nn.Sequential(*[Mamba(**mamba_config) for _ in range(self.config["num_layers"])]) + pe = positional_embedding[None, :, :] + if self.config.get("trainable_pe"): + self.pe = nn.Parameter(pe) + else: # fixed positional embedding + self.register_buffer("pe", pe) + + def forward(self, output_shape, condition=None): + assert len(condition.shape) == 3 + x = self.mamba_forward(self.pe.repeat(output_shape[0], 1, 1) + condition) + return x diff --git a/model/pdiff.py b/model/pdiff.py new file mode 100644 index 0000000000000000000000000000000000000000..164c218f3fa0dad9281ce21b1cefaea0253b7791 --- /dev/null +++ b/model/pdiff.py @@ -0,0 +1,146 @@ +from .diffusion import DDIMSampler, DDPMSampler, GaussianDiffusionTrainer +from .denoiser import OneDimCNN +from torch.nn import functional as F +from abc import abstractmethod +from torch import nn +import torch + + + + +class PDiff(nn.Module): + config = {} + + def __init__(self, sequence_length): + super().__init__() + self.sequence_length = sequence_length + self.net = OneDimCNN( + layer_channels=self.config["layer_channels"], + model_dim=self.config["model_dim"], + kernel_size=self.config["kernel_size"], + ) + self.diffusion_trainer = GaussianDiffusionTrainer( + model=self.net, + beta=self.config["beta"], + T=self.config["T"] + ) + self.diffusion_sampler = self.config["sample_mode"]( + model=self.net, + beta=self.config["beta"], + T=self.config["T"] + ) + + def forward(self, x=None, c=0., **kwargs): + if kwargs.get("sample"): + del kwargs["sample"] + return self.sample(x, c, **kwargs) + x = x.view(-1, x.size(-1)) + loss = self.diffusion_trainer(x, c, **kwargs) + return loss + + @torch.no_grad() + def sample(self, x=None, c=0., **kwargs): + if x is None: + x = torch.randn((1, self.config["model_dim"]), device=self.device) + x_shape = x.shape + x = x.view(-1, x.size(-1)) + return self.diffusion_sampler(x, c, **kwargs).view(x_shape) + + @property + def device(self): + return next(self.parameters()).device + + + + +class OneDimVAE(nn.Module): + def __init__(self, d_model, d_latent, sequence_length, kernel_size=7, divide_slice_length=64): + super(OneDimVAE, self).__init__() + self.d_model = d_model.copy() + self.d_latent = d_latent + # confirm self.last_length + sequence_length = (sequence_length // divide_slice_length + 1) * divide_slice_length \ + if sequence_length % divide_slice_length != 0 else sequence_length + assert sequence_length % int(2 ** len(d_model)) == 0, \ + f"Please set divide_slice_length to {int(2 ** len(d_model))}." 
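+ # each encoder stage halves the sequence length (stride-2 conv), so the padded length must be
+ # divisible by 2**len(d_model); last_length is the temporal size at the VAE bottleneck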
+ self.last_length = sequence_length // int(2 ** len(d_model)) + + # Build Encoder + modules = [] + in_dim = 1 + for h_dim in d_model: + modules.append(nn.Sequential( + nn.Conv1d(in_dim, h_dim, kernel_size, 2, kernel_size//2), + nn.BatchNorm1d(h_dim), + nn.LeakyReLU() + )) + in_dim = h_dim + self.encoder = nn.Sequential(*modules) + self.to_latent = nn.Linear(self.last_length * d_model[-1], d_latent) + self.fc_mu = nn.Linear(d_latent, d_latent) + self.fc_var = nn.Linear(d_latent, d_latent) + + # Build Decoder + modules = [] + self.to_decode = nn.Linear(d_latent, self.last_length * d_model[-1]) + d_model.reverse() + for i in range(len(d_model) - 1): + modules.append(nn.Sequential( + nn.ConvTranspose1d(d_model[i], d_model[i+1], kernel_size, 2, kernel_size//2, output_padding=1), + nn.BatchNorm1d(d_model[i + 1]), + nn.ELU(), + )) + self.decoder = nn.Sequential(*modules) + self.final_layer = nn.Sequential( + nn.ConvTranspose1d(d_model[-1], d_model[-1], kernel_size, 2, kernel_size//2, output_padding=1), + nn.BatchNorm1d(d_model[-1]), + nn.ELU(), + nn.Conv1d(d_model[-1], 1, kernel_size, 1, kernel_size//2), + ) + + def encode(self, input, **kwargs): + # print(input.shape) + # assert input.shape == [batch_size, num_parameters] + input = input[:, None, :] + result = self.encoder(input) + # print(result.shape) + result = torch.flatten(result, start_dim=1) + result = self.to_latent(result) + mu = self.fc_mu(result) + log_var = self.fc_var(result) + return mu, log_var + + def decode(self, z, **kwargs): + # z.shape == [batch_size, d_latent] + result = self.to_decode(z) + result = result.view(-1, self.d_model[-1], self.last_length) + result = self.decoder(result) + result = self.final_layer(result) + assert result.shape[1] == 1, f"{result.shape}" + return result[:, 0, :] + + def reparameterize(self, mu, log_var, **kwargs): + if kwargs.get("use_var"): + std = torch.exp(0.5 * log_var) + eps = torch.randn_like(std) + if kwargs.get("manual_std") is not None: + std = kwargs.get("manual_std") + return eps * std + mu + else: # not use var + return mu + + def encode_decode(self, input, **kwargs): + mu, log_var = self.encode(input) + z = self.reparameterize(mu, log_var, **kwargs) + recons = self.decode(z) + return recons, input, mu, log_var + + def forward(self, x, **kwargs): + recons, input, mu, log_var = self.encode_decode(input=x, **kwargs) + recons_loss = F.mse_loss(recons, input) + if kwargs.get("use_var"): + kld_loss = torch.mean(-0.5 * torch.sum(1 + log_var - mu ** 2 - log_var.exp(), dim=1), dim=0) + loss = recons_loss + kwargs['kld_weight'] * kld_loss + else: # not use var + loss = recons_loss + return loss diff --git a/model/transformer.py b/model/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..2bdae9607907ada91ae1b7cdee056ab7e101ad58 --- /dev/null +++ b/model/transformer.py @@ -0,0 +1,93 @@ +import torch +from torch import nn +from einops import rearrange + + + + +class FeedForward(nn.Module): + def __init__(self, dim, hidden_dim): + super().__init__() + self.net = nn.Sequential( + nn.LayerNorm(dim), + nn.Linear(dim, hidden_dim), + nn.GELU(), + nn.Linear(hidden_dim, dim), + ) + + def forward(self, x): + x = self.net(x) + return x + + +class Attention(nn.Module): + def __init__(self, dim, heads=8, dim_head=64, mask=None): + super().__init__() + inner_dim = dim_head * heads + self.heads = heads + self.scale = dim_head ** -0.5 + self.norm = nn.LayerNorm(dim) + self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False) + self.to_out = nn.Linear(inner_dim, dim, bias=False) if 
inner_dim != dim else nn.Identity() + self.softmax = nn.Softmax(dim=-1) + if mask is not None: + assert len(mask.shape) == 2 + mask = mask[None, None, :, :] + self.register_buffer("mask", mask) + self.use_mask = True + else: # not use mask + self.use_mask = False + + def forward(self, x): + x = self.norm(x) + qkv = self.to_qkv(x).chunk(3, dim=-1) + q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=self.heads), qkv) + attn = q @ k.transpose(-1, -2) + if self.use_mask: + attn = torch.where(self.mask, attn, 1e-8) + out = self.softmax(attn) @ v + out = rearrange(out, 'b h n d -> b n (h d)') + out = self.to_out(out) + return out + + +class Transformer(nn.Module): + def __init__(self, d_model, nhead, dim_feedforward, dim_head, num_layers, mask=None): + super().__init__() + self.layers = nn.ModuleList([]) + for _ in range(num_layers): + self.layers.append(nn.ModuleList([ + Attention(d_model, heads=nhead, dim_head=dim_head, mask=mask), + FeedForward(d_model, dim_feedforward) + ])) + + def forward(self, x): + for attn, ff in self.layers: + x = attn(x) + x + x = ff(x) + x + return x + + +class TransformerModel(nn.Module): + config = {} + + def __init__(self, positional_embedding): + super().__init__() + self.transformer_forward = Transformer( + d_model=self.config["d_model"], + nhead=self.config["nhead"], + dim_feedforward=self.config["dim_feedforward"], + dim_head=self.config["dim_head"], + num_layers=self.config["num_layers"], + mask=self.config.get("mask"), + ) + pe = positional_embedding[None, :, :] + if self.config.get("trainable_pe"): + self.pe = nn.Parameter(pe) + else: # fixed positional embedding + self.register_buffer("pe", pe) + + def forward(self, output_shape, condition=None): + assert len(condition.shape) == 3 + x = self.transformer_forward(self.pe.repeat(output_shape[0], 1, 1) + condition) + return x diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..361350367e5ee26f848dd0e8b0b683783d6076b2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +timm +wandb +einops +seaborn +openpyxl +bitsandbytes \ No newline at end of file diff --git a/workspace/ablation/no_diffusion/no_diffusion.py b/workspace/ablation/no_diffusion/no_diffusion.py new file mode 100644 index 0000000000000000000000000000000000000000..5882c951b81c566ec6828752fd89b4ee2f7ec8be --- /dev/null +++ b/workspace/ablation/no_diffusion/no_diffusion.py @@ -0,0 +1,209 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaMSELoss as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import 
CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ViTTiny as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 16, + "num_workers": 16, + "total_steps": 120000, + "learning_rate": 0.0001, + "weight_decay": 0.0, + "save_every": 120000//30, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # output config + "condition_dim": "auto", + }, + "tag": "ablation_no_diffusion", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with 
accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/ablation/no_diffusion/no_diffusion.sh b/workspace/ablation/no_diffusion/no_diffusion.sh new file mode 100644 index 0000000000000000000000000000000000000000..4b4104a460f2edc55114a8385c43617abbe2bd44 --- /dev/null +++ b/workspace/ablation/no_diffusion/no_diffusion.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + no_diffusion.py \ No newline at end of file diff --git a/workspace/ablation/no_relation/no_relation.py b/workspace/ablation/no_relation/no_relation.py new file mode 100644 index 0000000000000000000000000000000000000000..f5afbc175a4deab29a7283605fa9e16484b1c7a2 --- /dev/null +++ b/workspace/ablation/no_relation/no_relation.py @@ -0,0 +1,217 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import GMLPDiffusion as Model +from model.diffusion import 
DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ViTTiny as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 16, + "num_workers": 16, + "total_steps": 80000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 80000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # transformer config + "d_condition": 1, + "d_model": 8192, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "ablation_no_relation", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining 
training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/ablation/no_relation/no_relation.sh b/workspace/ablation/no_relation/no_relation.sh new file mode 100644 index 0000000000000000000000000000000000000000..c006478407be1926dd40c44534de4a624ce93b3c --- /dev/null +++ b/workspace/ablation/no_relation/no_relation.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + no_relation.py \ No newline at end of file diff --git a/workspace/ablation/rm_structure/lstm.py b/workspace/ablation/rm_structure/lstm.py new file mode 100644 index 0000000000000000000000000000000000000000..168b0f0bf06a3f14375fd1fdee97eff043361226 --- /dev/null +++ b/workspace/ablation/rm_structure/lstm.py @@ -0,0 +1,217 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import 
wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import LstmDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ViTTiny as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 4, + "num_workers": 8, + "total_steps": 50000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 50000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 8192, + "num_layers": 2, + "dropout": 0.0, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "ablation_structure_lstm", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == 
"__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/ablation/rm_structure/lstm.sh b/workspace/ablation/rm_structure/lstm.sh new file mode 100644 index 0000000000000000000000000000000000000000..c5bf353a7abd81553259ec373ba1e7cc0ae424d0 --- /dev/null +++ b/workspace/ablation/rm_structure/lstm.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + lstm.py \ No newline at end of file diff --git a/workspace/ablation/rm_structure/transformer.py b/workspace/ablation/rm_structure/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..f566c2868948a1c8fb289b1fdfdf4c70c0cd028f --- /dev/null +++ b/workspace/ablation/rm_structure/transformer.py @@ -0,0 +1,224 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True 
+torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import TransformerDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ViTTiny as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 4, + "num_workers": 8, + "total_steps": 50000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 50000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # transformer config + "d_condition": 1, + "d_model": 8192, + "nhead": 8, + "dim_feedforward": 16384, + "dim_head": 1024, + "num_layers": 2, + "mask": "set in future", + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "ablation_structure_transformer", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +##### set transformer casual mask ##### +causal_mask = torch.triu(torch.ones(config["sequence_length"], config["sequence_length"], dtype=torch.long)).T.bool() +config["model_config"]["mask"] = causal_mask +##### set transformer casual mask ##### +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding 
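# The causal_mask built above is a lower-triangular boolean matrix (True on and below the
# diagonal), so token i can only attend to tokens j <= i; the Attention module keeps attention
# scores where the mask is True and replaces the rest with a small constant before the softmax.
# An equivalent construction (sketch, assuming L = config["sequence_length"]) would be:
#     causal_mask = torch.tril(torch.ones(L, L)).bool()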
+) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/ablation/rm_structure/transformer.sh b/workspace/ablation/rm_structure/transformer.sh new file mode 100644 index 0000000000000000000000000000000000000000..58e14bfb7bfdfd96bdfd3280151f2bb79187585c --- /dev/null +++ b/workspace/ablation/rm_structure/transformer.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + transformer.py \ No newline at end of file diff --git a/workspace/ablation/slice_channel/channel.py b/workspace/ablation/slice_channel/channel.py new file mode 100644 index 0000000000000000000000000000000000000000..33ecd159b9e93faa706f7c5570fc8c6327923767 --- /dev/null +++ 
b/workspace/ablation/slice_channel/channel.py @@ -0,0 +1,223 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ViTTiny as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 768, + "sequence_length": 'auto', + # train setting + "batch_size": 4, + "num_workers": 8, + "total_steps": 50000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 50000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 1536, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "ablation_slice_channel", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"], + granularity=2) # 2: split by output +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +nan_mask = torch.logical_not(torch.isnan(train_set[0][0])).float() +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + 
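# About nan_mask above: with granularity=2 the checkpoint is sliced per output channel, and
# channels shorter than dim_per_token appear to be padded with NaN (an inference from the isnan
# check, not stated explicitly in this script). generate() below uses the mask to zero out those
# padded positions when reporting the mean |value| of the generated sequence. A minimal sketch of
# the mask itself:
#     valid = ~torch.isnan(train_set[0][0])   # True where the token holds real parameters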
batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = torch.nanmean(prediction.abs().cpu() * nan_mask) + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) diff --git a/workspace/ablation/slice_channel/channel.sh b/workspace/ablation/slice_channel/channel.sh new file mode 100644 
index 0000000000000000000000000000000000000000..a53176d1a12a2141de173c58fa577a7bc303c454 --- /dev/null +++ b/workspace/ablation/slice_channel/channel.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + channel.py \ No newline at end of file diff --git a/workspace/ablation/slice_channel/index.py b/workspace/ablation/slice_channel/index.py new file mode 100644 index 0000000000000000000000000000000000000000..3873797471dbea2f504f6187deca4f22563cff4d --- /dev/null +++ b/workspace/ablation/slice_channel/index.py @@ -0,0 +1,226 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ViTTiny as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 4096, + "sequence_length": 'auto', + # train setting + "batch_size": 4, + "num_workers": 8, + "total_steps": 50000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 50000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 4096, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "ablation_slice_index", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"]( + dim_per_token=config["dim_per_token"], + granularity=0, # 0: flatten directly + pe_granularity=1, # 1: 1d embedding +) # old method +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +nan_mask = torch.logical_not(torch.isnan(train_set[0][0])).float() +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = 
train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = 
model(sample=True) + generated_norm = torch.nanmean(prediction.abs().cpu() * nan_mask) + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) diff --git a/workspace/ablation/slice_channel/index.sh b/workspace/ablation/slice_channel/index.sh new file mode 100644 index 0000000000000000000000000000000000000000..31b76e8e74f558d3e2bf3159bafd6f4955505f8a --- /dev/null +++ b/workspace/ablation/slice_channel/index.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + index.py \ No newline at end of file diff --git a/workspace/ablation/token_dim/dim_1024.py b/workspace/ablation/token_dim/dim_1024.py new file mode 100644 index 0000000000000000000000000000000000000000..d224309090ea7744b403b3c535d8ea47548a863c --- /dev/null +++ b/workspace/ablation/token_dim/dim_1024.py @@ -0,0 +1,221 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ViTTiny as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 1024, + "sequence_length": 'auto', + # train setting + "batch_size": 4, + "num_workers": 8, + "total_steps": 50000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 50000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 1024, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 4096, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + 
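# The diffusion entries that follow ("sample_mode", "beta", "T") appear to configure the
# per-token diffusion head: DDPM sampling with T=1000 steps and betas running from 1e-4 to 0.02,
# presumably the standard linear schedule. Sketch of the assumed schedule (not the sampler's
# actual code, which lives in model/diffusion.py):
#     betas = torch.linspace(0.0001, 0.02, 1000)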
"sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "ablation_token_dim_1024", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + 
os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/ablation/token_dim/dim_1024.sh b/workspace/ablation/token_dim/dim_1024.sh new file mode 100644 index 0000000000000000000000000000000000000000..121cf7cd0b4543a77310b6f5f09d1ff7b5a29651 --- /dev/null +++ b/workspace/ablation/token_dim/dim_1024.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + dim_1024.py \ No newline at end of file diff --git a/workspace/ablation/token_dim/dim_16384.py b/workspace/ablation/token_dim/dim_16384.py new file mode 100644 index 0000000000000000000000000000000000000000..fd4fd452cadb5f12d827ed5b37e54fc7d5775fc8 --- /dev/null +++ b/workspace/ablation/token_dim/dim_16384.py @@ -0,0 +1,254 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from mamba_ssm import Mamba2 as Mamba +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ViTTiny as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 16384, + "sequence_length": 'auto', + # train setting + "batch_size": 2, + "num_workers": 4, + "total_steps": 80000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 80000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": 
"./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": 'auto', + # mamba config + "d_condition": 1, + "d_model": 12288, + "post_d_model": 16384, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 384, + "layer_channels": [1, 64, 96, 64, 1], + "model_dim": 16384, + "condition_dim": 16384, + "kernel_size": 7, + "sample_mode": DDIMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "ablation_token_dim_16384", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model +class VaryMambaModel(nn.Module): + config = {} + def __init__(self, positional_embedding): + super().__init__() + mamba1 = Mamba(d_model=config["model_config"]["d_model"], + d_state=config["model_config"]["d_state"], + d_conv=config["model_config"]["d_conv"], + expand=config["model_config"]["expand"]) + mamba2 = Mamba(d_model=config["model_config"]["post_d_model"], + d_state=config["model_config"]["d_state"], + d_conv=config["model_config"]["d_conv"], + expand=config["model_config"]["expand"]) + mamba2.in_proj = nn.Linear(mamba1.out_proj.out_features, mamba2.in_proj.out_features, bias=False) + self.mamba_forward = nn.Sequential(*[mamba1, mamba2]) + pe = positional_embedding[None, :, :] + if self.config.get("trainable_pe"): + self.pe = nn.Parameter(pe) + else: # fixed positional embedding + self.register_buffer("pe", pe) + def forward(self, output_shape, condition=None): + x = self.mamba_forward(self.pe.repeat(output_shape[0], 1, 1) + condition) + return x +VaryMambaModel.config = config["model_config"] +model.model = VaryMambaModel( + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # update mamba model +torch.cuda.empty_cache() + + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + 
T_max=config["total_steps"], +) + + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/ablation/token_dim/dim_16384.sh b/workspace/ablation/token_dim/dim_16384.sh new file mode 100644 index 0000000000000000000000000000000000000000..d58de620103297955563108ef293249d0dfa569a --- /dev/null +++ b/workspace/ablation/token_dim/dim_16384.sh @@ -0,0 +1,9 @@ +accelerate launch \ + --main_process_port=12345 \ + --multi_gpu \ + --num_processes=4 \ + --gpu_ids='1,2,3,4' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + dim_16384.py \ No newline at end of file diff --git a/workspace/ablation/token_dim/dim_2048.py b/workspace/ablation/token_dim/dim_2048.py new file mode 100644 index 0000000000000000000000000000000000000000..4b79e0f8322c8543b3777a5e54d0235a1d8e61b5 --- /dev/null +++ b/workspace/ablation/token_dim/dim_2048.py @@ -0,0 +1,221 @@ +import 
sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ViTTiny as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 2048, + "sequence_length": 'auto', + # train setting + "batch_size": 4, + "num_workers": 8, + "total_steps": 50000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 50000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 2048, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 2048, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "ablation_token_dim_2048", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = 
config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/ablation/token_dim/dim_2048.sh b/workspace/ablation/token_dim/dim_2048.sh new file mode 100644 index 0000000000000000000000000000000000000000..785b4610f4c85c1eb6f45e6ea684595c656eba5c --- /dev/null +++ b/workspace/ablation/token_dim/dim_2048.sh @@ -0,0 +1,8 @@ +accelerate launch \ + 
--main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + dim_2048.py \ No newline at end of file diff --git a/workspace/ablation/token_dim/dim_4096.py b/workspace/ablation/token_dim/dim_4096.py new file mode 100644 index 0000000000000000000000000000000000000000..a957632d2c3bd29c635ff5dbb3a0127ac8e3bc23 --- /dev/null +++ b/workspace/ablation/token_dim/dim_4096.py @@ -0,0 +1,221 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 998 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ViTTiny as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 4096, + "sequence_length": 'auto', + # train setting + "batch_size": 4, + "num_workers": 8, + "total_steps": 50000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 50000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 4096, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 768, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "ablation_token_dim_4096", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = 
train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + 
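+# Editor's note: in this script the main process checkpoints the model and calls generate() every
+# `save_every` steps; generate() samples parameters, saves them via train_set.save_params to
+# Dataset.generated_path, and launches Dataset.test_command in a background thread for evaluation.
+# A minimal offline-usage sketch (an assumption, not part of the commit; it presumes a checkpoint
+# produced by this script already exists at the path built from checkpoint_save_path and tag):
+#   state = torch.load("./checkpoint/ablation_token_dim_4096.pth", map_location="cpu")
+#   accelerator.unwrap_model(model).load_state_dict(state)
+#   generate(save_path=config["generated_path"], need_test=False)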
+ + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/ablation/token_dim/dim_4096.sh b/workspace/ablation/token_dim/dim_4096.sh new file mode 100644 index 0000000000000000000000000000000000000000..056dc3e06fa13c189c2acec53fa2e0344b3257ad --- /dev/null +++ b/workspace/ablation/token_dim/dim_4096.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + dim_4096.py \ No newline at end of file diff --git a/workspace/ablation/token_dim/vitbase_2048.py b/workspace/ablation/token_dim/vitbase_2048.py new file mode 100644 index 0000000000000000000000000000000000000000..2f82a1d4dddd7ca236c57fe35318f60a64d82587 --- /dev/null +++ b/workspace/ablation/token_dim/vitbase_2048.py @@ -0,0 +1,221 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 997 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ViTBase as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 2048, + "sequence_length": 'auto', + # train setting + "batch_size": 4, + "num_workers": 8, + "total_steps": 80000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 80000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 2048, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 1024, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "ablation_vitbase_dim_2048", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", 
train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def 
generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/ablation/token_dim/vitbase_2048.sh b/workspace/ablation/token_dim/vitbase_2048.sh new file mode 100644 index 0000000000000000000000000000000000000000..2556428988c372b33e39c01035b77b77636d81f0 --- /dev/null +++ b/workspace/ablation/token_dim/vitbase_2048.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + vitbase_2048.py \ No newline at end of file diff --git a/workspace/ablation/token_dim/vitbase_4096.py b/workspace/ablation/token_dim/vitbase_4096.py new file mode 100644 index 0000000000000000000000000000000000000000..39b4bf3b5ce131e3e82d9a87ac99057c2c44fee3 --- /dev/null +++ b/workspace/ablation/token_dim/vitbase_4096.py @@ -0,0 +1,221 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 997 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ViTBase as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 4096, + "sequence_length": 'auto', + # train setting + "batch_size": 4, + "num_workers": 4, + "total_steps": 80000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 80000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 4096, + "d_state": 128, + "d_conv": 4, + "expand": 2, + 
"num_layers": 2, + # diffusion config + "diffusion_batch": 768, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "ablation_vitbase_dim_4096", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] 
== 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/ablation/token_dim/vitbase_4096.sh b/workspace/ablation/token_dim/vitbase_4096.sh new file mode 100644 index 0000000000000000000000000000000000000000..c446cb92f77205a7aec93bfea8e5563c8a87cd75 --- /dev/null +++ b/workspace/ablation/token_dim/vitbase_4096.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + vitbase_4096.py \ No newline at end of file diff --git a/workspace/ablation/token_dim/vitbase_8192.py b/workspace/ablation/token_dim/vitbase_8192.py new file mode 100644 index 0000000000000000000000000000000000000000..ea63cf1e476447eddf8e8a69f13158f81c490428 --- /dev/null +++ b/workspace/ablation/token_dim/vitbase_8192.py @@ -0,0 +1,221 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 997 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ViTBase as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 4, + "num_workers": 8, + "total_steps": 80000, + "learning_rate": 
0.00003, + "weight_decay": 0.0, + "save_every": 80000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "ablation_vitbase_dim_8192", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with 
accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/ablation/token_dim/vitbase_8192.sh b/workspace/ablation/token_dim/vitbase_8192.sh new file mode 100644 index 0000000000000000000000000000000000000000..a9a222e99508bb5d2a1c67f414582902591c9354 --- /dev/null +++ b/workspace/ablation/token_dim/vitbase_8192.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + vitbase_8192.py \ No newline at end of file diff --git a/workspace/ablation/token_dim/vitsmall_16384.py b/workspace/ablation/token_dim/vitsmall_16384.py new file mode 100644 index 0000000000000000000000000000000000000000..4b57cffe6d752fa626b9acd39fbd7570bd842cfb --- /dev/null +++ b/workspace/ablation/token_dim/vitsmall_16384.py @@ -0,0 +1,254 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 1003 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from mamba_ssm import Mamba2 as Mamba +from model import MambaDiffusion as Model 
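+# Editor's note: Mamba2 (aliased as Mamba above) is imported here because this 16384-dim ablation
+# later replaces model.model with the two-block VaryMambaModel defined below, which chains a
+# d_model=12288 block into a post_d_model=16384 block via a bias-free in_proj linear layer.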
+from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ViTSmall as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 16384, + "sequence_length": 'auto', + # train setting + "batch_size": 2, + "num_workers": 4, + "total_steps": 80000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 80000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": 'auto', + # mamba config + "d_condition": 1, + "d_model": 12288, + "post_d_model": 16384, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 384, + "layer_channels": [1, 64, 96, 64, 1], + "model_dim": 16384, + "condition_dim": 16384, + "kernel_size": 7, + "sample_mode": DDIMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "ablation_vitsmall_dim_16384", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model +class VaryMambaModel(nn.Module): + config = {} + def __init__(self, positional_embedding): + super().__init__() + mamba1 = Mamba(d_model=config["model_config"]["d_model"], + d_state=config["model_config"]["d_state"], + d_conv=config["model_config"]["d_conv"], + expand=config["model_config"]["expand"]) + mamba2 = Mamba(d_model=config["model_config"]["post_d_model"], + d_state=config["model_config"]["d_state"], + d_conv=config["model_config"]["d_conv"], + expand=config["model_config"]["expand"]) + mamba2.in_proj = nn.Linear(mamba1.out_proj.out_features, mamba2.in_proj.out_features, bias=False) + self.mamba_forward = nn.Sequential(*[mamba1, mamba2]) + pe = positional_embedding[None, :, :] + if 
self.config.get("trainable_pe"): + self.pe = nn.Parameter(pe) + else: # fixed positional embedding + self.register_buffer("pe", pe) + def forward(self, output_shape, condition=None): + x = self.mamba_forward(self.pe.repeat(output_shape[0], 1, 1) + condition) + return x +VaryMambaModel.config = config["model_config"] +model.model = VaryMambaModel( + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # update mamba model +torch.cuda.empty_cache() + + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git 
a/workspace/ablation/token_dim/vitsmall_16384.sh b/workspace/ablation/token_dim/vitsmall_16384.sh new file mode 100644 index 0000000000000000000000000000000000000000..7c08846bedb2f23e32e3ec25cdd7630b531beb41 --- /dev/null +++ b/workspace/ablation/token_dim/vitsmall_16384.sh @@ -0,0 +1,9 @@ +accelerate launch \ + --main_process_port=12345 \ + --multi_gpu \ + --num_processes=4 \ + --gpu_ids='1,2,3,4' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + vitsmall_16384.py \ No newline at end of file diff --git a/workspace/ablation/token_dim/vitsmall_2048.py b/workspace/ablation/token_dim/vitsmall_2048.py new file mode 100644 index 0000000000000000000000000000000000000000..63648fab2b2f49be2428b72ee93bab02bb7132cf --- /dev/null +++ b/workspace/ablation/token_dim/vitsmall_2048.py @@ -0,0 +1,221 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 997 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ViTSmall as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 2048, + "sequence_length": 'auto', + # train setting + "batch_size": 4, + "num_workers": 8, + "total_steps": 80000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 80000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 2048, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 2048, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "ablation_vitsmall_dim_2048", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = 
train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = 
model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/ablation/token_dim/vitsmall_2048.sh b/workspace/ablation/token_dim/vitsmall_2048.sh new file mode 100644 index 0000000000000000000000000000000000000000..e07f210d92005b7bcc5904b3b1ebd1e994bea9bd --- /dev/null +++ b/workspace/ablation/token_dim/vitsmall_2048.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + vitsmall_2048.py \ No newline at end of file diff --git a/workspace/ablation/token_dim/vitsmall_4096.py b/workspace/ablation/token_dim/vitsmall_4096.py new file mode 100644 index 0000000000000000000000000000000000000000..f00b7afc5275fadbe69d44547975d2073e1f8eb8 --- /dev/null +++ b/workspace/ablation/token_dim/vitsmall_4096.py @@ -0,0 +1,221 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 997 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ViTSmall as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 4096, + "sequence_length": 'auto', + # train setting + "batch_size": 4, + "num_workers": 8, + "total_steps": 80000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 80000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 4096, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 1024, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + 
"condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "ablation_vitsmall_dim_4096", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and 
accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/ablation/token_dim/vitsmall_4096.sh b/workspace/ablation/token_dim/vitsmall_4096.sh new file mode 100644 index 0000000000000000000000000000000000000000..56f280a61b9b932fb6a46072be4f626354763a4d --- /dev/null +++ b/workspace/ablation/token_dim/vitsmall_4096.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + vitsmall_4096.py \ No newline at end of file diff --git a/workspace/ablation/trainable_pe/onedim.py b/workspace/ablation/trainable_pe/onedim.py new file mode 100644 index 0000000000000000000000000000000000000000..dfeb0292af615ba2303a6030e7bd7e91f9e8632c --- /dev/null +++ b/workspace/ablation/trainable_pe/onedim.py @@ -0,0 +1,222 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ViTTiny as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 4, + "num_workers": 8, + "total_steps": 50000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 50000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + 
"checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "ablation_onedim_pe", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"], + pe_granularity=1) # 1: 1d embedding +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, 
permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/ablation/trainable_pe/onedim.sh b/workspace/ablation/trainable_pe/onedim.sh new file mode 100644 index 0000000000000000000000000000000000000000..7fe10365d8f09cca8f0ad2bec8a93e2d32dcdbd1 --- /dev/null +++ b/workspace/ablation/trainable_pe/onedim.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + onedim.py \ No newline at end of file diff --git a/workspace/ablation/trainable_pe/trainable.py b/workspace/ablation/trainable_pe/trainable.py new file mode 100644 index 0000000000000000000000000000000000000000..874685e0d606b9890b57a38dc797cc88fa411311 --- /dev/null +++ b/workspace/ablation/trainable_pe/trainable.py @@ -0,0 +1,222 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from 
accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ViTTiny as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 4, + "num_workers": 8, + "total_steps": 50000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 50000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "trainable_pe": True, + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "ablation_trainable_pe", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", 
name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/ablation/trainable_pe/trainable.sh b/workspace/ablation/trainable_pe/trainable.sh new file mode 100644 index 0000000000000000000000000000000000000000..f7a3ac211bc242501752dd00c88b1e1dd2d2e7ea --- /dev/null +++ b/workspace/ablation/trainable_pe/trainable.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + trainable.py \ No newline at end of file diff --git a/workspace/classinput/generalization.py b/workspace/classinput/generalization.py new file mode 100644 index 0000000000000000000000000000000000000000..90f5a396f9295fc28a668f9ce35adec41538c971 --- /dev/null +++ b/workspace/classinput/generalization.py @@ -0,0 +1,224 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from bitsandbytes import optim +from model import ClassConditionMambaDiffusion as Model +from 
model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ClassInput_ViTTiny +from torch.utils.data import DataLoader + + +class ClassInput_ViTTiny_Dataset(ClassInput_ViTTiny): + data_path = "./dataset/condition_classinput_inference/checkpoint_test" + generated_path = "./workspace/classinput/generated.pth" + test_command = f"python ./dataset/condition_classinput_inference/test.py " + + + + +config = { + # dataset setting + "dataset": None, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 16, + "num_workers": 16, + "total_steps": 120000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 120000//50, + "print_every": 50, + "autocast": lambda i: 5000 < i < 90000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": ClassInput_ViTTiny_Dataset.generated_path, + "test_command": ClassInput_ViTTiny_Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1024, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "generalization", +} + + + + +# Data +print('==> Preparing data..') +train_set = ClassInput_ViTTiny_Dataset(dim_per_token=config["dim_per_token"]) +test_set = ClassInput_ViTTiny_Dataset(dim_per_token=config["dim_per_token"]) +# sample = train_set[0][0] +print("checkpoint number:", train_set.real_length) +# print("input shape:", sample.shape) +# print("useful ratio:", torch.where(torch.isnan(sample), 0., 1.).mean()) +# mask = torch.where(torch.isnan(sample), torch.nan, 1.) 
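+# NOTE: the sample/mask prints above are commented out, but the generate() function at the
+# bottom of this file still references `mask` and `accelerator`, so calling it from this module
+# alone would raise a NameError. In practice workspace/classinput/generate.py imports this
+# module, loads ./checkpoint/generalization.pth, and computes the generated norm without the mask.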
+if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +# train_loader = DataLoader( +# dataset=train_set, +# batch_size=config["batch_size"], +# num_workers=config["num_workers"], +# persistent_workers=True, +# drop_last=True, +# shuffle=True, +# ) +# +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"], + ), # positional_embedding +) # model setting is in model +# +# # Optimizer +# print('==> Building optimizer..') +# optimizer = optim.AdamW8bit( +# params=model.parameters(), +# lr=config["learning_rate"], +# weight_decay=config["weight_decay"], +# ) # optimizer +# scheduler = CosineAnnealingLR( +# optimizer=optimizer, +# T_max=config["total_steps"], +# ) # scheduler +# +# # accelerator +# if __name__ == "__main__": +# kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) +# accelerator = Accelerator(kwargs_handlers=[kwargs,]) +# model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) +# +# +# # wandb +# if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: +# wandb.login(key=additional_config["wandb_api_key"]) +# wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +# print('==> Defining training..') +# def train(): +# if not USE_WANDB: +# train_loss = 0 +# this_steps = 0 +# print("==> Start training..") +# model.train() +# for batch_idx, (param, condition) in enumerate(train_loader): +# optimizer.zero_grad() +# # train +# # noinspection PyArgumentList +# with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): +# loss = model( +# output_shape=param.shape, +# x_0=param, +# condition=condition, +# permutation_state=None, +# ) +# accelerator.backward(loss) +# optimizer.step() +# if accelerator.is_main_process: +# scheduler.step() +# # to logging losses and print and save +# if USE_WANDB and accelerator.is_main_process: +# wandb.log({"train_loss": loss.item()}) +# elif USE_WANDB: +# pass # don't print +# else: # not use wandb +# train_loss += loss.item() +# this_steps += 1 +# if this_steps % config["print_every"] == 0: +# print('Loss: %.6f' % (train_loss/this_steps)) +# this_steps = 0 +# train_loss = 0 +# if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: +# os.makedirs(config["checkpoint_save_path"], exist_ok=True) +# state = accelerator.unwrap_model(model).state_dict() +# torch.save(state, os.path.join(config["checkpoint_save_path"], +# f"{__file__.split('/')[-1].split('.')[0]}.pth")) +# generate(save_path=config["generated_path"], need_test=True) +# if batch_idx >= config["total_steps"]: +# break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> 
Generating..") + model.eval() + _, condition = test_set[random.randint(0, len(test_set)-1)] + class_index = str(int("".join([str(int(i)) for i in condition]), 2)).zfill(4) + with torch.no_grad(): + prediction = model(sample=True, condition=condition[None], permutation_state=False) + generated_norm = torch.nanmean((prediction.cpu() * mask).abs()) + print("Generated_norm:", generated_norm.item()) + if USE_WANDB and accelerator.is_main_process: + wandb.log({"generated_norm": generated_norm.item()}) + if accelerator.is_main_process: + train_set.save_params(prediction, save_path=save_path.format(class_index)) + if need_test: + start_new_thread(os.system, (config["test_command"].format(class_index),)) + model.train() + return prediction + + + + +# if __name__ == '__main__': +# train() +# del train_loader # deal problems by dataloader +# print("Finished Training!") +# exit(0) diff --git a/workspace/classinput/generate.py b/workspace/classinput/generate.py new file mode 100644 index 0000000000000000000000000000000000000000..fe6a39eda41c0d860613d9a866fc3818d5e5d3fd --- /dev/null +++ b/workspace/classinput/generate.py @@ -0,0 +1,63 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) + +# torch +import torch +import random +from torch import nn +# father +from workspace.classinput import generalization as item +train_set = item.train_set +test_set = item.test_set +train_set.set_infinite_dataset(max_num=train_set.real_length) +print("num_generated:", test_set.real_length) +config = item.config +model = item.model +assert config.get("tag") is not None, "Remember to set a tag." + + +# Model +print('==> Building model..') +diction = torch.load("./checkpoint/generalization.pth") +permutation_shape = diction["to_permutation_state.weight"].shape +model.to_permutation_state = nn.Embedding(*permutation_shape) +model.load_state_dict(diction) +model = model.cuda() + + +# generate +print('==> Defining generate..') +def generate(save_path, embedding, real_embedding, need_test=True): + class_index = str(int("".join([str(int(i)) for i in real_embedding]), 2)).zfill(4) + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True, condition=embedding[None], permutation_state=False) + generated_norm = torch.nanmean((prediction.cpu()).abs()) + print("Generated_norm:", generated_norm.item()) + train_set.save_params(prediction, save_path=save_path.format(class_index)) + print("Saved to:", save_path.format(class_index)) + if need_test: + test_command = os.path.join(test_set.test_command + save_path.format(class_index)) + os.system(test_command) + model.train() + return prediction + + + + +# if __name__ == "__main__": +# for i in range(config["num_generated"]): +# if config["specific_item"] is not None: +# assert isinstance(config["specific_item"], int) +# i = config["specific_item"] +# print(f"generate index {i}\n") +# print("Save to", config["generated_path"].format(config["tag"], "class####")) +# generate( +# save_path=config["generated_path"], +# test_command=config["test_command"], +# need_test=config["need_test"], +# index=random.randint(0, len(train_set)-1) if config["specific_item"] is None else i, +# ) diff --git a/workspace/classinput/launch.py b/workspace/classinput/launch.py new file mode 100644 index 0000000000000000000000000000000000000000..874718e6b6bd8fad5dfef0ad2dd932f90c797985 --- /dev/null +++ b/workspace/classinput/launch.py @@ -0,0 
+1,28 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) + + +from workspace.classinput.generate import generate +from workspace.classinput.qwen25llm import get_embedding +import torch +import time + + + + +while True: + time.sleep(0.5) + save_name = "./workspace/classinput/generated_class{}.pth" + print("\n\n\n==================================================================================") + print('class includes: ("airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck")') + text_emb = input("Input your description: ") or "Give me a model to select all living things." + real_emb = input("Input your expected class (only for evaluating): ") or "[0,0,1,1,1,1,1,1,0,0]" + # text_emb = "Give me a model to select all living things." + # real_emb = "[0,0,1,1,1,1,1,1,0,0]" + + emb = get_embedding(prompt=text_emb) + emb = torch.tensor(emb, dtype=torch.float) + real_emb = torch.tensor(eval(real_emb), dtype=torch.float) + params = generate(save_path=save_name, embedding=emb, real_embedding=real_emb, need_test=True) diff --git a/workspace/classinput/qwen25llm.py b/workspace/classinput/qwen25llm.py new file mode 100644 index 0000000000000000000000000000000000000000..678cc298b069ae9a5fc243c75352a9903cc19aa6 --- /dev/null +++ b/workspace/classinput/qwen25llm.py @@ -0,0 +1,59 @@ +from transformers import AutoModelForCausalLM, AutoTokenizer + + +model_name = "/home/wangkai/Recurrent-Parameter-Generation/workspace/classinput/Qwen25llm" +model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype="auto", + device_map="auto" +) # load model +tokenizer = AutoTokenizer.from_pretrained(model_name) + + +def describe(prompt, system_prompt): + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": prompt} + ] # construct msgs + text = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True + ) # get text + model_inputs = tokenizer([text], return_tensors="pt").to(model.device) + generated_ids = model.generate( + **model_inputs, + max_new_tokens=512 + ) # generate + generated_ids = [ + output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) + ] # generate + response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] + return response + + +def discriminate(class_name, prompt): + system_prompt = "You are an accurate discriminator. " \ + "You need to determines if the class name matches the description. " \ + "Answer with YES or NO." + keywords = [word for word in prompt.split(" ") + if "select" in word or "classif" in word or "find" in word or "all" in word] + if len(keywords) == 0: + description = prompt + else: # # len(keywords > 0) + description = prompt.rsplit(keywords[-1], 1)[-1] + prompt = f"Does the {class_name} belong to \"{description}\"? \n\nAnswer me with YES or NO." 
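+    # Illustrative trace (example values are assumptions, not from a real run): with
+    # class_name="cat" and prompt="Give me a model to select all living things.",
+    # the last matching keyword is "all", so description becomes " living things."
+    # and the query sent to the LLM reads:
+    #   Does the cat belong to " living things."?  Answer me with YES or NO.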
+ result = describe(prompt, system_prompt) + if "NO" in result or "no" in result or "No" in result: + return False + else: # assert YES in result + return True + + +def get_embedding(prompt): + class_names = ("airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck") + results = [] + for class_name in class_names: + result = discriminate(class_name, prompt) + results.append(result) + return results diff --git a/workspace/compare/generate_only_vae.py b/workspace/compare/generate_only_vae.py new file mode 100644 index 0000000000000000000000000000000000000000..f2e8d8277b01c2c0e6dcc8267139ae297b9a4ff0 --- /dev/null +++ b/workspace/compare/generate_only_vae.py @@ -0,0 +1,70 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) + +# torch +import torch +# father +import importlib +item = importlib.import_module(f"{sys.argv[1]}") +Dataset = item.Dataset +train_set = item.train_set +config = item.config +model = item.model +vae = item.vae +assert config.get("tag") is not None, "Remember to set a tag." + + + + +generate_config = { + "device": "cuda", + "num_generated": 200, + "checkpoint": f"./checkpoint/{config['tag']}.pth", + "generated_path": os.path.join(Dataset.generated_path.rsplit("/", 1)[0], "generated_{}_{}.pth"), + "test_command": os.path.join(Dataset.test_command.rsplit("/", 1)[0], "generated_{}_{}.pth"), + "need_test": True, +} +config.update(generate_config) + + + + +# Model +print('==> Building model..') +diction = torch.load(config["checkpoint"], map_location="cpu") +vae.load_state_dict(diction["vae"]) +model.load_state_dict(diction["diffusion"]) +model = model.to(config["device"]) +vae = vae.to(config["device"]) + + +# generate +print('==> Defining generate..') +def generate(save_path=config["generated_path"], test_command=config["test_command"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.cuda.amp.autocast(True, torch.bfloat16): + with torch.no_grad(): + mu = model(sample=True) + mu = torch.randn_like(mu) + prediction = vae.decode(mu) + generated_norm = torch.nanmean(prediction.abs()) + print("Generated_norm:", generated_norm.item()) + train_set.save_params(prediction, save_path=save_path) + if need_test: + os.system(test_command) + + + + +if __name__ == "__main__": + for i in range(config["num_generated"]): + index = str(i+1).zfill(3) + print("Save to", config["generated_path"].format(config["tag"], index)) + generate( + save_path=config["generated_path"].format(config["tag"], index), + test_command=config["test_command"].format(config["tag"], index), + need_test=config["need_test"], + ) \ No newline at end of file diff --git a/workspace/compare/generate_with_vae.py b/workspace/compare/generate_with_vae.py new file mode 100644 index 0000000000000000000000000000000000000000..b6eddffbd41a668561f872d1d8eeb2178c626ba3 --- /dev/null +++ b/workspace/compare/generate_with_vae.py @@ -0,0 +1,69 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) + +# torch +import torch +# father +import importlib +item = importlib.import_module(f"{sys.argv[1]}") +Dataset = item.Dataset +train_set = item.train_set +config = item.config +model = item.model +vae = item.vae +assert config.get("tag") is not None, "Remember to set a tag." 
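+# Usage sketch (the module path below is an assumption; pass the dotted path of any compare
+# training script that defines Dataset, train_set, config, model and vae):
+#   python ./workspace/compare/generate_with_vae.py workspace.compare.<vae_training_module>
+# Unlike generate_only_vae.py, which decodes random noise through the VAE, this script decodes
+# the diffusion output `mu` directly.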
+ + + + +generate_config = { + "device": "cuda", + "num_generated": 200, + "checkpoint": f"./checkpoint/{config['tag']}.pth", + "generated_path": os.path.join(Dataset.generated_path.rsplit("/", 1)[0], "generated_{}_{}.pth"), + "test_command": os.path.join(Dataset.test_command.rsplit("/", 1)[0], "generated_{}_{}.pth"), + "need_test": True, +} +config.update(generate_config) + + + + +# Model +print('==> Building model..') +diction = torch.load(config["checkpoint"], map_location="cpu") +vae.load_state_dict(diction["vae"]) +model.load_state_dict(diction["diffusion"]) +model = model.to(config["device"]) +vae = vae.to(config["device"]) + + +# generate +print('==> Defining generate..') +def generate(save_path=config["generated_path"], test_command=config["test_command"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.cuda.amp.autocast(True, torch.bfloat16): + with torch.no_grad(): + mu = model(sample=True) + prediction = vae.decode(mu) + generated_norm = torch.nanmean(prediction.abs()) + print("Generated_norm:", generated_norm.item()) + train_set.save_params(prediction, save_path=save_path) + if need_test: + os.system(test_command) + + + + +if __name__ == "__main__": + for i in range(config["num_generated"]): + index = str(i+1).zfill(3) + print("Save to", config["generated_path"].format(config["tag"], index)) + generate( + save_path=config["generated_path"].format(config["tag"], index), + test_command=config["test_command"].format(config["tag"], index), + need_test=config["need_test"], + ) \ No newline at end of file diff --git a/workspace/compare/ours_cnnmedium.py b/workspace/compare/ours_cnnmedium.py new file mode 100644 index 0000000000000000000000000000000000000000..801e6142ba0bcec51018bae69e0b800b098029f5 --- /dev/null +++ b/workspace/compare/ours_cnnmedium.py @@ -0,0 +1,221 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import torch.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import Cifar10_CNNMedium as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 256, + "sequence_length": 'auto', + # train setting + "batch_size": 8, + "num_workers": 16, + "total_steps": 30000, + "learning_rate": 0.0001, + "weight_decay": 0.0, + "save_every": 30000//30, + "print_every": 50, + "autocast": lambda i: 5000 < i < 
45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": 'auto', + # mamba config + "d_condition": 1, + "d_model": 256, + "d_state": 32, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": None, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "compare_ours_cnnmedium", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + 
accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/compare/ours_cnnmedium.sh b/workspace/compare/ours_cnnmedium.sh new file mode 100644 index 0000000000000000000000000000000000000000..980e98a70e097ab7ace722cc888392ca3e3a3d69 --- /dev/null +++ b/workspace/compare/ours_cnnmedium.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + ours_cnnmedium.py \ No newline at end of file diff --git a/workspace/compare/ours_cnnsmall.py b/workspace/compare/ours_cnnsmall.py new file mode 100644 index 0000000000000000000000000000000000000000..2e6ce1a2da3a4d7f2ebb4c551f3ce2915467bfc6 --- /dev/null +++ b/workspace/compare/ours_cnnsmall.py @@ -0,0 +1,221 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import torch.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from 
dataset import Cifar10_CNNSmall as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 256, + "sequence_length": 'auto', + # train setting + "batch_size": 8, + "num_workers": 16, + "total_steps": 30000, + "learning_rate": 0.0001, + "weight_decay": 0.0, + "save_every": 30000//30, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": 'auto', + # mamba config + "d_condition": 1, + "d_model": 256, + "d_state": 32, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": None, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "compare_ours_cnnsmall", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + 
train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/compare/ours_cnnsmall.sh b/workspace/compare/ours_cnnsmall.sh new file mode 100644 index 0000000000000000000000000000000000000000..1716075eb8b544fa7490e31fa45527a89181ed1e --- /dev/null +++ b/workspace/compare/ours_cnnsmall.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + ours_cnnsmall.py \ No newline at end of file diff --git a/workspace/compare/ours_mobilenetv3.py b/workspace/compare/ours_mobilenetv3.py new file mode 100644 index 0000000000000000000000000000000000000000..10295d3a5114d43fa8051054d7113573b5cf5e27 --- /dev/null +++ b/workspace/compare/ours_mobilenetv3.py @@ -0,0 +1,221 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 995 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import torch.optim as optim +from 
torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import Cifar10_MobileNetv3 as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 4096, + "sequence_length": 'auto', + # train setting + "batch_size": 2, + "num_workers": 4, + "total_steps": 100000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 100000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": 'auto', + # mamba config + "d_condition": 1, + "d_model": 4096, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 1024, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "compare_ours_mobilenetv3", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: For the strong sensitivity of MobileNetv3, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, 
train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/compare/ours_mobilenetv3.sh b/workspace/compare/ours_mobilenetv3.sh new file mode 100644 index 0000000000000000000000000000000000000000..e75f1ed96f4f73abe84c16b5538b8a95680ca17f --- /dev/null +++ b/workspace/compare/ours_mobilenetv3.sh @@ -0,0 +1,9 @@ +accelerate launch \ + --main_process_port=12345 \ + --multi_gpu \ + --num_processes=4 \ + --gpu_ids='1,2,3,4' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + ours_mobilenetv3.py \ No newline at end of file diff --git a/workspace/compare/ours_resnet18.py b/workspace/compare/ours_resnet18.py new file mode 100644 index 0000000000000000000000000000000000000000..ec0b7100305b19850c128b64c9f9474e01e73bb2 --- /dev/null +++ b/workspace/compare/ours_resnet18.py @@ -0,0 +1,221 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) 
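+# (cudnn.benchmark is enabled a few lines below, which favours speed over strict run-to-run
+#  determinism; set it to False if bit-exact reproducibility of this baseline is required)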
+torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import torch.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import Cifar10_ResNet18 as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 8, + "num_workers": 16, + "total_steps": 80000, + "learning_rate": 0.00004, + "weight_decay": 0.0, + "save_every": 80000//30, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": 'auto', + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 1024, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "compare_ours_resnet18", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) 
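+# the cosine schedule below anneals the learning rate from config["learning_rate"] toward zero over T_max = total_steps; scheduler.step() is called once per batch on the main process inside train()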
+scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/compare/ours_resnet18.sh b/workspace/compare/ours_resnet18.sh new file mode 100644 index 0000000000000000000000000000000000000000..416c4959c5eb421769745bee8d413f9038c0452f --- /dev/null +++ b/workspace/compare/ours_resnet18.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + ours_resnet18.py \ No newline at end of file diff --git a/workspace/compare/ours_vitbase.py b/workspace/compare/ours_vitbase.py new file mode 100644 index 0000000000000000000000000000000000000000..f6194e93ec7d5a55d5d5345b7f83d7a4f2a8220b --- /dev/null +++ b/workspace/compare/ours_vitbase.py @@ -0,0 +1,270 @@ +import sys, os, 
json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 995 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from mamba_ssm import Mamba2 as Mamba +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import Cifar10_ViTBase as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "resume": False, + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 16384, + "sequence_length": 'auto', + # train setting + "batch_size": 2, + "num_workers": 4, + "total_steps": 100000, + "learning_rate": 0.00001, + "weight_decay": 0.0, + "save_every": 100000//50, + "print_every": 50, + "autocast": lambda i: 5000 < i < 100000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 12288, + "post_d_model": 16384, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 64, 96, 64, 1], + "model_dim": 16384, + "condition_dim": 16384, + "kernel_size": 7, + "sample_mode": DDIMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "compare_ours_vitbase", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# 
Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model +class VaryMambaModel(nn.Module): + config = {} + def __init__(self, positional_embedding): + super().__init__() + mamba1 = Mamba(d_model=config["model_config"]["d_model"], + d_state=config["model_config"]["d_state"], + d_conv=config["model_config"]["d_conv"], + expand=config["model_config"]["expand"]) + mamba2 = Mamba(d_model=config["model_config"]["post_d_model"], + d_state=config["model_config"]["d_state"], + d_conv=config["model_config"]["d_conv"], + expand=config["model_config"]["expand"]) + mamba2.in_proj = nn.Linear(mamba1.out_proj.out_features, mamba2.in_proj.out_features, bias=False) + self.mamba_forward = nn.Sequential(*[mamba1, mamba2]) + pe = positional_embedding[None, :, :] + if self.config.get("trainable_pe"): + self.pe = nn.Parameter(pe) + else: # fixed positional embedding + self.register_buffer("pe", pe) + def forward(self, output_shape, condition=None): + x = self.mamba_forward(self.pe.repeat(output_shape[0], 1, 1) + condition) + return x +VaryMambaModel.config = config["model_config"] +model.model = VaryMambaModel( + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # update mamba model +torch.cuda.empty_cache() + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# load checkpoint +if config["resume"] and os.path.exists(f"./cache_{config['tag']}.pt"): + diction = torch.load(f"./cache_{config['tag']}.pt", map_location="cpu") + model.load_state_dict(diction["model"]) + optimizer.load_state_dict(diction["optimizer"]) + scheduler.load_state_dict(diction["scheduler"]) + start_batch_idx = diction["step"] + 1 +else: # not resume + start_batch_idx = 0 + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + batch_idx += start_batch_idx + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging 
losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + torch.save({ + "model": accelerator.unwrap_model(model).state_dict(), + "optimizer": accelerator.unwrap_model(optimizer).state_dict(), + "scheduler": scheduler.state_dict(), + "step": batch_idx + }, f"./cache_{config['tag']}.pt") + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/compare/ours_vitbase.sh b/workspace/compare/ours_vitbase.sh new file mode 100644 index 0000000000000000000000000000000000000000..204ff5c63d279897f7c7d27e22e9767c3b2137e4 --- /dev/null +++ b/workspace/compare/ours_vitbase.sh @@ -0,0 +1,9 @@ +accelerate launch \ + --main_process_port=12345 \ + --multi_gpu \ + --num_processes=4 \ + --gpu_ids='1,2,3,4' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + ours_vitbase.py \ No newline at end of file diff --git a/workspace/compare/pdiff_cnnmedium_vae.py b/workspace/compare/pdiff_cnnmedium_vae.py new file mode 100644 index 0000000000000000000000000000000000000000..eaeeed31dbeb42ced373c4ac9b8e5611fb2362d4 --- /dev/null +++ b/workspace/compare/pdiff_cnnmedium_vae.py @@ -0,0 +1,256 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import torch.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model.pdiff import PDiff as Model +from model.pdiff import OneDimVAE as VAE +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import 
CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import Cifar10_CNNMedium as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "sequence_length": 'auto', + # train setting + "batch_size": 16, + "num_workers": 16, + "total_steps": 40000, + "vae_steps": 10000, + "learning_rate": 0.0001, + "vae_learning_rate": 0.00001, + "weight_decay": 0.0, + "save_every": 40000//5, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + # diffusion config + "layer_channels": [1, 64, 128, 256, 512, 256, 128, 64, 1], + "model_dim": 1024, + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + # vae config + "channels": [64, 256, 384, 512, 64], + }, + "tag": "compare_pdiff_cnnsmedium_vae", +} + + + + +# Data +divide_slice_length = int(2 ** len(config["model_config"]["channels"])) +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=divide_slice_length, + granularity=0, + pe_granularity=0, + fill_value=0.) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].flatten().shape) +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length * divide_slice_length + print(f"sequence length: {config['sequence_length']}") +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model(sequence_length=config["sequence_length"]) # model setting is in model +vae = VAE(d_model=config["model_config"]["channels"], + d_latent=config["model_config"]["model_dim"], + sequence_length=config["sequence_length"], + kernel_size=config["model_config"]["kernel_size"]) + +# Optimizer +print('==> Building optimizer..') +vae_optimizer = optim.AdamW( + params=vae.parameters(), + lr=config["vae_learning_rate"], + weight_decay=config["weight_decay"], +) +optimizer = optim.AdamW( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +vae_scheduler = CosineAnnealingLR( + optimizer=vae_optimizer, + T_max=config["vae_steps"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + vae, model, vae_optimizer, optimizer, train_loader = \ + accelerator.prepare(vae, model, vae_optimizer, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') + +def train_vae(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training vae..") + vae.train() + for batch_idx, (param, _) in enumerate(train_loader): + vae_optimizer.zero_grad() + # 
train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + param = param.flatten(start_dim=1) + loss = vae(x=param) + accelerator.backward(loss) + vae_optimizer.step() + if accelerator.is_main_process: + vae_scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"vae_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx >= config["vae_steps"]: + break + +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> start training..") + model.train() + for batch_idx, (param, _) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + param = param.flatten(start_dim=1) + with torch.no_grad(): + mu, _ = vae.encode(param) + loss = model(x=mu) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = {"diffusion": accelerator.unwrap_model(model).state_dict(), "vae": vae.state_dict()} + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + mu = model(sample=True) + prediction = vae.decode(mu) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + prediction = prediction.view(-1, divide_slice_length) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train_vae() + vae = accelerator.unwrap_model(vae) + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/compare/pdiff_cnnmedium_vae.sh b/workspace/compare/pdiff_cnnmedium_vae.sh new file mode 100644 index 0000000000000000000000000000000000000000..e22117dbe9142dd99f5bb2888aece0f672c95982 --- /dev/null +++ b/workspace/compare/pdiff_cnnmedium_vae.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + pdiff_cnnmedium_vae.py \ No newline at end of file diff --git a/workspace/compare/pdiff_cnnsmall.py b/workspace/compare/pdiff_cnnsmall.py new file mode 100644 index 
0000000000000000000000000000000000000000..1f74fdf53b25c657f7da8331ad43ce478d98c616 --- /dev/null +++ b/workspace/compare/pdiff_cnnsmall.py @@ -0,0 +1,201 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import torch.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model.pdiff import PDiff as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import Cifar10_CNNSmall as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "sequence_length": 'auto', + # train setting + "batch_size": 16, + "num_workers": 16, + "total_steps": 50000, + "learning_rate": 0.0002, + "weight_decay": 0.0, + "save_every": 50000//2, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + # diffusion config + "layer_channels": [1, 64, 128, 256, 512, 256, 128, 64, 1], + "model_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + }, + "tag": "compare_pdiff_cnnsmall", +} + + + + +# Data +divide_slice_length = 64 +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=divide_slice_length, + granularity=0, + pe_granularity=0) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].flatten().shape) +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length * divide_slice_length + print(f"sequence length: {config['sequence_length']}") +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["sequence_length"] +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model(sequence_length=config["sequence_length"]) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + 
kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, _) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + param = param.flatten(start_dim=1) + loss = model(x=param) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + prediction = prediction.view(-1, divide_slice_length) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/compare/pdiff_cnnsmall.sh b/workspace/compare/pdiff_cnnsmall.sh new file mode 100644 index 0000000000000000000000000000000000000000..1f67a511cc4197d229d4181063dd1e9fa829dc95 --- /dev/null +++ b/workspace/compare/pdiff_cnnsmall.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + pdiff_cnnsmall.py \ No newline at end of file diff --git a/workspace/compare/pdiff_resnet18bn_vae.py b/workspace/compare/pdiff_resnet18bn_vae.py new file mode 100644 index 0000000000000000000000000000000000000000..fedab77029e63749fd8daa9ada20fa7f4ce1754c --- /dev/null +++ b/workspace/compare/pdiff_resnet18bn_vae.py @@ -0,0 +1,258 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] 
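+# (as in the other compare scripts, the preamble above pins the working directory to the repository root and reads ./workspace/config.json for shared settings such as the wandb flag)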
+ +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import torch.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model.pdiff import PDiff as Model +from model.pdiff import OneDimVAE as VAE +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import Cifar100_ResNet18BN as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "sequence_length": 'auto', + # train setting + "batch_size": 50, + "num_workers": 25, + "total_steps": 10000, + "vae_steps": 1000, + "learning_rate": 0.0001, + "vae_learning_rate": 0.00002, + "weight_decay": 0.01, + "save_every": 10000//1, + "print_every": 50, + "autocast": lambda i: True, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + # diffusion config + "layer_channels": [1, 64, 128, 256, 512, 256, 128, 64, 1], + "model_dim": 128, + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + # vae config + "channels": [64, 256, 384, 512, 64], + }, + "tag": "compare_pdiff_resnet18bn_vae", +} + + + + +# Data +divide_slice_length = int(2 ** len(config["model_config"]["channels"])) +print('==> Preparing data..') +train_set = config["dataset"]( + dim_per_token=divide_slice_length, + granularity=0, + pe_granularity=0, + fill_value=0. 
+) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].flatten().shape) +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length * divide_slice_length + print(f"sequence length: {config['sequence_length']}") +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model(sequence_length=config["sequence_length"]) # model setting is in model +vae = VAE(d_model=config["model_config"]["channels"], + d_latent=config["model_config"]["model_dim"], + sequence_length=config["sequence_length"], + kernel_size=config["model_config"]["kernel_size"]) + +# Optimizer +print('==> Building optimizer..') +vae_optimizer = optim.AdamW( + params=vae.parameters(), + lr=config["vae_learning_rate"], + weight_decay=config["weight_decay"], +) +optimizer = optim.AdamW( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +vae_scheduler = CosineAnnealingLR( + optimizer=vae_optimizer, + T_max=config["vae_steps"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + vae, model, vae_optimizer, optimizer, train_loader = \ + accelerator.prepare(vae, model, vae_optimizer, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') + +def train_vae(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training vae..") + vae.train() + for batch_idx, (param, _) in enumerate(train_loader): + vae_optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + param = param.flatten(start_dim=1) + loss = vae(x=param, use_var=False, manual_std=0.01, kld_weight=0.01) + accelerator.backward(loss) + vae_optimizer.step() + if accelerator.is_main_process: + vae_scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"vae_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx >= config["vae_steps"]: + break + +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> start training..") + model.train() + for batch_idx, (param, _) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + param = param.flatten(start_dim=1) + with torch.no_grad(): + mu, _ = vae.encode(param) + loss = model(x=mu) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + 
wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = {"diffusion": accelerator.unwrap_model(model).state_dict(), "vae": vae.state_dict()} + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + mu = model(sample=True) + prediction = vae.decode(mu) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + prediction = prediction.view(-1, divide_slice_length) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train_vae() + vae = accelerator.unwrap_model(vae) + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/compare/pdiff_resnet18bn_vae.sh b/workspace/compare/pdiff_resnet18bn_vae.sh new file mode 100644 index 0000000000000000000000000000000000000000..b21df9bb1d8d3c1134f065b90fea03980e0b13ab --- /dev/null +++ b/workspace/compare/pdiff_resnet18bn_vae.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + pdiff_resnet18bn_vae.py \ No newline at end of file diff --git a/workspace/condition/generalization.py b/workspace/condition/generalization.py new file mode 100644 index 0000000000000000000000000000000000000000..f654afd9ddb7864622c0e568cdea467e84a4da3a --- /dev/null +++ b/workspace/condition/generalization.py @@ -0,0 +1,219 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from bitsandbytes import optim +from model import ClassConditionMambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ClassInput_ViTTiny_Train +from dataset import ClassInput_ViTTiny_Test +from torch.utils.data import DataLoader + + + + +config = { + # dataset setting + "dataset": None, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 
16, + "num_workers": 16, + "total_steps": 120000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 120000//50, + "print_every": 50, + "autocast": lambda i: 5000 < i < 90000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": ClassInput_ViTTiny_Test.generated_path, + "test_command": ClassInput_ViTTiny_Test.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1024, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "generalization", +} + + + + +# Data +print('==> Preparing data..') +train_set = ClassInput_ViTTiny_Train(dim_per_token=config["dim_per_token"]) +test_set = ClassInput_ViTTiny_Test(dim_per_token=config["dim_per_token"]) +sample = train_set[0][0] +print("checkpoint number:", train_set.real_length) +print("input shape:", sample.shape) +print("useful ratio:", torch.where(torch.isnan(sample), 0., 1.).mean()) +mask = torch.where(torch.isnan(sample), torch.nan, 1.) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"], + ), # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) # optimizer +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) # scheduler + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, condition) in enumerate(train_loader): + 
optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model( + output_shape=param.shape, + x_0=param, + condition=condition, + permutation_state=None, + ) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], + f"{__file__.split('/')[-1].split('.')[0]}.pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + _, condition = test_set[random.randint(0, len(test_set)-1)] + class_index = str(int("".join([str(int(i)) for i in condition]), 2)).zfill(4) + with torch.no_grad(): + prediction = model(sample=True, condition=condition[None], permutation_state=False) + generated_norm = torch.nanmean((prediction.cpu() * mask).abs()) + print("Generated_norm:", generated_norm.item()) + if USE_WANDB and accelerator.is_main_process: + wandb.log({"generated_norm": generated_norm.item()}) + if accelerator.is_main_process: + train_set.save_params(prediction, save_path=save_path.format(class_index)) + if need_test: + start_new_thread(os.system, (config["test_command"].format(class_index),)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/condition/generalization.sh b/workspace/condition/generalization.sh new file mode 100644 index 0000000000000000000000000000000000000000..1425d48d6f8389baea3e329f2d45b2c9940864cb --- /dev/null +++ b/workspace/condition/generalization.sh @@ -0,0 +1,9 @@ +accelerate launch \ + --main_process_port=12345 \ + --multi_gpu \ + --num_processes=4 \ + --gpu_ids='1,2,3,4' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + generalization.py \ No newline at end of file diff --git a/workspace/condition/generalization_full.py b/workspace/condition/generalization_full.py new file mode 100644 index 0000000000000000000000000000000000000000..5e9bcd2c53a820561ef2b9f94a8b7e80052056db --- /dev/null +++ b/workspace/condition/generalization_full.py @@ -0,0 +1,221 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +from torch.nn import functional as F +from torch.cuda.amp import 
autocast +# model +from bitsandbytes import optim +from model import ClassConditionMambaDiffusionFull as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ClassInput_ViTTiny_Train +from dataset import ClassInput_ViTTiny_Test +from torch.utils.data import DataLoader + + + + +config = { + # dataset setting + "dataset": None, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 16, + "num_workers": 16, + "pre_steps": 0, + "total_steps": 150000, + "learning_rate": 0.00005, + "weight_decay": 5e-6, + "save_every": 150000//50, + "print_every": 50, + "autocast": lambda i: 5000 < i < 190000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": ClassInput_ViTTiny_Test.generated_path, + "test_command": ClassInput_ViTTiny_Test.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1024, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "generalization_full", +} + + + + +# Data +print('==> Preparing data..') +train_set = ClassInput_ViTTiny_Train(dim_per_token=config["dim_per_token"]) +test_set = ClassInput_ViTTiny_Test(dim_per_token=config["dim_per_token"]) +sample = train_set[0][0] +print("checkpoint number:", train_set.real_length) +print("input shape:", sample.shape) +print("useful ratio:", torch.where(torch.isnan(sample), 0., 1.).mean()) +mask = torch.where(torch.isnan(sample), torch.nan, 1.) 
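+# mask holds 1. at positions carrying real parameter values and NaN at padded positions; generate() multiplies predictions by it and uses torch.nanmean so padding never contributes to the reported norm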
+if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"], + ), # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) # optimizer +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) # scheduler + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, condition) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model( + output_shape=param.shape, + x_0=param, + condition=condition, + permutation_state=None, + pre_training=batch_idx < config["pre_steps"] + ) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], + f"{__file__.split('/')[-1].split('.')[0]}.pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + _, condition = 
test_set[random.randint(0, len(test_set)-1)] + class_index = str(int("".join([str(int(i)) for i in condition]), 2)).zfill(4) + with torch.no_grad(): + prediction = model(sample=True, condition=condition[None], permutation_state=False) + generated_norm = torch.nanmean((prediction.cpu() * mask).abs()) + print("Generated_norm:", generated_norm.item()) + if USE_WANDB and accelerator.is_main_process: + wandb.log({"generated_norm": generated_norm.item()}) + if accelerator.is_main_process: + train_set.save_params(prediction, save_path=save_path.format(class_index)) + if need_test: + start_new_thread(os.system, (config["test_command"].format(class_index),)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/condition/generalization_full.sh b/workspace/condition/generalization_full.sh new file mode 100644 index 0000000000000000000000000000000000000000..2ab8a080f96d1e625b35680ce32c81984bc832f6 --- /dev/null +++ b/workspace/condition/generalization_full.sh @@ -0,0 +1,9 @@ +accelerate launch \ + --main_process_port=12345 \ + --multi_gpu \ + --num_processes=4 \ + --gpu_ids='1,2,3,4' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + generalization_full.py \ No newline at end of file diff --git a/workspace/condition/generate_full_seen.log b/workspace/condition/generate_full_seen.log new file mode 100644 index 0000000000000000000000000000000000000000..3156adf06d07a2e79c8dc67ee3a57eb1f300c864 --- /dev/null +++ b/workspace/condition/generate_full_seen.log @@ -0,0 +1,80 @@ +Training: [2, 4, 5, 7] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0497 | Acc: 0.9185 +Training: [2, 4, 5, 6, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0690 | Acc: 0.9005 +Training: [1, 6, 7] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0396 | Acc: 0.9548 +Training: [0, 2, 3, 5, 6, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0528 | Acc: 0.9155 +Training: [1, 2, 5, 6, 7] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0952 | Acc: 0.8464 +Training: [3, 5, 6, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0639 | Acc: 0.9197 +Training: [1, 6, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1675 | Acc: 0.7348 +Training: [0, 2, 4, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0317 | Acc: 0.9588 +Training: [0, 1, 2, 4, 5, 6, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0414 | Acc: 0.9304 +Training: [0, 1, 2, 3, 4, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0624 | Acc: 0.8829 +Training: [0, 1, 2, 3, 4, 5, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0382 | Acc: 0.9483 +Training: [0, 2, 5, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0781 | Acc: 0.8109 +Training: [1, 4, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1056 | Acc: 0.8360 +Training: [0, 5, 6, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1083 | Acc: 
0.8042 +Training: [0, 1, 3, 6, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0526 | Acc: 0.9176 +Training: [1, 3, 5, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1428 | Acc: 0.7666 +Training: [5, 6] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1449 | Acc: 0.8036 +Training: [0, 2, 3, 6, 7] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0719 | Acc: 0.9021 +Training: [1, 2, 3, 5, 6] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0448 | Acc: 0.9332 +Training: [1, 6, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1049 | Acc: 0.8114 \ No newline at end of file diff --git a/workspace/condition/generate_full_unseen.log b/workspace/condition/generate_full_unseen.log new file mode 100644 index 0000000000000000000000000000000000000000..e7887d85c2f28558f397230dcd859e41bc84ec91 --- /dev/null +++ b/workspace/condition/generate_full_unseen.log @@ -0,0 +1,240 @@ +Training: [0, 1, 5, 6, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0455 | Acc: 0.9343 +Training: [0, 3, 5] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0650 | Acc: 0.9175 +Training: [1, 2, 3, 4, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0602 | Acc: 0.9205 +Training: [0, 1, 2, 3, 4, 6, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0279 | Acc: 0.9613 +Training: [1, 2, 3, 4, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0628 | Acc: 0.9193 +Training: [0, 2, 6, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0814 | Acc: 0.8425 +Training: [3, 4, 6, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0721 | Acc: 0.9110 +Training: [1, 5, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0635 | Acc: 0.8664 +Training: [0, 1, 3, 4, 5, 7] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0662 | Acc: 0.9008 +Training: [1, 5, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0673 | Acc: 0.9026 +Training: [1, 3, 4, 5, 6, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0360 | Acc: 0.9599 +Training: [1, 5, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0459 | Acc: 0.9290 +Training: [0, 1, 2, 3, 4, 6, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0537 | Acc: 0.9135 +Training: [2, 6, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1370 | Acc: 0.6953 +Training: [1, 2, 3, 6, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0800 | Acc: 0.8591 +Training: [1, 2, 3, 6, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0739 | Acc: 0.8611 +Training: [3, 4, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1695 | Acc: 0.7421 +Training: [0, 1, 3, 4, 5, 7] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0651 | Acc: 0.8926 +Training: [2, 6, 7, 9] 
+Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1447 | Acc: 0.7203 +Training: [2] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.8010 | Acc: 0.5268 +Training: [1, 3, 4, 5, 6, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0583 | Acc: 0.8792 +Training: [1, 2, 3, 4, 5, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0608 | Acc: 0.9206 +Training: [2, 3, 4, 6, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0502 | Acc: 0.9364 +Training: [0, 1, 3, 4, 5, 7] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0650 | Acc: 0.9072 +Training: [1, 3, 4, 5, 6, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0358 | Acc: 0.9458 +Training: [0, 3, 5] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0989 | Acc: 0.8393 +Training: [0, 3, 5] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0520 | Acc: 0.9465 +Training: [3, 4, 6, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1682 | Acc: 0.6933 +Training: [3, 4, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1311 | Acc: 0.7710 +Training: [2, 3, 4, 6, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0753 | Acc: 0.8546 +Training: [0, 2, 6, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0614 | Acc: 0.9093 +Training: [0, 1, 2, 3, 4, 6, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0617 | Acc: 0.8993 +Training: [0, 2, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0357 | Acc: 0.9622 +Training: [0, 1, 5, 6, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0746 | Acc: 0.8837 +Training: [0, 3, 6, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1019 | Acc: 0.8275 +Training: [1, 2, 3, 6, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0803 | Acc: 0.8574 +Training: [0, 3, 6, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1001 | Acc: 0.8491 +Training: [1, 2, 3, 4, 5, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1222 | Acc: 0.7925 +Training: [1, 4, 5, 6, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0651 | Acc: 0.8949 +Training: [0, 2, 6, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0715 | Acc: 0.8774 +Training: [0, 2, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1068 | Acc: 0.8456 +Training: [2] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.8825 | Acc: 0.5152 +Training: [0, 2, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0335 | Acc: 0.9561 +Training: [1, 4, 5, 6, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0700 | Acc: 0.8773 +Training: [2] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.2259 | Acc: 0.6951 
+Training: [1, 2, 3, 4, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0549 | Acc: 0.9178 +Training: [1, 5, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0574 | Acc: 0.9072 +Training: [0, 3, 6, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0884 | Acc: 0.8698 +Training: [1, 5, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0557 | Acc: 0.9184 +Training: [2, 3, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1302 | Acc: 0.7495 +Training: [3, 4, 6, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0954 | Acc: 0.8493 +Training: [1, 4, 5, 6, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1447 | Acc: 0.8060 +Training: [1, 2, 3, 4, 5, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0700 | Acc: 0.8489 +Training: [3, 4, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1067 | Acc: 0.8084 +Training: [2, 3, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1274 | Acc: 0.7569 +Training: [2, 3, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1094 | Acc: 0.7823 +Training: [1, 5, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0708 | Acc: 0.8649 +Training: [2, 6, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1238 | Acc: 0.7485 +Training: [2, 3, 4, 6, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0796 | Acc: 0.8649 +Training: [0, 1, 5, 6, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0592 | Acc: 0.8846 \ No newline at end of file diff --git a/workspace/condition/generate_seen.log b/workspace/condition/generate_seen.log new file mode 100644 index 0000000000000000000000000000000000000000..e12961b76a539e36d601940ccd062ea9083aca6f --- /dev/null +++ b/workspace/condition/generate_seen.log @@ -0,0 +1,240 @@ +Training: [1, 2, 4, 5, 6, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0520 | Acc: 0.9395 +Training: [3, 4, 6, 7] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0653 | Acc: 0.9089 +Training: [0, 1, 2, 4, 5, 6, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0476 | Acc: 0.9129 +Training: [1, 2, 3, 4, 5, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0325 | Acc: 0.9509 +Training: [2, 5, 7] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0456 | Acc: 0.9326 +Training: [0, 1, 2, 3, 5, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0801 | Acc: 0.8355 +Training: [1, 3, 5, 6, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0241 | Acc: 0.9701 +Training: [1, 3, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0519 | Acc: 0.9288 +Training: [1, 2, 4, 5, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0640 | Acc: 0.8967 +Training: [2, 7, 8, 9] +Files already downloaded and 
verified +Files already downloaded and verified +Loss: 0.0625 | Acc: 0.9145 +Training: [3, 4, 5, 6, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0273 | Acc: 0.9617 +Training: [0, 1, 2, 4] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0508 | Acc: 0.9313 +Training: [1, 5, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0586 | Acc: 0.9221 +Training: [0, 1, 3, 4, 6, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0394 | Acc: 0.9365 +Training: [2, 4, 5, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0918 | Acc: 0.8429 +Training: [0, 1, 2, 6, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0509 | Acc: 0.9292 +Training: [0, 3, 4, 6, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0604 | Acc: 0.8981 +Training: [1, 2, 3, 6, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0773 | Acc: 0.8876 +Training: [0, 1, 2, 5, 7] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0763 | Acc: 0.8777 +Training: [2, 3, 4, 5, 6, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0679 | Acc: 0.8801 +Training: [0, 1, 3, 5, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0404 | Acc: 0.9400 +Training: [0, 2, 3, 5, 6, 7] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0408 | Acc: 0.9176 +Training: [1, 2, 6, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0395 | Acc: 0.9499 +Training: [2, 3, 4, 5, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0811 | Acc: 0.8445 +Training: [0, 5, 6, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0622 | Acc: 0.9209 +Training: [1, 3, 4, 5, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0257 | Acc: 0.9607 +Training: [2, 3, 4] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0449 | Acc: 0.9333 +Training: [0, 2, 3, 5] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0769 | Acc: 0.8801 +Training: [1, 2, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0383 | Acc: 0.9545 +Training: [0, 2, 3, 6, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0437 | Acc: 0.9418 +Training: [0, 1, 3, 4, 6, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0515 | Acc: 0.9201 +Training: [3, 5, 6, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0500 | Acc: 0.9491 +Training: [0, 1, 2, 3, 4, 6, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0344 | Acc: 0.9549 +Training: [3, 4, 7] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0731 | Acc: 0.9239 +Training: [2, 6, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0601 | Acc: 0.9376 +Training: [0, 5, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0552 | Acc: 0.9292 +Training: 
[0, 3, 5, 6] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0421 | Acc: 0.9502 +Training: [0, 1, 3, 6, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0680 | Acc: 0.8910 +Training: [3, 5, 6, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0548 | Acc: 0.9216 +Training: [1, 3, 5, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0548 | Acc: 0.9056 +Training: [0, 2] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0389 | Acc: 0.9474 +Training: [0, 2, 3, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1188 | Acc: 0.8079 +Training: [1, 2, 5, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0810 | Acc: 0.8518 +Training: [0, 6, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1952 | Acc: 0.6692 +Training: [1, 2, 5, 6] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0723 | Acc: 0.9018 +Training: [0, 1, 4, 5, 6, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0387 | Acc: 0.9399 +Training: [0, 1, 3, 5, 6, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0530 | Acc: 0.9045 +Training: [0, 1, 3, 5, 6, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0543 | Acc: 0.9143 +Training: [0, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0420 | Acc: 0.9571 +Training: [1, 2, 3, 4, 5, 6, 7] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0227 | Acc: 0.9725 +Training: [1, 3, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0554 | Acc: 0.9244 +Training: [1, 2, 4, 5, 6, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0732 | Acc: 0.8878 +==> Generating..Training: [0, 2, 3, 5, 6, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0555 | Acc: 0.9061 +Training: [0, 2, 4, 6, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0298 | Acc: 0.9611 +Training: [1, 2, 3, 5, 6, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0555 | Acc: 0.8859 +Training: [0, 1, 2, 3, 5, 6, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0238 | Acc: 0.9637 +Training: [3, 5, 6, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0940 | Acc: 0.8259 +Training: [0, 1, 5, 6, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0814 | Acc: 0.8491 +Training: [2, 6, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0874 | Acc: 0.8427 +Training: [3, 4, 5, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0627 | Acc: 0.9156 \ No newline at end of file diff --git a/workspace/condition/generate_seen.py b/workspace/condition/generate_seen.py new file mode 100644 index 0000000000000000000000000000000000000000..35f02096359409c671745f305baa0128096c4d52 --- /dev/null +++ b/workspace/condition/generate_seen.py @@ -0,0 +1,78 @@ +import sys, os, json +root = os.sep + 
os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) + +# torch +import torch +import random +from torch import nn +# father +from workspace.condition import generalization as item +train_set = item.train_set +test_set = item.test_set +train_set.set_infinite_dataset(max_num=train_set.real_length) +print("num_generated:", test_set.real_length) +config = item.config +model = item.model +assert config.get("tag") is not None, "Remember to set a tag." + + + + +generate_config = { + "device": "cuda", + "num_generated": 20, + "checkpoint": f"./checkpoint/{config['tag']}.pth", + "generated_path": os.path.join(test_set.generated_path.rsplit("/", 1)[0], "generated_{}_{}.pth"), + "test_command": os.path.join(test_set.test_command.rsplit("/", 1)[0], "generated_{}_{}.pth"), + "need_test": True, + "specific_item": None, +} +config.update(generate_config) + + + + +# Model +print('==> Building model..') +diction = torch.load(config["checkpoint"]) +permutation_shape = diction["to_permutation_state.weight"].shape +model.to_permutation_state = nn.Embedding(*permutation_shape) +model.load_state_dict(diction) +model = model.to(config["device"]) + + +# generate +print('==> Defining generate..') +def generate(save_path=config["generated_path"], test_command=config["test_command"], need_test=True, index=None): + print("\n==> Generating..") + model.eval() + _, condition = train_set[index] + class_index = str(int("".join([str(int(i)) for i in condition]), 2)).zfill(4) + with torch.no_grad(): + prediction = model(sample=True, condition=condition[None], permutation_state=False) + generated_norm = torch.nanmean((prediction.cpu()).abs()) + print("Generated_norm:", generated_norm.item()) + train_set.save_params(prediction, save_path=save_path.format(config["tag"], f"class{class_index}")) + if need_test: + os.system(test_command.format(config["tag"], f"class{class_index}")) + model.train() + return prediction + + + + +if __name__ == "__main__": + for i in range(config["num_generated"]): + if config["specific_item"] is not None: + assert isinstance(config["specific_item"], int) + i = config["specific_item"] + print(f"generate index {i}\n") + print("Save to", config["generated_path"].format(config["tag"], "class####")) + generate( + save_path=config["generated_path"], + test_command=config["test_command"], + need_test=config["need_test"], + index=random.randint(0, len(train_set)-1) if config["specific_item"] is None else i, + ) \ No newline at end of file diff --git a/workspace/condition/generate_unseen.log b/workspace/condition/generate_unseen.log new file mode 100644 index 0000000000000000000000000000000000000000..79e887a12f539f1b2d145c56da3b805565b438fa --- /dev/null +++ b/workspace/condition/generate_unseen.log @@ -0,0 +1,240 @@ +Training: [0, 1, 5, 6, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0695 | Acc: 0.8700 +Training: [0, 3, 5] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0794 | Acc: 0.8556 +Training: [1, 2, 3, 4, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0884 | Acc: 0.8293 +Training: [0, 1, 2, 3, 4, 6, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0391 | Acc: 0.9390 +Training: [1, 2, 3, 4, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0656 | Acc: 0.9022 +Training: [0, 2, 6, 7, 9] +Files already 
downloaded and verified +Files already downloaded and verified +Loss: 0.0661 | Acc: 0.8978 +Training: [3, 4, 6, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0750 | Acc: 0.8816 +Training: [1, 5, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0525 | Acc: 0.9362 +Training: [0, 1, 3, 4, 5, 7] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0513 | Acc: 0.9208 +Training: [1, 5, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0474 | Acc: 0.9410 +Training: [1, 3, 4, 5, 6, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0342 | Acc: 0.9609 +Training: [1, 5, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0449 | Acc: 0.9436 +Training: [0, 1, 2, 3, 4, 6, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0389 | Acc: 0.9482 +Training: [2, 6, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1034 | Acc: 0.8244 +Training: [1, 2, 3, 6, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0741 | Acc: 0.8700 +Training: [1, 2, 3, 6, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0663 | Acc: 0.8935 +Training: [3, 4, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0539 | Acc: 0.9285 +Training: [0, 1, 3, 4, 5, 7] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0518 | Acc: 0.9255 +Training: [2, 6, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0761 | Acc: 0.8788 +Training: [2] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0197 | Acc: 0.9795 +Training: [1, 3, 4, 5, 6, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0278 | Acc: 0.9648 +Training: [1, 2, 3, 4, 5, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0244 | Acc: 0.9659 +Training: [2, 3, 4, 6, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0434 | Acc: 0.9503 +Training: [0, 1, 3, 4, 5, 7] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0633 | Acc: 0.8676 +Training: [1, 3, 4, 5, 6, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0303 | Acc: 0.9556 +Training: [0, 3, 5] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0875 | Acc: 0.8182 +Training: [0, 3, 5] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0933 | Acc: 0.8225 +Training: [3, 4, 6, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0760 | Acc: 0.9079 +Training: [3, 4, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0637 | Acc: 0.9138 +Training: [2, 3, 4, 6, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0548 | Acc: 0.9295 +Training: [0, 2, 6, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0641 | Acc: 0.9125 +Training: [0, 1, 2, 3, 4, 6, 9] +Files already downloaded and verified +Files already downloaded and verified 
+Loss: 0.0415 | Acc: 0.9350 +Training: [0, 2, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0749 | Acc: 0.8835 +Training: [0, 1, 5, 6, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0667 | Acc: 0.8931 +Training: [0, 3, 6, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0972 | Acc: 0.8214 +Training: [1, 2, 3, 6, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0608 | Acc: 0.9079 +Training: [0, 3, 6, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0970 | Acc: 0.8236 +Training: [1, 2, 3, 4, 5, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0355 | Acc: 0.9539 +Training: [1, 4, 5, 6, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0852 | Acc: 0.8339 +Training: [0, 2, 6, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0647 | Acc: 0.9112 +Training: [0, 2, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0361 | Acc: 0.9568 +Training: [2] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0450 | Acc: 0.9276 +Training: [0, 2, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0482 | Acc: 0.9248 +Training: [1, 4, 5, 6, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0585 | Acc: 0.9191 +Training: [2] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0258 | Acc: 0.9663 +Training: [1, 2, 3, 4, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0719 | Acc: 0.8984 +Training: [1, 5, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0585 | Acc: 0.9320 +Training: [0, 3, 6, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0795 | Acc: 0.8671 +Training: [1, 5, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0571 | Acc: 0.9260 +Training: [2, 3, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1447 | Acc: 0.7284 +Training: [3, 4, 6, 7, 8, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1039 | Acc: 0.7761 +Training: [1, 4, 5, 6, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0663 | Acc: 0.8976 +Training: [1, 2, 3, 4, 5, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0506 | Acc: 0.9262 +Training: [3, 4, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0836 | Acc: 0.9060 +Training: [2, 3, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0693 | Acc: 0.9015 +Training: [2, 3, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.1451 | Acc: 0.7445 +Training: [1, 5, 7, 8] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0766 | Acc: 0.8574 +Training: [2, 6, 7, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0597 | Acc: 0.9373 +Training: [2, 3, 4, 6, 7, 8] +Files already downloaded and verified +Files already downloaded and 
verified +Loss: 0.0421 | Acc: 0.9448 +Training: [0, 1, 5, 6, 9] +Files already downloaded and verified +Files already downloaded and verified +Loss: 0.0460 | Acc: 0.9400 \ No newline at end of file diff --git a/workspace/condition/generate_unseen.py b/workspace/condition/generate_unseen.py new file mode 100644 index 0000000000000000000000000000000000000000..c070d904677c53ee510ac097ed2fffa5dede9a7a --- /dev/null +++ b/workspace/condition/generate_unseen.py @@ -0,0 +1,75 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) + +# torch +import torch +from torch import nn +# father +from workspace.condition import generalization as item +train_set = item.train_set +test_set = item.test_set +test_set.set_infinite_dataset(max_num=test_set.real_length) +print("num_generated:", test_set.real_length) +config = item.config +model = item.model +assert config.get("tag") is not None, "Remember to set a tag." + + + + +generate_config = { + "device": "cuda", + "checkpoint": f"./checkpoint/{config['tag']}.pth", + "generated_path": os.path.join(test_set.generated_path.rsplit("/", 1)[0], "generated_{}_{}.pth"), + "test_command": os.path.join(test_set.test_command.rsplit("/", 1)[0], "generated_{}_{}.pth"), + "need_test": True, + "specific_item": None, +} +config.update(generate_config) + + + + +# Model +print('==> Building model..') +diction = torch.load(config["checkpoint"]) +permutation_shape = diction["to_permutation_state.weight"].shape +model.to_permutation_state = nn.Embedding(*permutation_shape) +model.load_state_dict(diction) +model = model.to(config["device"]) + + +# generate +print('==> Defining generate..') +def generate(save_path=config["generated_path"], test_command=config["test_command"], need_test=True, index=None): + print("\n==> Generating..") + model.eval() + _, condition = test_set[index] + class_index = str(int("".join([str(int(i)) for i in condition]), 2)).zfill(4) + with torch.no_grad(): + prediction = model(sample=True, condition=condition[None], permutation_state=False) + generated_norm = torch.nanmean((prediction.cpu()).abs()) + print("Generated_norm:", generated_norm.item()) + train_set.save_params(prediction, save_path=save_path.format(config["tag"], f"class{class_index}")) + if need_test: + os.system(test_command.format(config["tag"], f"class{class_index}")) + model.train() + return prediction + + + + +if __name__ == "__main__": + for i in range(len(test_set)): + if config["specific_item"] is not None: + assert isinstance(config["specific_item"], int) + i = config["specific_item"] + print("Save to", config["generated_path"].format(config["tag"], "classXXX")) + generate( + save_path=config["generated_path"], + test_command=config["test_command"], + need_test=config["need_test"], + index=i, + ) \ No newline at end of file diff --git a/workspace/condition/no_permutation_1.py b/workspace/condition/no_permutation_1.py new file mode 100644 index 0000000000000000000000000000000000000000..69f7ea9cc227fb39315d0a01059149ad065f1eee --- /dev/null +++ b/workspace/condition/no_permutation_1.py @@ -0,0 +1,219 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy 
as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import Permutation_ViTTiny as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 8, + "num_workers": 8, + "total_steps": 120000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 120000//30, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": 1, + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "no_permutation_1", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + 
lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=None) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # log the loss, print progress, and save checkpoints + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # wandb disabled + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True, permutation_state=False) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # release the dataloader worker processes before exit + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/condition/no_permutation_1.sh b/workspace/condition/no_permutation_1.sh new file mode 100644 index 0000000000000000000000000000000000000000..3d9bd21853d60f66f615b040bb0a9a04844906b2 --- /dev/null +++ b/workspace/condition/no_permutation_1.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + no_permutation_1.py \ No newline at end of file diff --git a/workspace/condition/no_permutation_10.py b/workspace/condition/no_permutation_10.py new file mode 100644 index 0000000000000000000000000000000000000000..326f7c99471f5ebd10c87cb91554f69fceded200 --- /dev/null +++ b/workspace/condition/no_permutation_10.py @@ -0,0 +1,219 @@ +import sys, os, json +root = os.sep +
os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import Permutation_ViTTiny as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 8, + "num_workers": 8, + "total_steps": 120000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 120000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": 1, + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "no_permutation_10", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + 
sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=None) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # log the loss, print progress, and save checkpoints + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # wandb disabled + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True, permutation_state=False) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # release the dataloader worker processes before exit + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/condition/no_permutation_10.sh b/workspace/condition/no_permutation_10.sh new file mode 100644 index 0000000000000000000000000000000000000000..3e728484ecd5446f46dad7db487ce0bfc39a2bf2 --- /dev/null +++ b/workspace/condition/no_permutation_10.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + no_permutation_10.py \ No newline at end of file diff --git
a/workspace/condition/no_permutation_20.py b/workspace/condition/no_permutation_20.py new file mode 100644 index 0000000000000000000000000000000000000000..cffebc543d276a6152a8e7f39075668c25a622e9 --- /dev/null +++ b/workspace/condition/no_permutation_20.py @@ -0,0 +1,219 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import Permutation_ViTTiny as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 8, + "num_workers": 8, + "total_steps": 120000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 120000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": 1, + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "no_permutation_20", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], 
f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=None) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True, permutation_state=False) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/condition/no_permutation_20.sh b/workspace/condition/no_permutation_20.sh new file mode 100644 index 0000000000000000000000000000000000000000..3611512032713872b6ff2a94bdbb29536504df65 
--- /dev/null +++ b/workspace/condition/no_permutation_20.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + no_permutation_20.py \ No newline at end of file diff --git a/workspace/condition/no_permutation_3.py b/workspace/condition/no_permutation_3.py new file mode 100644 index 0000000000000000000000000000000000000000..71be1eca168c90a0e7c57aaf2851caa166a12a20 --- /dev/null +++ b/workspace/condition/no_permutation_3.py @@ -0,0 +1,219 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import Permutation_ViTTiny as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 8, + "num_workers": 8, + "total_steps": 120000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 120000//30, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": 1, + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "no_permutation_3", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = 
config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=None) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True, permutation_state=False) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by 
dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/condition/no_permutation_3.sh b/workspace/condition/no_permutation_3.sh new file mode 100644 index 0000000000000000000000000000000000000000..6a3446a92fefa7a22154fdc5d77dd1572d18ed27 --- /dev/null +++ b/workspace/condition/no_permutation_3.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + no_permutation_3.py \ No newline at end of file diff --git a/workspace/condition/permutation_1.py b/workspace/condition/permutation_1.py new file mode 100644 index 0000000000000000000000000000000000000000..8897abf2f4abc35a9c71fdb5188826744e102e73 --- /dev/null +++ b/workspace/condition/permutation_1.py @@ -0,0 +1,219 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 995 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import Permutation_ViTTiny as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 8, + "num_workers": 8, + "total_steps": 120000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 120000//30, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": 1, + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "permutation_1", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] 
= train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + 
train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/condition/permutation_1.sh b/workspace/condition/permutation_1.sh new file mode 100644 index 0000000000000000000000000000000000000000..af2279c1c0b1f17680eb8718f2f5cabbff729f67 --- /dev/null +++ b/workspace/condition/permutation_1.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + permutation_1.py \ No newline at end of file diff --git a/workspace/condition/permutation_10.py b/workspace/condition/permutation_10.py new file mode 100644 index 0000000000000000000000000000000000000000..35faaefb385624c8e5423fe05cd43a5c43ad7e4f --- /dev/null +++ b/workspace/condition/permutation_10.py @@ -0,0 +1,219 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import Permutation_ViTTiny as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 8, + "num_workers": 8, + "total_steps": 120000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 120000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": 10, + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "permutation_10", +} + + + + +# Data +print('==> Preparing data..') +train_set = 
config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> 
Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/condition/permutation_10.sh b/workspace/condition/permutation_10.sh new file mode 100644 index 0000000000000000000000000000000000000000..0b264ff3c0c7b8ed2a8ebd55b9843d80905d6c51 --- /dev/null +++ b/workspace/condition/permutation_10.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + permutation_10.py \ No newline at end of file diff --git a/workspace/condition/permutation_20.py b/workspace/condition/permutation_20.py new file mode 100644 index 0000000000000000000000000000000000000000..bd21fc699df70aa0a82046565668409ae187f741 --- /dev/null +++ b/workspace/condition/permutation_20.py @@ -0,0 +1,219 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import Permutation_ViTTiny as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 8, + "num_workers": 8, + "total_steps": 120000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 120000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": 20, + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + 
"model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "permutation_20", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, 
os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/condition/permutation_20.sh b/workspace/condition/permutation_20.sh new file mode 100644 index 0000000000000000000000000000000000000000..70bacd4b518a7ea59dc6ecf4138b28715aa90a99 --- /dev/null +++ b/workspace/condition/permutation_20.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + permutation_20.py \ No newline at end of file diff --git a/workspace/condition/permutation_3.py b/workspace/condition/permutation_3.py new file mode 100644 index 0000000000000000000000000000000000000000..d0bcf241907bf4ed09423c12d8e8f82ddc2ed55b --- /dev/null +++ b/workspace/condition/permutation_3.py @@ -0,0 +1,219 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 995 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import Permutation_ViTTiny as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 8, + "num_workers": 8, + "total_steps": 120000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 120000//30, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + 
"model_config": { + "num_permutation": 3, + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "permutation_3", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + 
this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/condition/permutation_3.sh b/workspace/condition/permutation_3.sh new file mode 100644 index 0000000000000000000000000000000000000000..79026a7e7079e2485304628e1af5320be2be730f --- /dev/null +++ b/workspace/condition/permutation_3.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + permutation_3.py \ No newline at end of file diff --git a/workspace/config.json b/workspace/config.json new file mode 100644 index 0000000000000000000000000000000000000000..9cfb30cae24c3a9bd4bec118b852a3244f660ace --- /dev/null +++ b/workspace/config.json @@ -0,0 +1 @@ +{"use_wandb": true, "wandb_api_key": "your_wandb_key", "test_gpu_ids": "0"} \ No newline at end of file diff --git a/workspace/downtask/detection.py b/workspace/downtask/detection.py new file mode 100644 index 0000000000000000000000000000000000000000..9831a06a0e5f7ad827dce4f7ff565d0d0bd7b2e0 --- /dev/null +++ b/workspace/downtask/detection.py @@ -0,0 +1,271 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 995 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import _thread +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from mamba_ssm import Mamba2 as Mamba +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import CocoDetection as Dataset +from 
torch.utils.data import DataLoader + + + + +config = { + "resume": False, + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 16384, + "sequence_length": 'auto', + # train setting + "batch_size": 2, + "num_workers": 4, + "total_steps": 120000, + "learning_rate": 0.00001, + "weight_decay": 0.0, + "save_every": 120000//30, + "print_every": 50, + "autocast": lambda i: 5000 < i < 100000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 12288, + "post_d_model": 16384, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 64, 96, 64, 1], + "model_dim": 16384, + "condition_dim": 16384, + "kernel_size": 7, + "sample_mode": DDIMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "downtask_detection", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model +class VaryMambaModel(nn.Module): + config = {} + def __init__(self, positional_embedding): + super().__init__() + mamba1 = Mamba(d_model=config["model_config"]["d_model"], + d_state=config["model_config"]["d_state"], + d_conv=config["model_config"]["d_conv"], + expand=config["model_config"]["expand"]) + mamba2 = Mamba(d_model=config["model_config"]["post_d_model"], + d_state=config["model_config"]["d_state"], + d_conv=config["model_config"]["d_conv"], + expand=config["model_config"]["expand"]) + mamba2.in_proj = nn.Linear(mamba1.out_proj.out_features, mamba2.in_proj.out_features, bias=False) + self.mamba_forward = nn.Sequential(*[mamba1, mamba2]) + pe = positional_embedding[None, :, :] + if self.config.get("trainable_pe"): + self.pe = nn.Parameter(pe) + else: # fixed positional embedding + self.register_buffer("pe", pe) + def forward(self, output_shape, condition=None): + x = self.mamba_forward(self.pe.repeat(output_shape[0], 1, 1) + condition) + return x +VaryMambaModel.config = config["model_config"] +model.model = VaryMambaModel( + 
positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # update mamba model +torch.cuda.empty_cache() + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# load checkpoint +if config["resume"] and os.path.exists(f"./cache_{config['tag']}.pt"): + diction = torch.load(f"./cache_{config['tag']}.pt", map_location="cpu") + model.load_state_dict(diction["model"]) + optimizer.load_state_dict(diction["optimizer"]) + scheduler.load_state_dict(diction["scheduler"]) + start_batch_idx = diction["step"] + 1 +else: # not resume + start_batch_idx = 0 + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + batch_idx += start_batch_idx + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + torch.save({ + "model": accelerator.unwrap_model(model).state_dict(), + "optimizer": accelerator.unwrap_model(optimizer).state_dict(), + "scheduler": scheduler.state_dict(), + "step": batch_idx + }, f"./cache_{config['tag']}.pt") + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path)
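+ # save_params writes the generated parameters back to the dataset's generated_path; the test command below runs in a background thread so training is not blocked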
+ if need_test: + _thread.start_new_thread(os.system, (config["test_command"],)) # not stuck here + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) diff --git a/workspace/downtask/detection.sh b/workspace/downtask/detection.sh new file mode 100644 index 0000000000000000000000000000000000000000..4af068fcd2bd1be938433b3eda2705b54bfc6c5f --- /dev/null +++ b/workspace/downtask/detection.sh @@ -0,0 +1,9 @@ +accelerate launch \ + --main_process_port=12345 \ + --multi_gpu \ + --num_processes=4 \ + --gpu_ids='1,2,3,4' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + detection.py \ No newline at end of file diff --git a/workspace/downtask/dora_r16.py b/workspace/downtask/dora_r16.py new file mode 100644 index 0000000000000000000000000000000000000000..e5dc5d595d9fce54cb60ad058e5d3a8bada3ab83 --- /dev/null +++ b/workspace/downtask/dora_r16.py @@ -0,0 +1,221 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import DoRACommonSenseReasoningR16 as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 8, + "num_workers": 16, + "total_steps": 100000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 100000//50, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": "echo ignore_test", + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "downtask_dora_r16", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) 
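+# the dataset serves each trained DoRA checkpoint as a flat sequence of dim_per_token-sized tokens; the "auto" config entries below are resolved from it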
+print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if 
batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/downtask/dora_r16.sh b/workspace/downtask/dora_r16.sh new file mode 100644 index 0000000000000000000000000000000000000000..2f78936e616788a1931a5b5484586caf0ce303a2 --- /dev/null +++ b/workspace/downtask/dora_r16.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + dora_r16.py \ No newline at end of file diff --git a/workspace/downtask/dora_r4.py b/workspace/downtask/dora_r4.py new file mode 100644 index 0000000000000000000000000000000000000000..92f515b83a08921a7f92bb3a897606a0225e8929 --- /dev/null +++ b/workspace/downtask/dora_r4.py @@ -0,0 +1,221 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import DoRACommonSenseReasoningR4 as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 8, + "num_workers": 16, + "total_steps": 80000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 80000//50, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": "echo ignore_test", + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config 
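+ # layer_channels are the channel widths of the diffusion denoiser; sampling uses DDPMSampler with beta range (0.0001, 0.02) over T=1000 steps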
+ "diffusion_batch": 512, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "downtask_dora_r4", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % 
(train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/downtask/dora_r4.sh b/workspace/downtask/dora_r4.sh new file mode 100644 index 0000000000000000000000000000000000000000..2ae8348af2f96dc36bafe69b6bb96f757fd4a26a --- /dev/null +++ b/workspace/downtask/dora_r4.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + dora_r4.py \ No newline at end of file diff --git a/workspace/downtask/dora_r64.py b/workspace/downtask/dora_r64.py new file mode 100644 index 0000000000000000000000000000000000000000..a77d8c9a69e43d3c86ec0ecaf00295ab1be621b5 --- /dev/null +++ b/workspace/downtask/dora_r64.py @@ -0,0 +1,270 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 995 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from mamba_ssm import Mamba2 as Mamba +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import DoRACommonSenseReasoningR64 as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "resume": False, + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 16384, + "sequence_length": 'auto', + # train setting + "batch_size": 2, + "num_workers": 4, + "total_steps": 120000, + "learning_rate": 0.00001, + "weight_decay": 0.0, + "save_every": 
120000//30, + "print_every": 50, + "autocast": lambda i: 5000 < i < 100000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": "echo ignore_test", + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 12288, + "post_d_model": 16384, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 64, 96, 64, 1], + "model_dim": 16384, + "condition_dim": 16384, + "kernel_size": 7, + "sample_mode": DDIMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "downtask_dora_r64", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model +class VaryMambaModel(nn.Module): + config = {} + def __init__(self, positional_embedding): + super().__init__() + mamba1 = Mamba(d_model=config["model_config"]["d_model"], + d_state=config["model_config"]["d_state"], + d_conv=config["model_config"]["d_conv"], + expand=config["model_config"]["expand"]) + mamba2 = Mamba(d_model=config["model_config"]["post_d_model"], + d_state=config["model_config"]["d_state"], + d_conv=config["model_config"]["d_conv"], + expand=config["model_config"]["expand"]) + mamba2.in_proj = nn.Linear(mamba1.out_proj.out_features, mamba2.in_proj.out_features, bias=False) + self.mamba_forward = nn.Sequential(*[mamba1, mamba2]) + pe = positional_embedding[None, :, :] + if self.config.get("trainable_pe"): + self.pe = nn.Parameter(pe) + else: # fixed positional embedding + self.register_buffer("pe", pe) + def forward(self, output_shape, condition=None): + x = self.mamba_forward(self.pe.repeat(output_shape[0], 1, 1) + condition) + return x +VaryMambaModel.config = config["model_config"] +model.model = VaryMambaModel( + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # update mamba model +torch.cuda.empty_cache() + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + 
weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# load checkpoint +if config["resume"] and os.path.exists(f"./cache_{config['tag']}.pt"): + diction = torch.load(f"./cache_{config['tag']}.pt", map_location="cpu") + model.load_state_dict(diction["model"]) + optimizer.load_state_dict(diction["optimizer"]) + scheduler.load_state_dict(diction["scheduler"]) + start_batch_idx = diction["step"] + 1 +else: # not resume + start_batch_idx = 0 + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + batch_idx += start_batch_idx + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + torch.save({ + "model": accelerator.unwrap_model(model).state_dict(), + "optimizer": accelerator.unwrap_model(optimizer).state_dict(), + "scheduler": scheduler.state_dict(), + "step": batch_idx + }, f"./cache_{config['tag']}.pt") + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/downtask/dora_r64.sh 
b/workspace/downtask/dora_r64.sh new file mode 100644 index 0000000000000000000000000000000000000000..fd3fc42e542f20a760c37098964f534ee6bb1baf --- /dev/null +++ b/workspace/downtask/dora_r64.sh @@ -0,0 +1,9 @@ +accelerate launch \ + --main_process_port=12345 \ + --multi_gpu \ + --num_processes=4 \ + --gpu_ids='1,2,3,4' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + dora_r64.py \ No newline at end of file diff --git a/workspace/downtask/segmentation.py b/workspace/downtask/segmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..72d33b6ee4d1839ce56a3028d909d333a37b8753 --- /dev/null +++ b/workspace/downtask/segmentation.py @@ -0,0 +1,271 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 1000 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import _thread +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from mamba_ssm import Mamba2 as Mamba +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ADE20KSegmentation as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "resume": False, + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 16384, + "sequence_length": 'auto', + # train setting + "batch_size": 2, + "num_workers": 4, + "total_steps": 120000, + "learning_rate": 0.00001, + "weight_decay": 0.0, + "save_every": 120000//30, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 12288, + "post_d_model": 16384, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 64, 96, 64, 1], + "model_dim": 16384, + "condition_dim": 16384, + "kernel_size": 7, + "sample_mode": DDIMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "downtask_segmentation", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state 
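+# When left as "auto", the remaining fields are resolved below: condition_dim falls back to d_model,
+# model_dim to dim_per_token, and sequence_length is read from the checkpoint dataset, so one script
+# adapts to flattened parameter vectors of different lengths.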
+if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model +class VaryMambaModel(nn.Module): + config = {} + def __init__(self, positional_embedding): + super().__init__() + mamba1 = Mamba(d_model=config["model_config"]["d_model"], + d_state=config["model_config"]["d_state"], + d_conv=config["model_config"]["d_conv"], + expand=config["model_config"]["expand"]) + mamba2 = Mamba(d_model=config["model_config"]["post_d_model"], + d_state=config["model_config"]["d_state"], + d_conv=config["model_config"]["d_conv"], + expand=config["model_config"]["expand"]) + mamba2.in_proj = nn.Linear(mamba1.out_proj.out_features, mamba2.in_proj.out_features, bias=False) + self.mamba_forward = nn.Sequential(*[mamba1, mamba2]) + pe = positional_embedding[None, :, :] + if self.config.get("trainable_pe"): + self.pe = nn.Parameter(pe) + else: # fixed positional embedding + self.register_buffer("pe", pe) + def forward(self, output_shape, condition=None): + x = self.mamba_forward(self.pe.repeat(output_shape[0], 1, 1) + condition) + return x +VaryMambaModel.config = config["model_config"] +model.model = VaryMambaModel( + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # update mamba model +torch.cuda.empty_cache() + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# load checkpoint +if config["resume"] and os.path.exists(f"./cache_{config['tag']}.pt"): + diction = torch.load(f"./cache_{config['tag']}.pt", map_location="cpu") + model.load_state_dict(diction["model"]) + optimizer.load_state_dict(diction["optimizer"]) + scheduler.load_state_dict(diction["scheduler"]) + start_batch_idx = diction["step"] + 1 +else: # not resume + start_batch_idx = 0 + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + 
wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + batch_idx += start_batch_idx + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + torch.save({ + "model": accelerator.unwrap_model(model).state_dict(), + "optimizer": accelerator.unwrap_model(optimizer).state_dict(), + "scheduler": scheduler.state_dict(), + "step": batch_idx + }, f"./cache_{config['tag']}.pt") + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + _thread.start_new_thread(os.system, (config["test_command"],)) # not stuck here + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/downtask/segmentation.sh b/workspace/downtask/segmentation.sh new file mode 100644 index 0000000000000000000000000000000000000000..840c181f7fe7bc1c3b9c11cb064b3701b137756d --- /dev/null +++ b/workspace/downtask/segmentation.sh @@ -0,0 +1,9 @@ +accelerate launch \ + --main_process_port=12345 \ + --multi_gpu \ + --num_processes=4 \ + --gpu_ids='1,2,3,4' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + segmentation.py \ No newline at end of file diff --git a/workspace/evaluate/efficiency.py b/workspace/evaluate/efficiency.py new file mode 100644 index 0000000000000000000000000000000000000000..b55bdb83952fd0fe100c38c8cc4b7dcfd043dfa7 --- /dev/null +++ b/workspace/evaluate/efficiency.py @@ -0,0 +1,96 @@ +import sys, os +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) + +# torch +import time +import types +import torch +import copy +from torch import nn +from model.diffusion import DDIMSampler, DDPMSampler +# father +import importlib +item = importlib.import_module(f"{sys.argv[1]}") 
+Dataset = item.Dataset +train_set = item.train_set +config = item.config +model = item.model +assert config.get("tag") is not None, "Remember to set a tag." + + + + +generate_config = { + "device": "cuda", + "num_generated": 1, + "checkpoint": f"./checkpoint/{config['tag']}.pth", + "generated_path": os.path.join(Dataset.generated_path.rsplit("/", 1)[0], "generated_{}_{}.pth"), + "test_command": os.path.join(Dataset.test_command.rsplit("/", 1)[0], "generated_{}_{}.pth"), + "need_test": True, + # inference setting + "sampler": DDIMSampler, + "steps": 60, # only valid when using DDIMSampler +} +config.update(generate_config) + + + + +# Model +print('==> Building model..') +diction = torch.load(config["checkpoint"]) +permutation_shape = diction["to_permutation_state.weight"].shape +model.to_permutation_state = nn.Embedding(*permutation_shape) +model.load_state_dict(diction) +model.criteria.diffusion_sampler = config["sampler"]( + model=model.criteria.diffusion_sampler.model, + beta=config["model_config"]["beta"], + T=config["model_config"]["T"], +) # sampler will be covered below +model.condi_embedder = copy.deepcopy(model.criteria.diffusion_sampler.model.condi_embedder) +@torch.no_grad() +def new_sample(self, x=None, condition=None): + z = self.model([1, self.sequence_length, self.config["d_model"]], condition) + z = self.condi_embedder(z) + if x is None: + x = torch.randn((1, self.sequence_length, self.config["model_dim"]), device=z.device) + x = self.criteria.sample(x, z, steps=config["steps"]) + return x +model.sample = types.MethodType(new_sample, model) +model.criteria.diffusion_sampler.model.condi_embedder = nn.Identity() +model = model.to(config["device"]) + + +# generate +print('==> Defining generate..') +def generate(save_path=config["generated_path"], test_command=config["test_command"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.cuda.amp.autocast(True, torch.bfloat16): + with torch.no_grad(): + start_time = time.time() + prediction = model(sample=True) + end_time = time.time() + generated_norm = torch.nanmean(prediction.abs()) + print("used time (seconds):", end_time - start_time) + print("memory usage (GB):", torch.cuda.max_memory_allocated() / (1024 ** 3)) + # print("Generated_norm:", generated_norm.item()) + train_set.save_params(prediction, save_path=save_path) + if need_test: + os.system(test_command) + print("\n") + + + + +if __name__ == "__main__": + for i in range(config["num_generated"]): + index = str(i+1).zfill(3) + print("Save to", config["generated_path"].format(config["tag"], index)) + generate( + save_path=config["generated_path"].format(config["tag"], index), + test_command=config["test_command"].format(config["tag"], index), + need_test=config["need_test"], + ) # generate and print info \ No newline at end of file diff --git a/workspace/evaluate/evaluate.py b/workspace/evaluate/evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..8c8041538799900c24508411ab389571223e64a4 --- /dev/null +++ b/workspace/evaluate/evaluate.py @@ -0,0 +1,229 @@ +import sys, os +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) + +import random +import pandas as pd +import numpy as np +import torch +import pickle +import importlib +item = importlib.import_module(f"dataset.{sys.argv[1]}.train") +loader = item.test_loader +model = item.model +test = item.test +# tag in this file is only the name of specific dirname instead 
of config["tag"] +tag = os.path.basename(os.path.dirname(item.__file__)) + + + + +config = { + "checkpoint_path": f"./dataset/{tag}/checkpoint", + "generated_path": f"./dataset/{tag}/generated", + "cache_file": None, # None means default dataset/tag/cache.pt + "resume": True, # if you updated the checkpoint and generated models, use "resume": False. + "noise_intensity": [0.01, 0.02, 0.03, 0.04, 0.05], + "total_noised_number": 25, +} +assert config["total_noised_number"] % len(config["noise_intensity"]) == 0, \ + "total_noised_number must be a multiple of noise_intensity" +globals().update(config) + + + + +# load paths +checkpoint_items = [os.path.join(checkpoint_path, i) for i in os.listdir(checkpoint_path)] +generated_items = [os.path.join(generated_path, i) for i in os.listdir(generated_path)] +generated_items.sort() +total_items = list(checkpoint_items) + list(generated_items) +num_checkpoint = len(checkpoint_items) +num_generated = len(generated_items) + + +# define compute IoU +@torch.no_grad() +def compute_wrong_indices(diction): + model.load_state_dict(diction, strict=False) + model.eval() + _, acc, all_targets, all_predicts = test(model=model) + not_agreement = torch.logical_not(torch.eq(torch.tensor(all_targets), torch.tensor(all_predicts))) + return not_agreement, acc + +def compute_wrong_iou(a, b): + inter = np.logical_and(a, b) + union = np.logical_or(a, b) + iou = np.sum(inter) / np.sum(union) + return iou + + +# prepare evaluate +print("\n==> start evaluating..") +total_result_list = [] +total_acc_list = [] + +cache_file = os.path.join(os.path.dirname(checkpoint_path), "performance.cache") if cache_file is None else cache_file +if resume is True and os.path.exists(cache_file): + print(f"load cache from {cache_file}") + with open(cache_file, "rb") as f: + total_result_list, total_acc_list = pickle.load(f) +else: # compute checkpoint and generated + print(f"start inferencing on {tag}") + for i, item in enumerate(total_items): + print(f"start: {i+1}/{len(total_items)}") + item = torch.load(item, map_location='cpu') + result, acc = compute_wrong_indices(item) + result = result.numpy() + total_result_list.append(result) + total_acc_list.append(acc) + with open(cache_file, "wb") as f: + pickle.dump([total_result_list, total_acc_list], f) + + +# compute noised +checkpoint_items_for_noise = checkpoint_items.copy() +random.shuffle(checkpoint_items_for_noise) +num_each_noised = total_noised_number // len(noise_intensity) +num_noise_class = len(noise_intensity) +bias = 0 +for this_noise_intensity in noise_intensity: + for i in range(num_each_noised): + i = i + bias + print(f"testing noised: {i+1}/{num_each_noised * num_noise_class}") + item = checkpoint_items_for_noise[i % num_checkpoint] + item = torch.load(item, map_location="cpu") + new_diction = {} + for key, value in item.items(): + if ("num_batches_tracked" in key) or (value.numel() == 1) or not torch.is_floating_point(value): + pass # not add noise to these + elif "running_var" in key: + pre_mean = value.mean() * 0.95 + value = torch.log(value / pre_mean + 0.05) + mean, std = value.mean(), value.std() + value = (value - mean) / std + value += torch.randn_like(value) * this_noise_intensity + value = value * std + mean + value = torch.clip(torch.exp(value) - 0.05, min=0.001) * pre_mean + else: # conv & linear + mean, std = value.mean(), value.std() + value = (value - mean) / std + value += torch.randn_like(value) * this_noise_intensity + value = value * std + mean + new_diction[key] = value + result, acc = 
compute_wrong_indices(new_diction) + result = result.numpy() + total_result_list.append(result) + total_acc_list.append(acc) + bias += num_each_noised + + +# compute iou_metrix +print("start computing IoU...") +total_num = num_checkpoint + num_generated + num_each_noised * num_noise_class +assert total_num == len(total_result_list), \ + f"total_num:{total_num}, len(total_result_list):{len(total_result_list)}" +iou_matrix = np.zeros(shape=[total_num, total_num]) +for i in range(total_num): + for j in range(total_num): + iou = compute_wrong_iou(total_result_list[i], total_result_list[j]) + iou_matrix[i, j] = iou + + +# save result +df = pd.DataFrame(iou_matrix) +df.to_excel(f"./iou_{tag}.xlsx", index=False) +print(f"finished Saving ./iou_{tag}.xlsx!") + + +# print summary +print("\n\n===============================================") +print(f"Summary: {tag}") +print() +print("num_checkpoint:", num_checkpoint) +print("num_generated:", num_generated) +print(f"num_noised: {num_each_noised}x{num_noise_class}") +print(f"original_acc_mean:", np.array(total_acc_list[:num_checkpoint]).mean()) +print(f"original_acc_max:", np.array(total_acc_list[:num_checkpoint]).max()) +print(f"generated_acc_mean:", np.array(total_acc_list[num_checkpoint:num_checkpoint+num_generated]).mean()) +print(f"generated_acc_max:", np.array(total_acc_list[num_checkpoint:num_checkpoint+num_generated]).max()) + +this_start = num_checkpoint + num_generated +for this_noise_intensity in noise_intensity: + print(f"noise={this_noise_intensity:.4f}_acc_mean:", + np.array(total_acc_list[this_start:this_start+num_each_noised]).mean()) + this_start += num_each_noised +print() # empty line +origin_origin = iou_matrix[:num_checkpoint, :num_checkpoint] +origin_origin = (np.sum(origin_origin) - num_checkpoint) / (num_checkpoint * (num_checkpoint - 1)) +print("origin-origin:", origin_origin) +generated_generated = iou_matrix[num_checkpoint:num_checkpoint + num_generated, + num_checkpoint:num_checkpoint + num_generated] +generated_generated = (np.sum(generated_generated) - num_generated) / (num_generated * (num_generated - 1)) +print("generated-generated:", generated_generated) +origin_generated = iou_matrix[num_checkpoint:num_checkpoint + num_generated, :num_checkpoint] +origin_generated = np.mean(origin_generated) +print("origin-generated:", origin_generated) +origin_generated_max = iou_matrix[num_checkpoint:num_checkpoint + num_generated, :num_checkpoint] +origin_generated_max = np.amax(origin_generated_max, axis=-1) +print("origin-generated(max):", origin_generated_max.mean()) + + +# print noised +noised_max_list = [] +this_start = num_checkpoint + num_generated +for this_noise_intensity in noise_intensity: + print(f"\nnoise_intensity={this_noise_intensity}") + noised_noised = iou_matrix[this_start:this_start + num_each_noised, this_start:this_start + num_each_noised] + noised_noised = (np.sum(noised_noised) - num_each_noised) / (num_each_noised * (num_each_noised - 1)) + print("noised-noised:", noised_noised) + origin_noised = iou_matrix[this_start:this_start + num_each_noised, :num_checkpoint] + origin_noised = np.mean(origin_noised) + print("origin-noised:", origin_noised) + origin_noised_max = iou_matrix[this_start:this_start + num_each_noised, :num_checkpoint] + origin_noised_max = np.amax(origin_noised_max, axis=-1) + noised_max_list.append(origin_noised_max) + print("origin-noised(max):", origin_noised_max.mean()) + this_start += num_each_noised + + +# save summary +summary = { + "summary": tag, + "num_checkpoint": num_checkpoint, + 
"num_generated": num_generated, + "num_each_noised": num_each_noised, + "num_noise_class": num_noise_class, + "noise_intensity": noise_intensity, + "total_acc_list": total_acc_list, + "iou_matrix": iou_matrix, +} +draw_cache = [summary,] +# final draw +print("\n==> start drawing..") +import seaborn as sns +import matplotlib.pyplot as plt +# origin +draw_origin_origin_max = np.amax(iou_matrix[:num_checkpoint, :num_checkpoint] - np.eye(num_checkpoint), axis=-1) +draw_origin_origin_acc = np.array(total_acc_list[:num_checkpoint]) +sns.scatterplot(x=draw_origin_origin_max, y=draw_origin_origin_acc, label="origin") +draw_cache.append(dict(x=draw_origin_origin_max, y=draw_origin_origin_acc, label="origin")) +# generated +draw_origin_generated_max = origin_generated_max +draw_origin_generated_acc = np.array(total_acc_list[num_checkpoint:num_checkpoint + num_generated]) +sns.scatterplot(x=draw_origin_generated_max, y=draw_origin_generated_acc, label="generated") +draw_cache.append(dict(x=draw_origin_generated_max, y=draw_origin_generated_acc, label="generated")) +# noised +this_start = num_checkpoint + num_generated +for i, this_noise_intensity in enumerate(noise_intensity): + draw_origin_noised_max = noised_max_list[i] + draw_origin_noised_acc = total_acc_list[this_start: this_start+num_each_noised] + sns.scatterplot(x=draw_origin_noised_max, y=draw_origin_noised_acc, label=f"noise={this_noise_intensity:.4f}") + draw_cache.append(dict(x=draw_origin_noised_max, y=draw_origin_noised_acc, label=f"noise={this_noise_intensity:.4f}")) + this_start += num_each_noised +# draw +plt.savefig(f'plot_{tag}.png') +with open(f'plot_{tag}.cache', "wb") as f: + pickle.dump(draw_cache, f) +print(f"plot saved to plot_{tag}.png") \ No newline at end of file diff --git a/workspace/evaluate/generate.py b/workspace/evaluate/generate.py new file mode 100644 index 0000000000000000000000000000000000000000..c687e4af07e1627822f3280aba0fc35b828ef1c6 --- /dev/null +++ b/workspace/evaluate/generate.py @@ -0,0 +1,73 @@ +import sys, os +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) + +# torch +import torch +from torch import nn +# father +import importlib +item = importlib.import_module(f"{sys.argv[1]}") +Dataset = item.Dataset +train_set = item.train_set +config = item.config +model = item.model +assert config.get("tag") is not None, "Remember to set a tag." + + + + +generate_config = { + "device": "cuda", + "num_generated": 10, + "checkpoint": f"./checkpoint/{config['tag']}.pth", + "generated_path": os.path.join(Dataset.generated_path.rsplit("/", 1)[0], "generated_{}_{}.pth"), + "test_command": os.path.join(Dataset.test_command.rsplit("/", 1)[0], "generated_{}_{}.pth"), + "need_test": True, +} +config.update(generate_config) +if len(sys.argv) == 3: + exec("config.update(dict(" + sys.argv[2] + "))") +else: # more than 3 sys.argv + assert len(sys.argv) == 2, "Got too many argv. Please split by ','." 
+ + + + +# Model +print('==> Building model..') +diction = torch.load(config["checkpoint"]) +permutation_shape = diction["to_permutation_state.weight"].shape +model.to_permutation_state = nn.Embedding(*permutation_shape) +model.load_state_dict(diction) +model = model.to(config["device"]) + + +# generate +print('==> Defining generate..') +def generate(save_path=config["generated_path"], test_command=config["test_command"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.cuda.amp.autocast(True, torch.bfloat16): + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = torch.nanmean(prediction.abs()) + # print("Generated_norm:", generated_norm.item()) + train_set.save_params(prediction, save_path=save_path) + if need_test: + os.system(test_command) + print("\n") + + + + +if __name__ == "__main__": + for i in range(config["num_generated"]): + index = str(i+1).zfill(3) + print("Save to", config["generated_path"].format(config["tag"], index)) + generate( + save_path=config["generated_path"].format(config["tag"], index), + test_command=config["test_command"].format(config["tag"], index), + need_test=config["need_test"], + ) # generate and print info \ No newline at end of file diff --git a/workspace/evaluate/memory.py b/workspace/evaluate/memory.py new file mode 100644 index 0000000000000000000000000000000000000000..2b9c65c2dc734be7872441733f3d75ea469e3c4b --- /dev/null +++ b/workspace/evaluate/memory.py @@ -0,0 +1,64 @@ +import sys, os +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) + +# torch +import torch +from torch import nn +# father +import importlib +item = importlib.import_module(f"{sys.argv[1]}") +Dataset = item.Dataset +train_loader = item.train_loader +optimizer = item.optimizer +train_set = item.train_set +config = item.config +model = item.model +assert config.get("tag") is not None, "Remember to set a tag." 
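+# memory.py replays a handful of optimizer steps from the imported training module under bf16
+# autocast and then shells out to nvidia-smi, so the peak GPU memory of that exact training
+# configuration can be read off interactively.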
+ + + + +test_config = { + "device": "cuda", + "checkpoint": f"./checkpoint/{config['tag']}.pth", +} +config.update(test_config) + + + + +# Model +print('==> Building model..') +diction = torch.load(config["checkpoint"]) +permutation_shape = diction["to_permutation_state.weight"].shape +model.to_permutation_state = nn.Embedding(*permutation_shape) +model.load_state_dict(diction) +model = model.to(config["device"]) + + +# test +print('==> Defining training..') +def memory_test(): + print("==> start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # noinspection PyArgumentList + with torch.autocast(enabled=True, dtype=torch.bfloat16, device_type="cuda"): + loss = model(output_shape=param.shape, + x_0=param.to(model.device), + permutation_state=permutation_state.to(model.device)) + loss.backward() + optimizer.step() + if batch_idx >= 10: + break + os.system("nvidia-smi") + input(f"This program running on GPU:{os.environ['CUDA_VISIBLE_DEVICES']}") + + + + +if __name__ == "__main__": + memory_test() \ No newline at end of file diff --git a/workspace/example/cifar10_resnet18.py b/workspace/example/cifar10_resnet18.py new file mode 100644 index 0000000000000000000000000000000000000000..d10112f4d3e82ffd76c30ad98b8e4be74e60d9f4 --- /dev/null +++ b/workspace/example/cifar10_resnet18.py @@ -0,0 +1,221 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import torch.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import Cifar10_ResNet18 as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 8, + "num_workers": 16, + "total_steps": 80000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 80000//30, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": 'auto', + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + 
"layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "quick_start_cifar10_resnet18", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps 
= 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/example/cifar10_resnet18.sh b/workspace/example/cifar10_resnet18.sh new file mode 100644 index 0000000000000000000000000000000000000000..c50effe7f5c6823f9737107a7483aaff8ee7422b --- /dev/null +++ b/workspace/example/cifar10_resnet18.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + cifar10_resnet18.py \ No newline at end of file diff --git a/workspace/launch.sh b/workspace/launch.sh new file mode 100644 index 0000000000000000000000000000000000000000..873ef2ff7a695173cb6923ec3bb4d31aaa444e0a --- /dev/null +++ b/workspace/launch.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# set variable +gpu_ids="$2" +exec_file="$1" + + +# count gpus +IFS=',' read -r -a gpus <<< "$gpu_ids" +num_gpus=${#gpus[@]} + +# find a usable port +find_open_port() { + local port + while true; do + port=$(( (RANDOM % 32768) + 32767 )) + if ! 
(echo >/dev/tcp/localhost/$port) &>/dev/null; then + break + fi + done + echo $port +} +main_process_port=$(find_open_port) +echo "Using main_process_port=$main_process_port" + +# construct command +command="accelerate launch --main_process_port=$main_process_port --num_processes=$num_gpus --gpu_ids=$gpu_ids" +command+=" --num_machines=1 --mixed_precision=bf16 --dynamo_backend=no" +if [ $num_gpus -ge 2 ]; then + command+=" --multi_gpu" +fi +command+=" $exec_file" + +# execute command +eval $command \ No newline at end of file diff --git a/workspace/main/convnextatto_8192.py b/workspace/main/convnextatto_8192.py new file mode 100644 index 0000000000000000000000000000000000000000..8e2f57f860e847494777eb00f8bce40a85156c30 --- /dev/null +++ b/workspace/main/convnextatto_8192.py @@ -0,0 +1,221 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import torch.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ConvNextAtto as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 4, + "num_workers": 8, + "total_steps": 80000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 80000//30, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 1024, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "main_convnextatto_8192", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = 
train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = 
model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/main/convnextatto_8192.sh b/workspace/main/convnextatto_8192.sh new file mode 100644 index 0000000000000000000000000000000000000000..81797de79c712e330db0108b778e6c4121614c15 --- /dev/null +++ b/workspace/main/convnextatto_8192.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + convnextatto_8192.py \ No newline at end of file diff --git a/workspace/main/convnextlarge_16384.py b/workspace/main/convnextlarge_16384.py new file mode 100644 index 0000000000000000000000000000000000000000..a1dae64f7554c8e127749ec7596fe932a4e7d4cf --- /dev/null +++ b/workspace/main/convnextlarge_16384.py @@ -0,0 +1,293 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 995 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import _thread +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from mamba_ssm import Mamba2 as Mamba +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ConvNextLarge as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "resume": False, + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 16384, + "sequence_length": 'auto', + # train setting + "batch_size": 2, + "num_workers": 4, + "total_steps": 120000, + "learning_rate": 0.00001, + "warm_up_steps": 1, + "warmup_factor": 1.0, + "weight_decay": 0.0, + "save_every": 120000//50, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 12288, + "post_d_model": 16384, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 448, + "layer_channels": [1, 64, 96, 64, 1], + "model_dim": 
16384, + "condition_dim": 16384, + "kernel_size": 7, + "sample_mode": DDIMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "main_convnextlarge_16384", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model +class VaryMambaModel(nn.Module): + config = {} + def __init__(self, positional_embedding): + super().__init__() + mamba1 = Mamba(d_model=config["model_config"]["d_model"], + d_state=config["model_config"]["d_state"], + d_conv=config["model_config"]["d_conv"], + expand=config["model_config"]["expand"]) + mamba2 = Mamba(d_model=config["model_config"]["post_d_model"], + d_state=config["model_config"]["d_state"], + d_conv=config["model_config"]["d_conv"], + expand=config["model_config"]["expand"]) + mamba2.in_proj = nn.Linear(mamba1.out_proj.out_features, mamba2.in_proj.out_features, bias=False) + self.mamba_forward = nn.Sequential(*[mamba1, mamba2]) + pe = positional_embedding[None, :, :] + if self.config.get("trainable_pe"): + self.pe = nn.Parameter(pe) + else: # fixed positional embedding + self.register_buffer("pe", pe) + def forward(self, output_shape, condition=None): + x = self.mamba_forward(self.pe.repeat(output_shape[0], 1, 1) + condition) + return x +VaryMambaModel.config = config["model_config"] +model.model = VaryMambaModel( + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # update mamba model +torch.cuda.empty_cache() + + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +class WarmupCosineAnnealingLR(torch.optim.lr_scheduler._LRScheduler): + def __init__(self, optimizer, T_max, eta_min=0, last_epoch=-1, warmup_epochs=5, warmup_factor=0.1): + self.T_max = T_max + self.eta_min = eta_min + self.warmup_epochs = warmup_epochs + self.warmup_factor = warmup_factor + super().__init__(optimizer, last_epoch) + def get_lr(self): + if self.last_epoch < self.warmup_epochs: + alpha = float(self.last_epoch) / self.warmup_epochs + factor = self.warmup_factor * (1.0 - alpha) + alpha + else: # end warm up + 
progress = (self.last_epoch - self.warmup_epochs) / (self.T_max - self.warmup_epochs) + factor = (1 + math.cos(math.pi * progress)) / 2 + factor = (1 - self.eta_min) * factor + self.eta_min + return [base_lr * factor for base_lr in self.base_lrs] +scheduler = WarmupCosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], + warmup_epochs=config["warm_up_steps"], + warmup_factor=config["warmup_factor"], +) + +# load checkpoint +if config["resume"] and os.path.exists(f"./cache_{config['tag']}.pt"): + diction = torch.load(f"./cache_{config['tag']}.pt", map_location="cpu") + model.load_state_dict(diction["model"]) + optimizer.load_state_dict(diction["optimizer"]) + scheduler.load_state_dict(diction["scheduler"]) + start_batch_idx = diction["step"] + 1 +else: # not resume + start_batch_idx = 0 + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + batch_idx += start_batch_idx + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + # if accelerator.sync_gradients: + # accelerator.clip_grad_norm_(model.parameters(), 1.) 
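+# Gradient clipping stays disabled by default; uncommenting the two lines above would clip the
+# global gradient norm to 1.0 before every optimizer step.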
+ optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + torch.save({ + "model": accelerator.unwrap_model(model).state_dict(), + "optimizer": accelerator.unwrap_model(optimizer).state_dict(), + "scheduler": scheduler.state_dict(), + "step": batch_idx + }, f"./cache_{config['tag']}.pt") + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True, permutation_state=False) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) diff --git a/workspace/main/convnextlarge_16384.sh b/workspace/main/convnextlarge_16384.sh new file mode 100644 index 0000000000000000000000000000000000000000..c3f97049aaa5ea381847706a8cf312aa947d3150 --- /dev/null +++ b/workspace/main/convnextlarge_16384.sh @@ -0,0 +1,9 @@ +accelerate launch \ + --main_process_port=12345 \ + --multi_gpu \ + --num_processes=4 \ + --gpu_ids='1,2,3,4' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + convnextlarge_16384.py \ No newline at end of file diff --git a/workspace/main/resnet18_8192.py b/workspace/main/resnet18_8192.py new file mode 100644 index 0000000000000000000000000000000000000000..c39c9cd5dc1c3ce7646cc51da5e679c396bf9f89 --- /dev/null +++ b/workspace/main/resnet18_8192.py @@ -0,0 +1,221 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import torch.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from 
torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ResNet18 as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 8, + "num_workers": 16, + "total_steps": 80000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 80000//30, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": 'auto', + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 1024, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "main_resnet18_8192", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + 
wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/main/resnet18_8192.sh b/workspace/main/resnet18_8192.sh new file mode 100644 index 0000000000000000000000000000000000000000..416e8dc6cb731057a2abc6cc19e8836f5aa1fd40 --- /dev/null +++ b/workspace/main/resnet18_8192.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + resnet18_8192.py \ No newline at end of file diff --git a/workspace/main/resnet50_8192.py b/workspace/main/resnet50_8192.py new file mode 100644 index 0000000000000000000000000000000000000000..94d5b1cfea706796f432e29173ed3d0bfb042dd5 --- /dev/null +++ b/workspace/main/resnet50_8192.py @@ -0,0 +1,220 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import 
warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import torch.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ResNet50 as Dataset +from torch.utils.data import DataLoader + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 8, + "num_workers": 16, + "total_steps": 80000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 80000//30, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": 'auto', + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 1024, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "main_resnet50_8192", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = 
Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/main/resnet50_8192.sh b/workspace/main/resnet50_8192.sh new file mode 100644 index 0000000000000000000000000000000000000000..156d7b156715a5bffd006bfdb1bbd15853fb03cb --- /dev/null +++ b/workspace/main/resnet50_8192.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + resnet50_8192.py \ No newline at end of file diff --git a/workspace/main/vitbase_16384.py b/workspace/main/vitbase_16384.py new file mode 100644 index 0000000000000000000000000000000000000000..2b3886a762212ecbca656deda37680c62d337fb5 --- /dev/null +++ b/workspace/main/vitbase_16384.py @@ -0,0 +1,270 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = 
json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 995 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from mamba_ssm import Mamba2 as Mamba +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ViTBase as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "resume": False, + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 16384, + "sequence_length": 'auto', + # train setting + "batch_size": 2, + "num_workers": 4, + "total_steps": 120000, + "learning_rate": 0.00001, + "weight_decay": 0.0, + "save_every": 120000//30, + "print_every": 50, + "autocast": lambda i: 5000 < i < 100000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 12288, + "post_d_model": 16384, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 512, + "layer_channels": [1, 64, 96, 64, 1], + "model_dim": 16384, + "condition_dim": 16384, + "kernel_size": 7, + "sample_mode": DDIMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "main_vitbase_16384", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + 
positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model +class VaryMambaModel(nn.Module): + config = {} + def __init__(self, positional_embedding): + super().__init__() + mamba1 = Mamba(d_model=config["model_config"]["d_model"], + d_state=config["model_config"]["d_state"], + d_conv=config["model_config"]["d_conv"], + expand=config["model_config"]["expand"]) + mamba2 = Mamba(d_model=config["model_config"]["post_d_model"], + d_state=config["model_config"]["d_state"], + d_conv=config["model_config"]["d_conv"], + expand=config["model_config"]["expand"]) + mamba2.in_proj = nn.Linear(mamba1.out_proj.out_features, mamba2.in_proj.out_features, bias=False) + self.mamba_forward = nn.Sequential(*[mamba1, mamba2]) + pe = positional_embedding[None, :, :] + if self.config.get("trainable_pe"): + self.pe = nn.Parameter(pe) + else: # fixed positional embedding + self.register_buffer("pe", pe) + def forward(self, output_shape, condition=None): + x = self.mamba_forward(self.pe.repeat(output_shape[0], 1, 1) + condition) + return x +VaryMambaModel.config = config["model_config"] +model.model = VaryMambaModel( + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # update mamba model +torch.cuda.empty_cache() + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# load checkpoint +if config["resume"] and os.path.exists(f"./cache_{config['tag']}.pt"): + diction = torch.load(f"./cache_{config['tag']}.pt", map_location="cpu") + model.load_state_dict(diction["model"]) + optimizer.load_state_dict(diction["optimizer"]) + scheduler.load_state_dict(diction["scheduler"]) + start_batch_idx = diction["step"] + 1 +else: # not resume + start_batch_idx = 0 + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + batch_idx += start_batch_idx + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += 
loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + torch.save({ + "model": accelerator.unwrap_model(model).state_dict(), + "optimizer": accelerator.unwrap_model(optimizer).state_dict(), + "scheduler": scheduler.state_dict(), + "step": batch_idx + }, f"./cache_{config['tag']}.pt") + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/main/vitbase_16384.sh b/workspace/main/vitbase_16384.sh new file mode 100644 index 0000000000000000000000000000000000000000..89f3988d8571ce093c46eaf4ea9d09fa4239b993 --- /dev/null +++ b/workspace/main/vitbase_16384.sh @@ -0,0 +1,9 @@ +accelerate launch \ + --main_process_port=12345 \ + --multi_gpu \ + --num_processes=4 \ + --gpu_ids='1,2,3,4' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + vitbase_16384.py \ No newline at end of file diff --git a/workspace/main/vitsmall_8192.py b/workspace/main/vitsmall_8192.py new file mode 100644 index 0000000000000000000000000000000000000000..1a361ed7bec6aaf6341d4184f088d74a02f47527 --- /dev/null +++ b/workspace/main/vitsmall_8192.py @@ -0,0 +1,221 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import torch.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ViTSmall as Dataset +from torch.utils.data import 
DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 4, + "num_workers": 8, + "total_steps": 50000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 50000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 1024, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "main_vitsmall_8192", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + 
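Each of these scripts gates mixed precision by training step through the config's `autocast` lambda, which is then passed to `accelerator.autocast` via `AutocastKwargs`. A minimal sketch of that pattern is below; the `5000 < i < 45000` window matches this script's config, while the toy model and the three probe steps are illustrative assumptions.

```python
# Sketch only: step-gated autocast. Under an `accelerate launch --mixed_precision=bf16`
# run, activations inside the window are bfloat16; outside it they stay full precision.
import torch
from accelerate import Accelerator
from accelerate.utils import AutocastKwargs

autocast_gate = lambda i: 5000 < i < 45000   # same window as config["autocast"]
accelerator = Accelerator()
model = accelerator.prepare(torch.nn.Linear(16, 16))

for step in (0, 10000, 60000):
    with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=autocast_gate(step))):
        out = model(torch.randn(2, 16, device=accelerator.device))
    print(step, out.dtype)  # bfloat16 only at step 10000 when launched with bf16
```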
model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/main/vitsmall_8192.sh b/workspace/main/vitsmall_8192.sh new file mode 100644 index 0000000000000000000000000000000000000000..9842bfa2f79f1472506d90cf791e05d9afa5c017 --- /dev/null +++ b/workspace/main/vitsmall_8192.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + vitsmall_8192.py \ No newline at end of file diff --git a/workspace/main/vittiny_8192.py b/workspace/main/vittiny_8192.py new file mode 100644 index 0000000000000000000000000000000000000000..07bcb1f23e88f8c48937fece7836bfa111214093 --- /dev/null +++ b/workspace/main/vittiny_8192.py @@ -0,0 +1,221 @@ +import sys, os, json +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) +sys.path.append(root) +os.chdir(root) +with open("./workspace/config.json", "r") as f: + additional_config = json.load(f) +USE_WANDB = additional_config["use_wandb"] + +# set global seed +import random +import numpy as np +import torch +seed = SEED = 999 +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = True +np.random.seed(seed) +random.seed(seed) + +# other +import math +import random +import warnings +from _thread import start_new_thread +warnings.filterwarnings("ignore", category=UserWarning) +if USE_WANDB: import wandb +# torch +import torch +import torch.nn as nn +import bitsandbytes.optim as optim +from torch.nn import functional as F +from torch.cuda.amp import autocast +# model +from 
model import MambaDiffusion as Model +from model.diffusion import DDPMSampler, DDIMSampler +from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR +from accelerate.utils import DistributedDataParallelKwargs +from accelerate.utils import AutocastKwargs +from accelerate import Accelerator +# dataset +from dataset import ImageNet_ViTTiny as Dataset +from torch.utils.data import DataLoader + + + + +config = { + "seed": SEED, + # dataset setting + "dataset": Dataset, + "dim_per_token": 8192, + "sequence_length": 'auto', + # train setting + "batch_size": 4, + "num_workers": 8, + "total_steps": 50000, + "learning_rate": 0.00003, + "weight_decay": 0.0, + "save_every": 50000//25, + "print_every": 50, + "autocast": lambda i: 5000 < i < 45000, + "checkpoint_save_path": "./checkpoint", + # test setting + "test_batch_size": 1, # fixed, don't change this + "generated_path": Dataset.generated_path, + "test_command": Dataset.test_command, + # to log + "model_config": { + "num_permutation": "auto", + # mamba config + "d_condition": 1, + "d_model": 8192, + "d_state": 128, + "d_conv": 4, + "expand": 2, + "num_layers": 2, + # diffusion config + "diffusion_batch": 1024, + "layer_channels": [1, 32, 64, 128, 64, 32, 1], + "model_dim": "auto", + "condition_dim": "auto", + "kernel_size": 7, + "sample_mode": DDPMSampler, + "beta": (0.0001, 0.02), + "T": 1000, + "forward_once": True, + }, + "tag": "main_vittiny_8192", +} + + + + +# Data +print('==> Preparing data..') +train_set = config["dataset"](dim_per_token=config["dim_per_token"]) +print("Dataset length:", train_set.real_length) +print("input shape:", train_set[0][0].shape) +if config["model_config"]["num_permutation"] == "auto": + config["model_config"]["num_permutation"] = train_set.max_permutation_state +if config["model_config"]["condition_dim"] == "auto": + config["model_config"]["condition_dim"] = config["model_config"]["d_model"] +if config["model_config"]["model_dim"] == "auto": + config["model_config"]["model_dim"] = config["dim_per_token"] +if config["sequence_length"] == "auto": + config["sequence_length"] = train_set.sequence_length + print(f"sequence length: {config['sequence_length']}") +else: # set fixed sequence_length + assert train_set.sequence_length == config["sequence_length"], f"sequence_length={train_set.sequence_length}" +train_loader = DataLoader( + dataset=train_set, + batch_size=config["batch_size"], + num_workers=config["num_workers"], + persistent_workers=True, + drop_last=True, + shuffle=True, +) + +# Model +print('==> Building model..') +Model.config = config["model_config"] +model = Model( + sequence_length=config["sequence_length"], + positional_embedding=train_set.get_position_embedding( + positional_embedding_dim=config["model_config"]["d_model"] + ) # positional_embedding +) # model setting is in model + +# Optimizer +print('==> Building optimizer..') +optimizer = optim.AdamW8bit( + params=model.parameters(), + lr=config["learning_rate"], + weight_decay=config["weight_decay"], +) +scheduler = CosineAnnealingLR( + optimizer=optimizer, + T_max=config["total_steps"], +) + +# accelerator +if __name__ == "__main__": + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator(kwargs_handlers=[kwargs,]) + if config["dim_per_token"] > 12288 and accelerator.state.num_processes == 1: + print(f"\033[91mWARNING: With token size {config['dim_per_token']}, we suggest to train on multiple GPUs.\033[0m") + model, optimizer, train_loader = accelerator.prepare(model, optimizer, 
train_loader) + + +# wandb +if __name__ == "__main__" and USE_WANDB and accelerator.is_main_process: + wandb.login(key=additional_config["wandb_api_key"]) + wandb.init(project="Recurrent-Parameter-Generation", name=config['tag'], config=config,) + + + + +# Training +print('==> Defining training..') +def train(): + if not USE_WANDB: + train_loss = 0 + this_steps = 0 + print("==> Start training..") + model.train() + for batch_idx, (param, permutation_state) in enumerate(train_loader): + optimizer.zero_grad() + # train + # noinspection PyArgumentList + with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=config["autocast"](batch_idx))): + loss = model(output_shape=param.shape, x_0=param, permutation_state=permutation_state) + accelerator.backward(loss) + optimizer.step() + if accelerator.is_main_process: + scheduler.step() + # to logging losses and print and save + if USE_WANDB and accelerator.is_main_process: + wandb.log({"train_loss": loss.item()}) + elif USE_WANDB: + pass # don't print + else: # not use wandb + train_loss += loss.item() + this_steps += 1 + if this_steps % config["print_every"] == 0: + print('Loss: %.6f' % (train_loss/this_steps)) + this_steps = 0 + train_loss = 0 + if batch_idx % config["save_every"] == 0 and accelerator.is_main_process: + os.makedirs(config["checkpoint_save_path"], exist_ok=True) + state = accelerator.unwrap_model(model).state_dict() + torch.save(state, os.path.join(config["checkpoint_save_path"], config["tag"]+".pth")) + generate(save_path=config["generated_path"], need_test=True) + if batch_idx >= config["total_steps"]: + break + + +def generate(save_path=config["generated_path"], need_test=True): + print("\n==> Generating..") + model.eval() + with torch.no_grad(): + prediction = model(sample=True) + generated_norm = prediction.abs().mean() + print("Generated_norm:", generated_norm.item()) + if USE_WANDB: + wandb.log({"generated_norm": generated_norm.item()}) + train_set.save_params(prediction, save_path=save_path) + if need_test: + start_new_thread(os.system, (config["test_command"],)) + model.train() + return prediction + + + + +if __name__ == '__main__': + train() + del train_loader # deal problems by dataloader + print("Finished Training!") + exit(0) \ No newline at end of file diff --git a/workspace/main/vittiny_8192.sh b/workspace/main/vittiny_8192.sh new file mode 100644 index 0000000000000000000000000000000000000000..57e73075b37647e96cfb4871c2dbda5b449fe513 --- /dev/null +++ b/workspace/main/vittiny_8192.sh @@ -0,0 +1,8 @@ +accelerate launch \ + --main_process_port=0 \ + --num_processes=1 \ + --gpu_ids='0' \ + --num_machines=1 \ + --mixed_precision=bf16 \ + --dynamo_backend=no \ + vittiny_8192.py \ No newline at end of file diff --git a/workspace/set_configs.py b/workspace/set_configs.py new file mode 100644 index 0000000000000000000000000000000000000000..0d2bb3afd8d38af3df85a5bf1b62e0b7a8bdc9eb --- /dev/null +++ b/workspace/set_configs.py @@ -0,0 +1,115 @@ +import os +import os.path as op +root = os.sep + os.sep.join(__file__.split(os.sep)[1:__file__.split(os.sep).index("Recurrent-Parameter-Generation")+1]) + + + + +assert op.exists(root), "Cannot find the executing root." +assert op.basename(root) == "Recurrent-Parameter-Generation", \ + f""" + You need to rename the repository folder to "Recurrent-Parameter-Generation" manually. + Because the whole project depends on this name. 
+ The file structure is as follow: + └─Recurrent-Parameter-Generation + ├─dataset + │ ├─cifar10_cnnmedium + │ ├─...(total 21 folders) + │ ├─__init__.py + │ ├─config.json + │ ├─dataset.py + │ └─register.py + ├─model + │ ├─__init__.py + │ ├─denoiser.py + │ ├─diffusion.py + │ └─...(total 8 files) + ├─quick_start + │ ├─set_configs.py + │ └─auto_start.sh + ├─workspace + │ ├─main + │ ├─evaluate + │ ├─...(total 6 folders) + │ └─config.json + ├─README.md + └─requirements.txt + """ + + +print("\n1. Set an \033[91mABSOLUTE\033[0m path to download your small dataset, such as CIFAR10 and CIFAR100") +default_dataset_root = op.join(op.dirname(op.abspath(root)), 'Dataset') +dataset_root = input(f"[{default_dataset_root} (default & \033[32mrecommanded\033[0m)]: ") or default_dataset_root +print(f"\033[32mdataset_root is set to {dataset_root}\033[0m") + + +print("\n2. Set the \033[91mABSOLUTE\033[0m path to your ImageNet1k dataset. " + "\033[32m(Press ENTER if you don't want to use ImageNet1k)\033[0m") +print("""The ImageNet1k dataset should be organized as follow: +└─ImageNet1k + ├─train + │ ├─n01443537 + │ ├─n01484850 + │ ├─n######## + └─test + ├─n01443537 + ├─n01484850 + └─n########""") +imagenet_root = input(f"[None (default)]: ") +if imagenet_root == "": + print("\033[32mWe don't use ImageNet1k.\033[0m") + imagenet_root_train = None + imagenet_root_test = None +else: # imagenet path is set + print(f"\033[32mimagenet_root is set to {imagenet_root}\033[0m") + imagenet_root_train = op.join(imagenet_root, "train") + imagenet_root_test = op.join(imagenet_root, "test") + assert op.exists(imagenet_root_train), f"{imagenet_root_train} is not existed." + assert op.exists(imagenet_root_test), f"{imagenet_root_test} is not existed." + + +print("\n3. Do you want to use wandb?") +default_use_wandb = True +use_wandb = input("[True (default & \033[32mrecommanded\033[0m)) / False]: ") +use_wandb = default_use_wandb if use_wandb == "" else eval(use_wandb) +print(f"\033[32muse_wandb is set to {use_wandb}\033[0m") +if use_wandb: + wandb_api_key = input("Set your wandb api key: ") + assert wandb_api_key != "", "You need to set an API_KEY is you want to use wandb." + + + + +print() +import json +from pprint import pprint + +# dataset/config.json +print() +with open(op.join(root, "dataset/config.json"), "r") as f: + dataset_config = json.load(f) + dataset_config.update({ + "dataset_root": dataset_root, + "imagenet_root": { + "train": imagenet_root_train, + "test": imagenet_root_test, + }, + }) +with open(op.join(root, "dataset/config.json"), "w") as f: + print("\033[32mUpdated dataset/config.json as follow:\033[0m") + pprint(dataset_config) + json.dump(dataset_config, f) + +# workspace/config.json +print() +with open(op.join(root, "workspace/config.json"), "r") as f: + workspace_config = json.load(f) + workspace_config.update({ + "use_wandb": use_wandb, + "wandb_api_key": globals().get("wandb_api_key", None), + }) +with open(op.join(root, "workspace/config.json"), "w") as f: + print("\033[32mUpdated workspace/config.json as follow:\033[0m") + pprint(workspace_config) + json.dump(workspace_config, f) +print() \ No newline at end of file
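For reference, a minimal sketch of reading back the two JSON files that `set_configs.py` updates; the training scripts load `workspace/config.json` in the same way after `os.chdir(root)`. Only the keys shown are written by the setup script; any other entries already present in the files are preserved by the `update()` calls.

```python
# Sketch only: inspecting the configs produced by set_configs.py, run from the repo root.
import json

with open("./workspace/config.json") as f:
    workspace_cfg = json.load(f)
print(workspace_cfg["use_wandb"])             # bool chosen interactively
print(workspace_cfg["wandb_api_key"])         # API key string, or None when wandb is disabled

with open("./dataset/config.json") as f:
    dataset_cfg = json.load(f)
print(dataset_cfg["dataset_root"])            # absolute path for the small datasets
print(dataset_cfg["imagenet_root"]["train"])  # None unless an ImageNet1k path was supplied
```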