Spaces:
Sleeping
Sleeping
File size: 8,761 Bytes
bd63939 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 |
import torch.nn as nn
import torch
# import math
# from torchvision import transforms
import os
# from timm.models import create_model
from typing import Any, Dict, List, Optional, Union
from transformers import LlamaTokenizer
from diffusers import DiffusionPipeline
# from torchvision.transforms.functional import pil_to_tensor
# import torch
from PIL import Image
from torchvision import transforms
# from qformer.qformer_quantizer import Blip2QformerQuantizer
# from diffusers import StableUnCLIPImg2ImgPipeline
from .pipeline_stable_unclip_img2img import StableUnCLIPImg2ImgPipeline
# File name of the quantizer checkpoint; joined onto the tokenizer's
# `name_or_path` directory when no explicit encoder URL/path is supplied.
WEIGHTS_NAME = "seed_quantizer.pt"

# Sub-directory name for a bundled diffusion model. In this file it is only
# referenced from commented-out code; kept for importers that may rely on it.
DIFFUSION_NAME = "diffusion_model"
class ImageTokenizer(nn.Module):
    """Image <-> discrete code converter built around a Q-Former quantizer.

    Encoding maps preprocessed image tensors to codebook indices via the
    quantizer. Decoding (optional) looks the indices up in the codebook and
    feeds the resulting embeddings to an unCLIP img2img diffusion pipeline.

    Attributes:
        model: The quantizer, in eval mode, moved to ``device`` (and cast to
            half precision when ``fp16`` is true).
        diffusion_model: The diffusion pipeline, or ``None`` when it was not
            requested / no path was given.
        processor: torchvision transform turning a PIL image into a
            normalized ``image_size`` x ``image_size`` tensor.
        latents: Fixed random start latents of shape ``[1, 4, 96, 96]`` that
            are reused for every ``decode`` call.
        noise: Fixed random tensor of shape ``[1, 1024]``; not used by the
            code in this class, kept for external users.
        device: The device passed to the constructor.
        fp16: Whether half precision is used.

    NOTE(review): ``latents``/``noise`` are plain tensor attributes (not
    registered buffers) and ``diffusion_model`` is a plain attribute, so a
    later ``.to(device)`` on this module presumably moves only ``model`` --
    confirm before relying on device migration.
    """

    def __init__(self,
                 model_path,
                 diffusion_model_path=None,
                 load_diffusion=False,
                 image_size=224,
                 device='cuda',
                 fp16=True,
                 **kwargs):
        """Load the quantizer and, optionally, the diffusion pipeline.

        Args:
            model_path: Passed to ``Blip2QformerQuantizer.from_pretrained`` as
                ``pretrained_model_path``.
            diffusion_model_path: Location of the unCLIP pipeline weights.
            load_diffusion: The pipeline is loaded only when this is true AND
                ``diffusion_model_path`` is not ``None``.
            image_size: Side length images are resized to by ``processor``.
            device: Device for the quantizer, pipeline and fixed latents.
            fp16: Use float16 (true) or float32 (false) throughout.
            **kwargs: Forwarded to ``Blip2QformerQuantizer.from_pretrained``.
        """
        super().__init__()
        # Imported lazily, as in the original, so importing this module does
        # not require the Q-Former package to be importable.
        from .seed_qformer.qformer_quantizer import Blip2QformerQuantizer

        quantizer = Blip2QformerQuantizer.from_pretrained(
            pretrained_model_path=model_path,
            vit_precision='fp16' if fp16 else 'fp32',
            **kwargs,
        ).eval()

        # One dtype for everything that depends on the precision switch.
        torch_dtype = torch.float16 if fp16 else torch.float32

        if diffusion_model_path is not None and load_diffusion:
            pipeline = StableUnCLIPImg2ImgPipeline.from_pretrained(
                diffusion_model_path, torch_dtype=torch_dtype)
            self.diffusion_model = pipeline.to(device)
        else:
            self.diffusion_model = None

        quantizer = quantizer.to(device)

        processor = transforms.Compose([
            # interpolation=3 is PIL's integer code for bicubic resampling.
            transforms.Resize((image_size, image_size), interpolation=3),
            transforms.ToTensor(),
            # Mean/std match the OpenAI CLIP preprocessing constants.
            transforms.Normalize(mean=(0.48145466, 0.4578275, 0.40821073),
                                 std=(0.26862954, 0.26130258, 0.27577711)),
        ])

        if fp16:
            quantizer = quantizer.half()

        # Fixed random tensors, created unconditionally because `decode`
        # always reads `self.latents`. Their dtype follows `fp16` so they
        # match the pipeline's dtype in float32 mode as well (previously they
        # were hard-coded to float16).
        self.latents = torch.randn(1, 4, 96, 96, device=device, dtype=torch_dtype)
        self.noise = torch.randn(1, 1024, device=device, dtype=torch_dtype)

        self.model = quantizer
        self.processor = processor
        self.device = device
        self.fp16 = fp16

    def __len__(self):
        """Return the quantizer's codebook size (``model.n_embed``)."""
        return self.model.n_embed

    def encode(self, image_torch):
        """Convert an image tensor (or batch of them) to codebook indices.

        Args:
            image_torch: Tensor of shape ``[c, h, w]`` or ``[b, c, h, w]``.
                A 3-D input is treated as a batch of one. The tensor is NOT
                moved to ``self.device`` here; the caller must do that.

        Returns:
            Indices reshaped to ``[b, -1]`` (one row per image).
        """
        if image_torch.dim() == 3:
            image_torch = image_torch.unsqueeze(0)

        batch = image_torch.half() if self.fp16 else image_torch

        with torch.no_grad():
            codes, _ = self.model.get_codebook_indices(batch)
        return codes.view(batch.shape[0], -1)

    def decode(self, indices, negative_indices=None, guidance_scale=10, num_inference_steps=20):
        """Generate images from codebook indices with the diffusion pipeline.

        Args:
            indices: Codebook indices to decode.
            negative_indices: Optional indices whose embeddings are passed to
                the pipeline as ``negative_image_embeds``; must have the same
                shape as ``indices``.
            guidance_scale: Forwarded to the pipeline.
            num_inference_steps: Forwarded to the pipeline.

        Returns:
            The ``images`` attribute of the pipeline output.

        Raises:
            RuntimeError: If no diffusion pipeline was loaded.
            AssertionError: If ``negative_indices`` has a different shape.
        """
        if self.diffusion_model is None:
            # Fail early with an actionable message instead of the opaque
            # "'NoneType' object is not callable" the call below would give.
            raise RuntimeError(
                'Cannot decode: the diffusion model is not loaded. Construct ImageTokenizer '
                'with load_diffusion=True and a valid diffusion_model_path.')

        image_embeds = self.model.get_codebook_entry(indices)

        negative_image_embeds = None
        if negative_indices is not None:
            assert indices.shape == negative_indices.shape, 'Negative indices must have the same shape with indices'
            negative_image_embeds = self.model.get_codebook_entry(negative_indices)

        output = self.diffusion_model(
            image_embeds=image_embeds,
            negative_image_embeds=negative_image_embeds,
            guidance_scale=guidance_scale,
            noise_level=0,
            num_inference_steps=num_inference_steps,
            # Same start latents on every call (see __init__).
            latents=self.latents,
        )
        return output.images
class SeedLlamaTokenizer(LlamaTokenizer):
    """LLaMA text tokenizer that also owns an ``ImageTokenizer``.

    Besides the inherited text tokenization, it exposes ``encode_image`` /
    ``decode_image`` which delegate to an ``ImageTokenizer`` instance that is
    created during ``__init__`` (and lazily re-created by the
    ``image_tokenizer`` property if it is missing).
    """

    def __init__(self,
                 vocab_file,
                 unk_token="<unk>",
                 bos_token="<s>",
                 eos_token="</s>",
                 pad_token=None,
                 sp_model_kwargs: Optional[Dict[str, Any]] = None,
                 add_bos_token=True,
                 add_eos_token=False,
                 clean_up_tokenization_spaces=False,
                 device='cuda',
                 fp16=True,
                 load_diffusion=False,
                 encoder_url=None,
                 diffusion_path=None,
                 **kwargs):
        """Build the text tokenizer, then load the image tokenizer.

        Args:
            vocab_file, unk_token, bos_token, eos_token, pad_token,
            sp_model_kwargs, add_bos_token, add_eos_token,
            clean_up_tokenization_spaces, **kwargs: Forwarded to
                ``LlamaTokenizer.__init__``. Note that ``pad_token`` is
                overwritten with the unknown token right afterwards.
            device: Device used for the image tokenizer and for tensors moved
                in ``encode_image`` / ``decode_image``.
            fp16: Forwarded to ``ImageTokenizer``.
            load_diffusion: Forwarded to ``ImageTokenizer``.
            encoder_url: Quantizer checkpoint location. When ``None``, the
                checkpoint is looked up under ``self.name_or_path``.
            diffusion_path: Forwarded to ``ImageTokenizer`` as
                ``diffusion_model_path``.
        """
        # Keyword arguments keep this call independent of the parent's
        # positional parameter order.
        super().__init__(vocab_file,
                         unk_token=unk_token,
                         bos_token=bos_token,
                         eos_token=eos_token,
                         pad_token=pad_token,
                         sp_model_kwargs=sp_model_kwargs,
                         add_bos_token=add_bos_token,
                         add_eos_token=add_eos_token,
                         clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                         **kwargs)
        self.device = device
        self.fp16 = fp16
        # Padding always uses the unknown token, regardless of `pad_token`.
        self.pad_token = self.unk_token
        self.load_diffusion = load_diffusion
        self.encoder_url = encoder_url
        self.diffusion_path = diffusion_path

        self.load_image_tokenizer()

    def _resolve_encoder_path(self):
        """Return the quantizer checkpoint path: `encoder_url` if set, else a file under `name_or_path`."""
        if self.encoder_url is not None:
            return self.encoder_url
        # Kept as an assert so the raised exception type is unchanged.
        assert hasattr(self, 'name_or_path') and os.path.exists(self.name_or_path), \
            'encoder_url is None and name_or_path is missing or does not exist'
        return os.path.join(self.name_or_path, WEIGHTS_NAME)

    def load_image_tokenizer(self):
        """Create ``self._image_tokenizer`` if it does not exist yet (no-op otherwise)."""
        if hasattr(self, '_image_tokenizer'):
            return
        self._image_tokenizer = ImageTokenizer(model_path=self._resolve_encoder_path(),
                                               diffusion_model_path=self.diffusion_path,
                                               load_diffusion=self.load_diffusion,
                                               device=self.device,
                                               fp16=self.fp16)

    @property
    def image_tokenizer(self):
        """The ``ImageTokenizer`` instance, loading it on first access if needed."""
        # Single loading path shared with __init__ (previously duplicated here).
        self.load_image_tokenizer()
        return self._image_tokenizer

    @property
    def num_image_tokens(self):
        """Number of image tokens; hard-coded so it works without loading the image tokenizer."""
        return 8192

    def to(self, device):
        """Record ``device`` and move the image tokenizer (if loaded) to it.

        Returns:
            ``self``, so the call can be chained or re-assigned.

        NOTE(review): only what ``ImageTokenizer.to`` moves is migrated; its
        stored ``device`` attribute is not updated here -- confirm whether
        callers depend on it.
        """
        self.device = device
        if hasattr(self, '_image_tokenizer'):
            self._image_tokenizer.to(device=device)
        return self

    def encode_image(
        self,
        image_path=None,
        image_pil=None,
        image_torch=None,
        image_size: int = 224,
    ):
        """Encode one image source into codebook indices.

        Exactly one of ``image_path``, ``image_pil`` and ``image_torch`` must
        be given.

        Args:
            image_path: Path opened with PIL and converted to RGB.
            image_pil: PIL image, run through the image tokenizer's processor.
            image_torch: Already preprocessed tensor; used as-is.
            image_size: Currently unused; the resize resolution is fixed when
                the image tokenizer is constructed.

        Returns:
            The result of ``ImageTokenizer.encode`` on the tensor moved to
            ``self.device``.

        Raises:
            AssertionError: If zero or more than one source is supplied.
        """
        sources_missing = (image_path is None) + (image_pil is None) + (image_torch is None)
        assert sources_missing == 2, 'Provide exactly one of image_path, image_pil or image_torch'

        if image_path is not None:
            image_pil = Image.open(image_path).convert('RGB')

        if image_pil is not None:
            image_torch = self.image_tokenizer.processor(image_pil)

        image_torch = image_torch.to(self.device)
        return self.image_tokenizer.encode(image_torch)

    def decode_image(self, indices, negative_indices=None, guidance_scale=10, num_inference_steps=20):
        """Decode codebook indices back into images.

        Args:
            indices: Index tensor; moved to ``self.device`` before decoding.
            negative_indices: Optional index tensor, also moved to
                ``self.device``.
            guidance_scale: Forwarded to ``ImageTokenizer.decode``.
            num_inference_steps: Forwarded to ``ImageTokenizer.decode``; the
                default of 20 equals that method's own default.

        Returns:
            Whatever ``ImageTokenizer.decode`` returns.
        """
        indices = indices.to(self.device)
        if negative_indices is not None:
            negative_indices = negative_indices.to(self.device)

        return self.image_tokenizer.decode(
            indices,
            negative_indices=negative_indices,
            guidance_scale=guidance_scale,
            num_inference_steps=num_inference_steps,
        )
|