Surya Narayana committed
Commit 9511891 · verified · 1 Parent(s): 43392c4

Upload text_to_image.py

Files changed (1)
  1. text_to_image.py +681 -0
text_to_image.py ADDED
@@ -0,0 +1,681 @@
# -*- coding: utf-8 -*-
"""text-to-image.ipynb

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/drive/1OcehPd4sJRgAE0kaYV9y8oTf0G0VElbU
"""

# Commented out IPython magic to ensure Python compatibility.
# %pip install -q "openvino>=2023.1.0"
# %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "diffusers[torch]>=0.9.0"
# %pip install -q "huggingface-hub>=0.9.1"
# %pip install -q gradio
# %pip install -q transformers
# %pip install kaleido cohere openai tiktoken
# %pip install typing-extensions==3.10.0.2
# %pip install diffusers transformers

from diffusers import StableDiffusionPipeline
import gc

pipe = StableDiffusionPipeline.from_pretrained("prompthero/openjourney").to("cpu")
text_encoder = pipe.text_encoder
text_encoder.eval()
unet = pipe.unet
unet.eval()
vae = pipe.vae
vae.eval()

del pipe
gc.collect()

from pathlib import Path
import torch
import openvino as ov

TEXT_ENCODER_OV_PATH = Path("text_encoder.xml")


def cleanup_torchscript_cache():
    """
    Helper for removing cached model representation
    """
    torch._C._jit_clear_class_registry()
    torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore()
    torch.jit._state._clear_class_state()


def convert_encoder(text_encoder: torch.nn.Module, ir_path: Path):
    """
    Convert the Text Encoder model to IR format.
    The function accepts the text encoder model and prepares example inputs for conversion.
    Parameters:
        text_encoder (torch.nn.Module): text encoder model from the Stable Diffusion pipeline
        ir_path (Path): file for storing the model
    Returns:
        None
    """
    input_ids = torch.ones((1, 77), dtype=torch.long)
    # switch model to inference mode
    text_encoder.eval()

    # disable gradient calculation to reduce memory consumption
    with torch.no_grad():
        # export model to IR format
        ov_model = ov.convert_model(text_encoder, example_input=input_ids, input=[(1, 77),])
    ov.save_model(ov_model, ir_path)
    del ov_model
    cleanup_torchscript_cache()
    print(f'Text Encoder successfully converted to IR and saved to {ir_path}')


if not TEXT_ENCODER_OV_PATH.exists():
    convert_encoder(text_encoder, TEXT_ENCODER_OV_PATH)
else:
    print(f"Text encoder will be loaded from {TEXT_ENCODER_OV_PATH}")

del text_encoder
gc.collect()

import numpy as np

UNET_OV_PATH = Path('unet.xml')

dtype_mapping = {
    torch.float32: ov.Type.f32,
    torch.float64: ov.Type.f64
}


def convert_unet(unet: torch.nn.Module, ir_path: Path):
    """
    Convert U-Net model to IR format.
    The function accepts the unet model and prepares example inputs for conversion.
    Parameters:
        unet (torch.nn.Module): unet from the Stable Diffusion pipeline
        ir_path (Path): file for storing the model
    Returns:
        None
    """
    # prepare example inputs: latents, timestep and text-encoder hidden states
    encoder_hidden_state = torch.ones((2, 77, 768))
    latents_shape = (2, 4, 512 // 8, 512 // 8)
    latents = torch.randn(latents_shape)
    t = torch.from_numpy(np.array(1, dtype=float))
    dummy_inputs = (latents, t, encoder_hidden_state)
    input_info = []
    for input_tensor in dummy_inputs:
        shape = ov.PartialShape(tuple(input_tensor.shape))
        element_type = dtype_mapping[input_tensor.dtype]
        input_info.append((shape, element_type))

    unet.eval()
    with torch.no_grad():
        ov_model = ov.convert_model(unet, example_input=dummy_inputs, input=input_info)
    ov.save_model(ov_model, ir_path)
    del ov_model
    cleanup_torchscript_cache()
    print(f'Unet successfully converted to IR and saved to {ir_path}')


if not UNET_OV_PATH.exists():
    convert_unet(unet, UNET_OV_PATH)
    gc.collect()
else:
    print(f"Unet will be loaded from {UNET_OV_PATH}")
del unet
gc.collect()

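# Editor's note: the block below is an illustrative sketch, not part of the original notebook.
# It reads the saved UNet IR without compiling it, which is a cheap way to confirm the static
# shapes baked in during conversion: (2, 4, 64, 64) latents, a scalar timestep and
# (2, 77, 768) text-encoder hidden states.
if UNET_OV_PATH.exists():
    _unet_ir = ov.Core().read_model(str(UNET_OV_PATH))
    for _idx, _inp in enumerate(_unet_ir.inputs):
        print(_idx, _inp.get_partial_shape(), _inp.get_element_type())
    del _unet_ir
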
VAE_ENCODER_OV_PATH = Path("vae_encoder.xml")


def convert_vae_encoder(vae: torch.nn.Module, ir_path: Path):
    """
    Convert the VAE encoder to IR format.
    The function accepts the vae model, creates a wrapper class to export only the part
    needed for inference, and prepares example inputs for conversion.
    Parameters:
        vae (torch.nn.Module): VAE model from the Stable Diffusion pipeline
        ir_path (Path): file for storing the model
    Returns:
        None
    """
    class VAEEncoderWrapper(torch.nn.Module):
        def __init__(self, vae):
            super().__init__()
            self.vae = vae

        def forward(self, image):
            return self.vae.encode(x=image)["latent_dist"].sample()

    vae_encoder = VAEEncoderWrapper(vae)
    vae_encoder.eval()
    image = torch.zeros((1, 3, 512, 512))
    with torch.no_grad():
        ov_model = ov.convert_model(vae_encoder, example_input=image, input=[((1, 3, 512, 512),)])
    ov.save_model(ov_model, ir_path)
    del ov_model
    cleanup_torchscript_cache()
    print(f'VAE encoder successfully converted to IR and saved to {ir_path}')


if not VAE_ENCODER_OV_PATH.exists():
    convert_vae_encoder(vae, VAE_ENCODER_OV_PATH)
else:
    print(f"VAE encoder will be loaded from {VAE_ENCODER_OV_PATH}")

VAE_DECODER_OV_PATH = Path('vae_decoder.xml')


def convert_vae_decoder(vae: torch.nn.Module, ir_path: Path):
    """
    Convert the VAE decoder to IR format.
    The function accepts the vae model, creates a wrapper class to export only the part
    needed for inference, and prepares example inputs for conversion.
    Parameters:
        vae (torch.nn.Module): VAE model from the Stable Diffusion pipeline
        ir_path (Path): file for storing the model
    Returns:
        None
    """
    class VAEDecoderWrapper(torch.nn.Module):
        def __init__(self, vae):
            super().__init__()
            self.vae = vae

        def forward(self, latents):
            return self.vae.decode(latents)

    vae_decoder = VAEDecoderWrapper(vae)
    latents = torch.zeros((1, 4, 64, 64))

    vae_decoder.eval()
    with torch.no_grad():
        ov_model = ov.convert_model(vae_decoder, example_input=latents, input=[((1, 4, 64, 64),)])
    ov.save_model(ov_model, ir_path)
    del ov_model
    cleanup_torchscript_cache()
    print(f'VAE decoder successfully converted to IR and saved to {ir_path}')


if not VAE_DECODER_OV_PATH.exists():
    convert_vae_decoder(vae, VAE_DECODER_OV_PATH)
else:
    print(f"VAE decoder will be loaded from {VAE_DECODER_OV_PATH}")

del vae
gc.collect()

import inspect
from typing import List, Optional, Union, Dict

import PIL
import cv2

from transformers import CLIPTokenizer
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
from openvino.runtime import Model


def scale_fit_to_window(dst_width: int, dst_height: int, image_width: int, image_height: int):
    """
    Preprocessing helper function for calculating the image size for a resize that preserves
    the original aspect ratio and fits the image into a specific window size.

    Parameters:
        dst_width (int): destination window width
        dst_height (int): destination window height
        image_width (int): source image width
        image_height (int): source image height
    Returns:
        result_width (int): calculated width for resize
        result_height (int): calculated height for resize
    """
    im_scale = min(dst_height / image_height, dst_width / image_width)
    return int(im_scale * image_width), int(im_scale * image_height)

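# Editor's note: a quick worked example (not part of the original notebook) of the helper above:
# a 1024x768 image fitted into a 512x512 window keeps the limiting scale of 0.5 and becomes 512x384.
assert scale_fit_to_window(512, 512, 1024, 768) == (512, 384)
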
def preprocess(image: PIL.Image.Image):
    """
    Image preprocessing function. Takes an image in PIL.Image format, resizes it (keeping the
    aspect ratio) to fit the 512x512 model input window, converts it to np.ndarray and pads it
    with zeros on the right or bottom side (depending on the aspect ratio), then casts the data
    to float32, rescales values from [0, 255] to [-1, 1] and finally changes the data layout
    from NHWC to NCHW.
    The function returns the preprocessed input tensor and the padding size, which can be used
    in postprocessing.

    Parameters:
        image (PIL.Image.Image): input image
    Returns:
        image (np.ndarray): preprocessed image tensor
        meta (Dict): dictionary with preprocessing metadata info
    """
    src_width, src_height = image.size
    dst_width, dst_height = scale_fit_to_window(
        512, 512, src_width, src_height)
    image = np.array(image.resize((dst_width, dst_height),
                     resample=PIL.Image.Resampling.LANCZOS))[None, :]
    pad_width = 512 - dst_width
    pad_height = 512 - dst_height
    pad = ((0, 0), (0, pad_height), (0, pad_width), (0, 0))
    image = np.pad(image, pad, mode="constant")
    image = image.astype(np.float32) / 255.0
    image = 2.0 * image - 1.0
    image = image.transpose(0, 3, 1, 2)
    return image, {"padding": pad, "src_width": src_width, "src_height": src_height}

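# Editor's note: an illustrative round trip through preprocess(), not part of the original notebook.
# Any input image comes back as a zero-padded NCHW float32 tensor of shape (1, 3, 512, 512) in the
# [-1, 1] range, together with the padding metadata that postprocess_image() later uses to undo it.
from PIL import Image as _PILImage

_demo_tensor, _demo_meta = preprocess(_PILImage.new("RGB", (640, 480), color=(128, 128, 128)))
assert _demo_tensor.shape == (1, 3, 512, 512)
assert (_demo_meta["src_width"], _demo_meta["src_height"]) == (640, 480)
del _demo_tensor, _demo_meta
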
class OVStableDiffusionPipeline(DiffusionPipeline):
    def __init__(
        self,
        vae_decoder: Model,
        text_encoder: Model,
        tokenizer: CLIPTokenizer,
        unet: Model,
        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
        vae_encoder: Model = None,
    ):
        """
        Pipeline for text-to-image generation using Stable Diffusion.
        Parameters:
            vae_decoder (Model):
                Variational Auto-Encoder (VAE) model to decode images from latent representations.
            vae_encoder (Model, *optional*):
                Variational Auto-Encoder (VAE) model to encode images into latent representations.
            text_encoder (Model):
                Frozen text-encoder. Stable Diffusion uses the text portion of
                [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
                the clip-vit-large-patch14 (https://huggingface.co/openai/clip-vit-large-patch14) variant.
            tokenizer (CLIPTokenizer):
                Tokenizer of class CLIPTokenizer (https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
            unet (Model): Conditional U-Net architecture to denoise the encoded image latents.
            scheduler (SchedulerMixin):
                A scheduler to be used in combination with unet to denoise the encoded image latents. Can be one of
                DDIMScheduler, LMSDiscreteScheduler, or PNDMScheduler.
        """
        super().__init__()
        self.scheduler = scheduler
        self.vae_decoder = vae_decoder
        self.vae_encoder = vae_encoder
        self.text_encoder = text_encoder
        self.unet = unet
        self._text_encoder_output = text_encoder.output(0)
        self._unet_output = unet.output(0)
        self._vae_d_output = vae_decoder.output(0)
        self._vae_e_output = vae_encoder.output(0) if vae_encoder is not None else None
        self.height = 512
        self.width = 512
        self.tokenizer = tokenizer

    def __call__(
        self,
        prompt: Union[str, List[str]],
        image: PIL.Image.Image = None,
        num_inference_steps: Optional[int] = 50,
        negative_prompt: Union[str, List[str]] = None,
        guidance_scale: Optional[float] = 7.5,
        eta: Optional[float] = 0.0,
        output_type: Optional[str] = "pil",
        seed: Optional[int] = None,
        strength: float = 1.0,
        gif: Optional[bool] = False,
        **kwargs,
    ):
        """
        Function invoked when calling the pipeline for generation.
        Parameters:
            prompt (str or List[str]):
                The prompt or prompts to guide the image generation.
            image (PIL.Image.Image, *optional*, None):
                Initial image for generation.
            num_inference_steps (int, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            negative_prompt (str or List[str]):
                The negative prompt or prompts to guide the image generation.
            guidance_scale (float, *optional*, defaults to 7.5):
                Guidance scale as defined in Classifier-Free Diffusion Guidance (https://arxiv.org/abs/2207.12598).
                guidance_scale is defined as `w` of equation 2.
                A higher guidance scale encourages generating images that are closely linked to the text prompt,
                usually at the expense of lower image quality.
            eta (float, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [DDIMScheduler], will be ignored for others.
            output_type (`str`, *optional*, defaults to "pil"):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): PIL.Image.Image or np.array.
            seed (int, *optional*, None):
                Seed for random generator state initialization.
            strength (float, *optional*, defaults to 1.0):
                Amount of noise added to the initial image; only relevant when an initial image is provided.
            gif (bool, *optional*, False):
                Flag for storing all steps results or not.
        Returns:
            Dictionary with keys:
                sample - the last generated image, PIL.Image.Image or np.array
                iterations - *optional* (if gif=True) images for all diffusion steps, List of PIL.Image.Image or np.array.
        """
        if seed is not None:
            np.random.seed(seed)

        img_buffer = []
        do_classifier_free_guidance = guidance_scale > 1.0
        # get prompt text embeddings
        text_embeddings = self._encode_prompt(prompt, do_classifier_free_guidance=do_classifier_free_guidance, negative_prompt=negative_prompt)

        # set timesteps
        accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
        extra_set_kwargs = {}
        if accepts_offset:
            extra_set_kwargs["offset"] = 1

        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
        latent_timestep = timesteps[:1]

        # get the initial random noise unless the user supplied it
        latents, meta = self.prepare_latents(image, latent_timestep)

        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in the DDIM paper: https://arxiv.org/abs/2010.02502
        # and should be between [0, 1]
        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        for i, t in enumerate(self.progress_bar(timesteps)):
            # expand the latents if you are doing classifier free guidance
            latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual
            noise_pred = self.unet([latent_model_input, t, text_embeddings])[self._unet_output]
            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1]
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs)["prev_sample"].numpy()
            if gif:
                image = self.vae_decoder(latents * (1 / 0.18215))[self._vae_d_output]
                image = self.postprocess_image(image, meta, output_type)
                img_buffer.extend(image)

        # scale and decode the image latents with the vae
        image = self.vae_decoder(latents * (1 / 0.18215))[self._vae_d_output]

        image = self.postprocess_image(image, meta, output_type)
        return {"sample": image, 'iterations': img_buffer}

    def _encode_prompt(self, prompt: Union[str, List[str]], num_images_per_prompt: int = 1, do_classifier_free_guidance: bool = True, negative_prompt: Union[str, List[str]] = None):
        """
        Encodes the prompt into text encoder hidden states.

        Parameters:
            prompt (str or list(str)): prompt to be encoded
            num_images_per_prompt (int): number of images that should be generated per prompt
            do_classifier_free_guidance (bool): whether to use classifier free guidance or not
            negative_prompt (str or list(str)): negative prompt to be encoded
        Returns:
            text_embeddings (np.ndarray): text encoder hidden states
        """
        batch_size = len(prompt) if isinstance(prompt, list) else 1

        # tokenize input prompts
        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="np",
        )
        text_input_ids = text_inputs.input_ids

        text_embeddings = self.text_encoder(
            text_input_ids)[self._text_encoder_output]

        # duplicate text embeddings for each generation per prompt
        if num_images_per_prompt != 1:
            bs_embed, seq_len, _ = text_embeddings.shape
            text_embeddings = np.tile(
                text_embeddings, (1, num_images_per_prompt, 1))
            text_embeddings = np.reshape(
                text_embeddings, (bs_embed * num_images_per_prompt, seq_len, -1))

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance:
            uncond_tokens: List[str]
            max_length = text_input_ids.shape[-1]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt]
            else:
                uncond_tokens = negative_prompt
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="np",
            )

            uncond_embeddings = self.text_encoder(uncond_input.input_ids)[self._text_encoder_output]

            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = uncond_embeddings.shape[1]
            uncond_embeddings = np.tile(uncond_embeddings, (1, num_images_per_prompt, 1))
            uncond_embeddings = np.reshape(uncond_embeddings, (batch_size * num_images_per_prompt, seq_len, -1))

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])

        return text_embeddings

    def prepare_latents(self, image: PIL.Image.Image = None, latent_timestep: torch.Tensor = None):
        """
        Function for getting initial latents for starting generation

        Parameters:
            image (PIL.Image.Image, *optional*, None):
                Input image for generation; if not provided, random noise will be used as the starting point
            latent_timestep (torch.Tensor, *optional*, None):
                Initial timestep predicted by the scheduler, required for mixing the latent image with noise
        Returns:
            latents (np.ndarray):
                Image encoded in latent space
        """
        latents_shape = (1, 4, self.height // 8, self.width // 8)
        noise = np.random.randn(*latents_shape).astype(np.float32)
        if image is None:
            # if you use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas
            if isinstance(self.scheduler, LMSDiscreteScheduler):
                noise = noise * self.scheduler.sigmas[0].numpy()
            return noise, {}
        input_image, meta = preprocess(image)
        latents = self.vae_encoder(input_image)[self._vae_e_output] * 0.18215
        latents = self.scheduler.add_noise(torch.from_numpy(latents), torch.from_numpy(noise), latent_timestep).numpy()
        return latents, meta

    def postprocess_image(self, image: np.ndarray, meta: Dict, output_type: str = "pil"):
        """
        Postprocessing for decoded images. Takes the generated image decoded by the VAE decoder, unpads it to the
        initial image size (if required), normalizes it and converts it to the [0, 255] pixel range. Optionally,
        converts it from np.ndarray to PIL.Image format.

        Parameters:
            image (np.ndarray):
                Generated image
            meta (Dict):
                Metadata obtained on the latents preparation step, can be empty
            output_type (str, *optional*, pil):
                Output format for result, can be pil or numpy
        Returns:
            image (List of np.ndarray or PIL.Image.Image):
                Postprocessed images
        """
        if "padding" in meta:
            pad = meta["padding"]
            (_, end_h), (_, end_w) = pad[1:3]
            h, w = image.shape[2:]
            unpad_h = h - end_h
            unpad_w = w - end_w
            image = image[:, :, :unpad_h, :unpad_w]
        image = np.clip(image / 2 + 0.5, 0, 1)
        image = np.transpose(image, (0, 2, 3, 1))
        # convert to PIL if requested
        if output_type == "pil":
            image = self.numpy_to_pil(image)
            if "src_height" in meta:
                orig_height, orig_width = meta["src_height"], meta["src_width"]
                image = [img.resize((orig_width, orig_height),
                                    PIL.Image.Resampling.LANCZOS) for img in image]
        else:
            if "src_height" in meta:
                orig_height, orig_width = meta["src_height"], meta["src_width"]
                image = [cv2.resize(img, (orig_width, orig_height))
                         for img in image]
        return image

    def get_timesteps(self, num_inference_steps: int, strength: float):
        """
        Helper function for getting scheduler timesteps for generation.
        In case of image-to-image generation, it updates the number of steps according to strength.

        Parameters:
            num_inference_steps (int):
                number of inference steps for generation
            strength (float):
                value between 0.0 and 1.0 that controls the amount of noise that is added to the input image.
                Values approaching 1.0 enable lots of variation but will also produce images that are not semantically consistent with the input.
        """
        # get the original timestep using init_timestep
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)

        t_start = max(num_inference_steps - init_timestep, 0)
        timesteps = self.scheduler.timesteps[t_start:]

        return timesteps, num_inference_steps - t_start

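# Editor's note: a small self-contained check (not part of the original notebook) of the
# strength/step arithmetic used by get_timesteps() above; it needs no models or scheduler.
def _expected_num_steps(num_inference_steps: int, strength: float) -> int:
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
    t_start = max(num_inference_steps - init_timestep, 0)
    return num_inference_steps - t_start

assert _expected_num_steps(50, 1.0) == 50  # text-to-image default: every timestep is used
assert _expected_num_steps(50, 0.4) == 20  # image-to-image: only the last 20 timesteps run
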
core = ov.Core()

"""Select device from dropdown list for running inference using OpenVINO."""

import ipywidgets as widgets

device = widgets.Dropdown(
    options=core.available_devices + ["AUTO"],
    value='CPU',
    description='Device:',
    disabled=False,
)

device

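# Editor's note: when this file is executed as a plain script (outside Jupyter/Colab), the bare
# `device` expression above does not render a widget and the dropdown simply keeps its default
# value ("CPU"). A possible override, assuming an environment variable named OV_DEVICE, would be:
#
#     import os
#     device_value = os.environ.get("OV_DEVICE", device.value)
#
# and then passing `device_value` instead of `device.value` to the compile_model() calls below.
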
text_enc = core.compile_model(TEXT_ENCODER_OV_PATH, device.value)

unet_model = core.compile_model(UNET_OV_PATH, device.value)

ov_config = {"INFERENCE_PRECISION_HINT": "f32"} if device.value != "CPU" else {}

vae_decoder = core.compile_model(VAE_DECODER_OV_PATH, device.value, ov_config)
vae_encoder = core.compile_model(VAE_ENCODER_OV_PATH, device.value, ov_config)

"""The tokenizer and the scheduler are also important parts of the pipeline. Let us define them and put all components together."""

from transformers import CLIPTokenizer
from diffusers.schedulers import LMSDiscreteScheduler

lms = LMSDiscreteScheduler(
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear"
)
tokenizer = CLIPTokenizer.from_pretrained('openai/clip-vit-large-patch14')

ov_pipe = OVStableDiffusionPipeline(
    tokenizer=tokenizer,
    text_encoder=text_enc,
    unet=unet_model,
    vae_encoder=vae_encoder,
    vae_decoder=vae_decoder,
    scheduler=lms
)

"""### Text-to-Image generation

Now, you can define a text prompt for image generation and run the inference pipeline.
Optionally, you can also change the random generator seed for latent state initialization and the number of steps.

> **Note**: Consider increasing `steps` to get more precise results. A suggested value is `50`, but it will take more time to process.
"""

import ipywidgets as widgets

sample_text = ('cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting, epic composition. '
               'A golden daylight, hyper-realistic environment. '
               'Hyper and intricate detail, photo-realistic. '
               'Cinematic and volumetric light. '
               'Epic concept art. '
               'Octane render and Unreal Engine, trending on artstation')
text_prompt = widgets.Text(value=sample_text, description='your text')
num_steps = widgets.IntSlider(min=1, max=50, value=20, description='steps:')
seed = widgets.IntSlider(min=0, max=10000000, description='seed: ', value=42)
widgets.VBox([text_prompt, seed, num_steps])

print('Pipeline settings')
print(f'Input text: {text_prompt.value}')
print(f'Seed: {seed.value}')
print(f'Number of steps: {num_steps.value}')

result = ov_pipe(text_prompt.value, num_inference_steps=num_steps.value, seed=seed.value)

"""Finally, let us save the generation results.
The pipeline returns several results: `sample` contains the final generated image, `iterations` contains the list of intermediate results for each step.
"""

final_image = result['sample'][0]
if result['iterations']:
    all_frames = result['iterations']
    img = next(iter(all_frames))
    img.save(fp='result.gif', format='GIF', append_images=iter(all_frames), save_all=True, duration=len(all_frames) * 5, loop=0)
final_image.save('result.png')

"""Now it is show time!"""

import ipywidgets as widgets
from IPython.display import display

text = '\n\t'.join(text_prompt.value.split('.'))
print("Input text:")
print("\t" + text)
display(final_image)

"""Nice. As you can see, the picture has quite a high definition 🔥."""

import gradio as gr


def generate_from_text(text, seed, num_steps, _=gr.Progress(track_tqdm=True)):
    result = ov_pipe(text, num_inference_steps=num_steps, seed=seed)
    return result["sample"][0]


with gr.Blocks() as demo:
    with gr.Tab("Text-to-Image generation"):
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(lines=3, label="Text")
                seed_input = gr.Slider(0, 10000000, value=42, label="Seed")
                steps_input = gr.Slider(1, 50, value=20, step=1, label="Steps")
            out = gr.Image(label="Result", type="pil")
        btn = gr.Button()
        btn.click(generate_from_text, [text_input, seed_input, steps_input], out)

# The "Image-to-Image generation" tab from the original notebook has been removed.

try:
    demo.launch(debug=True)
except Exception:
    demo.launch(share=True, debug=True)

# Commented out IPython shell commands (not valid in a plain Python script):
# !ls  # List files in the current directory

# !echo "Hello, World!"  # Print a message

# !gradio deploy