Spaces:
Runtime error
Runtime error
AYYasaswini
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -147,10 +147,10 @@ def latents_to_pil(latents):
|
|
147 |
|
148 |
If you uncomment the cell below you'll see that in this case the `scheduler.add_noise` function literally just adds noise scaled by sigma: `noisy_samples = original_samples + noise * sigmas`
|
149 |
"""
|
150 |
-
encoded = pil_to_latent(input_image)
|
151 |
-
encoded.shape
|
152 |
-
decoded = latents_to_pil(encoded)[0]
|
153 |
-
decoded
|
154 |
# ??scheduler.add_noise
|
155 |
|
156 |
"""Other diffusion models may be trained with different noising and scheduling approaches, some of which keep the variance fairly constant across noise levels ('variance preserving') with different scaling and mixing tricks instead of having noisy latents with higher and higher variance as more noise is added ('variance exploding').
|
@@ -170,57 +170,6 @@ To noise our image we'll use code like that shown above, using the scheduler to
|
|
170 |
"""
|
171 |
|
172 |
# Settings (same as before except for the new prompt)
|
173 |
-
prompt = ["A colorful dancer, nat geo photo"]
|
174 |
-
height = 512 # default height of Stable Diffusion
|
175 |
-
width = 512 # default width of Stable Diffusion
|
176 |
-
num_inference_steps = 50 # Number of denoising steps
|
177 |
-
guidance_scale = 8 # Scale for classifier-free guidance
|
178 |
-
generator = torch.manual_seed(32) # Seed generator to create the initial latent noise
|
179 |
-
batch_size = 1
|
180 |
-
|
181 |
-
# Prep text (same as before)
|
182 |
-
text_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
|
183 |
-
with torch.no_grad():
|
184 |
-
text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
|
185 |
-
max_length = text_input.input_ids.shape[-1]
|
186 |
-
uncond_input = tokenizer(
|
187 |
-
[""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
|
188 |
-
)
|
189 |
-
with torch.no_grad():
|
190 |
-
uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
|
191 |
-
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
|
192 |
-
|
193 |
-
# Prep Scheduler (setting the number of inference steps)
|
194 |
-
set_timesteps(scheduler, num_inference_steps)
|
195 |
-
|
196 |
-
# Prep latents (noising appropriately for start_step)
|
197 |
-
start_step = 10
|
198 |
-
start_sigma = scheduler.sigmas[start_step]
|
199 |
-
noise = torch.randn_like(encoded)
|
200 |
-
latents = scheduler.add_noise(encoded, noise, timesteps=torch.tensor([scheduler.timesteps[start_step]]))
|
201 |
-
latents = latents.to(torch_device).float()
|
202 |
-
|
203 |
-
# Loop
|
204 |
-
for i, t in tqdm(enumerate(scheduler.timesteps), total=len(scheduler.timesteps)):
|
205 |
-
if i >= start_step: # << This is the only modification to the loop we do
|
206 |
-
|
207 |
-
# expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
|
208 |
-
latent_model_input = torch.cat([latents] * 2)
|
209 |
-
sigma = scheduler.sigmas[i]
|
210 |
-
latent_model_input = scheduler.scale_model_input(latent_model_input, t)
|
211 |
-
|
212 |
-
# predict the noise residual
|
213 |
-
with torch.no_grad():
|
214 |
-
noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"]
|
215 |
-
|
216 |
-
# perform guidance
|
217 |
-
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
218 |
-
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
219 |
-
|
220 |
-
# compute the previous noisy sample x_t -> x_t-1
|
221 |
-
latents = scheduler.step(noise_pred, t, latents).prev_sample
|
222 |
-
|
223 |
-
latents_to_pil(latents)[0]
|
224 |
|
225 |
"""You can see that some colours and structure from the image are kept, but we now have a new picture! The more noise you add and the more steps you do, the further away it gets from the input image.
|
226 |
|
|
|
147 |
|
148 |
If you uncomment the cell below you'll see that in this case the `scheduler.add_noise` function literally just adds noise scaled by sigma: `noisy_samples = original_samples + noise * sigmas`
|
149 |
"""
|
150 |
+
#encoded = pil_to_latent(input_image)
|
151 |
+
#encoded.shape
|
152 |
+
#decoded = latents_to_pil(encoded)[0]
|
153 |
+
#decoded
|
154 |
# ??scheduler.add_noise
|
155 |
|
156 |
"""Other diffusion models may be trained with different noising and scheduling approaches, some of which keep the variance fairly constant across noise levels ('variance preserving') with different scaling and mixing tricks instead of having noisy latents with higher and higher variance as more noise is added ('variance exploding').
|
|
|
170 |
"""
|
171 |
|
172 |
# Settings (same as before except for the new prompt)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
173 |
|
174 |
"""You can see that some colours and structure from the image are kept, but we now have a new picture! The more noise you add and the more steps you do, the further away it gets from the input image.
|
175 |
|