blairzheng committed on
Commit
53339de
1 Parent(s): b94dfeb

add cond_kl

Files changed (7)
  1. App.py +10 -3
  2. ExtraBlock.js +11 -4
  3. README.md +1 -1
  4. RenderMarkdown.py +30 -2
  5. RenderMarkdownEn.py +184 -72
  6. RenderMarkdownZh.py +192 -73
  7. data.json +0 -0
App.py CHANGED
@@ -13,7 +13,8 @@ from DPMInteractive import fixed_point_init_change, fixed_point_apply_iterate
13
  from DPMInteractive import forward_plot_part, backward_plot_part, fit_plot_part, fixed_plot_part
14
  from RenderMarkdown import md_introduction_block, md_transform_block, md_likelihood_block, md_posterior_block
15
  from RenderMarkdown import md_forward_process_block, md_backward_process_block, md_fit_posterior_block
16
- from RenderMarkdown import md_posterior_transform_block, md_deconvolution_block, md_reference_block, md_about_block
 
17
  from Misc import g_css, js_head, js_load
18
 
19
 
@@ -314,7 +315,7 @@ def run_app():
314
  md_introduction_block()
315
 
316
  md_transform_block()
317
-
318
  rets = transform_block()
319
  trans_param = rets
320
 
@@ -345,10 +346,16 @@ def run_app():
345
 
346
  md_deconvolution_block()
347
 
348
  md_reference_block()
349
 
350
  md_about_block()
351
 
352
  # running initiation consecutively because of the bug of multithreading rendering mathtext in matplotlib
353
  demo.load(trans_param["method"], trans_param["inputs"], trans_param["outputs"], show_progress="minimal").\
354
  then(cond_param["method"], cond_param["inputs"], cond_param["outputs"], show_progress="minimal"). \
@@ -387,7 +394,7 @@ def gtx():
387
  md_reference_block()
388
 
389
  md_about_block()
390
-
391
  demo.queue()
392
  demo.launch(allowed_paths=["/"])
393
  return
 
13
  from DPMInteractive import forward_plot_part, backward_plot_part, fit_plot_part, fixed_plot_part
14
  from RenderMarkdown import md_introduction_block, md_transform_block, md_likelihood_block, md_posterior_block
15
  from RenderMarkdown import md_forward_process_block, md_backward_process_block, md_fit_posterior_block
16
+ from RenderMarkdown import md_posterior_transform_block, md_deconvolution_block, md_cond_kl_block, md_proof_ctr_block
17
+ from RenderMarkdown import md_reference_block, md_about_block
18
  from Misc import g_css, js_head, js_load
19
 
20
 
 
315
  md_introduction_block()
316
 
317
  md_transform_block()
318
+
319
  rets = transform_block()
320
  trans_param = rets
321
 
 
346
 
347
  md_deconvolution_block()
348
 
349
+ md_cond_kl_block()
350
+
351
+ md_proof_ctr_block()
352
+
353
  md_reference_block()
354
 
355
  md_about_block()
356
 
357
+ gr.Markdown("<div><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br></div>", visible=True)
358
+
359
  # run the initializations consecutively because of a bug when rendering mathtext in matplotlib from multiple threads
360
  demo.load(trans_param["method"], trans_param["inputs"], trans_param["outputs"], show_progress="minimal").\
361
  then(cond_param["method"], cond_param["inputs"], cond_param["outputs"], show_progress="minimal"). \
 
394
  md_reference_block()
395
 
396
  md_about_block()
397
+
398
  demo.queue()
399
  demo.launch(allowed_paths=["/"])
400
  return
ExtraBlock.js CHANGED
@@ -3,14 +3,14 @@
3
 
4
  async function write_markdown() {
5
  let names = ["introduction", "transform", "likelihood", "posterior", "forward_process", "backward_process",
6
- "fit_posterior", "posterior_transform", "deconvolution", "reference", "about"];
7
  // names = names.slice(-1)
8
 
9
  let data = await fetch("file/data.json").then(response => response.json());
10
 
11
  names.forEach((name, index) => {
12
  let elem_zh = document.getElementById("md_" + name + "_zh");
13
- if (elem_zh != null) { data[name+"_zh"] = elem_zh.outerHTML; }
14
 
15
  const elem_en = document.getElementById("md_" + name + "_en");
16
  if (elem_en != null) { data[name+"_en"] = elem_en.outerHTML; }
@@ -25,7 +25,7 @@ async function write_markdown() {
25
 
26
  async function insert_markdown() {
27
  let names = ["introduction", "transform", "likelihood", "posterior", "forward_process", "backward_process",
28
- "fit_posterior", "posterior_transform", "deconvolution", "reference", "about"];
29
 
30
  let data = await fetch("file/data.json").then(response => response.json());
31
 
@@ -54,7 +54,7 @@ async function insert_markdown() {
54
 
55
  function control_language() {
56
  const names = ["introduction", "transform", "likelihood", "posterior", "forward_process",
57
- "backward_process", "fit_posterior", "posterior_transform", "deconvolution", "reference", "about"];
58
 
59
  var is_zh = document.getElementById("switch_language").checked;
60
  for (let i = 0; i < names.length; i++) {
@@ -142,6 +142,7 @@ function katex_render(name) {
142
  if (elem == null) { return; }
143
  var text = elem.innerText.replaceAll("{underline}", "_");
144
  text = "\\begin{align}\n" + text + "\n\\end{align}";
 
145
  katex.render(text, elem, {displayMode: true});
146
  }
147
 
@@ -150,7 +151,13 @@ function insert_special_formula() {
150
  katex_render("zh_fit_0");
151
  katex_render("zh_fit_1");
152
  katex_render("zh_fit_2");
153
  katex_render("en_fit_0");
154
  katex_render("en_fit_1");
155
  katex_render("en_fit_2");
156
  }
 
3
 
4
  async function write_markdown() {
5
  let names = ["introduction", "transform", "likelihood", "posterior", "forward_process", "backward_process",
6
+ "fit_posterior", "posterior_transform", "deconvolution", "cond_kl", "proof_ctr", "reference", "about"];
7
  // names = names.slice(-1)
8
 
9
  let data = await fetch("file/data.json").then(response => response.json());
10
 
11
  names.forEach((name, index) => {
12
  let elem_zh = document.getElementById("md_" + name + "_zh");
13
+ if (elem_zh != null) { data[name+"_zh"] = elem_zh.outerHTML; console.log(name);}
14
 
15
  const elem_en = document.getElementById("md_" + name + "_en");
16
  if (elem_en != null) { data[name+"_en"] = elem_en.outerHTML; }
 
25
 
26
  async function insert_markdown() {
27
  let names = ["introduction", "transform", "likelihood", "posterior", "forward_process", "backward_process",
28
+ "fit_posterior", "posterior_transform", "deconvolution", "cond_kl", "proof_ctr", "reference", "about"];
29
 
30
  let data = await fetch("file/data.json").then(response => response.json());
31
 
 
54
 
55
  function control_language() {
56
  const names = ["introduction", "transform", "likelihood", "posterior", "forward_process",
57
+ "backward_process", "fit_posterior", "posterior_transform", "deconvolution", "cond_kl", "proof_ctr", "reference", "about"];
58
 
59
  var is_zh = document.getElementById("switch_language").checked;
60
  for (let i = 0; i < names.length; i++) {
 
142
  if (elem == null) { return; }
143
  var text = elem.innerText.replaceAll("{underline}", "_");
144
  text = "\\begin{align}\n" + text + "\n\\end{align}";
145
+
146
  katex.render(text, elem, {displayMode: true});
147
  }
148
 
 
151
  katex_render("zh_fit_0");
152
  katex_render("zh_fit_1");
153
  katex_render("zh_fit_2");
154
+ katex_render("zh_cond_kl_1");
155
+ katex_render("zh_cond_kl_2");
156
+ katex_render("zh_cond_kl_3");
157
  katex_render("en_fit_0");
158
  katex_render("en_fit_1");
159
  katex_render("en_fit_2");
160
+ katex_render("en_cond_kl_1");
161
+ katex_render("en_cond_kl_2");
162
+ katex_render("en_cond_kl_3");
163
  }
README.md CHANGED
@@ -19,7 +19,7 @@ license: apache-2.0
19
 
20
  This is a Web App which provides a special introduction about the Diffusion Probability Model with interactive demos.
21
 
22
- How to run
23
  <ol>
24
  <li> Open <a href="https://huggingface.co/spaces/blairzheng/DPMInteractive">HuggingFace space</a> in browser directly. </li>
25
  <li> Running locally, recommended.
 
19
 
20
  This is a Web App which provides an intuitive introduction to the Diffusion Probability Model with interactive demos.
21
 
22
+ ### How to run
23
  <ol>
24
  <li> Open <a href="https://huggingface.co/spaces/blairzheng/DPMInteractive">HuggingFace space</a> in browser directly. </li>
25
  <li> Run locally (recommended).
RenderMarkdown.py CHANGED
@@ -3,11 +3,13 @@ import gradio as gr
3
 
4
  from RenderMarkdownZh import md_introduction_zh, md_transform_zh, md_likelihood_zh, md_posterior_zh
5
  from RenderMarkdownZh import md_forward_process_zh, md_backward_process_zh, md_fit_posterior_zh
6
- from RenderMarkdownZh import md_posterior_transform_zh, md_deconvolution_zh, md_reference_zh, md_about_zh
 
7
 
8
  from RenderMarkdownEn import md_introduction_en, md_transform_en, md_likelihood_en, md_posterior_en
9
  from RenderMarkdownEn import md_forward_process_en, md_backward_process_en, md_fit_posterior_en
10
- from RenderMarkdownEn import md_posterior_transform_en, md_deconvolution_en, md_reference_en, md_about_en
 
11
 
12
 
13
  def md_introduction_block(md_type="offline"):
@@ -122,6 +124,32 @@ def md_deconvolution_block(md_type="offline"):
122
  return
123
 
124
 
125
  def md_reference_block(md_type="offline"):
126
  if md_type == "offline":
127
  gr.Accordion(label="Reference", elem_classes="first_md", elem_id="reference")
 
3
 
4
  from RenderMarkdownZh import md_introduction_zh, md_transform_zh, md_likelihood_zh, md_posterior_zh
5
  from RenderMarkdownZh import md_forward_process_zh, md_backward_process_zh, md_fit_posterior_zh
6
+ from RenderMarkdownZh import md_posterior_transform_zh, md_deconvolution_zh, md_cond_kl_zh, md_proof_ctr_zh
7
+ from RenderMarkdownZh import md_reference_zh, md_about_zh
8
 
9
  from RenderMarkdownEn import md_introduction_en, md_transform_en, md_likelihood_en, md_posterior_en
10
  from RenderMarkdownEn import md_forward_process_en, md_backward_process_en, md_fit_posterior_en
11
+ from RenderMarkdownEn import md_posterior_transform_en, md_deconvolution_en, md_cond_kl_en, md_proof_ctr_en
12
+ from RenderMarkdownEn import md_reference_en, md_about_en
13
 
14
 
15
  def md_introduction_block(md_type="offline"):
 
124
  return
125
 
126
 
127
+ def md_cond_kl_block(md_type="offline"):
128
+ if md_type == "offline":
129
+ title = "Appendix A Conditional KL Divergence"
130
+ gr.Accordion(label=title, elem_classes="first_md", elem_id="cond_kl")
131
+ elif md_type == "zh":
132
+ md_cond_kl_zh()
133
+ elif md_type == "en":
134
+ md_cond_kl_en()
135
+ else:
136
+ raise NotImplementedError
137
+ return
138
+
139
+
140
+ def md_proof_ctr_block(md_type="offline"):
141
+ if md_type == "offline":
142
+ title = "Appendix B Proof of Contraction"
143
+ gr.Accordion(label=title, elem_classes="first_md", elem_id="proof_ctr")
144
+ elif md_type == "zh":
145
+ md_proof_ctr_zh()
146
+ elif md_type == "en":
147
+ md_proof_ctr_en()
148
+ else:
149
+ raise NotImplementedError
150
+ return
151
+
152
+
153
  def md_reference_block(md_type="offline"):
154
  if md_type == "offline":
155
  gr.Accordion(label="Reference", elem_classes="first_md", elem_id="reference")
RenderMarkdownEn.py CHANGED
@@ -12,7 +12,7 @@ def md_introduction_en():
12
 
13
  gr.Markdown(
14
  r"""
15
- The Diffusion Probability Model[\\[1\\]](#dpm)[\\[2\\]](#ddpm) is currently the main method used in image and video generation, but due to its abstruse theory, many engineers are unable to understand it well. This article will provide a very easy-to-understand method to help readers grasp the principles of the Diffusion Model. Specifically, it will illustrate the Diffusion Model using examples of one-dimensional random variables in an interactive way, explaining several interesting properties of the Diffusion Model in an intuitive manner.
16
 
17
  The diffusion model is a probabilistic model. Probabilistic models mainly offer two functions: calculating the probability of a given sample appearing; and generating new samples. The diffusion model focuses on the latter aspect, facilitating the production of new samples, thus realizing the task of **generation**.
18
 
@@ -43,11 +43,11 @@ def md_transform_en():
43
 
44
  The first sub-transformation performs a linear transformation ($\sqrt{\alpha}X$) on the random variable $X$. According to the conclusion of the literature[\[3\]](#linear_transform), the linear transformation makes the probability distribution of $X$ **narrower and taller**, and the extent of **narrowing and heightening** is directly proportional to the value of $\alpha$.
45
 
46
- This can be specifically seen in Demo 1, where the first figure depicts a randomly generated one-dimensional probability distribution, and the second figure represents the probability distribution after the linear transformation. It can be observed that the curve of the third figure has become **narrower and taller** compared to the first image. Readers can experiment with different $\alpha$ to gain a more intuitive understanding.
47
 
48
  The second sub-transformation is **adding independent random noise**($\sqrt{1-\alpha}\epsilon$). According to the conclusion of the literature[\[4\]](#sum_conv), **adding independent random variables** is equivalent to performing convolution on the two probability distributions. Since the probability distribution of random noise is Gaussian, it is equivalent to performing a **Gaussian Blur** operation. After blurring, the original probability distribution will become smoother and more similar to the standard normal distribution. The degree of blurring is directly proportional to the noise level ($\sqrt{1-\alpha}$).
49
 
50
- For specifics, one can see Demo 1, where the first figure is a randomly generated one-dimensional probability distribution, and the third figure is the result after the transformation. It can be seen that the transformed probability distribution curve is smoother and there are fewer corners. The readers can test different $\alpha$ values to feel how the noise level affect the shape of the probability distribution. The last figure is the result after applying all two sub-transformations.
51
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_transform_en")
52
  return
53
 
@@ -63,7 +63,7 @@ def md_likelihood_en():
63
  \begin{align}
64
  q(z|x) &= \mathcal{N}(\sqrt{\alpha}x,\ 1-\alpha) \tag{2.1}
65
  \end{align}
66
- It can be understood by concrete examples in Demo 2. The third figure depict the shape of $q(z|x)$. From the figure, a uniform slanting line can be observed. This implies that the mean of $q(z|x)$ is linearly related to x, and the variance is fixed. The magnitude of $\alpha$ will determine the width and incline of the slanting line.
67
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_likelihood_en")
68
  return
69
 
@@ -103,13 +103,13 @@ def md_posterior_en():
103
  <li>When the variance of the Gaussian function is large (large noise), or when $q(x)$ changes drastically, the shape of $q(x|z)$ will be more complex, and greatly differ from a Gaussian function, which makes it difficult to model and learn.</li>
104
  </ul>
105
 
106
- The specifics can be seen in Demo 2. The fourth figure present the shape of the posterior $q(x|z)$, which shows an irregular shape and resembles a curved and uneven line. As $\alpha$ increases (noise decreases), the curve tends to be uniform and straight. Readers can adjust different $\alpha$ values and observe the relationship between the shape of posterior and the level of noise. In the last figure, the $\textcolor{blue}{\text{blue dash line}}$ represents $q(x)$, the $\textcolor{green}{\text{green dash line}}$ represents <b>GaussFun</b> in the equation 3.4, and the $\textcolor{orange}{\text{orange curve}}$ represents the result of multiplying the two function and normalizing it, which is the posterior probability $q(x|z=fixed)$ under a fixed z condition. Readers can adjust different values of z to observe how the fluctuation of $q(x)$ affect the shape of the posterior probability $q(x|z)$.
107
 
108
  The posterior $q(x|z)$ under two special states are worth considering.
109
  <ul>
110
- <li>As $\alpha \to 0$, the variance of <b>GaussFun</b> tends to <b>$\infty$</b>, and $q(x|z)$ for different $z$ almost become identical, and almost the same as $q(x)$. Readers can set $\alpha$ to 0.001 in Demo 2 to observe the specific results.</li>
111
 
112
- <li>As $\alpha \to 1$, the variance of <b>GaussFun</b> tends to <b>$0$</b>, The $q(x|z)$ for different $z$ values contract into a series of <em>Dirac delta functions</em> with different offsets equalling to $z$. However, there are some exceptions. When there are regions where $q(x)$ is zero, the corresponding $q(x|z)$ will no longer be a Dirac <em>delta function</em>, but a zero function. Readers can set $\alpha$ to 0.999 in Demo 2 to observe the specific results.</li>
113
  </ul>
114
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_en")
115
  return
@@ -125,11 +125,11 @@ def md_forward_process_en():
125
  r"""
126
  For any arbitrary data distribution $q(x)$, the transform(equation 2.1) in section 2 can be continuously applied(equation 4.1~4.4). As the number of transforms increases, the output probability distribution will become increasingly closer to the standard normal distribution. For more complex data distributions, more iterations or larger noise are needed.
127
 
128
- Specific details can be observed in Demo3. The first figure illustrates a randomly generated one-dimensional probability distribution. After seven transforms, this distribution looks very similar to the standard normal distribution. The degree of similarity increases with the number of iterations and the level of the noise. Given the same degree of similarity, fewer transforms are needed if the noise added at each step is larger (smaller $\alpha$ value). Readers can try different $\alpha$ values and numbers of transforms to see how similar the final probability distribution is.
129
 
130
  The complexity of the initial probability distribution tends to be high, but as the number of transforms increases, the complexity of the probability distribution $q(z_t)$ will decrease. As concluded in section 4, a more complex probability distribution corresponds to a more complex posterior probability distribution. Therefore, in order to ensure that the posterior probability distribution is more similar to the Conditional Gaussian function (easier to learn), a larger value of $\alpha$ (smaller noise) should be used in the initial phase, and a smaller value of $\alpha$ (larger noise) can be appropriately used in the later phase to accelerate the transition to the standard normal distribution.
131
 
132
- In the example of Demo 3.1, it can be seen that as the number of transforms increases, the corners of $q(z_t)$ become fewer and fewer. Meanwhile, the slanting lines in the plot of the posterior probability distribution $q(z_{t-1}|z_t)$ become increasingly straight and uniform, resembling more and more the conditional Gaussian distribution.
133
 
134
  \begin{align}
135
  Z_1 &= \sqrt{\alpha_1} X + \sqrt{1-\alpha_1}\epsilon_1 \tag{4.1} \newline
@@ -160,7 +160,7 @@ def md_forward_process_en():
160
  q(z_T|x) &= \mathcal{N}(0.00635\ x,\ 0.99998) \tag{4.9}
161
  \end{align}
162
 
163
- If considering only $q(z_T)$, a single transformation can also be used, which is as follows:
164
  \begin{align}
165
  Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{4.10}
166
  \end{align}
@@ -179,7 +179,7 @@ def md_backward_process_en():
179
  r"""
180
  If the final probability distribution $q(z_T)$ and the posterior probabilities of each transform $q(x|z),q(z_{t-1}|z_t)$ are known, the data distribution $q(x)$ can be recovered through the Bayes Theorem and the Law of Total Probability, as shown in equations 5.1~5.4. When the final probability distribution $q(z_T)$ is very similar to the standard normal distribution, the standard normal distribution can be used as a substitute.
181
 
182
- Specifics can be seen in Demo 3.2. In the example, $q(z_T)$ substitutes $\mathcal{N}(0,1)$, and the error magnitude is given through JS Divergence. The restored probability distribution $q(z_t)$ and $q(x)$ are identified by the $\textcolor{green}{\text{green curve}}$, and the original probability distribution is identified by the $\textcolor{blue}{\text{blue curve}}$. It can be observed that the data distribution $q(x)$ can be well restored, and the error (JS Divergence) will be smaller than the error caused by the standard normal distribution replacing $q(z_T)$.
183
  \begin{align}
184
  q(z_{T-1}) &= \int q(z_{T-1},z_T)dz_T = \int q(z_{T-1}|z_T)q(z_T)dz_T \tag{5.1} \newline
185
  & \dots \notag \newline
@@ -188,15 +188,15 @@ def md_backward_process_en():
188
  q(z_1) &= \int q(z_1,z_2) dz_1 = \int q(z_1|z_2)q(z_2)dz_2 \tag{5.3} \newline
189
  q(x) &= \int q(x,z_1) dz_1 = \int q(x|z_1)q(z_1)dz_1 \tag{5.4} \newline
190
  \end{align}
191
- In this article, the aforementioned transform is referred to as the <b>Posterior Transform</b>. For example, in equation 5.4, the input of the transform is the probability distribution function $q(z_1)$, and the output is the probability distribution function $q(x)$.The entire transform is determined by the posterior $q(x|z_1)$. This transform can also be considered as the linear weighted sum of a set of basis functions, where the basis functions are $q(x|z_1)$ under different $z_1$, and the weights of each basis function are $q(z_1)$. Some interesting properties of this transform will be introduced in Section 7.
192
 
193
- In Section 3, we have considered two special posterior probability distributions. Next, we analyze their corresponding <em>posterior transforms</em>.
194
  <ul>
195
- <li> When $\alpha \to 0$, the $q(x|z)$ for different $z$ are almost the same as $q(x)$. In other words, the basis functions of linear weighted sum are almost the same. In this state, no matter how the input changes, the output of the transformation is always $q(x)$."</li>
196
  <li> When $\alpha \to 1$, the $q(x|z)$ for different $z$ values becomes a series of Dirac delta functions and zero functions. In this state, as long as the <em>support set</em> of the input distribution is included in the <em>support set</em> of $q(x)$, the output of the transformation will remain the same with the input.</li>
197
  </ul>
198
 
199
- In Section 5, it is mentioned that the 1000 transformations used in the DDPM[\[2\]](#ddpm) can be represented using a single transformation
200
  \begin{align}
201
  Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{5.5}
202
  \end{align}
@@ -204,6 +204,8 @@ def md_backward_process_en():
204
  Since $\\alpha=0.0000403$ is very small, the corresponding standard deviation of GaussFun (Equation 3.4) reaches 157.52. However, the range of $X$ is limited within $[-1, 1]$, which is far smaller than the standard deviation of GaussFun. Within the range of $x \\in [-1, 1]$, GaussFun should be close to a constant, showing little variation. Therefore, the $q(x|z_T)$ corresponding to different $z_T$ are almost the same as $q(x)$. In this state, the posterior transform corresponding to $q(x|z_T)$ does not depend on the input distribution, the output distribution will always be $q(x)$.
205
 
206
  <b>Therefore, theoretically, in the DDPM model, it is not necessary to use the standard normal distribution to replace $q(z_T)$. Any other arbitrary distributions can also be used as a substitute.</b>
207
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_backward_process_en")
208
  return
209
 
@@ -216,13 +218,13 @@ def md_fit_posterior_en():
216
 
217
  gr.Markdown(
218
  r"""
219
- From the front part of Section 4, it is known that the posterior probability distributions are unknown and related to $q(x)$. Therefore, in order to recover the data distribution or sample from it, it is necessary to learn and estimate each posterior probability distribution.
220
 
221
- From the latter part of Section 4, it can be understood that when certain conditions are met, each posterior probability distribution $q(x|z), q(z_{t-1}|z_t)$ approximates the Gaussian probability distribution. Therefore, by constructing a set of conditional Gaussian probability models $p(x|z), p(z_{t-1}|z_t)$, we can learn to fit the corresponding $q(x|z), q(z_{t-1}|z_t)$.
222
 
223
- Due to the limitations of the model's representative and learning capabilities, there will be certain errors in the fitting process, which will further impact the accuracy of restored $q(x)$. The size of the fitting error is related to the complexity of the posterior probability distribution. As can be seen from Section 4, when $q(x)$ is more complex or the added noise is large, the posterior probability distribution will be more complex, and it will differ greatly from the Gaussian distribution, thus leading to fitting errors and further affecting the restoration of $q(x)$.
224
 
225
- Refer to Demo 3.3 for the specifics. The reader can test different $q(x)$ and $\alpha$, observe the fitting degree of the posterior probability distribution $q(z_{t-1}|z_t)$ and the accuracy of restored $q(x)$. The restored probability distribution is ploted with $\textcolor{orange}{\text{orange}}$, and the error is also measured by JS divergence.
226
 
227
  Regarding the objective function for fitting, similar to other probability models, the cross-entropy loss can be optimized to make $p(z_{t-1}|z_t)$ approaching $q(z_{t-1}|z_t)$. Since $(z_{t-1}|z_t)$ is a conditional probability, it is necessary to fully consider all conditions. This can be achieved by averaging the cross-entropy corresponding to each condition weighted by the probability of each condition happening. The final form of the loss function is as follows.
228
  \begin{align}
@@ -232,7 +234,7 @@ def md_fit_posterior_en():
232
 
233
  KL divergence can also be optimized as the objective function. KL divergence and cross-entropy are equivalent[\\[10\\]](#ce_kl)
234
  <span id="en_fit_0">
235
- loss &= \int q(z_t) KL(q(z_{t-1}|z_t)\|\textcolor{blue}{p(z_{t-1}|z_t)})dz_t \tag{6.3} \newline
236
  &= \int q(z_t) \int q(z_{t-1}|z_t) \frac{q(z_{t-1}|z_t)}{\textcolor{blue}{p(z_{t-1}|z_t)}} dz_{t-1} dz_t \tag{6.4} \newline
237
  &= -\int q(z_t)\ \underbrace{\int q(z_{t-1}|z_t) \log \textcolor{blue}{p(z_{t-1}|z_t)}dz_{t-1}}{underline}{\text{Cross Entropy}}\ dz_t + \underbrace{\int q(z_t) \int q(z_{t-1}|z_t) \log q(z_{t-1}|z_t)}{underline}{\text{Is Constant}} dz \tag{6.5}
238
  </span>
@@ -256,8 +258,8 @@ def md_fit_posterior_en():
256
  &\quad - \iint \int q(x)q(z_{t-1}, z_t|x) \log \textcolor{blue}{p(z_{t-1}|z_t)}dxdz_{t-1}dz_t - \textcolor{orange}{C_1} \tag{6.11} \newline
257
  &= \iint \int q(x)q(z_{t-1},z_t|x) \log \frac{q(z_{t-1}|z_t,x)}{\textcolor{blue}{p(z_{t-1}|z_t)}}dxdz_{t-1}dz_t - \textcolor{orange}{C_1} \tag{6.12} \newline
258
  &= \iint q(x)q(z_t|x)\int q(z_{t-1}|z_t,x) \log \frac{q(z_{t-1}|z_t,x)}{\textcolor{blue}{p(z_{t-1}|z_t)}}dz_{t-1}\ dz_xdz_t - \textcolor{orange}{C_1} \tag{6.13} \newline
259
- &= \iint \ q(x)q(z_t|x) KL[q(z_{t-1}|z_t,x)\|\textcolor{blue}{p(z_{t-1}|z_t)}]dxdz_t - \textcolor{orange}{C_1} \tag{6.14} \newline
260
- &\propto \iint \ q(x)q(z_t|x) KL[q(z_{t-1}|z_t,x)\|\textcolor{blue}{p(z_{t-1}|z_t)}]dxdz_t \tag{6.15} \newline
261
  \end{align}
262
 
263
  In the above formula, the term $C_1$ is a fixed value, which does not contain parameters to be optimized. Here, $q(x)$ is a fixed probability distribution, and $q(z_{t-1}|z_t)$ is also a fixed probability distribution, whose specific form is determined by $q(x)$ and the coefficient $\alpha$.
@@ -274,15 +276,15 @@ def md_fit_posterior_en():
274
 
275
  Based on the conclusion of the Consistent Terms proof and the relationship between cross entropy and KL divergence, an interesting conclusion can be drawn:
276
  <span id="en_fit_1">
277
- \mathop{\min}{underline}{\textcolor{blue}{p}} \int q(z_t) KL(q(z_{t-1}|z_t)\|\textcolor{blue}{p(z_{t-1}|z_t)})dz_t \iff \mathop{\min}{underline}{\textcolor{blue}{p}} \iint \ q(x)q(z_t|x) KL[q(z_{t-1}|z_t,x)\|\textcolor{blue}{p(z_{t-1}|z_t)}]dxdz_t \tag{6.19}
278
  </span>
279
  By comparing the expressions on the left and right, it can be observed that the objective function on the right side includes an additional variable $X$ compared to the left side. At the same time, there is an additional integral with respect to $X$, with the occurrence probability of $X$, denoted as $q(x)$, serving as the weighting coefficient for the integral.
280
 
281
  Following a similar proof method, a more general relationship can be derived:
282
  <span id="en_fit_2">
283
- \mathop{\min}{underline}{\textcolor{blue}{p}} KL(q(z)\|\textcolor{blue}{p(z)}) \iff \mathop{\min}_{\textcolor{blue}{p}} \int \ q(x) KL(q(z|x)\|\textcolor{blue}{p(z)})dx \tag{6.20}
284
  </span>
285
-
286
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_fit_posterior_en")
287
  return
288
 
@@ -299,12 +301,12 @@ def md_posterior_transform_en():
299
  q(x) &= \int q(x,z) dz = \int q(x|z)q(z)dz \tag{7.1}
300
  \end{align}
301
 
302
- Through extensive experiments with one-dimensional random variables, it was found that the <b>Posterior Transform</b> exhibits the characteristics of a <b>Contraction Mapping</b>. This means that, for any two probability distributions $q_{i1}(z)$ and $q_{i2}(z)$, after the posterior transform, we get $q_{o1}(x)$ and $q_{o2}(x)$. The distance between $q_{o1}(x)$ and $q_{o2}(x)$ is always less than the distance between $q_{i1}(z)$ and $q_{i2}(z)$. Here, the distance can be measured using JS divergence or Total Variance. Furthermore, the contractive ratio of this contraction mapping is positively related to the size of the added noise.
303
  \begin{align}
304
  dist(q_{o1}(x),\ q_{o2}(x)) < dist(q_{i1}(z),\ q_{i2}(z)) \tag{7.2}
305
  \end{align}
306
 
307
- Readers can refer to Demo 4.1, where the first three figures present a transform process. The first figure is an arbitrary data distribution $q(x)$, the third figure is the transformed probability distribution, and the second figure is the posterior probability distribution $q(x|z)$. You can change the random seed to generate a new data distribution $q(x)$, and adjust the value of $\alpha$ to introduce different degrees of noise.
308
 
309
  The last two figures show contraction of the transform. The fourth figure displays two randomly generated input distributions and their distance, $div_{in}$. The fifth figure displays the two output distributions after transform, with the distance denoted as $div_{out}$.
310
 
@@ -312,7 +314,7 @@ def md_posterior_transform_en():
312
 
313
  According to the Banach fixed-point theorem<a href="#fixed_point">[5]</a>, a contraction mapping has a unique fixed point (converged point). That is to say, for any input distribution, the <b>Posterior Transform</b> can be applied continuously through iterations, and as long as the number of iterations is sufficient, the final output would be the same distribution. After a large number of one-dimensional random variable experiments, it was found that the fixed point (converged point) is <b>located near $q(x)$</b>. Also, the location is related to the value of $\alpha$; the smaller $\alpha$ (larger noise), the closer it is.
314
 
315
- Readers can refer to Demo 4.2, which illustrates an example of applying the posterior transform iteratively. Choose an appropriate number of iterations and click on the <em>Apply</em> button, and the iteration process will be drawn step by step. Each subplot shows the transformed output distribution ($\textcolor{green}{\text{green curve}}$) from each transform, with the reference distribution $q(x)$ expressed as a $\textcolor{blue}{\text{blue curve}}$, as well as the distance $div$ between the output distribution and $q(x)$. It can be seen that as the number of iterations increases, the output distribution becomes more and more similar to $q(x)$, and will eventually stabilize near $q(x)$. For more complicated distributions, more iterations or greater noise may be required. The maximum number of iterations can be set to tens of thousands, but it'll take longer.
316
 
317
  For the one-dimensional discrete case, $q(x|z)$ is discretized into a matrix (denoted as $Q_{x|z}$), $q(z)$ is discretized into a vector (denoted as $\boldsymbol{q_i}$). The integration operation $\int q(x|z)q(z)dz$ is discretized into a **matrix-vector** multiplication operation, thus the posterior transform can be written as
318
  \begin{align}
@@ -321,12 +323,147 @@ def md_posterior_transform_en():
321
  & \dots & \notag \newline
322
  \boldsymbol{q_o} &= (Q_{x|z})^n\ \boldsymbol{q_i} & \quad\quad &\text{n iteration} \tag{7.5} \newline
323
  \end{align}
324
- In order to better understand the property of the transform, the matrix $(Q_{x|z})^n$ is also plotted in Demo 4.2. From the demo we can see that, as the iterations converge, the row vectors of the matrix $(Q_{x|z})^n$ will become a constant vector, that is, all components of the vector will be the same, which will appear as a horizontal line in the density plot.
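As a rough numerical companion to the discretized form above (an editor's sketch with toy values, not code from this repository), the following builds $Q_{x|z}$ from an arbitrary $q(x)$ via Bayes' theorem and applies $\boldsymbol{q_o}=(Q_{x|z})^n\boldsymbol{q_i}$ to two different inputs; the Total Variance distance between the two outputs shrinks as $n$ grows, consistent with the contraction behaviour described above.

```python
import numpy as np

# Toy sketch of the discretized posterior transform q_o = Q_{x|z} q_i.
# Grid, alpha and the example distributions are illustrative assumptions only.
N = 200
x = np.linspace(-2, 2, N)
dx = x[1] - x[0]
alpha = 0.5

# toy data distribution q(x): mixture of two Gaussians, normalized on the grid
q_x = np.exp(-0.5 * ((x + 0.8) / 0.2) ** 2) + 0.6 * np.exp(-0.5 * ((x - 0.7) / 0.3) ** 2)
q_x /= q_x.sum() * dx

# likelihood q(z|x) = N(z; sqrt(alpha)*x, 1-alpha); rows indexed by z, columns by x
z = x                                                  # use the same grid for z
lik = np.exp(-0.5 * (z[:, None] - np.sqrt(alpha) * x[None, :]) ** 2 / (1 - alpha))

# posterior q(x|z) ∝ q(x) * GaussFun (Bayes); each column (fixed z) integrates to 1 over x
post = lik * q_x[None, :]                              # shape (z, x)
Q = (post / (post.sum(axis=1, keepdims=True) * dx)).T  # Q_{x|z}: rows = x, columns = z

def posterior_transform(q_in, n=1):
    """Apply q_o = (Q_{x|z})^n q_in, renormalizing for numerical safety."""
    q = q_in.copy()
    for _ in range(n):
        q = Q @ (q * dx)
        q /= q.sum() * dx
    return q

# two different input distributions
q_i1 = np.exp(-0.5 * (x / 0.5) ** 2); q_i1 /= q_i1.sum() * dx
q_i2 = np.ones_like(x); q_i2 /= q_i2.sum() * dx

for n in (1, 5, 50):
    d = np.abs(posterior_transform(q_i1, n) - posterior_transform(q_i2, n)).sum() * dx
    print(f"n={n:3d}  TV distance between the two outputs: {d:.6f}")
# the distance shrinks with n: both inputs are pulled toward the same fixed point near q(x)
```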
325
 
326
  <center> <img id="en_fig2" src="file/fig2.png" width="960" style="margin-top:12px"/> </center>
327
  <center> Figure 2: Only one component in support </center>
328
 
329
- The following will prove that, under certain conditions, the posterior transform is a contraction mapping and there exists a unique fixed point, which is also the converged point. The proof assumes that the random variable is discrete, so the posterior transform can be regarded as a single-step transition of a <b>discrete Markov Chain</b>, with the posterior $q(x|z)$ corresponding to the <b>transition matrix</b>. Continuous variables can be considered as discrete variables with infinitely many states.
 
 
330
  <ol style="list-style-type:decimal">
331
  <li> When $q(x)$ is greater than 0, the posterior transform matrix $q(x|z)$ will be greater than 0 too. Therefore, this matrix is the transition matrix of an $\textcolor{red}{\text{irreducible}}\ \textcolor{green}{\text{aperiodic}}$ Markov Chain. According to the conclusion of the literature <a href="#mc_basic_p6">[13]</a>, this transformation is a contraction mapping with respect to Total Variance metric. Therefore, according to the Banach fixed-point theorem, this transformation has a unique fixed point(converged point). </li>
332
 
@@ -366,51 +503,20 @@ def md_posterior_transform_en():
366
 
367
  Additionally, there exists a more generalized relation about the posterior transform that is independent of $q(x|z)$: the Total Variance distance between two output distributions will always be <b>less than or equal to</b> the Total Variance distance between their corresponding input distributions, that is
368
  \begin{align}
369
- dist(q_{o1}(x),\ q_{o2}(x)) \leq dist(q_{i1}(z),\ q_{i2}(z)) \notag
370
  \end{align}
371
  The proof is given below in discrete form:
372
  \begin{align}
373
- \lVert q_{o1}-q_{o2}\rVert_{TV} &= \lVert Q_{x|z}q_{i1} - Q_{x|z}q_{i2}\rVert_{TV} \tag{7.6} \newline
374
- &= \sum_{m}\textcolor{red}{|}\sum_{n}Q_{x|z}(m,n)q_{i1}(n) - \sum_{n}Q_{x|z}(m,n)q_{i2}(n)\textcolor{red}{|} \tag{7.7} \newline
375
- &= \sum_{m}\textcolor{red}{|}\sum_{n}Q_{x|z}(m,n)(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \tag{7.8} \newline
376
- &\leq \sum_{m}\sum_{n}Q_{x|z}(m,n)\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \qquad \qquad \qquad \text{Absolute value inequality} \tag{7.9} \newline
377
- &= \sum_{n}\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \sum_{m} Q_{x|z}(m,n) \qquad \qquad \qquad \sum_{m} Q_{x|z}(m,n) = 1 \tag{7.10} \newline
378
- &= \sum_{n}\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \tag{7.11}
379
- \end{align}
380
- In this context, $Q_{x|z}(m,n)$ represents the element at the m-th row and n-th column of the matrix $Q_{x|z}$, and $q_{i1}(n)$ represents the n-th element of the vector $q_{i1}$.
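The inequality in equations 7.6~7.11 can also be checked numerically. The sketch below (an editor's addition, not part of the commit; sizes and distributions are random toy values) draws random column-stochastic matrices and probability vectors and verifies that the Total Variance distance never increases after the transform.

```python
import numpy as np

# Numerical check of eqs. 7.6~7.11: ||Q q_i1 - Q q_i2||_TV <= ||q_i1 - q_i2||_TV
# for any column-stochastic Q (each column sums to 1, as used in eq. 7.10).
rng = np.random.default_rng(0)
for trial in range(1000):
    n = rng.integers(2, 30)
    Q = rng.random((n, n))
    Q /= Q.sum(axis=0, keepdims=True)          # make each column sum to 1
    q_i1 = rng.random(n); q_i1 /= q_i1.sum()
    q_i2 = rng.random(n); q_i2 /= q_i2.sum()
    tv_in = np.abs(q_i1 - q_i2).sum()
    tv_out = np.abs(Q @ q_i1 - Q @ q_i2).sum()
    assert tv_out <= tv_in + 1e-12, (tv_out, tv_in)
print("TV distance never increased in 1000 random trials")
```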
381
-
382
- The relationship between the converged distribution and the data distribution $q(x)$ cannot be rigorously proven at present.
383
-
384
- <h3 style="font-size:18px"> Anti-noise Capacity In Restoring Data Distribution</h3>
385
- From the above analysis, we know that when certain conditions are satisfied, the <em>posterior transform</em> is a contraction mapping. Therefore, the following relationship exists:
386
- \begin{align}
387
- dist(q(x),\ q_o(x)) < dist(q(z),\ q_i(z)) \tag{7.12}
388
  \end{align}
389
- Wherein, $q(z)$ is the ideal input distribution, $q(x)$ is the ideal output distribution, $q_i(z)$ is an arbitrary input distribution, and $q_o(x)$ is the output distribution obtained after transforming $q_i(z)$.
390
-
391
- The above equation indicates that the distance between the output distribution $q_o(x)$ and the ideal output distribution $q(x)$ will always be <b>less than</b> the distance between the input distribution $q_i(z)$ and the ideal input distribution $q(z)$. Hence, the <em>posterior transform</em> has a certain resistance to noise. This means that during the process of restoring $q(x)$ (Section 5), even if the <em>tail distribution</em> $q(z_T)$ contains some error, the error of the output distribution will be smaller than the error of the input after undergoing a series of transforms.
392
-
393
- Refer specifically to Demo 3.2, where by increasing the value of the <b>noise ratio</b>, noise can be added to the <em>tail distribution</em> $q(z_T)$. Clicking the "apply" button will gradually draw out the restoring process, with the restored distribution represented by a $\textcolor{red}{\text{red curve}}$, and the error size will be computed by the JS divergence. You will see that the error of restored $q(x)$ is always less than the error of $q(z_T)$.
394
-
395
- From the above discussion, we know that the smaller the $\alpha$ (the larger the noise used in the transform process), the greater the contractive ratio of the contraction mapping, and thus, the stronger the ability to resist noise.
396
-
397
- """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_transform_en")
398
- return
399
-
400
-
401
- def md_deconvolution_en():
402
- global g_latex_del
403
-
404
- title = "8. Can the data distribution be restored by deconvolution?"
405
- with gr.Accordion(label=title, elem_classes="first_md", elem_id="deconvolution"):
406
- gr.Markdown(
407
- r"""
408
- As mentioned in the section 2, the transformation of Equation 2.1 can be divided into two sub-transformations, the first one being a linear transformation and the second being adding independent Gaussian noise. The linear transformation is equivalent to a scaling transform of the probability distribution, so it has an inverse transformation. Adding independent Gaussian noise is equivalent to the execution of a convolution operation on the probability distribution, which can be restored through <b>deconvolution</b>. Therefore, theoretically, the data distribution $q(x)$ can be recovered from the final probability distribution $q(z_T)$ through <b>inverse linear transform</b> and <b>deconvolution</b>.
409
-
410
- However, in actuality, some problems do exist. Due to the extreme sensitivity of deconvolution to errors, having high input sensitivity, even a small amount of input noise can lead to significant changes in output[\[11\]](#deconv_1)[\[12\]](#deconv_2). Meanwhile, in the diffusion model, the standard normal distribution is used as an approximation to replace $q(z_T)$, thus, noise is introduced at the initial stage of recovery. Although the noise is relatively small, because of the sensitivity of deconvolution, the noise will gradually amplify, affecting the recovery.
411
-
412
- In addition, the infeasibility of <b>deconvolution restoring</b> can be understood from another perspective. Since the process of forward transform (equations 4.1 to 4.4) is fixed, the convolution kernel is fixed. Therefore, the corresponding deconvolution transform is also fixed. Since the initial data distribution $q(x)$ is arbitrary, any probability distribution can be transformed into an approximation of $\mathcal{N}(0,I)$ through a series of fixed linear transforms and convolutions. If <b>deconvolution restoring</b> is feasible, it means that a fixed deconvolution can be used to restore any data distribution $q(x)$ from the $\mathcal{N}(0,I)$ , this is clearly <b>paradoxical</b>. The same input, the same transform, cannot have multiple different outputs.
413
- """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_deconvolution_en")
414
  return
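The sensitivity argument can be illustrated with a small frequency-domain sketch (an editor's example with arbitrary kernel width and noise level, not the author's code): dividing by the blur kernel's spectrum recovers the exact blurred density, but a perturbation of size $10^{-6}$ is amplified by many orders of magnitude.

```python
import numpy as np

# Sketch: why deconvolution-based restoring is fragile.
# Blur a discrete density with a Gaussian kernel (multiplication of spectra),
# then invert by dividing the spectra. With the exact blurred spectrum the input
# is recovered, but a tiny perturbation of the blurred signal is amplified
# enormously at frequencies where the kernel spectrum is almost zero.
rng = np.random.default_rng(0)
N = 256
x = np.linspace(-4, 4, N, endpoint=False)
dx = x[1] - x[0]

q = np.exp(-0.5 * ((x + 1) / 0.3) ** 2) + 0.5 * np.exp(-0.5 * ((x - 1) / 0.2) ** 2)
q /= q.sum() * dx                                    # toy data distribution q(x)

kernel = np.exp(-0.5 * (x / 0.2) ** 2)
kernel /= kernel.sum() * dx                          # Gaussian blur kernel (added noise)

Qf = np.fft.fft(q)
Kf = np.fft.fft(np.fft.ifftshift(kernel)) * dx       # spectrum of the centered kernel
Bf = Qf * Kf                                         # spectrum of the blurred distribution

noise = 1e-6 * rng.standard_normal(N)                # tiny error added to the blurred signal
restored_clean = np.real(np.fft.ifft(Bf / Kf))
restored_noisy = np.real(np.fft.ifft((Bf + np.fft.fft(noise)) / Kf))

print("max restore error, exact input :", np.abs(restored_clean - q).max())
print("max restore error, 1e-6 noise  :", np.abs(restored_noisy - q).max())
# the second error is many orders of magnitude larger than the injected noise
```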
415
 
416
 
@@ -448,6 +554,8 @@ def md_reference_en():
448
  <a id="mc_basic_t7" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [13] Markov Chain:Basic Theory - Theorem 7 </a>
449
 
450
  <a id="mc_basic_d4" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [14] Markov Chain:Basic Theory - Definition 4 </a>
451
 
452
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_reference_en")
453
 
@@ -461,7 +569,7 @@ def md_about_en():
461
 
462
  gr.Markdown(
463
  r"""
464
- <b>APP</b>: This APP is developed using Gradio and deployed on HuggingFace. Due to limited resources (2 cores, 16G memory), the response may be slow. For a better experience, it is recommended to clone the source code from <a href="https://github.com/blairstar/The_Art_of_DPM">github</a> and run it locally. This program only relies on Gradio, SciPy, and Matplotlib.
465
 
466
  <b>Author</b>: Zhenxin Zheng, Senior computer vision engineer with ten years of algorithm development experience, Formerly employed by Tencent and JD.com, currently focusing on image and video generation.
467
 
@@ -493,6 +601,10 @@ def run_app():
493
 
494
  md_deconvolution_en()
495
 
 
 
  md_reference_en()
497
 
498
  md_about_en()
 
12
 
13
  gr.Markdown(
14
  r"""
15
+ The Diffusion Probability Model[\[1\]](#dpm)[\[2\]](#ddpm) is currently the main method used in image and video generation, but due to its abstruse theory, many engineers are unable to understand it well. This article will provide a very easy-to-understand method to help readers grasp the principles of the Diffusion Model. Specifically, it will illustrate the Diffusion Model using examples of one-dimensional random variables in an interactive way, explaining several interesting properties of the Diffusion Model in an intuitive manner.
16
 
17
  The diffusion model is a probabilistic model. Probabilistic models mainly offer two functions: calculating the probability of a given sample appearing; and generating new samples. The diffusion model focuses on the latter aspect, facilitating the production of new samples, thus realizing the task of **generation**.
18
 
 
43
 
44
  The first sub-transformation performs a linear transformation ($\sqrt{\alpha}X$) on the random variable $X$. According to the conclusion of the literature[\[3\]](#linear_transform), the linear transformation makes the probability distribution of $X$ **narrower and taller**, and the extent of **narrowing and heightening** is directly proportional to the value of $\alpha$.
45
 
46
+ This can be specifically seen in <a href="#demo_1">Demo 1</a>, where the first figure depicts a randomly generated one-dimensional probability distribution, and the second figure represents the probability distribution after the linear transformation. It can be observed that the curve of the second figure has become **narrower and taller** compared to the first figure. Readers can experiment with different $\alpha$ to gain a more intuitive understanding.
47
 
48
  The second sub-transformation is **adding independent random noise**($\sqrt{1-\alpha}\epsilon$). According to the conclusion of the literature[\[4\]](#sum_conv), **adding independent random variables** is equivalent to performing convolution on the two probability distributions. Since the probability distribution of random noise is Gaussian, it is equivalent to performing a **Gaussian Blur** operation. After blurring, the original probability distribution will become smoother and more similar to the standard normal distribution. The degree of blurring is directly proportional to the noise level ($\sqrt{1-\alpha}$).
49
 
50
+ For specifics, one can see <a href="#demo_1">Demo 1</a>, where the first figure is a randomly generated one-dimensional probability distribution, and the third figure is the result after the transformation. It can be seen that the transformed probability distribution curve is smoother and has fewer corners. Readers can test different $\alpha$ values to feel how the noise level affects the shape of the probability distribution. The last figure is the result after applying both sub-transformations.
51
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_transform_en")
52
  return
53
 
 
63
  \begin{align}
64
  q(z|x) &= \mathcal{N}(\sqrt{\alpha}x,\ 1-\alpha) \tag{2.1}
65
  \end{align}
66
+ It can be understood by concrete examples in <a href="#demo_2">Demo 2</a>. The third figure depicts the shape of $q(z|x)$. From the figure, a uniform slanting line can be observed. This implies that the mean of $q(z|x)$ is linearly related to $x$, and the variance is fixed. The magnitude of $\alpha$ will determine the width and incline of the slanting line.
67
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_likelihood_en")
68
  return
69
 
 
103
  <li>When the variance of the Gaussian function is large (large noise), or when $q(x)$ changes drastically, the shape of $q(x|z)$ will be more complex, and greatly differ from a Gaussian function, which makes it difficult to model and learn.</li>
104
  </ul>
105
 
106
+ The specifics can be seen in <a href="#demo_2">Demo 2</a>. The fourth figure presents the shape of the posterior $q(x|z)$, which shows an irregular shape and resembles a curved and uneven line. As $\alpha$ increases (noise decreases), the curve tends to be uniform and straight. Readers can adjust different $\alpha$ values and observe the relationship between the shape of the posterior and the level of noise. In the last figure, the $\textcolor{blue}{\text{blue dash line}}$ represents $q(x)$, the $\textcolor{green}{\text{green dash line}}$ represents <b>GaussFun</b> in equation 3.4, and the $\textcolor{orange}{\text{orange curve}}$ represents the result of multiplying the two functions and normalizing it, which is the posterior probability $q(x|z=fixed)$ under a fixed $z$ condition. Readers can adjust different values of $z$ to observe how the fluctuation of $q(x)$ affects the shape of the posterior probability $q(x|z)$.
107
 
108
  The posterior $q(x|z)$ under two special states are worth considering.
109
  <ul>
110
+ <li>As $\alpha \to 0$, the variance of <b>GaussFun</b> tends to <b>$\infty$</b>, and $q(x|z)$ for different $z$ almost become identical, and almost the same as $q(x)$. Readers can set $\alpha$ to 0.001 in <a href="#demo_2">Demo 2</a> to observe the specific results.</li>
111
 
112
+ <li>As $\alpha \to 1$, the variance of <b>GaussFun</b> tends to <b>$0$</b>, and $q(x|z)$ for different $z$ values contracts into a series of <em>Dirac delta functions</em> with offsets equal to $z$. However, there are some exceptions. When there are regions where $q(x)$ is zero, the corresponding $q(x|z)$ will no longer be a Dirac <em>delta function</em>, but a zero function. Readers can set $\alpha$ to 0.999 in <a href="#demo_2">Demo 2</a> to observe the specific results.</li>
113
  </ul>
114
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_en")
115
  return
 
125
  r"""
126
  For any arbitrary data distribution $q(x)$, the transform(equation 2.1) in section 2 can be continuously applied(equation 4.1~4.4). As the number of transforms increases, the output probability distribution will become increasingly closer to the standard normal distribution. For more complex data distributions, more iterations or larger noise are needed.
127
 
128
+ Specific details can be observed in <a href="#demo_3_1">Demo 3.1</a>. The first figure illustrates a randomly generated one-dimensional probability distribution. After seven transforms, this distribution looks very similar to the standard normal distribution. The degree of similarity increases with the number of iterations and the level of the noise. Given the same degree of similarity, fewer transforms are needed if the noise added at each step is larger (smaller $\alpha$ value). Readers can try different $\alpha$ values and numbers of transforms to see how similar the final probability distribution is.
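A minimal sketch of this forward process (an editor's addition, not the demo's implementation; the grid, the spiky toy $q(x)$ and $\alpha=0.25$ are assumptions) replaces the density of $X$ with that of $\sqrt{\alpha}X$ and convolves it with $\mathcal{N}(0, 1-\alpha)$ at each step; after about seven steps the result is already close to $\mathcal{N}(0,1)$.

```python
import numpy as np

# Sketch of the forward process (eqs. 4.1~4.4): each step applies
# Z = sqrt(alpha) * X + sqrt(1 - alpha) * eps to the density, i.e. a rescaling of
# the density followed by a convolution with a Gaussian of variance 1 - alpha.
N = 1024
x = np.linspace(-6, 6, N)
dx = x[1] - x[0]

q = np.where(np.abs(x + 0.5) < 0.4, 1.0, 0.0) + np.where(np.abs(x - 0.8) < 0.2, 2.0, 0.0)
q /= q.sum() * dx                                        # a spiky toy data distribution q(x)

std_normal = np.exp(-0.5 * x ** 2) / np.sqrt(2 * np.pi)

def forward_step(p, alpha):
    scaled = np.interp(x / np.sqrt(alpha), x, p) / np.sqrt(alpha)   # density of sqrt(alpha)*X
    kernel = np.exp(-0.5 * x ** 2 / (1 - alpha))
    kernel /= kernel.sum() * dx                                     # density of sqrt(1-alpha)*eps
    out = np.convolve(scaled, kernel, mode="same") * dx             # add the independent noise
    return out / (out.sum() * dx)

alpha = 0.25
p = q.copy()
for t in range(1, 8):
    p = forward_step(p, alpha)
    print(f"step {t}:  L1 distance to N(0,1) = {np.abs(p - std_normal).sum() * dx:.4f}")
# the distance drops quickly; with alpha = 0.25 the result after seven steps is close to N(0,1)
```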
129
 
130
  The complexity of the initial probability distribution tends to be high, but as the number of transforms increases, the complexity of the probability distribution $q(z_t)$ will decrease. As concluded in section 4, a more complex probability distribution corresponds to a more complex posterior probability distribution. Therefore, in order to ensure that the posterior probability distribution is more similar to the Conditional Gaussian function (easier to learn), a larger value of $\alpha$ (smaller noise) should be used in the initial phase, and a smaller value of $\alpha$ (larger noise) can be appropriately used in the later phase to accelerate the transition to the standard normal distribution.
131
 
132
+ In the example of <a href="#demo_3_1">Demo 3.1</a>, it can be seen that as the number of transforms increases, the corners of $q(z_t)$ become fewer and fewer. Meanwhile, the slanting lines in the plot of the posterior probability distribution $q(z_{t-1}|z_t)$ become increasingly straight and uniform, resembling more and more the conditional Gaussian distribution.
133
 
134
  \begin{align}
135
  Z_1 &= \sqrt{\alpha_1} X + \sqrt{1-\alpha_1}\epsilon_1 \tag{4.1} \newline
 
160
  q(z_T|x) &= \mathcal{N}(0.00635\ x,\ 0.99998) \tag{4.9}
161
  \end{align}
162
 
163
+ If only the marginal distribution $q(z_T)$ is of concern, a single transformation can be used instead, which is as follows:
164
  \begin{align}
165
  Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{4.10}
166
  \end{align}
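The quoted coefficients can be reproduced with a short calculation (an editor's sketch; it assumes DDPM's linear $\beta_t$ schedule from 0.0001 to 0.02 over 1000 steps, which is not stated explicitly in this section).

```python
import numpy as np

# Collapse T forward steps Z_t = sqrt(alpha_t) Z_{t-1} + sqrt(1-alpha_t) eps_t into one:
# Z_T = sqrt(prod alpha_t) X + sqrt(1 - prod alpha_t) eps.
# Assumes DDPM's linear beta schedule (1e-4 to 0.02 over 1000 steps).
betas = np.linspace(1e-4, 0.02, 1000)
alphas = 1.0 - betas
alpha_bar = np.prod(alphas)

print(f"alpha_bar         = {alpha_bar:.7f}")              # ~0.0000403
print(f"sqrt(alpha_bar)   = {np.sqrt(alpha_bar):.5f}")     # ~0.00635
print(f"sqrt(1-alpha_bar) = {np.sqrt(1 - alpha_bar):.5f}") # ~0.99998
```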
 
179
  r"""
180
  If the final probability distribution $q(z_T)$ and the posterior probabilities of each transform $q(x|z),q(z_{t-1}|z_t)$ are known, the data distribution $q(x)$ can be recovered through the Bayes Theorem and the Law of Total Probability, as shown in equations 5.1~5.4. When the final probability distribution $q(z_T)$ is very similar to the standard normal distribution, the standard normal distribution can be used as a substitute.
181
 
182
+ Specifics can be seen in <a href="#demo_3_2">Demo 3.2</a>. In the example, $\mathcal{N}(0,1)$ is used in place of $q(z_T)$, and the error magnitude is given through the JS Divergence. The restored probability distributions $q(z_t)$ and $q(x)$ are identified by the $\textcolor{green}{\text{green curve}}$, and the original probability distributions are identified by the $\textcolor{blue}{\text{blue curve}}$. It can be observed that the data distribution $q(x)$ can be well restored, and the error (JS Divergence) will be smaller than the error caused by the standard normal distribution replacing $q(z_T)$.
183
  \begin{align}
184
  q(z_{T-1}) &= \int q(z_{T-1},z_T)dz_T = \int q(z_{T-1}|z_T)q(z_T)dz_T \tag{5.1} \newline
185
  & \dots \notag \newline
 
188
  q(z_1) &= \int q(z_1,z_2) dz_1 = \int q(z_1|z_2)q(z_2)dz_2 \tag{5.3} \newline
189
  q(x) &= \int q(x,z_1) dz_1 = \int q(x|z_1)q(z_1)dz_1 \tag{5.4} \newline
190
  \end{align}
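Equations 5.1~5.4 can be carried out numerically on a grid (an editor's sketch with toy values, not the demos' code): compute the forward marginals, build each posterior by Bayes' theorem, then apply the posteriors backwards starting from a substitute for $q(z_T)$.

```python
import numpy as np

# Sketch of the restore chain (eqs. 5.1~5.4): compute the forward marginals q(z_t),
# build the per-step posteriors q(z_{t-1}|z_t) by Bayes' theorem, then start from a
# substitute for q(z_T) and apply the posteriors backwards. All values are toy choices.
N = 300
x = np.linspace(-3, 3, N)
dx = x[1] - x[0]
alphas = [0.25] * 7                                           # seven steps with alpha = 0.25

q_x = np.exp(-0.5 * ((x + 0.8) / 0.2) ** 2) + 0.6 * np.exp(-0.5 * ((x - 0.7) / 0.3) ** 2)
q_x /= q_x.sum() * dx                                         # toy data distribution q(x)

def likelihood(alpha):
    # q(z_t|z_{t-1}) = N(z_t; sqrt(alpha) z_{t-1}, 1 - alpha); rows z_t, columns z_{t-1}
    lik = np.exp(-0.5 * (x[:, None] - np.sqrt(alpha) * x[None, :]) ** 2 / (1 - alpha))
    return lik / (lik.sum(axis=0, keepdims=True) * dx)

marginals, posteriors = [q_x], []
for alpha in alphas:
    joint = likelihood(alpha) * marginals[-1][None, :]        # q(z_t, z_{t-1})
    marginals.append(joint.sum(axis=1) * dx)                  # q(z_t)
    post = joint / (joint.sum(axis=1, keepdims=True) * dx)    # q(z_{t-1}|z_t), rows z_t
    posteriors.append(post.T)                                 # columns indexed by z_t

for name, p0 in [("N(0,1) substitute ", np.exp(-0.5 * x ** 2) / np.sqrt(2 * np.pi)),
                 ("uniform substitute", np.ones_like(x) / (x[-1] - x[0]))]:
    p = p0.copy()
    for post in reversed(posteriors):                         # eq. 5.1 down to eq. 5.4
        p = post @ (p * dx)
        p /= p.sum() * dx
    print(f"{name}: L1 error of the restored q(x) = {np.abs(p - q_x).sum() * dx:.4f}")
# both restored distributions are close to q(x); with this much noise per step the
# posterior transform is largely insensitive to the choice of the tail distribution
```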
191
+ In this article, the aforementioned transform is referred to as the <b>Posterior Transform</b>. For example, in equation 5.4, the input of the transform is the probability distribution function $q(z_1)$, and the output is the probability distribution function $q(x)$. The entire transform is determined by the posterior $q(x|z_1)$. This transform can also be considered as the linear weighted sum of a set of basis functions, where the basis functions are $q(x|z_1)$ under different $z_1$, and the weights of each basis function are $q(z_1)$. Some interesting properties of this transform will be introduced in <a href="#posterior_transform">Section 7</a>.
192
 
193
+ In <a href="#posterior">Section 3</a>, we have considered two special posterior probability distributions. Next, we analyze their corresponding <em>posterior transforms</em>.
194
  <ul>
195
+ <li> When $\alpha \to 0$, the $q(x|z)$ for different $z$ are almost the same as $q(x)$. In other words, the basis functions of linear weighted sum are almost the same. In this state, no matter how the input changes, the output of the transformation is always $q(x)$.</li>
196
  <li> When $\alpha \to 1$, the $q(x|z)$ for different $z$ values becomes a series of Dirac delta functions and zero functions. In this state, as long as the <em>support set</em> of the input distribution is included in the <em>support set</em> of $q(x)$, the output of the transformation will remain the same with the input.</li>
197
  </ul>
198
 
199
+ In <a href="#forward_process">Section 4</a>, it is mentioned that the 1000 transformations used in the DDPM[\[2\]](#ddpm) can be represented using a single transformation
200
  \begin{align}
201
  Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{5.5}
202
  \end{align}
 
204
  Since $\\alpha=0.0000403$ is very small, the corresponding standard deviation of GaussFun (Equation 3.4) reaches 157.52. However, the range of $X$ is limited within $[-1, 1]$, which is far smaller than the standard deviation of GaussFun. Within the range of $x \\in [-1, 1]$, GaussFun should be close to a constant, showing little variation. Therefore, the $q(x|z_T)$ corresponding to different $z_T$ are almost the same as $q(x)$. In this state, the posterior transform corresponding to $q(x|z_T)$ does not depend on the input distribution, the output distribution will always be $q(x)$.
205
 
206
  <b>Therefore, theoretically, in the DDPM model, it is not necessary to use the standard normal distribution to replace $q(z_T)$. Any other arbitrary distributions can also be used as a substitute.</b>
207
+
208
+ Readers can conduct a similar experiment themselves. In <a href="#demo_3_1">Demo 3.1</a>, set <em>start_alpha</em> to 0.25, <em>end_alpha</em> to 0.25, and <em>step</em> to 7. At this point, $q(z_7)=\sqrt{0.000061}X + \sqrt{1-0.000061} \epsilon$, which is roughly equivalent to DDPM's $q(z_T)$. Click on <b>apply</b> to perform the forward transform (plotted using $\textcolor{blue}{\text{blue curves}}$), which prepares for the subsequent restoring process. In <a href="#demo_3_2">Demo 3.2</a>, set the <em>noise_ratio</em> to 1, introducing 100% noise into the <em>tail distribution</em> $q(z_7)$. Changing the value of <em>noise_random_seed</em> will change the distribution of the noise. Deselect <em>backward_pdf</em> to reduce screen clutter. Click on <b>apply</b> to restore $q(x)$ through the posterior transform. You will see that, no matter what the shape of the input $q(z_7)$ may be, the restored $q(x)$ is always exactly the same as the original $q(x)$, and the JS Divergence is zero. The restoration process is plotted using a $\textcolor{red}{\text{red curve}}$.
209
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_backward_process_en")
210
  return
211
 
 
218
 
219
  gr.Markdown(
220
  r"""
221
+ From the front part of <a href="#posterior">Section 3</a>, it is known that the posterior probability distributions are unknown and related to $q(x)$. Therefore, in order to recover the data distribution or sample from it, it is necessary to learn and estimate each posterior probability distribution.
222
 
223
+ From the latter part of <a href="#posterior">Section 3</a>, it can be understood that when certain conditions are met, each posterior probability distribution $q(x|z), q(z_{t-1}|z_t)$ approximates the Gaussian probability distribution. Therefore, by constructing a set of conditional Gaussian probability models $p(x|z), p(z_{t-1}|z_t)$, we can learn to fit the corresponding $q(x|z), q(z_{t-1}|z_t)$.
224
 
225
+ Due to the limitations of the model's representative and learning capabilities, there will be certain errors in the fitting process, which will further impact the accuracy of restored $q(x)$. The size of the fitting error is related to the complexity of the posterior probability distribution. As can be seen from <a href="#posterior">Section 3</a>, when $q(x)$ is more complex or the added noise is large, the posterior probability distribution will be more complex, and it will differ greatly from the Gaussian distribution, thus leading to fitting errors and further affecting the restoration of $q(x)$.
226
 
227
+ Refer to <a href="#demo_3_3">Demo 3.3</a> for the specifics. The reader can test different $q(x)$ and $\alpha$, and observe how well the posterior probability distribution $q(z_{t-1}|z_t)$ is fitted and how accurately $q(x)$ is restored. The restored probability distribution is plotted in $\textcolor{orange}{\text{orange}}$, and the error is again measured by the JS divergence.
228
 
229
  Regarding the objective function for fitting, similar to other probability models, the cross-entropy loss can be optimized to make $p(z_{t-1}|z_t)$ approach $q(z_{t-1}|z_t)$. Since $(z_{t-1}|z_t)$ is a conditional probability, all conditions need to be taken into account. This can be achieved by averaging the cross-entropy of each condition, weighted by the probability $q(z_t)$ of that condition occurring. The final form of the loss function is as follows.
230
  \begin{align}
 
234
 
235
  The KL divergence can also be used as the objective function; optimizing it is equivalent to optimizing the cross-entropy[\[10\]](#ce_kl).
236
  <span id="en_fit_0">
237
+ loss &= \int q(z_t) KL(q(z_{t-1}|z_t) \Vert \textcolor{blue}{p(z_{t-1}|z_t)})dz_t \tag{6.3} \newline
238
  &= \int q(z_t) \int q(z_{t-1}|z_t) \log \frac{q(z_{t-1}|z_t)}{\textcolor{blue}{p(z_{t-1}|z_t)}} dz_{t-1} dz_t \tag{6.4} \newline
239
  &= -\int q(z_t)\ \underbrace{\int q(z_{t-1}|z_t) \log \textcolor{blue}{p(z_{t-1}|z_t)}dz_{t-1}}{underline}{\text{Cross Entropy}}\ dz_t + \underbrace{\int q(z_t) \int q(z_{t-1}|z_t) \log q(z_{t-1}|z_t)}{underline}{\text{Is Constant}} dz \tag{6.5}
240
  </span>
 
258
  &\quad - \iint \int q(x)q(z_{t-1}, z_t|x) \log \textcolor{blue}{p(z_{t-1}|z_t)}dxdz_{t-1}dz_t - \textcolor{orange}{C_1} \tag{6.11} \newline
259
  &= \iint \int q(x)q(z_{t-1},z_t|x) \log \frac{q(z_{t-1}|z_t,x)}{\textcolor{blue}{p(z_{t-1}|z_t)}}dxdz_{t-1}dz_t - \textcolor{orange}{C_1} \tag{6.12} \newline
260
  &= \iint q(x)q(z_t|x)\int q(z_{t-1}|z_t,x) \log \frac{q(z_{t-1}|z_t,x)}{\textcolor{blue}{p(z_{t-1}|z_t)}}dz_{t-1}\ dz_xdz_t - \textcolor{orange}{C_1} \tag{6.13} \newline
261
+ &= \iint \ q(x)q(z_t|x) KL[q(z_{t-1}|z_t,x) \Vert \textcolor{blue}{p(z_{t-1}|z_t)}]dxdz_t - \textcolor{orange}{C_1} \tag{6.14} \newline
262
+ &\propto \iint \ q(x)q(z_t|x) KL(q(z_{t-1}|z_t,x) \Vert \textcolor{blue}{p(z_{t-1}|z_t)})dxdz_t \tag{6.15} \newline
263
  \end{align}
264
 
265
  In the above formula, the term $C_1$ is a fixed value, which does not contain parameters to be optimized. Here, $q(x)$ is a fixed probability distribution, and $q(z_{t-1}|z_t)$ is also a fixed probability distribution, whose specific form is determined by $q(x)$ and the coefficient $\alpha$.
 
276
 
277
  Based on the conclusion of the Consistent Terms proof and the relationship between cross entropy and KL divergence, an interesting conclusion can be drawn:
278
  <span id="en_fit_1">
279
+ \mathop{\min}{underline}{\textcolor{blue}{p}} \int q(z_t) KL(q(z_{t-1}|z_t) \Vert \textcolor{blue}{p(z_{t-1}|z_t)})dz_t \iff \mathop{\min}{underline}{\textcolor{blue}{p}} \iint \ q(x)q(z_t|x) KL(q(z_{t-1}|z_t,x) \Vert \textcolor{blue}{p(z_{t-1}|z_t)})dxdz_t \tag{6.19}
280
  </span>
281
  By comparing the expressions on the left and right, it can be observed that the objective function on the right side includes an additional variable $X$ compared to the left side. At the same time, there is an additional integral with respect to $X$, with the occurrence probability of $X$, denoted as $q(x)$, serving as the weighting coefficient for the integral.
282
 
283
  Following a similar proof method, a more general relationship can be derived:
284
  <span id="en_fit_2">
285
+ \mathop{\min}{underline}{\textcolor{blue}{p}} KL(q(z) \Vert \textcolor{blue}{p(z)}) \iff \mathop{\min}_{\textcolor{blue}{p}} \int \ q(x) KL(q(z|x) \Vert \textcolor{blue}{p(z)})dx \tag{6.20}
286
  </span>
287
+ A detailed derivation of this conclusion can be found in <a href="#cond_kl">Appendix A</a>.
288
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_fit_posterior_en")
289
  return
290
 
 
301
  q(x) &= \int q(x,z) dz = \int q(x|z)q(z)dz \tag{7.1}
302
  \end{align}
303
 
304
+ Through extensive experiments with one-dimensional random variables, it was found that the <b>Posterior Transform</b> exhibits the characteristics of a <b>Contraction Mapping</b>. That is, for any two probability distributions $q_{i1}(z)$ and $q_{i2}(z)$, applying the posterior transform yields $q_{o1}(x)$ and $q_{o2}(x)$, and the distance between $q_{o1}(x)$ and $q_{o2}(x)$ is always less than the distance between $q_{i1}(z)$ and $q_{i2}(z)$. Here, the distance can be measured using the JS divergence or the Total Variance. Furthermore, the contractive ratio of this contraction mapping is positively related to the size of the added noise.
305
  \begin{align}
306
  dist(q_{o1}(x),\ q_{o2}(x)) < dist(q_{i1}(z),\ q_{i2}(z)) \tag{7.2}
307
  \end{align}
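  As a concrete illustration of this behaviour, the sketch below (a toy discretization with an arbitrary two-peak $q(x)$; it is not the app's code and it uses the same grid for $x$ and $z$) builds the matrix form of the posterior transform and compares the distances directly:

```python
import numpy as np

# Minimal sketch (toy q(x), not the app's code): discretize the posterior q(x|z) into a
# matrix and check that the transform shrinks the distance between two input distributions.
rng = np.random.default_rng(0)
N, alpha = 128, 0.5
x = np.linspace(-2, 2, N)

q_x = np.exp(-0.5 * ((x + 1) / 0.3) ** 2) + 0.6 * np.exp(-0.5 * ((x - 0.8) / 0.2) ** 2)
q_x /= q_x.sum()                                     # a toy data distribution q(x)

lik = np.exp(-0.5 * (x[None, :] - np.sqrt(alpha) * x[:, None]) ** 2 / (1 - alpha))
Q = q_x[:, None] * lik                               # q(x|z) ~ q(x) * N(z; sqrt(alpha)x, 1-alpha)
Q /= Q.sum(axis=0, keepdims=True)                    # each column is a distribution over x

def tv(p1, p2):                                      # Total Variance distance (discrete form)
    return np.abs(p1 - p2).sum()

q_i1 = rng.random(N); q_i1 /= q_i1.sum()             # two arbitrary input distributions
q_i2 = rng.random(N); q_i2 /= q_i2.sum()

print(tv(Q @ q_i1, Q @ q_i2), "<", tv(q_i1, q_i2))   # output distance < input distance
```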
308
 
309
+ Readers can refer to <a href="#demo_4_1">Demo 4.1</a>, where the first three figures present a transform process. The first figure shows an arbitrary data distribution $q(x)$, the second figure shows the posterior probability distribution $q(x|z)$, and the third figure shows the transformed probability distribution. You can change the random seed to generate a new data distribution $q(x)$, and adjust the value of $\alpha$ to introduce different degrees of noise.
310
 
311
  The last two figures show contraction of the transform. The fourth figure displays two randomly generated input distributions and their distance, $div_{in}$. The fifth figure displays the two output distributions after transform, with the distance denoted as $div_{out}$.
312
 
 
314
 
315
  According to the Banach fixed-point theorem<a href="#fixed_point">[5]</a>, a contraction mapping has a unique fixed point (converged point). That is to say, for any input distribution, the <b>Posterior Transform</b> can be applied continuously through iterations, and as long as the number of iterations is sufficient, the final output would be the same distribution. After a large number of one-dimensional random variable experiments, it was found that the fixed point (converged point) is <b>located near $q(x)$</b>. Also, the location is related to the value of $\alpha$; the smaller $\alpha$ (larger noise), the closer it is.
316
 
317
+ Readers can refer to <a href="#demo_4_2">Demo 4.2</a>, which illustrates an example of applying the posterior transform iteratively. Choose an appropriate number of iterations and click on the <em>Apply</em> button, and the iteration process will be drawn step by step. Each subplot shows the output distribution of each transform ($\textcolor{green}{\text{green curve}}$), the reference distribution $q(x)$ ($\textcolor{blue}{\text{blue curve}}$), and the distance $div$ between the output distribution and $q(x)$. It can be seen that as the number of iterations increases, the output distribution becomes more and more similar to $q(x)$, and eventually stabilizes near $q(x)$. For more complicated distributions, more iterations or greater noise may be required. The maximum number of iterations can be set to tens of thousands, but it will take longer to run.
318
 
319
  For the one-dimensional discrete case, $q(x|z)$ is discretized into a matrix (denoted as $Q_{x|z}$), $q(z)$ is discretized into a vector (denoted as $\boldsymbol{q_i}$). The integration operation $\int q(x|z)q(z)dz$ is discretized into a **matrix-vector** multiplication operation, thus the posterior transform can be written as
320
  \begin{align}
 
323
  & \dots & \notag \newline
324
  \boldsymbol{q_o} &= (Q_{x|z})^n\ \boldsymbol{q_i} & \quad\quad &\text{n iteration} \tag{7.5} \newline
325
  \end{align}
326
+ In order to better understand the property of the transform, the matrix $(Q_{x|z})^n$ is also plotted in <a href="#demo_4_2">Demo 4.2</a>. From the demo we can see that, as the iterations converge, the row vectors of the matrix $(Q_{x|z})^n$ become constant vectors, that is, all components of a row are the same, which appears as a horizontal line in the density plot.
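  The convergence and the constant-row behaviour can be reproduced with the same kind of toy discretization (a standalone sketch; the two-peak $q(x)$, the grid and $\alpha=0.5$ are arbitrary assumptions, not the app's code):

```python
import numpy as np

# Sketch: iterate the discretized posterior transform and check that (Q_{x|z})^n converges,
# that its rows become constant vectors, and that the converged output lies near q(x).
N, alpha = 128, 0.5
x = np.linspace(-2, 2, N)
q_x = np.exp(-0.5 * ((x + 1) / 0.3) ** 2) + 0.6 * np.exp(-0.5 * ((x - 0.8) / 0.2) ** 2)
q_x /= q_x.sum()

lik = np.exp(-0.5 * (x[None, :] - np.sqrt(alpha) * x[:, None]) ** 2 / (1 - alpha))
Q = q_x[:, None] * lik
Q /= Q.sum(axis=0, keepdims=True)

Qn = np.linalg.matrix_power(Q, 5000)        # n-fold transform, cheap via repeated squaring
print(np.abs(Qn - Qn[:, :1]).max())         # ~0: all columns identical, i.e. each row is constant

q_i = np.ones(N) / N                        # any input distribution, e.g. uniform
print(np.abs(Qn @ q_i - q_x).sum())         # distance of the converged output to q(x):
                                            # small, and smaller still for smaller alpha
```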
327
+
328
+ In <a href="#proof_ctr">Appendix B</a>, a proof is provided that, when $q(x)$ and $\alpha$ satisfy certain conditions, the posterior transform is a strict Contraction Mapping.
329
+
330
+ The relationship between the converged distribution and the data distribution $q(x)$ cannot yet be rigorously established.
331
+
332
+ <h3 style="font-size:18px"> Anti-noise Capacity In Restoring Data Distribution</h3>
333
+ From the above analysis, we know that when certain conditions are satisfied, the <em>posterior transform</em> is a contraction mapping. Therefore, the following relationship exists:
334
+ \begin{align}
335
+ dist(q(x),\ q_o(x)) < dist(q(z),\ q_i(z)) \tag{7.12}
336
+ \end{align}
337
+ Here, $q(z)$ is the ideal input distribution, $q(x)$ is the ideal output distribution, $q_i(z)$ is an arbitrary input distribution, and $q_o(x)$ is the output distribution obtained by transforming $q_i(z)$.
338
+
339
+ The above equation indicates that the distance between the output distribution $q_o(x)$ and the ideal output distribution $q(x)$ is always <b>less than</b> the distance between the input distribution $q_i(z)$ and the ideal input distribution $q(z)$. Hence, the <em>posterior transform</em> has a certain resistance to noise. This means that during the process of restoring $q(x)$ (<a href="#backward_process">Section 5</a>), even if the <em>tail distribution</em> $q(z_T)$ contains some error, the error of the output distribution $q(x)$ will be smaller than the error of the input after undergoing a series of transforms.
340
+
341
+ Refer specifically to <a href="#demo_3_2">Demo 3.2</a>, where by increasing the value of the <b>noise ratio</b>, noise can be added to the <em>tail distribution</em> $q(z_T)$. Clicking the "apply" button will gradually draw out the restoring process, with the restored distribution represented by a $\textcolor{red}{\text{red curve}}$ and the error measured by the JS divergence. You will see that the error of the restored $q(x)$ is always less than the error of $q(z_T)$.
342
+
343
+ From the above discussion, we know that the smaller the $\alpha$ (the larger the noise used in the transform process), the greater the contractive ratio of the contraction mapping, and thus, the stronger the ability to resist noise.
344
+
345
+ """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_transform_en")
346
+ return
347
+
348
+
349
+ def md_deconvolution_en():
350
+ global g_latex_del
351
+
352
+ title = "8. Can the data distribution be restored by deconvolution?"
353
+ with gr.Accordion(label=title, elem_classes="first_md", elem_id="deconvolution"):
354
+ gr.Markdown(
355
+ r"""
356
+ As mentioned in <a href="#introduction">Section 1</a>, the transform of Equation 2.1 can be divided into two sub-transforms: the first is a linear transform and the second is the addition of independent Gaussian noise. The linear transform is equivalent to a scaling of the probability distribution, so it has an inverse transform. Adding independent Gaussian noise is equivalent to convolving the probability distribution with the noise's density, which can in principle be undone by <b>deconvolution</b>. Therefore, theoretically, the data distribution $q(x)$ could be recovered from the final probability distribution $q(z_T)$ through an <b>inverse linear transform</b> and a <b>deconvolution</b>.
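  The convolution claim can be verified numerically; the sketch below (not the app's code; the Gaussian-mixture $q(x)$ and $\alpha=0.6$ are arbitrary assumptions) compares the closed-form density of $Z=\sqrt{\alpha}X+\sqrt{1-\alpha}\epsilon$ with the convolution of the scaled density and the noise density.

```python
import numpy as np
from scipy import stats

# Sketch: for X a two-component Gaussian mixture, the density of Z = sqrt(a)X + sqrt(1-a)*eps
# equals the density of sqrt(a)X convolved with the density of the Gaussian noise.
alpha = 0.6
grid = np.linspace(-4, 4, 801)
dx = grid[1] - grid[0]

# density of the scaled variable sqrt(a)X, with X ~ 0.5*N(-1, 0.15^2) + 0.5*N(0.7, 0.25^2)
pdf_sx = (0.5 * stats.norm.pdf(grid, np.sqrt(alpha) * -1.0, np.sqrt(alpha) * 0.15)
          + 0.5 * stats.norm.pdf(grid, np.sqrt(alpha) * 0.7, np.sqrt(alpha) * 0.25))
noise_pdf = stats.norm.pdf(grid, 0.0, np.sqrt(1 - alpha))        # density of sqrt(1-a)*eps

pdf_conv = np.convolve(pdf_sx, noise_pdf, mode="same") * dx      # convolution of the two densities

# closed-form density of Z (each mixture component stays Gaussian under the transform)
pdf_z = (0.5 * stats.norm.pdf(grid, np.sqrt(alpha) * -1.0, np.sqrt(alpha * 0.15**2 + 1 - alpha))
         + 0.5 * stats.norm.pdf(grid, np.sqrt(alpha) * 0.7, np.sqrt(alpha * 0.25**2 + 1 - alpha)))

print(np.abs(pdf_conv - pdf_z).max())   # ~0 up to discretization error: adding noise == convolution
```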
357
+
358
+ However, in practice, there are problems. Deconvolution is extremely sensitive to errors: it has a very high input sensitivity, and even a small amount of input noise can lead to drastic changes in the output[\[11\]](#deconv_1)[\[12\]](#deconv_2). Meanwhile, in the diffusion model, the standard normal distribution is used as an approximation to replace $q(z_T)$, so noise is already introduced at the initial stage of the recovery. Although this noise is relatively small, because of the sensitivity of deconvolution, it is progressively amplified and spoils the recovery.
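  The sensitivity can be demonstrated with a small sketch (an illustrative toy, not the app's code; the box-shaped $q(x)$, the kernel width and the $10^{-6}$ perturbation are arbitrary assumptions): blurring with a Gaussian kernel suppresses high frequencies, so dividing by the kernel's spectrum amplifies even a tiny error in the blurred distribution.

```python
import numpy as np

# Sketch: naive FFT deconvolution recovers the exact blurred signal, but a 1e-6 perturbation
# of the input is amplified into an error far larger than the signal itself.
rng = np.random.default_rng(0)
n = 256
x = np.zeros(n); x[100:130] = 1.0; x /= x.sum()           # a toy data distribution q(x)

t = np.arange(n) - n // 2
g = np.exp(-0.5 * (t / 2.0) ** 2); g /= g.sum()           # Gaussian blur kernel (noise density)
G = np.fft.fft(np.fft.ifftshift(g))                       # kernel spectrum

y = np.real(np.fft.ifft(np.fft.fft(x) * G))               # forward step: convolution (adding noise)
y_noisy = y + 1e-6 * rng.standard_normal(n)               # tiny error, e.g. from approximating q(z_T)

x_rec = np.real(np.fft.ifft(np.fft.fft(y) / G))           # deconvolving the exact signal: fine
x_bad = np.real(np.fft.ifft(np.fft.fft(y_noisy) / G))     # deconvolving the perturbed signal

print(np.abs(x_rec - x).max())     # small compared with x.max() ~ 0.033
print(np.abs(x_bad - x).max())     # far larger than the signal itself: the recovery is destroyed
```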
359
 
360
+ In addition, the infeasibility of <b>restoring by deconvolution</b> can be understood from another perspective. Since the forward transform process (Equations 4.1 to 4.4) is fixed, the convolution kernel is fixed, and therefore the corresponding deconvolution transform is also fixed. Since the initial data distribution $q(x)$ is arbitrary, any probability distribution can be transformed into an approximation of $\mathcal{N}(0,I)$ through this series of fixed linear transforms and convolutions. If <b>restoring by deconvolution</b> were feasible, it would mean that a fixed deconvolution could restore any data distribution $q(x)$ from $\mathcal{N}(0,I)$, which is clearly <b>paradoxical</b>: the same input under the same transform cannot yield multiple different outputs.
361
+ """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_deconvolution_en")
362
+ return
363
+
364
+
365
+ def md_cond_kl_en():
366
+ global g_latex_del
367
+
368
+ title = "Appendix A Conditional KL Divergence"
369
+ with gr.Accordion(label=title, elem_classes="first_md", elem_id="cond_kl"):
370
+ gr.Markdown(
371
+ r"""
372
+ This section mainly introduces the relationship between the <b>KL divergence</b> and the <b>conditional KL divergence</b>. Before the formal introduction, we briefly review the definitions of entropy and conditional entropy, as well as the inequality relating them, in preparation for the subsequent proofs.
373
+
374
+ <h3 style="font-size:20px">Entropy and Conditional Entropy</h3>
375
+ For any two random variables $Z, X$, the <b>Entropy</b> is defined as follows<a href="#entropy">[16]</a>:
376
+ \begin{align}
377
+ \mathbf{H}(Z) = \int -p(z)\log{p(z)}dz \tag{A.1}
378
+ \end{align}
379
+ The <b>Conditional Entropy</b> is defined as follows <a href="#cond_entropy">[17]</a>:
380
+ \begin{align}
381
+ \mathbf{H}(Z|X) = \int p(x) \overbrace{\int -p(z|x)\log{p(z|x)}dz}^{\text{Entropy}}\ dx \tag{A.2}
382
+ \end{align}
383
+ The following inequality relationship exists between the two:
384
+ \begin{align}
385
+ \mathbf{H}(Z|X) \le \mathbf{H}(Z) \tag{A.3}
386
+ \end{align}
387
+ That is to say, the Conditional Entropy is always less than or equal to the Entropy, and the two are equal only when $X$ and $Z$ are independent. The proof of this relationship can be found in the literature <a href="#cond_entropy">[17]</a>.
388
+
389
+ <h3 style="font-size:20px">KL Divergence and Conditional KL Divergence</h3>
390
+ In the same manner as the definition of Conditional Entropy, we introduce a new definition, the <b>Conditional KL Divergence</b>, denoted as $KL_{\mathcal{C}}$. Since the KL Divergence is asymmetric, it has two forms, as follows.
391
+ \begin{align}
392
+ KL_{\mathcal{C}}(q(z|x) \Vert \textcolor{blue}{p(z)}) = \int \ q(x) KL(q(z|x) \Vert \textcolor{blue}{p(z)})dx \tag{A.4} \newline
393
+ KL_{\mathcal{C}}(q(z) \Vert \textcolor{blue}{p(z|x)}) = \int \ \textcolor{blue}{p(x)} KL(q(z) \Vert \textcolor{blue}{p(z|x)})dx \tag{A.5}
394
+ \end{align}
395
+
396
+ Similar to Conditional Entropy, there also exists a similar inequality relationship for Conditional KL Divergence:
397
+ \begin{align}
398
+ KL_{\mathcal{C}}(q(z|x) \Vert \textcolor{blue}{p(z)}) \ge KL(q(z) \Vert \textcolor{blue}{p(z)}) \tag{A.6} \newline
399
+ KL_{\mathcal{C}}(q(z) \Vert \textcolor{blue}{p(z|x)}) \ge KL(q(z) \Vert \textcolor{blue}{p(z)}) \tag{A.7}
400
+ \end{align}
401
+ That is to say, the Conditional KL Divergence is always greater than or equal to the KL Divergence, and the two are equal only when $X$ and $Z$ are independent.
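  Before the formal proofs below, these inequalities (together with A.3) can be sanity-checked numerically on a small discrete example (a sketch with randomly generated distributions; the sizes and the random seed are arbitrary assumptions):

```python
import numpy as np

# Sketch: check inequalities A.3, A.6 and A.7 on random discrete distributions.
rng = np.random.default_rng(0)
nx, nz = 6, 8

def entropy(p):
    return -(p * np.log(p)).sum()

def kl(a, b):
    return (a * np.log(a / b)).sum()

q_xz = rng.random((nx, nz)); q_xz /= q_xz.sum()          # joint q(x, z)
q_x, q_z = q_xz.sum(axis=1), q_xz.sum(axis=0)            # marginals q(x), q(z)
q_z_given_x = q_xz / q_x[:, None]                        # q(z|x), one row per value of x

H_Z  = entropy(q_z)
H_ZX = sum(q_x[i] * entropy(q_z_given_x[i]) for i in range(nx))
print(H_ZX <= H_Z)                                       # A.3

p_z = rng.random(nz); p_z /= p_z.sum()                   # an arbitrary p(z) for A.6
kl_c1 = sum(q_x[i] * kl(q_z_given_x[i], p_z) for i in range(nx))     # A.4
print(kl_c1 >= kl(q_z, p_z))                             # A.6

p_xz = rng.random((nx, nz)); p_xz /= p_xz.sum()          # an arbitrary joint p(x, z) for A.7
p_x, p_z_marg = p_xz.sum(axis=1), p_xz.sum(axis=0)
p_z_given_x = p_xz / p_x[:, None]
kl_c2 = sum(p_x[i] * kl(q_z, p_z_given_x[i]) for i in range(nx))     # A.5
print(kl_c2 >= kl(q_z, p_z_marg))                        # A.7, with p(z) = sum_x p(z|x)p(x)
```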
402
+
403
+ The following provides proofs of the conclusions in Equation A.6 and Equation A.7, respectively.
404
+
405
+ For equation A.6, the proof is as follows:
406
+ \begin{align}
407
+ KL_{\mathcal{C}}(q(z|x) \Vert \textcolor{blue}{p(z)}) &= \int \ q(x) KL(q(z|x) \Vert \textcolor{blue}{p(z)})dx \tag{A.8} \newline
408
+ &= \iint q(x) q(z|x) \log \frac{q(z|x)}{\textcolor{blue}{p(z)}}dzdx \tag{A.9} \newline
409
+ &= -\overbrace{\iint - q(x)q(z|x) \log q(z|x) dzdx}^{\text{Conditional Entropy }\mathbf{H}_q(Z|X)} - \iint q(x) q(z|x) \log \textcolor{blue}{p(z)} dzdx \tag{A.10} \newline
410
+ &= -\mathbf{H}_q(Z|X) - \int \left\lbrace \int q(x) q(z|x)dx \right\rbrace \log \textcolor{blue}{p(z)}dz \tag{A.11} \newline
411
+ &= -\mathbf{H}_q(Z|X) + \overbrace{\int - q(z) \log p(z)dz}^{\text{Cross Entropy}} \tag{A.12} \newline
412
+ &= -\mathbf{H}_q(Z|X) + \int q(z)\left\lbrace \log\frac{q(z)}{\textcolor{blue}{p(z)}} -\log q(z)\right\rbrace dz \tag{A.13} \newline
413
+ &= -\mathbf{H}_q(Z|X) + \int q(z)\log\frac{q(z)}{\textcolor{blue}{p(z)}}dz + \overbrace{\int - q(z)\log q(z)dz}^{\text{Entropy } \mathbf{H}_q(Z)} \tag{A.14} \newline
414
+ &= KL(q(z) \Vert \textcolor{blue}{p(z)}) + \overbrace{\mathbf{H}_q(Z) - \mathbf{H}_q(Z|X)}^{\ge 0} \tag{A.15} \newline
415
+ &\ge KL(q(z) \Vert \textcolor{blue}{p(z)}) \tag{A.16}
416
+ \end{align}
417
+ In this context, equation A.15 applies the conclusion that <b>Conditional Entropy is always less than or equal to Entropy</b>. Thus, the relationship in equation A.6 is derived.
418
+
419
+ For equation A.7, the proof is as follows:
420
+ \begin{align}
421
+ KL(\textcolor{blue}{q(z)} \Vert p(z)) &= \int \textcolor{blue}{q(z)}\log\frac{\textcolor{blue}{q(z)}}{p(z)}dz \tag{A.15} \newline
422
+ &= \int q(z)\log\frac{q(z)}{\int p(z|x)p(x)dx}dz \tag{A.16} \newline
423
+ &= \textcolor{orange}{\int p(x)dx}\int q(z)\log q(z)dz - \int q(z)\textcolor{red}{\log\int p(z|x)p(x)dx}dz \qquad \ \textcolor{orange}{\int p(x)dx=1} \tag{A.17} \newline
424
+ &\le \iint p(x) q(z)\log q(z)dzdx - \int q(z)\textcolor{red}{\int p(x)\log p(z|x)dx}dz \ \qquad \textcolor{red}{\text{jensen\ inequality}} \tag{A.18} \newline
425
+ &= \iint p(x)q(z)\log q(z)dzdx - \iint p(x)q(z)\log p(z|x)dzdx \tag{A.19} \newline
426
+ &= \iint p(x)q(z)(\log q(z) - \log p(z|x))dzdx \tag{A.20} \newline
427
+ &= \iint p(x)q(z)\log \frac{q(z)}{p(z|x)}dzdx \tag{A.21} \newline
428
+ &= \int p(x)\left\lbrace \int q(z)\log \frac{q(z)}{p(z|x)}dz\right\rbrace dx \tag{A.22} \newline
429
+ &= \int p(x)KL(\textcolor{blue}{q(z)} \Vert p(z|x))dx \tag{A.23} \newline
430
+ &= KL_{\mathcal{C}}(q(z) \Vert \textcolor{blue}{p(z|x)}) \tag{A.24}
431
+ \end{align}
432
+ Thus, the relationship in equation A.7 is obtained.
433
+
434
+ Another <b>important conclusion</b> can be drawn from equation A.15.
435
+
436
+ The KL Divergence is often used to fit the distribution of data. In this scenario, the distribution of the data is denoted by $q(z)$ and the parameterized model distribution is denoted by $\textcolor{blue}{p_\theta(z)}$. During the optimization process, since both $q(z|x)$ and $q(x)$ remain constant, the term $\mathbf{H}(Z) - \mathbf{H}(Z|X)$ in Equation A.15 is a constant. Thus, the following relationship is obtained:
437
+ <span id="zh_cond_kl_2">
438
+ \mathop{\min}{underline}{\textcolor{blue}{p_\theta}} KL(q(z) \Vert \textcolor{blue}{p_\theta(z)}) \iff \mathop{\min}{underline}{\textcolor{blue}{p_\theta}} \int \ q(x) KL(q(z|x) \Vert \textcolor{blue}{p_\theta(z)})dx \tag{A.25}
439
+ </span>
440
+
441
+ Comparing the above relationship with <b>Denoised Score Matching</b> <a href="#dsm">[18]</a> (equation A.26), some similarities can be observed. Both introduce a new variable $X$ and substitute the target distribution $q(z)$ with $q(z|x)$. After the substitution, since $q(z|x)$ is a conditional probability distribution, both consider all conditions and perform a weighted sum using the probability of each condition occurring, $q(x)$, as the weighting coefficient.
442
+ <span id="zh_cond_kl_3">
443
+ \mathop{\min}{underline}{\textcolor{blue}{\psi_\theta}} \frac{1}{2} \int q(z) \left\lVert \textcolor{blue}{\psi_\theta(z)} - \frac{\partial q(z)}{\partial z} \right\rVert^2 dz \iff \mathop{\min}{underline}{\textcolor{blue}{\psi_\theta}} \int q(x)\ \overbrace{\frac{1}{2} \int q(z|x) \left\lVert \textcolor{blue}{\psi_\theta(z)} - \frac{\partial q(z|x)}{\partial z} \right\rVert^2 dz}^{\text{Score Matching of }q(z|x)}\ dx \tag{A.26}
444
+ </span>
445
+
446
+ The operation of the above weighted sum is somewhat similar to <em>eliminating a variable via the Total Probability Formula</em>.
447
+ \begin{align}
448
+ q(z) = \int q(z,x) dx = \int q(x) q(z|x) dx \tag{A.27}
449
+ \end{align}
450
+ """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_cond_kl_en")
451
+ return
452
+
453
+
454
+ def md_proof_ctr_en():
455
+ global g_latex_del
456
+
457
+ title = "Appendix B Proof of Contraction"
458
+ with gr.Accordion(label=title, elem_classes="first_md", elem_id="proof_ctr"):
459
+ gr.Markdown(
460
+ r"""
461
  <center> <img id="en_fig2" src="file/fig2.png" width="960" style="margin-top:12px"/> </center>
462
  <center> Figure 2: Only one component in support </center>
463
 
464
+ The following proves that, under certain conditions, the posterior transform is a contraction mapping with a unique fixed point, which is also the converged point.
465
+
466
+ The proof is divided into several cases and assumes that the random variable is discrete, so the posterior transform can be regarded as a single-step transition of a <b>discrete Markov Chain</b>, with the posterior $q(x|z)$ corresponding to its <b>transition matrix</b>. Continuous variables can be regarded as discrete variables with infinitely many states.
467
  <ol style="list-style-type:decimal">
468
  <li> When $q(x)$ is greater than 0, the posterior transform matrix $q(x|z)$ will be greater than 0 too. Therefore, this matrix is the transition matrix of an $\textcolor{red}{\text{irreducible}}\ \textcolor{green}{\text{aperiodic}}$ Markov Chain. According to the conclusion of the literature <a href="#mc_basic_p6">[13]</a>, this transformation is a contraction mapping with respect to Total Variance metric. Therefore, according to the Banach fixed-point theorem, this transformation has a unique fixed point(converged point). </li>
469
 
 
503
 
504
  Additionally, there exists a more generalized relation about the posterior transform that is independent of $q(x|z)$: the Total Variance distance between two output distributions will always be <b>less than or equal to</b> the Total Variance distance between their corresponding input distributions, that is
505
  \begin{align}
506
+ dist(q_{o1}(x),\ q_{o2}(x)) \le dist(q_{i1}(z),\ q_{i2}(z)) \tag{B.1}
507
  \end{align}
508
  The proof is given below in discrete form:
509
  \begin{align}
510
+ \lVert q_{o1}-q_{o2}\rVert_{TV} &= \lVert Q_{x|z}q_{i1} - Q_{x|z}q_{i2}\rVert_{TV} \tag{B.2} \newline
511
+ &= \sum_{m}\textcolor{red}{|}\sum_{n}Q_{x|z}(m,n)q_{i1}(n) - \sum_{n}Q_{x|z}(m,n)q_{i2}(n)\textcolor{red}{|} \tag{B.3} \newline
512
+ &= \sum_{m}\textcolor{red}{|}\sum_{n}Q_{x|z}(m,n)(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \tag{B.4} \newline
513
+ &\leq \sum_{m}\sum_{n}Q_{x|z}(m,n)\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \qquad \qquad \qquad \text{Absolute value inequality} \tag{B.5} \newline
514
+ &= \sum_{n}\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \sum_{m} Q_{x|z}(m,n) \qquad \qquad \qquad \sum_{m} Q_{x|z}(m,n) = 1 \tag{B.6} \newline
515
+ &= \sum_{n}\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \tag{B.7}
 
 
 
 
 
 
 
 
 
516
  \end{align}
517
+ In this context, $Q_{x|z}(m,n)$ represents the element at the m-th row and n-th column of the matrix $Q_{x|z}$, and $q_{i1}(n)$ represents the n-th element of the vector $q_{i1}$.
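  This inequality relies only on each column of $Q_{x|z}$ summing to one, so it can be checked with any random column-stochastic matrix (a quick sketch; the matrix and the inputs are randomly generated assumptions, not the app's code):

```python
import numpy as np

# Sketch: inequality B.1 only needs the columns of Q_{x|z} to sum to 1,
# so a random column-stochastic matrix suffices to illustrate it.
rng = np.random.default_rng(0)
n = 64

Q = rng.random((n, n)); Q /= Q.sum(axis=0, keepdims=True)   # random column-stochastic Q_{x|z}
q1 = rng.random(n); q1 /= q1.sum()
q2 = rng.random(n); q2 /= q2.sum()

tv_in  = np.abs(q1 - q2).sum()            # ||q_i1 - q_i2||_TV as in B.7
tv_out = np.abs(Q @ q1 - Q @ q2).sum()    # ||Q q_i1 - Q q_i2||_TV as in B.2
print(tv_out <= tv_in)                    # True: the transform never expands the TV distance
```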
518
+
519
+ """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_proof_ctr_en")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
520
  return
521
 
522
 
 
554
  <a id="mc_basic_t7" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [13] Markov Chain:Basic Theory - Theorem 7 </a>
555
 
556
  <a id="mc_basic_d4" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [14] Markov Chain:Basic Theory - Definition 4 </a>
557
+
558
+ <a id="vdm" href="https://arxiv.org/pdf/2107.00630"> [15] Variational Diffusion Models </a>
559
 
560
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_reference_en")
561
 
 
569
 
570
  gr.Markdown(
571
  r"""
572
+ <b>APP</b>: This Web APP is developed using Gradio and deployed on HuggingFace. Due to limited resources (2 cores, 16G memory), the response may be slow. For a better experience, it is recommended to clone the source code from <a href="https://github.com/blairstar/The_Art_of_DPM">github</a> and run it locally. This program only relies on Gradio, SciPy, and Matplotlib.
573
 
574
  <b>Author</b>: Zhenxin Zheng, senior computer vision engineer with ten years of algorithm development experience, formerly employed by Tencent and JD.com, currently focusing on image and video generation.
575
 
 
601
 
602
  md_deconvolution_en()
603
 
604
+ md_cond_kl_en()
605
+
606
+ md_proof_ctr_en()
607
+
608
  md_reference_en()
609
 
610
  md_about_en()
RenderMarkdownZh.py CHANGED
@@ -41,9 +41,9 @@ def md_transform_zh():
41
 
42
  此变换可分为两个子变换。
43
 
44
- 第一个子变换是对随机变量$X$执行一个线性变换($\sqrt{\alpha}X$),根据文献[\[3\]](#linear_transform)的结论,线性变换使$X$的概率分布“变窄变高”,并且"变窄变高"的程度与$\alpha$的值成正比;具体可看Demo 1,左1图为随机生成的一维的概率分布,左2图是经过线性变换后的概率分布,可以看出,与左1图相比,左2图的曲线“变窄变高”了。读者可亲自测试不同的$\alpha$值,获得更直观的理解。
45
 
46
- 第二个子变换是“加上独立的随机噪声”($\sqrt{1-\alpha}\epsilon$),根据文献[\[4\]](#sum_conv)的结论,“加上独立的随机变量”等效于对两个概率分布执行卷积,由于随机噪声的概率分布为高斯形状,所以相当于执行”高斯模糊“的操作。经过模糊后,原来的概率分布将变得更加平滑,与标准正态分布将更加相似。模糊的程度与噪声大小($1-\alpha$)正相关。具体可看Demo 1,左1图是随机生成的一维概率分布,左3图是经过变换后的结果,可以看出,变换后的曲线变光滑了,棱角变少了。读者可测试不同的$\alpha$值,感受噪声大小对概率分布曲线形状的影响。左4图是综合两个子变换后的结果。
47
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_transform_zh")
48
  return
49
 
@@ -59,7 +59,7 @@ def md_likelihood_zh():
59
  \begin{align}
60
  q(z|x) &= \mathcal{N}(\sqrt{\alpha}x,\ 1-\alpha) \tag{2.1}
61
  \end{align}
62
- 具体可看Demo 2,左3图展示了$q(z|x)$的形状,从图中可以看到一条均匀的斜线,这意味着$q(z|x)$的均值与x线性相关,方差固定不变。$\alpha$值的大小将决定斜线宽度和倾斜程度。
63
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_likelihood_zh")
64
  return
65
 
@@ -96,12 +96,12 @@ def md_posterior_zh():
96
  <li> 当高斯函数的方差较大(较大噪声),或者$q(x)$剧烈变化时,$q(x|z)$的形状将较复杂,与高斯函数有较大的差别,难以建模学习。</li>
97
  </ul>
98
 
99
- 具体可看Demo 2,左4图给出后验概率分布$q(x|z)$的形态,可以看出,其形状较不规则,像一条弯曲且不均匀的曲线。当$\alpha$较大时(噪声较小),曲线将趋向于均匀且笔直。读者可调整不同的$\alpha$值,观察后验概率分布与噪声大小的关系;左5图,$\textcolor{blue}{蓝色虚线}$给出$q(x)$,$\textcolor{green}{绿色虚线}$给出式3.4中的GaussFun,$\textcolor{orange}{黄色实线}$给出两者相乘并归一化的结果,即固定z条件下后验概率$q(x|z=fixed)$。读者可调整不同z值,观察$q(x)$的波动变化对后验概率$q(x|z)$形态的影响。
100
 
101
  两个特殊状态下的后验概率分布$q(x|z)$值得考虑一下。
102
  <ul>
103
- <li> 当$\alpha \to 0$时,GaussFun的方差趋向于<b>无穷大</b>,不同$z$值的$q(x|z)$几乎变成一致,并与$q(x)$几乎相同。读者可在Demo 2中,将$\alpha$设置为0.01,观察具体的结果。</li>
104
- <li> 当$\alpha \to 1$时,GaussFun的方差趋向于<b>无穷小</b>,不同$z$值的$q(x|z)$收缩成一系列不同偏移量的Dirac delta函数, 偏移量等于$z$。但有一些例外,当q(x)存在为零的区域时,其对应的q(x|z)将不再为Dirac delta函数,而是零函数。可在Demo 2中,将$\alpha$设置为0.999,观察具体的结果。</li>
105
  </ul>
106
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_zh")
107
  return
@@ -117,11 +117,11 @@ def md_forward_process_zh():
117
  r"""
118
  对于任意的数据分布$q(x)$,均可连续应用上述的变换(如式4.1~4.4),随着变换的次数的增多,输出的概率分布将变得越来越接近于标准正态分布。对于较复杂的数据分布,需要较多的次数或者较大的噪声。
119
 
120
- 具体可看Demo 3.1,第一子图是随机生成的一维概率分布,经过7次的变换后,最终的概率分布与标准正态分布非常相似。相似的程度与迭代的次数和噪声大小正相关。对于相同的相似程度,如果每次所加的噪声较大(较小的$\alpha$值),那所需变换的次数将较少。读者可尝试不同的$\alpha$值和次数,观测最终概率分布的相似程度。
121
 
122
  起始概率分布的复杂度会比较高,随着变换的次数增多,概率分布$q(z_t)$的复杂度将会下降。根据第3节结论,更复杂的概率分布对应更复杂的后验概率分布,所以,为了保证后验概率分布与高斯函数较相似(较容易学习),在起始阶段,需使用较大的$\alpha$(较小的噪声),后期阶段可适当使用较小的$\alpha$(较大的噪声),加快向标准正态分布转变。
123
 
124
- 在Demo3的例子可以看到,随着变换次数增多,$q(z_t)$的棱角变得越来越少,同时,后验概率分布$q(z_{t-1}|z_t)$图中的斜线变得越来越笔直匀称,越来越像条件高斯分布。
125
 
126
  \begin{align}
127
  Z_1 &= \sqrt{\alpha_1} X + \sqrt{1-\alpha_1}\epsilon_1 \tag{4.1} \newline
@@ -152,7 +152,7 @@ def md_forward_process_zh():
152
  q(z_T|x) &= \mathcal{N}(0.00635\ x,\ 0.99998) \tag{4.9}
153
  \end{align}
154
 
155
- 如果只考虑$q(z_T)$,也可使用一次变换代替,变换如下:
156
  \begin{align}
157
  Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{4.10}
158
  \end{align}
@@ -171,7 +171,7 @@ def md_backward_process_zh():
171
  r"""
172
  如果知道了最终的概率分布$q(z_T)$及各个转换过程的后验概率$q(x|z),q(z_{t-1}|z_t)$,则可通过“贝叶斯公式”和“全概率公式”恢复数据分布$q(x)$,见式5.1~5.4。当最终的概率分布$q(z_T)$与标准正态分布很相似时,可用标准正态分布代替。
173
 
174
- 具体可看Demo 3.2。示例中$q(z_T)$使用$\mathcal{N}(0,1)$代替,同时通过JS Div给出了误差大小。恢复的概率分布$q(z_t)$及$q(x)$使用$\textcolor{green}{绿色曲线}$标识,原始的概率分布使用$\textcolor{blue}{蓝色曲线}$标识。可以看出,数据分布$q(x)$能够被很好地恢复回来,并且误差(JS Divergence)会小于标准正态分布替换$q(z_T)$引起的误差。
175
  \begin{align}
176
  q(z_{T-1}) &= \int q(z_{T-1},z_T)dz_T = \int q(z_{T-1}|z_T)q(z_T)dz_T \tag{5.1} \newline
177
  & \dots \notag \newline
@@ -180,21 +180,23 @@ def md_backward_process_zh():
180
  q(z_1) &= \int q(z_1,z_2) dz_1 = \int q(z_1|z_2)q(z_2)dz_2 \tag{5.3} \newline
181
  q(x) &= \int q(x,z_1) dz_1 = \int q(x|z_1)q(z_1)dz_1 \tag{5.4} \newline
182
  \end{align}
183
- 在本文中,将上述恢复过程(式5.1~5.4)所使用的变换称之为“后验概率变换”。例如,在式5.4中,变换的输入为概率分布函数$q(z_1)$,输出为概率分布函数$q(x)$,整个变换由后验概率分布$q(x|z_1)$决定。此变换也可看作为一组基函数的线性加权和,基函数为不同条件下的$q(x|z_1)$,各个基函数的权重为$q(z_1)$。在第7节,将会进一步介绍此变换的一些有趣性质。
184
 
185
- 在第3节中,我们考虑了两个特殊的后验概率分布。接下来,分析其对应的”后验概率变换“。
186
  <ul>
187
  <li> 当$\alpha \to 0$时,不同$z$值的$q(x|z)$均与$q(x)$几乎相同,也就是说,线性加权和的基函数几乎相同。此状态下,不管输入如何变化,变换的输出总为$q(x)$。</li>
188
  <li> 当$\alpha \to 1$时,不同$z$值的$q(x|z)$收缩成一系列不同偏移量的Dirac delta函数及零函数。此状态下,只要输入分布的支撑集(support set)包含于$q(x)$的支撑集,变换的输出与输入将保持一致。</li>
189
  </ul>
190
 
191
- 在第5节中提到,DDPM[\[2\]](#ddpm)论文所使用的1000次变换可使用一次变换表示:
192
  \begin{align}
193
  Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{5.5}
194
  \end{align}
195
  由于$\alpha=0.0000403$非常小,其对应的GaussFun(式3.4)的标准差达到157.52,而$X$的范围限制在$[-1, 1]$,远小于GaussFun的标准差。在$x \in [-1, 1]$范围内,GaussFun应该接近于常量,没有什么变化,所以不同的$z_T$对应的$q(x|z_T)$均与$q(x)$几乎相同。在这种状态下,对于$q(x|z_T)$相应的后验概率变换,不管输入分布是什么,输出分布都将是$q(x)$。
196
 
197
  <b>所以,理论上,在DDPM模型中,无需非得使用标准正态分布代替$q(z_T)$,也可使用其它任意的分布代替。</b>
 
 
198
 
199
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_backward_process_zh")
200
  return
@@ -209,13 +211,13 @@ def md_fit_posterior_zh():
209
  # because of the render bug in gradio markdown, some formulas are render in ExtraBlock.js
210
  gr.Markdown(
211
  r"""
212
- 由第四节前半部分可知,各个后验概率分布是未知的,并且与$q(x)$有关。所以,为了恢复数据分布或者从数据分布中采样,需要对各个后验概率分布进行学习估计。
213
 
214
- 由第四节后半部分可知,当满足一定条件时,各个后验概率分布$q(x|z)、q(z_{t-1}|z_t)$近似于高斯概率分布,所以可通过构建一批条件高斯概率模型$p(x|z),p(z_{t-1}|z_t)$,学习拟合对应的$q(x|z),q(z_{t-1}|z_t)$。
215
 
216
- 由于模型表示能力和学习能力的局限性,拟合过程会存在一定的误差,进一步会影响恢复$q(x)$的准确性。拟合误差大小与后验概率分布的形状有关。由第4节可知,当$q(x)$较复杂或者所加噪声较大时,后验概率分布会较复杂,与高斯分布差别较大,从而导致拟合误差,进一步影响恢复$q(x)$。
217
 
218
- 具体可看Demo 3.3,读者可测试不同复杂程度的$q(x)$和$\alpha$,观看后验概率分布$q(z_{t-1}|z_t)$的拟合程度,以及恢复$q(x)$的准确度。恢复的概率分布使用$\textcolor{orange}{橙色}$标识,同时也通过JS divergence给出误差。
219
 
220
  关于拟合的目标函数,与其它概率模型类似,可$\textcolor{red}{优化交叉熵损失}$,使$p(z_{t-1}|z_t)$逼近于$q(z_{t-1}|z_t)$。由于$(z_{t-1}|z_t)$是条件概率,所以需要综合考虑各个条件,以<b>各个条件发生的概率$q(z_t)$</b>加权平均<b>各个条件对应的交叉熵</b>。最终的损失函数形式如下:
221
  \begin{align}
@@ -224,7 +226,7 @@ def md_fit_posterior_zh():
224
  \end{align}
225
  也可以KL散度作为目标函数进行优化,KL散度与交叉熵是等价的[\[10\]](#ce_kl)。
226
  <span id="zh_fit_0">
227
- loss &= \int q(z_t) KL(q(z_{t-1}|z_t)\|\textcolor{blue}{p(z_{t-1}|z_t)})dz_t \tag{6.3} \newline
228
  &= \int q(z_t) \int q(z_{t-1}|z_t) \log \frac{q(z_{t-1}|z_t)}{\textcolor{blue}{p(z_{t-1}|z_t)}} dz_{t-1} dz_t \tag{6.4} \newline
229
  &= -\int q(z_t)\ \underbrace{\int q(z_{t-1}|z_t) \log \textcolor{blue}{p(z_{t-1}|z_t)}dz_{t-1}}{underline}{\text{Cross Entropy}}\ dz_t + \underbrace{\int q(z_t) \int q(z_{t-1}|z_t) \log q(z_{t-1}|z_t)}{underline}{\text{Is Constant}} dz \tag{6.5}
230
  </span>
@@ -248,8 +250,8 @@ def md_fit_posterior_zh():
248
  &\quad - \iint \int q(x)q(z_{t-1}, z_t|x) \log \textcolor{blue}{p(z_{t-1}|z_t)}dxdz_{t-1}dz_t - \textcolor{orange}{C_1} \tag{6.11} \newline
249
  &= \iint \int q(x)q(z_{t-1},z_t|x) \log \frac{q(z_{t-1}|z_t,x)}{\textcolor{blue}{p(z_{t-1}|z_t)}}dxdz_{t-1}dz_t - \textcolor{orange}{C_1} \tag{6.12} \newline
250
  &= \iint q(x)q(z_t|x)\int q(z_{t-1}|z_t,x) \log \frac{q(z_{t-1}|z_t,x)}{\textcolor{blue}{p(z_{t-1}|z_t)}}dz_{t-1}\ dz_xdz_t - \textcolor{orange}{C_1} \tag{6.13} \newline
251
- &= \iint \ q(x)q(z_t|x) KL[q(z_{t-1}|z_t,x)\|\textcolor{blue}{p(z_{t-1}|z_t)}]dxdz_t - \textcolor{orange}{C_1} \tag{6.14} \newline
252
- &\propto \iint \ q(x)q(z_t|x) KL[q(z_{t-1}|z_t,x)\|\textcolor{blue}{p(z_{t-1}|z_t)}]dxdz_t \tag{6.15} \newline
253
  \end{align}
254
 
255
  上式中的$C_1$项是一个固定值,不包含待优化的参数,其中,$q(x)$是固定的概率分布,$q(z_{t-1}|z_t)$也是固定概率分布,具体形式由$q(x)$及系数$\alpha$确定。
@@ -265,14 +267,15 @@ def md_fit_posterior_zh():
265
 
266
  根据一致项证明的结论,以及交叉熵与KL散度的关系,可得出一个有趣的结论:
267
  <span id="zh_fit_1">
268
- \mathop{\min}{underline}{\textcolor{blue}{p}} \int q(z_t) KL(q(z_{t-1}|z_t)\|\textcolor{blue}{p(z_{t-1}|z_t)})dz_t \iff \mathop{\min}{underline}{\textcolor{blue}{p}} \iint \ q(x)q(z_t|x) KL[q(z_{t-1}|z_t,x)\|\textcolor{blue}{p(z_{t-1}|z_t)}]dxdz_t
269
  </span>
270
  比较左右两边的式子,可以看出,右边的目标函数比左边的目标函数多了一个条件变量$X$,同时也多了一个关于$X$积分,并且以$X$的发生的概率$q(x)$作为积分的加权系数。
271
 
272
  依照类似的思路,可推导出一个更通用的关系:
273
  <span id="zh_fit_2">
274
- \mathop{\min}{underline}{\textcolor{blue}{p}} KL(q(z)\|\textcolor{blue}{p(z)}) \iff \mathop{\min}_{\textcolor{blue}{p}} \int \ q(x) KL(q(z|x)\|\textcolor{blue}{p(z)})dx
275
  </span>
 
276
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_fit_posterior_zh")
277
  return
278
 
@@ -284,7 +287,7 @@ def md_posterior_transform_zh():
284
 
285
  gr.Markdown(
286
  r"""
287
- <h3 style="font-size:18px"> Contraction Mapping and Fixed Point </h3>
288
  \begin{align}
289
  q(x) &= \int q(x,z) dz = \int q(x|z)q(z)dz \tag{7.1}
290
  \end{align}
@@ -294,25 +297,161 @@ def md_posterior_transform_zh():
294
  dist(q_{o1}(z),\ q_{o2}(z)) < dist(q_{i1}(x),\ q_{i2}(x)) \tag{7.2}
295
  \end{align}
296
 
297
- 读者可查看Demo 4.1,左侧三个图呈现一个变换的过程,左1图是任意的数据分布$q(x)$,左3图是变换后的概率分布,左2图是后验概率分布。可更改随机种子生成新的数据分布,调整$\alpha$值引入不同程度的噪声。左侧最后两个图展示变换的“压缩性质”,左4图展示随机生成的两个输入分布,同时给出其距离度量值$div_{in}$;左5图展示经过变换后的两个输出分布,输出分布之间的距离标识为$div_{out}$。读者可改变输入的随机种子,切换不同的输入。可在图中看到,对于任意的输入,$div_{in}$总是小于$div_{out}$。另外,也可改变$\alpha$的值,将会看到,$\alpha$越小(噪声越大),$\frac{div_{out}}{div_{in}}$的比值也越小,即收缩率越大。
298
-
299
  由Banach fixed-point theorem<a href="#fixed_point">[5]</a>可知,压缩映射存在惟一一个定点(收敛点)。也就是说,对于任意的输入分布,可以连续迭代应用“后验概率变换”,只要迭代次数足够多,最终都会输出同一个分布。经过大量一维随机变量实验发现,定点(收敛点)<b>位于$q(x)$附近</b>。并且,与$\alpha$的值有关,$\alpha$越小(噪声越大),离得越近。
300
 
301
- 读者可看Demo 4.2,此部分展示迭代收敛的例子。选择合适的迭代次数,点中“apply iteration transform”,将逐步画出迭代的过程,每个子图均会展示各自变换后的输出分布($\textcolor{green}{绿色曲线}$),收敛的参考点分布$q(x)$以$\textcolor{blue}{蓝色曲线}$画出,同时给出输出分布与$q(x)$之间的距离$dist$。可以看出,随着迭代的次数增加,输出分布与$q(x)$越来越相似,并最终会稳定在$q(x)$附近。对于较复杂的分布,可能需要较多迭代的次数或者较大的噪声。迭代次数可以设置为上万步,但会花费较长时间。
302
 
303
- 对于一维离散的情况,$q(x|z)$将离散成一个矩阵(暂记为$Q_{x|z}$),$q(z)$离散成一个向量(记为$\boldsymbol{q_i}$),积分操作$\int q(x|z)q(z)dz$将离散成"矩阵-向量"乘法操作,所以后验概率变换可写成
304
  \begin{align}
305
  \boldsymbol{q_o} &= Q_{x|z}\ \boldsymbol{q_i} & \quad\quad &\text{1 iteration} \tag{7.3} \newline
306
  \boldsymbol{q_o} &= Q_{x|z}\ Q_{x|z}\ \boldsymbol{q_i} & \quad\quad &\text{2 iteration} \tag{7.4} \newline
307
  & \dots & \notag \newline
308
  \boldsymbol{q_o} &= (Q_{x|z})^n\ \boldsymbol{q_i} & \quad\quad &\text{n iteration} \tag{7.5} \newline
309
  \end{align}
310
- 于是,为了更深入地理解变换的特点,Demo 4.2也画出矩阵$(Q_{x|z})^n$的结果。从图里可以看到,当迭代趋向收敛时,矩阵$(Q_{x|z})^n$的行向量将变成一个常数向量,即向量的各分量都相等。在二维密度图里将表现为一条横线。
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  <center> <img src="file/fig2.png" width="960" style="margin-top:12px"/> </center>
313
  <center> Figure 2: Only one component in support </center>
314
-
315
- 下面分几种情况证明,后验概率变换是一个压缩映射,并存在惟一收敛点。证明的过程假设随机变量是离散型的,因此,后验概率变换可看作是一个<b>离散Markov Chain</b>的一步转移,后验概率$q(x|z)$对应于<b>转移矩阵</b>(Transfer Matrix)。连续型的变量可认为是无限多状态的离散型变量。
 
 
316
 
317
  <ol style="list-style-type:decimal">
318
  <li> 当$q(x)$均大于0时,后验概率变换矩阵$q(x|z)$将大于0,于是此矩阵是一个$\textcolor{red}{不可约}\textcolor{green}{非周期}$的Markov Chain的转移矩阵,根据文献<a href="#mc_basic_p6">[13]</a>的结论,此变换是一个关于Total Variance度量的压缩映射,于是,根据Banach fixed-point theorem,此变换存在惟一定点(收敛点)。</li>
@@ -354,55 +493,23 @@ def md_posterior_transform_zh():
354
 
355
  另外,后验概率变换存在一个更通用的关系,与$q(x|z)$的具体值无关: 两个输出分布的之间的Total Variance距离总是会<b>小于等于</b>对应输入分布之间的Total Variance距离,即
356
  \begin{align}
357
- dist(q_{o1}(x),\ q_{o2}(x)) <= dist(q_{i1}(z),\ q_{i2}(z)) \notag
358
  \end{align}
359
  下面通过离散的形式给出证明:
360
  \begin{align}
361
- \lVert q_{o1}-q_{o2}\rVert_{TV} &= \lVert Q_{x|z}q_{i1} - Q_{x|z}q_{i2}\rVert_{TV} \tag{7.6} \newline
362
- &= \sum_{m}\textcolor{red}{|}\sum_{n}Q_{x|z}(m,n)q_{i1}(n) - \sum_{n}Q_{x|z}(m,n)q_{i2}(n)\textcolor{red}{|} \tag{7.7} \newline
363
- &= \sum_{m}\textcolor{red}{|}\sum_{n}Q_{x|z}(m,n)(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \tag{7.8} \newline
364
- &\leq \sum_{m}\sum_{n}Q_{x|z}(m,n)\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \qquad \qquad \qquad \text{Absolute value inequality} \tag{7.9} \newline
365
- &= \sum_{n}\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \sum_{m} Q_{x|z}(m,n) \qquad \qquad \qquad \sum_{m} Q_{x|z}(m,n) = 1 \tag{7.10} \newline
366
- &= \sum_{n}\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \tag{7.11}
367
  \end{align}
368
  其中,$Q_{x|z}(m,n)$表示矩阵$Q_{x|z}$的第m行第n列的元素,$q_{i1}(n)$表示向量$q_{i1}$的第n个元素。
369
 
370
- 关于定点分布与输入分布q(x)之间距离的关系,目前尚不能严格证明。
371
-
372
- <h3 style="font-size:18px"> 恢复数据分布过程中的抗噪声能力 </h3>
373
- 由上面的分析可知,当满足一些条件时,"后验概率变换"是一个压缩映射,所以存在如下的关系:
374
- \begin{align}
375
- dist(q(x),\ q_o(x)) < dist(q(z),\ q_i(z)) \tag{7.12}
376
- \end{align}
377
- 其中,$q(z)$是理想的输入分布,$q(x)$理想的输出分布,$q_i(x)$是任意的输入分布,$q_o(x)$是$q_i(z)$经过变换后的输出分布。
378
-
379
- 上式表明,输出的分布$q_o(x)$与理想输出分布q(x)之间的距离总会</em>小于</em>输入分布$q_i(z)$与理想输入分布q(x)的距离。于是,"后验概率变换"具备一定的抵抗噪声能力。这意味着,在恢复$q(x)$的过程中,哪怕输入的“末尾分布$q(z_T)”$存在一定的误差,经过一系列变换后,输出的“数据分布$q(x)$“的误差也会比输入的误差更小。
380
-
381
- 具体可看Demo 3.2,通过增加“noise ratio”的值可以向“末尾分布$q(z_T)$”添加噪声,点击“apply”按钮将逐步画出恢复的过程,恢复的分布以$\textcolor{red}{红色曲线}$画出,同时也会通过JS散度标出误差的大小。将会看到,恢复的$q(x)$的误差总是小于$q(z_T)$的误差。
382
-
383
- 由上面的讨论可知,$\alpha$越小(即变换过程中使用的噪声越大),压缩映射的压缩率越大,于是,抗噪声的能力也越强。
384
- """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_transform_zh")
385
  return
386
 
387
 
388
- def md_deconvolution_zh():
389
- global g_latex_del
390
-
391
- title = "8. Can the data distribution be restored by deconvolution?"
392
- with gr.Accordion(label=title, elem_classes="first_md", elem_id="deconvolution"):
393
-
394
- gr.Markdown(
395
- r"""
396
- 在第二节中提到,式2.1的变换可分为两个子变换,第一个子变换为”线性变换“,第二个为“加上独立高斯噪声”。线性变换相当于对概率分布进行拉伸变换,所以存在逆变换。"加上独立高斯噪声”相当于对概率分布执行卷积操作,卷积操作可通过逆卷积恢复。所以,理论上,可通过“逆线性变换”和“逆卷积”从最终的概率分布$q(z_T)$恢复数据分布$q(x)$。
397
-
398
- 但实际上,会存在一些问题。由于逆卷积对误差极为敏感,具有很高的输入灵敏度,很小的输入噪声就会引起输出极大的变化[\[11\]](#deconv_1)[\[12\]](#deconv_2)。而在扩散模型中,会使用标准正态分布近似代替$q(z_T)$,因此,在恢复的起始阶段就会引入噪声。虽然噪声较小,但由于逆卷积的敏感性,噪声会逐步放大,影响恢复。
399
-
400
- 另外,也可以从另一个角度理解“逆卷积恢复”的不可行性。由于前向变换的过程(式4.1~4.4)是确定的,所以卷积核是固定的,因此,相应的“逆卷积变换“也是固定的。由于起始的数据分布$q(x)$可以是任意的分布,所以,通过一系列固定的“卷积正变换”,可以将任意的概率分布转换成近似$\mathcal{N}(0,I)$的分布。如“逆卷积变换“可行,则意味着,可用一个固定的“逆卷积变换",将$\mathcal{N}(0,I)$分布恢复成任意的数据分布$q(x)$,这明显是一个悖论。同一个输入,同一个变换,不可能会有多个输出。
401
- """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_deconvolution_zh")
402
- return
403
-
404
-
405
-
406
  def md_reference_zh():
407
  global g_latex_del
408
 
@@ -438,6 +545,14 @@ def md_reference_zh():
438
 
439
  <a id="mc_basic_d4" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [14] Markov Chain:Basic Theory - Definition 4 </a>
440
 
 
 
 
 
 
 
 
 
441
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_reference_zh")
442
 
443
  return
@@ -450,7 +565,7 @@ def md_about_zh():
450
 
451
  gr.Markdown(
452
  r"""
453
- <b>APP</b>: 本APP是使用Gradio开发,并部署在HuggingFace。由于资源有限(2核,16G内存),所以可能会响应较慢。为了更好地体验,建议从<a href="https://github.com/blairstar/The_Art_of_DPM">github</a>复制源代码,在本地机器运行。本APP只依赖Gradio, SciPy, Matplotlib。
454
 
455
  <b>Author</b>: 郑镇鑫,资深视觉算法工程师,十年算法开发经历,曾就职于腾讯京东等互联网公司,目前专注于视频生成(类似Sora)。
456
 
@@ -462,7 +577,7 @@ def md_about_zh():
462
 
463
  def run_app():
464
 
465
- # with gr.Blocks(css=g_css, js="() => insert_special_formula(); ", head=js_head) as demo:
466
  with gr.Blocks(css=g_css, js="() => {insert_special_formula(); write_markdown();}", head=js_head) as demo:
467
  md_introduction_zh()
468
 
@@ -481,7 +596,11 @@ def run_app():
481
  md_posterior_transform_zh()
482
 
483
  md_deconvolution_zh()
484
-
 
 
 
 
485
  md_reference_zh()
486
 
487
  md_about_zh()
 
41
 
42
  此变换可分为两个子变换。
43
 
44
+ 第一个子变换是对随机变量$X$执行一个线性变换($\sqrt{\alpha}X$),根据文献[\[3\]](#linear_transform)的结论,线性变换使$X$的概率分布“变窄变高”,并且"变窄变高"的程度与$\alpha$的值成正比。具体可看<a href="#demo_1">Demo 1</a>,左1图为随机生成的一维的概率分布,左2图是经过线性变换后的概率分布,可以看出,与左1图相比,左2图的曲线“变窄变高”了。读者可亲自测试不同的$\alpha$值,获得更直观的理解。
45
 
46
+ 第二个子变换是“加上独立的随机噪声”($\sqrt{1-\alpha}\epsilon$),根据文献[\[4\]](#sum_conv)的结论,“加上独立的随机变量”等效于对两个概率分布执行卷积,由于随机噪声的概率分布为高斯形状,所以相当于执行”高斯模糊“的操作。经过模糊后,原来的概率分布将变得更加平滑,与标准正态分布将更加相似。模糊的程度与噪声大小($1-\alpha$)正相关。具体可看<a href="#demo_1">Demo 1</a>,左1图是随机生成的一维概率分布,左3图是经过变换后的结果,可以看出,变换后的曲线变光滑了,棱角变少了。读者可测试不同的$\alpha$值,感受噪声大小对概率分布曲线形状的影响。左4图是综合两个子变换后的结果。
47
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_transform_zh")
48
  return
49
 
 
59
  \begin{align}
60
  q(z|x) &= \mathcal{N}(\sqrt{\alpha}x,\ 1-\alpha) \tag{2.1}
61
  \end{align}
62
+ 具体可看<a href="#demo_2">Demo 2</a>,左3图展示了$q(z|x)$的形状,从图中可以看到一条均匀的斜线,这意味着$q(z|x)$的均值与x线性相关,方差固定不变。$\alpha$值的大小将决定斜线宽度和倾斜程度。
63
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_likelihood_zh")
64
  return
65
 
 
96
  <li> 当高斯函数的方差较大(较大噪声),或者$q(x)$剧烈变化时,$q(x|z)$的形状将较复杂,与高斯函数有较大的差别,难以建模学习。</li>
97
  </ul>
98
 
99
+ 具体可看<a href="#demo_2">Demo 2</a>,左4图给出后验概率分布$q(x|z)$的形态,可以看出,其形状较不规则,像一条弯曲且不均匀的曲线。当$\alpha$较大时(噪声较小),曲线将趋向于均匀且笔直。读者可调整不同的$\alpha$值,观察后验概率分布与噪声大小的关系;左5图,$\textcolor{blue}{蓝色虚线}$给出$q(x)$,$\textcolor{green}{绿色虚线}$给出式3.4中的GaussFun,$\textcolor{orange}{黄色实线}$给出两者相乘并归一化的结果,即固定z条件下后验概率$q(x|z=fixed)$。读者可调整不同z值,观察$q(x)$的波动变化对后验概率$q(x|z)$形态的影响。
100
 
101
  两个特殊状态下的后验概率分布$q(x|z)$值得考虑一下。
102
  <ul>
103
+ <li> 当$\alpha \to 0$时,GaussFun的方差趋向于<b>无穷大</b>,不同$z$值的$q(x|z)$几乎变成一致,并与$q(x)$几乎相同。读者可在<a href="#demo_2">Demo 2</a>中,将$\alpha$设置为0.01,观察具体的结果。</li>
104
+ <li> 当$\alpha \to 1$时,GaussFun的方差趋向于<b>无穷小</b>,不同$z$值的$q(x|z)$收缩成一系列不同偏移量的Dirac delta函数, 偏移量等于$z$。但有一些例外,当q(x)存在为零的区域时,其对应的q(x|z)将不再为Dirac delta函数,而是零函数。可在<a href="#demo_2">Demo 2</a>中,将$\alpha$设置为0.999,观察具体的结果。</li>
105
  </ul>
106
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_zh")
107
  return
 
117
  r"""
118
  对于任意的数据分布$q(x)$,均可连续应用上述的变换(如式4.1~4.4),随着变换的次数的增多,输出的概率分布将变得越来越接近于标准正态分布。对于较复杂的数据分布,需要较多的次数或者较大的噪声。
119
 
120
+ 具体可看<a href="#demo_3_1">Demo 3.1</a>,第一子图是随机生成的一维概率分布,经过7次的变换后,最终的概率分布与标准正态分布非常相似。相似的程度与迭代的次数和噪声大小正相关。对于相同的相似程度,如果每次所加的噪声较大(较小的$\alpha$值),那所需变换的次数将较少。读者可尝试不同的$\alpha$值和次数,观测最终概率分布的相似程度。
121
 
122
  起始概率分布的复杂度会比较高,随着变换的次数增多,概率分布$q(z_t)$的复杂度将会下降。根据第3节结论,更复杂的概率分布对应更复杂的后验概率分布,所以,为了保证后验概率分布与高斯函数较相似(较容易学习),在起始阶段,需使用较大的$\alpha$(较小的噪声),后期阶段可适当使用较小的$\alpha$(较大的噪声),加快向标准正态分布转变。
123
 
124
+ 在<a href="#demo_3_1">Demo 3.1</a>的例子可以看到,随着变换次数增多,$q(z_t)$的棱角变得越来越少,同时,后验概率分布$q(z_{t-1}|z_t)$图中的斜线变得越来越笔直匀称,越来越像条件高斯分布。
125
 
126
  \begin{align}
127
  Z_1 &= \sqrt{\alpha_1} X + \sqrt{1-\alpha_1}\epsilon_1 \tag{4.1} \newline
 
152
  q(z_T|x) &= \mathcal{N}(0.00635\ x,\ 0.99998) \tag{4.9}
153
  \end{align}
154
 
155
+ 如果只考虑边际分布$q(z_T)$,也可使用一次变换代替,变换如下:
156
  \begin{align}
157
  Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{4.10}
158
  \end{align}
 
171
  r"""
172
  如果知道了最终的概率分布$q(z_T)$及各个转换过程的后验概率$q(x|z),q(z_{t-1}|z_t)$,则可通过“贝叶斯公式”和“全概率公式”恢复数据分布$q(x)$,见式5.1~5.4。当最终的概率分布$q(z_T)$与标准正态分布很相似时,可用标准正态分布代替。
173
 
174
+ 具体可看<a href="#demo_3_2">Demo 3.2</a>。示例中$q(z_T)$使用$\mathcal{N}(0,1)$代替,同时通过JS Div给出了误差大小。恢复的概率分布$q(z_t)$及$q(x)$使用$\textcolor{green}{绿色曲线}$标识,原始的概率分布使用$\textcolor{blue}{蓝色曲线}$标识。可以看出,数据分布$q(x)$能够被很好地恢复回来,并且误差(JS Divergence)会小于标准正态分布替换$q(z_T)$引起的误差。
175
  \begin{align}
176
  q(z_{T-1}) &= \int q(z_{T-1},z_T)dz_T = \int q(z_{T-1}|z_T)q(z_T)dz_T \tag{5.1} \newline
177
  & \dots \notag \newline
 
180
  q(z_1) &= \int q(z_1,z_2) dz_1 = \int q(z_1|z_2)q(z_2)dz_2 \tag{5.3} \newline
181
  q(x) &= \int q(x,z_1) dz_1 = \int q(x|z_1)q(z_1)dz_1 \tag{5.4} \newline
182
  \end{align}
183
+ 在本文中,将上述恢复过程(式5.1~5.4)所使用的变换称之为“后验概率变换”。例如,在式5.4中,变换的输入为概率分布函数$q(z_1)$,输出为概率分布函数$q(x)$,整个变换由后验概率分布$q(x|z_1)$决定。此变换也可看作为一组基函数的线性加权和,基函数为不同条件下的$q(x|z_1)$,各个基函数的权重为$q(z_1)$。在<a href="#posterior_transform">第7节</a>,将会进一步介绍此变换的一些有趣性质。
184
 
185
+ 在<a href="#posterior">第3节</a>中,我们考虑了两个特殊的后验概率分布。接下来,分析其对应的”后验概率变换“。
186
  <ul>
187
  <li> 当$\alpha \to 0$时,不同$z$值的$q(x|z)$均与$q(x)$几乎相同,也就是说,线性加权和的基函数几乎相同。此状态下,不管输入如何变化,变换的输出总为$q(x)$。</li>
188
  <li> 当$\alpha \to 1$时,不同$z$值的$q(x|z)$收缩成一系列不同偏移量的Dirac delta函数及零函数。此状态下,只要输入分布的支撑集(support set)包含于$q(x)$的支撑集,变换的输出与输入将保持一致。</li>
189
  </ul>
190
 
191
+ 在<a href="#forward_process">第4节</a>中提到,DDPM[\[2\]](#ddpm)论文所使用的1000次变换可使用一次变换表示:
192
  \begin{align}
193
  Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{5.5}
194
  \end{align}
195
  由于$\alpha=0.0000403$非常小,其对应的GaussFun(式3.4)的标准差达到157.52,而$X$的范围限制在$[-1, 1]$,远小于GaussFun的标准差。在$x \in [-1, 1]$范围内,GaussFun应该接近于常量,没有什么变化,所以不同的$z_T$对应的$q(x|z_T)$均与$q(x)$几乎相同。在这种状态下,对于$q(x|z_T)$相应的后验概率变换,不管输入分布是什么,输出分布都将是$q(x)$。
196
 
197
  <b>所以,理论上,在DDPM模型中,无需非得使用标准正态分布代替$q(z_T)$,也可使用其它任意的分布代替。</b>
198
+
199
+ 读者可亲自做一个类似的实验。在<a href="#demo_3_1">Demo 3.1</a>中,将start_alpha设置0.25,end_alpha也设置为0.25,step设置为7,此时$q(z_7)=\sqrt{0.000061}X + \sqrt{1-0.000061}\epsilon$,与DDPM的$q(z_T)$基本相似。点击<b>apply</b>执行前向变换($\textcolor{blue}{蓝色曲线}$),为接下来的反向恢复做准备。在<a href="#demo_3_2">Demo 3.2</a>中,noise_ratio设置为1,为末端分布$q(z_7)$引入100%的噪声,切换nose_random_seed的值可改变噪声的分布,取消选择backward_pdf,减少画面的干扰。点击<b>apply</b>将通过后验概率变换恢复$q(x)$,将会看到,不管输入的$q(z_7)$的形状如何,恢复的$q(x)$均与原始的$q(x)$完全相同, JS Divergence为0,恢复的过程使用$\textcolor{red}{红色曲线}$画出。
200
 
201
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_backward_process_zh")
202
  return
 
211
  # because of the render bug in gradio markdown, some formulas are render in ExtraBlock.js
212
  gr.Markdown(
213
  r"""
214
+ 由<a href="#posterior">第3节</a>前半部分可知,各个后验概率分布是未知的,并且与$q(x)$有关。所以,为了恢复数据分布或者从数据分布中采样,需要对各个后验概率分布进行学习估计。
215
 
216
+ 由<a href="#posterior">第3节</a>后半部分可知,当满足一定条件时,各个后验概率分布$q(x|z)、q(z_{t-1}|z_t)$近似于高斯概率分布,所以可通过构建一批条件高斯概率模型$p(x|z),p(z_{t-1}|z_t)$,学习拟合对应的$q(x|z),q(z_{t-1}|z_t)$。
217
 
218
+ 由于模型表示能力和学习能力的局限性,拟合过程会存在一定的误差,进一步会影响恢复$q(x)$的准确性。拟合误差大小与后验概率分布的形状有关。由<a href="#posterior">第3节</a>可知,当$q(x)$较复杂或者所加噪声较大时,后验概率分布会较复杂,与高斯分布差别较大,从而导致拟合误差,进一步影响恢复$q(x)$。
219
 
220
+ 具体可看<a href="#demo_3_3">Demo 3.3</a>,读者可测试不同复杂程度的$q(x)$和$\alpha$,观看后验概率分布$q(z_{t-1}|z_t)$的拟合程度,以及恢复$q(x)$的准确度。恢复的概率分布使用$\textcolor{orange}{橙色}$标识,同时也通过JS divergence给出误差。
221
 
222
  关于拟合的目标函数,与其它概率模型类似,可$\textcolor{red}{优化交叉熵损失}$,使$p(z_{t-1}|z_t)$逼近于$q(z_{t-1}|z_t)$。由于$(z_{t-1}|z_t)$是条件概率,所以需要综合考虑各个条件,以<b>各个条件发生的概率$q(z_t)$</b>加权平均<b>各个条件对应的交叉熵</b>。最终的损失函数形式如下:
223
  \begin{align}
 
226
  \end{align}
227
  也可以KL散度作为目标函数进行优化,KL散度与交叉熵是等价的[\[10\]](#ce_kl)。
228
  <span id="zh_fit_0">
229
+ loss &= \int q(z_t) KL(q(z_{t-1}|z_t) \Vert \textcolor{blue}{p(z_{t-1}|z_t)})dz_t \tag{6.3} \newline
230
  &= \int q(z_t) \int q(z_{t-1}|z_t) \frac{q(z_{t-1}|z_t)}{\textcolor{blue}{p(z_{t-1}|z_t)}} dz_{t-1} dz_t \tag{6.4} \newline
231
  &= -\int q(z_t)\ \underbrace{\int q(z_{t-1}|z_t) \log \textcolor{blue}{p(z_{t-1}|z_t)}dz_{t-1}}{underline}{\text{Cross Entropy}}\ dz_t + \underbrace{\int q(z_t) \int q(z_{t-1}|z_t) \log q(z_{t-1}|z_t)}{underline}{\text{Is Constant}} dz \tag{6.5}
232
  </span>
 
250
  &\quad - \iint \int q(x)q(z_{t-1}, z_t|x) \log \textcolor{blue}{p(z_{t-1}|z_t)}dxdz_{t-1}dz_t - \textcolor{orange}{C_1} \tag{6.11} \newline
251
  &= \iint \int q(x)q(z_{t-1},z_t|x) \log \frac{q(z_{t-1}|z_t,x)}{\textcolor{blue}{p(z_{t-1}|z_t)}}dxdz_{t-1}dz_t - \textcolor{orange}{C_1} \tag{6.12} \newline
252
  &= \iint q(x)q(z_t|x)\int q(z_{t-1}|z_t,x) \log \frac{q(z_{t-1}|z_t,x)}{\textcolor{blue}{p(z_{t-1}|z_t)}}dz_{t-1}\ dz_xdz_t - \textcolor{orange}{C_1} \tag{6.13} \newline
253
+ &= \iint \ q(x)q(z_t|x) KL(q(z_{t-1}|z_t,x) \Vert \textcolor{blue}{p(z_{t-1}|z_t)})dxdz_t - \textcolor{orange}{C_1} \tag{6.14} \newline
254
+ &\propto \iint \ q(x)q(z_t|x) KL(q(z_{t-1}|z_t,x) \Vert \textcolor{blue}{p(z_{t-1}|z_t)})dxdz_t \tag{6.15} \newline
255
  \end{align}
256
 
257
  上式中的$C_1$项是一个固定值,不包含待优化的参数,其中,$q(x)$是固定的概率分布,$q(z_{t-1}|z_t)$也是固定概率分布,具体形式由$q(x)$及系数$\alpha$确定。
 
267
 
268
  根据一致项证明的结论,以及交叉熵与KL散度的关系,可得出一个有趣的结论:
269
  <span id="zh_fit_1">
270
+ \mathop{\min}{underline}{\textcolor{blue}{p}} \int q(z_t) KL(q(z_{t-1}|z_t) \Vert \textcolor{blue}{p(z_{t-1}|z_t)})dz_t \iff \mathop{\min}{underline}{\textcolor{blue}{p}} \iint \ q(x)q(z_t|x) KL(q(z_{t-1}|z_t,x) \Vert \textcolor{blue}{p(z_{t-1}|z_t)})dxdz_t
271
  </span>
272
  比较左右两边的式子,可以看出,右边的目标函数比左边的目标函数多了一个条件变量$X$,同时也多了一个关于$X$积分,并且以$X$的发生的概率$q(x)$作为积分的加权系数。
273
 
274
  依照类似的思路,可推导出一个更通用的关系:
275
  <span id="zh_fit_2">
276
+ \mathop{\min}{underline}{\textcolor{blue}{p}} KL(q(z) \Vert \textcolor{blue}{p(z)}) \iff \mathop{\min}_{\textcolor{blue}{p}} \int \ q(x) KL(q(z|x) \Vert \textcolor{blue}{p(z)})dx
277
  </span>
278
+ 关于此结论的详细推导,可见<a href="#cond_kl">Appendix A</a>。
279
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_fit_posterior_zh")
280
  return
281
 
 
287
 
288
  gr.Markdown(
289
  r"""
290
+ <h3 style="font-size:18px"> 压缩映射及收敛点 </h3>
291
  \begin{align}
292
  q(x) &= \int q(x,z) dz = \int q(x|z)q(z)dz \tag{7.1}
293
  \end{align}
 
297
  dist(q_{o1}(z),\ q_{o2}(z)) < dist(q_{i1}(x),\ q_{i2}(x)) \tag{7.2}
298
  \end{align}
299
 
300
+ 读者可查看<a href="#demo_4_1">Demo 4.1</a>,左侧三个图呈现一个变换的过程,左1图是任意的数据分布$q(x)$,左3图是变换后的概率分布,左2图是后验概率分布。可更改随机种子生成新的数据分布,调整$\alpha$值引入不同程度的噪声。左侧最后两个图展示变换的“压缩性质”,左4图展示随机生成的两个输入分布,同时给出其距离度量值$div_{in}$;左5图展示经过变换后的两个输出分布,输出分布之间的距离标识为$div_{out}$。读者可改变输入的随机种子,切换不同的输入。可在图中看到,对于任意的输入,$div_{out}$总是小于$div_{in}$。另外,也可改变$\alpha$的值,将会看到,$\alpha$越小(噪声越大),$\frac{div_{out}}{div_{in}}$的比值也越小,即收缩率越大。
301
+
302
  由Banach fixed-point theorem<a href="#fixed_point">[5]</a>可知,压缩映射存在惟一一个定点(收敛点)。也就是说,对于任意的输入分布,可以连续迭代应用“后验概率变换”,只要迭代次数足够多,最终都会输出同一个分布。经过大量一维随机变量实验发现,定点(收敛点)<b>位于$q(x)$附近</b>。并且,与$\alpha$的值有关,$\alpha$越小(噪声越大),离得越近。
303
 
304
+ 读者可看<a href="#demo_4_2">Demo 4.2</a>,此部分展示迭代收敛的例子。选择合适的迭代次数,点中“apply iteration transform”,将逐步画出迭代的过程,每个子图均会展示各自变换后的输出分布($\textcolor{green}{绿色曲线}$),收敛的参考点分布$q(x)$以$\textcolor{blue}{蓝色曲线}$画出,同时给出输出分布与$q(x)$之间的距离$dist$。可以看出,随着迭代的次数增加,输出分布与$q(x)$越来越相似,并最终会稳定在$q(x)$附近。对于较复杂的分布,可能需要较多迭代的次数或者较大的噪声。迭代次数可以设置为上万步,但会花费较长时间。
305
 
306
+ 对于一维离散的情况,$q(x|z)$将离散成一个矩阵(记为$Q_{x|z}$),$q(z)$离散成一个向量(记为$\boldsymbol{q_i}$),积分操作$\int q(x|z)q(z)dz$将离散成"矩阵-向量"乘法操作,所以后验概率变换可写成
307
  \begin{align}
308
  \boldsymbol{q_o} &= Q_{x|z}\ \boldsymbol{q_i} & \quad\quad &\text{1 iteration} \tag{7.3} \newline
309
  \boldsymbol{q_o} &= Q_{x|z}\ Q_{x|z}\ \boldsymbol{q_i} & \quad\quad &\text{2 iteration} \tag{7.4} \newline
310
  & \dots & \notag \newline
311
  \boldsymbol{q_o} &= (Q_{x|z})^n\ \boldsymbol{q_i} & \quad\quad &\text{n iteration} \tag{7.5} \newline
312
  \end{align}
313
+ 于是,为了更深入地理解变换的特点,<a href="#demo_4_2">Demo 4.2</a>也画出矩阵$(Q_{x|z})^n$的结果。从图里可以看到,当迭代趋向收敛时,矩阵$(Q_{x|z})^n$的行向量将变成一个常数向量,即向量的各分量都相等。在二维密度图里将表现为一条横线。
314
+
315
+ 在<a href="#proof_ctr">Appendix B</a>中,将会提供一个证明,当$q(x)$和$\alpha$满足一些条件时,后验概率变换是一个严格的压缩映射。
316
+
317
+ 关于定点分布与输入分布q(x)之间距离的关系,目前尚不能严格证明。
318
+
319
+ <h3 style="font-size:18px"> 恢复数据分布过程中的抗噪声能力 </h3>
320
+ 由上面的分析可知,当满足一些条件时,"后验概率变换"是一个压缩映射,所以存在如下的关系:
321
+ \begin{align}
322
+ dist(q(x),\ q_o(x)) < dist(q(z),\ q_i(z)) \tag{7.12}
323
+ \end{align}
324
+ 其中,$q(z)$是理想的输入分布,$q(x)$理想的输出分布,$q_i(x)$是任意的输入分布,$q_o(x)$是$q_i(z)$经过变换后的输出分布。
325
+
326
+ 上式表明,输出的分布$q_o(x)$与理想输出分布q(x)之间的距离总会</em>小于</em>输入分布$q_i(z)$与理想输入分布q(x)的距离。于是,"后验概率变换"具备一定的抵抗噪声能力。这意味着,在恢复$q(x)$的过程中(<a href="#backward_process">第5节</a>),哪怕输入的“末尾分布$q(z_T)”$存在一定的误差,经过一系列变换后,输出的“数据分布$q(x)$“的误差也会比输入的误差更小。
327
+
328
+ 具体可看<a href="#demo_3_2">Demo 3.2</a>,通过增加“noise ratio”的值可以向“末尾分布$q(z_T)$”添加噪声,点击“apply”按钮将逐步画出恢复的过程,恢复的分布以$\textcolor{red}{红色曲线}$画出,同时也会通过JS散度标出误差的大小。将会看到,恢复的$q(x)$的误差总是小于$q(z_T)$的误差。
329
+
330
+ 由上面的讨论可知,$\alpha$越小(即变换过程中使用的噪声越大),压缩映射的压缩率越大,于是,抗噪声的能力也越强。
331
+ """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_transform_zh")
332
+ return
333
+
334
+
335
+ def md_deconvolution_zh():
336
+ global g_latex_del
337
+
338
+ title = "8. Can the data distribution be restored by deconvolution?"
339
+ with gr.Accordion(label=title, elem_classes="first_md", elem_id="deconvolution"):
340
+
341
+ gr.Markdown(
342
+ r"""
343
+ 在<a href="#introduction">第1节</a>中提到,式2.1的变换可分为两个子变换,第一个子变换为”线性变换“,第二个为“加上独立高斯噪声”。线性变换相当于对概率分布进行拉伸变换,所以存在逆变换。"加上独立高斯噪声”相当于对概率分布执行卷积操作,卷积操作可通过逆卷积恢复。所以,理论上,可通过“逆线性变换”和“逆卷积”从最终的概率分布$q(z_T)$恢复数据分布$q(x)$。
344
+
345
+ 但实际上,会存在一些问题。由于逆卷积对误差极为敏感,具有很高的输入灵敏度,很小的输入噪声就会引起输出极大的变化[\[11\]](#deconv_1)[\[12\]](#deconv_2)。而在扩散模型中,会使用标准正态分布近似代替$q(z_T)$,因此,在恢复的起始阶段就会引入噪声。虽然噪声较小,但由于逆卷积的敏感性,噪声会逐步放大,影响恢复。
346
+
347
+ 另外,也可以从另一个角度理解“逆卷积恢复”的不可行性。由于前向变换的过程(式4.1~4.4)是确定的,所以卷积核是固定的,因此,相应的“逆卷积变换“也是固定的。由于起始的数据分布$q(x)$可以是任意的分布,所以,通过一系列固定的“卷积正变换”,可以将任意的概率分布转换成近似$\mathcal{N}(0,I)$的分布。如“逆卷积变换“可行,则意味着,可用一个固定的“逆卷积变换",将$\mathcal{N}(0,I)$分布恢复成任意的数据分布$q(x)$,这明显是一个悖论。同一个输入,同一个变换,不可能会有多个输出。
348
+ """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_deconvolution_zh")
349
+ return
350
+
351
+
352
+ def md_cond_kl_zh():
353
+ global g_latex_del
354
+
355
+ title = "Appendix A Conditional KL Divergence"
356
+ with gr.Accordion(label=title, elem_classes="first_md", elem_id="cond_kl"):
357
+ gr.Markdown(
358
+ r"""
359
+ 本节主要介绍<b>KL散度</b>与<b>条件KL散度</b>之间的关系。在正式介绍之前,先简单介绍<b>熵</b>和<b>条件熵</b>的定义,以及两者之间存在的不等式关系,为后面的证明作准备。
360
+
361
+ <h3 style="font-size:18px">熵及条件熵</h3>
362
+ 对于任意两个随机变量$Z,X$,<b>熵</b>(Entropy)定义如下<a href="#entropy">[16]</a>:
363
+ \begin{align}
364
+ \mathbf{H}(Z) = \int -p(z)\log{p(z)}dz \tag{A.1}
365
+ \end{align}
366
+ <b>条件熵</b>(Conditional Entropy)的定义如下<a href="#cond_entropy">[17]</a>:
367
+ \begin{align}
368
+ \mathbf{H}(Z|X) = \int p(x) \overbrace{\int -p(z|x)\log{p(z|x)}dz}^{\text{Entropy}}\ dx \tag{A.2}
369
+ \end{align}
370
+ 两者存在如下的不等式关系:
371
+ \begin{align}
372
+ \mathbf{H}(Z|X) \le \mathbf{H}(Z) \tag{A.3}
373
+ \end{align}
374
+ 也就是说,条件熵总是小于或者等于熵,当且仅当X与Z相互独立时,两者相等。此关系的证明可看文献<a href="#cond_entropy">[17]</a>。
375
+
376
+ <h3 style="font-size:18px"> KL散度及条件KL散度 </h3>
377
+ 仿照条件熵定义的方式,引入一个新定义,<b>条件KL散度</b>,记为$KL_{\mathcal{C}}$。由于KL散度的定义是非对称的,所以存在两种形式,如下:
378
+ \begin{align}
379
+ KL_{\mathcal{C}}(q(z|x) \Vert \textcolor{blue}{p(z)}) = \int \ q(x) KL(q(z|x) \Vert \textcolor{blue}{p(z)})dx \tag{A.4} \newline
380
+ KL_{\mathcal{C}}(q(z) \Vert \textcolor{blue}{p(z|x)}) = \int \ \textcolor{blue}{p(x)} KL(q(z) \Vert \textcolor{blue}{p(z|x)})dx \tag{A.5}
381
+ \end{align}
382
+
383
+ 与条件熵类似,条件KL散度也存在类似的不等式关系:
384
+ \begin{align}
385
+ KL_{\mathcal{C}}(q(z|x) \Vert \textcolor{blue}{p(z)}) \ge KL(q(z) \Vert \textcolor{blue}{p(z)}) \tag{A.6} \newline
386
+ KL_{\mathcal{C}}(q(z) \Vert \textcolor{blue}{p(z|x)}) \ge KL(q(z) \Vert \textcolor{blue}{p(z)}) \tag{A.7}
387
+ \end{align}
388
+ 也就是说,条件KL散度总是大于或者等于KL散度,当且仅当X与Z相互独立时,两者相等。
389
+
390
+ 下面对式A.6和式A.7的结论分别证明。
391
+
392
+ 对于式A.6,证明如下:
393
+ \begin{align}
394
+ KL_{\mathcal{C}}(q(z|x) \Vert \textcolor{blue}{p(z)}) &= \int \ q(x) KL(q(z|x) \Vert \textcolor{blue}{p(z)})dx \tag{A.8} \newline
395
+ &= \iint q(x) q(z|x) \log \frac{q(z|x)}{\textcolor{blue}{p(z)}}dzdx \tag{A.9} \newline
396
+ &= -\overbrace{\iint - q(x)q(z|x) \log q(z|x) dzdx}^{\text{Conditional Entropy }\mathbf{H}_q(Z|X)} - \iint q(x) q(z|x) \log \textcolor{blue}{p(z)} dzdx \tag{A.10} \newline
397
+ &= -\mathbf{H}_q(Z|X) - \int \left\lbrace \int q(x) q(z|x)dx \right\rbrace \log \textcolor{blue}{p(z)}dz \tag{A.11} \newline
398
+ &= -\mathbf{H}_q(Z|X) + \overbrace{\int - q(z) \log p(z)dz}^{\text{Cross Entropy}} \tag{A.12} \newline
399
+ &= -\mathbf{H}_q(Z|X) + \int q(z)\left\lbrace \log\frac{q(z)}{\textcolor{blue}{p(z)}} -\log q(z)\right\rbrace dz \tag{A.13} \newline
400
+ &= -\mathbf{H}_q(Z|X) + \int q(z)\log\frac{q(z)}{\textcolor{blue}{p(z)}}dz + \overbrace{\int - q(z)\log q(z)dz}^{\text{Entropy } \mathbf{H}_q(Z)} \tag{A.14} \newline
401
+ &= KL(q(z) \Vert \textcolor{blue}{p(z)}) + \overbrace{\mathbf{H}_q(Z) - \mathbf{H}_q(Z|X)}^{\ge 0} \tag{A.15} \newline
402
+ &\ge KL(q(z) \Vert \textcolor{blue}{p(z)}) \tag{A.16}
403
+ \end{align}
404
+ 其中式A.15应用了"条件熵总是小于或者等于熵"的结论。于是,得到式A.6的关系。
405
 
406
+ 对于式A.7,证明如下:
407
+ \begin{align}
408
+ KL(\textcolor{blue}{q(z)} \Vert p(z)) &= \int \textcolor{blue}{q(z)}\log\frac{\textcolor{blue}{q(z)}}{p(z)}dz \tag{A.15} \newline
409
+ &= \int q(z)\log\frac{q(z)}{\int p(z|x)p(x)dx}dz \tag{A.16} \newline
410
+ &= \textcolor{orange}{\int p(x)dx}\int q(z)\log q(z)dz - \int q(z)\textcolor{red}{\log\int p(z|x)p(x)dx}dz \qquad \ \textcolor{orange}{\int p(x)dx=1} \tag{A.17} \newline
411
+ &\le \iint p(x) q(z)\log q(z)dzdx - \int q(z)\textcolor{red}{\int p(x)\log p(z|x)dx}dz \ \qquad \textcolor{red}{\text{jensen\ inequality}} \tag{A.18} \newline
412
+ &= \iint p(x)q(z)\log q(z)dzdx - \iint p(z)q(z)\log p(z|x)dzdx \tag{A.19} \newline
413
+ &= \iint p(x)q(z)(\log q(z) - \log p(z|x))dzdx \tag{A.20} \newline
414
+ &= \iint p(x)q(z)\log \frac{q(z)}{p(z|x)}dzdx \tag{A.21} \newline
415
+ &= \int p(x)\left\lbrace \int q(z)\log \frac{q(z)}{p(z|x)}dz\right\rbrace dx \tag{A.22} \newline
416
+ &= \int p(x)KL(\textcolor{blue}{q(z)} \Vert p(z|x))dx \tag{A.23} \newline
417
+ &= KL_{\mathcal{C}}(q(z) \Vert \textcolor{blue}{p(z|x)}) \tag{A.24}
418
+ \end{align}
419
+ 于是,得到式A.7的关系。
420
+
421
+ 从式A.15可得出另外一个<b>重要的结论</b>。
422
+
423
+ KL散度常用于拟合数据的分布。在此场景中,数据潜在的分布用$q(z)$表示,参数化的模型分布用$\textcolor{blue}{p_\theta(z)}$表示。在优化的过程中,由于$q(z|x)$和$q(x)$均保持不变,所以式A.15中的$\mathbf{H}(Z) - \mathbf{H}(Z|X)$为一个常数项。于是,可得到如下的关系
424
+ <span id="zh_cond_kl_2">
425
+ \mathop{\min}{underline}{\textcolor{blue}{p_\theta}} KL(q(z) \Vert \textcolor{blue}{p_\theta(z)}) \iff \mathop{\min}{underline}{\textcolor{blue}{p_\theta}} \int \ q(x) KL(q(z|x) \Vert \textcolor{blue}{p_\theta(z)})dx \tag{A.25}
426
+ </span>
427
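+ A small numerical illustration of this equivalence (an illustrative sketch; the fixed $q(x,z)$ and the one-parameter softmax family $p_\theta(z)$ are arbitrary assumptions): as $\theta$ varies, the two objectives differ by the same constant $\mathbf{H}(Z) - \mathbf{H}(Z|X)$, so they share the same minimizer.
+ ```python
+ import numpy as np
+
+ rng = np.random.default_rng(1)
+ q_xz = rng.random((4, 6))
+ q_xz /= q_xz.sum()                                # fixed q(x, z)
+ q_x, q_z = q_xz.sum(1), q_xz.sum(0)
+ q_z_given_x = q_xz / q_x[:, None]                 # q(z|x)
+
+ def kl(a, b):
+     return np.sum(a * np.log(a / b))
+
+ def p_theta(theta):                               # a simple softmax family over z
+     logits = theta * np.arange(6)
+     e = np.exp(logits - logits.max())
+     return e / e.sum()
+
+ for theta in (-1.0, 0.0, 0.5, 2.0):
+     p = p_theta(theta)
+     plain = kl(q_z, p)                                               # KL(q(z) || p_theta(z))
+     cond = np.sum(q_x * [kl(q_z_given_x[i], p) for i in range(4)])   # conditional objective
+     print(round(cond - plain, 6))   # the gap is the same constant for every theta
+ ```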
+
+ Comparing the relation above with Denoising Score Matching<a href="#dsm">[18]</a> reveals some similarities. Both introduce a new variable $X$ and replace the target distribution $q(z)$ with the conditional distribution $q(z|x)$. Since $q(z|x)$ depends on the condition, both then account for every possible condition, taking a weighted sum with the probability $q(x)$ of each condition as the weight.
+ <span id="zh_cond_kl_3">
+ \mathop{\min}_{\textcolor{blue}{\psi_\theta}} \frac{1}{2} \int q(z) \left\lVert \textcolor{blue}{\psi_\theta(z)} - \frac{\partial \log q(z)}{\partial z} \right\rVert^2 dz \iff \mathop{\min}_{\textcolor{blue}{\psi_\theta}} \int q(x)\ \overbrace{\frac{1}{2} \int q(z|x) \left\lVert \textcolor{blue}{\psi_\theta(z)} - \frac{\partial \log q(z|x)}{\partial z} \right\rVert^2 dz}^{\text{Score Matching of }q(z|x)}\ dx \tag{A.28}
+ </span>
+
+ The weighted-sum operation above is somewhat similar to eliminating a variable with the law of total probability:
+ \begin{align}
+ q(z) = \int q(z,x) dx = \int q(x) q(z|x) dx \tag{A.29}
+ \end{align}
+
+ """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_cond_kl_zh")
+ return
+
+
+ def md_proof_ctr_zh():
+ global g_latex_del
+
+ title = "Appendix B Proof of Contraction"
+ with gr.Accordion(label=title, elem_classes="first_md", elem_id="proof_ctr"):
+ gr.Markdown(
+ r"""
  <center> <img src="file/fig2.png" width="960" style="margin-top:12px"/> </center>
  <center> Figure 2: Only one component in support </center>
+
+ This section proves that, when $q(x)$ and $\alpha$ satisfy certain conditions, the posterior transform is a contraction mapping and has a unique fixed point (convergence point).
+
+ The proof treats four cases, listed after the sketch below. Throughout, the random variables are assumed to be discrete, so the posterior transform can be regarded as one step of a <b>discrete Markov chain</b>, with the posterior $q(x|z)$ acting as the <b>transition matrix</b>. A continuous random variable can be viewed as a discrete variable with infinitely many states.
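+ A minimal numerical sketch of this viewpoint (illustrative; the strictly positive column-stochastic matrix below stands in for $q(x|z)$ and is an arbitrary assumption): applying the transform repeatedly drives two different input distributions to the same unique fixed point.
+ ```python
+ import numpy as np
+
+ rng = np.random.default_rng(2)
+ n = 5
+ Q = rng.random((n, n)) + 0.05      # Q[m, n] plays the role of q(x=m | z=n), all entries > 0
+ Q /= Q.sum(axis=0, keepdims=True)  # columns sum to 1, as in Eq B.6
+
+ def transform(p):                  # one posterior-transform step: p_out(x) = sum_z q(x|z) p_in(z)
+     return Q @ p
+
+ p1 = rng.random(n); p1 /= p1.sum()
+ p2 = rng.random(n); p2 /= p2.sum()
+ for _ in range(100):
+     p1, p2 = transform(p1), transform(p2)
+
+ print(np.allclose(p1, p2))         # True: both inputs converge to the same unique fixed point
+ ```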
 
  <ol style="list-style-type:decimal">
  <li> When all components of $q(x)$ are greater than 0, all entries of the posterior transform matrix $q(x|z)$ are greater than 0, so the matrix is the transition matrix of an $\textcolor{red}{\text{irreducible}}$ $\textcolor{green}{\text{aperiodic}}$ Markov chain. By the conclusion of <a href="#mc_basic_p6">[13]</a>, the transform is a contraction mapping under the Total Variation metric, and therefore, by the Banach fixed-point theorem, it has a unique fixed point (convergence point).</li>

  In addition, the posterior transform satisfies a more general relation, independent of the specific values of $q(x|z)$: the Total Variation distance between two output distributions is always <b>less than or equal to</b> the Total Variation distance between the corresponding input distributions, that is,
  \begin{align}
+ dist(q_{o1}(x),\ q_{o2}(x)) \le dist(q_{i1}(z),\ q_{i2}(z)) \tag{B.1}
  \end{align}
  A proof in discrete form is given below:
  \begin{align}
+ \lVert q_{o1}-q_{o2}\rVert_{TV} &= \lVert Q_{x|z}q_{i1} - Q_{x|z}q_{i2}\rVert_{TV} \tag{B.2} \newline
+ &= \sum_{m}\textcolor{red}{|}\sum_{n}Q_{x|z}(m,n)q_{i1}(n) - \sum_{n}Q_{x|z}(m,n)q_{i2}(n)\textcolor{red}{|} \tag{B.3} \newline
+ &= \sum_{m}\textcolor{red}{|}\sum_{n}Q_{x|z}(m,n)(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \tag{B.4} \newline
+ &\leq \sum_{m}\sum_{n}Q_{x|z}(m,n)\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \qquad \qquad \qquad \text{Absolute value inequality} \tag{B.5} \newline
+ &= \sum_{n}\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \sum_{m} Q_{x|z}(m,n) \qquad \qquad \qquad \sum_{m} Q_{x|z}(m,n) = 1 \tag{B.6} \newline
+ &= \sum_{n}\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \tag{B.7}
  \end{align}
  where $Q_{x|z}(m,n)$ denotes the entry in row m, column n of the matrix $Q_{x|z}$, and $q_{i1}(n)$ denotes the n-th component of the vector $q_{i1}$.

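+ A quick numerical check of Eq B.1 (an illustrative sketch; the column-stochastic matrix and the input distributions are random assumptions):
+ ```python
+ import numpy as np
+
+ rng = np.random.default_rng(3)
+ n = 8
+ Q = rng.random((n, n))
+ Q /= Q.sum(axis=0, keepdims=True)        # sum_m Q(m, n) = 1, as used in Eq B.6
+
+ def tv(a, b):                            # total variation distance (up to the usual 1/2 factor)
+     return np.abs(a - b).sum()
+
+ q_i1 = rng.random(n); q_i1 /= q_i1.sum()
+ q_i2 = rng.random(n); q_i2 /= q_i2.sum()
+ q_o1, q_o2 = Q @ q_i1, Q @ q_i2
+
+ print(tv(q_o1, q_o2) <= tv(q_i1, q_i2))  # True: the transform never expands the distance
+ ```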
+ """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_proof_ctr_zh")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
510
  return


  def md_reference_zh():
  global g_latex_del

  <a id="mc_basic_d4" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [14] Markov Chain:Basic Theory - Definition 4 </a>
547
 
548
+ <a id="vdm" href="https://arxiv.org/pdf/2107.00630"> [15] Variational Diffusion Models </a>
549
+
550
+ <a id="entropy" href="https://en.wikipedia.org/wiki/Entropy"> [16] Entropy </a>
551
+
552
+ <a id="cond_entropy" href="https://en.wikipedia.org/wiki/Conditional_entropy"> [17] Conditional Entropy </a>
553
+
554
+ <a id="dsm" href="https://www.iro.umontreal.ca/~vincentp/Publications/smdae_techreport_1358_v1.pdf"> [18] A Connection Between Score Matching and Denoising autoencoders </a>
555
+
556
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_reference_zh")

  return

  gr.Markdown(
  r"""
+ <b>APP</b>: This web app is built with Gradio and deployed on HuggingFace. Since the resources are limited (2 CPU cores, 16 GB RAM), responses may be slow. For a better experience, it is recommended to clone the source code from <a href="https://github.com/blairstar/The_Art_of_DPM">github</a> and run it on a local machine. The app depends only on Gradio, SciPy, and Matplotlib.
 
  <b>Author</b>: 郑镇鑫, a senior vision algorithm engineer with ten years of algorithm development experience, formerly with internet companies such as Tencent and JD.com, currently focused on video generation (similar to Sora).

  def run_app():

+ # with gr.Blocks(css=g_css, js="() => insert_special_formula() ", head=js_head) as demo:
  with gr.Blocks(css=g_css, js="() => {insert_special_formula(); write_markdown();}", head=js_head) as demo:
  md_introduction_zh()

  md_posterior_transform_zh()

  md_deconvolution_zh()
+
+ md_cond_kl_zh()
+
+ md_proof_ctr_zh()
+
  md_reference_zh()

  md_about_zh()
data.json CHANGED
The diff for this file is too large to render. See raw diff