blairzheng committed on
Commit
ec03973
·
1 Parent(s): e6088ac

add approx_gauss; add eigenvalue; replace contraction with non_expanding

Browse files
Files changed (19)
  1. 7.1.png +0 -0
  2. App.py +11 -7
  3. C1.png +0 -0
  4. C2.png +0 -0
  5. D1.png +0 -0
  6. D2.png +0 -0
  7. D3.png +0 -0
  8. D4.png +0 -0
  9. D5.png +0 -0
  10. DPMInteractive.py +9 -4
  11. ExtraBlock.js +8 -5
  12. Misc.py +1 -1
  13. RenderMarkdown.py +35 -9
  14. RenderMarkdownEn.py +337 -82
  15. RenderMarkdownZh.py +313 -72
  16. data.json +0 -0
  17. fig2.png +0 -0
  18. fig3.png +0 -0
  19. fig4.png +0 -0
7.1.png ADDED
App.py CHANGED
@@ -13,8 +13,8 @@ from DPMInteractive import fixed_point_init_change, fixed_point_apply_iterate
13
  from DPMInteractive import forward_plot_part, backward_plot_part, fit_plot_part, fixed_plot_part
14
  from RenderMarkdown import md_introduction_block, md_transform_block, md_likelihood_block, md_posterior_block
15
  from RenderMarkdown import md_forward_process_block, md_backward_process_block, md_fit_posterior_block
16
- from RenderMarkdown import md_posterior_transform_block, md_deconvolution_block, md_cond_kl_block, md_proof_ctr_block
17
- from RenderMarkdown import md_reference_block, md_about_block
18
  from Misc import g_css, js_head, js_load
19
 
20
 
@@ -145,7 +145,7 @@ def forward_block(seq_info_state):
145
  seed = gr_number("random seed", 0, 1E6, 100, 1, 0, min_width=80)
146
  st_alpha = gr_number("start alpha", 0.001, 0.999, 0.98, 0.001, 3, min_width=80)
147
  et_alpha = gr_number("end alpha", 0.001, 0.999, 0.98, 0.001, 3, min_width=80)
148
- step = gr.Slider(label="step", value=7, minimum=2, maximum=15, step=1, min_width=80)
149
  apply_button = gr.Button(value="apply", min_width=80)
150
 
151
  node_plot = gr.Plot(label="latent variable's pdf", show_label=False)
@@ -236,7 +236,7 @@ def contraction_block():
236
  with gr.Row():
237
  ctr_init_seed = gr_number("random seed", 0, 1E6, 100, 1, 0, min_width=80)
238
  ctr_alpha = gr_number("alpha", 0.001, 0.999, 0.95, 0.001, 3, min_width=80)
239
- gr_empty_space(5)
240
 
241
  with gr.Row():
242
  inp_plot = gr.Plot(label="input variable pdf", min_width=80, show_label=False)
@@ -265,11 +265,11 @@ def contraction_block():
265
  power_mat_plot = gr.Plot(show_label=False)
266
 
267
  ctr_init_inputs = [ctr_init_seed, ctr_alpha, two_inputs_seed]
268
- ctr_init_outputs = [inp_plot, x_state, x_pdf_state, pos_plot, out_plot, z_state, xcz_pdf_state, inp_out_plot]
269
  ctr_init_seed.change(contraction_init_change, ctr_init_inputs, ctr_init_outputs, show_progress="minimal")
270
 
271
  ctr_alpha_inputs = [x_state, x_pdf_state, ctr_alpha, two_inputs_seed]
272
- ctr_alpha_outputs = [pos_plot, out_plot, z_state, xcz_pdf_state, inp_out_plot]
273
  ctr_alpha.change(contraction_alpha_change, ctr_alpha_inputs, ctr_alpha_outputs, show_progress="minimal")
274
 
275
  ctr_apply_inputs, ctr_apply_outputs = [x_state, x_pdf_state, xcz_pdf_state, two_inputs_seed], [inp_out_plot]
@@ -348,7 +348,11 @@ def run_app():
348
 
349
  md_cond_kl_block()
350
 
351
- md_proof_ctr_block()
352
 
353
  md_reference_block()
354
 
 
13
  from DPMInteractive import forward_plot_part, backward_plot_part, fit_plot_part, fixed_plot_part
14
  from RenderMarkdown import md_introduction_block, md_transform_block, md_likelihood_block, md_posterior_block
15
  from RenderMarkdown import md_forward_process_block, md_backward_process_block, md_fit_posterior_block
16
+ from RenderMarkdown import md_posterior_transform_block, md_deconvolution_block, md_cond_kl_block, md_approx_gauss_block
17
+ from RenderMarkdown import md_non_expanding_block, md_stationary_block, md_reference_block, md_about_block
18
  from Misc import g_css, js_head, js_load
19
 
20
 
 
145
  seed = gr_number("random seed", 0, 1E6, 100, 1, 0, min_width=80)
146
  st_alpha = gr_number("start alpha", 0.001, 0.999, 0.98, 0.001, 3, min_width=80)
147
  et_alpha = gr_number("end alpha", 0.001, 0.999, 0.98, 0.001, 3, min_width=80)
148
+ step = gr.Slider(label="step", value=7, minimum=1, maximum=15, step=1, min_width=80)
149
  apply_button = gr.Button(value="apply", min_width=80)
150
 
151
  node_plot = gr.Plot(label="latent variable's pdf", show_label=False)
 
236
  with gr.Row():
237
  ctr_init_seed = gr_number("random seed", 0, 1E6, 100, 1, 0, min_width=80)
238
  ctr_alpha = gr_number("alpha", 0.001, 0.999, 0.95, 0.001, 3, min_width=80)
239
+ lambda_2 = gr_number("second largest eigenvalue", 0, 0, 1.0, 0.0001, 4, min_width=80)
240
 
241
  with gr.Row():
242
  inp_plot = gr.Plot(label="input variable pdf", min_width=80, show_label=False)
 
265
  power_mat_plot = gr.Plot(show_label=False)
266
 
267
  ctr_init_inputs = [ctr_init_seed, ctr_alpha, two_inputs_seed]
268
+ ctr_init_outputs = [inp_plot, x_state, x_pdf_state, pos_plot, out_plot, z_state, xcz_pdf_state, inp_out_plot, lambda_2]
269
  ctr_init_seed.change(contraction_init_change, ctr_init_inputs, ctr_init_outputs, show_progress="minimal")
270
 
271
  ctr_alpha_inputs = [x_state, x_pdf_state, ctr_alpha, two_inputs_seed]
272
+ ctr_alpha_outputs = [pos_plot, out_plot, z_state, xcz_pdf_state, inp_out_plot, lambda_2]
273
  ctr_alpha.change(contraction_alpha_change, ctr_alpha_inputs, ctr_alpha_outputs, show_progress="minimal")
274
 
275
  ctr_apply_inputs, ctr_apply_outputs = [x_state, x_pdf_state, xcz_pdf_state, two_inputs_seed], [inp_out_plot]
 
348
 
349
  md_cond_kl_block()
350
 
351
+ md_approx_gauss_block()
352
+
353
+ md_non_expanding_block()
354
+
355
+ md_stationary_block()
356
 
357
  md_reference_block()
358
 
C1.png ADDED
C2.png ADDED
D1.png ADDED
D2.png ADDED
D3.png ADDED
D4.png ADDED
D5.png ADDED
DPMInteractive.py CHANGED
@@ -697,15 +697,15 @@ def contraction_init_change(seed, alpha, two_inputs_seed):
697
  x_pdf = hijack(seed, x, x_pdf)
698
 
699
  # test
700
- x_pdf[x_pdf < 0.01] = 0
701
 
702
  x_pdf = x_pdf / (x_pdf * g_res).sum() # normalized to 1
703
  fig = plot_pdf(x, x_pdf, title="input variable pdf", titlesize=9)
704
 
705
  info = contraction_alpha_change(x, x_pdf, alpha, two_inputs_seed)
706
- fig_xcz, fig_z, z, xcz_pdf, fig_inp_out = info
707
 
708
- return fig, x, x_pdf, fig_xcz, fig_z, z, xcz_pdf, fig_inp_out
709
 
710
 
711
  def contraction_alpha_change(x, x_pdf, alpha, two_inputs_seed):
@@ -721,9 +721,14 @@ def contraction_alpha_change(x, x_pdf, alpha, two_inputs_seed):
721
  fig_xcz = plot_2d_pdf(x, z, xcz_pdf, None, label="$q(x|z)$",
722
  title=xcz_title, titlesize=9, xlabel="z domain(cond)", ylabel="x domain")
723
 
724
  fig_inp_out = contraction_apply(x, x_pdf, xcz_pdf, two_inputs_seed)
725
 
726
- return fig_xcz, fig_z, z, xcz_pdf, fig_inp_out
727
 
728
 
729
  def change_two_inputs_seed():
 
697
  x_pdf = hijack(seed, x, x_pdf)
698
 
699
  # test
700
+ # x_pdf[x_pdf < 0.01] = 0
701
 
702
  x_pdf = x_pdf / (x_pdf * g_res).sum() # normalized to 1
703
  fig = plot_pdf(x, x_pdf, title="input variable pdf", titlesize=9)
704
 
705
  info = contraction_alpha_change(x, x_pdf, alpha, two_inputs_seed)
706
+ fig_xcz, fig_z, z, xcz_pdf, fig_inp_out, lambda_2 = info
707
 
708
+ return fig, x, x_pdf, fig_xcz, fig_z, z, xcz_pdf, fig_inp_out, lambda_2
709
 
710
 
711
  def contraction_alpha_change(x, x_pdf, alpha, two_inputs_seed):
 
721
  fig_xcz = plot_2d_pdf(x, z, xcz_pdf, None, label="$q(x|z)$",
722
  title=xcz_title, titlesize=9, xlabel="z domain(cond)", ylabel="x domain")
723
 
724
+ xcz = xcz_pdf/xcz_pdf.sum(axis=0, keepdims=True)
725
+ evals = np.linalg.eigvals(xcz)
726
+ evals = sorted(np.absolute(evals), reverse=True)
727
+ lambda_2 = evals[1]
728
+
729
  fig_inp_out = contraction_apply(x, x_pdf, xcz_pdf, two_inputs_seed)
730
 
731
+ return fig_xcz, fig_z, z, xcz_pdf, fig_inp_out, lambda_2
732
 
733
 
734
  def change_two_inputs_seed():
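The new lines in `contraction_alpha_change` column-normalize the discretized posterior `xcz_pdf` into a transition matrix and report the magnitude of its second-largest eigenvalue, which is what the new "second largest eigenvalue" field in App.py displays. Below is a minimal, self-contained sketch of the same computation; the matrix size and the random test matrix are illustrative assumptions, not values from the repository.

```python
import numpy as np

def second_largest_eigenvalue(xcz_pdf):
    """Return |lambda_2| of the column-normalized posterior matrix.

    Each column of the normalized matrix is q(x|z) for one value of z, so the
    matrix is column-stochastic: its largest eigenvalue is 1, and the magnitude
    of the second-largest eigenvalue relates to how fast the iterated posterior
    transform settles toward its stationary distribution.
    """
    xcz = xcz_pdf / xcz_pdf.sum(axis=0, keepdims=True)  # normalize each column to sum to 1
    evals = np.linalg.eigvals(xcz)
    evals = sorted(np.absolute(evals), reverse=True)
    return evals[1]

# illustrative test with a random positive matrix (a stand-in for a discretized q(x|z))
rng = np.random.default_rng(0)
print(second_largest_eigenvalue(rng.random((50, 50))))
```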
ExtraBlock.js CHANGED
@@ -2,8 +2,9 @@
2
 
3
 
4
  async function write_markdown() {
5
- let names = ["introduction", "transform", "likelihood", "posterior", "forward_process", "backward_process",
6
- "fit_posterior", "posterior_transform", "deconvolution", "cond_kl", "proof_ctr", "reference", "about"];
 
7
  // names = names.slice(-1)
8
 
9
  let data = await fetch("file/data.json").then(response => response.json());
@@ -24,8 +25,9 @@ async function write_markdown() {
24
 
25
 
26
  async function insert_markdown() {
27
- let names = ["introduction", "transform", "likelihood", "posterior", "forward_process", "backward_process",
28
- "fit_posterior", "posterior_transform", "deconvolution", "cond_kl", "proof_ctr", "reference", "about"];
 
29
 
30
  let data = await fetch("file/data.json").then(response => response.json());
31
 
@@ -54,7 +56,8 @@ async function insert_markdown() {
54
 
55
  function control_language() {
56
  const names = ["introduction", "transform", "likelihood", "posterior", "forward_process",
57
- "backward_process", "fit_posterior", "posterior_transform", "deconvolution", "cond_kl", "proof_ctr", "reference", "about"];
 
58
 
59
  var is_zh = document.getElementById("switch_language").checked;
60
  for (let i = 0; i < names.length; i++) {
 
2
 
3
 
4
  async function write_markdown() {
5
+ let names = ["introduction", "transform", "likelihood", "posterior", "forward_process",
6
+ "backward_process", "fit_posterior", "posterior_transform", "deconvolution",
7
+ "cond_kl", "approx_gauss", "non_expanding", "stationary", "reference", "about"];
8
  // names = names.slice(-1)
9
 
10
  let data = await fetch("file/data.json").then(response => response.json());
 
25
 
26
 
27
  async function insert_markdown() {
28
+ let names = ["introduction", "transform", "likelihood", "posterior", "forward_process",
29
+ "backward_process", "fit_posterior", "posterior_transform", "deconvolution",
30
+ "cond_kl", "approx_gauss", "non_expanding", "stationary", "reference", "about"];
31
 
32
  let data = await fetch("file/data.json").then(response => response.json());
33
 
 
56
 
57
  function control_language() {
58
  const names = ["introduction", "transform", "likelihood", "posterior", "forward_process",
59
+ "backward_process", "fit_posterior", "posterior_transform", "deconvolution",
60
+ "cond_kl", "approx_gauss", "non_expanding", "stationary", "reference", "about"];
61
 
62
  var is_zh = document.getElementById("switch_language").checked;
63
  for (let i = 0; i < names.length; i++) {
Misc.py CHANGED
@@ -18,7 +18,7 @@ g_css = """
18
  .first_md span{font-size: 140%; font-weight: bold; color: orange}
19
  .normal span{font-size: 100%; font-weight: normal; color: black}
20
  .second span{font-size: 100%; font-weight: bold; color: blue}
21
- .mds div{margin-top: 10px; margin-bottom: 20px; margin-left:10px; margin-right:10px; font-size:16px}
22
  .gps div{margin-top: 10px; margin-bottom: 20px;}
23
 
24
  .switchbar {position: relative; display: inline-block; width: 60px; height: 30px; margin-left: 10px; margin-right: 10px}
 
18
  .first_md span{font-size: 140%; font-weight: bold; color: orange}
19
  .normal span{font-size: 100%; font-weight: normal; color: black}
20
  .second span{font-size: 100%; font-weight: bold; color: blue}
21
+ .mds div{margin-top: 10px; margin-bottom: 20px; margin-left:10px; margin-right:10px; font-size:16px;}
22
  .gps div{margin-top: 10px; margin-bottom: 20px;}
23
 
24
  .switchbar {position: relative; display: inline-block; width: 60px; height: 30px; margin-left: 10px; margin-right: 10px}
RenderMarkdown.py CHANGED
@@ -3,13 +3,13 @@ import gradio as gr
3
 
4
  from RenderMarkdownZh import md_introduction_zh, md_transform_zh, md_likelihood_zh, md_posterior_zh
5
  from RenderMarkdownZh import md_forward_process_zh, md_backward_process_zh, md_fit_posterior_zh
6
- from RenderMarkdownZh import md_posterior_transform_zh, md_deconvolution_zh, md_cond_kl_zh, md_proof_ctr_zh
7
- from RenderMarkdownZh import md_reference_zh, md_about_zh
8
 
9
  from RenderMarkdownEn import md_introduction_en, md_transform_en, md_likelihood_en, md_posterior_en
10
  from RenderMarkdownEn import md_forward_process_en, md_backward_process_en, md_fit_posterior_en
11
- from RenderMarkdownEn import md_posterior_transform_en, md_deconvolution_en, md_cond_kl_en, md_proof_ctr_en
12
- from RenderMarkdownEn import md_reference_en, md_about_en
13
 
14
 
15
  def md_introduction_block(md_type="offline"):
@@ -137,14 +137,40 @@ def md_cond_kl_block(md_type="offline"):
137
  return
138
 
139
 
140
- def md_proof_ctr_block(md_type="offline"):
141
  if md_type == "offline":
142
- title = "Appendix B Proof of Contraction"
143
- gr.Accordion(label=title, elem_classes="first_md", elem_id="proof_ctr")
144
  elif md_type == "zh":
145
- md_proof_ctr_zh()
146
  elif md_type == "en":
147
- md_proof_ctr_en()
148
  else:
149
  raise NotImplementedError
150
  return
 
3
 
4
  from RenderMarkdownZh import md_introduction_zh, md_transform_zh, md_likelihood_zh, md_posterior_zh
5
  from RenderMarkdownZh import md_forward_process_zh, md_backward_process_zh, md_fit_posterior_zh
6
+ from RenderMarkdownZh import md_posterior_transform_zh, md_deconvolution_zh, md_cond_kl_zh, md_approx_gauss_zh
7
+ from RenderMarkdownZh import md_non_expanding_zh, md_stationary_zh, md_reference_zh, md_about_zh
8
 
9
  from RenderMarkdownEn import md_introduction_en, md_transform_en, md_likelihood_en, md_posterior_en
10
  from RenderMarkdownEn import md_forward_process_en, md_backward_process_en, md_fit_posterior_en
11
+ from RenderMarkdownEn import md_posterior_transform_en, md_deconvolution_en, md_cond_kl_en, md_approx_gauss_en
12
+ from RenderMarkdownEn import md_non_expanding_en, md_stationary_en, md_reference_en, md_about_en
13
 
14
 
15
  def md_introduction_block(md_type="offline"):
 
137
  return
138
 
139
 
140
+ def md_approx_gauss_block(md_type="offline"):
141
  if md_type == "offline":
142
+ title = "Appendix B When does the Posterior Approximate to Gaussian ?"
143
+ gr.Accordion(label=title, elem_classes="first_md", elem_id="approx_gauss")
144
  elif md_type == "zh":
145
+ md_approx_gauss_zh()
146
  elif md_type == "en":
147
+ md_approx_gauss_en()
148
+ else:
149
+ raise NotImplementedError
150
+ return
151
+
152
+
153
+ def md_non_expanding_block(md_type="offline"):
154
+ if md_type == "offline":
155
+ title = "Appendix C Posterior Transform is a Non-expanding Mapping"
156
+ gr.Accordion(label=title, elem_classes="first_md", elem_id="non_expanding")
157
+ elif md_type == "zh":
158
+ md_non_expanding_zh()
159
+ elif md_type == "en":
160
+ md_non_expanding_en()
161
+ else:
162
+ raise NotImplementedError
163
+ return
164
+
165
+
166
+ def md_stationary_block(md_type="offline"):
167
+ if md_type == "offline":
168
+ title = "Appendix D Posterior Transform Converges to the Unique Stationary Distribution"
169
+ gr.Accordion(label=title, elem_classes="first_md", elem_id="stationary")
170
+ elif md_type == "zh":
171
+ md_stationary_zh()
172
+ elif md_type == "en":
173
+ md_stationary_en()
174
  else:
175
  raise NotImplementedError
176
  return
RenderMarkdownEn.py CHANGED
@@ -82,15 +82,21 @@ def md_posterior_en():
82
  q(x|z) = \frac{q(z|x)q(x)}{q(z)} \tag{3.1}
83
  \end{align}
84
 
85
- When $z$ takes a fixed value, $q(z)$ is a constant, so the shape of $q(x|z)$ is only related to ${q(z|x)q(x)}$.
86
  \begin{align}
87
  q(x|z) \propto q(z|x)q(x) \qquad where\ z\ is\ fixed \tag{3.2}
88
  \end{align}
89
90
  From Equation 2.1, we can see that $q(z|x)$ is a Gaussian distribution, so we have
91
  \begin{align}
92
- q(x|z) &\propto \frac{1}{\sqrt{2\pi(1-\alpha)}}\exp{\frac{-(z-\sqrt{\alpha}x)^2}{2(1-\alpha)}}\ q(x)& \qquad &where\ z\ is\ fixed \tag{3.3} \newline
93
- &= \frac{1}{\sqrt{\alpha}} \underbrace{\frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}}_{\text{GaussFun}}\ q(x)& \qquad &where\ \mu=\frac{z}{\sqrt{\alpha}}\quad \sigma=\sqrt{\frac{1-\alpha}{\alpha}} \tag{3.4}
 
94
  \end{align}
95
 
96
  It can be observed that the <b>GaussFun</b> part is a Gaussian function of $x$, with a mean of $\frac{z}{\sqrt{\alpha}}$ and a standard deviation of $\sqrt{\frac{1-\alpha}{\alpha}}$, so the shape of $q(x|z)$ is determined by **the product of GaussFun and q(x)**.
@@ -98,19 +104,27 @@ def md_posterior_en():
98
  According to the characteristics of <em>multiplication</em>, the characteristics of the shape of the $q(x|z)$ function can be summarized.
99
 
100
  <ul>
101
- <li>When the variance of the Gaussian function is small (small noise), or when $q(x)$ changes slowly, the shape of $q(x|z)$ will approximate to the Gaussian function, and have a simpler function form, which is convenient for modeling and learning.</li>
 
 
102
 
103
  <li>When the variance of the Gaussian function is large (large noise), or when $q(x)$ changes drastically, the shape of $q(x|z)$ will be more complex, and greatly differ from a Gaussian function, which makes it difficult to model and learn.</li>
104
  </ul>
105
 
 
 
106
  The specifics can be seen in <a href="#demo_2">Demo 2</a>. The fourth figure presents the shape of the posterior $q(x|z)$, which is irregular and resembles a curved and uneven line. As $\alpha$ increases (noise decreases), the curve tends to become uniform and straight. Readers can adjust different $\alpha$ values and observe the relationship between the shape of the posterior and the level of noise. In the last figure, the $\textcolor{blue}{\text{blue dash line}}$ represents $q(x)$, the $\textcolor{green}{\text{green dash line}}$ represents <b>GaussFun</b> in Equation 3.4, and the $\textcolor{orange}{\text{orange curve}}$ represents the result of multiplying the two functions and normalizing, which is the posterior probability $q(x|z=fixed)$ under a fixed $z$ condition. Readers can adjust different values of $z$ to observe how the fluctuations of $q(x)$ affect the shape of the posterior probability $q(x|z)$.
107
 
108
  The posterior $q(x|z)$ under two special states are worth considering.
109
  <ul>
110
- <li>As $\alpha \to 0$, the variance of <b>GaussFun</b> tends to <b>$\infty$</b>, and $q(x|z)$ for different $z$ almost become identical, and almost the same as $q(x)$. Readers can set $\alpha$ to 0.001 in <a href="#demo_2">Demo 2</a> to observe the specific results.</li>
111
 
112
  <li>As $\alpha \to 1$, the variance of <b>GaussFun</b> tends to <b>$0$</b>, and the $q(x|z)$ for different $z$ values contract into a series of <em>Dirac delta functions</em> whose offsets equal $z$. However, there are some exceptions: when there are regions where $q(x)$ is zero, the corresponding $q(x|z)$ will no longer be a Dirac <em>delta function</em>, but a zero function. Readers can set $\alpha$ to 0.999 in <a href="#demo_2">Demo 2</a> to observe the specific results.</li>
113
  </ul>
114
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_en")
115
  return
116
 
@@ -153,18 +167,22 @@ def md_forward_process_en():
153
  q(z_t|x) &= \mathcal{N}(\sqrt{\alpha_1\alpha_2\cdots\alpha_t}x,\ 1-\alpha_1\alpha_2\cdots\alpha_t) = \mathcal{N}(\sqrt{\bar{\alpha_t}}x,\ 1-\bar{\alpha_t}) \qquad where\ \bar{\alpha_t} \triangleq \prod_{j=1}^t\alpha_j \tag{4.8}
154
  \end{align}
155
 
156
- Comparing the forms of Equation 4.8 and Equation 2.1, it can be found that their forms are completely consistent. If only focusing on the final transformed distribution $q(z_t)$, then the t consecutive small transformations can be replaced by one large transformation. The $\alpha$ of the large transformation is the accumulation of the $\alpha$ from each small transformation.
 
 
157
 
 
 
158
  In the DDPM[\[2\]](#ddpm) paper, the authors used 1000 steps (T=1000) to transform the data distribution $q(x)$ to $q(z_T)$. The probability distribution of $q(z_T|x)$ is as follows:
159
  \begin{align}
160
  q(z_T|x) &= \mathcal{N}(0.00635\ x,\ 0.99998) \tag{4.9}
161
  \end{align}
162
 
163
- If considering only marginal distribution $q(z_T)$, a single transformation can also be used, which is as follows:
164
  \begin{align}
165
  Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{4.10}
166
  \end{align}
167
- It can be seen that, after applying two transforms, the transformed distributions $q(z_T|x)$ are the same. Thus, $q(z_T)$ is also the same.
168
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_forward_process_en")
169
  return
170
 
@@ -193,7 +211,7 @@ def md_backward_process_en():
193
  In <a href="#posterior">Section 3</a>, we have considered two special posterior probability distributions. Next, we analyze their corresponding <em>posterior transforms</em>.
194
  <ul>
195
  <li> When $\alpha \to 0$, the $q(x|z)$ for different $z$ are almost the same as $q(x)$. In other words, the basis functions of linear weighted sum are almost the same. In this state, no matter how the input changes, the output of the transformation is always $q(x)$.</li>
196
- <li> When $\alpha \to 1$, the $q(x|z)$ for different $z$ values becomes a series of Dirac delta functions and zero functions. In this state, as long as the <em>support set</em> of the input distribution is included in the <em>support set</em> of $q(x)$, the output of the transformation will remain the same with the input.</li>
197
  </ul>
198
 
199
  In <a href="#forward_process">Section 4</a>, it is mentioned that the 1000 transformations used in the DDPM[\[2\]](#ddpm) can be represented using a single transformation
@@ -201,11 +219,13 @@ def md_backward_process_en():
201
  Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{5.5}
202
  \end{align}
203
 
204
- Since $\\alpha=0.0000403$ is very small, the corresponding standard deviation of GaussFun (Equation 3.4) reaches 157.52. However, the range of $X$ is limited within $[-1, 1]$, which is far smaller than the standard deviation of GaussFun. Within the range of $x \\in [-1, 1]$, GaussFun should be close to a constant, showing little variation. Therefore, the $q(x|z_T)$ corresponding to different $z_T$ are almost the same as $q(x)$. In this state, the posterior transform corresponding to $q(x|z_T)$ does not depend on the input distribution, the output distribution will always be $q(x)$.
205
 
206
- <b>Therefore, theoretically, in the DDPM model, it is not necessary to use the standard normal distribution to replace $q(z_T)$. Any other arbitrary distributions can also be used as a substitute.</b>
207
 
208
  Readers can conduct a similar experiment themselves. In <a href="#demo_3_1">Demo 3.1</a>, set <em>start_alpha</em> to 0.25, <em>end_alpha</em> to 0.25, and <em>step</em> to 7. At this point, $q(z_7)=\sqrt{0.000061}X + \sqrt{1-0.000061} \epsilon$, which is roughly equivalent to DDPM's $q(z_T)$. Click on <b>apply</b> to perform the forward transform (plotted using $\textcolor{blue}{\text{blue curves}}$), which prepares for the subsequent restoring process. In <a href="#demo_3_2">Demo 3.2</a>, set the <em>noise_ratio</em> to 1, introducing 100% noise into the <em>tail distribution</em> $q(z_7)$. Changing the value of <em>nose_random_seed</em> will change the distribution of noise. Deselect <em>backward_pdf</em> to reduce screen clutter. Click on <b>apply</b> to restore $q(x)$ through posterior transform. You will see that, no matter what the shape of input $q(z_7)$ may be, the restored $q(x)$ is always exactly the same as the original $q(x)$. The JS Divergence is zero. The restoration process is plotted using a $\textcolor{red}{\text{red curve}}$.
 
 
209
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_backward_process_en")
210
  return
211
 
@@ -296,15 +316,17 @@ def md_posterior_transform_en():
296
 
297
  gr.Markdown(
298
  r"""
299
- <h3 style="font-size:18px"> Contraction Mapping and Converging Point </h3>
 
300
  \begin{align}
301
  q(x) &= \int q(x,z) dz = \int q(x|z)q(z)dz \tag{7.1}
302
  \end{align}
303
-
304
- Through extensive experiments with one-dimensional random variables, it was found that the <b>Posterior Transform</b> exhibits the characteristics of <b>Contraction Mapping</b>. This means that, for any two probability distributions $q_{i1}(z)$ and $q_{i2}(z)$, after posterior transform, we get $q_{o1}(x)$ and $q_{o2}(x)$. The distance between $q_{o1}(x)$ and $q_{o2}(x)$ is always less than the distance between $q_{i1}(x)$ and $q_{i2}(x)$. Here, the distance can be measured using JS divergence or Total Variance. Furthermore, the contractive ratio of this contraction mapping is positively related to the size of the added noise.
305
  \begin{align}
306
- dist(q_{o1}(z),\ q_{o2}(z)) < dist(q_{i1}(x),\ q_{i2}(x)) \tag{7.2}
307
  \end{align}
 
308
 
309
  Readers can refer to <a href="#demo_4_1">Demo 4.1</a>, where the first three figures present a transform process. The first figure is an arbitrary data distribution $q(x)$, the third figure is the transformed probability distribution, and the second figure is the posterior probability distribution $q(x|z)$. You can change the random seed to generate a new data distribution $q(x)$, and adjust the value of $\alpha$ to introduce different degrees of noise.
310
 
@@ -312,7 +334,7 @@ def md_posterior_transform_en():
312
 
313
  Readers can change the input random seed to toggle different inputs. It can be observed from the figures that $div_{out}$ is always smaller than $div_{in}$ for any input. Additionally, if you change the value of $\alpha$, you will see that the smaller the $\alpha$ (larger noise), the smaller the ratio of $div_{out}/div_{in}$, indicating a larger rate of contraction.
314
 
315
- According to the Banach fixed-point theorem<a href="#fixed_point">[5]</a>, a contraction mapping has a unique fixed point (converged point). That is to say, for any input distribution, the <b>Posterior Transform</b> can be applied continuously through iterations, and as long as the number of iterations is sufficient, the final output would be the same distribution. After a large number of one-dimensional random variable experiments, it was found that the fixed point (converged point) is <b>located near $q(x)$</b>. Also, the location is related to the value of $\alpha$; the smaller $\alpha$ (larger noise), the closer it is.
316
 
317
  Readers can refer to <a href="#demo_4_2">Demo 4.2</a>, which illustrates an example of applying the posterior transform iteratively. Choose an appropriate number of iterations and click the <em>Apply</em> button, and the iteration process will be drawn step by step. Each subplot shows the transformed output distribution ($\textcolor{green}{\text{green curve}}$) of each transform, with the reference distribution $q(x)$ drawn as a $\textcolor{blue}{\text{blue curve}}$, as well as the distance $div$ between the output distribution and $q(x)$. It can be seen that as the number of iterations increases, the output distribution becomes more and more similar to $q(x)$, and will eventually stabilize near $q(x)$. For more complicated distributions, more iterations or greater noise may be required. The maximum number of iterations can be set to tens of thousands, but it'll take longer.
318
 
@@ -325,23 +347,34 @@ def md_posterior_transform_en():
325
  \end{align}
326
  In order to better understand the property of the transform, the matrix $(Q_{x|z})^n$ is also plotted in <a href="#demo_4_2">Demo 4.2</a>. From the demo we can see that, as the iterations converge, the row vectors of the matrix $(Q_{x|z})^n$ will become a constant vector, that is, all components of the vector will be the same, which will appear as a horizontal line in the density plot.
327
 
328
- In the <a href="#proof_ctr">Appendix B</a>, a proof will be provided that, when $q(x)$ and $\alpha$ satisfy some conditions, the posterior transform is a strict Contraction Mapping.
329
 
330
- The relationship between the converged distribution and the input distribution q(x) cannot be rigorously proven at present.
331
 
 
332
  <h3 style="font-size:18px"> Anti-noise Capacity In Restoring Data Distribution</h3>
333
- From the above analysis, we know that when certain conditions are satisfied, the <em>posterior transform</em> is a contraction mapping. Therefore, the following relationship exists:
 
 
334
  \begin{align}
335
- dist(q(x),\ q_o(x)) < dist(q(z),\ q_i(z)) \tag{7.12}
336
  \end{align}
337
- Wherein, $q(z)$ is the ideal input distribution, $q(x)$ is the ideal output distribution, $q_i(x)$ is any arbitrary input distribution, and $q_o(x)$ is the output distribution obtained after transforming $q_i(z)$.
338
 
339
- The above equation indicates that the distance between the output distribution $q_o(x)$ and the ideal output distribution q(x) will always be <b>less than</b> the distance between the input distribution $q_i(z)$ and the ideal input distribution q(x). Hence, the <em>posterior transform</em> has certain resistance to noise. This means that during the process of restoring $q(x)$(<a href="#backward_process">Section 5</a>), even if the <em>tail distribution</em> $q(z_T)$ contains some error, the error of the outputed distribution $q(x)$ will be smaller than the error of input after undergoing a series of transform.
 
 
340
 
341
  Refer specifically to <a href="#demo_3_2">Demo 3.2</a>, where by increasing the value of the <b>noise ratio</b>, noise can be added to the <em>tail distribution</em> $q(z_T)$. Clicking the "apply" button will gradually draw out the restoring process, with the restored distribution represented by a $\textcolor{red}{\text{red curve}}$, and the error size will be computed by the JS divergence. You will see that the error of restored $q(x)$ is always less than the error of $q(z_T)$.
342
 
343
- From the above discussion, we know that the smaller the $\alpha$ (the larger the noise used in the transform process), the greater the contractive ratio of the contraction mapping, and thus, the stronger the ability to resist noise.
 
 
 
344
345
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_transform_en")
346
  return
347
 
@@ -370,7 +403,8 @@ def md_cond_kl_en():
370
  gr.Markdown(
371
  r"""
372
  This section mainly introduces the relationship between <b>KL divergence</b> and <b>conditional KL divergence</b>. Before the formal introduction, we will briefly introduce the definitions of <b>Entropy</b> and <b>Conditional Entropy</b>, as well as the inequality relationship between them, in preparation for the subsequent proof.
373
-
 
374
  <h3 style="font-size:20px">Entropy and Conditional Entropy</h3>
375
  For any two random variables $Z, X$, the <b>Entropy</b> is defined as follows<a href="#entropy">[16]</a>:
376
  \begin{align}
@@ -385,7 +419,8 @@ def md_cond_kl_en():
385
  \mathbf{H}(Z|X) \le \mathbf{H}(Z) \tag{A.3}
386
  \end{align}
387
  It is to say that <b>the Conditional Entropy is always less than or equal to the Entropy</b>, and they are equal only when X and Z are independent. The proof of this relationship can be found in the literature <a href="#cond_entropy">[17]</a>.
388
-
 
389
  <h3 style="font-size:20px">KL Divergence and Conditional KL Divergence</h3>
390
  In the same manner as the definition of Conditional Entropy, we introduce a new definition, <b>Conditional KL Divergence</b>, denoted as $KL_{\mathcal{C}}$. Since KL Divergence is non-symmetric, there exist two forms as follows.
391
  \begin{align}
@@ -434,12 +469,12 @@ def md_cond_kl_en():
434
  Another <b>important conclusion</b> can be drawn from equation A.15.
435
 
436
  The KL Divergence is often used to fit the distribution of data. In this scenario, the distribution of the data is denoted by $q(z)$ and the parameterized model distribution is denoted by $\textcolor{blue}{p_\theta(z)}$. During the optimization process, since both $q(z|x)$ and $q(x)$ remain constant, the term $\mathbf{H}(Z) - \mathbf{H}(Z|X)$ in Equation A.15 is a constant. Thus, the following relationship is obtained:
437
- <span id="zh_cond_kl_2">
438
  \mathop{\min}_{\textcolor{blue}{p_\theta}} KL(q(z) \Vert \textcolor{blue}{p_\theta(z)}) \iff \mathop{\min}_{\textcolor{blue}{p_\theta}} \int \ q(x) KL(q(z|x) \Vert \textcolor{blue}{p_\theta(z)})dx \tag{A.25}
439
  </span>
440
 
441
  Comparing the above relationship with <b>Denoised Score Matching</b> <a href="#dsm">[18]</a>(equation A.26), some similarities can be observed. Both introduce a new variable $X$, and substitute the targeted fitting distribution q(z) with q(z|x). After the substitution, since q(z|x) is a conditional probability distribution, both consider all conditions and perform a weighted sum using the probability of the conditions occurring, $q(x)$, as the weight coefficient.
442
- <span id="zh_cond_kl_3">
443
  \mathop{\min}_{\textcolor{blue}{\psi_\theta}} \frac{1}{2} \int q(z) \left\lVert \textcolor{blue}{\psi_\theta(z)} - \frac{\partial q(z)}{\partial z} \right\rVert^2 dz \iff \mathop{\min}_{\textcolor{blue}{\psi_\theta}} \int q(x)\ \overbrace{\frac{1}{2} \int q(z|x) \left\lVert \textcolor{blue}{\psi_\theta(z)} - \frac{\partial q(z|x)}{\partial z} \right\rVert^2 dz}^{\text{Score Matching of }q(z|x)}\ dx \tag{A.26}
444
  </span>
445
 
@@ -451,72 +486,273 @@ def md_cond_kl_en():
451
  return
452
 
453
 
454
- def md_proof_ctr_en():
455
  global g_latex_del
456
 
457
- title = "Appendix B Proof of Contraction"
458
- with gr.Accordion(label=title, elem_classes="first_md", elem_id="proof_ctr"):
459
  gr.Markdown(
460
  r"""
461
- <center> <img id="en_fig2" src="file/fig2.png" width="960" style="margin-top:12px"/> </center>
462
- <center> Figure 2: Only one component in support </center>
463
 
464
- The following will prove that with some conditions, the posterior transform is a contraction mapping, and there exists a unique point, which is also the converged point.
465
-
466
- The proof will be divided into several cases, and assumes that the random variable is discrete, so the posterior transform can be regarded as a single step transition of a <b>discrete Markov Chain</b>. The posterior $q(x|z)$ corresponds to the <b>transfer matrix</b>. Continuous variables can be considered as discrete variables with infinite states.
467
- <ol style="list-style-type:decimal">
468
- <li> When $q(x)$ is greater than 0, the posterior transform matrix $q(x|z)$ will be greater than 0 too. Therefore, this matrix is the transition matrix of an $\textcolor{red}{\text{irreducible}}\ \textcolor{green}{\text{aperiodic}}$ Markov Chain. According to the conclusion of the literature <a href="#mc_basic_p6">[13]</a>, this transformation is a contraction mapping with respect to Total Variance metric. Therefore, according to the Banach fixed-point theorem, this transformation has a unique fixed point(converged point). </li>
469
-
470
- <li> When $q(x)$ is partially greater than 0, and the support of $q(x)$ (the region where $q(x)$ is greater than 0) consists only one connected component (Figure 2), several conclusions can be drawn from equation (3.4):
471
 
472
- <ol style="list-style-type:lower-alpha; padding-inline-start: 0px;font-size:16px;">
473
- <li> When $z$ and $x$ are within the support set, since both $q(x)$ and GaussFun are greater than 0, the diagonal elements of the transfer matrix $\{q(x|z)|z=x\}$ are greater than 0. This means that the state within the support set is $\textcolor{green}{\text{aperiodic}}$. </li>
474
 
475
- <li> When $z$ and $x$ are within the support set, since GaussFun's support set has a certain range, elements above and below the diagonal $\{q(x|z)|x=z+\epsilon\}$is also greater than 0. This means that states within the support set are accessible to each other, forming a $\textcolor{red}{\text{Communication Class}}$<a href="#mc_basic_d4">[14]</a>, see in Figure 2b. </li>
476
-
477
- <li> When <em>$z$ is within the support set</em> and <em>$x$ is outside the support set</em>, ${q(x|z)}$ is entirely 0. This means that the state within the support set is <em>inaccessible</em> to the state outside the support set (Inaccessible Region in Figure 2b) </li>
478
-
479
- <li> When <em>$z$ is outside the support set</em> and <em>$x$ is inside the support set</em>, due to the existence of a certain range of the support set of GaussFun, there are some extension areas (Extension Region in Figure 2b), where the corresponding $\{q(x|z)|x \in support\}$ is not all zero. This means that the state of this part of the extension area can <em>unidirectionally</em> access the state inside the support set (Unidirectional Region in Figure 2b).</li>
480
-
481
- <li> When <em>$z$ is outside the support set</em> and <em>$x$ is outside the support set</em>, the corresponding $q(x|z)$ is entirely zero. This implies that, states outside the support set will not transit to states outside the support set. In other words, states outside the support set only originate from states within the support set. </li>
482
483
  </ol>
484
- <p style="margin-top:8px">
485
- From (c), we know that states within the support set will not transition to states outside of the support set. From (a) and (b), we know that the states within the support set are non-periodic and form a Communicate Class. Therefore, the states within the support set independently form an irreducible and non-periodic Markov Chain. According to the conclusion of Theorem 11.4.1 in reference <a href="#mc_limit">[7]</a>, as $n\to\infty$, $q(x|z)^n$ will converge to a constant matrix, with each column vector in the matrix being identical. This implies that for different values of z, $q(x|z)^n$ are the same (as seen in Figure 2c). In Addition, according to (d) and (e), there exist some states z, which are outside of the support set, that can transition into the support set and will carry information from within the support set back to the outside. Thus, the corresponding $q(x|z)^n$ for these z states (the $q(x|z_{ex})$ region in Figure 2c) will equal the corresponding $q(x|z)^n$ in the support set (the $q(x|z_{sup})$ region in Figure 2c).
486
- </p>
487
 
488
- <p style="margin-top:8px">
489
- Therefore, it can be concluded that when the state is confined within the support set and two extension regions, $\lim_{n\to\infty}{q(x|z)^n}$ will converge to a fixed matrix, and each column vector is identical. Hence, for any input distribution, if posterior transforms are continuously applied, it will eventually converge to a fixed distribution, which is equal to the column vector of the converged matrix. Based on the conclusion from the literature <a href=\"#fp_converse\">[9]</a>, when a iterative transform converges to a unique fixed point, this transform is a Contraction Mapping with respect to a certain metric.
490
- </p>
491
  </li>
 
 
 
492
 
493
- <li> When $q(x)$ is partially greater than 0, and multiple connected component exist in the support set of $q(x)$, and the maximum distance of each connected component <b>can</b> be covered by the support set of corresponding GaussFun, the states within each connected domain <b>constitute only one Communicate Class</b>. As shown in Figure 3, $q(x)$ has two connected component. On the edge of the first component, the support set of GaussFun corresponding to $q(x|z=-0.3)$ can span the gap to reach the second component, so the states of the first component can <em>access</em> the states of the second component. On the edge of the second component, the support set of GaussFun corresponding to $q(x|z=0)$ can also span the gap to reach the first. Thus, the states of the second component can <em>access</em> the states of the first component, so these two component form a Communicate Class. Therefore, similar to the case with a single component, when states are confined to each component, gaps, and extension areas, the posterior transform has a unique iterative convergence point, which is a contraction mapping with respect to a certain metric. </li>
494
-
495
- <li> When $q(x)$ is partially greater than 0, and multiple connected component exist in the support set of $q(x)$, and the maximum distance of each connected component <b>cannot</b> be covered by the support set of corresponding GaussFun, the states within each component <b>constitute multiple Communicate Classes</b>, as shown in Figure 4. Under such circumstances, as $n\to\infty$, $q(x|z)^n$ will also converge to a fixed matrix, but not all the column vectors are identical. Therefore, the posterior transforma is not a strict contraction mapping. However, when the state of the input distribution is confined to a single Communicate Class and its corresponding extension, the posterior transform is also a contraction mapping with a unique convergence point. </li>
496
  </ol>
497
 
498
- <center> <img id="en_fig3" src="file/fig3.png" width="960" style="margin-top:12px"/> </center>
499
- <center> Figure 3: Two components which can communicate with each other </center>
 
500
 
501
- <center> <img id="en_fig4" src="file/fig4.png" width="960" style="margin-top:12px"/> </center>
502
- <center> Figure 4: Two components which <b>cannot</b> communicate with each other </center>
503
 
504
- Additionally, there exists a more generalized relation about the posterior transform that is independent of $q(x|z)$: the Total Variance distance between two output distributions will always be <b>less than or equal to</b> the Total Variance distance between their corresponding input distributions, that is
 
505
  \begin{align}
506
- dist(q_{o1}(x),\ q_{o2}(x)) <= dist(q_{i1}(z),\ q_{i2}(z)) \tag{B.1}
507
  \end{align}
508
- The proof is given below in discrete form:
509
  \begin{align}
510
- \lVert q_{o1}-q_{o2}\rVert_{TV} &= \lVert Q_{x|z}q_{i1} - Q_{x|z}q_{i2}\rVert_{TV} \tag{B.2} \newline
511
- &= \sum_{m}\textcolor{red}{|}\sum_{n}Q_{x|z}(m,n)q_{i1}(n) - \sum_{n}Q_{x|z}(m,n)q_{i2}(n)\textcolor{red}{|} \tag{B.3} \newline
512
- &= \sum_{m}\textcolor{red}{|}\sum_{n}Q_{x|z}(m,n)(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \tag{B.4} \newline
513
- &\leq \sum_{m}\sum_{n}Q_{x|z}(m,n)\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \qquad \qquad \qquad \text{Absolute value inequality} \tag{B.5} \newline
514
- &= \sum_{n}\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \sum_{m} Q_{x|z}(m,n) \qquad \qquad \qquad \sum_{m} Q_{x|z}(m,n) = 1 \tag{B.6} \newline
515
- &= \sum_{n}\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \tag{B.7}
516
  \end{align}
517
- In this context, $Q_{x|z}(m,n)$ represents the element at the m-th row and n-th column of the matrix $Q_{x|z}$, and $q_{i1}(n)$ represents the n-th element of the vector $q_{i1}$.
518
-
519
- """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_proof_ctr_en")
520
  return
521
 
522
 
@@ -524,7 +760,6 @@ def md_reference_en():
524
  global g_latex_del
525
 
526
  with gr.Accordion(label="Reference", elem_classes="first_md", elem_id="reference"):
527
-
528
  gr.Markdown(
529
  r"""
530
  <a id="dpm" href="https://arxiv.org/abs/1503.03585"> [1] Deep Unsupervised Learning Using Nonequilibrium Thermodynami </a>
@@ -542,21 +777,37 @@ def md_reference_en():
542
  <a id="mc_limit" href="https://stats.libretexts.org/Bookshelves/Probability_Theory/Book%3A_Introductory_Probability_(Grinstead_and_Snell)/11%3A_Markov_Chains/11.04%3A_Fundamental_Limit_Theorem_for_Regular_Chains"> [7] Fundamental Limit Theorem for Regular Chains </a>
543
 
544
  <a id="mc_basic_p6" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [8] Markov Chain:Basic Theory - Proposition 6 </a>
545
-
546
  <a id="fp_converse" href="https://arxiv.org/abs/1702.07339"> [9] A Converse to Banach's Fixed Point Theorem and its CLS Completeness </a>
547
 
548
  <a id="ce_kl" href="https://en.wikipedia.org/wiki/Cross-entropy#Cross-entropy_minimization"> [10] Cross-entropy minimization </a>
549
-
550
  <a id="deconv_1" href="https://thewolfsound.com/deconvolution-inverse-convolution/"> [11] Deconvolution Using Frequency-Domain Division </a>
551
-
552
  <a id="deconv_2" href="https://www.strollswithmydog.com/deconvolution-by-division-in-the-frequency-domain/"> [12] deconvolution-by-division-in-the-frequency-domain </a>
553
-
554
  <a id="mc_basic_t7" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [13] Markov Chain:Basic Theory - Theorem 7 </a>
555
-
556
  <a id="mc_basic_d4" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [14] Markov Chain:Basic Theory - Definition 4 </a>
557
-
558
  <a id="vdm" href="https://arxiv.org/pdf/2107.00630"> [15] Variational Diffusion Models </a>
559
-
560
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_reference_en")
561
 
562
  return
@@ -603,7 +854,11 @@ def run_app():
603
 
604
  md_cond_kl_en()
605
 
606
- md_proof_ctr_en()
607
 
608
  md_reference_en()
609
 
 
82
  q(x|z) = \frac{q(z|x)q(x)}{q(z)} \tag{3.1}
83
  \end{align}
84
 
85
+ When $z$ is fixed, $q(z)$ is a constant, so $q(x|z)$ is a probability density function with respect to $x$, and its shape depends only on $q(z|x)q(x)$.
86
  \begin{align}
87
  q(x|z) \propto q(z|x)q(x) \qquad where\ z\ is\ fixed \tag{3.2}
88
  \end{align}
89
 
90
+ In fact, $q(z)=\int q(z|x)q(x)dx$, which means that $q(z)$ is the integral of the function $q(z|x)q(x)$ over $x$. Therefore, dividing $q(z|x)q(x)$ by $q(z)$ is equivalent to normalizing $q(z|x)q(x)$ so that it integrates to 1.
91
+ \begin{align}
92
+ q(x|z) = \operatorname{Normalize}\big(q(z|x)q(x)\big) \tag{3.3}
93
+ \end{align}
94
+
95
  From Equation 2.1, we can see that $q(z|x)$ is a Gaussian distribution, so we have
96
  \begin{align}
97
+ q(x|z) &\propto \frac{1}{\sqrt{2\pi(1-\alpha)}}\exp{\frac{-(z-\sqrt{\alpha}x)^2}{2(1-\alpha)}}\ q(x)& \qquad &\text{where z is fixed} \notag \newline
98
+ &= \frac{1}{\sqrt{\alpha}}\frac{1}{\sqrt{2\pi\frac{1-\alpha}{\alpha}}}\exp{\frac{-(\frac{z}{\sqrt{\alpha}}-x)^2}{2\frac{1-\alpha}{\alpha}}}\ q(x)& \notag \newline
99
+ &= \frac{1}{\sqrt{\alpha}} \underbrace{\frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}}_{\text{GaussFun}}\ q(x)& \qquad &\text{where}\ \mu=\frac{z}{\sqrt{\alpha}}\quad \sigma=\sqrt{\frac{1-\alpha}{\alpha}} \tag{3.4}
100
  \end{align}
101
 
102
  It can be observed that the <b>GaussFun</b> part is a Gaussian function of $x$, with a mean of $\frac{z}{\sqrt{\alpha}}$ and a standard deviation of $\sqrt{\frac{1-\alpha}{\alpha}}$, so the shape of $q(x|z)$ is determined by **the product of GaussFun and q(x)**.
 
104
  According to the characteristics of <em>multiplication</em>, the characteristics of the shape of the $q(x|z)$ function can be summarized.
105
 
106
  <ul>
107
+ <li>The support set of $q(x|z)$ should be contained within the support set of GaussFun. The support set of GaussFun is a hypersphere, centered at the mean $\mu$ with a radius of approximately 3 times the standard deviation $\sigma$. </li>
108
+
109
+ <li>When the variance of the Gaussian function is small (small noise), or when $q(x)$ changes linearly, the shape of $q(x|z)$ will approximate the Gaussian function and have a simpler functional form, which is convenient for modeling and learning.</li>
110
 
111
  <li>When the variance of the Gaussian function is large (large noise), or when $q(x)$ changes drastically, the shape of $q(x|z)$ will be more complex, and greatly differ from a Gaussian function, which makes it difficult to model and learn.</li>
112
  </ul>
113
 
114
+ <a href="#approx_gauss">Appendix B</a> provides a more rigorous analysis: when $\sigma$ satisfies certain conditions, $q(x|z)$ approximates a Gaussian distribution.
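The normalization view of Equations 3.3 and 3.4 is easy to check numerically: evaluate GaussFun with mean $z/\sqrt{\alpha}$ and standard deviation $\sqrt{(1-\alpha)/\alpha}$ on a grid, multiply by $q(x)$, and renormalize. The sketch below does this for an illustrative bimodal $q(x)$; the grid, the example density, and the chosen $z$ and $\alpha$ are assumptions for demonstration only, not values from the repository.

```python
import numpy as np

def posterior_on_grid(x, qx, z, alpha):
    """q(x|z) evaluated on a grid via Eq. 3.3/3.4: Normalize(GaussFun(x) * q(x))."""
    mu = z / np.sqrt(alpha)                    # mean of GaussFun
    sigma = np.sqrt((1.0 - alpha) / alpha)     # standard deviation of GaussFun
    gauss = np.exp(-0.5 * ((x - mu) / sigma) ** 2)
    prod = gauss * qx                          # unnormalized posterior
    dx = x[1] - x[0]
    return prod / (prod.sum() * dx)            # normalize so it integrates to 1

# illustrative bimodal q(x) on [-2, 2]
x = np.linspace(-2.0, 2.0, 401)
qx = np.exp(-0.5 * ((x + 1.0) / 0.2) ** 2) + np.exp(-0.5 * ((x - 1.0) / 0.3) ** 2)
qx /= qx.sum() * (x[1] - x[0])
post = posterior_on_grid(x, qx, z=0.5, alpha=0.95)   # q(x|z=0.5)
```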
115
+
116
  The specifics can be seen in <a href="#demo_2">Demo 2</a>. The fourth figure presents the shape of the posterior $q(x|z)$, which is irregular and resembles a curved and uneven line. As $\alpha$ increases (noise decreases), the curve tends to become uniform and straight. Readers can adjust different $\alpha$ values and observe the relationship between the shape of the posterior and the level of noise. In the last figure, the $\textcolor{blue}{\text{blue dash line}}$ represents $q(x)$, the $\textcolor{green}{\text{green dash line}}$ represents <b>GaussFun</b> in Equation 3.4, and the $\textcolor{orange}{\text{orange curve}}$ represents the result of multiplying the two functions and normalizing, which is the posterior probability $q(x|z=fixed)$ under a fixed $z$ condition. Readers can adjust different values of $z$ to observe how the fluctuations of $q(x)$ affect the shape of the posterior probability $q(x|z)$.
117
 
118
  The posterior $q(x|z)$ under two special states are worth considering.
119
  <ul>
120
+ <li>As $\alpha \to 0$, the variance of <b>GaussFun</b> tends to <b>$\infty$</b>, so GaussFun almost becomes a uniform distribution over a very large support set. Multiplying $q(x)$ by such a nearly uniform function leaves its shape unchanged, so the $q(x|z)$ for different $z$ become almost identical, and almost the same as $q(x)$. Readers can set $\alpha$ to 0.001 in <a href="#demo_2">Demo 2</a> to observe the specific results.</li>
121
 
122
  <li>As $\alpha \to 1$, the variance of <b>GaussFun</b> tends to <b>$0$</b>, and the $q(x|z)$ for different $z$ values contract into a series of <em>Dirac delta functions</em> whose offsets equal $z$. However, there are some exceptions: when there are regions where $q(x)$ is zero, the corresponding $q(x|z)$ will no longer be a Dirac <em>delta function</em>, but a zero function. Readers can set $\alpha$ to 0.999 in <a href="#demo_2">Demo 2</a> to observe the specific results.</li>
123
  </ul>
124
+
125
+ There is one point to note: when $\alpha \to 0$, the mean of GaussFun for larger $z$ values ($\mu = \frac{z}{\sqrt{\alpha}}$) also increases sharply, which means that GaussFun is located farther from the support of $q(x)$. In this case, the "uniformity" of the part of GaussFun over the support of $q(x)$ decreases slightly, thereby slightly reducing the similarity between $q(x|z)$ and $q(x)$. However, this effect diminishes further as $\alpha$ decreases. Readers can observe this effect in <a href="#demo_2">Demo 2</a>: set $\alpha$ to 0.001, and you will see a slight difference between $q(x|z=-2)$ and $q(x)$, but no noticeable difference between $q(x|z=0)$ and $q(x)$.
126
+
127
+ Regarding the "uniformity" of the Gaussian function, there are two characteristics: the larger the standard deviation, the greater the uniformity; the farther away from the mean, the smaller the uniformity.
128
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_en")
129
  return
130
 
 
167
  q(z_t|x) &= \mathcal{N}(\sqrt{\alpha_1\alpha_2\cdots\alpha_t}x,\ 1-\alpha_1\alpha_2\cdots\alpha_t) = \mathcal{N}(\sqrt{\bar{\alpha_t}}x,\ 1-\bar{\alpha_t}) \qquad where\ \bar{\alpha_t} \triangleq \prod_{j=1}^t\alpha_j \tag{4.8}
168
  \end{align}
169
 
170
+ Comparing Equation 4.8 with Equation 2.1, it can be seen that the two have exactly the same form.
171
+
172
+ If we only focus on the relationship between the initial and final random variables, then a sequence of $t$ small transforms can be replaced by one large transform whose $\alpha$ is the product of the $\alpha$ values of the small transforms, because the joint probability distributions corresponding to the two types of transform are the same.
173
 
174
+ Readers can perform an experiment in <a href="#demo_3_1">Demo 3.1</a> using the same input distribution $q(x)$ but with two different transform methods: 1) using three transformations, each with $\alpha$ equal to 0.95; 2) using a single transform with $\alpha$ set to 0.857375. Perform the transformations separately and then compare the two resulting distributions. You will see that the two distributions are identical.
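The same equivalence can be checked outside the demo with a few lines of sampling code. The sketch below compares three consecutive transforms with $\alpha = 0.95$ against a single transform with $\bar{\alpha} = 0.95^3 = 0.857375$; the uniform toy data and the sample count are illustrative assumptions.

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.uniform(-1.0, 1.0, size=200_000)   # toy samples of X (illustrative)

# three small transforms, each Z = sqrt(alpha) * Z_prev + sqrt(1 - alpha) * eps
z = x.copy()
for _ in range(3):
    z = np.sqrt(0.95) * z + np.sqrt(1.0 - 0.95) * rng.standard_normal(z.shape)

# one large transform with alpha_bar = 0.95 ** 3 = 0.857375
alpha_bar = 0.95 ** 3
z_one = np.sqrt(alpha_bar) * x + np.sqrt(1.0 - alpha_bar) * rng.standard_normal(x.shape)

# the two marginal distributions should agree (up to sampling noise)
print(z.mean(), z_one.mean())
print(z.var(), z_one.var())
```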
175
+
176
  In the DDPM[\[2\]](#ddpm) paper, the authors used 1000 steps (T=1000) to transform the data distribution $q(x)$ to $q(z_T)$. The probability distribution of $q(z_T|x)$ is as follows:
177
  \begin{align}
178
  q(z_T|x) &= \mathcal{N}(0.00635\ x,\ 0.99998) \tag{4.9}
179
  \end{align}
180
 
181
+ If only considering the joint distribution $q(x, z_T)$, a single transformation can also be used as a substitute, which is as follows:
182
  \begin{align}
183
  Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{4.10}
184
  \end{align}
185
+ It can be seen that the two transforms yield the same conditional distribution $q(z_T|x)$; thus, the joint distribution $q(x,z_T)$ is also the same.
186
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_forward_process_en")
187
  return
188
 
 
211
  In <a href="#posterior">Section 3</a>, we have considered two special posterior probability distributions. Next, we analyze their corresponding <em>posterior transforms</em>.
212
  <ul>
213
  <li> When $\alpha \to 0$, the $q(x|z)$ for different $z$ are almost the same as $q(x)$. In other words, the basis functions of linear weighted sum are almost the same. In this state, no matter how the input changes, the output of the transformation is always $q(x)$.</li>
214
+ <li> When $\alpha \to 1$, the $q(x|z)$ for different $z$ values become a series of Dirac delta functions and zero functions. In this state, as long as the <em>support</em> of the input distribution is included in the <em>support</em> of $q(x)$, the output of the transformation will be the same as the input.</li>
215
  </ul>
216
 
217
  In <a href="#forward_process">Section 4</a>, it is mentioned that the 1000 transformations used in the DDPM[\[2\]](#ddpm) can be represented using a single transformation
 
219
  Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{5.5}
220
  \end{align}
221
 
222
+ Since $\alpha=0.0000403$ is very small, the corresponding standard deviation of GaussFun (Equation 3.4) reaches 157.52. If we constrain the support of $q(x)$ within the unit hypersphere ($\lVert x \rVert_2 < 1$), then for $z_T$ in the range $[-2, +2]$, each corresponding $q(x|z_T)$ is very similar to $q(x)$. In this state, for the posterior transform of $q(x|z_T)$, regardless of the shape of the input distribution, as long as the support set is within the range $[-2,+2]$, the output distribution will be $q(x)$.
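The near-uniformity of GaussFun over the support of $q(x)$ can be verified directly from Equation 3.4. The sketch below measures the relative variation of GaussFun over an assumed support of $[-1, 1]$ for several values of $z_T$; the grid resolution and the chosen $z_T$ values are illustrative choices, not repository values.

```python
import numpy as np

alpha = 0.0000403
sigma = np.sqrt((1.0 - alpha) / alpha)        # ~157.5, std of GaussFun (Eq. 3.4)
x = np.linspace(-1.0, 1.0, 201)               # assumed support of q(x)

for z_T in (-2.0, 0.0, 2.0):
    mu = z_T / np.sqrt(alpha)
    gauss = np.exp(-0.5 * ((x - mu) / sigma) ** 2)
    variation = (gauss.max() - gauss.min()) / gauss.max()
    print(f"z_T={z_T:+.1f}: relative variation of GaussFun over [-1, 1] = {variation:.4f}")
```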
223
 
224
+ <b>Furthermore, we can conclude that in the DPM model, if the support of $q(x)$ is finite and the noise level of the final variable $Z_T$ is sufficiently high (its signal-to-noise ratio is sufficiently low), the process of restoring $q(x)$ can start from any distribution; it doesn't necessarily have to use the standard normal distribution.</b>
225
 
226
  Readers can conduct a similar experiment themselves. In <a href="#demo_3_1">Demo 3.1</a>, set <em>start_alpha</em> to 0.25, <em>end_alpha</em> to 0.25, and <em>step</em> to 7. At this point, $q(z_7)=\sqrt{0.000061}X + \sqrt{1-0.000061} \epsilon$, which is roughly equivalent to DDPM's $q(z_T)$. Click on <b>apply</b> to perform the forward transform (plotted using $\textcolor{blue}{\text{blue curves}}$), which prepares for the subsequent restoring process. In <a href="#demo_3_2">Demo 3.2</a>, set the <em>noise_ratio</em> to 1, introducing 100% noise into the <em>tail distribution</em> $q(z_7)$. Changing the value of <em>nose_random_seed</em> will change the distribution of noise. Deselect <em>backward_pdf</em> to reduce screen clutter. Click on <b>apply</b> to restore $q(x)$ through posterior transform. You will see that, no matter what the shape of input $q(z_7)$ may be, the restored $q(x)$ is always exactly the same as the original $q(x)$. The JS Divergence is zero. The restoration process is plotted using a $\textcolor{red}{\text{red curve}}$.
227
+
228
+ There is another point worth noting. In deep learning tasks, it is common to scale each dimension of the input to the range [-1, 1], i.e., into a hypercube with side length 2. The maximum Euclidean distance between any two points in this hypercube increases with the dimensionality: in one dimension the maximum distance is $2$, in two dimensions $2\sqrt{2}$, in three dimensions $2\sqrt{3}$, and in $n$ dimensions $2\sqrt{n}$. Therefore, for data with higher dimensions, the variable $Z_T$ needs a higher noise level (a lower signal-to-noise ratio) so that the recovery process can start from an arbitrary distribution.
229
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_backward_process_en")
230
  return
231
 
 
316
 
317
  gr.Markdown(
318
  r"""
319
+ </br>
320
+ <h3 style="font-size:18px"> Non-expanding Mapping and Stationary Distribution </h3>
321
  \begin{align}
322
  q(x) &= \int q(x,z) dz = \int q(x|z)q(z)dz \tag{7.1}
323
  \end{align}
324
+
325
+ According to Corollary 1 and Corollary 2 in <a href="#non_expanding">Appendix C</a>, the posterior transform is a <b>non-expanding mapping</b>. This means that for any two probability distributions $q_{i1}(z)$ and $q_{i2}(z)$, the distance between the transformed distributions $q_{o1}(x)$ and $q_{o2}(x)$ is <b>always less than or equal to</b> the distance between $q_{i1}(z)$ and $q_{i2}(z)$. The distance here can be measured using KL Divergence or Total Variance.
326
  \begin{align}
327
+ d(q_{o1}(x),\ q_{o2}(x)) \le d(q_{i1}(z),\ q_{i2}(z)) \tag{7.2}
328
  \end{align}
329
+ According to the analysis in <a href="#non_expanping">Appendix B</a>, the aforementioned equality does not hold in most cases and posterior transform becomes a <b>shrinkig mapping</b>. Furthermore, <b>the smaller $\alpha$ is (the more noise), the smaller $d(q_{o1},q_{o2})$ will be compared to $d(q_{i1},q_{i2})$</b>.
330
 
331
  Readers can refer to <a href="#demo_4_1">Demo 4.1</a>, where the first three figure present a transform process. The first figure is an arbitrary data distribution $q(x)$, the third figure is the transformed probability distribution, and second figure is the posterior probability distribution $q(x|z)$. You can change the random seed to generate a new data distribution$q(x)$, and adjust the value of $\alpha$ to introduce different degrees of noise.
332
 
 
334
 
335
Readers can change the input random seed to toggle different inputs. It can be observed from the figures that $div_{out}$ is always smaller than $div_{in}$ for any input. Additionally, if you change the value of $\alpha$, you will see that the smaller the $\alpha$ (the larger the noise), the smaller the ratio $div_{out}/div_{in}$, indicating a larger rate of contraction.
336
 
337
+ According to the analysis in <a href="#stationary">Appendix C</a>: the posterior transform can be seen as a one-step jump of a Markov chain, and <b>when $q(x)$ and $\alpha$ meet certain conditions, this Markov chain will converge to a unique stationary distribution</b>. Additionally, numerous experiments have shown that <b>the stationary distribution is very similar to the data distribution $q(x)$, and the smaller $\alpha$ is, the more similar the stationary distribution is to $q(x)$</b>. Specifically, according to the conclusion in <a href="#backward_process">Section 5</a>, <b>when $\alpha \to 0$, after one step of transform, the output distribution will be $q(x)$, so the stationary distribution must be $q(x)$</b>.
338
 
339
  Readers can refer to <a href="#demo_4_2">Demo 4.2</a>, which illustrates an example of applying posterior transform iteratively. Choose an appropriate number of iterations, and click on the button of <em>Apply</em>, and the iteration process will be draw step by step. Each subplot shows the transformed output distribution($\textcolor{green}{\text{green curve}}$) from each transform, with the reference distribution $q(x)$ expressed as a $\textcolor{blue}{\text{blue curve}}$, as well as the distance $div$ between the output distribution and $q(x)$. It can be seen that as the number of iterations increases, the output distribution becomes more and more similar to $q(x)$, and will eventually stabilize near $q(x)$. For more complicated distributions, more iterations or greater noise may be required. The maximum number of iterations can be set to tens of thousands, but it'll take longer.
340
 
 
347
  \end{align}
348
In order to better understand the property of the transform, the matrix $(Q_{x|z})^n$ is also plotted in <a href="#demo_4_2">Demo 4.2</a>. From the demo we can see that, as the iterations converge, the row vectors of the matrix $(Q_{x|z})^n$ become constant vectors, that is, all components of each vector are equal, which appears as a horizontal line in the density plot.
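A small self-contained sketch of this iteration (the same kind of assumed 1-D discretization and toy $q(x)$ as in the sketch above): repeated application of the column-stochastic matrix built from $q(x|z)$ drives an arbitrary input distribution toward a distribution close to $q(x)$, and the rows of $(Q_{x|z})^n$ flatten out.

```python
import numpy as np

rng = np.random.default_rng(0)

# Assumed 1-D discretization; iterating the posterior transform settles near q(x),
# and the rows of the iterated matrix become (nearly) constant vectors.
alpha = 0.5
x = np.linspace(-2, 2, 401); dx = x[1] - x[0]
q_x = np.exp(-(x - 0.8)**2 / 0.1) + 0.5 * np.exp(-(x + 0.8)**2 / 0.3)
q_x /= (q_x * dx).sum()
lik = np.exp(-(x[:, None] - np.sqrt(alpha) * x[None, :])**2 / (2 * (1 - alpha)))
Q = (lik * q_x[None, :]).T
Q /= (Q * dx).sum(axis=0, keepdims=True)

P = Q * dx                                    # column-stochastic one-step transition matrix
p = rng.random(x.size); p /= (p * dx).sum()   # arbitrary input distribution
for n in range(1, 301):
    p = P @ p                                 # one application of the posterior transform
    if n in (1, 10, 100, 300):
        print(f"n = {n:3d}   L1 distance to q(x) = {np.abs(p - q_x).sum() * dx:.4f}")

Pn = np.linalg.matrix_power(P, 300)
print("max spread within a row of P^300:", float((Pn.max(axis=1) - Pn.min(axis=1)).max()))
```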
349
 
350
+ For a one-dimensional discrete Markov chain, the convergence rate is inversely related to the absolute value of the second largest eigenvalue of the transition probability matrix ($\lvert \lambda_2 \rvert$): the smaller $\lvert \lambda_2 \rvert$ is, the faster the convergence. Numerous experiments have shown that $\alpha$ has a clear linear relationship with $\lvert \lambda_2 \rvert$: the smaller $\alpha$ is, the smaller $\lvert \lambda_2 \rvert$ is. Therefore, <b>the smaller $\alpha$ (the greater the noise), the faster the convergence rate</b>. Specifically, when $\alpha \to 0$, according to the conclusion in <a href="#posterior">Section 3</a>, the posterior probability distributions corresponding to different $z$ tend to coincide. Additionally, according to Theorem 2.10 in <a href="#non_neg_lambda">[21]</a>, $\lvert \lambda_2 \rvert$ is bounded by the L1 distance between the posterior distributions corresponding to different $z$, so it can be concluded that $\lvert \lambda_2 \rvert \to 0$.
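The eigenvalue claim can be probed in the same toy setting. The self-contained sketch below (all shapes, grid sizes and $\alpha$ values are assumptions for illustration) computes $\lvert \lambda_2 \rvert$ of the discretized posterior transform for several $\alpha$ values:

```python
import numpy as np

# Probe of the relation between alpha and |lambda_2| of the discretized posterior
# transform, using the same assumed two-bump q(x) as in the sketches above.
x = np.linspace(-2, 2, 301); dx = x[1] - x[0]
q_x = np.exp(-(x - 0.8)**2 / 0.1) + 0.5 * np.exp(-(x + 0.8)**2 / 0.3)
q_x /= (q_x * dx).sum()

def second_eigenvalue(alpha):
    lik = np.exp(-(x[:, None] - np.sqrt(alpha) * x[None, :])**2 / (2 * (1 - alpha)))
    Q = (lik * q_x[None, :]).T
    Q /= (Q * dx).sum(axis=0, keepdims=True)
    lam = np.abs(np.linalg.eigvals(Q * dx))
    return np.sort(lam)[-2]                   # the largest eigenvalue is 1

for a in (0.1, 0.3, 0.5, 0.7, 0.9):
    print(f"alpha = {a:.1f}   |lambda_2| ~ {second_eigenvalue(a):.3f}")
```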
351
 
 
352
 
353
+ </br>
354
  <h3 style="font-size:18px"> Anti-noise Capacity In Restoring Data Distribution</h3>
355
+
356
+ From the above analysis, it can be seen that, in most cases, the <b>posterior transform</b> is a shrinking mapping, which implies the following relationship:
357
+
358
  \begin{align}
359
+ d(q(x),\ q_o(x)) < d(q(z),\ q_i(z)) \tag{7.12}
360
  \end{align}
 
361
 
362
+ Here, $q(z)$ is the ideal input distribution and $q(x)$ is the ideal output distribution, with $q(x) = \int q(x|z) q(z) dz$; $q_i(z)$ is an arbitrary input distribution and $q_o(x)$ is the corresponding output distribution, $q_o(x) = \int q(x|z) q_i(z) dz$.
363
+
364
+ The above equation indicates that the distance between the output distribution $q_o(x)$ and the ideal output distribution $q(x)$ will always be <b>less than</b> the distance between the input distribution $q_i(z)$ and the ideal input distribution $q(z)$. Hence, <b>the posterior transform naturally possesses a certain ability to resist noise</b>. This means that during the process of restoring $q(x)$ (<a href="#backward_process">Section 5</a>), even if the <em>tail distribution</em> $q(z_T)$ contains some error, the error of the output data distribution $q(x)$ after the series of transforms will be smaller than the error of the input.
365
 
366
  Refer specifically to <a href="#demo_3_2">Demo 3.2</a>, where by increasing the value of the <b>noise ratio</b>, noise can be added to the <em>tail distribution</em> $q(z_T)$. Clicking the "apply" button will gradually draw out the restoring process, with the restored distribution represented by a $\textcolor{red}{\text{red curve}}$, and the error size will be computed by the JS divergence. You will see that the error of restored $q(x)$ is always less than the error of $q(z_T)$.
367
 
368
+ From the above discussion, it can be seen that the smaller $\alpha$ is (the larger the noise used in the transform), the greater the shrinking rate of the shrinking mapping and, correspondingly, the stronger the resistance to errors. Specifically, when $\alpha \to 0$, the noise-resistance capability becomes unlimited: regardless of the magnitude of the error in the input, the output will always be $q(x)$.
369
+
370
+ </br>
371
+ <h3 style="font-size:18px"> Markov Chain Monte Carlo Sampling</h3>
372
 
373
+ In DPM models, sampling is typically performed using <b>Ancestral Sampling</b>. From the analysis above, it can be inferred that when $\alpha$ is sufficiently small, iterating the posterior transform converges to $q(x)$. Therefore, sampling can also be conducted using <b>Markov Chain Monte Carlo</b> (MCMC) methods, as depicted in Figure 7.1. In the figure, $\alpha$ represents a posterior transform with relatively large noise; larger noise makes the stationary distribution closer to the data distribution $q(x)$. However, as discussed in <a href="#posterior">Section 3</a>, posterior transforms with larger noise are harder to fit. Therefore, each transform with larger noise is split into multiple transforms with smaller noise.
374
+
375
+ <center> <img src="file/7.1.png" width="1024" style="margin-top:12px"/> </center>
376
+ <center> Figure 7.1: Markov Chain Monte Carlo Sampling</center>
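As a rough illustration of this idea (a toy sketch only, not the sampling procedure of any particular DPM implementation; the discretization, the small $\alpha$, and the use of the true posterior as the transition kernel are all assumptions), the following runs a Markov chain whose kernel is $q(x|z)$ and compares the sample histogram with $q(x)$:

```python
import numpy as np

rng = np.random.default_rng(0)

# Toy MCMC illustration: transition kernel is a discretized true posterior q(x|z)
# with a small alpha; the chain's samples should be distributed approximately as q(x).
alpha = 0.05
x = np.linspace(-2, 2, 401); dx = x[1] - x[0]
q_x = np.exp(-(x - 0.8)**2 / 0.1) + 0.5 * np.exp(-(x + 0.8)**2 / 0.3)
q_x /= (q_x * dx).sum()
lik = np.exp(-(x[:, None] - np.sqrt(alpha) * x[None, :])**2 / (2 * (1 - alpha)))
Q = (lik * q_x[None, :]).T
Q /= Q.sum(axis=0, keepdims=True)            # columns are probability vectors q(x|z)

idx, samples = 200, []                       # arbitrary starting state (grid center)
for t in range(20000):
    idx = rng.choice(x.size, p=Q[:, idx])    # one step: x_{t+1} ~ q(x | z = x_t)
    if t >= 1000:                            # discard burn-in
        samples.append(x[idx])

edges = np.append(x - dx / 2, x[-1] + dx / 2)
hist, _ = np.histogram(samples, bins=edges, density=True)
print(f"L1 distance between sample histogram and q(x): {np.abs(hist - q_x).sum() * dx:.3f}")
```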
377
+
378
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_transform_en")
379
  return
380
 
 
403
  gr.Markdown(
404
  r"""
405
  This section mainly introduces the relationship between <b>KL divergence</b> and <b>conditional KL divergence</b>. Before the formal introduction, we will briefly introduce the definitions of <b>Entropy</b> and <b>Conditional Entropy</b>, as well as the inequality relationship between them, in preparation for the subsequent proof.
406
+
407
+ </br>
408
  <h3 style="font-size:20px">Entropy and Conditional Entropy</h3>
409
  For any two random variables $Z, X$, the <b>Entropy</b> is defined as follows<a href="#entropy">[16]</a>:
410
  \begin{align}
 
419
  \mathbf{H}(Z|X) \le \mathbf{H}(Z) \tag{A.3}
420
  \end{align}
421
That is to say, <b>the Conditional Entropy is always less than or equal to the Entropy</b>, and they are equal only when $X$ and $Z$ are independent. The proof of this relationship can be found in <a href="#cond_entropy">[17]</a>.
422
+
423
+ </br>
424
  <h3 style="font-size:20px">KL Divergence and Conditional KL Divergence</h3>
425
  In the same manner as the definition of Conditional Entropy, we introduce a new definition, <b>Conditional KL Divergence</b>, denoted as $KL_{\mathcal{C}}$. Since KL Divergence is non-symmetric, there exist two forms as follows.
426
  \begin{align}
 
469
  Another <b>important conclusion</b> can be drawn from equation A.15.
470
 
471
  The KL Divergence is often used to fit the distribution of data. In this scenario, the distribution of the data is denoted by $q(z)$ and the parameterized model distribution is denoted by $\textcolor{blue}{p_\theta(z)}$. During the optimization process, since both $q(z|x)$ and $q(x)$ remain constant, the term $\mathbf{H}(Z) - \mathbf{H}(Z|X)$ in Equation A.15 is a constant. Thus, the following relationship is obtained:
472
+ <span id="en_cond_kl_2">
473
\mathop{\min}_{\textcolor{blue}{p_\theta}} KL(q(z) \Vert \textcolor{blue}{p_\theta(z)}) \iff \mathop{\min}_{\textcolor{blue}{p_\theta}} \int \ q(x) KL(q(z|x) \Vert \textcolor{blue}{p_\theta(z)})dx \tag{A.25}
474
  </span>
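A quick discrete sanity check of this equivalence (the toy distributions below are chosen arbitrarily, and `p_z` stands in for $p_\theta(z)$): the weighted sum of $KL(q(z|x) \Vert p_\theta(z))$ differs from $KL(q(z) \Vert p_\theta(z))$ only by the constant $\mathbf{H}(Z) - \mathbf{H}(Z|X)$, so both objectives in A.25 share the same minimizer.

```python
import numpy as np

rng = np.random.default_rng(0)

# Toy discrete check: weighted conditional KL = KL(q(z)||p(z)) + H(Z) - H(Z|X),
# where the second term does not depend on p(z).
q_xz = rng.random((6, 5)); q_xz /= q_xz.sum()    # joint q(x, z): rows x, columns z
q_x, q_z = q_xz.sum(axis=1), q_xz.sum(axis=0)
q_z_given_x = q_xz / q_x[:, None]

kl = lambda a, b: np.sum(a * np.log(a / b))
H_z = -np.sum(q_z * np.log(q_z))
H_z_given_x = -np.sum(q_xz * np.log(q_z_given_x))

for _ in range(3):
    p_z = rng.random(5); p_z /= p_z.sum()        # an arbitrary model distribution p(z)
    weighted = sum(q_x[i] * kl(q_z_given_x[i], p_z) for i in range(6))
    print(f"{weighted - kl(q_z, p_z):.6f}  vs  H(Z) - H(Z|X) = {H_z - H_z_given_x:.6f}")
```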
475
 
476
  Comparing the above relationship with <b>Denoised Score Matching</b> <a href="#dsm">[18]</a>(equation A.26), some similarities can be observed. Both introduce a new variable $X$, and substitute the targeted fitting distribution q(z) with q(z|x). After the substitution, since q(z|x) is a conditional probability distribution, both consider all conditions and perform a weighted sum using the probability of the conditions occurring, $q(x)$, as the weight coefficient.
477
+ <span id="en_cond_kl_3">
478
\mathop{\min}_{\textcolor{blue}{\psi_\theta}} \frac{1}{2} \int q(z) \left\lVert \textcolor{blue}{\psi_\theta(z)} - \frac{\partial q(z)}{\partial z} \right\rVert^2 dz \iff \mathop{\min}_{\textcolor{blue}{\psi_\theta}} \int q(x)\ \overbrace{\frac{1}{2} \int q(z|x) \left\lVert \textcolor{blue}{\psi_\theta(z)} - \frac{\partial q(z|x)}{\partial z} \right\rVert^2 dz}^{\text{Score Matching of }q(z|x)}\ dx \tag{A.26}
479
  </span>
480
 
 
486
  return
487
 
488
 
489
+ def md_approx_gauss_en():
490
  global g_latex_del
491
 
492
+ title = "Appendix B When does the Posterior Approximate to Gaussian ?"
493
+ with gr.Accordion(label=title, elem_classes="first_md", elem_id="approx_gauss"):
494
  gr.Markdown(
495
  r"""
496
+ From equation 3.4, it can be seen that $q(x|z)$ takes the following form:
497
+ \begin{align}
498
+ q(x|z) &= \operatorname{Normalize} \Big(\ \frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}\ q(x)\ \Big)& \qquad &\text{where}\ \mu=\frac{z}{\sqrt{\alpha}}\quad \sigma=\sqrt{\frac{1-\alpha}{\alpha}} \tag{B.1} \newline
499
+ &\propto \underbrace{\frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}}_{\text{GaussFun}}\ q(x) \tag{B.2}
500
+ \end{align}
501
+
502
+ Below we will prove that if the following two assumptions are satisfied, $q(x|z)$ approximates a Gaussian distribution.
503
+ <ul>
504
+ <li>
505
+ Assume that within the support of GaussFun, $q(x)$ changes approximately linearly. Expand $q(x)$ around the mean of GaussFun using a Taylor series. According to the properties of Taylor expansion, this assumption is satisfied when the standard deviation $\sigma$ of GaussFun is sufficiently small.
506
+ \begin{align}
507
+ q(x) &\approx q(\mu) + \nabla_xq(\mu)(x-\mu)& \quad &\text{where}\quad q(\mu)\triangleq q(x)\bigg|_{x=\mu} \quad \nabla_xq(\mu)\triangleq \nabla_xq(x)\bigg|_{x=\mu} \tag{B.3} \newline
508
+ &= q(\mu)\big(1+ \frac{\nabla_xq(\mu)}{q(\mu)}(x-\mu)\big)& \tag{B.4} \newline
509
+ &= q(\mu)\big(1+ \nabla_x\log{q(\mu)}(x-\mu)\big)& \quad &\text{where}\quad \nabla_x\log{q(\mu)}\triangleq \nabla_x\log{q(x)}\bigg|_{x=\mu} \tag{B.5}
510
+ \end{align}
511
+ </li>
512
+ <li>
513
+ Assume that within the support of GaussFun, $\log\big(1+\nabla_x\log{q(\mu)}(x-\mu)\big)$ can be approximated by $\nabla_x\log{q(\mu)}(x-\mu)$. Expanding $\log(1+y)$ with a Taylor series, by the properties of Taylor expansion, $\log(1+y)$ can be approximated by $y$ when $\lVert y\rVert_2$ is small. When $\sigma$ is sufficiently small, $\lVert x-\mu\rVert_2$ will be small, and $\nabla_x\log{q(\mu)}(x-\mu)$ will also be small, hence the above assumption can be satisfied. Generally, when $\nabla_x\log{q(\mu)}(x-\mu)<0.1$, the approximation error is small enough to be negligible.
514
+ \begin{align}
515
+ \log(1+y) &\approx \log(1+y)\bigg|_{y=0} + \nabla_y\log(1+y)\bigg|_{y=0}(y-0) \tag{B.6} \newline
516
+ &= y \tag{B.7}
517
+ \end{align}
518
+ </li>
519
+ </ul>
520
+ Using the above two assumptions, $q(x|z)$ can be transformed into the following form:
521
+
522
+ \begin{align}
523
+ q(x|z) &\propto \frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}\ q(x) \tag{B.8} \newline
524
+ &\approx \frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}\ q(\mu)\big(1+ \nabla_x\log{q(\mu)}(x-\mu)\big) \tag{B.9} \newline
525
+ &= \frac{q(\mu)}{\sqrt{2\pi}\sigma}\exp\left(\frac{-(x-\mu)^2}{2\sigma^2}+\log\big(1+ \nabla_x\log{q(\mu)}(x-\mu)\big)\right) \tag{B.10} \newline
526
+ &\approx \frac{q(\mu)}{\sqrt{2\pi}\sigma}\exp\left(\frac{-(x-\mu)^2}{2\sigma^2}+\nabla_x\log{q(\mu)}(x-\mu)\right) \tag{B.11} \newline
527
+ &= \frac{q(\mu)}{\sqrt{2\pi}\sigma}\exp\left(-\frac{(x-\mu)^2-2\sigma^2\nabla_x\log{q(\mu)}(x-\mu)}{2\sigma^2}\right) \tag{B.12} \newline
528
+ &= \frac{q(\mu)}{\sqrt{2\pi}\sigma}\exp\left(-\frac{\big(x-\mu-\sigma^2\nabla_x\log{q(\mu)}\big)^2}{2\sigma^2}+\frac{\big(\sigma^2\nabla_x\log{q(\mu)}\big)^2}{2\sigma^2}\right) \tag{B.13} \newline
529
+ &= \exp\left(-\frac{\big(x-\mu-\sigma^2\nabla_x\log{q(\mu)}\big)^2}{2\sigma^2}\right) \underbrace{\frac{q(\mu)}{\sqrt{2\pi}\sigma} \exp\left( \frac{1}{2}\big(\sigma\nabla_x\log{q(\mu)}\big)^2\right)}_{\text{const}} \tag{B.14}
530
+ \end{align}
531
+
532
+ Here, Equation B.9 applies the conclusion of Assumption 1, and Equation B.11 applies the conclusion of Assumption 2.
533
+
534
+ The <em>const</em> factor in Equation B.14 does not depend on $x$ and therefore does not affect the shape of the function; since $q(x|z)$ must integrate to 1, this factor is fixed by normalization. Therefore, $q(x|z)$ is approximately a Gaussian probability density function with mean $\mu + \sigma^2 \nabla_x \log{q(\mu)}$ and variance $\sigma^2$.
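This conclusion can be verified numerically. The sketch below (a smooth toy $q(x)$ and a small $\sigma$, both assumed for illustration) compares the discretized $q(x|z)$ of Equation B.1 with the predicted Gaussian $\mathcal{N}\big(\mu + \sigma^2\nabla_x\log{q(\mu)},\ \sigma^2\big)$:

```python
import numpy as np

# Compare q(x|z) = Normalize(GaussFun * q(x)) with the Gaussian predicted above,
# for an assumed smooth toy q(x) and a small sigma (alpha close to 1).
alpha = 0.99
sigma = np.sqrt((1 - alpha) / alpha)
x = np.linspace(-3, 3, 6001); dx = x[1] - x[0]
q_x = np.exp(-(x - 0.5)**2 / 0.8) + 0.7 * np.exp(-(x + 0.7)**2 / 0.5)
q_x /= (q_x * dx).sum()
score = np.gradient(np.log(q_x), dx)                 # d/dx log q(x)

z = 0.3
mu = z / np.sqrt(alpha)
post = np.exp(-(x - mu)**2 / (2 * sigma**2)) * q_x
post /= (post * dx).sum()                            # exact (discretized) q(x|z)

mu_hat = mu + sigma**2 * score[np.argmin(np.abs(x - mu))]
approx = np.exp(-(x - mu_hat)**2 / (2 * sigma**2)) / (np.sqrt(2 * np.pi) * sigma)
print(f"max |q(x|z) - Gaussian approx| = {np.abs(post - approx).max():.4f}"
      f"  (peak of q(x|z) ~ {post.max():.2f})")
```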
535
+ """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_approx_gauss_en")
536
+
537
+ return
538
+
539
+
540
+ def md_non_expanding_en():
541
+ global g_latex_del
542
+
543
+ title = "Appendix C Posterior Transform is a Non-expanding Mapping"
544
+ with gr.Accordion(label=title, elem_classes="first_md", elem_id="non_expanding"):
545
+ gr.Markdown(
546
+ r"""
547
+ <b>Corollary 1</b>
548
+
549
+ Using KL Divergence as a metric, the transition transform of a Markov chain is non-expanding<a href="#elem">[23]</a>, which means
550
+ \begin{align}
551
+ KL\big(p(x), q(x)\big) &\le KL\big(p(z), q(z)\big) \tag{C.1} \newline
552
+ \end{align}
553
+ Here, $p(z)$ and $q(z)$ are arbitrary probability density functions, and $r(x|z)$ is the transition probability density function of the Markov chain. We have $p(x) = \int r(x|z)p(z)dz$ and $q(x) = \int r(x|z)q(z)dz$.
554
+
555
+ Proof:
556
+
557
+ For the KL divergence of $p(x,z)$ and $q(x,z)$, the following relationship exists:
558
+ \begin{align}
559
+ KL\big(p(x,z), q(x,z)\big) &= \iint p(x,z)\log \frac{p(x,z)}{q(x,z)}dxdz \tag{C.2} \newline
560
+ & = \iint p(x,z)\log \frac{p(x)p(z|x)}{q(x)q(z|x)}dxdz \tag{C.3} \newline
561
+ &= \iint p(x,z)\log \frac{p(x)}{q(x)}dxdz + \iint p(x,z) \log\frac{p(z|x)}{q(z|x)} dxdz \tag{C.4} \newline
562
+ &= \int \int p(x,z) dz\ \log \frac{p(x)}{q(x)}dx + \int p(x)\int p(z|x) \log\frac{p(z|x)}{q(z|x)} dz\ dx \tag{C.5} \newline
563
+ &= KL\big(p(x), q(x)\big) + \int p(x) KL\big(p(z|x), q(z|x)\big)dx \tag{C.6} \newline
564
+ \end{align}
565
+
566
+ Similarly, by swapping the roles of $Z$ and $X$, the following relationship can be obtained:
567
+ \begin{align}
568
+ KL\big(p(x,z), q(x,z)\big) &= KL\big(p(z), q(z)\big) + \int p(z) KL\big(p(x|z), q(x|z)\big)dz \tag{C.7}
569
+ \end{align}
570
+
571
+ Comparing the two equations, we can obtain:
572
+ \begin{align}
573
+ KL\big(p(z), q(z)\big) + \int p(z) KL\big(p(x|z), q(x|z)\big)dz = KL\big(p(x), q(x)\big) + \int p(x) KL\big(p(z|x), q(z|x)\big)dx \tag{C.8}
574
+ \end{align}
575
+
576
+ Since $q(x|z)$ and $p(x|z)$ are both transition probability densities of the Markov chain, equal to $r(x|z)$, the integral $\int p(z) KL\big(p(x|z), q(x|z)\big)dz$ equals 0. Therefore, the above equation simplifies to:
577
+ \begin{align}
578
+ KL\big(p(x), q(x)\big) = KL\big(p(z), q(z)\big) - \int p(x) KL\big(p(z|x), q(z|x)\big)dx \tag{C.9}
579
+ \end{align}
580
+
581
+ Since KL divergence is always greater than or equal to 0, the weighted sum $\int p(x) KL\big(p(z|x), q(z|x)\big)dx$ is also greater than or equal to 0. Therefore, we can conclude:
582
+ \begin{align}
583
+ KL\big(p(x), q(x)\big) \le KL\big(p(z), q(z)\big) \tag{C.10}
584
+ \end{align}
585
+
586
+ </br>
587
+
588
+ The condition for the above equation to hold is that $\int p(x) KL\big(p(z|x), q(z|x)\big)dx$ equals 0, which requires that for different conditions $x$, $p(z|x)$ and $q(z|x)$ must be equal. In most cases, when $p(z)$ and $q(z)$ are different, $p(z|x)$ and $q(z|x)$ are also different. This means that in most cases, we have
589
+ \begin{align}
590
+ KL\big(p(x), q(x)\big) < KL\big(p(z), q(z)\big) \tag{C.11}
591
+ \end{align}
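Equation C.9, and hence C.10/C.11, can be verified directly on small discrete distributions. The sketch below uses arbitrary toy vectors and a random column-stochastic matrix standing in for $r(x|z)$:

```python
import numpy as np

rng = np.random.default_rng(1)

# Toy discrete verification of Equation C.9: push two arbitrary distributions
# through the same column-stochastic transition r(x|z).
n = 8
r = rng.random((n, n)); r /= r.sum(axis=0, keepdims=True)   # r[x, z]
p_z = rng.random(n); p_z /= p_z.sum()
q_z = rng.random(n); q_z /= q_z.sum()

kl = lambda a, b: np.sum(a * np.log(a / b))
p_x, q_x = r @ p_z, r @ q_z                                  # marginals after the transition
p_z_given_x = (r * p_z) / p_x[:, None]                       # p(z|x) = r(x|z) p(z) / p(x)
q_z_given_x = (r * q_z) / q_x[:, None]
gap = np.sum(p_x * np.sum(p_z_given_x * np.log(p_z_given_x / q_z_given_x), axis=1))

print(f"KL(p(z), q(z)) = {kl(p_z, q_z):.4f}")
print(f"KL(p(x), q(x)) = {kl(p_x, q_x):.4f}   C.9 predicts {kl(p_z, q_z) - gap:.4f}")
```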
592
+
593
+ </br></br>
594
+ <b>Corollary 2</b>
595
+
596
+ Using Total Variance (L1 distance) as a metric, the transition transform of a Markov chain is non-expanding, which means
597
+ \begin{align}
598
+ \left\lVert p(x)-q(x) \right\rVert_1\ &\le\ \left\lVert p(z) - q(z) \right\rVert_1 \tag{C.12}
599
+ \end{align}
600
+
601
+ Here, $p(z)$ and $q(z)$ are arbitrary probability density functions, and $r(x|z)$ is the transition probability density function of a Markov chain. We have $p(x) = \int r(x|z)p(z)dz$ and $q(x) = \int r(x|z) q(z) dz$.
602
+
603
+ Proof:
604
+ \begin{align}
605
+ \left\lVert p(x)-q(x) \right\rVert_1\ &= \int \big\lvert p(x) - q(x) \big\rvert dx \tag{C.13} \newline
606
+ &= \int \left\lvert \int r(x|z) p(z) dz - \int r(x|z)q(z)dz \right\rvert dx \tag{C.14} \newline
607
+ &= \int \left\lvert \int r(x|z) \big(p(z)-q(z)\big) dz \right\rvert dx \tag{C.15} \newline
608
+ &\le \int \int r(x|z) \left\lvert \big(p(z)-q(z)\big) \right\rvert dz dx \tag{C.16} \newline
609
+ &= \int \int r(x|z)dx \left\lvert \big(p(z)-q(z)\big) \right\rvert dz \tag{C.17} \newline
610
+ &= \int \left\lvert \big(p(z)-q(z)\big) \right\rvert dz \tag{C.18} \newline
611
+ &= \left\lVert p(z) - q(z) \right\rVert_1 \tag{C.19}
612
+ \end{align}
613
+
614
+ Here, Equation C.16 applies the Absolute Value Inequality, while Equation C.18 utilizes the property of $r(x|z)$ being a probability distribution.
615
+
616
+ Proof completed.
617
+
618
+ </br>
619
+
620
+ Figure C.1 shows an example of a one-dimensional random variable, which can help better understand the derivation process described above.
621
 
622
The condition for equality to hold is that all non-zero terms inside each absolute value bracket have the same sign. As shown in Figure C.1(a), there are five absolute value brackets, each corresponding to a row, with five terms in each bracket. Equality holds if and only if all non-zero terms in each row have the same sign. If terms with different signs occur, this leads to $\lVert p(x)-q(x) \rVert_1\ <\ \lVert p(z) - q(z) \rVert_1$. The number of sign differences is related to the number of nonzero elements of the transition probability matrix: in general, the more nonzero elements there are, the more sign differences tend to occur.
623
+
624
+ For the posterior transform, generally, when $\alpha$ decreases (more noise), the transition probability density function will have more nonzero elements, as shown in Figure C.2(a); whereas when $\alpha$ increases (less noise), the transition probability density function will have fewer nonzero elements, as shown in Figure C.2(b).
 
 
 
 
625
 
626
+ So, there is the following property: <b>the smaller $\alpha$ is, the further $\lVert p(x)-q(x) \rVert_1$ falls below $\lVert p(z) - q(z) \rVert_1$, which means the shrinking rate of the posterior transform is larger.</b>
 
627
 
628
+ <center> <img src="file/C1.png" width="1024" style="margin-top:12px"/> </center>
629
+ <center> Figure C.1: Non-expanding under L1 norm </center>
630
+ </br>
631
+ <center> <img src="file/C2.png" width="568" style="margin-top:12px"/> </center>
632
+ <center> Figure C.2: More non-zero elements as $\alpha$ gets smaller </center>
633
+ """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_non_expanding_en")
634
+
635
+ return
636
+
637
+
638
+ def md_stationary_en():
639
+ global g_latex_del
640
+
641
+ title = "Appendix D Posterior Transform Converges to the Unique Stationary Distribution"
642
+ with gr.Accordion(label=title, elem_classes="first_md", elem_id="stationary"):
643
+ gr.Markdown(
644
+ r"""
645
+ According to the conclusion of Theorem 3 in <a href="#mc_basic_t3">[19]</a>, <b>an aperiodic and irreducible Markov chain will converge to a unique stationary distribution</b>.
646
+
647
+ The following will show that under certain conditions, the posterior transform is the transition probability density function of an <b>aperiodic and irreducible Markov chain</b>.
648
+
649
+ For convenience, the forward transform of the diffusion model is described below in a more general form.
650
+ \begin{align}
651
+ Z = \sqrt{\alpha}X + \sqrt{\beta}\ \epsilon \tag{D.1} \newline
652
+ \end{align}
653
+
654
+ As described in <a href="#transform">Section 1</a>, $\sqrt{\alpha}X$ narrows the probability density function of $X$, so $\alpha$ controls the narrowing intensity, while $\beta$ controls the amount of noise added(smoothing). When $\beta = 1 - \alpha$, the above transform is consistent with the equation 1.1.
655
 
656
+ The form of the posterior probability distribution corresponding to the new transformation is as follows:
657
+ \begin{align}
658
+ q(x|z=c) = \operatorname{Normalize} \Big(\ \overbrace{\frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}}^{\text{GaussFun}}\ q(x)\ \Big) \tag{D.2} \newline
659
+ \text{where}\ \mu=\frac{c}{\sqrt{\alpha}}\qquad \sigma=\sqrt{\frac{\beta}{\alpha}} \qquad \text{$c$ is a fixed value} \notag
660
+ \end{align}
661
+
662
+ When $\beta = 1 - \alpha$, the above transform is consistent with the equation 3.4.
663
+
664
+ For convenience, let $g(x)$ represent GaussFun in Equation D.2.
665
+
666
+ Since $\sqrt{\alpha}X$ narrows the probability density function $q(x)$ of $X$, this makes the analysis of the aperiodicity and irreducibility of the transition probability density function $q(x|z)$ more complex. Therefore, for the sake of analysis, we first assume $\alpha = 1$ and later analyze the case when $\alpha \neq 1$ and $\beta = 1 - \alpha$.
667
+
668
+ <center> <img src="file/D1.png" width="960" style="margin-top:12px"/> </center>
669
+ <center> Figure D.1: Only one component in support </center>
670
+
671
+ <center> <img src="file/D2.png" width="960" style="margin-top:12px"/> </center>
672
+ <center> Figure D.2: Two components which can communicate with each other </center>
673
+
674
+ </br>
675
+ <h3 style="font-size:24px"> $\alpha=1$ </h3>
676
+
677
+ When $\alpha=1$, if $q(x)$ and $\beta$ satisfy either of the following two conditions, the Markov chain corresponding to $q(x|z)$ is aperiodic and irreducible.
678
+
679
+ <ol style="list-style-type:decimal">
680
+ <li>If the support of $q(x)$ contains only one connected component.</li>
681
+ <li>If the support of $q(x)$ has multiple connected components, but the distance between each connected component is less than $3$ times $\sigma$. In other words, the gaps can be covered by the radius of the effective region of $g(x)$.</li>
682
  </ol>
683
+
684
+ Proof:
685
+
686
+ <ol style="list-style-type:decimal">
687
+ <li>
688
+ For any point $c$ in the support of $q(x)$, when $z=c$ and $x=c$, $q(x=c)>0$; from Equation D.2, we know that the center of $g(x)$ is located at $c$, so $g(x)$ is also greater than 0 at $x=c$. Therefore, according to characteristics of multiplication in the equation D.2, $q(x=c|z=c)>0$. Hence, the Markov chain corresponding to $q(x|z)$ is aperiodic.
689
+
690
+ For any point $c$ in the support of $q(x)$, when $z=c$, the center of $g(x)$ is located at $c$, so there exists a hypersphere with $c$ as its center ($\lVert x-c\rVert_2 < \delta$). Within this hypersphere, $q(x|z=c)>0$, which means that state $c$ can access nearby states. Since every state in the support has this property, all states within the entire support form a $\textcolor{red}{\text{Communicate Class}}$ <a href="#mc_basic_d4">[14]</a>. Therefore, the Markov chain corresponding to $q(x|z)$ is irreducible.
691
 
692
+ Therefore, a Markov chain that satisfies condition 1 is aperiodic and irreducible. See the example in Figure D.1, which illustrates a single connected component
 
 
693
  </li>
694
+
695
+ <li>
696
+ When the support set of $q(x)$ has multiple connected components, the Markov chain may have multiple communicate classes. However, if the gaps between components are smaller than $3\sigma$(standard deviation of $g(x)$), the states of each component can access each other. Thus, the Markov chain corresponding to $q(x|z)$ will have only one communicate class, similar to the case in condition 1. Therefore, a Markov chain that satisfies condition 2 is aperiodic and irreducible.
697
 
698
+ In Figure D.2, an example of multiple connected components is shown; a small numerical sketch of both cases is given after this list.
699
+ </li>
 
700
  </ol>
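Conditions 1 and 2 can be illustrated numerically. In the toy sketch below (an assumed discretization with $\alpha=1$; the GaussFun is truncated at $3\sigma$ to mimic its effective region), $q(x)$ has two components of equal width, and we check whether all support states communicate when the gap is smaller versus larger than $3\sigma$:

```python
import numpy as np

# Toy check of conditions 1/2 (alpha = 1): q(x) has two components of width 0.5
# separated by `gap`; g(x) is truncated at 3*sigma to mimic its effective region.
def support_states_communicate(gap, beta=0.05):
    sigma = np.sqrt(beta)                                    # std of g(x) when alpha = 1
    x = np.linspace(-2, 2, 401)
    q_x = ((np.abs(x + gap / 2 + 0.25) < 0.25) |
           (np.abs(x - gap / 2 - 0.25) < 0.25)).astype(float)
    g = np.exp(-(x[:, None] - x[None, :])**2 / (2 * sigma**2))
    g[np.abs(x[:, None] - x[None, :]) > 3 * sigma] = 0.0     # effective region of g(x)
    Q = g * q_x[:, None]                                     # proportional to q(x|z)
    sup = q_x > 0
    A = (Q[np.ix_(sup, sup)] > 0).astype(int)                # one-step accessibility graph
    R = np.eye(A.shape[0], dtype=int) + A
    for _ in range(10):                                      # transitive closure by squaring
        R = (R @ R > 0).astype(int)
    return bool(R.all())

sigma = np.sqrt(0.05)
print("gap = 0.5*sigma, all support states communicate:", support_states_communicate(0.5 * sigma))
print("gap = 6.0*sigma, all support states communicate:", support_states_communicate(6.0 * sigma))
```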
701
+
702
+ <center> <img src="file/D3.png" width="960" style="margin-top:12px"/> </center>
703
+ <center> Figure D.3: Two components which <b>cannot</b> communicate with each other </center>
704
+
705
+ </br>
706
+ <h3 style="font-size:24px"> $\alpha \neq 1$ </h3>
707
+
708
+ When $\alpha \neq 1$, for any point $c$ within the support of $q(x)$, it follows from Equation D.2 that the center of $g(x)$ is no longer $c$ but rather $\frac{c}{\sqrt{\alpha}}$. That is to say, the center of $g(x)$ deviates from $c$, with the deviation distance being $\lVert c\rVert(\frac{1-\sqrt{\alpha}}{\sqrt{\alpha}})$. It can be observed that the larger $\lVert c\rVert$ is, the greater the deviation. See the examples in Figures D.4(c) and D.4(d) for specifics. In Figure D.4(d), when $z=2.0$, the center of $g(x)$ is noticeably offset from $x=2.0$. This phenomenon is referred to in this article as <b>the Center Deviation Phenomenon</b>.
709
 
710
+ The <b>Center Deviation Phenomenon</b> will affect the properties of some states in the Markov chain.
711
+
712
+ When the deviation distance is significantly greater than $3\sigma$, $g(x)$ may be zero at $x = c$ and its vicinity. Consequently, $q(x=c|z=c)$ may also be zero, and $q(x|z=c)$ in the vicinity of $x = c$ may also be zero. Therefore, state $c$ may not be able to access nearby states and may be periodic. This is different from the case when $\alpha=1$. Refer to the example in Figure D.5: the $\textcolor{green}{\text{green curve}}$ represents $g(x)$ for $z=6.0$, and the $\textcolor{orange}{\text{orange curve}}$ represents $q(x|z=6.0)$. Because the center of $g(x)$ deviates too much from $x=6.0$, $q(x=6.0|z=6.0)=0$.
713
 
714
+ When the deviation distance is significantly less than $3\sigma$, $g(x)$ is non-zero at $x = c$ and its vicinity. Consequently, $q(x=c|z=c)$ will not be zero, and $q(x|z=c)$ in the vicinity of $x = c$ will also not be zero. Therefore, state $c$ can access nearby states and is aperiodic.
 
715
 
716
+ Under what conditions for $c$ will the deviation distance of the center of $g(x)$ be less than $3\sigma$?
717
+
718
  \begin{align}
719
+ \lVert c\rVert(\frac{1-\sqrt{\alpha}}{\sqrt{\alpha}})\ <\ 3\frac{\sqrt{\beta}}{\sqrt{\alpha}} \qquad \Rightarrow \qquad \lVert c\rVert \ <\ 3\frac{\sqrt{\beta}}{1-\sqrt{\alpha}} \tag{D.3} \newline
720
  \end{align}
721
+
722
+ From the above, it is known that there exists an upper limit such that as long as $\lVert c\rVert$ is less than this upper limit, the deviation amount will be less than $3\sigma$.
723
+
724
+ When $\beta=1-\alpha$, the above expression becomes
725
+ \begin{align}
726
+ \lVert c\rVert \ <\ 3\frac{\sqrt{1-\alpha}}{1-\sqrt{\alpha}} \tag{D.4} \newline
727
+ \end{align}
728
+
729
+ $3\frac{\sqrt{1-\alpha}}{1-\sqrt{\alpha}} = 3\sqrt{\frac{1+\sqrt{\alpha}}{1-\sqrt{\alpha}}}$ is strictly monotonically increasing in $\alpha$, and approaches its infimum $3$ as $\alpha \to 0$.
730
+
731
+ When $\alpha \in (0, 1)$,
732
  \begin{align}
733
+ 3\frac{\sqrt{1-\alpha}}{1-\sqrt{\alpha}} > 3 \tag{D.5} \newline
 
 
 
 
 
734
  \end{align}
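A quick numerical confirmation of this bound over an arbitrary grid of $\alpha$ values:

```python
import numpy as np

# The bound 3*sqrt(1-alpha)/(1-sqrt(alpha)) stays above 3 on (0, 1) and grows with alpha.
alpha = np.linspace(0.001, 0.999, 999)
bound = 3 * np.sqrt(1 - alpha) / (1 - np.sqrt(alpha))
print(bound.min() > 3, np.all(np.diff(bound) > 0))   # expected: True True
```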
735
+
736
+ Based on the analysis above, the following conclusion can be drawn
737
+
738
+ <ol style="list-style-type:decimal">
739
+ <li>
740
+ <b>If the support of $q(x)$ contains only one connected component, and the points of the support set are all within a distance less than $3\frac{\sqrt{1-\alpha}}{1-\sqrt{\alpha}}$ from the origin, then the Markov chain corresponding to $q(x|z)$ will be aperiodic and irreducible.</b>
741
+ </li>
742
+
743
+ <li>
744
+ If the support of $q(x)$ contains multiple connected components, the accurate determination of whether two components can access each other becomes more complex due to the Center Deviation Phenomenon of $g(x)$. We will not delve into further analysis here and only give a conservative conclusion: <b>if the points of the support all lie within a distance less than $1$ from the origin, and the gaps between connected components are all less than $2\sigma$, then the Markov chain corresponding to $q(x|z)$ will be aperiodic and irreducible.</b>
745
+ </li>
746
+ </ol>
747
+
748
+ <center> <img src="file/D4.png" width="1280" style="margin-top:12px"/> </center>
749
+ <center> Figure D.4: Center Deviation of the GaussFun </center>
750
+ </br>
751
+ <center> <img src="file/D5.png" width="568" style="margin-top:12px"/> </center>
752
+ <center> Figure D.5: Deviation is More Than $3\sigma$ </center>
753
+
754
+ """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_stationary_en")
755
+
756
  return
757
 
758
 
 
760
  global g_latex_del
761
 
762
  with gr.Accordion(label="Reference", elem_classes="first_md", elem_id="reference"):
 
763
  gr.Markdown(
764
  r"""
765
  <a id="dpm" href="https://arxiv.org/abs/1503.03585"> [1] Deep Unsupervised Learning Using Nonequilibrium Thermodynami </a>
 
777
  <a id="mc_limit" href="https://stats.libretexts.org/Bookshelves/Probability_Theory/Book%3A_Introductory_Probability_(Grinstead_and_Snell)/11%3A_Markov_Chains/11.04%3A_Fundamental_Limit_Theorem_for_Regular_Chains"> [7] Fundamental Limit Theorem for Regular Chains </a>
778
 
779
  <a id="mc_basic_p6" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [8] Markov Chain:Basic Theory - Proposition 6 </a>
780
+
781
  <a id="fp_converse" href="https://arxiv.org/abs/1702.07339"> [9] A Converse to Banach's Fixed Point Theorem and its CLS Completeness </a>
782
 
783
  <a id="ce_kl" href="https://en.wikipedia.org/wiki/Cross-entropy#Cross-entropy_minimization"> [10] Cross-entropy minimization </a>
784
+
785
  <a id="deconv_1" href="https://thewolfsound.com/deconvolution-inverse-convolution/"> [11] Deconvolution Using Frequency-Domain Division </a>
786
+
787
  <a id="deconv_2" href="https://www.strollswithmydog.com/deconvolution-by-division-in-the-frequency-domain/"> [12] deconvolution-by-division-in-the-frequency-domain </a>
788
+
789
  <a id="mc_basic_t7" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [13] Markov Chain:Basic Theory - Theorem 7 </a>
790
+
791
  <a id="mc_basic_d4" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [14] Markov Chain:Basic Theory - Definition 4 </a>
792
+
793
  <a id="vdm" href="https://arxiv.org/pdf/2107.00630"> [15] Variational Diffusion Models </a>
794
+
795
+ <a id="entropy" href="https://en.wikipedia.org/wiki/Entropy"> [16] Entropy </a>
796
+
797
+ <a id="cond_entropy" href="https://en.wikipedia.org/wiki/Conditional_entropy"> [17] Conditional Entropy </a>
798
+
799
+ <a id="dsm" href="https://www.iro.umontreal.ca/~vincentp/Publications/smdae_techreport_1358_v1.pdf"> [18] A Connection Between Score Matching and Denoising autoencoders </a>
800
+
801
+ <a id="mc_basic_t3" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [19] Markov Chain:Basic Theory - Theorem 3 </a>
802
+
803
+ <a id="mc_mt_lambda" href="https://pages.uoregon.edu/dlevin/MARKOV/markovmixing.pdf"> [20] Markov Chains and Mixing Times, second edition - 12.2 The Relaxation Time </a>
804
+
805
+ <a id="non_neg_lambda" href="https://link.springer.com/book/10.1007/0-387-32792-4"> [21] Non-negative Matrices and Markov Chains - Theorem 2.10 </a>
806
+
807
+ <a id="prml_mcmc" href="https://www.microsoft.com/en-us/research/uploads/prod/2006/01/Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf"> [22] Pattern Recognition and Machine Learning - 11.2. Markov Chain Monte Carlo </a>
808
+
809
+ <a id="elem" href="https://cs-114.org/wp-content/uploads/2015/01/Elements_of_Information_Theory_Elements.pdf"> [23] Elements_of_Information_Theory_Elements - 2.9 The Second Law of Thermodynamics </a>
810
+
811
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_reference_en")
812
 
813
  return
 
854
 
855
  md_cond_kl_en()
856
 
857
+ md_approx_gauss_en()
858
+
859
+ md_non_expanding_en()
860
+
861
+ md_stationary_en()
862
 
863
  md_reference_en()
864
 
RenderMarkdownZh.py CHANGED
@@ -78,31 +78,45 @@ def md_posterior_zh():
78
  q(x|z) = \frac{q(z|x)q(x)}{q(z)} \tag{3.1}
79
  \end{align}
80
 
81
- 当$z$是取固定值时,$q(z)$是常数,所以$q(x|z)$的形状只与${q(z|x)q(x)}$有关。
82
  \begin{align}
83
- q(x|z) \propto q(z|x)q(x) \qquad where\ z\ is\ fixed \tag{3.2}
84
  \end{align}
 
 
 
 
 
 
85
  由式2.1可知,$q(z|x)$为高斯分布,于是有
86
  \begin{align}
87
- q(x|z) &\propto \frac{1}{\sqrt{2\pi(1-\alpha)}}\exp{\frac{-(z-\sqrt{\alpha}x)^2}{2(1-\alpha)}}\ q(x)& \qquad &where\ z\ is\ fixed \tag{3.3} \newline
88
- &= \frac{1}{\sqrt{\alpha}} \underbrace{\frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}}_{\text{GaussFun}}\ q(x)& \qquad &where\ \mu=\frac{z}{\sqrt{\alpha}}\quad \sigma=\sqrt{\frac{1-\alpha}{\alpha}} \tag{3.4}
 
89
  \end{align}
90
 
91
- 可以看出,<b>GaussFun</b>部分是关于$x$的高斯函数,均值为$\frac{z}{\sqrt{\alpha}}$,方差为$\sqrt{\frac{1-\alpha}{\alpha}}$,所以$q(x|z)$的形状由“<b>GaussFun与$q(x)$相乘</b>”决定。
92
 
93
  根据”乘法“的特点,可以总结$q(x|z)$函数形状具有的特点。
94
  <ul>
95
- <li> 当高斯函数的方差较小(较小噪声),或者$q(x)$变化缓慢时,$q(x|z)$的形状将近似于高斯函数,函数形式较简单,方便建模学习。</li>
 
96
  <li> 当高斯函数的方差较大(较大噪声),或者$q(x)$剧烈变化时,$q(x|z)$的形状将较复杂,与高斯函数有较大的差别,难以建模学习。</li>
97
  </ul>
98
 
 
 
99
  具体可看<a href="#demo_2">Demo 2</a>,左4图给出后验概率分布$q(x|z)$的形态,可以看出,其形状较不规则,像一条弯曲且不均匀的曲线。当$\alpha$较大时(噪声较小),曲线将趋向于均匀且笔直。读者可调整不同的$\alpha$值,观察后验概率分布与噪声大小的关系;左5图,$\textcolor{blue}{蓝色虚线}$给出$q(x)$,$\textcolor{green}{绿色虚线}$给出式3.4中的GaussFun,$\textcolor{orange}{黄色实线}$给出两者相乘并归一化的结果,即固定z条件下后验概率$q(x|z=fixed)$。读者可调整不同z值,观察$q(x)$的波动变化对后验概率$q(x|z)$形态的影响。
100
 
101
  两个特殊状态下的后验概率分布$q(x|z)$值得考虑一下。
102
  <ul>
103
- <li> 当$\alpha \to 0$时,GaussFun的方差趋向于<b>无穷大</b>,不同$z$值的$q(x|z)$几乎变成一致,并与$q(x)$几乎相同。读者可在<a href="#demo_2">Demo 2</a>中,将$\alpha$设置为0.01,观察具体的结果。</li>
104
- <li> 当$\alpha \to 1$时,GaussFun的方差趋向于<b>无穷小</b>,不同$z$值的$q(x|z)$收缩成一系列不同偏移量的Dirac delta函数, 偏移量等于$z$。但有一些例外,当q(x)存在为零的区域时,其对应的q(x|z)将不再为Dirac delta函数,而是零函数。可在<a href="#demo_2">Demo 2</a>中��将$\alpha$设置为0.999,观察具体的结果。</li>
105
  </ul>
 
 
 
 
106
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_zh")
107
  return
108
 
@@ -145,18 +159,22 @@ def md_forward_process_zh():
145
  q(z_t|x) &= \mathcal{N}(\sqrt{\alpha_1\alpha_2\cdots\alpha_t}x,\ 1-\alpha_1\alpha_2\cdots\alpha_t) = \mathcal{N}(\sqrt{\bar{\alpha_t}}x,\ 1-\bar{\alpha_t}) \qquad where\ \bar{\alpha_t} \triangleq \prod_{j=1}^t\alpha_j \tag{4.8}
146
  \end{align}
147
 
148
- 比较式4.8和式2.1的形式,可发现,两者的形式是完全一致的。如果只关注最终变换后的分布$q(z_t)$,那么连续t次的小变换可用一次大变换替代,大变换的$\alpha$是各个小变换的$\alpha$累积。
 
 
 
 
149
 
150
  在DDPM[\[2\]](#ddpm)论文中,作者使用了1000步(T=1000),将数据分布$q(x)$转换至$q(z_T)$,$q(z_T|x)$的概率分布如下:
151
  \begin{align}
152
  q(z_T|x) &= \mathcal{N}(0.00635\ x,\ 0.99998) \tag{4.9}
153
  \end{align}
154
 
155
- 如果只考虑边际分布$q(z_T)$,也可使用一次变换代替,变换如下:
156
  \begin{align}
157
  Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{4.10}
158
  \end{align}
159
- 可以看出,应用两种变换后,变换后的分布$q(z_T|x)$相同,因此,$q(z_T)$也相同。
160
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_forward_process_zh")
161
  return
162
 
@@ -184,7 +202,7 @@ def md_backward_process_zh():
184
 
185
  在<a href="#posterior">第3节</a>中,我们考虑了两个特殊的后验概率分布。接下来,分析其对应的”后验概率变换“。
186
  <ul>
187
- <li> 当$\alpha \to 0$时,不同$z$值的$q(x|z)$均与$q(x)$几乎相同,也就是说,线性加权和的基函数几乎相同。此状态下,不管输入如何变化,变换的输出总为$q(x)$。</li>
188
  <li> 当$\alpha \to 1$时,不同$z$值的$q(x|z)$收缩成一系列不同偏移量的Dirac delta函数及零函数。此状态下,只要输入分布的支撑集(support set)包含于$q(x)$的支撑集,变换的输出与输入将保持一致。</li>
189
  </ul>
190
 
@@ -192,11 +210,13 @@ def md_backward_process_zh():
192
  \begin{align}
193
  Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{5.5}
194
  \end{align}
195
- 由于$\alpha=0.0000403$非常小,其对应的GaussFun(式3.4)的标准差达到157.52,而$X$的范围限制在$[-1, 1]$,远小于GaussFun的标准差。在$x \in [-1, 1]$范围内,GaussFun应该接近于常量,没有什么变化,所以不同的$z_T$对应的$q(x|z_T)$均与$q(x)$几乎相同。在这种状态下,对于$q(x|z_T)$相应的后验概率变换,不管输入分布是什么,输出分布都将是$q(x)$。
196
 
197
- <b>所以,理论上,在DDPM模型中,无需非得使用标准正态分布代替$q(z_T)$,也可使用其它任意的分布代替。</b>
198
 
199
  读者可亲自做一个类似的实验。在<a href="#demo_3_1">Demo 3.1</a>中,将start_alpha设置0.25,end_alpha也设置为0.25,step设置为7,此时$q(z_7)=\sqrt{0.000061}X + \sqrt{1-0.000061}\epsilon$,与DDPM的$q(z_T)$基本相似。点击<b>apply</b>执行前向变换($\textcolor{blue}{蓝色曲线}$),为接下来的反向恢复做准备。在<a href="#demo_3_2">Demo 3.2</a>中,noise_ratio设置为1,为末端分布$q(z_7)$引入100%的噪声,切换nose_random_seed的值可改变噪声的分布,取消选择backward_pdf,减少画面的干扰。点击<b>apply</b>将通过后验概率变换恢复$q(x)$,将会看到,不管输入的$q(z_7)$的形状如何,恢复的$q(x)$均与原始的$q(x)$完全相同, JS Divergence为0,恢复的过程使用$\textcolor{red}{红色曲线}$画出。
 
 
200
 
201
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_backward_process_zh")
202
  return
@@ -287,19 +307,20 @@ def md_posterior_transform_zh():
287
 
288
  gr.Markdown(
289
  r"""
290
- <h3 style="font-size:18px"> 压缩映射及收敛点 </h3>
291
  \begin{align}
292
  q(x) &= \int q(x,z) dz = \int q(x|z)q(z)dz \tag{7.1}
293
  \end{align}
294
 
295
- 经过大量一维随机变量的实验发现,后验概率变换呈现出“压缩映射”(Contraction Mapping[\[6\]](#ctr))的特征。也是说,对任意的两个概率分布$q_{i1}(z)和q_{i2}(z)$,经过后验概率变换后得到$q_{o1}(x)$和$q_{o2}(x)$,$q_{o1}(z)$和$q_{o2}(z)$的距离<b>总是小于</b>$q_{i1}(x)$和$q_{i2}(x)$的距离。这里的距离可使用JS Divergence或Total Variance或度量。并且,这个压缩映射的压缩程度跟所加噪声大小正相关。
296
  \begin{align}
297
- dist(q_{o1}(z),\ q_{o2}(z)) < dist(q_{i1}(x),\ q_{i2}(x)) \tag{7.2}
298
  \end{align}
 
299
 
300
  读者可查看<a href="#demo_4_1">Demo 4.1</a>,左侧三个图呈现一个变换的��程,左1图是任意的数据分布$q(x)$,左3图是变换后的概率分布,左2图是后验概率分布。可更改随机种子生成新的数据分布,调整$\alpha$值引入不同程度的噪声。左侧最后两个图展示变换的“压缩性质”,左4图展示随机生成的两个输入分布,同时给出其距离度量值$div_{in}$;左5图展示经过变换后的两个输出分布,输出分布之间的距离标识为$div_{out}$。读者可改变输入的随机种子,切换不同的输入。可在图中看到,对于任意的输入,$div_{in}$总是小于$div_{out}$。另外,也可改变$\alpha$的值,将会看到,$\alpha$越小(噪声越大),$\frac{div_{out}}{div_{in}}$的比值也越小,即收缩率越大。
301
 
302
- 由Banach fixed-point theorem<a href="#fixed_point">[5]</a>可知,压缩映射存在惟一一个定点(收敛点)。也就是说,对于任意的输入分布,可以连续迭代应用“后验概率变换”,只要迭代次数足够多,最终都会输出同一个分布。经过大量一维随机变量实验发现,定点(收敛点)<b>位于$q(x)$附近</b>。并且,与$\alpha$的值有关,$\alpha$越小(噪声越大),离得越近。
303
 
304
  读者可看<a href="#demo_4_2">Demo 4.2</a>,此部分展示迭代收敛的例子。选择合适的迭代次数,点中“apply iteration transform”,将逐步画出迭代的过程,每个子图均会展示各自变换后的输出分布($\textcolor{green}{绿色曲线}$),收敛的参考点分布$q(x)$以$\textcolor{blue}{蓝色曲线}$画出,同时给出输出分布与$q(x)$之间的距离$dist$。可以看出,随着迭代的次数增加,输出分布与$q(x)$越来越相似,并最终会稳定在$q(x)$附近。对于较复杂的分布,可能需要较多迭代的次数或者较大的噪声。迭代次数可以设置为上万步,但会花费较长时间。
305
 
@@ -311,23 +332,31 @@ def md_posterior_transform_zh():
311
  \boldsymbol{q_o} &= (Q_{x|z})^n\ \boldsymbol{q_i} & \quad\quad &\text{n iteration} \tag{7.5} \newline
312
  \end{align}
313
  于是,为了更深入地理解变换的特点,<a href="#demo_4_2">Demo 4.2</a>也画出矩阵$(Q_{x|z})^n$的结果。从图里可以看到,当迭代趋向收敛时,矩阵$(Q_{x|z})^n$的行向量将变成一个常数向量,即向量的各分量都相等。在二维密度图里将表现为一条横线。
314
-
315
- 在<a href="#proof_ctr">Appendix B</a>中,将会提供一个证明,当$q(x)$和$\alpha$满足一些条件时,后验概率变换是一个严格的压缩映射。
316
-
317
- 关于定点分布与输入分布q(x)之间距离的关系,目前尚不能严格证明。
318
 
319
- <h3 style="font-size:18px"> 恢复数据分布过程中的抗噪声能力 </h3>
320
- 由上面的分析可知,当满足一些条件时,"后验概率变换"是一个压缩映射,所以存在如下的关系:
 
 
 
321
  \begin{align}
322
- dist(q(x),\ q_o(x)) < dist(q(z),\ q_i(z)) \tag{7.12}
323
  \end{align}
324
- 其中,$q(z)$是理想的输入分布,$q(x)$理想的输出分布,$q_i(x)$是任意的输入分布,$q_o(x)$是$q_i(z)$经过变换后的输出分布。
325
 
326
- 上式表明,输出的分布$q_o(x)$与理想输出分布q(x)之间的距离总会</em>小于</em>输入分布$q_i(z)$与理想输入分布q(x)的距离。于是,"后验概率变换"具备一定的抵抗噪声能力。这意味着,在恢复$q(x)$的过程中(<a href="#backward_process">第5节</a>),哪怕输入的“末尾分布$q(z_T)”$存在一定的误差,经过一系列变换后,输出的“数据分布$q(x)$“的误差也会比输入的误差更小。
327
 
328
  具体可看<a href="#demo_3_2">Demo 3.2</a>,通过增加“noise ratio”的值可以向“末尾分布$q(z_T)$”添加噪声,点击“apply”按钮将逐步画出恢复的过程,恢复的分布以$\textcolor{red}{红色曲线}$画出,同时也会通过JS散度标出误差的大小。将会看到,恢复的$q(x)$的误差总是小于$q(z_T)$的误差。
329
 
330
- 由上面的讨论可知,$\alpha$越小(即变换过程中使用的噪声越大),压缩映射的压缩率越大,于是,抗噪声的能力也越强。
 
 
 
 
 
 
 
 
 
331
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_transform_zh")
332
  return
333
 
@@ -439,74 +468,272 @@ def md_cond_kl_zh():
439
  return
440
 
441
 
442
- def md_proof_ctr_zh():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
443
  global g_latex_del
444
 
445
- title = "Appendix B Proof of Contraction"
446
- with gr.Accordion(label=title, elem_classes="first_md", elem_id="proof_ctr"):
447
  gr.Markdown(
448
  r"""
449
- <center> <img src="file/fig2.png" width="960" style="margin-top:12px"/> </center>
450
- <center> Figure 2: Only one component in support </center>
451
 
452
- 本节将证明,当$q(x)$及$\alpha$满足一些条件时,后验概率变换是一个压缩映射,并存在惟一收敛点。
 
 
 
 
453
 
454
- 下面分四种情况进行证明。证明的过程假设随机变量是离散型的,因此,后验概率变换可看作是一个<b>离散Markov Chain</b>的一步转移,后验概率$q(x|z)$对应于<b>转移矩阵</b>(Transfer Matrix)。连续型的变量可认为是无限多状态的离散型变量。
455
 
456
- <ol style="list-style-type:decimal">
457
- <li> 当$q(x)$均大于0时,后验概率变换矩阵$q(x|z)$将大于0,于是此矩阵是一个$\textcolor{red}{不可约}\textcolor{green}{非周期}$的Markov Chain的转移矩阵,根据文献<a href="#mc_basic_p6">[13]</a>的结论,此变换是一个关于Total Variance度量的压缩映射,于是,根据Banach fixed-point theorem,此变换存在惟一定点(收敛点)。</li>
458
-
459
- <li> 当$q(x)$部分大于0,并且$q(x)$的支撑集($q(x)$大于0的区域)只存在一个连通域时(图2),由式(3.4)可分析出几个结论:
 
 
 
 
460
 
461
- <ol style="list-style-type:lower-alpha; padding-inline-start: 0px;font-size:16px;">
462
- <li> 当$z$和$x$在支撑集内时,由于$q(x)$和GaussFun均大于0,所以,转移矩阵的对角元素$\{q(x|z)|z=x\}$大于0。这意味着,支撑集内的状态是$\textcolor{green}{非周期}$的。</li>
 
 
463
 
464
- <li> 当$z$和$x$在支撑集内时,由于GaussFun的支撑集存在一定的半径,所以,在对角元素上下附近区域内的$\{q(x|z)|x=z+\epsilon\}$也大于0。这意味着,支撑集内的状态可相互访问(accessible),形成一个$\textcolor{red}{\text{Communication Class}}$<a href="#mc_basic_d4">[14]</a>。</li>
465
-
466
- <li> 当<em>$z$在支撑集内</em>且<em>$x$在支撑集外</em>时,${q(x|z)}$全为0。这意味着,支撑集内的状态<em>不可访问</em>支撑集外的状态(图2b的inaccessible区域)。</li>
467
-
468
- <li> 当<em>$z$在支撑集外</em>且<em>$x$在支撑集内</em>时,由于GaussFun的支撑集存在一定的范围,所以,存在部分扩展区域(图2b的extension区域),其对应的$\{q(x|z)|x\in support\}$不全为0。这意味着,此部分扩展区域的状态可<em>单向</em>访问(access)支撑集内的状态(图2b的unidirectional区域)。</li>
469
-
470
- <li> 当<em>$z$在支撑集外</em>且<em>$x$在支撑集外</em>时,对应的$q(x|z)$全为0。这意味着,支撑集外的状态不会转移至支撑集外的状态,也就是说,支撑集外的状态只来源于支撑集内的状态。</li>
 
 
 
 
 
 
 
 
 
471
 
472
- <p style="margin-top:8px">
473
- 由(c)可知,支撑集内的状态<em>不会转移到</em>支撑集外的状态,由(a)和(b)可知,支撑集内的状态是非周期且构成一个Communicate Class,所以,支撑集内的状态独立构成一个不可约且非周期的Markov Chain,根据文献<a href="#mc_limit">[7]</a>中Theorem 11.4.1的结论,当$n\to+\infty$时,$q(x|z)^n$收敛于一个固定矩阵,并且矩阵每个列向量都相同。这意味着,对于不同的z,$q(x|z)^n$都相同(可见图2c)。另外,由(d)和(e)可知,存在部分支撑集外的z状态,能转移至支撑集内,并且会带着支撑集内的信息转移回支撑集外,于是,此部分z状态对应的$q(x|z)$(图2c的$q(x|z_{ex})$区域)也会等于支撑集内对应的$q(x|z)$(图2c的$q(x|z_{sup})$区域)。
474
- </p>
 
 
 
 
 
 
 
 
 
475
 
476
- <p style="margin-top:8px">
477
- 所以,可以得出结论,当状态限制在支撑集和两个扩展区域内时,$\lim_{n\to\infty}{q(x|z)^n}$会收敛于一个固定矩阵,并且每个列向量均相同。于是,对于任意的输入分布,如果连续应用足够多后验概率变换,最终会收敛于一个固定分布,此分布等于收敛的矩阵的列向量。根据文献<a href="#fp_converse">[9]</a>的结论,当迭代变换收敛于惟一定点时,此变换是关于某个metric的Contraction Mapping。
478
- </p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
479
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
  </ol>
 
 
 
 
 
 
 
 
 
 
481
  </li>
482
 
483
- <li> 当$q(x)$部分大于0,$q(x)$的支撑集存在多个连通域,并且各个连通域的最大距离<b>能</b>被相应的GaussFun的支撑集所覆盖时,那各个连通域内的状态构成一个Communicate Class。如图3所示,$q(x)$存在两个连通域,在第一个连通域的边缘,$q(x|z=-0.3)$对应的GaussFun的支撑集能跨越间隙到达第二个连通域,于是第一个连通域的状态能<em>访问</em>第二个连通域的状态;在第二个连通域的边缘,$q(x|z=0)$对应的GaussFun的支撑集也能跨越间隙到达第一个连通域,于是第二个连通域的状态能<em>访问</em>第一个连通域的状态,所以两个连通域构成一个Communicate Class。因此,与单个连通域的情况类似,当状态限制在各个连通域、间隙及扩展区域内时,后验概率变换存在惟一一个迭代收敛点,并且是关于某个metric的压缩映射。</li>
 
484
 
485
- <li> 当$q(x)$部分大于0,$q(x)$的支撑集存在多个连通域时,并且各个连通域的最大距离<b>不能</b>被相应的GaussFun的支撑集所覆盖时,那各个连通域内的状态构成多个Communicate Class,如图4所示。此情况下,当$n\to\infty$时,$q(x|z)^n$也会收敛于一个固定矩阵,但每个列向量不尽相同。所以,后验概率变换不是一个严格的压缩映射。但当输入分布的状态限制在单个Communicate Class及相应的扩展范围内时,后验概率变换也是一个压缩映射,存在惟一收敛点。</li>
 
486
  </ol>
487
 
488
- <center> <img src="file/fig3.png" width="960" style="margin-top:12px"/> </center>
489
- <center> Figure 3: Two component which can communicate with each other </center>
 
 
 
 
 
490
 
491
- <center> <img src="file/fig4.png" width="960" style="margin-top:12px"/> </center>
492
- <center> Figure 4: Two component which <b>cannot</b> communicate with each other </center>
493
 
494
- 另外,后验概率变换存在一个更通用的关系,与$q(x|z)$的具体值无关: 两个输出分布的之间的Total Variance距离总是会<b>小于等于</b>对应输入分布之间的Total Variance距离,即
 
 
 
 
 
 
 
 
 
 
 
495
  \begin{align}
496
- dist(q_{o1}(x),\ q_{o2}(x)) \le dist(q_{i1}(z),\ q_{i2}(z)) \tag{B.1}
497
  \end{align}
498
- 下面通过离散的形式给出证明:
 
 
 
499
  \begin{align}
500
- \lVert q_{o1}-q_{o2}\rVert_{TV} &= \lVert Q_{x|z}q_{i1} - Q_{x|z}q_{i2}\rVert_{TV} \tag{B.2} \newline
501
- &= \sum_{m}\textcolor{red}{|}\sum_{n}Q_{x|z}(m,n)q_{i1}(n) - \sum_{n}Q_{x|z}(m,n)q_{i2}(n)\textcolor{red}{|} \tag{B.3} \newline
502
- &= \sum_{m}\textcolor{red}{|}\sum_{n}Q_{x|z}(m,n)(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \tag{B.4} \newline
503
- &\leq \sum_{m}\sum_{n}Q_{x|z}(m,n)\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \qquad \qquad \qquad \text{Absolute value inequality} \tag{B.5} \newline
504
- &= \sum_{n}\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \sum_{m} Q_{x|z}(m,n) \qquad \qquad \qquad \sum_{m} Q_{x|z}(m,n) = 1 \tag{B.6} \newline
505
- &= \sum_{n}\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \tag{B.7}
506
  \end{align}
507
- 其中,$Q_{x|z}(m,n)$表示矩阵$Q_{x|z}$的第m行第n列的元素,$q_{i1}(n)$表示向量$q_{i1}$的第n个元素。
508
 
509
- """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_proof_ctr_zh")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
510
  return
511
 
512
 
@@ -553,6 +780,16 @@ def md_reference_zh():
553
 
554
  <a id="dsm" href="https://www.iro.umontreal.ca/~vincentp/Publications/smdae_techreport_1358_v1.pdf"> [18] A Connection Between Score Matching and Denoising autoencoders </a>
555
 
 
 
 
 
 
 
 
 
 
 
556
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_reference_zh")
557
 
558
  return
@@ -599,7 +836,11 @@ def run_app():
599
 
600
  md_cond_kl_zh()
601
 
602
- md_proof_ctr_zh()
 
 
 
 
603
 
604
  md_reference_zh()
605
 
 
78
  q(x|z) = \frac{q(z|x)q(x)}{q(z)} \tag{3.1}
79
  \end{align}
80
 
81
+ 当$z$是取固定值时,$q(z)$是常数,所以$q(x|z)$是关于$x$的概率密度函数,并且其形状只与${q(z|x)q(x)}$有关。
82
  \begin{align}
83
+ q(x|z) &\propto q(z|x)q(x) \qquad \text{where z is fixed} \tag{3.2}
84
  \end{align}
85
+
86
+ 实际上,$q(z)=\int q(z|x)q(x)dx$,也就是说,$q(z)$是对函数$q(z|x)q(x)$遍历$x$求和,所以,$q(z|x)q(x)$除以$q(z)$相当于对$q(z|x)q(x)$执行归一化。
87
+ \begin{align}
88
+ q(x|z) = \operatorname{Normalize}\big(q(z|x)q(x)\big) \tag{3.3}
89
+ \end{align}
90
+
91
  由式2.1可知,$q(z|x)$为高斯分布,于是有
92
  \begin{align}
93
+ q(x|z) &\propto \frac{1}{\sqrt{2\pi(1-\alpha)}}\exp{\frac{-(z-\sqrt{\alpha}x)^2}{2(1-\alpha)}}\ q(x)& \qquad &\text{where z is fixed} \notag \newline
94
+ &= \frac{1}{\sqrt{\alpha}}\frac{1}{\sqrt{2\pi\frac{1-\alpha}{\alpha}}}\exp{\frac{-(\frac{z}{\sqrt{\alpha}}-x)^2}{2\frac{1-\alpha}{\alpha}}}\ q(x)& \notag \newline
95
+ &= \frac{1}{\sqrt{\alpha}} \underbrace{\frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}}_{\text{GaussFun}}\ q(x)& \qquad &\text{where}\ \mu=\frac{z}{\sqrt{\alpha}}\quad \sigma=\sqrt{\frac{1-\alpha}{\alpha}} \tag{3.4}
96
  \end{align}
97
 
98
+ 可以看出,<b>GaussFun</b>部分是关于$x$的高斯函数,均值为$\frac{z}{\sqrt{\alpha}}$,标准差为$\sqrt{\frac{1-\alpha}{\alpha}}$,所以$q(x|z)$的形状由“<b>GaussFun与$q(x)$相乘</b>”决定。
99
 
100
  根据”乘法“的特点,可以总结$q(x|z)$函数形状具有的特点。
101
  <ul>
102
+ <li> $q(x|z)$的支撑集应该包含于GaussFun的支撑集,GaussFun的支撑集是一个超球体,中心位于均值$\mu$,半径约为3倍标准差$\sigma$。</li>
103
+ <li> 当高斯函数的方差较小(较小噪声),或者$q(x)$线性变化时,$q(x|z)$的形状将近似于高斯函数,函数形式较简单,方便建模学习。</li>
104
  <li> 当高斯函数的方差较大(较大噪声),或者$q(x)$剧烈变化时,$q(x|z)$的形状将较复杂,与高斯函数有较大的差别,难以建模学习。</li>
105
  </ul>
106
 
107
+ <a href="#approx_gauss">Appendix B</a>给出了较严谨的分析,当$\sigma$满足一些条件时,$q(x|z)$的近似于高斯分布。
108
+
109
  具体可看<a href="#demo_2">Demo 2</a>,左4图给出后验概率分布$q(x|z)$的形态,可以看出,其形状较不规则,像一条弯曲且不均匀的曲线。当$\alpha$较大时(噪声较小),曲线将趋向于均匀且笔直。读者可调整不同的$\alpha$值,观察后验概率分布与噪声大小的关系;左5图,$\textcolor{blue}{蓝色虚线}$给出$q(x)$,$\textcolor{green}{绿色虚线}$给出式3.4中的GaussFun,$\textcolor{orange}{黄色实线}$给出两者相乘并归一化的结果,即固定z条件下后验概率$q(x|z=fixed)$。读者可调整不同z值,观察$q(x)$的波动变化对后验概率$q(x|z)$形态的影响。
110
 
111
  两个特殊状态下的后验概率分布$q(x|z)$值得考虑一下。
112
  <ul>
113
+ <li> 当$\alpha \to 0$时,GaussFun的标准差趋向于<b>无穷大</b>,GaussFun变成一个很大支撑集的近似的均匀分布,$q(x)$与均匀分布<b>相乘</b>结果仍为$q(x)$,所以,不同$z$值对应的$q(x|z)$几乎变成一致,并与$q(x)$几乎相同。读者可在<a href="#demo_2">Demo 2</a>中,将$\alpha$设置为0.001,观察具体的结果。</li>
114
+ <li> 当$\alpha \to 1$时,GaussFun的标准差趋向于<b>无穷小</b>,不同$z$值的$q(x|z)$收缩成一系列不同偏移量的Dirac delta函数, 偏移量等于$z$。但有一些例外,当$q(x)$存在为零的区域时,其对应的$q(x|z)$将不再为Dirac delta函数,而是零函数。可在<a href="#demo_2">Demo 2</a>中,将$\alpha$设置为0.999,观察具体的结果。</li>
115
  </ul>
116
+
117
+ 有一点需要注意一下,当$\alpha \to 0$时,较大$z$值对应的GaussFun的均值($\mu=\frac{z}{\sqrt{\alpha}}$)也急剧变大,也就是说,GaussFun位于离原点较远的地方,此时,$q(x)$的支撑集对应的GaussFun部分的“均匀程度”会略微有所下降, 从而会略微降低$q(x|z)$与$q(x)$的相似度,但这种影响会随着$\alpha$减小而进一步降低。读者可在<a href="#demo_2">Demo 2</a>中观察此影响,将$\alpha$设置为0.001,$q(x|z=-2)$与$q(x)$会略微有一点差别,但$q(x|z=0)$与$q(x)$却看不出区别。
118
+
119
+ 关于高斯函数的"均匀程度",有如下两个特点:标准差越大,均匀程度越大;离均值越远,均匀程度越小。
120
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_zh")
121
  return
122
 
 
159
  q(z_t|x) &= \mathcal{N}(\sqrt{\alpha_1\alpha_2\cdots\alpha_t}x,\ 1-\alpha_1\alpha_2\cdots\alpha_t) = \mathcal{N}(\sqrt{\bar{\alpha_t}}x,\ 1-\bar{\alpha_t}) \qquad where\ \bar{\alpha_t} \triangleq \prod_{j=1}^t\alpha_j \tag{4.8}
160
  \end{align}
161
 
162
+ 比较式4.8和式2.1的形式,可发现,两者的形式是完全一致的。
163
+
164
+ 如果只关注首尾两个变量之间的关系,那么连续t次的小变换可用一次大变换替代,大变换的$\alpha$是各个小变换的$\alpha$累积,因为两种变换对应的联合概率分布相同。
165
+
166
+ 读者可在<a href="#demo_3_1">Demo 3.1</a>中做一个实验,对同样的输入分布$q(x)$,使用两种不同的变换方式:1)使用三个变换,$\alpha$均为0.95; 2)使用一个变换,$\alpha$设置为0.857375。分别执行变换,然后比较变换后的两个分布,将会看到,两个分布是完全相同的。
167
 
168
  在DDPM[\[2\]](#ddpm)论文中,作者使用了1000步(T=1000),将数据分布$q(x)$转换至$q(z_T)$,$q(z_T|x)$的概率分布如下:
169
  \begin{align}
170
  q(z_T|x) &= \mathcal{N}(0.00635\ x,\ 0.99998) \tag{4.9}
171
  \end{align}
172
 
173
+ 如果只考虑$X,Z_T$的联合分布$q(x,z_T)$,也可使用一次变换代替,变换如下:
174
  \begin{align}
175
  Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{4.10}
176
  \end{align}
177
+ 可以看出,应用两种变换后,变换后的分布$q(z_T|x)$相同,因此,$q(x, z_T)$也相同。
178
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_forward_process_zh")
179
  return
180
 
 
202
 
203
  在<a href="#posterior">第3节</a>中,我们考虑了两个特殊的后验概率分布。接下来,分析其对应的”后验概率变换“。
204
  <ul>
205
+ <li> 当$\alpha \to 0$时,不同$z$值的$q(x|z)$均与$q(x)$几乎相同,也就是说,线性加权和的基函数几乎相同。此状态下,<b>不管输入如何变化,变换的输出总为$q(x)$</b>。</li>
206
  <li> 当$\alpha \to 1$时,不同$z$值的$q(x|z)$收缩成一系列不同偏移量的Dirac delta函数及零函数。此状态下,只要输入分布的支撑集(support set)包含于$q(x)$的支撑集,变换的输出与输入将保持一致。</li>
207
  </ul>
208
 
 
210
  \begin{align}
211
  Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{5.5}
212
  \end{align}
213
+ 由于$\alpha=0.0000403$非常小,其对应的GaussFun(式3.4)的标准差达到157.52。如果把$q(x)$的支撑集限制在单位超球范围内($\lVert x \rVert_2 < 1$),那当$z_T \in [-2, +2]$时,对应的各个$q(x|z_T)$均与$q(x)$非常相似。在这种状态下,对于$q(x|z_T)$相应的后验概率变换,不管输入分布的形状的如何,只要支撑集在$[-2,+2]$范围内,其输出分布都将是$q(x)$。
214
 
215
+ <b>所以,可以总结,在DPM模型中,如果$q(x)$的支撑集是有限的,并且最终变量$Z_T$的信噪比足够大,那恢复$q(x)$的过程可以使用任意的分布,不必一定需要使用标准正态分布。</b>
216
 
217
  读者可亲自做一个类似的实验。在<a href="#demo_3_1">Demo 3.1</a>中,将start_alpha设置0.25,end_alpha也设置为0.25,step设置为7,此时$q(z_7)=\sqrt{0.000061}X + \sqrt{1-0.000061}\epsilon$,与DDPM的$q(z_T)$基本相似。点击<b>apply</b>执行前向变换($\textcolor{blue}{蓝色曲线}$),为接下来的反向恢复做准备。在<a href="#demo_3_2">Demo 3.2</a>中,noise_ratio设置为1,为末端分布$q(z_7)$引入100%的噪声,切换nose_random_seed的值可改变噪声的分布,取消选择backward_pdf,减少画面的干扰。点击<b>apply</b>将通过后验概率变换恢复$q(x)$,将会看到,不管输入的$q(z_7)$的形状如何,恢复的$q(x)$均与原始的$q(x)$完全相同, JS Divergence为0,恢复的过程使用$\textcolor{red}{红色曲线}$画出。
218
+
219
+ 另外有一点值得注意一下,在深度学习任务中,常将输入样本的各个维度缩放在[-1,1]范围内,也是说在一个超立方体内(hypercube)。超立方体内任意两点的最大欧氏距离会随着维度的增多而变大,比如,对于一维,最大距离为$2$,对于二维,最大距离为$2\sqrt{2}$,对于三维,最大距离为$2\sqrt{3}$,对于n维,最大距离为$2\sqrt{n}$。所以,对于维度较高的数据,需要$Z_T$变量有更高的信噪比,才能让恢复过程的起始分布接受任意的分布。
220
 
221
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_backward_process_zh")
222
  return
 
307
 
308
  gr.Markdown(
309
  r"""
310
+ <h3 style="font-size:18px"> Non-expanding mapping and Stationary Distribution </h3>
311
  \begin{align}
312
  q(x) &= \int q(x,z) dz = \int q(x|z)q(z)dz \tag{7.1}
313
  \end{align}
314
 
315
+ 根据<a href="#non_expanping">Appendix B</a>的Corollary 1和Corollary 2可知,后验概率变换是一个non-expanding mapping。也是说,对任意的两个概率分布$q_{i1}(z)和q_{i2}(z)$,经过后验概率变换后得到$q_{o1}(x)$和$q_{o2}(x)$,$q_{o1}(z)$和$q_{o2}(z)$的距离<b>总是小于或等于</b>$q_{i1}(x)$和$q_{i2}(x)$的距离。这里的距离可使用KL Divergence或Total Variance或度量。
316
  \begin{align}
317
+ d(q_{o1}(x),\ q_{o2}(x)) \le d(q_{i1}(z),\ q_{i2}(z)) \tag{7.2}
318
  \end{align}
319
+ 根据<a href="#non_expanping">Appendix B</a>的分析可知,在大多数情况,上述的等号并不会成立。并且,<b>当$\alpha$越小时(噪声越多),$d(q_{o1},q_{o2})$会越小于$d(q_{i1},q_{i2})$</b>。
320
 
321
  读者可查看<a href="#demo_4_1">Demo 4.1</a>,左侧三个图呈现一个变换的��程,左1图是任意的数据分布$q(x)$,左3图是变换后的概率分布,左2图是后验概率分布。可更改随机种子生成新的数据分布,调整$\alpha$值引入不同程度的噪声。左侧最后两个图展示变换的“压缩性质”,左4图展示随机生成的两个输入分布,同时给出其距离度量值$div_{in}$;左5图展示经过变换后的两个输出分布,输出分布之间的距离标识为$div_{out}$。读者可改变输入的随机种子,切换不同的输入。可在图中看到,对于任意的输入,$div_{in}$总是小于$div_{out}$。另外,也可改变$\alpha$的值,将会看到,$\alpha$越小(噪声越大),$\frac{div_{out}}{div_{in}}$的比值也越小,即收缩率越大。
322
 
323
+ 根据<a href="#stationary">Appendix C</a>的分析可知:后验概率变换可视为markov chain的一步跳转,并且,<b>当$q(x)$和$\alpha$满足一些条件时,此markov chain会收敛于惟一的稳态分布</b>。另外,通过大量实验发现,<b>稳态分布与数据分布$q(x)$非常相似,当$\alpha$越小时,稳态分布与$q(x)$越相似</b>。特别地,根据<a href="#backward_process">第5节</a>的结论,<b>当$\alpha \to 0$时,经过一步变换后,输出分布即是$q(x)$,所以稳态分布必定是$q(x)$</b>。
324
 
325
  读者可看<a href="#demo_4_2">Demo 4.2</a>,此部分展示迭代收敛的例子。选择合适的迭代次数,点中“apply iteration transform”,将逐步画出迭代的过程,每个子图均会展示各自变换后的输出分布($\textcolor{green}{绿色曲线}$),收敛的参考点分布$q(x)$以$\textcolor{blue}{蓝色曲线}$画出,同时给出输出分布与$q(x)$之间的距离$dist$。可以看出,随着迭代的次数增加,输出分布与$q(x)$越来越相似,并最终会稳定在$q(x)$附近。对于较复杂的分布,可能需要较多迭代的次数或者较大的噪声。迭代次数可以设置为上万步,但会花费较长时间。
326
 
 
332
  \boldsymbol{q_o} &= (Q_{x|z})^n\ \boldsymbol{q_i} & \quad\quad &\text{n iteration} \tag{7.5} \newline
333
  \end{align}
334
  于是,为了更深入地理解变换的特点,<a href="#demo_4_2">Demo 4.2</a>也画出矩阵$(Q_{x|z})^n$的结果。从图里可以看到,当迭代趋向收敛时,矩阵$(Q_{x|z})^n$的行向量将变成一个常数向量,即向量的各分量都相等。在二维密度图里将表现为一条横线。
 
 
 
 
335
 
336
+ 对于一维离散的markov chain,收敛速度与转移概率矩阵的第二大特征值的绝对值($\lvert \lambda_2 \rvert$)反相关,$\lvert \lambda_2 \rvert$越小,收敛速度越快。经过大量的实验发现,$\alpha$与$\lvert \lambda_2 \rvert$有着明确的线性关系,$\alpha$越小,$\lvert \lambda_2 \rvert$也越小。所以,<b>$\alpha$越小(噪声越大),收敛速度越快</b>。特别地,当$\alpha \to 0$时,由<a href="#posterior">第3节</a>的结论可知,各个$z$对应的后验概率分布趋向一致,而由文献<a href="#non_neg_lambda">[21]</a>的Theorem 2.10可知,$\lvert \lambda_2 \rvert$小于任意两个$z$对应的后验概率分布的L1距离,所以,可知$\lvert \lambda_2 \rvert \to 0$。
337
+
338
+ </br>
339
+ <h3 style="font-size:18px"> Anti-noise Capacity In Restoring Data Distribution </h3>
340
+ From the analysis above, in most cases the posterior transform strictly shrinks distances, so the following relation holds:
341
  \begin{align}
342
+ d(q(x),\ q_o(x)) < d(q(z),\ q_i(z)) \tag{7.12}
343
  \end{align}
344
+ Here $q(z)$ is the ideal input distribution, $q(x)$ is the ideal output distribution with $q(x)=\int q(x|z)q(z)dz$, $q_i(z)$ is an arbitrary input distribution, and $q_o(x)$ is the corresponding transformed output, $q_o(x)=\int q(x|z)q_i(z)dz$.
345
 
346
+ The relation above says that the distance between the output $q_o(x)$ and the ideal output $q(x)$ is always <b>smaller</b> than the distance between the input $q_i(z)$ and the ideal input $q(z)$. Hence <b>the posterior transform has an inherent ability to resist noise</b>. This means that in the process of recovering $q(x)$ (<a href="#backward_process">Section 5</a>), even if the input terminal distribution $q(z_T)$ contains some error, after the sequence of transforms the error of the output data distribution $q(x)$ will be smaller than the error of the input.
347
 
348
See <a href="#demo_3_2">Demo 3.2</a>: increasing the “noise ratio” adds noise to the terminal distribution $q(z_T)$; clicking “apply” draws the recovery process step by step, with the recovered distribution drawn as a $\textcolor{red}{\text{red curve}}$ and the error measured by the JS divergence. You will see that the error of the recovered $q(x)$ is always smaller than the error of $q(z_T)$.
349
 
350
+ From the discussion above, the smaller $\alpha$ is (i.e. the more noise used in the transform), the stronger the shrinkage and, accordingly, the stronger the resistance to noise. In particular, when $\alpha \to 0$ the noise resistance becomes unlimited: no matter how noisy the input is, the output is $q(x)$.
351
+
352
+ </br>
353
+ <h3 style="font-size:18px"> Markov Chain Monte Carlo Sampling</h3>
354
+
355
+ In DPM models, sampling is usually performed by Ancestral Sampling. From the analysis above, when $\alpha$ is small enough the iterated posterior transform converges to $q(x)$, so sampling can also be performed via Markov Chain Monte Carlo, as shown in Figure 7.1. In the figure, $\alpha$ denotes a posterior transform with a large amount of noise; the large noise pushes the stationary distribution closer to the data distribution $q(x)$, but, as discussed in <a href="#posterior">Section 3</a>, a large-noise posterior is harder to fit, so the large-noise posterior transform is split into several small-noise posterior transforms.
356
+
357
+ <center> <img src="file/7.1.png" width="1024" style="margin-top:12px"/> </center>
358
+ <center> Figure 7.1: Markov Chain Monte Carlo Sampling</center>
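A minimal 1-D sketch of this sampling idea, using a single posterior kernel as the MCMC transition (not the multi-step decomposition of Figure 7.1); the discrete grid, the toy $q(x)$, $\alpha$ and the burn-in length are assumptions made only for illustration:

```python
import numpy as np

rng = np.random.default_rng(0)
grid = np.linspace(-2, 2, 201)
dx = grid[1] - grid[0]
q_x = np.exp(-(grid - 0.8)**2 / 0.05) + 0.5 * np.exp(-(grid + 0.6)**2 / 0.1)
q_x /= q_x.sum() * dx                      # toy data distribution, known here only for illustration

alpha = 0.3
sigma = np.sqrt((1 - alpha) / alpha)

def mcmc_step(z_idx):
    """One chain step: draw x ~ q(x | z = grid[z_idx]), built as in eq. 3.4."""
    mu = grid[z_idx] / np.sqrt(alpha)
    post = np.exp(-(grid - mu)**2 / (2 * sigma**2)) * q_x
    return rng.choice(grid.size, p=post / post.sum())

idx, samples = grid.size // 2, []          # arbitrary starting state
for t in range(20000):
    idx = mcmc_step(idx)
    if t >= 1000:                          # discard burn-in
        samples.append(grid[idx])
# a histogram of `samples` should approximate q(x)
```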
359
+
360
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_transform_zh")
361
  return
362
 
 
468
  return
469
 
470
 
471
+ def md_approx_gauss_zh():
472
+ global g_latex_del
473
+
474
+ title = "Appendix B When Does the Posterior Approximate a Gaussian?"
475
+ with gr.Accordion(label=title, elem_classes="first_md", elem_id="approx_gauss"):
476
+ gr.Markdown(
477
+ r"""
478
+ From Equation 3.4, $q(x|z)$ has the following form:
479
+ \begin{align}
480
+ q(x|z) &= \operatorname{Normalize} \Big(\ \frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}\ q(x)\ \Big)& \qquad &\text{where}\ \mu=\frac{z}{\sqrt{\alpha}}\quad \sigma=\sqrt{\frac{1-\alpha}{\alpha}} \tag{B.1} \newline
481
+ &\propto \underbrace{\frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}}_{\text{GaussFun}}\ q(x) \tag{B.2}
482
+ \end{align}
483
+
484
+ We now show that, under the following two assumptions, $q(x|z)$ is approximately Gaussian.
485
+ <ul>
486
+ <li>
487
+ Assume that $q(x)$ varies linearly within the support of GaussFun. Expanding $q(x)$ in a Taylor series around the mean of GaussFun, the properties of the expansion imply that this assumption holds when the standard deviation $\sigma$ of GaussFun is small enough.
488
+ \begin{align}
489
+ q(x) &\approx q(\mu) + \nabla_xq(\mu)(x-\mu)& \quad &\text{where}\quad q(\mu)\triangleq q(x)\bigg|_{x=\mu} \quad \nabla_xq(\mu)\triangleq \nabla_xq(x)\bigg|_{x=\mu} \tag{B.3} \newline
490
+ &= q(\mu)\big(1+ \frac{\nabla_xq(\mu)}{q(\mu)}(x-\mu)\big)& \tag{B.4} \newline
491
+ &= q(\mu)\big(1+ \nabla_x\log{q(\mu)}(x-\mu)\big)& \quad &\text{where}\quad \nabla_x\log{q(\mu)}\triangleq \nabla_x\log{q(x)}\bigg|_{x=\mu} \tag{B.5}
492
+ \end{align}
493
+ </li>
494
+ <li>
495
+ Assume that, within the support of GaussFun, $\log\big(1+\nabla_x\log{q(\mu)}(x-\mu)\big)$ can be approximated by $\nabla_x\log{q(\mu)}(x-\mu)$. Expanding $\log(1+y)$ in a Taylor series, the approximation $\log(1+y)\approx y$ holds when $\lVert y\rVert_2$ is small. When $\sigma$ is small enough, $\lVert x-\mu\rVert_2$ is small and so is $\nabla_x\log{q(\mu)}(x-\mu)$, hence the assumption holds. In general, when $\nabla_x\log{q(\mu)}(x-\mu)<0.1$, the approximation error is small enough to be ignored.
496
+ \begin{align}
497
+ \log(1+y) &\approx \log(1+y)\bigg|_{y=0} + \nabla_y\log(1+y)\bigg|_{y=0}(y-0) \tag{B.6} \newline
498
+ &= y \tag{B.7}
499
+ \end{align}
500
+ </li>
501
+ </ul>
502
+ With these two assumptions, $q(x|z)$ can be derived as follows:
503
+
504
+ \begin{align}
505
+ q(x|z) &\propto \frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}\ q(x) \tag{B.8} \newline
506
+ &\approx \frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}\ q(\mu)\big(1+ \nabla_x\log{q(\mu)}(x-\mu)\big) \tag{B.9} \newline
507
+ &= \frac{q(\mu)}{\sqrt{2\pi}\sigma}\exp\left(\frac{-(x-\mu)^2}{2\sigma^2}+\log\big(1+ \nabla_x\log{q(\mu)}(x-\mu)\big)\right) \tag{B.10} \newline
508
+ &\approx \frac{q(\mu)}{\sqrt{2\pi}\sigma}\exp\left(\frac{-(x-\mu)^2}{2\sigma^2}+\nabla_x\log{q(\mu)}(x-\mu)\right) \tag{B.11} \newline
509
+ &= \frac{q(\mu)}{\sqrt{2\pi}\sigma}\exp\left(-\frac{(x-\mu)^2-2\sigma^2\nabla_x\log{q(\mu)}(x-\mu)}{2\sigma^2}\right) \tag{B.12} \newline
510
+ &= \frac{q(\mu)}{\sqrt{2\pi}\sigma}\exp\left(-\frac{\big(x-\mu-\sigma^2\nabla_x\log{q(\mu)}\big)^2}{2\sigma^2}+\frac{\big(\sigma^2\nabla_x\log{q(\mu)}\big)^2}{2\sigma^2}\right) \tag{B.13} \newline
511
+ &= \exp\left(-\frac{\big(x-\mu-\sigma^2\nabla_x\log{q(\mu)}\big)^2}{2\sigma^2}\right) \underbrace{\frac{q(\mu)}{\sqrt{2\pi}\sigma} \exp\left( \frac{1}{2}\big(\sigma\nabla_x\log{q(\mu)}\big)^2\right)}_{\text{const}} \tag{B.14}
512
+ \end{align}
513
+
514
+ Equation B.9 uses Assumption 1, and Equation B.11 uses Assumption 2.
515
+
516
+ The const term in Equation B.14 is a constant and does not affect the shape of the function. Moreover, since $q(x|z)$ is self-normalizing (Equation B.1 includes the Normalize operation), $q(x|z)$ is a Gaussian probability density function with mean $\mu+\sigma^2\nabla_x\log{q(\mu)}$ and variance $\sigma^2$.
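A quick numerical check of this conclusion on a smooth 1-D density (the toy $q(x)$, the grid, $\alpha$ and the chosen $z$ are assumptions for illustration):

```python
import numpy as np

grid = np.linspace(-3, 3, 4001)
dx = grid[1] - grid[0]
q_x = np.exp(-grid**2 / 2) * (1 + 0.3 * np.sin(2 * grid))   # an arbitrary smooth density
q_x /= q_x.sum() * dx

alpha = 0.99                                  # little noise -> small sigma
sigma = np.sqrt((1 - alpha) / alpha)
z = 0.5
mu = z / np.sqrt(alpha)

post = np.exp(-(grid - mu)**2 / (2 * sigma**2)) * q_x
post /= post.sum() * dx                       # exact q(x|z), eq. B.1

grad_logq = np.gradient(np.log(q_x), dx)      # score of q(x) on the grid
m = mu + sigma**2 * np.interp(mu, grid, grad_logq)
approx = np.exp(-(grid - m)**2 / (2 * sigma**2)) / (np.sqrt(2 * np.pi) * sigma)
print("L1 gap:", np.abs(post - approx).sum() * dx)   # small when sigma is small
```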
517
+ """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_approx_gauss_zh")
518
+
519
+ return
520
+
521
+
522
+ def md_non_expanding_zh():
523
  global g_latex_del
524
 
525
+ title = "Appendix C Posterior Transform is a Non-expanding Mapping"
526
+ with gr.Accordion(label=title, elem_classes="first_md", elem_id="non_expanding"):
527
  gr.Markdown(
528
  r"""
529
+ <b>Corollary 1</b>
 
530
 
531
+ Measured by the KL divergence, the transition of a Markov chain is non-expanding <a href="#elem">[23]</a>, i.e.
532
+ \begin{align}
533
+ KL\big(p(x), q(x)\big) &\le KL\big(p(z), q(z)\big) \tag{C.1} \newline
534
+ \end{align}
535
+ where $p(z)$ and $q(z)$ are arbitrary probability density functions, $r(x|z)$ is the transition probability density of the Markov chain, $p(x) = \int r(x|z)p(z)dz$, and $q(x) = \int r(x|z) q(z) dz$.
536
 
537
+ Proof:
538
 
539
+ The KL divergence between $p(x,z)$ and $q(x,z)$ satisfies the following relation:
540
+ \begin{align}
541
+ KL\big(p(x,z), q(x,z)\big) &= \iint p(x,z)\log \frac{p(x,z)}{q(x,z)}dxdz \tag{C.2} \newline
542
+ & = \iint p(x,z)\log \frac{p(z)p(x|z)}{q(z)q(x|z)}dxdz \tag{C.3} \newline
543
+ &= \iint p(x,z)\log \frac{p(z)}{q(z)}dxdz + \iint p(x,z) \log\frac{p(x|z)}{q(x|z)} dxdz \tag{C.4} \newline
544
+ &= \int \int p(x,z) dx\ \log \frac{p(z)}{q(z)}dz + \int p(z)\int p(x|z) \log\frac{p(x|z)}{q(x|z)} dx\ dz \tag{C.5} \newline
545
+ &= KL\big(p(z), q(z)\big) + \int p(z) KL\big(p(x|z), q(x|z)\big)dz \tag{C.6} \newline
546
+ \end{align}
547
 
548
+ Similarly, factorizing $p(x,z)$ and $q(x,z)$ over $X$ first (i.e. swapping the roles of $X$ and $Z$) gives:
549
+ \begin{align}
550
+ KL\big(p(x,z), q(x,z)\big) &= KL\big(p(x), q(x)\big) + \int p(x) KL\big(p(z|x), q(z|x)\big)dx \tag{C.7}
551
+ \end{align}
552
 
553
+ Comparing the two relations gives:
554
+ \begin{align}
555
+ KL\big(p(z), q(z)\big) + \int p(z) KL\big(p(x|z), q(x|z)\big)dz = KL\big(p(x), q(x)\big) + \int p(x) KL\big(p(z|x), q(z|x)\big)dx \tag{C.8}
556
+ \end{align}
557
+
558
+ Since $p(x|z)$ and $q(x|z)$ are both the transition probability density of the Markov chain and both equal $r(x|z)$, the term $\int p(z) KL\big(p(x|z), q(x|z)\big)dz$ is 0. The relation above therefore simplifies to:
559
+ \begin{align}
560
+ KL\big(p(x), q(x)\big) = KL\big(p(z), q(z)\big) - \int p(x) KL\big(p(z|x), q(z|x)\big)dx \tag{C.9}
561
+ \end{align}
562
+
563
+ Since the KL divergence is always greater than or equal to 0, the weighted sum $\int p(x) KL\big(p(z|x), q(z|x)\big)dx$ is also greater than or equal to 0. Hence:
564
+ \begin{align}
565
+ KL\big(p(x), q(x)\big) \le KL\big(p(z), q(z)\big) \tag{C.10}
566
+ \end{align}
567
+
568
+ </br>
569
 
570
+ Equality holds above only if $\int p(x) KL\big(p(z|x), q(z|x)\big)dx$ equals 0, which requires $p(z|x)$ and $q(z|x)$ to coincide for every conditioning value $x$. In most cases, when $p(z)$ and $q(z)$ differ, $p(z|x)$ and $q(z|x)$ differ as well. This means that in most cases
571
+ \begin{align}
572
+ KL\big(p(x), q(x)\big) < KL\big(p(z), q(z)\big) \tag{C.11}
573
+ \end{align}
574
+
575
+ </br></br>
576
+ <b>Corollary 2</b>
577
+
578
+ Measured by the Total Variation distance (L1 distance), the transition of a Markov chain is non-expanding, i.e.
579
+ \begin{align}
580
+ \left\lVert p(x)-q(x) \right\rVert_1\ &\le\ \left\lVert p(z) - q(z) \right\rVert_1 \tag{C.12}
581
+ \end{align}
582
 
583
+ where $p(z)$ and $q(z)$ are arbitrary probability density functions, $r(x|z)$ is the transition probability density of the Markov chain, $p(x) = \int r(x|z)p(z)dz$, and $q(x) = \int r(x|z) q(z) dz$.
584
+
585
+ Proof:
586
+ \begin{align}
587
+ \left\lVert p(x)-q(x) \right\rVert_1\ &= \int \big\lvert p(x) - q(x) \big\rvert dx \tag{C.13} \newline
588
+ &= \int \left\lvert \int r(x|z) p(z) dz - \int r(x|z)q(z)dz \right\rvert dx \tag{C.14} \newline
589
+ &= \int \left\lvert \int r(x|z) \big(p(z)-q(z)\big) dz \right\rvert dx \tag{C.15} \newline
590
+ &\le \int \int r(x|z) \left\lvert \big(p(z)-q(z)\big) \right\rvert dz dx \tag{C.16} \newline
591
+ &= \int \int r(x|z)dx \left\lvert \big(p(z)-q(z)\big) \right\rvert dz \tag{C.17} \newline
592
+ &= \int \left\lvert \big(p(z)-q(z)\big) \right\rvert dz \tag{C.18} \newline
593
+ &= \left\lVert p(z) - q(z) \right\rVert_1 \tag{C.19}
594
+ \end{align}
595
+
596
+ Equation C.16 uses the triangle inequality for absolute values, and Equation C.18 uses the fact that $r(x|z)$ is a probability distribution, i.e. it integrates to 1 over $x$.
597
+
598
+ This completes the proof.
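Both corollaries are easy to spot-check numerically with a random discrete transition matrix (a toy sketch; the matrix size and random seed are arbitrary):

```python
import numpy as np

rng = np.random.default_rng(0)
n = 50
R = rng.random((n, n))
R /= R.sum(axis=0, keepdims=True)          # column z holds r(x|z); each column sums to 1

def kl(a, b):
    return float(np.sum(a * np.log(a / b)))

p_z = rng.random(n); p_z /= p_z.sum()      # two arbitrary input distributions
q_z = rng.random(n); q_z /= q_z.sum()
p_x, q_x = R @ p_z, R @ q_z                # push both through the transition

print(kl(p_x, q_x), "<=", kl(p_z, q_z))                         # Corollary 1
print(np.abs(p_x - q_x).sum(), "<=", np.abs(p_z - q_z).sum())   # Corollary 2
```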
599
+
600
+ </br>
601
+
602
+ Figure C.1 shows an example with a one-dimensional random variable, which makes the derivation above more intuitive.
603
+
604
+ Equality holds above only when the nonzero terms inside each absolute-value bracket share the same sign. In Figure C.1(a) there are 5 absolute-value brackets, one per row, each containing 5 terms; equality holds if and only if the nonzero terms in every row have the same sign. Whenever terms of different signs appear, we get $\lVert p(x)-q(x) \rVert_1\ <\ \lVert p(z) - q(z) \rVert_1$. How often mixed signs appear depends on the number of nonzero elements in the transition matrix; in general, the more nonzero elements, the more mixed signs.
605
+
606
+ In the posterior transform, the smaller $\alpha$ is (the more noise), the more nonzero elements the transition probability density has, as shown in Figure C.2(a); the larger $\alpha$ is (the less noise), the fewer nonzero elements, as shown in Figure C.2(b).
607
+
608
+ This gives the following rule of thumb: <b>the smaller $\alpha$ is, the more $\lVert p(x)-q(x) \rVert_1$ falls below $\lVert p(z) - q(z) \rVert_1$, i.e. the stronger the shrinkage of the transform</b>.
609
+
610
+ <center> <img src="file/C1.png" width="1024" style="margin-top:12px"/> </center>
611
+ <center> Figure C.1: Non-expanding under L1 norm </center>
612
+ </br>
613
+ <center> <img src="file/C2.png" width="568" style="margin-top:12px"/> </center>
614
+ <center> Figure C.2: More non-zero elements as $\alpha$ gets smaller </center>
615
+ """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_non_expanding_zh")
616
 
617
+ return
618
+
619
+
620
+ def md_stationary_zh():
621
+ global g_latex_del
622
+
623
+ title = "Appendix D Posterior Transform Converges to the Unique Stationary Distribution"
624
+ with gr.Accordion(label=title, elem_classes="first_md", elem_id="stationary"):
625
+ gr.Markdown(
626
+ r"""
627
+ By Theorem 3 of <a href="#mc_basic_t3">[19]</a>, <b>an aperiodic and irreducible Markov chain converges to a unique stationary distribution</b>.
628
+
629
+ We now show that, under certain conditions, the posterior transform is the transition probability density of an aperiodic and irreducible Markov chain.
630
+
631
+ For convenience, the forward transform of the diffusion model is written in a more general form:
632
+ \begin{align}
633
+ Z = \sqrt{\alpha}X + \sqrt{\beta}\ \epsilon \tag{D.1} \newline
634
+ \end{align}
635
+
636
+ As discussed in <a href="#transform">Section 1</a>, $\sqrt{\alpha}X$ rescales the probability density function of $X$, so $\alpha$ controls the amount of scaling while $\beta$ controls the amount of added noise. When $\beta = 1-\alpha$, this transform coincides with Equation 1.1.
637
+
638
+ The posterior distribution corresponding to this transform has the form:
639
+ \begin{align}
640
+ q(x|z=c) = \operatorname{Normalize} \Big(\ \overbrace{\frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}}^{\text{GaussFun}}\ q(x)\ \Big) \tag{D.2} \newline
641
+ \text{where}\ \mu=\frac{c}{\sqrt{\alpha}}\qquad \sigma=\sqrt{\frac{\beta}{\alpha}} \qquad \text{$c$ is a fixed value} \notag
642
+ \end{align}
643
+
644
+ When $\beta = 1-\alpha$, the expression above coincides with Equation 3.4.
645
+
646
+ For brevity, $g(x)$ denotes the GaussFun in Equation D.2.
647
+
648
+ Because $\sqrt{\alpha}X$ rescales the probability density function $q(x)$ of $X$, analyzing the aperiodicity and irreducibility of the transition density $q(x|z)$ becomes more involved. For convenience, we first assume $\alpha=1$, and afterwards analyze the case $\alpha \neq 1$ with $\beta = 1-\alpha$.
649
+
650
+ <center> <img src="file/D1.png" width="960" style="margin-top:12px"/> </center>
651
+ <center> Figure D.1: Only one component in support </center>
652
+
653
+ <center> <img src="file/D2.png" width="960" style="margin-top:12px"/> </center>
654
+ <center> Figure D.2: Two components which can communicate with each other </center>
655
+
656
+ </br>
657
+ <h3 style="font-size:24px"> $\alpha=1$ </h3>
658
+
659
+ When $\alpha=1$, if $q(x)$ and $\beta$ satisfy either of the following two conditions, the Markov chain associated with $q(x|z)$ is aperiodic and irreducible.
660
+
661
+ <ol style="list-style-type:decimal">
662
+ <li>The support of $q(x)$ consists of a single connected component.</li>
663
+ <li>The support of $q(x)$ consists of several connected components, but the distance between adjacent components is less than $3\sigma$, i.e. each gap is covered by the radius of the effective region of $g(x)$.</li>
664
  </ol>
665
+
666
+ Proof:
667
+
668
+ <ol style="list-style-type:decimal">
669
+ <li>
670
+ For any point $c$ in the support of $q(x)$, with $z=c$ and $x=c$ we have $q(x=c)>0$; by Equation D.2 the center of $g(x)$ lies at $c$, so $g(x)$ is also positive at $x=c$. From the product in Equation D.2 it follows that $q(x=c|z=c)>0$. Hence the Markov chain associated with $q(x|z)$ is aperiodic.
671
+
672
+ For any point $c$ in the support of $q(x)$, with $z=c$ the center of $g(x)$ lies at $c$, so there exists a hyperball centered at $c$ ($\lVert x-c\rVert_2 < \delta$) inside which $q(x|z=c)>0$; in other words, state $c$ can access its neighboring states. Since every state in the support has this property, all states in the support form a single $\textcolor{red}{\text{communicating class}}$<a href="#mc_basic_d4">[14]</a>. Hence the Markov chain associated with $q(x|z)$ is irreducible.
673
+
674
+ Therefore, a Markov chain satisfying Condition 1 is aperiodic and irreducible. See Figure D.1 for an example with a single connected component.
675
  </li>
676
 
677
+ <li>
678
+ When the support of $q(x)$ has several connected components, the Markov chain may have several communicating classes. But when every gap is smaller than 3 standard deviations of $g(x)$, the states of the different components can access each other, so the Markov chain associated with $q(x|z)$ again has only one communicating class, just as under Condition 1. Therefore, a Markov chain satisfying Condition 2 is aperiodic and irreducible.
679
 
680
+ See Figure D.2 for an example with multiple connected components.
681
+ </li>
682
  </ol>
683
 
684
+ <center> <img src="file/D3.png" width="960" style="margin-top:12px"/> </center>
685
+ <center> Figure D.3: Two components which <b>cannot</b> communicate with each other </center>
686
+
687
+ </br>
688
+ <h3 style="font-size:24px"> $\alpha \neq 1$ </h3>
689
+
690
+ When $\alpha \neq 1$, for any point $c$ in the support of $q(x)$, Equation D.2 shows that the center of $g(x)$ is no longer $c$ but $\frac{c}{\sqrt{\alpha}}$. In other words, the center of $g(x)$ deviates from $c$ by a distance of $\lVert c\rVert(\frac{1-\sqrt{\alpha}}{\sqrt{\alpha}})$: the larger $\lVert c\rVert$ is, the larger the deviation. See the examples in Figure D.4(c) and Figure D.4(d); in Figure D.4(d), with $z=2.0$, the center of $g(x)$ clearly deviates from $x=2.0$. We call this the <b>center-deviation phenomenon</b>.
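For instance, taking $\alpha=0.25$ and $c=2.0$ (values chosen only for illustration), the center of $g(x)$ sits at $c/\sqrt{\alpha}=4.0$, a deviation of $\lVert c\rVert(1-\sqrt{\alpha})/\sqrt{\alpha}=2.0$; with $\beta=1-\alpha$ we have $\sigma=\sqrt{\beta/\alpha}=\sqrt{3}\approx1.73$, so the deviation already exceeds $\sigma$ but not $3\sigma$.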
691
 
692
+ The <b>center-deviation phenomenon</b> affects the properties of some states of the Markov chain.
 
693
 
694
+ When the deviation is clearly larger than $3\sigma$, $g(x)$ <b>may be zero</b> at and around $x=c$, so $q(x=c|z=c)$ <b>may equal 0</b>, and $q(x|z=c)$ <b>may also equal 0</b> in the neighborhood of $x=c$. State $c$ therefore may not be able to access its neighboring states, unlike the $\alpha=1$ case. See the example in Figure D.5: the $\textcolor{green}{\text{green curve}}$ is $g(x)$ for $z=6.0$ and the $\textcolor{orange}{\text{orange curve}}$ is $q(x|z=6.0)$; because the center of $g(x)$ deviates too far from $x=6.0$, we get $q(x=6.0|z=6.0)=0$.
695
+
696
+ When the deviation is clearly smaller than $3\sigma$, $g(x)$ is <b>nonzero</b> at and around $x=c$, so $q(x=c|z=c)$ is <b>nonzero</b> and $q(x|z=c)$ is <b>also nonzero</b> in the neighborhood of $x=c$. State $c$ can therefore access its neighboring states, and it is aperiodic.
697
+
698
+ What must $c$ satisfy for the deviation of the center of $g(x)$ to be smaller than $3\sigma$?
699
+ \begin{align}
700
+ \lVert c\rVert(\frac{1-\sqrt{\alpha}}{\sqrt{\alpha}})\ <\ 3\frac{\sqrt{\beta}}{\sqrt{\alpha}} \qquad \Rightarrow \qquad \lVert c\rVert \ <\ 3\frac{\sqrt{\beta}}{1-\sqrt{\alpha}} \tag{D.3} \newline
701
+ \end{align}
702
+
703
+ Hence there is an upper bound: as long as $\lVert c\rVert$ stays below it, the deviation is guaranteed to be smaller than $3\sigma$.
704
+
705
+ When $\beta=1-\alpha$, this becomes
706
  \begin{align}
707
+ \lVert c\rVert \ <\ 3\frac{\sqrt{1-\alpha}}{1-\sqrt{\alpha}} \tag{D.4} \newline
708
  \end{align}
709
+
710
+ The bound $3\frac{\sqrt{1-\alpha}}{1-\sqrt{\alpha}}$ is strictly monotonically increasing in $\alpha$ (it can be rewritten as $3\sqrt{1+\sqrt{\alpha}}/\sqrt{1-\sqrt{\alpha}}$), so it is smallest when $\alpha$ is small.
711
+
712
+ For $\alpha \in (0, 1)$,
713
  \begin{align}
714
+ 3\frac{\sqrt{1-\alpha}}{1-\sqrt{\alpha}} > 3 \tag{D.5} \newline
 
 
 
 
 
715
  \end{align}
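A quick numerical check of this bound (the $\alpha$ values are arbitrary):

```python
import numpy as np

for a in [0.1, 0.5, 0.9, 0.99]:
    print(a, 3 * np.sqrt(1 - a) / (1 - np.sqrt(a)))   # always > 3, and grows with alpha
```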
 
716
 
717
+ From the analysis above, we can draw the following conclusions:
718
+
719
+ <ol style="list-style-type:decimal">
720
+ <li>
721
+ <b>If the support of $q(x)$ consists of a single connected component and every point of the support lies within distance $3\frac{\sqrt{1-\alpha}}{1-\sqrt{\alpha}}$ of the origin, then the Markov chain associated with $q(x|z)$ is aperiodic and irreducible.</b>
722
+ </li>
723
+
724
+ <li>
725
+ If the support of $q(x)$ has several connected components, the center-deviation effect of $g(x)$ makes it more complicated to decide whether two components can access each other, so we do not analyze this in detail. A conservative conclusion is: <b>if every point of the support lies within distance $1$ of the origin and every gap between connected components is smaller than $2\sigma$, then the Markov chain associated with $q(x|z)$ is aperiodic and irreducible.</b>
726
+ </li>
727
+ </ol>
728
+
729
+ <center> <img src="file/D4.png" width="1280" style="margin-top:12px"/> </center>
730
+ <center> Figure D.4: Center Deviation of the GaussFun </center>
731
+ </br>
732
+ <center> <img src="file/D5.png" width="568" style="margin-top:12px"/> </center>
733
+ <center> Figure D.5: Deviation is More Than $3\sigma$ </center>
734
+
735
+ """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_stationary_zh")
736
+
737
  return
738
 
739
 
 
780
 
781
  <a id="dsm" href="https://www.iro.umontreal.ca/~vincentp/Publications/smdae_techreport_1358_v1.pdf"> [18] A Connection Between Score Matching and Denoising autoencoders </a>
782
 
783
+ <a id="mc_basic_t3" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [19] Markov Chains: Basic Theory - Theorem 3 </a>
784
+
785
+ <a id="mc_mt_lambda" href="https://pages.uoregon.edu/dlevin/MARKOV/markovmixing.pdf"> [20] Markov Chains and Mixing Times, second edition - 12.2 The Relaxation Time </a>
786
+
787
+ <a id="non_neg_lambda" href="https://link.springer.com/book/10.1007/0-387-32792-4"> [21] Non-negative Matrices and Markov Chains - Theorem 2.10 </a>
788
+
789
+ <a id="prml_mcmc" href="https://www.microsoft.com/en-us/research/uploads/prod/2006/01/Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf"> [22] Pattern Recognition and Machine Learning - 11.2. Markov Chain Monte Carlo </a>
790
+
791
+ <a id="elem" href="https://cs-114.org/wp-content/uploads/2015/01/Elements_of_Information_Theory_Elements.pdf"> [23] Elements of Information Theory - 2.9 The Second Law of Thermodynamics </a>
792
+
793
  """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_reference_zh")
794
 
795
  return
 
836
 
837
  md_cond_kl_zh()
838
 
839
+ md_approx_gauss_zh()
840
+
841
+ md_non_expanding_zh()
842
+
843
+ md_stationary_zh()
844
 
845
  md_reference_zh()
846
 
data.json CHANGED
The diff for this file is too large to render. See raw diff
 
fig2.png DELETED
Binary file (107 kB)
 
fig3.png DELETED
Binary file (123 kB)
 
fig4.png DELETED
Binary file (122 kB)